author     Kent Overstreet <kent.overstreet@gmail.com>   2017-01-08 00:13:18 -0900
committer  Kent Overstreet <kent.overstreet@gmail.com>   2017-01-20 09:07:08 -0900
commit     b33fc8298f7e13226b9895abc57c9bfce5e3fa2d (patch)
tree       a3d2a5a909b6372f7777c1c5c18cef5f81d123a9
parent     7f4191a202ea4558ca2d5eb8a47daea33c9999c7 (diff)
bcache in userspace; userspace fsck
-rw-r--r--.bcache_revision1
-rw-r--r--.gitignore1
-rw-r--r--Makefile42
-rw-r--r--bcache-assemble.c2
-rw-r--r--bcache-cmds.h (renamed from bcache.h)4
-rw-r--r--bcache-device.c7
-rw-r--r--bcache-format.c4
-rw-r--r--bcache-fs.c6
-rw-r--r--bcache-run.c4
-rw-r--r--bcache-userspace-shim.c143
-rw-r--r--bcache.c20
-rw-r--r--ccan/ilog/LICENSE28
-rw-r--r--ccan/ilog/_info50
-rw-r--r--ccan/ilog/ilog.c141
-rw-r--r--ccan/ilog/ilog.h151
-rw-r--r--ccan/ilog/test/run-out-of-line.c65
-rw-r--r--ccan/ilog/test/run.c60
-rw-r--r--debian/control2
-rwxr-xr-xfsck.bcache3
-rw-r--r--include/asm/unaligned.h20
-rw-r--r--include/crypto/algapi.h212
-rw-r--r--include/crypto/chacha20.h26
-rw-r--r--include/crypto/hash.h181
-rw-r--r--include/crypto/internal/hash.h18
-rw-r--r--include/crypto/poly1305.h41
-rw-r--r--include/crypto/sha.h110
-rw-r--r--include/crypto/sha1_base.h107
-rw-r--r--include/linux/atomic.h250
-rw-r--r--include/linux/backing-dev.h42
-rw-r--r--include/linux/bcache-ioctl.h (renamed from bcache-ioctl.h)1
-rw-r--r--include/linux/bcache.h (renamed from bcache-ondisk.h)188
-rw-r--r--include/linux/bio.h461
-rw-r--r--include/linux/bit_spinlock.h41
-rw-r--r--include/linux/bitmap.h132
-rw-r--r--include/linux/bitops.h275
-rw-r--r--include/linux/bitrev.h85
-rw-r--r--include/linux/blk_types.h156
-rw-r--r--include/linux/blkdev.h188
-rw-r--r--include/linux/bug.h31
-rw-r--r--include/linux/bvec.h97
-rw-r--r--include/linux/byteorder.h74
-rw-r--r--include/linux/cache.h16
-rw-r--r--include/linux/compiler.h169
-rw-r--r--include/linux/completion.h83
-rw-r--r--include/linux/console.h7
-rw-r--r--include/linux/cpumask.h24
-rw-r--r--include/linux/crc32c.h6
-rw-r--r--include/linux/crypto.h921
-rw-r--r--include/linux/cryptohash.h20
-rw-r--r--include/linux/ctype.h2
-rw-r--r--include/linux/dcache.h31
-rw-r--r--include/linux/debugfs.h243
-rw-r--r--include/linux/device.h40
-rw-r--r--include/linux/dynamic_fault.h7
-rw-r--r--include/linux/err.h68
-rw-r--r--include/linux/export.h13
-rw-r--r--include/linux/freezer.h7
-rw-r--r--include/linux/generic-radix-tree.h137
-rw-r--r--include/linux/genhd.h0
-rw-r--r--include/linux/gfp.h1
-rw-r--r--include/linux/hash.h104
-rw-r--r--include/linux/idr.h208
-rw-r--r--include/linux/ioprio.h46
-rw-r--r--include/linux/jhash.h175
-rw-r--r--include/linux/jiffies.h451
-rw-r--r--include/linux/kernel.h211
-rw-r--r--include/linux/kobject.h142
-rw-r--r--include/linux/kref.h138
-rw-r--r--include/linux/kthread.h118
-rw-r--r--include/linux/lglock.h18
-rw-r--r--include/linux/list.h771
-rw-r--r--include/linux/list_nulls.h117
-rw-r--r--include/linux/llist.h201
-rw-r--r--include/linux/lockdep.h55
-rw-r--r--include/linux/log2.h187
-rw-r--r--include/linux/lz4.h87
-rw-r--r--include/linux/math64.h85
-rw-r--r--include/linux/mempool.h78
-rw-r--r--include/linux/mm.h1
-rw-r--r--include/linux/module.h46
-rw-r--r--include/linux/moduleparam.h7
-rw-r--r--include/linux/mutex.h15
-rw-r--r--include/linux/notifier.h197
-rw-r--r--include/linux/osq_lock.h44
-rw-r--r--include/linux/page.h18
-rw-r--r--include/linux/path.h20
-rw-r--r--include/linux/percpu-refcount.h183
-rw-r--r--include/linux/percpu.h189
-rw-r--r--include/linux/poison.h90
-rw-r--r--include/linux/posix_acl.h49
-rw-r--r--include/linux/posix_acl_xattr.h34
-rw-r--r--include/linux/preempt.h15
-rw-r--r--include/linux/prefetch.h7
-rw-r--r--include/linux/printk.h205
-rw-r--r--include/linux/radix-tree.h14
-rw-r--r--include/linux/random.h31
-rw-r--r--include/linux/ratelimit.h109
-rw-r--r--include/linux/rbtree.h127
-rw-r--r--include/linux/rbtree_augmented.h262
-rw-r--r--include/linux/rculist.h675
-rw-r--r--include/linux/rcupdate.h16
-rw-r--r--include/linux/reboot.h74
-rw-r--r--include/linux/rhashtable.h912
-rw-r--r--include/linux/rwsem.h28
-rw-r--r--include/linux/sched.h144
-rw-r--r--include/linux/sched/rt.h9
-rw-r--r--include/linux/semaphore.h47
-rw-r--r--include/linux/seq_file.h25
-rw-r--r--include/linux/seqlock.h567
-rw-r--r--include/linux/shrinker.h25
-rw-r--r--include/linux/slab.h106
-rw-r--r--include/linux/sort.h10
-rw-r--r--include/linux/spinlock.h60
-rw-r--r--include/linux/stat.h15
-rw-r--r--include/linux/string.h15
-rw-r--r--include/linux/stringify.h12
-rw-r--r--include/linux/sysfs.h36
-rw-r--r--include/linux/time64.h194
-rw-r--r--include/linux/timer.h50
-rw-r--r--include/linux/tracepoint.h62
-rw-r--r--include/linux/typecheck.h24
-rw-r--r--include/linux/types.h98
-rw-r--r--include/linux/unaligned/access_ok.h67
-rw-r--r--include/linux/unaligned/be_byteshift.h70
-rw-r--r--include/linux/unaligned/be_memmove.h36
-rw-r--r--include/linux/unaligned/be_struct.h36
-rw-r--r--include/linux/unaligned/generic.h68
-rw-r--r--include/linux/unaligned/le_byteshift.h70
-rw-r--r--include/linux/unaligned/le_memmove.h36
-rw-r--r--include/linux/unaligned/le_struct.h36
-rw-r--r--include/linux/unaligned/memmove.h45
-rw-r--r--include/linux/unaligned/packed_struct.h46
-rw-r--r--include/linux/uuid.h27
-rw-r--r--include/linux/vmalloc.h8
-rw-r--r--include/linux/wait.h1235
-rw-r--r--include/linux/workqueue.h189
-rw-r--r--include/linux/xattr.h68
-rw-r--r--include/linux/zconf.h57
-rw-r--r--include/linux/zlib.h593
-rw-r--r--include/linux/zutil.h108
-rw-r--r--include/trace/define_trace.h0
-rw-r--r--include/trace/events/bcache.h1177
-rw-r--r--include/uapi/linux/uuid.h53
-rw-r--r--include/uapi/linux/xattr.h77
-rw-r--r--libbcache.c19
-rw-r--r--libbcache.h2
-rw-r--r--libbcache/acl.c225
-rw-r--r--libbcache/acl.h56
-rw-r--r--libbcache/alloc.c1861
-rw-r--r--libbcache/alloc.h110
-rw-r--r--libbcache/alloc_types.h102
-rw-r--r--libbcache/bcache.h831
-rw-r--r--libbcache/bkey.c1261
-rw-r--r--libbcache/bkey.h596
-rw-r--r--libbcache/bkey_methods.c117
-rw-r--r--libbcache/bkey_methods.h80
-rw-r--r--libbcache/blockdev.c824
-rw-r--r--libbcache/blockdev.h99
-rw-r--r--libbcache/blockdev_types.h123
-rw-r--r--libbcache/bset.c1846
-rw-r--r--libbcache/bset.h628
-rw-r--r--libbcache/btree_cache.c701
-rw-r--r--libbcache/btree_cache.h61
-rw-r--r--libbcache/btree_gc.c898
-rw-r--r--libbcache/btree_gc.h103
-rw-r--r--libbcache/btree_io.c1674
-rw-r--r--libbcache/btree_io.h73
-rw-r--r--libbcache/btree_iter.c1150
-rw-r--r--libbcache/btree_iter.h282
-rw-r--r--libbcache/btree_locking.h119
-rw-r--r--libbcache/btree_types.h322
-rw-r--r--libbcache/btree_update.c2343
-rw-r--r--libbcache/btree_update.h421
-rw-r--r--libbcache/buckets.c755
-rw-r--r--libbcache/buckets.h272
-rw-r--r--libbcache/buckets_types.h99
-rw-r--r--libbcache/chardev.c319
-rw-r--r--libbcache/chardev.h7
-rw-r--r--libbcache/checksum.c (renamed from util.c)388
-rw-r--r--libbcache/checksum.h24
-rw-r--r--libbcache/clock.c161
-rw-r--r--libbcache/clock.h23
-rw-r--r--libbcache/clock_types.h34
-rw-r--r--libbcache/closure.c210
-rw-r--r--libbcache/closure.h387
-rw-r--r--libbcache/compress.c458
-rw-r--r--libbcache/compress.h14
-rw-r--r--libbcache/debug.c513
-rw-r--r--libbcache/debug.h65
-rw-r--r--libbcache/dirent.c449
-rw-r--r--libbcache/dirent.h32
-rw-r--r--libbcache/error.c140
-rw-r--r--libbcache/error.h238
-rw-r--r--libbcache/extents.c2514
-rw-r--r--libbcache/extents.h494
-rw-r--r--libbcache/eytzinger.h196
-rw-r--r--libbcache/fifo.h123
-rw-r--r--libbcache/fs-gc.c475
-rw-r--r--libbcache/fs-gc.h8
-rw-r--r--libbcache/fs-io.c2457
-rw-r--r--libbcache/fs-io.h96
-rw-r--r--libbcache/fs.c1506
-rw-r--r--libbcache/fs.h49
-rw-r--r--libbcache/inode.c283
-rw-r--r--libbcache/inode.h18
-rw-r--r--libbcache/io.c1378
-rw-r--r--libbcache/io.h90
-rw-r--r--libbcache/io_types.h148
-rw-r--r--libbcache/journal.c2585
-rw-r--r--libbcache/journal.h387
-rw-r--r--libbcache/journal_types.h240
-rw-r--r--libbcache/keybuf.c195
-rw-r--r--libbcache/keybuf.h16
-rw-r--r--libbcache/keybuf_types.h33
-rw-r--r--libbcache/keylist.c55
-rw-r--r--libbcache/keylist.h62
-rw-r--r--libbcache/keylist_types.h15
-rw-r--r--libbcache/migrate.c369
-rw-r--r--libbcache/migrate.h8
-rw-r--r--libbcache/move.c388
-rw-r--r--libbcache/move.h87
-rw-r--r--libbcache/move_types.h4
-rw-r--r--libbcache/movinggc.c298
-rw-r--r--libbcache/movinggc.h30
-rw-r--r--libbcache/notify.c133
-rw-r--r--libbcache/notify.h22
-rw-r--r--libbcache/opts.c179
-rw-r--r--libbcache/opts.h100
-rw-r--r--libbcache/request.c825
-rw-r--r--libbcache/request.h16
-rw-r--r--libbcache/siphash.c185
-rw-r--r--libbcache/siphash.h86
-rw-r--r--libbcache/six.c396
-rw-r--r--libbcache/six.h136
-rw-r--r--libbcache/stats.c219
-rw-r--r--libbcache/stats.h52
-rw-r--r--libbcache/stats_types.h56
-rw-r--r--libbcache/str_hash.h352
-rw-r--r--libbcache/super.c2503
-rw-r--r--libbcache/super.h160
-rw-r--r--libbcache/super_types.h11
-rw-r--r--libbcache/sysfs.c1397
-rw-r--r--libbcache/sysfs.h113
-rw-r--r--libbcache/tier.c243
-rw-r--r--libbcache/tier.h8
-rw-r--r--libbcache/trace.c11
-rw-r--r--libbcache/util.c418
-rw-r--r--libbcache/util.h725
-rw-r--r--libbcache/writeback.c657
-rw-r--r--libbcache/writeback.h100
-rw-r--r--libbcache/xattr.c379
-rw-r--r--libbcache/xattr.h17
-rw-r--r--linux/bio.c323
-rw-r--r--linux/bitrev.c37
-rw-r--r--linux/blkdev.c155
-rw-r--r--linux/completion.c311
-rw-r--r--linux/crypto/algapi.c315
-rw-r--r--linux/crypto/api.c326
-rw-r--r--linux/crypto/cipher.c123
-rw-r--r--linux/crypto/internal.h78
-rw-r--r--linux/crypto/sha1_generic.c92
-rw-r--r--linux/crypto/shash.c294
-rw-r--r--linux/fs.c14
-rw-r--r--linux/generic-radix-tree.c167
-rw-r--r--linux/kstrtox.c368
-rw-r--r--linux/kstrtox.h8
-rw-r--r--linux/kthread.c117
-rw-r--r--linux/llist.c104
-rw-r--r--linux/lz4_compress.c258
-rw-r--r--linux/lz4_decompress.c316
-rw-r--r--linux/lz4defs.h181
-rw-r--r--linux/lz4hc_compress.c454
-rw-r--r--linux/rbtree.c615
-rw-r--r--linux/rhashtable.c860
-rw-r--r--linux/sched.c178
-rw-r--r--linux/semaphore.c256
-rw-r--r--linux/sha1.c201
-rw-r--r--linux/sort.c143
-rw-r--r--linux/string.c97
-rw-r--r--linux/timer.c311
-rw-r--r--linux/vsprintf.c75
-rw-r--r--linux/wait.c616
-rw-r--r--linux/workqueue.c318
-rw-r--r--linux/zlib_deflate/deflate.c1137
-rw-r--r--linux/zlib_deflate/deftree.c1113
-rw-r--r--linux/zlib_deflate/defutil.h327
-rw-r--r--linux/zlib_inflate/inffast.c363
-rw-r--r--linux/zlib_inflate/inffast.h11
-rw-r--r--linux/zlib_inflate/inffixed.h94
-rw-r--r--linux/zlib_inflate/inflate.c786
-rw-r--r--linux/zlib_inflate/inflate.h111
-rw-r--r--linux/zlib_inflate/inftrees.c315
-rw-r--r--linux/zlib_inflate/inftrees.h59
-rw-r--r--linux/zlib_inflate/infutil.c49
-rw-r--r--linux/zlib_inflate/infutil.h25
-rw-r--r--tools-util.c314
-rw-r--r--tools-util.h64
-rw-r--r--util.h144
298 files changed, 74498 insertions, 1172 deletions
diff --git a/.bcache_revision b/.bcache_revision
new file mode 100644
index 0000000..e728408
--- /dev/null
+++ b/.bcache_revision
@@ -0,0 +1 @@
+BCACHE_REVISION=f8c8c133492ac9a63fdfeb9edf9bb26a3283db9f
diff --git a/.gitignore b/.gitignore
index e9dd6ec..6291a6f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
bcache
+bcache-userspace
probe-bcache
.*
*.o
diff --git a/Makefile b/Makefile
index e11f606..c29d973 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,21 @@
PREFIX=/usr
INSTALL=install
-CFLAGS+=-std=gnu99 -O2 -Wall -g -MMD -D_FILE_OFFSET_BITS=64 -I.
-LDFLAGS+=-static
+CFLAGS+=-std=gnu99 -O2 -g -flto -MMD -Wall \
+ -Wno-unused-but-set-variable \
+ -Wno-pointer-sign \
+ -fno-strict-aliasing \
+ -I. -Iinclude -Ilibbcache \
+ -D_FILE_OFFSET_BITS=64 \
+ -D_GNU_SOURCE \
+ -D_LGPL_SOURCE \
+ -DRCU_MEMBARRIER \
+ $(EXTRA_CFLAGS)
+LDFLAGS+=-O2 -g -flto
-PKGCONFIG_LIBS="blkid uuid"
+PKGCONFIG_LIBS="blkid uuid liburcu"
CFLAGS+=`pkg-config --cflags ${PKGCONFIG_LIBS}`
-LDLIBS+=`pkg-config --libs ${PKGCONFIG_LIBS}` -lm
+LDLIBS+=`pkg-config --libs ${PKGCONFIG_LIBS}` -lm -lpthread -lrt
ifeq ($(PREFIX),/usr)
ROOT_SBINDIR=/sbin
@@ -20,15 +29,18 @@ all: bcache
CCANSRCS=$(wildcard ccan/*/*.c)
CCANOBJS=$(patsubst %.c,%.o,$(CCANSRCS))
-libccan.a: $(CCANOBJS)
- $(AR) r $@ $(CCANOBJS)
+# Linux kernel shim:
+LINUX_SRCS=$(wildcard linux/*.c linux/*/*.c)
+LINUX_OBJS=$(LINUX_SRCS:.c=.o)
-bcache-objs = bcache.o bcache-assemble.o bcache-device.o bcache-format.o\
- bcache-fs.o bcache-run.o libbcache.o util.o
+OBJS=bcache.o bcache-assemble.o bcache-device.o bcache-format.o \
+ bcache-fs.o bcache-run.o bcache-userspace-shim.o \
+ libbcache.o tools-util.o $(LINUX_OBJS) $(CCANOBJS)
--include $(bcache-objs:.o=.d)
+DEPS=$(OBJS:.o=.d)
+-include $(DEPS)
-bcache: $(bcache-objs) libccan.a
+bcache: $(OBJS)
.PHONY: install
install: bcache
@@ -40,7 +52,7 @@ install: bcache
.PHONY: clean
clean:
- $(RM) bcache *.o *.d *.a
+ $(RM) bcache $(OBJS) $(DEPS)
.PHONY: deb
deb: all
@@ -50,3 +62,11 @@ deb: all
--build=binary \
--diff-ignore \
--tar-ignore
+
+.PHONY: update-bcache-sources
+update-bcache-sources:
+ echo BCACHE_REVISION=`cd $(LINUX_DIR); git rev-parse HEAD` > .bcache_revision
+ cp $(LINUX_DIR)/drivers/md/bcache/*.[ch] libbcache/
+ cp $(LINUX_DIR)/include/trace/events/bcache.h include/trace/events/
+ cp $(LINUX_DIR)/include/uapi/linux/bcache.h include/linux/
+ cp $(LINUX_DIR)/include/uapi/linux/bcache-ioctl.h include/linux/
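
With the kernel sources now built in (the LINUX_OBJS shim plus everything #included from bcache-userspace-shim.c), the single bcache binary additionally links against liburcu, pthreads and librt; the matching liburcu-dev build dependency is added to debian/control below. Running make still produces just the bcache executable.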
diff --git a/bcache-assemble.c b/bcache-assemble.c
index 77dff3e..1b49166 100644
--- a/bcache-assemble.c
+++ b/bcache-assemble.c
@@ -8,7 +8,7 @@
#include <sys/ioctl.h>
-#include "bcache.h"
+#include "bcache-cmds.h"
int cmd_assemble(int argc, char *argv[])
{
diff --git a/bcache.h b/bcache-cmds.h
index 7ad53e7..36035c2 100644
--- a/bcache.h
+++ b/bcache-cmds.h
@@ -7,7 +7,7 @@
#ifndef _BCACHE_H
#define _BCACHE_H
-#include "util.h"
+#include "tools-util.h"
int cmd_format(int argc, char *argv[]);
@@ -23,4 +23,6 @@ int cmd_device_show(int argc, char *argv[]);
int cmd_device_add(int argc, char *argv[]);
int cmd_device_remove(int argc, char *argv[]);
+int cmd_fsck(int argc, char *argv[]);
+
#endif /* _BCACHE_H */
diff --git a/bcache-device.c b/bcache-device.c
index bb79a92..5ff0d82 100644
--- a/bcache-device.c
+++ b/bcache-device.c
@@ -1,4 +1,3 @@
-#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
@@ -13,7 +12,7 @@
#include <sys/types.h>
#include <unistd.h>
-#include "bcache.h"
+#include "bcache-cmds.h"
#include "libbcache.h"
/* This code belongs under show_fs */
@@ -191,7 +190,7 @@ int cmd_device_add(int argc, char *argv[])
.dev = (__u64) argv[i],
};
- if (ioctl(fs.fd, BCH_IOCTL_DISK_ADD, &ia))
+ if (ioctl(fs.ioctl_fd, BCH_IOCTL_DISK_ADD, &ia))
die("BCH_IOCTL_DISK_ADD error: %s", strerror(errno));
}
@@ -250,7 +249,7 @@ int cmd_device_remove(int argc, char *argv[])
if (force_metadata)
ir.flags |= BCH_FORCE_IF_METADATA_MISSING;
- if (ioctl(fs.fd, BCH_IOCTL_DISK_REMOVE, &ir))
+ if (ioctl(fs.ioctl_fd, BCH_IOCTL_DISK_REMOVE, &ir))
die("BCH_IOCTL_DISK_REMOVE error: %s\n", strerror(errno));
}
diff --git a/bcache-format.c b/bcache-format.c
index ca04433..a7aabc3 100644
--- a/bcache-format.c
+++ b/bcache-format.c
@@ -5,8 +5,6 @@
*
* GPLv2
*/
-#define _GNU_SOURCE
-
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
@@ -24,7 +22,7 @@
#include "ccan/darray/darray.h"
-#include "bcache.h"
+#include "bcache-cmds.h"
#include "libbcache.h"
/* Open a block device, do magic blkid stuff: */
diff --git a/bcache-fs.c b/bcache-fs.c
index 57dc47a..2e82086 100644
--- a/bcache-fs.c
+++ b/bcache-fs.c
@@ -1,5 +1,5 @@
-#include "bcache.h"
+#include "bcache-cmds.h"
struct bcache_fs {
/* options... */
@@ -10,11 +10,13 @@ struct bcache_fs {
u64 bytes_dirty;
};
+#if 0
static struct bcache_fs fill_fs(struct bcache_handle fs)
{
return (struct bcache_fs) {
};
}
+#endif
int cmd_fs_show(int argc, char *argv[])
{
@@ -23,6 +25,7 @@ int cmd_fs_show(int argc, char *argv[])
struct bcache_handle fs = bcache_fs_open(argv[1]);
+ fs = fs;
return 0;
}
@@ -33,5 +36,6 @@ int cmd_fs_set(int argc, char *argv[])
struct bcache_handle fs = bcache_fs_open(argv[1]);
+ fs = fs;
return 0;
}
diff --git a/bcache-run.c b/bcache-run.c
index 8a8bc05..f419407 100644
--- a/bcache-run.c
+++ b/bcache-run.c
@@ -11,7 +11,7 @@
#include <uuid/uuid.h>
-#include "bcache.h"
+#include "bcache-cmds.h"
int cmd_run(int argc, char *argv[])
{
@@ -25,7 +25,7 @@ int cmd_stop(int argc, char *argv[])
struct bcache_handle fs = bcache_fs_open(argv[1]);
- if (ioctl(fs.fd, BCH_IOCTL_STOP))
+ if (ioctl(fs.ioctl_fd, BCH_IOCTL_STOP))
die("BCH_IOCTL_STOP error: %s", strerror(errno));
return 0;
diff --git a/bcache-userspace-shim.c b/bcache-userspace-shim.c
new file mode 100644
index 0000000..c4a5466
--- /dev/null
+++ b/bcache-userspace-shim.c
@@ -0,0 +1,143 @@
+
+#include <errno.h>
+#include <linux/byteorder.h>
+#include <linux/types.h>
+
+/* stub out the bcache code we aren't building: */
+
+struct block_device;
+struct bcache_superblock;
+struct cache;
+struct cache_accounting;
+struct cache_set;
+struct closure;
+struct file;
+struct kobject;
+
+struct kmem_cache *bch_search_cache;
+
+const char *bch_backing_dev_register(struct bcache_superblock *sb)
+{
+ return "not implemented";
+}
+void bch_blockdevs_stop(struct cache_set *c) {}
+int bch_blockdev_volumes_start(struct cache_set *c) { return 0; }
+void bch_attach_backing_devs(struct cache_set *c) {}
+bool bch_is_open_backing_dev(struct block_device *bdev) { return false; }
+void bch_blockdev_exit(void) {}
+int bch_blockdev_init(void) { return 0; }
+
+void bch_fs_exit(void) {}
+int bch_fs_init(void) { return 0; }
+
+const struct file_operations bch_chardev_fops;
+
+void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
+ u64 offset, int nr_sectors) {}
+void bch_writeback_recalc_oldest_gens(struct cache_set *c) {}
+
+void bch_notify_cache_set_read_write(struct cache_set *c) {}
+void bch_notify_cache_set_read_only(struct cache_set *c) {}
+void bch_notify_cache_set_stopped(struct cache_set *c) {}
+void bch_notify_cache_read_write(struct cache *c) {}
+void bch_notify_cache_read_only(struct cache *c) {}
+void bch_notify_cache_added(struct cache *c) {}
+void bch_notify_cache_removing(struct cache *c) {}
+void bch_notify_cache_removed(struct cache *c) {}
+void bch_notify_cache_remove_failed(struct cache *c) {}
+void bch_notify_cache_error(struct cache *c, bool b) {}
+
+int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
+ struct kobject *parent) { return 0; }
+void bch_cache_accounting_destroy(struct cache_accounting *acc) {}
+void bch_cache_accounting_init(struct cache_accounting *acc,
+ struct closure *parent) {}
+
+//#include "acl.c"
+#include "alloc.c"
+#include "bkey.c"
+#include "bkey_methods.c"
+//#include "blockdev.c"
+#include "bset.c"
+#include "btree_cache.c"
+#include "btree_gc.c"
+#include "btree_io.c"
+#include "btree_iter.c"
+#include "btree_update.c"
+#include "buckets.c"
+//#include "chardev.c"
+#include "checksum.c"
+#include "clock.c"
+#include "closure.c"
+#include "compress.c"
+#include "debug.c"
+#include "dirent.c"
+#include "error.c"
+#include "extents.c"
+//#include "fs.c"
+#include "fs-gc.c"
+//#include "fs-io.c"
+#include "inode.c"
+#include "io.c"
+#include "journal.c"
+#include "keybuf.c"
+#include "keylist.c"
+#include "migrate.c"
+#include "move.c"
+#include "movinggc.c"
+//#include "notify.c"
+#include "opts.c"
+//#include "request.c"
+#include "siphash.c"
+#include "six.c"
+//#include "stats.c"
+#include "super.c"
+//#include "sysfs.c"
+#include "tier.c"
+#include "trace.c"
+#include "util.c"
+//#include "writeback.c"
+#include "xattr.c"
+
+#define SHIM_KTYPE(type) \
+struct kobj_type type ## _ktype = { .release = type ## _release, }
+
+static void bch_cache_set_internal_release(struct kobject *k) {}
+
+static void bch_cache_set_opts_dir_release(struct kobject *k) {}
+
+static void bch_cache_set_time_stats_release(struct kobject *k) {}
+
+SHIM_KTYPE(bch_cache);
+SHIM_KTYPE(bch_cache_set);
+SHIM_KTYPE(bch_cache_set_internal);
+SHIM_KTYPE(bch_cache_set_time_stats);
+SHIM_KTYPE(bch_cache_set_opts_dir);
+
+//#include "tools-util.h"
+
+int cmd_fsck(int argc, char *argv[])
+{
+ DECLARE_COMPLETION_ONSTACK(shutdown);
+ struct cache_set_opts opts = cache_set_opts_empty();
+ struct cache_set *c = NULL;
+ const char *err;
+
+ printf("registering %s...\n", argv[1]);
+
+ err = bch_register_cache_set(argv + 1, argc - 1, opts, &c);
+ if (err) {
+ BUG_ON(c);
+ fprintf(stderr, "error opening %s: %s\n", argv[1], err);
+ exit(EXIT_FAILURE);
+ }
+
+ c->stop_completion = &shutdown;
+ bch_cache_set_stop(c);
+ closure_put(&c->cl);
+
+ /* Killable? */
+ wait_for_completion(&shutdown);
+
+ return 0;
+}
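
In cmd_fsck() above, teardown is asynchronous: the on-stack completion is handed to the cache set as stop_completion, and wait_for_completion() blocks until the stop path signals it. A minimal, illustrative sketch of that handshake using the shim's completion primitives (linux/completion.c in this patch); kthread_run() is assumed to behave as its kernel counterpart:

    #include <linux/completion.h>
    #include <linux/kthread.h>

    /* Worker thread: does its teardown, then wakes whoever is waiting. */
    static int stop_worker(void *p)
    {
        struct completion *done = p;

        /* ... release resources here ... */
        complete(done);                 /* wakes wait_for_completion() below */
        return 0;
    }

    static void shutdown_example(void)
    {
        DECLARE_COMPLETION_ONSTACK(done);

        kthread_run(stop_worker, &done, "stop_worker");
        wait_for_completion(&done);     /* returns once stop_worker() has signalled */
    }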
diff --git a/bcache.c b/bcache.c
index 177ef52..9f09319 100644
--- a/bcache.c
+++ b/bcache.c
@@ -21,37 +21,42 @@
#include <sys/types.h>
#include <sys/stat.h>
-#include "bcache.h"
+#include "bcache-cmds.h"
static void usage(void)
{
puts("bcache - tool for managing bcache volumes/filesystems\n"
"usage: bcache <command> [<args>]\n"
"\n"
- "Commands for formatting, startup and shutdown\n"
+ "Commands for formatting, startup and shutdown:\n"
" format Format a new filesystem\n"
" assemble Assemble an existing multi device filesystem\n"
" incremental Incrementally assemble an existing multi device filesystem\n"
" run Start a partially assembled filesystem\n"
" stop Stop a running filesystem\n"
"\n"
- "Commands for managing a running filesystem\n"
+ "Commands for managing a running filesystem:\n"
" fs_show Show various information about a filesystem\n"
" fs_set Modify filesystem options\n"
"\n"
- "Commands for managing a specific device in a filesystem\n"
+ "Commands for managing a specific device in a filesystem:\n"
" device_show Show information about a formatted device\n"
" device_add Add a device to an existing (running) filesystem\n"
- " device_remove Remove a device from an existing (running) filesystem\n");
- exit(EXIT_SUCCESS);
+ " device_remove Remove a device from an existing (running) filesystem\n"
+ "\n"
+ "Repair:\n"
+ " bcache fsck Check an existing filesystem for errors\n");
}
int main(int argc, char *argv[])
{
char *cmd;
+ setvbuf(stdout, NULL, _IOLBF, 0);
+
if (argc < 2) {
printf("%s: missing command\n", argv[0]);
+ usage();
exit(EXIT_FAILURE);
}
@@ -83,6 +88,9 @@ int main(int argc, char *argv[])
if (!strcmp(cmd, "device_remove"))
return cmd_device_remove(argc, argv);
+ if (!strcmp(cmd, "fsck"))
+ return cmd_fsck(argc, argv);
+
usage();
return 0;
}
diff --git a/ccan/ilog/LICENSE b/ccan/ilog/LICENSE
deleted file mode 100644
index feb9b11..0000000
--- a/ccan/ilog/LICENSE
+++ /dev/null
@@ -1,28 +0,0 @@
-Statement of Purpose
-
-The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work").
-
-Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others.
-
-For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights.
-
-1. Copyright and Related Rights. A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following:
-
- the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work;
- moral rights retained by the original author(s) and/or performer(s);
- publicity and privacy rights pertaining to a person's image or likeness depicted in a Work;
- rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below;
- rights protecting the extraction, dissemination, use and reuse of data in a Work;
- database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and
- other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof.
-
-2. Waiver. To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose.
-
-3. Public License Fallback. Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose.
-
-4. Limitations and Disclaimers.
-
- No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document.
- Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law.
- Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work.
- Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work.
diff --git a/ccan/ilog/_info b/ccan/ilog/_info
deleted file mode 100644
index f1f3f2d..0000000
--- a/ccan/ilog/_info
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * ilog - Integer logarithm.
- *
- * ilog_32() and ilog_64() compute the minimum number of bits required to store
- * an unsigned 32-bit or 64-bit value without any leading zero bits.
- *
- * This can also be thought of as the location of the highest set bit, with
- * counting starting from one (so that 0 returns 0, 1 returns 1, and 2**31
- * returns 32).
- *
- * When the value is known to be non-zero ilog32_nz() and ilog64_nz() can
- * compile into as few as two instructions, one of which may get optimized out
- * later.
- *
- * STATIC_ILOG_32 and STATIC_ILOG_64 allow computation on compile-time
- * constants, so other compile-time constants can be derived from them.
- *
- * Example:
- * #include <stdio.h>
- * #include <limits.h>
- * #include <ccan/ilog/ilog.h>
- *
- * int main(void){
- * int i;
- * printf("ilog32(0x%08X)=%i\n",0,ilog32(0));
- * for(i=1;i<=STATIC_ILOG_32(USHRT_MAX);i++){
- * uint32_t v;
- * v=(uint32_t)1U<<(i-1);
- * //Here we know v is non-zero, so we can use ilog32_nz().
- * printf("ilog32(0x%08X)=%i\n",v,ilog32_nz(v));
- * }
- * return 0;
- * }
- *
- * License: CC0 (Public domain)
- * Author: Timothy B. Terriberry <tterribe@xiph.org>
- */
-#include "config.h"
-#include <string.h>
-#include <stdio.h>
-
-int main(int _argc,const char *_argv[]){
- /*Expect exactly one argument.*/
- if(_argc!=2)return 1;
- if(strcmp(_argv[1],"depends")==0){
- printf("ccan/compiler\n");
- return 0;
- }
- return 1;
-}
diff --git a/ccan/ilog/ilog.c b/ccan/ilog/ilog.c
deleted file mode 100644
index 5f5122d..0000000
--- a/ccan/ilog/ilog.c
+++ /dev/null
@@ -1,141 +0,0 @@
-/*(C) Timothy B. Terriberry (tterribe@xiph.org) 2001-2009 CC0 (Public domain).
- * See LICENSE file for details. */
-#include "ilog.h"
-#include <limits.h>
-
-/*The fastest fallback strategy for platforms with fast multiplication appears
- to be based on de Bruijn sequences~\cite{LP98}.
- Tests confirmed this to be true even on an ARM11, where it is actually faster
- than using the native clz instruction.
- Define ILOG_NODEBRUIJN to use a simpler fallback on platforms where
- multiplication or table lookups are too expensive.
-
- @UNPUBLISHED{LP98,
- author="Charles E. Leiserson and Harald Prokop",
- title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
- month=Jun,
- year=1998,
- note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
- }*/
-static UNNEEDED const unsigned char DEBRUIJN_IDX32[32]={
- 0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
- 31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
-};
-
-/* We always compile these in, in case someone takes address of function. */
-#undef ilog32_nz
-#undef ilog32
-#undef ilog64_nz
-#undef ilog64
-
-int ilog32(uint32_t _v){
-/*On a Pentium M, this branchless version tested as the fastest version without
- multiplications on 1,000,000,000 random 32-bit integers, edging out a
- similar version with branches, and a 256-entry LUT version.*/
-# if defined(ILOG_NODEBRUIJN)
- int ret;
- int m;
- ret=_v>0;
- m=(_v>0xFFFFU)<<4;
- _v>>=m;
- ret|=m;
- m=(_v>0xFFU)<<3;
- _v>>=m;
- ret|=m;
- m=(_v>0xFU)<<2;
- _v>>=m;
- ret|=m;
- m=(_v>3)<<1;
- _v>>=m;
- ret|=m;
- ret+=_v>1;
- return ret;
-/*This de Bruijn sequence version is faster if you have a fast multiplier.*/
-# else
- int ret;
- ret=_v>0;
- _v|=_v>>1;
- _v|=_v>>2;
- _v|=_v>>4;
- _v|=_v>>8;
- _v|=_v>>16;
- _v=(_v>>1)+1;
- ret+=DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F];
- return ret;
-# endif
-}
-
-int ilog32_nz(uint32_t _v)
-{
- return ilog32(_v);
-}
-
-int ilog64(uint64_t _v){
-# if defined(ILOG_NODEBRUIJN)
- uint32_t v;
- int ret;
- int m;
- ret=_v>0;
- m=(_v>0xFFFFFFFFU)<<5;
- v=(uint32_t)(_v>>m);
- ret|=m;
- m=(v>0xFFFFU)<<4;
- v>>=m;
- ret|=m;
- m=(v>0xFFU)<<3;
- v>>=m;
- ret|=m;
- m=(v>0xFU)<<2;
- v>>=m;
- ret|=m;
- m=(v>3)<<1;
- v>>=m;
- ret|=m;
- ret+=v>1;
- return ret;
-# else
-/*If we don't have a 64-bit word, split it into two 32-bit halves.*/
-# if LONG_MAX<9223372036854775807LL
- uint32_t v;
- int ret;
- int m;
- ret=_v>0;
- m=(_v>0xFFFFFFFFU)<<5;
- v=(uint32_t)(_v>>m);
- ret|=m;
- v|=v>>1;
- v|=v>>2;
- v|=v>>4;
- v|=v>>8;
- v|=v>>16;
- v=(v>>1)+1;
- ret+=DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
- return ret;
-/*Otherwise do it in one 64-bit operation.*/
-# else
- static const unsigned char DEBRUIJN_IDX64[64]={
- 0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
- 5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
- 63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
- 62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
- };
- int ret;
- ret=_v>0;
- _v|=_v>>1;
- _v|=_v>>2;
- _v|=_v>>4;
- _v|=_v>>8;
- _v|=_v>>16;
- _v|=_v>>32;
- _v=(_v>>1)+1;
- ret+=DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F];
- return ret;
-# endif
-# endif
-}
-
-int ilog64_nz(uint64_t _v)
-{
- return ilog64(_v);
-}
-
diff --git a/ccan/ilog/ilog.h b/ccan/ilog/ilog.h
deleted file mode 100644
index 9adbb82..0000000
--- a/ccan/ilog/ilog.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/* CC0 (Public domain) - see LICENSE file for details */
-#if !defined(_ilog_H)
-# define _ilog_H (1)
-# include "config.h"
-# include <stdint.h>
-# include <limits.h>
-# include <ccan/compiler/compiler.h>
-
-/**
- * ilog32 - Integer binary logarithm of a 32-bit value.
- * @_v: A 32-bit value.
- * Returns floor(log2(_v))+1, or 0 if _v==0.
- * This is the number of bits that would be required to represent _v in two's
- * complement notation with all of the leading zeros stripped.
- * Note that many uses will resolve to the fast macro version instead.
- *
- * See Also:
- * ilog32_nz(), ilog64()
- *
- * Example:
- * // Rounds up to next power of 2 (if not a power of 2).
- * static uint32_t round_up32(uint32_t i)
- * {
- * assert(i != 0);
- * return 1U << ilog32(i-1);
- * }
- */
-int ilog32(uint32_t _v) CONST_FUNCTION;
-
-/**
- * ilog32_nz - Integer binary logarithm of a non-zero 32-bit value.
- * @_v: A 32-bit value.
- * Returns floor(log2(_v))+1, or undefined if _v==0.
- * This is the number of bits that would be required to represent _v in two's
- * complement notation with all of the leading zeros stripped.
- * Note that many uses will resolve to the fast macro version instead.
- * See Also:
- * ilog32(), ilog64_nz()
- * Example:
- * // Find Last Set (ie. highest bit set, 0 to 31).
- * static uint32_t fls32(uint32_t i)
- * {
- * assert(i != 0);
- * return ilog32_nz(i) - 1;
- * }
- */
-int ilog32_nz(uint32_t _v) CONST_FUNCTION;
-
-/**
- * ilog64 - Integer binary logarithm of a 64-bit value.
- * @_v: A 64-bit value.
- * Returns floor(log2(_v))+1, or 0 if _v==0.
- * This is the number of bits that would be required to represent _v in two's
- * complement notation with all of the leading zeros stripped.
- * Note that many uses will resolve to the fast macro version instead.
- * See Also:
- * ilog64_nz(), ilog32()
- */
-int ilog64(uint64_t _v) CONST_FUNCTION;
-
-/**
- * ilog64_nz - Integer binary logarithm of a non-zero 64-bit value.
- * @_v: A 64-bit value.
- * Returns floor(log2(_v))+1, or undefined if _v==0.
- * This is the number of bits that would be required to represent _v in two's
- * complement notation with all of the leading zeros stripped.
- * Note that many uses will resolve to the fast macro version instead.
- * See Also:
- * ilog64(), ilog32_nz()
- */
-int ilog64_nz(uint64_t _v) CONST_FUNCTION;
-
-/**
- * STATIC_ILOG_32 - The integer logarithm of an (unsigned, 32-bit) constant.
- * @_v: A non-negative 32-bit constant.
- * Returns floor(log2(_v))+1, or 0 if _v==0.
- * This is the number of bits that would be required to represent _v in two's
- * complement notation with all of the leading zeros stripped.
- * This macro should only be used when you need a compile-time constant,
- * otherwise ilog32 or ilog32_nz are just as fast and more flexible.
- *
- * Example:
- * #define MY_PAGE_SIZE 4096
- * #define MY_PAGE_BITS (STATIC_ILOG_32(PAGE_SIZE) - 1)
- */
-#define STATIC_ILOG_32(_v) (STATIC_ILOG5((uint32_t)(_v)))
-
-/**
- * STATIC_ILOG_64 - The integer logarithm of an (unsigned, 64-bit) constant.
- * @_v: A non-negative 64-bit constant.
- * Returns floor(log2(_v))+1, or 0 if _v==0.
- * This is the number of bits that would be required to represent _v in two's
- * complement notation with all of the leading zeros stripped.
- * This macro should only be used when you need a compile-time constant,
- * otherwise ilog64 or ilog64_nz are just as fast and more flexible.
- */
-#define STATIC_ILOG_64(_v) (STATIC_ILOG6((uint64_t)(_v)))
-
-/* Private implementation details */
-
-/*Note the casts to (int) below: this prevents "upgrading"
- the type of an entire expression to an (unsigned) size_t.*/
-#if INT_MAX>=2147483647 && HAVE_BUILTIN_CLZ
-#define builtin_ilog32_nz(v) \
- (((int)sizeof(unsigned)*CHAR_BIT) - __builtin_clz(v))
-#elif LONG_MAX>=2147483647L && HAVE_BUILTIN_CLZL
-#define builtin_ilog32_nz(v) \
- (((int)sizeof(unsigned)*CHAR_BIT) - __builtin_clzl(v))
-#endif
-
-#if INT_MAX>=9223372036854775807LL && HAVE_BUILTIN_CLZ
-#define builtin_ilog64_nz(v) \
- (((int)sizeof(unsigned)*CHAR_BIT) - __builtin_clz(v))
-#elif LONG_MAX>=9223372036854775807LL && HAVE_BUILTIN_CLZL
-#define builtin_ilog64_nz(v) \
- (((int)sizeof(unsigned long)*CHAR_BIT) - __builtin_clzl(v))
-#elif HAVE_BUILTIN_CLZLL
-#define builtin_ilog64_nz(v) \
- (((int)sizeof(unsigned long long)*CHAR_BIT) - __builtin_clzll(v))
-#endif
-
-#ifdef builtin_ilog32_nz
-#define ilog32(_v) (builtin_ilog32_nz(_v)&-!!(_v))
-#define ilog32_nz(_v) builtin_ilog32_nz(_v)
-#else
-#define ilog32_nz(_v) ilog32(_v)
-#define ilog32(_v) (IS_COMPILE_CONSTANT(_v) ? STATIC_ILOG_32(_v) : ilog32(_v))
-#endif /* builtin_ilog32_nz */
-
-#ifdef builtin_ilog64_nz
-#define ilog64(_v) (builtin_ilog64_nz(_v)&-!!(_v))
-#define ilog64_nz(_v) builtin_ilog64_nz(_v)
-#else
-#define ilog64_nz(_v) ilog64(_v)
-#define ilog64(_v) (IS_COMPILE_CONSTANT(_v) ? STATIC_ILOG_64(_v) : ilog64(_v))
-#endif /* builtin_ilog64_nz */
-
-/* Macros for evaluating compile-time constant ilog. */
-# define STATIC_ILOG0(_v) (!!(_v))
-# define STATIC_ILOG1(_v) (((_v)&0x2)?2:STATIC_ILOG0(_v))
-# define STATIC_ILOG2(_v) (((_v)&0xC)?2+STATIC_ILOG1((_v)>>2):STATIC_ILOG1(_v))
-# define STATIC_ILOG3(_v) \
- (((_v)&0xF0)?4+STATIC_ILOG2((_v)>>4):STATIC_ILOG2(_v))
-# define STATIC_ILOG4(_v) \
- (((_v)&0xFF00)?8+STATIC_ILOG3((_v)>>8):STATIC_ILOG3(_v))
-# define STATIC_ILOG5(_v) \
- (((_v)&0xFFFF0000)?16+STATIC_ILOG4((_v)>>16):STATIC_ILOG4(_v))
-# define STATIC_ILOG6(_v) \
- (((_v)&0xFFFFFFFF00000000ULL)?32+STATIC_ILOG5((_v)>>32):STATIC_ILOG5(_v))
-
-#endif /* _ilog_H */
diff --git a/ccan/ilog/test/run-out-of-line.c b/ccan/ilog/test/run-out-of-line.c
deleted file mode 100644
index 48205d3..0000000
--- a/ccan/ilog/test/run-out-of-line.c
+++ /dev/null
@@ -1,65 +0,0 @@
-#include <ccan/ilog/ilog.h>
-#include <ccan/ilog/ilog.c>
-#include <stdio.h>
-#include <ccan/tap/tap.h>
-
-/*Dead simple (but slow) versions to compare against.*/
-
-static int test_ilog32(uint32_t _v){
- int ret;
- for(ret=0;_v;ret++)_v>>=1;
- return ret;
-}
-
-static int test_ilog64(uint64_t _v){
- int ret;
- for(ret=0;_v;ret++)_v>>=1;
- return ret;
-}
-
-#define NTRIALS (64)
-
-int main(int _argc,const char *_argv[]){
- int i;
- int j;
- int (*il32)(uint32_t) = ilog32;
- int (*il64)(uint64_t) = ilog64;
- int (*il32_nz)(uint32_t) = ilog32_nz;
- int (*il64_nz)(uint64_t) = ilog64_nz;
-
- /*This is how many tests you plan to run.*/
- plan_tests(33 * NTRIALS * 3 + 65 * NTRIALS * 3);
- for(i=0;i<=32;i++){
- uint32_t v;
- /*Test each bit in turn (and 0).*/
- v=i?(uint32_t)1U<<(i-1):0;
- for(j=0;j<NTRIALS;j++){
- int l;
- l=test_ilog32(v);
- ok1(STATIC_ILOG_32(v)==l);
- ok1(il32(v)==l);
- ok1(il32_nz(v) == l || v == 0);
- /*Also try a few more pseudo-random values with at most the same number
- of bits.*/
- v=(1103515245U*v+12345U)&0xFFFFFFFFU>>((33-i)>>1)>>((32-i)>>1);
- }
- }
-
- for(i=0;i<=64;i++){
- uint64_t v;
- /*Test each bit in turn (and 0).*/
- v=i?(uint64_t)1U<<(i-1):0;
- for(j=0;j<NTRIALS;j++){
- int l;
- l=test_ilog64(v);
- ok1(STATIC_ILOG_64(v)==l);
- ok1(il64(v)==l);
- ok1(il64_nz(v) == l || v == 0);
- /*Also try a few more pseudo-random values with at most the same number
- of bits.*/
- v=(uint64_t)((2862933555777941757ULL*v+3037000493ULL)
- &0xFFFFFFFFFFFFFFFFULL>>((65-i)>>1)>>((64-i)>>1));
- }
- }
- return exit_status();
-}
diff --git a/ccan/ilog/test/run.c b/ccan/ilog/test/run.c
deleted file mode 100644
index bda59f9..0000000
--- a/ccan/ilog/test/run.c
+++ /dev/null
@@ -1,60 +0,0 @@
-#include <ccan/ilog/ilog.h>
-#include <ccan/ilog/ilog.c>
-#include <stdio.h>
-#include <ccan/tap/tap.h>
-
-/*Dead simple (but slow) versions to compare against.*/
-
-static int test_ilog32(uint32_t _v){
- int ret;
- for(ret=0;_v;ret++)_v>>=1;
- return ret;
-}
-
-static int test_ilog64(uint64_t _v){
- int ret;
- for(ret=0;_v;ret++)_v>>=1;
- return ret;
-}
-
-#define NTRIALS (64)
-
-int main(int _argc,const char *_argv[]){
- int i;
- int j;
- /*This is how many tests you plan to run.*/
- plan_tests(33 * NTRIALS * 3 + 65 * NTRIALS * 3);
- for(i=0;i<=32;i++){
- uint32_t v;
- /*Test each bit in turn (and 0).*/
- v=i?(uint32_t)1U<<(i-1):0;
- for(j=0;j<NTRIALS;j++){
- int l;
- l=test_ilog32(v);
- ok1(STATIC_ILOG_32(v)==l);
- ok1(ilog32(v)==l);
- ok1(ilog32_nz(v) == l || v == 0);
- /*Also try a few more pseudo-random values with at most the same number
- of bits.*/
- v=(1103515245U*v+12345U)&0xFFFFFFFFU>>((33-i)>>1)>>((32-i)>>1);
- }
- }
-
- for(i=0;i<=64;i++){
- uint64_t v;
- /*Test each bit in turn (and 0).*/
- v=i?(uint64_t)1U<<(i-1):0;
- for(j=0;j<NTRIALS;j++){
- int l;
- l=test_ilog64(v);
- ok1(STATIC_ILOG_64(v)==l);
- ok1(ilog64(v)==l);
- ok1(ilog64_nz(v) == l || v == 0);
- /*Also try a few more pseudo-random values with at most the same number
- of bits.*/
- v=(uint64_t)((2862933555777941757ULL*v+3037000493ULL)
- &0xFFFFFFFFFFFFFFFFULL>>((65-i)>>1)>>((64-i)>>1));
- }
- }
- return exit_status();
-}
diff --git a/debian/control b/debian/control
index 13622a6..9bbc6f8 100644
--- a/debian/control
+++ b/debian/control
@@ -5,7 +5,7 @@ Section: utils
Priority: optional
Standards-Version: 3.9.5
Build-Depends: debhelper (>= 9), pkg-config, libblkid-dev, uuid-dev,
- libscrypt-dev, libsodium-dev, libkeyutils-dev
+ libscrypt-dev, libsodium-dev, libkeyutils-dev, liburcu-dev
Vcs-Browser: http://anonscm.debian.org/gitweb/?p=collab-maint/bcache-tools.git
Vcs-Git: git://anonscm.debian.org/collab-maint/bcache-tools.git
Homepage: http://bcache.evilpiepirate.org/
diff --git a/fsck.bcache b/fsck.bcache
new file mode 100755
index 0000000..17abea9
--- /dev/null
+++ b/fsck.bcache
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+exec bcache fsck "$@"
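
This wrapper exists so the stock fsck(8) front end, which dispatches to /sbin/fsck.<fstype> helpers, can check bcache filesystems: fsck.bcache simply execs the new "bcache fsck" subcommand with the same arguments.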
diff --git a/include/asm/unaligned.h b/include/asm/unaligned.h
new file mode 100644
index 0000000..ced1a29
--- /dev/null
+++ b/include/asm/unaligned.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_UNALIGNED_H
+#define _ASM_UNALIGNED_H
+
+#if defined(__LITTLE_ENDIAN)
+# include <linux/unaligned/le_struct.h>
+# include <linux/unaligned/be_byteshift.h>
+# include <linux/unaligned/generic.h>
+# define get_unaligned __get_unaligned_le
+# define put_unaligned __put_unaligned_le
+#elif defined(__BIG_ENDIAN)
+# include <linux/unaligned/be_struct.h>
+# include <linux/unaligned/le_byteshift.h>
+# include <linux/unaligned/generic.h>
+# define get_unaligned __get_unaligned_be
+# define put_unaligned __put_unaligned_be
+#else
+# error need to define endianess
+#endif
+
+#endif /* _ASM_UNALIGNED_H */
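
The header above selects byte-order-appropriate helpers and exposes them as get_unaligned()/put_unaligned(), giving the shimmed kernel code safe loads and stores at arbitrary offsets. An illustrative sketch (not from this patch):

    #include <asm/unaligned.h>
    #include <linux/types.h>

    /* Read and write a u32 at an arbitrary (possibly odd) offset in a byte buffer. */
    static u32 read_u32_at(const u8 *buf, unsigned off)
    {
        return get_unaligned((const u32 *) (buf + off));
    }

    static void write_u32_at(u8 *buf, unsigned off, u32 v)
    {
        put_unaligned(v, (u32 *) (buf + off));
    }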
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
new file mode 100644
index 0000000..31f453e
--- /dev/null
+++ b/include/crypto/algapi.h
@@ -0,0 +1,212 @@
+/*
+ * Cryptographic API for algorithms (i.e., low-level API).
+ *
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#ifndef _CRYPTO_ALGAPI_H
+#define _CRYPTO_ALGAPI_H
+
+#include <linux/crypto.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+
+struct crypto_aead;
+struct crypto_instance;
+struct module;
+struct rtattr;
+struct seq_file;
+struct sk_buff;
+
+struct crypto_type {
+ unsigned int (*ctxsize)(struct crypto_alg *alg, u32 type, u32 mask);
+ unsigned int (*extsize)(struct crypto_alg *alg);
+ int (*init)(struct crypto_tfm *tfm, u32 type, u32 mask);
+ int (*init_tfm)(struct crypto_tfm *tfm);
+ void (*show)(struct seq_file *m, struct crypto_alg *alg);
+ struct crypto_alg *(*lookup)(const char *name, u32 type, u32 mask);
+ void (*free)(struct crypto_instance *inst);
+
+ unsigned int type;
+ unsigned int maskclear;
+ unsigned int maskset;
+ unsigned int tfmsize;
+};
+
+struct crypto_instance {
+ struct crypto_alg alg;
+
+ struct crypto_template *tmpl;
+ struct hlist_node list;
+
+ void *__ctx[] CRYPTO_MINALIGN_ATTR;
+};
+
+struct crypto_template {
+ struct list_head list;
+ struct hlist_head instances;
+ struct module *module;
+
+ struct crypto_instance *(*alloc)(struct rtattr **tb);
+ void (*free)(struct crypto_instance *inst);
+ int (*create)(struct crypto_template *tmpl, struct rtattr **tb);
+
+ char name[CRYPTO_MAX_ALG_NAME];
+};
+
+struct scatter_walk {
+ struct scatterlist *sg;
+ unsigned int offset;
+};
+
+struct blkcipher_walk {
+ union {
+ struct {
+ struct page *page;
+ unsigned long offset;
+ } phys;
+
+ struct {
+ u8 *page;
+ u8 *addr;
+ } virt;
+ } src, dst;
+
+ struct scatter_walk in;
+ unsigned int nbytes;
+
+ struct scatter_walk out;
+ unsigned int total;
+
+ void *page;
+ u8 *buffer;
+ u8 *iv;
+ unsigned int ivsize;
+
+ int flags;
+ unsigned int walk_blocksize;
+ unsigned int cipher_blocksize;
+ unsigned int alignmask;
+};
+
+extern const struct crypto_type crypto_blkcipher_type;
+
+struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb);
+int crypto_check_attr_type(struct rtattr **tb, u32 type);
+const char *crypto_attr_alg_name(struct rtattr *rta);
+struct crypto_alg *crypto_attr_alg2(struct rtattr *rta,
+ const struct crypto_type *frontend,
+ u32 type, u32 mask);
+
+static inline struct crypto_alg *crypto_attr_alg(struct rtattr *rta,
+ u32 type, u32 mask)
+{
+ return crypto_attr_alg2(rta, NULL, type, mask);
+}
+
+int crypto_attr_u32(struct rtattr *rta, u32 *num);
+
+/* These functions require the input/output to be aligned as u32. */
+void crypto_inc(u8 *a, unsigned int size);
+void crypto_xor(u8 *dst, const u8 *src, unsigned int size);
+
+int blkcipher_walk_done(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk, int err);
+int blkcipher_walk_virt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk);
+int blkcipher_walk_phys(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk);
+int blkcipher_walk_virt_block(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk,
+ unsigned int blocksize);
+int blkcipher_aead_walk_virt_block(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk,
+ struct crypto_aead *tfm,
+ unsigned int blocksize);
+
+static inline void *crypto_tfm_ctx_aligned(struct crypto_tfm *tfm)
+{
+ return PTR_ALIGN(crypto_tfm_ctx(tfm),
+ crypto_tfm_alg_alignmask(tfm) + 1);
+}
+
+static inline struct crypto_instance *crypto_tfm_alg_instance(
+ struct crypto_tfm *tfm)
+{
+ return container_of(tfm->__crt_alg, struct crypto_instance, alg);
+}
+
+static inline void *crypto_instance_ctx(struct crypto_instance *inst)
+{
+ return inst->__ctx;
+}
+
+static inline void *crypto_blkcipher_ctx(struct crypto_blkcipher *tfm)
+{
+ return crypto_tfm_ctx(&tfm->base);
+}
+
+static inline void *crypto_blkcipher_ctx_aligned(struct crypto_blkcipher *tfm)
+{
+ return crypto_tfm_ctx_aligned(&tfm->base);
+}
+
+static inline struct cipher_alg *crypto_cipher_alg(struct crypto_cipher *tfm)
+{
+ return &crypto_cipher_tfm(tfm)->__crt_alg->cra_cipher;
+}
+
+static inline void blkcipher_walk_init(struct blkcipher_walk *walk,
+ struct scatterlist *dst,
+ struct scatterlist *src,
+ unsigned int nbytes)
+{
+ walk->in.sg = src;
+ walk->out.sg = dst;
+ walk->total = nbytes;
+}
+
+static inline struct crypto_alg *crypto_get_attr_alg(struct rtattr **tb,
+ u32 type, u32 mask)
+{
+ return crypto_attr_alg(tb[1], type, mask);
+}
+
+static inline int crypto_requires_sync(u32 type, u32 mask)
+{
+ return (type ^ CRYPTO_ALG_ASYNC) & mask & CRYPTO_ALG_ASYNC;
+}
+
+noinline unsigned long __crypto_memneq(const void *a, const void *b, size_t size);
+
+/**
+ * crypto_memneq - Compare two areas of memory without leaking
+ * timing information.
+ *
+ * @a: One area of memory
+ * @b: Another area of memory
+ * @size: The size of the area.
+ *
+ * Returns 0 when data is equal, 1 otherwise.
+ */
+static inline int crypto_memneq(const void *a, const void *b, size_t size)
+{
+ return __crypto_memneq(a, b, size) != 0UL ? 1 : 0;
+}
+
+static inline void crypto_yield(u32 flags)
+{
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+ if (flags & CRYPTO_TFM_REQ_MAY_SLEEP)
+ cond_resched();
+#endif
+}
+
+#endif /* _CRYPTO_ALGAPI_H */
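
crypto_memneq(), declared above, compares two buffers without bailing out at the first differing byte, so the comparison time does not reveal how much of a MAC or digest matched. Illustrative use (not taken from this patch):

    #include <crypto/algapi.h>
    #include <linux/types.h>

    /* True only if the two tags are byte-for-byte identical; runtime does not
     * depend on where they first differ. */
    static bool tags_equal(const u8 *expected, const u8 *got, size_t len)
    {
        return !crypto_memneq(expected, got, len);
    }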
diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
new file mode 100644
index 0000000..20d20f6
--- /dev/null
+++ b/include/crypto/chacha20.h
@@ -0,0 +1,26 @@
+/*
+ * Common values for the ChaCha20 algorithm
+ */
+
+#ifndef _CRYPTO_CHACHA20_H
+#define _CRYPTO_CHACHA20_H
+
+#include <linux/types.h>
+#include <linux/crypto.h>
+
+#define CHACHA20_IV_SIZE 16
+#define CHACHA20_KEY_SIZE 32
+#define CHACHA20_BLOCK_SIZE 64
+
+struct chacha20_ctx {
+ u32 key[8];
+};
+
+void chacha20_block(u32 *state, void *stream);
+void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
+int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keysize);
+int crypto_chacha20_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes);
+
+#endif
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
new file mode 100644
index 0000000..00bd4e7
--- /dev/null
+++ b/include/crypto/hash.h
@@ -0,0 +1,181 @@
+/*
+ * Hash: Hash algorithms under the crypto API
+ *
+ * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#ifndef _CRYPTO_HASH_H
+#define _CRYPTO_HASH_H
+
+#include <linux/crypto.h>
+#include <linux/string.h>
+
+struct hash_alg_common {
+ unsigned int digestsize;
+ unsigned int statesize;
+
+ struct crypto_alg base;
+};
+
+struct shash_desc {
+ struct crypto_shash *tfm;
+ u32 flags;
+
+ void *__ctx[] CRYPTO_MINALIGN_ATTR;
+};
+
+#define SHASH_DESC_ON_STACK(shash, ctx) \
+ char __##shash##_desc[sizeof(struct shash_desc) + \
+ crypto_shash_descsize(ctx)] CRYPTO_MINALIGN_ATTR; \
+ struct shash_desc *shash = (struct shash_desc *)__##shash##_desc
+
+struct shash_alg {
+ int (*init)(struct shash_desc *desc);
+ int (*update)(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+ int (*final)(struct shash_desc *desc, u8 *out);
+ int (*finup)(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out);
+ int (*digest)(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out);
+ int (*export)(struct shash_desc *desc, void *out);
+ int (*import)(struct shash_desc *desc, const void *in);
+ int (*setkey)(struct crypto_shash *tfm, const u8 *key,
+ unsigned int keylen);
+
+ unsigned int descsize;
+
+ /* These fields must match hash_alg_common. */
+ unsigned int digestsize
+ __attribute__ ((aligned(__alignof__(struct hash_alg_common))));
+ unsigned int statesize;
+
+ struct crypto_alg base;
+};
+
+struct crypto_shash {
+ unsigned int descsize;
+ struct crypto_tfm base;
+};
+
+struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
+ u32 mask);
+
+static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
+{
+ return &tfm->base;
+}
+
+static inline void crypto_free_shash(struct crypto_shash *tfm)
+{
+ crypto_destroy_tfm(tfm, crypto_shash_tfm(tfm));
+}
+
+static inline const char *crypto_shash_alg_name(struct crypto_shash *tfm)
+{
+ return crypto_tfm_alg_name(crypto_shash_tfm(tfm));
+}
+
+static inline const char *crypto_shash_driver_name(struct crypto_shash *tfm)
+{
+ return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm));
+}
+
+static inline unsigned int crypto_shash_alignmask(
+ struct crypto_shash *tfm)
+{
+ return crypto_tfm_alg_alignmask(crypto_shash_tfm(tfm));
+}
+
+static inline unsigned int crypto_shash_blocksize(struct crypto_shash *tfm)
+{
+ return crypto_tfm_alg_blocksize(crypto_shash_tfm(tfm));
+}
+
+static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg)
+{
+ return container_of(alg, struct shash_alg, base);
+}
+
+static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm)
+{
+ return __crypto_shash_alg(crypto_shash_tfm(tfm)->__crt_alg);
+}
+
+static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm)
+{
+ return crypto_shash_alg(tfm)->digestsize;
+}
+
+static inline unsigned int crypto_shash_statesize(struct crypto_shash *tfm)
+{
+ return crypto_shash_alg(tfm)->statesize;
+}
+
+static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm)
+{
+ return crypto_tfm_get_flags(crypto_shash_tfm(tfm));
+}
+
+static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags)
+{
+ crypto_tfm_set_flags(crypto_shash_tfm(tfm), flags);
+}
+
+static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags)
+{
+ crypto_tfm_clear_flags(crypto_shash_tfm(tfm), flags);
+}
+
+static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm)
+{
+ return tfm->descsize;
+}
+
+static inline void *shash_desc_ctx(struct shash_desc *desc)
+{
+ return desc->__ctx;
+}
+
+int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int keylen);
+
+int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out);
+
+static inline int crypto_shash_export(struct shash_desc *desc, void *out)
+{
+ return crypto_shash_alg(desc->tfm)->export(desc, out);
+}
+
+static inline int crypto_shash_import(struct shash_desc *desc, const void *in)
+{
+ return crypto_shash_alg(desc->tfm)->import(desc, in);
+}
+
+static inline int crypto_shash_init(struct shash_desc *desc)
+{
+ return crypto_shash_alg(desc->tfm)->init(desc);
+}
+
+int crypto_shash_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+int crypto_shash_final(struct shash_desc *desc, u8 *out);
+
+int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out);
+
+static inline void shash_desc_zero(struct shash_desc *desc)
+{
+ memzero_explicit(desc,
+ sizeof(*desc) + crypto_shash_descsize(desc->tfm));
+}
+
+#endif /* _CRYPTO_HASH_H */
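
The synchronous hash API above is driven by allocating a tfm, putting the descriptor on the stack with SHASH_DESC_ON_STACK(), and calling crypto_shash_digest(); the "sha1" algorithm is registered by linux/crypto/sha1_generic.c elsewhere in this patch. A hedged one-shot sketch (error handling abbreviated):

    #include <crypto/hash.h>
    #include <crypto/sha.h>
    #include <linux/err.h>

    static int sha1_oneshot(const u8 *data, unsigned int len,
                            u8 out[SHA1_DIGEST_SIZE])
    {
        struct crypto_shash *tfm = crypto_alloc_shash("sha1", 0, 0);
        int ret;

        if (IS_ERR(tfm))
            return PTR_ERR(tfm);

        {
            SHASH_DESC_ON_STACK(desc, tfm);

            desc->tfm   = tfm;
            desc->flags = 0;
            ret = crypto_shash_digest(desc, data, len, out);
            shash_desc_zero(desc);
        }

        crypto_free_shash(tfm);
        return ret;
    }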
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
new file mode 100644
index 0000000..2d85c80
--- /dev/null
+++ b/include/crypto/internal/hash.h
@@ -0,0 +1,18 @@
+#ifndef _CRYPTO_INTERNAL_HASH_H
+#define _CRYPTO_INTERNAL_HASH_H
+
+#include <crypto/algapi.h>
+#include <crypto/hash.h>
+
+int crypto_register_shash(struct shash_alg *alg);
+int crypto_unregister_shash(struct shash_alg *alg);
+int crypto_register_shashes(struct shash_alg *algs, int count);
+int crypto_unregister_shashes(struct shash_alg *algs, int count);
+
+static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm)
+{
+ return container_of(tfm, struct crypto_shash, base);
+}
+
+#endif /* _CRYPTO_INTERNAL_HASH_H */
+
diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h
new file mode 100644
index 0000000..894df59
--- /dev/null
+++ b/include/crypto/poly1305.h
@@ -0,0 +1,41 @@
+/*
+ * Common values for the Poly1305 algorithm
+ */
+
+#ifndef _CRYPTO_POLY1305_H
+#define _CRYPTO_POLY1305_H
+
+#include <linux/types.h>
+#include <linux/crypto.h>
+
+#define POLY1305_BLOCK_SIZE 16
+#define POLY1305_KEY_SIZE 32
+#define POLY1305_DIGEST_SIZE 16
+
+struct poly1305_desc_ctx {
+ /* key */
+ u32 r[5];
+ /* finalize key */
+ u32 s[4];
+ /* accumulator */
+ u32 h[5];
+ /* partial buffer */
+ u8 buf[POLY1305_BLOCK_SIZE];
+ /* bytes used in partial buffer */
+ unsigned int buflen;
+ /* r key has been set */
+ bool rset;
+ /* s key has been set */
+ bool sset;
+};
+
+int crypto_poly1305_init(struct shash_desc *desc);
+int crypto_poly1305_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen);
+unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+ const u8 *src, unsigned int srclen);
+int crypto_poly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen);
+int crypto_poly1305_final(struct shash_desc *desc, u8 *dst);
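+
+/*
+ * Note: Poly1305 is a one-time authenticator. Its 32-byte key is the pair
+ * (r, s): r is the clamped multiplier accumulated into h, while s is added
+ * only at finalization, which is why the context tracks rset and sset
+ * separately.
+ */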
+
+#endif
diff --git a/include/crypto/sha.h b/include/crypto/sha.h
new file mode 100644
index 0000000..c94d3eb
--- /dev/null
+++ b/include/crypto/sha.h
@@ -0,0 +1,110 @@
+/*
+ * Common values for SHA algorithms
+ */
+
+#ifndef _CRYPTO_SHA_H
+#define _CRYPTO_SHA_H
+
+#include <linux/types.h>
+
+#define SHA1_DIGEST_SIZE 20
+#define SHA1_BLOCK_SIZE 64
+
+#define SHA224_DIGEST_SIZE 28
+#define SHA224_BLOCK_SIZE 64
+
+#define SHA256_DIGEST_SIZE 32
+#define SHA256_BLOCK_SIZE 64
+
+#define SHA384_DIGEST_SIZE 48
+#define SHA384_BLOCK_SIZE 128
+
+#define SHA512_DIGEST_SIZE 64
+#define SHA512_BLOCK_SIZE 128
+
+#define SHA1_H0 0x67452301UL
+#define SHA1_H1 0xefcdab89UL
+#define SHA1_H2 0x98badcfeUL
+#define SHA1_H3 0x10325476UL
+#define SHA1_H4 0xc3d2e1f0UL
+
+#define SHA224_H0 0xc1059ed8UL
+#define SHA224_H1 0x367cd507UL
+#define SHA224_H2 0x3070dd17UL
+#define SHA224_H3 0xf70e5939UL
+#define SHA224_H4 0xffc00b31UL
+#define SHA224_H5 0x68581511UL
+#define SHA224_H6 0x64f98fa7UL
+#define SHA224_H7 0xbefa4fa4UL
+
+#define SHA256_H0 0x6a09e667UL
+#define SHA256_H1 0xbb67ae85UL
+#define SHA256_H2 0x3c6ef372UL
+#define SHA256_H3 0xa54ff53aUL
+#define SHA256_H4 0x510e527fUL
+#define SHA256_H5 0x9b05688cUL
+#define SHA256_H6 0x1f83d9abUL
+#define SHA256_H7 0x5be0cd19UL
+
+#define SHA384_H0 0xcbbb9d5dc1059ed8ULL
+#define SHA384_H1 0x629a292a367cd507ULL
+#define SHA384_H2 0x9159015a3070dd17ULL
+#define SHA384_H3 0x152fecd8f70e5939ULL
+#define SHA384_H4 0x67332667ffc00b31ULL
+#define SHA384_H5 0x8eb44a8768581511ULL
+#define SHA384_H6 0xdb0c2e0d64f98fa7ULL
+#define SHA384_H7 0x47b5481dbefa4fa4ULL
+
+#define SHA512_H0 0x6a09e667f3bcc908ULL
+#define SHA512_H1 0xbb67ae8584caa73bULL
+#define SHA512_H2 0x3c6ef372fe94f82bULL
+#define SHA512_H3 0xa54ff53a5f1d36f1ULL
+#define SHA512_H4 0x510e527fade682d1ULL
+#define SHA512_H5 0x9b05688c2b3e6c1fULL
+#define SHA512_H6 0x1f83d9abfb41bd6bULL
+#define SHA512_H7 0x5be0cd19137e2179ULL
+
+extern const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE];
+
+extern const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE];
+
+extern const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE];
+
+struct sha1_state {
+ u32 state[SHA1_DIGEST_SIZE / 4];
+ u64 count;
+ u8 buffer[SHA1_BLOCK_SIZE];
+};
+
+struct sha256_state {
+ u32 state[SHA256_DIGEST_SIZE / 4];
+ u64 count;
+ u8 buf[SHA256_BLOCK_SIZE];
+};
+
+struct sha512_state {
+ u64 state[SHA512_DIGEST_SIZE / 8];
+ u64 count[2];
+ u8 buf[SHA512_BLOCK_SIZE];
+};
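+
+/*
+ * In each state struct, count is the running byte count of hashed input and
+ * buf/buffer holds any partial block carried between update calls.
+ */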
+
+struct shash_desc;
+
+extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+extern int crypto_sha1_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *hash);
+
+extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+extern int crypto_sha256_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *hash);
+
+extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *hash);
+#endif
diff --git a/include/crypto/sha1_base.h b/include/crypto/sha1_base.h
new file mode 100644
index 0000000..01b002d
--- /dev/null
+++ b/include/crypto/sha1_base.h
@@ -0,0 +1,107 @@
+/*
+ * sha1_base.h - core logic for SHA-1 implementations
+ *
+ * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/sha.h>
+#include <linux/byteorder.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+#include <asm/unaligned.h>
+
+typedef void (sha1_block_fn)(struct sha1_state *sst, u8 const *src, int blocks);
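+
+/*
+ * The block function is supplied by the particular SHA-1 implementation; the
+ * helpers below only buffer partial blocks and append the final padding and
+ * bit length before handing full blocks to it.
+ */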
+
+static inline int sha1_base_init(struct shash_desc *desc)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA1_H0;
+ sctx->state[1] = SHA1_H1;
+ sctx->state[2] = SHA1_H2;
+ sctx->state[3] = SHA1_H3;
+ sctx->state[4] = SHA1_H4;
+ sctx->count = 0;
+
+ return 0;
+}
+
+static inline int sha1_base_do_update(struct shash_desc *desc,
+ const u8 *data,
+ unsigned int len,
+ sha1_block_fn *block_fn)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
+
+ sctx->count += len;
+
+ if (unlikely((partial + len) >= SHA1_BLOCK_SIZE)) {
+ int blocks;
+
+ if (partial) {
+ int p = SHA1_BLOCK_SIZE - partial;
+
+ memcpy(sctx->buffer + partial, data, p);
+ data += p;
+ len -= p;
+
+ block_fn(sctx, sctx->buffer, 1);
+ }
+
+ blocks = len / SHA1_BLOCK_SIZE;
+ len %= SHA1_BLOCK_SIZE;
+
+ if (blocks) {
+ block_fn(sctx, data, blocks);
+ data += blocks * SHA1_BLOCK_SIZE;
+ }
+ partial = 0;
+ }
+ if (len)
+ memcpy(sctx->buffer + partial, data, len);
+
+ return 0;
+}
+
+static inline int sha1_base_do_finalize(struct shash_desc *desc,
+ sha1_block_fn *block_fn)
+{
+ const int bit_offset = SHA1_BLOCK_SIZE - sizeof(__be64);
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ __be64 *bits = (__be64 *)(sctx->buffer + bit_offset);
+ unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
+
+ sctx->buffer[partial++] = 0x80;
+ if (partial > bit_offset) {
+ memset(sctx->buffer + partial, 0x0, SHA1_BLOCK_SIZE - partial);
+ partial = 0;
+
+ block_fn(sctx, sctx->buffer, 1);
+ }
+
+ memset(sctx->buffer + partial, 0x0, bit_offset - partial);
+ *bits = cpu_to_be64(sctx->count << 3);
+ block_fn(sctx, sctx->buffer, 1);
+
+ return 0;
+}
+
+static inline int sha1_base_finish(struct shash_desc *desc, u8 *out)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ __be32 *digest = (__be32 *)out;
+ int i;
+
+ for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
+ put_unaligned_be32(sctx->state[i], digest++);
+
+ *sctx = (struct sha1_state){};
+ return 0;
+}
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
new file mode 100644
index 0000000..ad33ad3
--- /dev/null
+++ b/include/linux/atomic.h
@@ -0,0 +1,250 @@
+#ifndef __TOOLS_LINUX_ATOMIC_H
+#define __TOOLS_LINUX_ATOMIC_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
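+/*
+ * Userspace shim: the kernel's atomic and barrier primitives are mapped onto
+ * the GCC/clang __atomic builtins. The arithmetic helpers below use relaxed
+ * ordering; cmpxchg() returns the previous value whether or not the exchange
+ * took place, matching the kernel convention.
+ */
+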
+#define xchg(p, v) \
+ __atomic_exchange_n(p, v, __ATOMIC_SEQ_CST)
+
+#define xchg_acquire(p, v) \
+ __atomic_exchange_n(p, v, __ATOMIC_ACQUIRE)
+
+#define cmpxchg(p, old, new) \
+({ \
+ typeof(*(p)) __old = (old); \
+ \
+ __atomic_compare_exchange_n((p), &__old, new, false, \
+ __ATOMIC_SEQ_CST, \
+ __ATOMIC_SEQ_CST); \
+ __old; \
+})
+
+#define cmpxchg_acquire(p, old, new) \
+({ \
+ typeof(*(p)) __old = (old); \
+ \
+ __atomic_compare_exchange_n((p), &__old, new, false, \
+ __ATOMIC_ACQUIRE, \
+ __ATOMIC_ACQUIRE); \
+ __old; \
+})
+
+#define smp_mb__before_atomic() __atomic_thread_fence(__ATOMIC_SEQ_CST)
+#define smp_mb__after_atomic() __atomic_thread_fence(__ATOMIC_SEQ_CST)
+#define smp_wmb() __atomic_thread_fence(__ATOMIC_SEQ_CST)
+#define smp_rmb() __atomic_thread_fence(__ATOMIC_SEQ_CST)
+#define smp_mb() __atomic_thread_fence(__ATOMIC_SEQ_CST)
+#define smp_read_barrier_depends()
+
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
+
+#define smp_load_acquire(p) \
+({ \
+ typeof(*p) ___p1 = READ_ONCE(*p); \
+ smp_mb(); \
+ ___p1; \
+})
+
+#define smp_store_release(p, v) \
+do { \
+ smp_mb(); \
+ WRITE_ONCE(*p, v); \
+} while (0)
+
+typedef struct {
+ int counter;
+} atomic_t;
+
+static inline int atomic_read(const atomic_t *v)
+{
+ return __atomic_load_n(&v->counter, __ATOMIC_RELAXED);
+}
+
+static inline void atomic_set(atomic_t *v, int i)
+{
+ __atomic_store_n(&v->counter, i, __ATOMIC_RELAXED);
+}
+
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+ return __atomic_add_fetch(&v->counter, i, __ATOMIC_RELAXED);
+}
+
+static inline int atomic_sub_return(int i, atomic_t *v)
+{
+ return __atomic_sub_fetch(&v->counter, i, __ATOMIC_RELAXED);
+}
+
+static inline int atomic_add_negative(int i, atomic_t *v)
+{
+ return atomic_add_return(i, v) < 0;
+}
+
+static inline void atomic_add(int i, atomic_t *v)
+{
+ atomic_add_return(i, v);
+}
+
+static inline void atomic_sub(int i, atomic_t *v)
+{
+ atomic_sub_return(i, v);
+}
+
+static inline void atomic_inc(atomic_t *v)
+{
+ atomic_add(1, v);
+}
+
+static inline void atomic_dec(atomic_t *v)
+{
+ atomic_sub(1, v);
+}
+
+#define atomic_dec_return(v) atomic_sub_return(1, (v))
+#define atomic_inc_return(v) atomic_add_return(1, (v))
+
+#define atomic_sub_and_test(i, v) (atomic_sub_return((i), (v)) == 0)
+#define atomic_dec_and_test(v) (atomic_dec_return(v) == 0)
+#define atomic_inc_and_test(v) (atomic_inc_return(v) == 0)
+
+#define atomic_xchg(ptr, v) (xchg(&(ptr)->counter, (v)))
+#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
+
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+ int c, old;
+ c = atomic_read(v);
+ while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c)
+ c = old;
+ return c;
+}
+
+#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
+
+typedef struct {
+ long counter;
+} atomic_long_t;
+
+static inline long atomic_long_read(const atomic_long_t *v)
+{
+ return __atomic_load_n(&v->counter, __ATOMIC_RELAXED);
+}
+
+static inline void atomic_long_set(atomic_long_t *v, long i)
+{
+ __atomic_store_n(&v->counter, i, __ATOMIC_RELAXED);
+}
+
+static inline long atomic_long_add_return(long i, atomic_long_t *v)
+{
+ return __atomic_add_fetch(&v->counter, i, __ATOMIC_RELAXED);
+}
+
+static inline long atomic_long_sub_return(long i, atomic_long_t *v)
+{
+ return __atomic_sub_fetch(&v->counter, i, __ATOMIC_RELAXED);
+}
+
+static inline void atomic_long_add(long i, atomic_long_t *v)
+{
+ atomic_long_add_return(i, v);
+}
+
+static inline void atomic_long_sub(long i, atomic_long_t *v)
+{
+ atomic_long_sub_return(i, v);
+}
+
+static inline void atomic_long_inc(atomic_long_t *v)
+{
+ atomic_long_add(1, v);
+}
+
+static inline void atomic_long_dec(atomic_long_t *v)
+{
+ atomic_long_sub(1, v);
+}
+
+static inline long atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
+{
+ return cmpxchg(&v->counter, old, new);
+}
+
+static inline bool atomic_long_inc_not_zero(atomic_long_t *i)
+{
+ long old, v = atomic_long_read(i);
+
+ do {
+ if (!(old = v))
+ return false;
+ } while ((v = atomic_long_cmpxchg(i, old, old + 1)) != old);
+
+ return true;
+}
+
+#define atomic_long_sub_and_test(i, v) (atomic_long_sub_return((i), (v)) == 0)
+
+typedef struct {
+ u64 counter;
+} atomic64_t;
+
+static inline s64 atomic64_read(const atomic64_t *v)
+{
+ return __atomic_load_n(&v->counter, __ATOMIC_RELAXED);
+}
+
+static inline void atomic64_set(atomic64_t *v, s64 i)
+{
+ __atomic_store_n(&v->counter, i, __ATOMIC_RELAXED);
+}
+
+static inline s64 atomic64_add_return(s64 i, atomic64_t *v)
+{
+ return __atomic_add_fetch(&v->counter, i, __ATOMIC_RELAXED);
+}
+
+static inline s64 atomic64_sub_return(s64 i, atomic64_t *v)
+{
+ return __atomic_sub_fetch(&v->counter, i, __ATOMIC_RELAXED);
+}
+
+static inline void atomic64_add(s64 i, atomic64_t *v)
+{
+ atomic64_add_return(i, v);
+}
+
+static inline void atomic64_sub(s64 i, atomic64_t *v)
+{
+ atomic64_sub_return(i, v);
+}
+
+static inline void atomic64_inc(atomic64_t *v)
+{
+ atomic64_add(1, v);
+}
+
+static inline void atomic64_dec(atomic64_t *v)
+{
+ atomic64_sub(1, v);
+}
+
+#define atomic64_dec_return(v) atomic64_sub_return(1, (v))
+#define atomic64_inc_return(v) atomic64_add_return(1, (v))
+
+static inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
+{
+ return cmpxchg(&v->counter, old, new);
+}
+
+static inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
+{
+ return cmpxchg_acquire(&v->counter, old, new);
+}
+
+static inline s64 atomic64_add_return_release(s64 i, atomic64_t *v)
+{
+ return __atomic_add_fetch(&v->counter, i, __ATOMIC_RELEASE);
+}
+
+#endif /* __TOOLS_LINUX_ATOMIC_H */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
new file mode 100644
index 0000000..a68fca4
--- /dev/null
+++ b/include/linux/backing-dev.h
@@ -0,0 +1,42 @@
+#ifndef _LINUX_BACKING_DEV_H
+#define _LINUX_BACKING_DEV_H
+
+typedef int (congested_fn)(void *, int);
+
+enum wb_congested_state {
+ WB_async_congested, /* The async (write) queue is getting full */
+ WB_sync_congested, /* The sync queue is getting full */
+};
+
+struct backing_dev_info {
+ unsigned ra_pages;
+ unsigned capabilities;
+
+ congested_fn *congested_fn;
+ void *congested_data;
+};
+
+#define BDI_CAP_NO_ACCT_DIRTY 0x00000001
+#define BDI_CAP_NO_WRITEBACK 0x00000002
+#define BDI_CAP_NO_ACCT_WB 0x00000004
+#define BDI_CAP_STABLE_WRITES 0x00000008
+#define BDI_CAP_STRICTLIMIT 0x00000010
+#define BDI_CAP_CGROUP_WRITEBACK 0x00000020
+
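+/*
+ * Userspace stubs: there is no writeback machinery here, so the device is
+ * never reported as congested and bdi setup/teardown are no-ops.
+ */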
+static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
+{
+ return 0;
+}
+
+static inline int __must_check bdi_setup_and_register(struct backing_dev_info *bdi,
+ char *name)
+{
+ bdi->capabilities = 0;
+ return 0;
+}
+
+static inline void bdi_destroy(struct backing_dev_info *bdi) {}
+
+#define VM_MAX_READAHEAD 128 /* kbytes */
+
+#endif /* _LINUX_BACKING_DEV_H */
diff --git a/bcache-ioctl.h b/include/linux/bcache-ioctl.h
index 0b1ecfc..8ca2fdb 100644
--- a/bcache-ioctl.h
+++ b/include/linux/bcache-ioctl.h
@@ -1,6 +1,7 @@
#ifndef _LINUX_BCACHE_IOCTL_H
#define _LINUX_BCACHE_IOCTL_H
+#include <linux/bcache.h>
#include <linux/uuid.h>
#ifdef __cplusplus
diff --git a/bcache-ondisk.h b/include/linux/bcache.h
index c141923..f09a44a 100644
--- a/bcache-ondisk.h
+++ b/include/linux/bcache.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_TOOLS_ONDISK_H
-#define _BCACHE_TOOLS_ONDISK_H
+#ifndef _LINUX_BCACHE_H
+#define _LINUX_BCACHE_H
/*
* Bcache on disk data structures
@@ -102,16 +102,6 @@ struct bch_val {
__u64 __nothing[0];
};
-struct bversion {
-#if defined(__LITTLE_ENDIAN)
- __u64 low;
- __u32 high;
-#elif defined(__BIG_ENDIAN)
- __u32 high;
- __u64 low;
-#endif
-} __attribute__((packed, aligned(4)));
-
struct bkey {
__u64 _data[0];
@@ -119,7 +109,15 @@ struct bkey {
__u8 u64s;
/* Format of key (0 for format local to btree node) */
- __u8 format;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 format:7,
+ needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u8 needs_whiteout:1,
+ format:7;
+#else
+#error edit for your odd byteorder.
+#endif
/* Type of the value */
__u8 type;
@@ -127,13 +125,13 @@ struct bkey {
#if defined(__LITTLE_ENDIAN)
__u8 pad[1];
- struct bversion version;
+ __u32 version;
__u32 size; /* extent size, in sectors */
struct bpos p;
#elif defined(__BIG_ENDIAN)
struct bpos p;
__u32 size; /* extent size, in sectors */
- struct bversion version;
+ __u32 version;
__u8 pad[1];
#endif
@@ -146,7 +144,19 @@ struct bkey_packed {
__u8 u64s;
/* Format of key (0 for format local to btree node) */
- __u8 format;
+
+ /*
+ * XXX: next incompat on disk format change, switch format and
+ * needs_whiteout - bkey_packed() will be cheaper if format is the high
+ * bits of the bitfield
+ */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 format:7,
+ needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u8 needs_whiteout:1,
+ format:7;
+#endif
/* Type of the value */
__u8 type;
@@ -174,8 +184,7 @@ enum bch_bkey_fields {
BKEY_FIELD_OFFSET,
BKEY_FIELD_SNAPSHOT,
BKEY_FIELD_SIZE,
- BKEY_FIELD_VERSION_HIGH,
- BKEY_FIELD_VERSION_LOW,
+ BKEY_FIELD_VERSION,
BKEY_NR_FIELDS,
};
@@ -191,8 +200,7 @@ enum bch_bkey_fields {
bkey_format_field(OFFSET, p.offset), \
bkey_format_field(SNAPSHOT, p.snapshot), \
bkey_format_field(SIZE, size), \
- bkey_format_field(VERSION_HIGH, version.high), \
- bkey_format_field(VERSION_LOW, version.low), \
+ bkey_format_field(VERSION, version), \
}, \
})
@@ -237,11 +245,6 @@ static inline void bkey_init(struct bkey *k)
#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64))
-static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
-{
- memcpy(dst, src, bkey_bytes(&src->k));
-}
-
#define __BKEY_PADDED(key, pad) \
struct { struct bkey_i key; __u64 key ## _pad[pad]; }
@@ -273,6 +276,7 @@ struct bkey_i_##name { \
#define KEY_TYPE_DISCARD 1
#define KEY_TYPE_ERROR 2
#define KEY_TYPE_COOKIE 3
+#define KEY_TYPE_PERSISTENT_DISCARD 4
#define KEY_TYPE_GENERIC_NR 128
struct bch_cookie {
@@ -385,31 +389,28 @@ struct bch_extent_crc32 {
#define CRC32_EXTENT_SIZE_MAX (1U << 7)
/* 64k */
-#define BCH_COMPRESSED_EXTENT_MAX 128
+#define BCH_COMPRESSED_EXTENT_MAX 128U
struct bch_extent_crc64 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:3,
- compressed_size:10,
- uncompressed_size:10,
- offset:10,
- nonce:23,
+ offset:17,
+ compressed_size:18,
+ uncompressed_size:18,
csum_type:4,
compression_type:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 compression_type:4,
csum_type:4,
- nonce:23,
- offset:10,
- uncompressed_size:10,
- compressed_size:10,
+ uncompressed_size:18,
+ compressed_size:18,
+ offset:17,
type:3;
#endif
__u64 csum;
} __attribute__((packed, aligned(8)));
-#define CRC64_EXTENT_SIZE_MAX (1U << 10) /* inclusive */
-#define CRC64_NONCE_MAX (1U << 23) /* exclusive */
+#define CRC64_EXTENT_SIZE_MAX (1U << 17)
/*
* @reservation - pointer hasn't been written to, just reserved
@@ -433,13 +434,15 @@ struct bch_extent_ptr {
} __attribute__((packed, aligned(8)));
union bch_extent_entry {
-#if defined(__LITTLE_ENDIAN__) || BITS_PER_LONG == 64
+#if defined(__LITTLE_ENDIAN) || __BITS_PER_LONG == 64
unsigned long type;
-#elif BITS_PER_LONG == 32
+#elif __BITS_PER_LONG == 32
struct {
unsigned long pad;
unsigned long type;
};
+#else
+#error edit for your odd byteorder.
#endif
struct bch_extent_crc32 crc32;
struct bch_extent_crc64 crc64;
@@ -476,19 +479,18 @@ BKEY_VAL_TYPE(extent, BCH_EXTENT);
sizeof(struct bch_extent_ptr)) / sizeof(u64))
/* Maximum possible size of an entire extent value: */
-#if 0
/* There's a hack in the keylist code that needs to be fixed.. */
#define BKEY_EXTENT_VAL_U64s_MAX \
(BKEY_EXTENT_PTR_U64s_MAX * BCH_REPLICAS_MAX)
-#else
-#define BKEY_EXTENT_VAL_U64s_MAX 8
-#endif
/* Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-#define BKEY_BTREE_PTR_VAL_U64s_MAX BCH_REPLICAS_MAX
-#define BKEY_BTREE_PTR_U64s_MAX (BKEY_U64s + BCH_REPLICAS_MAX)
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX \
+ ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX)
+#define BKEY_BTREE_PTR_U64s_MAX \
+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
/* Inodes */
@@ -724,7 +726,7 @@ struct cache_sb {
uuid_le user_uuid;
__le64 flags2;
- __le64 encryption_key[5];
+ __le64 pad1[5];
/* Number of cache_member entries: */
__u8 nr_in_set;
@@ -768,7 +770,7 @@ LE64_BITMASK(CACHE_SET_DATA_REPLICAS_WANT,struct cache_sb, flags, 8, 12);
LE64_BITMASK(CACHE_SB_CSUM_TYPE, struct cache_sb, flags, 12, 16);
-LE64_BITMASK(CACHE_SET_META_CSUM_TYPE,struct cache_sb, flags, 16, 20);
+LE64_BITMASK(CACHE_SET_META_PREFERRED_CSUM_TYPE,struct cache_sb, flags, 16, 20);
#define BCH_CSUM_NONE 0U
#define BCH_CSUM_CRC32C 1U
#define BCH_CSUM_CRC64 2U
@@ -789,7 +791,7 @@ enum bch_str_hash_type {
#define BCH_STR_HASH_NR 4
-LE64_BITMASK(CACHE_SET_DATA_CSUM_TYPE, struct cache_sb, flags, 48, 52);
+LE64_BITMASK(CACHE_SET_DATA_PREFERRED_CSUM_TYPE, struct cache_sb, flags, 48, 52);
LE64_BITMASK(CACHE_SET_COMPRESSION_TYPE, struct cache_sb, flags, 52, 56);
enum {
@@ -855,12 +857,12 @@ LE64_BITMASK(CACHE_SET_JOURNAL_ENTRY_SIZE, struct cache_sb, flags2, 7, 15);
CACHE_SET_OPT(metadata_checksum, \
bch_csum_types, \
0, BCH_CSUM_NR, \
- CACHE_SET_META_CSUM_TYPE, \
+ CACHE_SET_META_PREFERRED_CSUM_TYPE, \
true) \
CACHE_SET_OPT(data_checksum, \
bch_csum_types, \
0, BCH_CSUM_NR, \
- CACHE_SET_DATA_CSUM_TYPE, \
+ CACHE_SET_DATA_PREFERRED_CSUM_TYPE, \
true) \
CACHE_SET_OPT(compression, \
bch_compression_types, \
@@ -1000,14 +1002,9 @@ static inline __u64 bset_magic(struct cache_sb *sb)
return __le64_to_cpu(sb->set_magic) ^ BSET_MAGIC;
}
-/* 128 bits, sufficient for cryptographic MACs: */
-struct bch_csum {
- __le64 lo;
- __le64 hi;
-};
-
/* Journal */
+
#define BCACHE_JSET_VERSION_UUIDv1 1
#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */
#define BCACHE_JSET_VERSION_JKEYS 2
@@ -1057,8 +1054,7 @@ enum {
* version is for on disk format changes.
*/
struct jset {
- struct bch_csum csum;
-
+ __le64 csum;
__le64 magic;
__le32 version;
__le32 flags;
@@ -1085,8 +1081,7 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
/* Bucket prios/gens */
struct prio_set {
- struct bch_csum csum;
-
+ __le64 csum;
__le64 magic;
__le32 version;
__le32 flags;
@@ -1119,7 +1114,7 @@ enum btree_id {
#undef DEF_BTREE_ID
-#define BTREE_MAX_DEPTH 4
+#define BTREE_MAX_DEPTH 4U
/* Btree nodes */
@@ -1164,9 +1159,11 @@ LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4);
LE32_BITMASK(BSET_BTREE_LEVEL, struct bset, flags, 4, 8);
LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 8, 9);
+LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
+ struct bset, flags, 9, 10);
struct btree_node {
- struct bch_csum csum;
+ __le64 csum;
__le64 magic;
/* Closed interval: */
@@ -1178,22 +1175,10 @@ struct btree_node {
} __attribute__((packed));
struct btree_node_entry {
- struct bch_csum csum;
-
+ __le64 csum;
struct bset keys;
} __attribute__((packed));
-/* Crypto: */
-
-struct nonce {
- __le32 d[4];
-};
-
-#define BCACHE_MASTER_KEY_HEADER "bch**key"
-#define BCACHE_MASTER_KEY_NONCE ((struct nonce) \
- {{ __cpu_to_le32(1), __cpu_to_le32(2), \
- __cpu_to_le32(3), __cpu_to_le32(4) }})
-
/* OBSOLETE */
#define BITMASK(name, type, field, offset, end) \
@@ -1291,60 +1276,9 @@ struct uuid_entry {
BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
-#define SB_SIZE 4096
-#define SB_JOURNAL_BUCKETS 256U
-
-struct cache_sb_v0 {
- __u64 csum;
- __u64 offset; /* sector where this sb was written */
- __u64 version;
-
- uuid_le magic; /* bcache superblock UUID */
-
- uuid_le uuid;
- union {
- uuid_le set_uuid;
- __u64 set_magic;
- };
- __u8 label[SB_LABEL_SIZE];
-
- __u64 flags;
- __u64 seq;
- __u64 pad[8];
-
- union {
- struct {
- /* Cache devices */
- __u64 nbuckets; /* device size */
-
- __u16 block_size; /* sectors */
- __u16 bucket_size; /* sectors */
-
- __u16 nr_in_set;
- __u16 nr_this_dev;
- };
- struct {
- /* Backing devices */
- __u64 data_offset;
-
- /*
- * block_size from the cache device section is still used by
- * backing devices, so don't add anything here until we fix
- * things to not need it for backing devices anymore
- */
- };
- };
-
- __u32 last_mount; /* time_t */
-
- __u16 first_bucket;
- __u16 u64s;
- __u64 _data[SB_JOURNAL_BUCKETS]; /* journal buckets */
-};
-
#ifdef __cplusplus
}
#endif
-#endif /* _BCACHE_TOOLS_ONDISK_H */
+#endif /* _LINUX_BCACHE_H */
/* vim: set foldnestmax=2: */
diff --git a/include/linux/bio.h b/include/linux/bio.h
new file mode 100644
index 0000000..94e9048
--- /dev/null
+++ b/include/linux/bio.h
@@ -0,0 +1,461 @@
+/*
+ * 2.5 block I/O model
+ *
+ * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __LINUX_BIO_H
+#define __LINUX_BIO_H
+
+#include <linux/mempool.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+
+#include <linux/blkdev.h>
+#include <linux/blk_types.h>
+#include <linux/workqueue.h>
+
+#define bio_prio(bio) (bio)->bi_ioprio
+#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio)
+
+#define bio_iter_iovec(bio, iter) \
+ bvec_iter_bvec((bio)->bi_io_vec, (iter))
+
+#define bio_iter_page(bio, iter) \
+ bvec_iter_page((bio)->bi_io_vec, (iter))
+#define bio_iter_len(bio, iter) \
+ bvec_iter_len((bio)->bi_io_vec, (iter))
+#define bio_iter_offset(bio, iter) \
+ bvec_iter_offset((bio)->bi_io_vec, (iter))
+
+#define bio_page(bio) bio_iter_page((bio), (bio)->bi_iter)
+#define bio_offset(bio) bio_iter_offset((bio), (bio)->bi_iter)
+#define bio_iovec(bio) bio_iter_iovec((bio), (bio)->bi_iter)
+
+#define bio_multiple_segments(bio) \
+ ((bio)->bi_iter.bi_size != bio_iovec(bio).bv_len)
+
+#define bvec_iter_sectors(iter) ((iter).bi_size >> 9)
+#define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter)))
+
+#define bio_sectors(bio) bvec_iter_sectors((bio)->bi_iter)
+#define bio_end_sector(bio) bvec_iter_end_sector((bio)->bi_iter)
+
+static inline bool bio_has_data(struct bio *bio)
+{
+ if (bio &&
+ bio->bi_iter.bi_size &&
+ bio_op(bio) != REQ_OP_DISCARD &&
+ bio_op(bio) != REQ_OP_SECURE_ERASE)
+ return true;
+
+ return false;
+}
+
+static inline bool bio_no_advance_iter(struct bio *bio)
+{
+ return bio_op(bio) == REQ_OP_DISCARD ||
+ bio_op(bio) == REQ_OP_SECURE_ERASE ||
+ bio_op(bio) == REQ_OP_WRITE_SAME;
+}
+
+static inline bool bio_is_rw(struct bio *bio)
+{
+ if (!bio_has_data(bio))
+ return false;
+
+ if (bio_no_advance_iter(bio))
+ return false;
+
+ return true;
+}
+
+static inline bool bio_mergeable(struct bio *bio)
+{
+ if (bio->bi_opf & REQ_NOMERGE_FLAGS)
+ return false;
+
+ return true;
+}
+
+static inline unsigned int bio_cur_bytes(struct bio *bio)
+{
+ if (bio_has_data(bio))
+ return bio_iovec(bio).bv_len;
+ else /* dataless requests such as discard */
+ return bio->bi_iter.bi_size;
+}
+
+static inline void *bio_data(struct bio *bio)
+{
+ if (bio_has_data(bio))
+ return page_address(bio_page(bio)) + bio_offset(bio);
+
+ return NULL;
+}
+
+#define __bio_kmap_atomic(bio, iter) \
+ (kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) + \
+ bio_iter_iovec((bio), (iter)).bv_offset)
+
+#define __bio_kunmap_atomic(addr) kunmap_atomic(addr)
+
+#define bio_for_each_segment_all(bvl, bio, i) \
+ for (i = 0, bvl = (bio)->bi_io_vec; i < (bio)->bi_vcnt; i++, bvl++)
+
+static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
+ unsigned bytes)
+{
+ iter->bi_sector += bytes >> 9;
+
+ if (bio_no_advance_iter(bio))
+ iter->bi_size -= bytes;
+ else
+ bvec_iter_advance(bio->bi_io_vec, iter, bytes);
+}
+
+#define __bio_for_each_segment(bvl, bio, iter, start) \
+ for (iter = (start); \
+ (iter).bi_size && \
+ ((bvl = bio_iter_iovec((bio), (iter))), 1); \
+ bio_advance_iter((bio), &(iter), (bvl).bv_len))
+
+#define bio_for_each_segment(bvl, bio, iter) \
+ __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter)
+
+#define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
+
+static inline unsigned bio_segments(struct bio *bio)
+{
+ unsigned segs = 0;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ /*
+ * We special case discard/write same, because they interpret bi_size
+ * differently:
+ */
+
+ if (bio_op(bio) == REQ_OP_DISCARD)
+ return 1;
+
+ if (bio_op(bio) == REQ_OP_SECURE_ERASE)
+ return 1;
+
+ if (bio_op(bio) == REQ_OP_WRITE_SAME)
+ return 1;
+
+ bio_for_each_segment(bv, bio, iter)
+ segs++;
+
+ return segs;
+}
+
+static inline void bio_get(struct bio *bio)
+{
+ bio->bi_flags |= (1 << BIO_REFFED);
+ smp_mb__before_atomic();
+ atomic_inc(&bio->__bi_cnt);
+}
+
+static inline bool bio_flagged(struct bio *bio, unsigned int bit)
+{
+ return (bio->bi_flags & (1U << bit)) != 0;
+}
+
+static inline void bio_set_flag(struct bio *bio, unsigned int bit)
+{
+ bio->bi_flags |= (1U << bit);
+}
+
+static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
+{
+ bio->bi_flags &= ~(1U << bit);
+}
+
+static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
+{
+ *bv = bio_iovec(bio);
+}
+
+static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
+{
+ struct bvec_iter iter = bio->bi_iter;
+ int idx;
+
+ if (unlikely(!bio_multiple_segments(bio))) {
+ *bv = bio_iovec(bio);
+ return;
+ }
+
+ bio_advance_iter(bio, &iter, iter.bi_size);
+
+ if (!iter.bi_bvec_done)
+ idx = iter.bi_idx - 1;
+ else /* in the middle of bvec */
+ idx = iter.bi_idx;
+
+ *bv = bio->bi_io_vec[idx];
+
+ /*
+ * iter.bi_bvec_done records actual length of the last bvec
+ * if this bio ends in the middle of one io vector
+ */
+ if (iter.bi_bvec_done)
+ bv->bv_len = iter.bi_bvec_done;
+}
+
+extern struct bio *bio_split(struct bio *bio, int sectors,
+ gfp_t gfp, struct bio_set *bs);
+
+static inline struct bio *bio_next_split(struct bio *bio, int sectors,
+ gfp_t gfp, struct bio_set *bs)
+{
+ if (sectors >= bio_sectors(bio))
+ return bio;
+
+ return bio_split(bio, sectors, gfp, bs);
+}
+
+struct bio_set {
+ unsigned int front_pad;
+};
+
+static inline void bioset_exit(struct bio_set *bs) {}
+
+static inline void bioset_free(struct bio_set *bs)
+{
+ kfree(bs);
+}
+
+static inline int bioset_init(struct bio_set *bs,
+ unsigned pool_size,
+ unsigned front_pad)
+{
+ bs->front_pad = front_pad;
+ return 0;
+}
+
+extern struct bio_set *bioset_create(unsigned int, unsigned int);
+extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
+
+extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
+extern void bio_put(struct bio *);
+
+extern void __bio_clone_fast(struct bio *, struct bio *);
+extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
+extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
+
+static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+ return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+}
+
+static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
+{
+	return bio_clone_bioset(bio, gfp_mask, NULL);
+}
+
+extern void bio_endio(struct bio *);
+extern void bio_endio_nodec(struct bio *);
+
+static inline void bio_io_error(struct bio *bio)
+{
+ bio->bi_error = -EIO;
+ bio_endio(bio);
+}
+
+extern void bio_advance(struct bio *, unsigned);
+
+extern void bio_reset(struct bio *);
+void bio_chain(struct bio *, struct bio *);
+
+static inline void bio_flush_dcache_pages(struct bio *bi)
+{
+}
+
+extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
+ struct bio *src, struct bvec_iter src_iter);
+extern void bio_copy_data(struct bio *dst, struct bio *src);
+extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
+
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter);
+
+static inline void zero_fill_bio(struct bio *bio)
+{
+ zero_fill_bio_iter(bio, bio->bi_iter);
+}
+
+static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
+{
+ return page_address(bvec->bv_page) + bvec->bv_offset;
+}
+
+static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
+{
+ *flags = 0;
+}
+
+static inline char *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter,
+ unsigned long *flags)
+{
+ return bvec_kmap_irq(&bio_iter_iovec(bio, iter), flags);
+}
+#define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags)
+
+#define bio_kmap_irq(bio, flags) \
+ __bio_kmap_irq((bio), (bio)->bi_iter, (flags))
+#define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags)
+
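+/*
+ * bio_list is a simple singly linked FIFO of bios, chained through bi_next;
+ * an empty list has head == tail == NULL.
+ */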
+struct bio_list {
+ struct bio *head;
+ struct bio *tail;
+};
+
+static inline int bio_list_empty(const struct bio_list *bl)
+{
+ return bl->head == NULL;
+}
+
+static inline void bio_list_init(struct bio_list *bl)
+{
+ bl->head = bl->tail = NULL;
+}
+
+#define BIO_EMPTY_LIST { NULL, NULL }
+
+#define bio_list_for_each(bio, bl) \
+ for (bio = (bl)->head; bio; bio = bio->bi_next)
+
+static inline unsigned bio_list_size(const struct bio_list *bl)
+{
+ unsigned sz = 0;
+ struct bio *bio;
+
+ bio_list_for_each(bio, bl)
+ sz++;
+
+ return sz;
+}
+
+static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
+{
+ bio->bi_next = NULL;
+
+ if (bl->tail)
+ bl->tail->bi_next = bio;
+ else
+ bl->head = bio;
+
+ bl->tail = bio;
+}
+
+static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio)
+{
+ bio->bi_next = bl->head;
+
+ bl->head = bio;
+
+ if (!bl->tail)
+ bl->tail = bio;
+}
+
+static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
+{
+ if (!bl2->head)
+ return;
+
+ if (bl->tail)
+ bl->tail->bi_next = bl2->head;
+ else
+ bl->head = bl2->head;
+
+ bl->tail = bl2->tail;
+}
+
+static inline void bio_list_merge_head(struct bio_list *bl,
+ struct bio_list *bl2)
+{
+ if (!bl2->head)
+ return;
+
+ if (bl->head)
+ bl2->tail->bi_next = bl->head;
+ else
+ bl->tail = bl2->tail;
+
+ bl->head = bl2->head;
+}
+
+static inline struct bio *bio_list_peek(struct bio_list *bl)
+{
+ return bl->head;
+}
+
+static inline struct bio *bio_list_pop(struct bio_list *bl)
+{
+ struct bio *bio = bl->head;
+
+ if (bio) {
+ bl->head = bl->head->bi_next;
+ if (!bl->head)
+ bl->tail = NULL;
+
+ bio->bi_next = NULL;
+ }
+
+ return bio;
+}
+
+static inline struct bio *bio_list_get(struct bio_list *bl)
+{
+ struct bio *bio = bl->head;
+
+ bl->head = bl->tail = NULL;
+
+ return bio;
+}
+
+/*
+ * Increment chain count for the bio. Make sure the CHAIN flag update
+ * is visible before the raised count.
+ */
+static inline void bio_inc_remaining(struct bio *bio)
+{
+ bio_set_flag(bio, BIO_CHAIN);
+ smp_mb__before_atomic();
+ atomic_inc(&bio->__bi_remaining);
+}
+
+static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+ return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+}
+
+static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+ return bio_clone_bioset(bio, gfp_mask, NULL);
+}
+
+static inline void bio_init(struct bio *bio)
+{
+ memset(bio, 0, sizeof(*bio));
+ atomic_set(&bio->__bi_remaining, 1);
+ atomic_set(&bio->__bi_cnt, 1);
+}
+
+#endif /* __LINUX_BIO_H */
diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h
new file mode 100644
index 0000000..0e88820
--- /dev/null
+++ b/include/linux/bit_spinlock.h
@@ -0,0 +1,41 @@
+#ifndef __LINUX_BIT_SPINLOCK_H
+#define __LINUX_BIT_SPINLOCK_H
+
+#include <linux/kernel.h>
+#include <linux/preempt.h>
+#include <linux/atomic.h>
+#include <linux/bug.h>
+
+static inline void bit_spin_lock(int bitnum, unsigned long *addr)
+{
+ while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
+ do {
+ cpu_relax();
+ } while (test_bit(bitnum, addr));
+ }
+}
+
+static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
+{
+ return !test_and_set_bit_lock(bitnum, addr);
+}
+
+static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
+{
+ BUG_ON(!test_bit(bitnum, addr));
+
+ clear_bit_unlock(bitnum, addr);
+}
+
+static inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
+{
+ bit_spin_unlock(bitnum, addr);
+}
+
+static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
+{
+ return test_bit(bitnum, addr);
+}
+
+#endif /* __LINUX_BIT_SPINLOCK_H */
+
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
new file mode 100644
index 0000000..3baa61e
--- /dev/null
+++ b/include/linux/bitmap.h
@@ -0,0 +1,132 @@
+#ifndef __TOOLS_LINUX_BITMAP_H
+#define __TOOLS_LINUX_BITMAP_H
+
+#include <string.h>
+#include <linux/bitops.h>
+#include <stdlib.h>
+
+#define DECLARE_BITMAP(name,bits) \
+ unsigned long name[BITS_TO_LONGS(bits)]
+
+int __bitmap_weight(const unsigned long *bitmap, int bits);
+void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, int bits);
+int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int bits);
+
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
+
+#define BITMAP_LAST_WORD_MASK(nbits) \
+( \
+ ((nbits) % BITS_PER_LONG) ? \
+ (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \
+)
+
+#define small_const_nbits(nbits) \
+ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
+
+static inline void bitmap_zero(unsigned long *dst, int nbits)
+{
+ memset(dst, 0, BITS_TO_LONGS(nbits) * sizeof(unsigned long));
+}
+
+static inline int bitmap_weight(const unsigned long *src, int nbits)
+{
+ if (small_const_nbits(nbits))
+ return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
+ return __bitmap_weight(src, nbits);
+}
+
+static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
+ const unsigned long *src2, int nbits)
+{
+ if (small_const_nbits(nbits))
+ *dst = *src1 | *src2;
+ else
+ __bitmap_or(dst, src1, src2, nbits);
+}
+
+/**
+ * bitmap_alloc - Allocate a zeroed bitmap
+ * @nbits: number of bits in the bitmap
+ */
+static inline unsigned long *bitmap_alloc(int nbits)
+{
+ return calloc(1, BITS_TO_LONGS(nbits) * sizeof(unsigned long));
+}
+
+/*
+ * bitmap_scnprintf - print bitmap list into buffer
+ * @bitmap: bitmap
+ * @nbits: size of bitmap
+ * @buf: buffer to store output
+ * @size: size of @buf
+ */
+size_t bitmap_scnprintf(unsigned long *bitmap, int nbits,
+ char *buf, size_t size);
+
+/**
+ * bitmap_and - Do logical and on bitmaps
+ * @dst: resulting bitmap
+ * @src1: operand 1
+ * @src2: operand 2
+ * @nbits: size of bitmap
+ */
+static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
+ const unsigned long *src2, unsigned int nbits)
+{
+ if (small_const_nbits(nbits))
+ return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0;
+ return __bitmap_and(dst, src1, src2, nbits);
+}
+
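+/*
+ * find_next_bit() and find_next_zero_bit() share _find_next_bit(); the
+ * invert argument (~0UL) flips each word so that searching for a clear bit
+ * becomes searching for a set bit.
+ */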
+static inline unsigned long _find_next_bit(const unsigned long *addr,
+ unsigned long nbits, unsigned long start, unsigned long invert)
+{
+ unsigned long tmp;
+
+ if (!nbits || start >= nbits)
+ return nbits;
+
+ tmp = addr[start / BITS_PER_LONG] ^ invert;
+
+ /* Handle 1st word. */
+ tmp &= BITMAP_FIRST_WORD_MASK(start);
+ start = round_down(start, BITS_PER_LONG);
+
+ while (!tmp) {
+ start += BITS_PER_LONG;
+ if (start >= nbits)
+ return nbits;
+
+ tmp = addr[start / BITS_PER_LONG] ^ invert;
+ }
+
+ return min(start + __ffs(tmp), nbits);
+}
+
+static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ return _find_next_bit(addr, size, offset, 0UL);
+}
+
+static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ return _find_next_bit(addr, size, offset, ~0UL);
+}
+
+static inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+ unsigned long idx;
+
+ for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
+ if (addr[idx] != ~0UL)
+ return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
+ }
+
+ return size;
+}
+
+#endif /* __TOOLS_LINUX_BITMAP_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
new file mode 100644
index 0000000..a0c6508
--- /dev/null
+++ b/include/linux/bitops.h
@@ -0,0 +1,275 @@
+#ifndef _TOOLS_LINUX_BITOPS_H_
+#define _TOOLS_LINUX_BITOPS_H_
+
+#include <asm/types.h>
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <linux/page.h>
+
+#ifndef __WORDSIZE
+#define __WORDSIZE (__SIZEOF_LONG__ * 8)
+#endif
+
+#ifndef BITS_PER_LONG
+# define BITS_PER_LONG __WORDSIZE
+#endif
+
+#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
+#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
+#define BITS_PER_BYTE 8
+#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64))
+#define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u32))
+#define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE)
+
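+/*
+ * set_bit(), clear_bit() and test_and_set_bit() are implemented with the GCC
+ * __atomic or/and builtins on the containing long; the double-underscore bit
+ * ops (__set_bit(), __test_and_set_bit()) are the non-atomic forms, as in
+ * the kernel.
+ */
+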
+static inline void __set_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+ *p |= mask;
+}
+
+static inline void set_bit(long nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+ __atomic_or_fetch(p, mask, __ATOMIC_RELAXED);
+}
+
+static inline void clear_bit(long nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+ __atomic_and_fetch(p, ~mask, __ATOMIC_RELAXED);
+}
+
+static inline int test_bit(long nr, const volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
+
+ return (*p & mask) != 0;
+}
+
+static inline int __test_and_set_bit(int nr, unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+ unsigned long old;
+
+ old = *p;
+ *p = old | mask;
+
+ return (old & mask) != 0;
+}
+
+static inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
+ unsigned long old;
+
+ old = __atomic_fetch_or(p, mask, __ATOMIC_RELAXED);
+
+ return (old & mask) != 0;
+}
+
+static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+ __atomic_and_fetch(p, ~mask, __ATOMIC_RELEASE);
+}
+
+static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
+ unsigned long old;
+
+ old = __atomic_fetch_or(p, mask, __ATOMIC_ACQUIRE);
+
+ return (old & mask) != 0;
+}
+
+#define for_each_set_bit(bit, addr, size) \
+ for ((bit) = find_first_bit((addr), (size)); \
+ (bit) < (size); \
+ (bit) = find_next_bit((addr), (size), (bit) + 1))
+
+/* same as for_each_set_bit() but use bit as value to start with */
+#define for_each_set_bit_from(bit, addr, size) \
+ for ((bit) = find_next_bit((addr), (size), (bit)); \
+ (bit) < (size); \
+ (bit) = find_next_bit((addr), (size), (bit) + 1))
+
+static inline unsigned long hweight_long(unsigned long w)
+{
+ return __builtin_popcountl(w);
+}
+
+/**
+ * rol64 - rotate a 64-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u64 rol64(__u64 word, unsigned int shift)
+{
+ return (word << shift) | (word >> (64 - shift));
+}
+
+/**
+ * ror64 - rotate a 64-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u64 ror64(__u64 word, unsigned int shift)
+{
+ return (word >> shift) | (word << (64 - shift));
+}
+
+/**
+ * rol32 - rotate a 32-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u32 rol32(__u32 word, unsigned int shift)
+{
+ return (word << shift) | (word >> ((-shift) & 31));
+}
+
+/**
+ * ror32 - rotate a 32-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u32 ror32(__u32 word, unsigned int shift)
+{
+ return (word >> shift) | (word << (32 - shift));
+}
+
+/**
+ * rol16 - rotate a 16-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u16 rol16(__u16 word, unsigned int shift)
+{
+ return (word << shift) | (word >> (16 - shift));
+}
+
+/**
+ * ror16 - rotate a 16-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u16 ror16(__u16 word, unsigned int shift)
+{
+ return (word >> shift) | (word << (16 - shift));
+}
+
+/**
+ * rol8 - rotate an 8-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u8 rol8(__u8 word, unsigned int shift)
+{
+ return (word << shift) | (word >> (8 - shift));
+}
+
+/**
+ * ror8 - rotate an 8-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u8 ror8(__u8 word, unsigned int shift)
+{
+ return (word >> shift) | (word << (8 - shift));
+}
+
+static inline unsigned long __fls(unsigned long word)
+{
+ return (sizeof(word) * 8) - 1 - __builtin_clzl(word);
+}
+
+static inline int fls(int x)
+{
+ return x ? sizeof(x) * 8 - __builtin_clz(x) : 0;
+}
+
+static inline int fls64(__u64 x)
+{
+#if BITS_PER_LONG == 32
+ __u32 h = x >> 32;
+ if (h)
+ return fls(h) + 32;
+ return fls(x);
+#elif BITS_PER_LONG == 64
+ if (x == 0)
+ return 0;
+ return __fls(x) + 1;
+#endif
+}
+
+static inline unsigned fls_long(unsigned long l)
+{
+ if (sizeof(l) == 4)
+ return fls(l);
+ return fls64(l);
+}
+
+static inline unsigned long __ffs(unsigned long word)
+{
+ return __builtin_ctzl(word);
+}
+
+static inline unsigned long __ffs64(u64 word)
+{
+#if BITS_PER_LONG == 32
+ if (((u32)word) == 0UL)
+ return __ffs((u32)(word >> 32)) + 32;
+#elif BITS_PER_LONG != 64
+#error BITS_PER_LONG not 32 or 64
+#endif
+ return __ffs((unsigned long)word);
+}
+
+#define ffz(x) __ffs(~(x))
+
+static inline __attribute__((const))
+unsigned long rounddown_pow_of_two(unsigned long n)
+{
+ return 1UL << (fls_long(n) - 1);
+}
+
+static inline __attribute_const__
+int __get_order(unsigned long size)
+{
+ int order;
+
+ size--;
+ size >>= PAGE_SHIFT;
+#if BITS_PER_LONG == 32
+ order = fls(size);
+#else
+ order = fls64(size);
+#endif
+ return order;
+}
+
+#define get_order(n) \
+( \
+ __builtin_constant_p(n) ? ( \
+ ((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \
+ (((n) < (1UL << PAGE_SHIFT)) ? 0 : \
+ ilog2((n) - 1) - PAGE_SHIFT + 1) \
+ ) : \
+ __get_order(n) \
+)
+
+#endif
diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h
new file mode 100644
index 0000000..fb790b8
--- /dev/null
+++ b/include/linux/bitrev.h
@@ -0,0 +1,85 @@
+#ifndef _LINUX_BITREV_H
+#define _LINUX_BITREV_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_HAVE_ARCH_BITREVERSE
+#include <asm/bitrev.h>
+
+#define __bitrev32 __arch_bitrev32
+#define __bitrev16 __arch_bitrev16
+#define __bitrev8 __arch_bitrev8
+
+#else
+extern u8 const byte_rev_table[256];
+static inline u8 __bitrev8(u8 byte)
+{
+ return byte_rev_table[byte];
+}
+
+static inline u16 __bitrev16(u16 x)
+{
+ return (__bitrev8(x & 0xff) << 8) | __bitrev8(x >> 8);
+}
+
+static inline u32 __bitrev32(u32 x)
+{
+ return (__bitrev16(x & 0xffff) << 16) | __bitrev16(x >> 16);
+}
+
+#endif /* CONFIG_HAVE_ARCH_BITREVERSE */
+
+#define __constant_bitrev32(x) \
+({ \
+ u32 __x = x; \
+ __x = (__x >> 16) | (__x << 16); \
+ __x = ((__x & (u32)0xFF00FF00UL) >> 8) | ((__x & (u32)0x00FF00FFUL) << 8); \
+ __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \
+ __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \
+ __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \
+ __x; \
+})
+
+#define __constant_bitrev16(x) \
+({ \
+ u16 __x = x; \
+ __x = (__x >> 8) | (__x << 8); \
+ __x = ((__x & (u16)0xF0F0U) >> 4) | ((__x & (u16)0x0F0FU) << 4); \
+ __x = ((__x & (u16)0xCCCCU) >> 2) | ((__x & (u16)0x3333U) << 2); \
+ __x = ((__x & (u16)0xAAAAU) >> 1) | ((__x & (u16)0x5555U) << 1); \
+ __x; \
+})
+
+#define __constant_bitrev8(x) \
+({ \
+ u8 __x = x; \
+ __x = (__x >> 4) | (__x << 4); \
+ __x = ((__x & (u8)0xCCU) >> 2) | ((__x & (u8)0x33U) << 2); \
+ __x = ((__x & (u8)0xAAU) >> 1) | ((__x & (u8)0x55U) << 1); \
+ __x; \
+})
+
+#define bitrev32(x) \
+({ \
+ u32 __x = x; \
+ __builtin_constant_p(__x) ? \
+ __constant_bitrev32(__x) : \
+ __bitrev32(__x); \
+})
+
+#define bitrev16(x) \
+({ \
+ u16 __x = x; \
+ __builtin_constant_p(__x) ? \
+ __constant_bitrev16(__x) : \
+ __bitrev16(__x); \
+ })
+
+#define bitrev8(x) \
+({ \
+ u8 __x = x; \
+ __builtin_constant_p(__x) ? \
+ __constant_bitrev8(__x) : \
+ __bitrev8(__x) ; \
+ })
+#endif /* _LINUX_BITREV_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
new file mode 100644
index 0000000..4fc5af3
--- /dev/null
+++ b/include/linux/blk_types.h
@@ -0,0 +1,156 @@
+/*
+ * Block data types and constants. Directly include this file only to
+ * break include dependency loop.
+ */
+#ifndef __LINUX_BLK_TYPES_H
+#define __LINUX_BLK_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/types.h>
+#include <linux/bvec.h>
+
+struct bio_set;
+struct bio;
+struct block_device;
+typedef void (bio_end_io_t) (struct bio *);
+typedef void (bio_destructor_t) (struct bio *);
+
+/*
+ * main unit of I/O for the block layer and lower layers (ie drivers and
+ * stacking drivers)
+ */
+struct bio {
+ struct bio *bi_next; /* request queue link */
+ struct block_device *bi_bdev;
+ int bi_error;
+ unsigned int bi_opf; /* bottom bits req flags,
+ * top bits REQ_OP. Use
+ * accessors.
+ */
+ unsigned short bi_flags; /* status, command, etc */
+ unsigned short bi_ioprio;
+
+ struct bvec_iter bi_iter;
+
+ atomic_t __bi_remaining;
+
+ bio_end_io_t *bi_end_io;
+ void *bi_private;
+
+ unsigned short bi_vcnt; /* how many bio_vec's */
+
+ /*
+ * Everything starting with bi_max_vecs will be preserved by bio_reset()
+ */
+
+ unsigned short bi_max_vecs; /* max bvl_vecs we can hold */
+
+ atomic_t __bi_cnt; /* pin count */
+
+ struct bio_vec *bi_io_vec; /* the actual vec list */
+
+ struct bio_set *bi_pool;
+
+ /*
+ * We can inline a number of vecs at the end of the bio, to avoid
+ * double allocations for a small number of bio_vecs. This member
+ * MUST obviously be kept at the very end of the bio.
+ */
+ struct bio_vec bi_inline_vecs[0];
+};
+
+#define BIO_OP_SHIFT (8 * sizeof(unsigned int) - REQ_OP_BITS)
+#define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT)
+
+#define bio_set_op_attrs(bio, op, op_flags) do { \
+ WARN_ON(op >= (1 << REQ_OP_BITS)); \
+ (bio)->bi_opf &= ((1 << BIO_OP_SHIFT) - 1); \
+ (bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT); \
+ (bio)->bi_opf |= op_flags; \
+} while (0)
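+
+/*
+ * For example, bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC) stores
+ * REQ_OP_WRITE in the top REQ_OP_BITS bits of bi_opf and ORs REQ_SYNC into
+ * the low bits, so bio_op(bio) then returns REQ_OP_WRITE.
+ */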
+
+#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
+
+/*
+ * bio flags
+ */
+#define BIO_SEG_VALID 1 /* bi_phys_segments valid */
+#define BIO_CLONED 2 /* doesn't own data */
+#define BIO_BOUNCED 3 /* bio is a bounce bio */
+#define BIO_USER_MAPPED 4 /* contains user pages */
+#define BIO_NULL_MAPPED 5 /* contains invalid user pages */
+#define BIO_QUIET 6 /* Make BIO Quiet */
+#define BIO_CHAIN 7 /* chained bio, ->bi_remaining in effect */
+#define BIO_REFFED 8 /* bio has elevated ->bi_cnt */
+
+/*
+ * Flags starting here get preserved by bio_reset() - this includes
+ * BVEC_POOL_IDX()
+ */
+#define BIO_RESET_BITS 10
+
+/*
+ * We support 6 different bvec pools, the last one is magic in that it
+ * is backed by a mempool.
+ */
+#define BVEC_POOL_NR 6
+#define BVEC_POOL_MAX (BVEC_POOL_NR - 1)
+
+/*
+ * Top 4 bits of bio flags indicate the pool the bvecs came from. We add
+ * 1 to the actual index so that 0 indicates that there are no bvecs to be
+ * freed.
+ */
+#define BVEC_POOL_BITS (4)
+#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS)
+#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET)
+
+/*
+ * Request flags. For use in the cmd_flags field of struct request, and in
+ * bi_opf of struct bio. Note that some flags are only valid in either one.
+ */
+enum rq_flag_bits {
+ __REQ_SYNC, /* request is sync (sync write or read) */
+ __REQ_META, /* metadata io request */
+	__REQ_PRIO,		/* boost priority in cfq */
+	__REQ_RAHEAD,		/* read ahead, can fail anytime */
+	__REQ_THROTTLED,	/* bio has been subjected to throttling rules */
+
+	__REQ_FUA,		/* forced unit access */
+ __REQ_PREFLUSH, /* request for cache flush */
+};
+
+#define REQ_SYNC (1ULL << __REQ_SYNC)
+#define REQ_META (1ULL << __REQ_META)
+#define REQ_PRIO (1ULL << __REQ_PRIO)
+
+#define REQ_NOMERGE_FLAGS (REQ_PREFLUSH | REQ_FUA)
+
+#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
+#define REQ_THROTTLED (1ULL << __REQ_THROTTLED)
+
+#define REQ_FUA (1ULL << __REQ_FUA)
+#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH)
+
+#define RW_MASK REQ_OP_WRITE
+
+#define READ REQ_OP_READ
+#define WRITE REQ_OP_WRITE
+
+#define READ_SYNC REQ_SYNC
+#define WRITE_SYNC (REQ_SYNC)
+#define WRITE_ODIRECT REQ_SYNC
+#define WRITE_FLUSH (REQ_SYNC | REQ_PREFLUSH)
+#define WRITE_FUA (REQ_SYNC | REQ_FUA)
+#define WRITE_FLUSH_FUA (REQ_SYNC | REQ_PREFLUSH | REQ_FUA)
+
+enum req_op {
+ REQ_OP_READ,
+ REQ_OP_WRITE,
+ REQ_OP_DISCARD, /* request to discard sectors */
+ REQ_OP_SECURE_ERASE, /* request to securely erase sectors */
+ REQ_OP_WRITE_SAME, /* write same block many times */
+ REQ_OP_FLUSH, /* request for cache flush */
+};
+
+#define REQ_OP_BITS 3
+
+#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
new file mode 100644
index 0000000..3c18594
--- /dev/null
+++ b/include/linux/blkdev.h
@@ -0,0 +1,188 @@
+#ifndef __TOOLS_LINUX_BLKDEV_H
+#define __TOOLS_LINUX_BLKDEV_H
+
+#include <linux/backing-dev.h>
+#include <linux/blk_types.h>
+
+typedef u64 sector_t;
+typedef unsigned fmode_t;
+
+struct bio;
+struct user_namespace;
+
+#define MINORBITS 20
+#define MINORMASK ((1U << MINORBITS) - 1)
+
+#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS))
+#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK))
+#define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi))
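+
+/*
+ * For example, MKDEV(8, 1) packs major 8 and minor 1 into a single number;
+ * MAJOR() and MINOR() recover them.
+ */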
+
+/* file is open for reading */
+#define FMODE_READ ((__force fmode_t)0x1)
+/* file is open for writing */
+#define FMODE_WRITE ((__force fmode_t)0x2)
+/* file is seekable */
+#define FMODE_LSEEK ((__force fmode_t)0x4)
+/* file can be accessed using pread */
+#define FMODE_PREAD ((__force fmode_t)0x8)
+/* file can be accessed using pwrite */
+#define FMODE_PWRITE ((__force fmode_t)0x10)
+/* File is opened for execution with sys_execve / sys_uselib */
+#define FMODE_EXEC ((__force fmode_t)0x20)
+/* File is opened with O_NDELAY (only set for block devices) */
+#define FMODE_NDELAY ((__force fmode_t)0x40)
+/* File is opened with O_EXCL (only set for block devices) */
+#define FMODE_EXCL ((__force fmode_t)0x80)
+/* File is opened using open(.., 3, ..) and is writeable only for ioctls
+   (special hack for floppy.c) */
+#define FMODE_WRITE_IOCTL ((__force fmode_t)0x100)
+/* 32bit hashes as llseek() offset (for directories) */
+#define FMODE_32BITHASH ((__force fmode_t)0x200)
+/* 64bit hashes as llseek() offset (for directories) */
+#define FMODE_64BITHASH ((__force fmode_t)0x400)
+
+struct inode {
+ unsigned long i_ino;
+ loff_t i_size;
+ struct super_block *i_sb;
+};
+
+struct file {
+ struct inode *f_inode;
+};
+
+static inline struct inode *file_inode(const struct file *f)
+{
+ return f->f_inode;
+}
+
+#define BDEVNAME_SIZE 32
+
+struct request_queue {
+ struct backing_dev_info backing_dev_info;
+};
+
+struct gendisk {
+};
+
+struct block_device {
+ char name[BDEVNAME_SIZE];
+ struct inode *bd_inode;
+ struct request_queue queue;
+ void *bd_holder;
+ struct gendisk *bd_disk;
+ struct gendisk __bd_disk;
+ int bd_fd;
+};
+
+void generic_make_request(struct bio *);
+int submit_bio_wait(struct bio *);
+int blkdev_issue_discard(struct block_device *, sector_t,
+ sector_t, gfp_t, unsigned long);
+
+#define bdev_get_queue(bdev) (&((bdev)->queue))
+
+#define blk_queue_discard(q) ((void) (q), 0)
+#define blk_queue_nonrot(q) ((void) (q), 0)
+
+static inline struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ return &q->backing_dev_info;
+}
+
+unsigned bdev_logical_block_size(struct block_device *bdev);
+sector_t get_capacity(struct gendisk *disk);
+
+void blkdev_put(struct block_device *bdev, fmode_t mode);
+void bdput(struct block_device *bdev);
+struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder);
+struct block_device *lookup_bdev(const char *path);
+
+struct super_block {
+ void *s_fs_info;
+};
+
+/*
+ * File types
+ *
+ * NOTE! These match bits 12..15 of stat.st_mode
+ * (ie "(i_mode >> 12) & 15").
+ */
+#define DT_UNKNOWN 0
+#define DT_FIFO 1
+#define DT_CHR 2
+#define DT_DIR 4
+#define DT_BLK 6
+#define DT_REG 8
+#define DT_LNK 10
+#define DT_SOCK 12
+#define DT_WHT 14
+
+/*
+ * This is the "filldir" function type, used by readdir() to let
+ * the kernel specify what kind of dirent layout it wants to have.
+ * This allows the kernel to read directories into kernel space or
+ * to have different dirent layouts depending on the binary type.
+ */
+struct dir_context;
+typedef int (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
+ unsigned);
+
+struct dir_context {
+ const filldir_t actor;
+ u64 pos;
+};
+
+/* /sys/fs */
+extern struct kobject *fs_kobj;
+
+struct file_operations {
+};
+
+static inline int register_chrdev(unsigned int major, const char *name,
+ const struct file_operations *fops)
+{
+ return 1;
+}
+
+static inline void unregister_chrdev(unsigned int major, const char *name)
+{
+}
+
+static inline const char *bdevname(struct block_device *bdev, char *buf)
+{
+ snprintf(buf, BDEVNAME_SIZE, "%s", bdev->name);
+ return buf;
+}
+
+static inline bool op_is_write(unsigned int op)
+{
+	return op != REQ_OP_READ;
+}
+
+/*
+ * return data direction, READ or WRITE
+ */
+static inline int bio_data_dir(struct bio *bio)
+{
+ return op_is_write(bio_op(bio)) ? WRITE : READ;
+}
+
+static inline bool dir_emit(struct dir_context *ctx,
+ const char *name, int namelen,
+ u64 ino, unsigned type)
+{
+ return ctx->actor(ctx, name, namelen, ctx->pos, ino, type) == 0;
+}
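+
+/*
+ * Illustrative sketch (not part of the original header): a minimal filldir
+ * actor that just counts entries, assuming a readdir implementation calls
+ * dir_emit() for each one and that container_of() is available from
+ * kernel.h. The struct and function names here are made up; returning 0
+ * keeps iteration going, matching the check in dir_emit() above:
+ *
+ *	struct count_ctx {
+ *		struct dir_context	ctx;
+ *		unsigned		nr;
+ *	};
+ *
+ *	static int count_actor(struct dir_context *ctx, const char *name,
+ *			       int namelen, loff_t pos, u64 ino, unsigned type)
+ *	{
+ *		container_of(ctx, struct count_ctx, ctx)->nr++;
+ *		return 0;
+ *	}
+ */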
+
+static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
+{
+ return true;
+}
+
+#define capable(cap) true
+
+#endif /* __TOOLS_LINUX_BLKDEV_H */
+
diff --git a/include/linux/bug.h b/include/linux/bug.h
new file mode 100644
index 0000000..f01e5f7
--- /dev/null
+++ b/include/linux/bug.h
@@ -0,0 +1,31 @@
+#ifndef __TOOLS_LINUX_BUG_H
+#define __TOOLS_LINUX_BUG_H
+
+#include <assert.h>
+#include <linux/compiler.h>
+
+#define BUILD_BUG_ON_NOT_POWER_OF_2(n) \
+ BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0))
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); }))
+
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+
+#define BUG() do { assert(0); unreachable(); } while (0)
+#define BUG_ON(cond) assert(!(cond))
+
+#define WARN_ON_ONCE(cond) assert(!(cond))
+#define WARN_ONCE(cond, msg) assert(!(cond))
+
+#define __WARN() assert(0)
+#define __WARN_printf(arg...) assert(0)
+#define WARN(cond, ...) assert(!(cond))
+
+#define WARN_ON(condition) ({ \
+ int __ret_warn_on = !!(condition); \
+ if (unlikely(__ret_warn_on)) \
+ __WARN(); \
+ unlikely(__ret_warn_on); \
+})
+
+#endif /* __TOOLS_LINUX_BUG_H */
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
new file mode 100644
index 0000000..89b65b8
--- /dev/null
+++ b/include/linux/bvec.h
@@ -0,0 +1,97 @@
+/*
+ * bvec iterator
+ *
+ * Copyright (C) 2001 Ming Lei <ming.lei@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef __LINUX_BVEC_ITER_H
+#define __LINUX_BVEC_ITER_H
+
+#include <linux/kernel.h>
+#include <linux/bug.h>
+
+/*
+ * was unsigned short, but we might as well be ready for > 64kB I/O pages
+ */
+struct bio_vec {
+ struct page *bv_page;
+ unsigned int bv_len;
+ unsigned int bv_offset;
+};
+
+struct bvec_iter {
+ sector_t bi_sector; /* device address in 512 byte
+ sectors */
+ unsigned int bi_size; /* residual I/O count */
+
+ unsigned int bi_idx; /* current index into bvl_vec */
+
+ unsigned int bi_bvec_done; /* number of bytes completed in
+ current bvec */
+};
+
+/*
+ * various member access, note that bio_data should of course not be used
+ * on highmem page vectors
+ */
+#define __bvec_iter_bvec(bvec, iter) (&(bvec)[(iter).bi_idx])
+
+#define bvec_iter_page(bvec, iter) \
+ (__bvec_iter_bvec((bvec), (iter))->bv_page)
+
+#define bvec_iter_len(bvec, iter) \
+ min((iter).bi_size, \
+ __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)
+
+#define bvec_iter_offset(bvec, iter) \
+ (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
+
+#define bvec_iter_bvec(bvec, iter) \
+((struct bio_vec) { \
+ .bv_page = bvec_iter_page((bvec), (iter)), \
+ .bv_len = bvec_iter_len((bvec), (iter)), \
+ .bv_offset = bvec_iter_offset((bvec), (iter)), \
+})
+
+static inline void bvec_iter_advance(const struct bio_vec *bv,
+ struct bvec_iter *iter,
+ unsigned bytes)
+{
+ WARN_ONCE(bytes > iter->bi_size,
+ "Attempted to advance past end of bvec iter\n");
+
+ while (bytes) {
+ unsigned iter_len = bvec_iter_len(bv, *iter);
+ unsigned len = min(bytes, iter_len);
+
+ bytes -= len;
+ iter->bi_size -= len;
+ iter->bi_bvec_done += len;
+
+ if (iter->bi_bvec_done == __bvec_iter_bvec(bv, *iter)->bv_len) {
+ iter->bi_bvec_done = 0;
+ iter->bi_idx++;
+ }
+ }
+}
+
+#define for_each_bvec(bvl, bio_vec, iter, start) \
+ for (iter = (start); \
+ (iter).bi_size && \
+ ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \
+ bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len))
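+
+/*
+ * Illustrative usage (not part of the original header): summing the segment
+ * lengths of a bio_vec array, given an array @vecs and a starting iterator
+ * @start that are assumed to be set up by the caller:
+ *
+ *	struct bio_vec bv;
+ *	struct bvec_iter iter;
+ *	unsigned bytes = 0;
+ *
+ *	for_each_bvec(bv, vecs, iter, start)
+ *		bytes += bv.bv_len;
+ */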
+
+#endif /* __LINUX_BVEC_ITER_H */
diff --git a/include/linux/byteorder.h b/include/linux/byteorder.h
new file mode 100644
index 0000000..35ef228
--- /dev/null
+++ b/include/linux/byteorder.h
@@ -0,0 +1,74 @@
+#ifndef __LINUX_BYTEORDER_H
+#define __LINUX_BYTEORDER_H
+
+#include <asm/byteorder.h>
+
+#define swab16 __swab16
+#define swab32 __swab32
+#define swab64 __swab64
+#define swahw32 __swahw32
+#define swahb32 __swahb32
+#define swab16p __swab16p
+#define swab32p __swab32p
+#define swab64p __swab64p
+#define swahw32p __swahw32p
+#define swahb32p __swahb32p
+#define swab16s __swab16s
+#define swab32s __swab32s
+#define swab64s __swab64s
+#define swahw32s __swahw32s
+#define swahb32s __swahb32s
+
+#define cpu_to_le64 __cpu_to_le64
+#define le64_to_cpu __le64_to_cpu
+#define cpu_to_le32 __cpu_to_le32
+#define le32_to_cpu __le32_to_cpu
+#define cpu_to_le16 __cpu_to_le16
+#define le16_to_cpu __le16_to_cpu
+#define cpu_to_be64 __cpu_to_be64
+#define be64_to_cpu __be64_to_cpu
+#define cpu_to_be32 __cpu_to_be32
+#define be32_to_cpu __be32_to_cpu
+#define cpu_to_be16 __cpu_to_be16
+#define be16_to_cpu __be16_to_cpu
+#define cpu_to_le64p __cpu_to_le64p
+#define le64_to_cpup __le64_to_cpup
+#define cpu_to_le32p __cpu_to_le32p
+#define le32_to_cpup __le32_to_cpup
+#define cpu_to_le16p __cpu_to_le16p
+#define le16_to_cpup __le16_to_cpup
+#define cpu_to_be64p __cpu_to_be64p
+#define be64_to_cpup __be64_to_cpup
+#define cpu_to_be32p __cpu_to_be32p
+#define be32_to_cpup __be32_to_cpup
+#define cpu_to_be16p __cpu_to_be16p
+#define be16_to_cpup __be16_to_cpup
+#define cpu_to_le64s __cpu_to_le64s
+#define le64_to_cpus __le64_to_cpus
+#define cpu_to_le32s __cpu_to_le32s
+#define le32_to_cpus __le32_to_cpus
+#define cpu_to_le16s __cpu_to_le16s
+#define le16_to_cpus __le16_to_cpus
+#define cpu_to_be64s __cpu_to_be64s
+#define be64_to_cpus __be64_to_cpus
+#define cpu_to_be32s __cpu_to_be32s
+#define be32_to_cpus __be32_to_cpus
+#define cpu_to_be16s __cpu_to_be16s
+#define be16_to_cpus __be16_to_cpus
+
+static inline void le16_add_cpu(__le16 *var, u16 val)
+{
+ *var = cpu_to_le16(le16_to_cpu(*var) + val);
+}
+
+static inline void le32_add_cpu(__le32 *var, u32 val)
+{
+ *var = cpu_to_le32(le32_to_cpu(*var) + val);
+}
+
+static inline void le64_add_cpu(__le64 *var, u64 val)
+{
+ *var = cpu_to_le64(le64_to_cpu(*var) + val);
+}
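+
+/*
+ * Example (illustrative; struct foo_sb is hypothetical): bumping a
+ * little-endian on-disk counter in place, instead of an explicit
+ * convert/add/convert-back sequence:
+ *
+ *	struct foo_sb {
+ *		__le32	nr_entries;
+ *	};
+ *
+ *	static void foo_sb_add_entry(struct foo_sb *sb)
+ *	{
+ *		le32_add_cpu(&sb->nr_entries, 1);
+ *	}
+ */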
+
+#endif /* __LINUX_BYTEORDER_H */
diff --git a/include/linux/cache.h b/include/linux/cache.h
new file mode 100644
index 0000000..4ee609a
--- /dev/null
+++ b/include/linux/cache.h
@@ -0,0 +1,16 @@
+#ifndef __TOOLS_LINUX_CACHE_H
+#define __TOOLS_LINUX_CACHE_H
+
+#define L1_CACHE_BYTES 64
+#define SMP_CACHE_BYTES L1_CACHE_BYTES
+
+#define L1_CACHE_ALIGN(x) __ALIGN_KERNEL(x, L1_CACHE_BYTES)
+
+#define __read_mostly
+#define __ro_after_init
+
+#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
+#define ____cacheline_aligned_in_smp ____cacheline_aligned
+
+#endif /* __TOOLS_LINUX_CACHE_H */
+
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
new file mode 100644
index 0000000..e5c31a6
--- /dev/null
+++ b/include/linux/compiler.h
@@ -0,0 +1,169 @@
+#ifndef _TOOLS_LINUX_COMPILER_H_
+#define _TOOLS_LINUX_COMPILER_H_
+
+/* Optimization barrier */
+/* The "volatile" is due to gcc bugs */
+#define barrier() __asm__ __volatile__("": : :"memory")
+#define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory")
+
+#ifndef __always_inline
+# define __always_inline inline __attribute__((always_inline))
+#endif
+
+#ifdef __ANDROID__
+/*
+ * FIXME: Big hammer to get rid of tons of:
+ * "warning: always_inline function might not be inlinable"
+ *
+ * At least on android-ndk-r12/platforms/android-24/arch-arm
+ */
+#undef __always_inline
+#define __always_inline inline
+#endif
+
+#define noinline
+#define noinline_for_stack noinline
+
+#define __user
+#define __kernel
+
+#define __pure __attribute__((pure))
+#define __aligned(x) __attribute__((aligned(x)))
+#define __printf(a, b) __attribute__((format(printf, a, b)))
+#define __used __attribute__((__used__))
+#define __maybe_unused __attribute__((unused))
+#define __always_unused __attribute__((unused))
+#define __packed __attribute__((__packed__))
+#define __force
+#define __nocast
+#define __iomem
+#define __chk_user_ptr(x) (void)0
+#define __chk_io_ptr(x) (void)0
+#define __builtin_warning(x, y...) (1)
+#define __must_hold(x)
+#define __acquires(x)
+#define __releases(x)
+#define __acquire(x) (void)0
+#define __release(x) (void)0
+#define __cond_lock(x,c) (c)
+#define __percpu
+#define __rcu
+#define __sched
+#define __init
+#define __exit
+#define __private
+#define __must_check
+#define __malloc
+#define __weak __attribute__((weak))
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define unreachable() __builtin_unreachable()
+#define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
+
+#define ___PASTE(a,b) a##b
+#define __PASTE(a,b) ___PASTE(a,b)
+#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__)
+
+#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+#define __initcall(x) /* unimplemented */
+#define __exitcall(x) /* unimplemented */
+
+#include <linux/types.h>
+
+/*
+ * The following functions are taken from kernel sources and
+ * break aliasing rules in their original form.
+ *
+ * While the kernel is compiled with -fno-strict-aliasing,
+ * perf uses -Wstrict-aliasing=3, which makes the build fail
+ * under gcc 4.4.
+ *
+ * Extra __may_alias__ types are used to allow aliasing
+ * in this case.
+ */
+typedef __u8 __attribute__((__may_alias__)) __u8_alias_t;
+typedef __u16 __attribute__((__may_alias__)) __u16_alias_t;
+typedef __u32 __attribute__((__may_alias__)) __u32_alias_t;
+typedef __u64 __attribute__((__may_alias__)) __u64_alias_t;
+
+static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
+{
+ switch (size) {
+ case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break;
+ case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break;
+ case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break;
+ case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break;
+ default:
+ barrier();
+ __builtin_memcpy((void *)res, (const void *)p, size);
+ barrier();
+ }
+}
+
+static __always_inline void __write_once_size(volatile void *p, void *res, int size)
+{
+ switch (size) {
+ case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break;
+ case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break;
+ case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break;
+ case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break;
+ default:
+ barrier();
+ __builtin_memcpy((void *)p, (const void *)res, size);
+ barrier();
+ }
+}
+
+/*
+ * Prevent the compiler from merging or refetching reads or writes. The
+ * compiler is also forbidden from reordering successive instances of
+ * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the
+ * compiler is aware of some particular ordering. One way to make the
+ * compiler aware of ordering is to put the two invocations of READ_ONCE,
+ * WRITE_ONCE or ACCESS_ONCE() in different C statements.
+ *
+ * In contrast to ACCESS_ONCE these two macros will also work on aggregate
+ * data types like structs or unions. If the size of the accessed data
+ * type exceeds the word size of the machine (e.g., 32 bits or 64 bits)
+ * READ_ONCE() and WRITE_ONCE() will fall back to memcpy and print a
+ * compile-time warning.
+ *
+ * Their two major use cases are: (1) Mediating communication between
+ * process-level code and irq/NMI handlers, all running on the same CPU,
+ * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
+ * mutilate accesses that either do not require ordering or that interact
+ * with an explicit memory barrier or atomic instruction that provides the
+ * required ordering.
+ */
+
+#define READ_ONCE(x) \
+ ({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
+
+#define WRITE_ONCE(x, val) \
+ ({ union { typeof(x) __val; char __c[1]; } __u = { .__val = (val) }; __write_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
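+
+/*
+ * Minimal usage sketch (illustrative only): a flag written by one thread and
+ * polled by another, where nothing beyond "don't fold or refetch the access"
+ * is required:
+ *
+ *	static int ready;
+ *
+ *	// writer
+ *	WRITE_ONCE(ready, 1);
+ *
+ *	// reader: the compiler must re-load ready on every iteration
+ *	while (!READ_ONCE(ready))
+ *		;
+ */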
+
+#define lockless_dereference(p) \
+({ \
+ typeof(p) _________p1 = READ_ONCE(p); \
+ typeof(*(p)) *___typecheck_p __maybe_unused; \
+ smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
+ (_________p1); \
+})
+
+#define flush_cache_all() do { } while (0)
+#define flush_cache_mm(mm) do { } while (0)
+#define flush_cache_dup_mm(mm) do { } while (0)
+#define flush_cache_range(vma, start, end) do { } while (0)
+#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
+#define flush_dcache_page(page) do { } while (0)
+#define flush_dcache_mmap_lock(mapping) do { } while (0)
+#define flush_dcache_mmap_unlock(mapping) do { } while (0)
+#define flush_icache_range(start, end) do { } while (0)
+#define flush_icache_page(vma,pg) do { } while (0)
+#define flush_icache_user_range(vma,pg,adr,len) do { } while (0)
+#define flush_cache_vmap(start, end) do { } while (0)
+#define flush_cache_vunmap(start, end) do { } while (0)
+
+#endif /* _TOOLS_LINUX_COMPILER_H_ */
diff --git a/include/linux/completion.h b/include/linux/completion.h
new file mode 100644
index 0000000..b8bac21
--- /dev/null
+++ b/include/linux/completion.h
@@ -0,0 +1,83 @@
+#ifndef __LINUX_COMPLETION_H
+#define __LINUX_COMPLETION_H
+
+/*
+ * (C) Copyright 2001 Linus Torvalds
+ *
+ * Atomic wait-for-completion handler data structures.
+ * See kernel/sched/completion.c for details.
+ */
+
+#include <linux/wait.h>
+
+/*
+ * struct completion - structure used to maintain state for a "completion"
+ *
+ * This is the opaque structure used to maintain the state for a "completion".
+ * Completions currently use a FIFO to queue threads that have to wait for
+ * the "completion" event.
+ *
+ * See also: complete(), wait_for_completion() (and friends _timeout,
+ * _interruptible, _interruptible_timeout, and _killable), init_completion(),
+ * reinit_completion(), and macros DECLARE_COMPLETION(),
+ * DECLARE_COMPLETION_ONSTACK().
+ */
+struct completion {
+ unsigned int done;
+ wait_queue_head_t wait;
+};
+
+#define COMPLETION_INITIALIZER(work) \
+ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
+
+#define COMPLETION_INITIALIZER_ONSTACK(work) \
+ ({ init_completion(&work); work; })
+
+#define DECLARE_COMPLETION(work) \
+ struct completion work = COMPLETION_INITIALIZER(work)
+#define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work)
+
+/**
+ * init_completion - Initialize a dynamically allocated completion
+ * @x: pointer to completion structure that is to be initialized
+ *
+ * This inline function will initialize a dynamically created completion
+ * structure.
+ */
+static inline void init_completion(struct completion *x)
+{
+ x->done = 0;
+ init_waitqueue_head(&x->wait);
+}
+
+/**
+ * reinit_completion - reinitialize a completion structure
+ * @x: pointer to completion structure that is to be reinitialized
+ *
+ * This inline function should be used to reinitialize a completion structure so it can
+ * be reused. This is especially important after complete_all() is used.
+ */
+static inline void reinit_completion(struct completion *x)
+{
+ x->done = 0;
+}
+
+extern void wait_for_completion(struct completion *);
+extern void wait_for_completion_io(struct completion *);
+extern int wait_for_completion_interruptible(struct completion *x);
+extern int wait_for_completion_killable(struct completion *x);
+extern unsigned long wait_for_completion_timeout(struct completion *x,
+ unsigned long timeout);
+extern unsigned long wait_for_completion_io_timeout(struct completion *x,
+ unsigned long timeout);
+extern long wait_for_completion_interruptible_timeout(
+ struct completion *x, unsigned long timeout);
+extern long wait_for_completion_killable_timeout(
+ struct completion *x, unsigned long timeout);
+extern bool try_wait_for_completion(struct completion *x);
+extern bool completion_done(struct completion *x);
+
+extern void complete(struct completion *);
+extern void complete_all(struct completion *);
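+
+/*
+ * Usage sketch (illustrative only): one thread waits until setup work done
+ * elsewhere has finished:
+ *
+ *	static DECLARE_COMPLETION(setup_done);
+ *
+ *	// in the thread doing the setup, once finished:
+ *	complete(&setup_done);
+ *
+ *	// in the waiting thread:
+ *	wait_for_completion(&setup_done);
+ */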
+
+#endif
diff --git a/include/linux/console.h b/include/linux/console.h
new file mode 100644
index 0000000..d01aa9a
--- /dev/null
+++ b/include/linux/console.h
@@ -0,0 +1,7 @@
+#ifndef _LINUX_CONSOLE_H_
+#define _LINUX_CONSOLE_H_
+
+#define console_lock()
+#define console_unlock()
+
+#endif /* _LINUX_CONSOLE_H_ */
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
new file mode 100644
index 0000000..024d645
--- /dev/null
+++ b/include/linux/cpumask.h
@@ -0,0 +1,24 @@
+#ifndef __LINUX_CPUMASK_H
+#define __LINUX_CPUMASK_H
+
+#define num_online_cpus() 1U
+#define num_possible_cpus() 1U
+#define num_present_cpus() 1U
+#define num_active_cpus() 1U
+#define cpu_online(cpu) ((cpu) == 0)
+#define cpu_possible(cpu) ((cpu) == 0)
+#define cpu_present(cpu) ((cpu) == 0)
+#define cpu_active(cpu) ((cpu) == 0)
+
+#define for_each_cpu(cpu, mask) \
+ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
+#define for_each_cpu_not(cpu, mask) \
+ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
+#define for_each_cpu_and(cpu, mask, and) \
+ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and)
+
+#define for_each_possible_cpu(cpu) for_each_cpu((cpu), 1)
+#define for_each_online_cpu(cpu) for_each_cpu((cpu), 1)
+#define for_each_present_cpu(cpu) for_each_cpu((cpu), 1)
+
+#endif /* __LINUX_CPUMASK_H */
diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h
new file mode 100644
index 0000000..f198ab2
--- /dev/null
+++ b/include/linux/crc32c.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_CRC32C_H
+#define _LINUX_CRC32C_H
+
+#include "../../ccan/crc/crc.h"
+
+#endif /* _LINUX_CRC32C_H */
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
new file mode 100644
index 0000000..cb9ad24
--- /dev/null
+++ b/include/linux/crypto.h
@@ -0,0 +1,921 @@
+/*
+ * Scatterlist Cryptographic API.
+ *
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
+ * and Nettle, by Niels Möller.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#ifndef _LINUX_CRYPTO_H
+#define _LINUX_CRYPTO_H
+
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/bug.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+/*
+ * Autoloaded crypto modules should only use a prefixed name to avoid allowing
+ * arbitrary modules to be loaded. Loading from userspace may still need the
+ * unprefixed names, so those aliases are retained as well.
+ * This uses __MODULE_INFO directly instead of MODULE_ALIAS because pre-4.3
+ * gcc (e.g. avr32 toolchain) uses __LINE__ for uniqueness, and this macro
+ * expands twice on the same line. Instead, use a separate base name for the
+ * alias.
+ */
+#define MODULE_ALIAS_CRYPTO(name) \
+ __MODULE_INFO(alias, alias_userspace, name); \
+ __MODULE_INFO(alias, alias_crypto, "crypto-" name)
+
+/*
+ * Algorithm masks and types.
+ */
+#define CRYPTO_ALG_TYPE_MASK 0x0000000f
+#define CRYPTO_ALG_TYPE_CIPHER 0x00000001
+#define CRYPTO_ALG_TYPE_AEAD 0x00000003
+#define CRYPTO_ALG_TYPE_BLKCIPHER 0x00000004
+#define CRYPTO_ALG_TYPE_ABLKCIPHER 0x00000005
+#define CRYPTO_ALG_TYPE_SKCIPHER 0x00000005
+#define CRYPTO_ALG_TYPE_GIVCIPHER 0x00000006
+#define CRYPTO_ALG_TYPE_KPP 0x00000008
+#define CRYPTO_ALG_TYPE_RNG 0x0000000c
+#define CRYPTO_ALG_TYPE_AKCIPHER 0x0000000d
+#define CRYPTO_ALG_TYPE_DIGEST 0x0000000e
+#define CRYPTO_ALG_TYPE_HASH 0x0000000e
+#define CRYPTO_ALG_TYPE_SHASH 0x0000000e
+#define CRYPTO_ALG_TYPE_AHASH 0x0000000f
+
+#define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e
+#define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000e
+#define CRYPTO_ALG_TYPE_BLKCIPHER_MASK 0x0000000c
+
+#define CRYPTO_ALG_ASYNC 0x00000080
+
+/*
+ * Set this bit if and only if the algorithm requires another algorithm of
+ * the same type to handle corner cases.
+ */
+#define CRYPTO_ALG_NEED_FALLBACK 0x00000100
+
+/*
+ * This bit is set for symmetric key ciphers that have already been wrapped
+ * with a generic IV generator to prevent them from being wrapped again.
+ */
+#define CRYPTO_ALG_GENIV 0x00000200
+
+/*
+ * Set if the algorithm is an instance that is built from templates.
+ */
+#define CRYPTO_ALG_INSTANCE 0x00000800
+
+/* Set this bit if the algorithm provided is hardware accelerated but
+ * not directly available to userspace (e.g. via an instruction set).
+ */
+#define CRYPTO_ALG_KERN_DRIVER_ONLY 0x00001000
+
+/*
+ * Mark a cipher as a service implementation only usable by another
+ * cipher and never by a normal user of the kernel crypto API
+ */
+#define CRYPTO_ALG_INTERNAL 0x00002000
+
+/*
+ * Transform masks and values (for crt_flags).
+ */
+#define CRYPTO_TFM_REQ_MASK 0x000fff00
+#define CRYPTO_TFM_RES_MASK 0xfff00000
+
+#define CRYPTO_TFM_REQ_WEAK_KEY 0x00000100
+#define CRYPTO_TFM_REQ_MAY_SLEEP 0x00000200
+#define CRYPTO_TFM_REQ_MAY_BACKLOG 0x00000400
+#define CRYPTO_TFM_RES_WEAK_KEY 0x00100000
+#define CRYPTO_TFM_RES_BAD_KEY_LEN 0x00200000
+#define CRYPTO_TFM_RES_BAD_KEY_SCHED 0x00400000
+#define CRYPTO_TFM_RES_BAD_BLOCK_LEN 0x00800000
+#define CRYPTO_TFM_RES_BAD_FLAGS 0x01000000
+
+/*
+ * Miscellaneous stuff.
+ */
+#define CRYPTO_MAX_ALG_NAME 64
+
+/*
+ * The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual
+ * declaration) is used to ensure that the crypto_tfm context structure is
+ * aligned correctly for the given architecture so that there are no alignment
+ * faults for C data types. In particular, this is required on platforms such
+ * as arm where pointers are 32-bit aligned but there are data types such as
+ * u64 which require 64-bit alignment.
+ */
+#define CRYPTO_MINALIGN ARCH_KMALLOC_MINALIGN
+
+#define CRYPTO_MINALIGN_ATTR __attribute__ ((__aligned__(CRYPTO_MINALIGN)))
+
+struct scatterlist;
+struct crypto_blkcipher;
+struct crypto_tfm;
+struct crypto_type;
+struct skcipher_givcrypt_request;
+
+struct blkcipher_desc {
+ struct crypto_blkcipher *tfm;
+ void *info;
+ u32 flags;
+};
+
+struct cipher_desc {
+ struct crypto_tfm *tfm;
+ void (*crfn)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+ unsigned int (*prfn)(const struct cipher_desc *desc, u8 *dst,
+ const u8 *src, unsigned int nbytes);
+ void *info;
+};
+
+struct blkcipher_alg {
+ int (*setkey)(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen);
+ int (*encrypt)(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes);
+ int (*decrypt)(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes);
+
+ const char *geniv;
+
+ unsigned int min_keysize;
+ unsigned int max_keysize;
+ unsigned int ivsize;
+};
+
+struct cipher_alg {
+ unsigned int cia_min_keysize;
+ unsigned int cia_max_keysize;
+ int (*cia_setkey)(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen);
+ void (*cia_encrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+ void (*cia_decrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+};
+
+struct compress_alg {
+ int (*coa_compress)(struct crypto_tfm *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen);
+ int (*coa_decompress)(struct crypto_tfm *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen);
+};
+
+
+#define cra_blkcipher cra_u.blkcipher
+#define cra_cipher cra_u.cipher
+#define cra_compress cra_u.compress
+
+struct crypto_alg {
+ struct list_head cra_list;
+ struct list_head cra_users;
+
+ u32 cra_flags;
+ unsigned int cra_blocksize;
+ unsigned int cra_ctxsize;
+ unsigned int cra_alignmask;
+
+ int cra_priority;
+ atomic_t cra_refcnt;
+
+ char cra_name[CRYPTO_MAX_ALG_NAME];
+ char cra_driver_name[CRYPTO_MAX_ALG_NAME];
+
+ const struct crypto_type *cra_type;
+
+ union {
+ struct blkcipher_alg blkcipher;
+ struct cipher_alg cipher;
+ struct compress_alg compress;
+ } cra_u;
+
+ int (*cra_init)(struct crypto_tfm *tfm);
+ void (*cra_exit)(struct crypto_tfm *tfm);
+ void (*cra_destroy)(struct crypto_alg *alg);
+
+ struct module *cra_module;
+} CRYPTO_MINALIGN_ATTR;
+
+/*
+ * Algorithm registration interface.
+ */
+int crypto_register_alg(struct crypto_alg *alg);
+int crypto_unregister_alg(struct crypto_alg *alg);
+int crypto_register_algs(struct crypto_alg *algs, int count);
+int crypto_unregister_algs(struct crypto_alg *algs, int count);
+
+/*
+ * Algorithm query interface.
+ */
+int crypto_has_alg(const char *name, u32 type, u32 mask);
+
+/*
+ * Transforms: user-instantiated objects which encapsulate algorithms
+ * and core processing logic. Managed via crypto_alloc_*() and
+ * crypto_free_*(), as well as the various helpers below.
+ */
+
+struct blkcipher_tfm {
+ void *iv;
+ int (*setkey)(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen);
+ int (*encrypt)(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes);
+ int (*decrypt)(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes);
+};
+
+struct cipher_tfm {
+ int (*cit_setkey)(struct crypto_tfm *tfm,
+ const u8 *key, unsigned int keylen);
+ void (*cit_encrypt_one)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+ void (*cit_decrypt_one)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+};
+
+struct compress_tfm {
+ int (*cot_compress)(struct crypto_tfm *tfm,
+ const u8 *src, unsigned int slen,
+ u8 *dst, unsigned int *dlen);
+ int (*cot_decompress)(struct crypto_tfm *tfm,
+ const u8 *src, unsigned int slen,
+ u8 *dst, unsigned int *dlen);
+};
+
+#define crt_blkcipher crt_u.blkcipher
+#define crt_cipher crt_u.cipher
+#define crt_compress crt_u.compress
+
+struct crypto_tfm {
+
+ u32 crt_flags;
+
+ union {
+ struct blkcipher_tfm blkcipher;
+ struct cipher_tfm cipher;
+ struct compress_tfm compress;
+ } crt_u;
+
+ void (*exit)(struct crypto_tfm *tfm);
+
+ struct crypto_alg *__crt_alg;
+
+ void *__crt_ctx[] CRYPTO_MINALIGN_ATTR;
+};
+
+struct crypto_blkcipher {
+ struct crypto_tfm base;
+};
+
+struct crypto_cipher {
+ struct crypto_tfm base;
+};
+
+struct crypto_comp {
+ struct crypto_tfm base;
+};
+
+enum {
+ CRYPTOA_UNSPEC,
+ CRYPTOA_ALG,
+ CRYPTOA_TYPE,
+ CRYPTOA_U32,
+ __CRYPTOA_MAX,
+};
+
+#define CRYPTOA_MAX (__CRYPTOA_MAX - 1)
+
+/* Maximum number of (rtattr) parameters for each template. */
+#define CRYPTO_MAX_ATTRS 32
+
+struct crypto_attr_alg {
+ char name[CRYPTO_MAX_ALG_NAME];
+};
+
+struct crypto_attr_type {
+ u32 type;
+ u32 mask;
+};
+
+struct crypto_attr_u32 {
+ u32 num;
+};
+
+/*
+ * Transform user interface.
+ */
+
+struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask);
+void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm);
+
+static inline void crypto_free_tfm(struct crypto_tfm *tfm)
+{
+ return crypto_destroy_tfm(tfm, tfm);
+}
+
+int alg_test(const char *driver, const char *alg, u32 type, u32 mask);
+
+/*
+ * Transform helpers which query the underlying algorithm.
+ */
+static inline const char *crypto_tfm_alg_name(struct crypto_tfm *tfm)
+{
+ return tfm->__crt_alg->cra_name;
+}
+
+static inline const char *crypto_tfm_alg_driver_name(struct crypto_tfm *tfm)
+{
+ return tfm->__crt_alg->cra_driver_name;
+}
+
+static inline int crypto_tfm_alg_priority(struct crypto_tfm *tfm)
+{
+ return tfm->__crt_alg->cra_priority;
+}
+
+static inline u32 crypto_tfm_alg_type(struct crypto_tfm *tfm)
+{
+ return tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK;
+}
+
+static inline unsigned int crypto_tfm_alg_blocksize(struct crypto_tfm *tfm)
+{
+ return tfm->__crt_alg->cra_blocksize;
+}
+
+static inline unsigned int crypto_tfm_alg_alignmask(struct crypto_tfm *tfm)
+{
+ return tfm->__crt_alg->cra_alignmask;
+}
+
+static inline u32 crypto_tfm_get_flags(struct crypto_tfm *tfm)
+{
+ return tfm->crt_flags;
+}
+
+static inline void crypto_tfm_set_flags(struct crypto_tfm *tfm, u32 flags)
+{
+ tfm->crt_flags |= flags;
+}
+
+static inline void crypto_tfm_clear_flags(struct crypto_tfm *tfm, u32 flags)
+{
+ tfm->crt_flags &= ~flags;
+}
+
+static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm)
+{
+ return tfm->__crt_ctx;
+}
+
+static inline unsigned int crypto_tfm_ctx_alignment(void)
+{
+ struct crypto_tfm *tfm;
+ return __alignof__(tfm->__crt_ctx);
+}
+
+static inline u32 crypto_skcipher_type(u32 type)
+{
+ type &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
+ type |= CRYPTO_ALG_TYPE_BLKCIPHER;
+ return type;
+}
+
+static inline u32 crypto_skcipher_mask(u32 mask)
+{
+ mask &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
+ mask |= CRYPTO_ALG_TYPE_BLKCIPHER_MASK;
+ return mask;
+}
+
+/**
+ * DOC: Synchronous Block Cipher API
+ *
+ * The synchronous block cipher API is used with the ciphers of type
+ * CRYPTO_ALG_TYPE_BLKCIPHER (listed as type "blkcipher" in /proc/crypto)
+ *
+ * Synchronous calls have a context in the tfm. But since a single tfm can be
+ * used in multiple calls and in parallel, this info should not be changeable
+ * (unless a lock is used). This applies, for example, to the symmetric key.
+ * However, the IV is changeable, so there is an iv field in the blkcipher_tfm
+ * structure for the synchronous blkcipher API. It is thus the only state info
+ * that can be kept for synchronous calls without using a big lock across a tfm.
+ *
+ * The block cipher API allows the use of a complete cipher, i.e. a cipher
+ * consisting of a template (a block chaining mode) and a single block cipher
+ * primitive (e.g. AES).
+ *
+ * The plaintext data buffer and the ciphertext data buffer are pointed to
+ * by using scatter/gather lists. The cipher operation is performed
+ * on all segments of the provided scatter/gather lists.
+ *
+ * The kernel crypto API supports a cipher operation "in-place" which means that
+ * the caller may provide the same scatter/gather list for the plaintext and
+ * cipher text. After the completion of the cipher operation, the plaintext
+ * data is replaced with the ciphertext data in case of an encryption and vice
+ * versa for a decryption. The caller must ensure that the scatter/gather lists
+ * for the output data point to sufficiently large buffers, i.e. multiples of
+ * the block size of the cipher.
+ */
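+
+/*
+ * Usage sketch (illustrative only; "cbc(aes)", the key/iv/buf buffers, keylen
+ * and len are assumptions, and error handling is omitted):
+ *
+ *	struct crypto_blkcipher *tfm = crypto_alloc_blkcipher("cbc(aes)", 0, 0);
+ *	struct blkcipher_desc desc = { .tfm = tfm };
+ *	struct scatterlist sg;
+ *
+ *	crypto_blkcipher_setkey(tfm, key, keylen);
+ *	crypto_blkcipher_set_iv(tfm, iv, crypto_blkcipher_ivsize(tfm));
+ *	sg_init_one(&sg, buf, len);
+ *	crypto_blkcipher_encrypt(&desc, &sg, &sg, len);
+ *	crypto_free_blkcipher(tfm);
+ */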
+
+static inline struct crypto_blkcipher *__crypto_blkcipher_cast(
+ struct crypto_tfm *tfm)
+{
+ return (struct crypto_blkcipher *)tfm;
+}
+
+static inline struct crypto_blkcipher *crypto_blkcipher_cast(
+ struct crypto_tfm *tfm)
+{
+ BUG_ON(crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_BLKCIPHER);
+ return __crypto_blkcipher_cast(tfm);
+}
+
+/**
+ * crypto_alloc_blkcipher() - allocate synchronous block cipher handle
+ * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
+ * blkcipher cipher
+ * @type: specifies the type of the cipher
+ * @mask: specifies the mask for the cipher
+ *
+ * Allocate a cipher handle for a block cipher. The returned struct
+ * crypto_blkcipher is the cipher handle that is required for any subsequent
+ * API invocation for that block cipher.
+ *
+ * Return: allocated cipher handle in case of success; IS_ERR() is true in case
+ * of an error, PTR_ERR() returns the error code.
+ */
+static inline struct crypto_blkcipher *crypto_alloc_blkcipher(
+ const char *alg_name, u32 type, u32 mask)
+{
+ type &= ~CRYPTO_ALG_TYPE_MASK;
+ type |= CRYPTO_ALG_TYPE_BLKCIPHER;
+ mask |= CRYPTO_ALG_TYPE_MASK;
+
+ return __crypto_blkcipher_cast(crypto_alloc_base(alg_name, type, mask));
+}
+
+static inline struct crypto_tfm *crypto_blkcipher_tfm(
+ struct crypto_blkcipher *tfm)
+{
+ return &tfm->base;
+}
+
+/**
+ * crypto_free_blkcipher() - zeroize and free the block cipher handle
+ * @tfm: cipher handle to be freed
+ */
+static inline void crypto_free_blkcipher(struct crypto_blkcipher *tfm)
+{
+ crypto_free_tfm(crypto_blkcipher_tfm(tfm));
+}
+
+/**
+ * crypto_has_blkcipher() - Search for the availability of a block cipher
+ * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
+ * block cipher
+ * @type: specifies the type of the cipher
+ * @mask: specifies the mask for the cipher
+ *
+ * Return: true when the block cipher is known to the kernel crypto API; false
+ * otherwise
+ */
+static inline int crypto_has_blkcipher(const char *alg_name, u32 type, u32 mask)
+{
+ type &= ~CRYPTO_ALG_TYPE_MASK;
+ type |= CRYPTO_ALG_TYPE_BLKCIPHER;
+ mask |= CRYPTO_ALG_TYPE_MASK;
+
+ return crypto_has_alg(alg_name, type, mask);
+}
+
+/**
+ * crypto_blkcipher_name() - return the name / cra_name from the cipher handle
+ * @tfm: cipher handle
+ *
+ * Return: The character string holding the name of the cipher
+ */
+static inline const char *crypto_blkcipher_name(struct crypto_blkcipher *tfm)
+{
+ return crypto_tfm_alg_name(crypto_blkcipher_tfm(tfm));
+}
+
+static inline struct blkcipher_tfm *crypto_blkcipher_crt(
+ struct crypto_blkcipher *tfm)
+{
+ return &crypto_blkcipher_tfm(tfm)->crt_blkcipher;
+}
+
+static inline struct blkcipher_alg *crypto_blkcipher_alg(
+ struct crypto_blkcipher *tfm)
+{
+ return &crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher;
+}
+
+/**
+ * crypto_blkcipher_ivsize() - obtain IV size
+ * @tfm: cipher handle
+ *
+ * The size of the IV for the block cipher referenced by the cipher handle is
+ * returned. This IV size may be zero if the cipher does not need an IV.
+ *
+ * Return: IV size in bytes
+ */
+static inline unsigned int crypto_blkcipher_ivsize(struct crypto_blkcipher *tfm)
+{
+ return crypto_blkcipher_alg(tfm)->ivsize;
+}
+
+/**
+ * crypto_blkcipher_blocksize() - obtain block size of cipher
+ * @tfm: cipher handle
+ *
+ * The block size for the block cipher referenced with the cipher handle is
+ * returned. The caller may use that information to allocate appropriate
+ * memory for the data returned by the encryption or decryption operation.
+ *
+ * Return: block size of cipher
+ */
+static inline unsigned int crypto_blkcipher_blocksize(
+ struct crypto_blkcipher *tfm)
+{
+ return crypto_tfm_alg_blocksize(crypto_blkcipher_tfm(tfm));
+}
+
+static inline unsigned int crypto_blkcipher_alignmask(
+ struct crypto_blkcipher *tfm)
+{
+ return crypto_tfm_alg_alignmask(crypto_blkcipher_tfm(tfm));
+}
+
+static inline u32 crypto_blkcipher_get_flags(struct crypto_blkcipher *tfm)
+{
+ return crypto_tfm_get_flags(crypto_blkcipher_tfm(tfm));
+}
+
+static inline void crypto_blkcipher_set_flags(struct crypto_blkcipher *tfm,
+ u32 flags)
+{
+ crypto_tfm_set_flags(crypto_blkcipher_tfm(tfm), flags);
+}
+
+static inline void crypto_blkcipher_clear_flags(struct crypto_blkcipher *tfm,
+ u32 flags)
+{
+ crypto_tfm_clear_flags(crypto_blkcipher_tfm(tfm), flags);
+}
+
+/**
+ * crypto_blkcipher_setkey() - set key for cipher
+ * @tfm: cipher handle
+ * @key: buffer holding the key
+ * @keylen: length of the key in bytes
+ *
+ * The caller provided key is set for the block cipher referenced by the cipher
+ * handle.
+ *
+ * Note, the key length determines the cipher variant. Many block ciphers come
+ * in different variants depending on the key size, such as AES-128 vs. AES-192
+ * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
+ * is used.
+ *
+ * Return: 0 if the setting of the key was successful; < 0 if an error occurred
+ */
+static inline int crypto_blkcipher_setkey(struct crypto_blkcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return crypto_blkcipher_crt(tfm)->setkey(crypto_blkcipher_tfm(tfm),
+ key, keylen);
+}
+
+/**
+ * crypto_blkcipher_encrypt() - encrypt plaintext
+ * @desc: reference to the block cipher handle with meta data
+ * @dst: scatter/gather list that is filled by the cipher operation with the
+ * ciphertext
+ * @src: scatter/gather list that holds the plaintext
+ * @nbytes: number of bytes of the plaintext to encrypt.
+ *
+ * Encrypt plaintext data using the IV set by the caller with a preceding
+ * call of crypto_blkcipher_set_iv.
+ *
+ * The blkcipher_desc data structure must be filled by the caller and can
+ * reside on the stack. The caller must fill desc as follows: desc.tfm is filled
+ * with the block cipher handle; desc.flags is filled with either
+ * CRYPTO_TFM_REQ_MAY_SLEEP or 0.
+ *
+ * Return: 0 if the cipher operation was successful; < 0 if an error occurred
+ */
+static inline int crypto_blkcipher_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src,
+ unsigned int nbytes)
+{
+ desc->info = crypto_blkcipher_crt(desc->tfm)->iv;
+ return crypto_blkcipher_crt(desc->tfm)->encrypt(desc, dst, src, nbytes);
+}
+
+/**
+ * crypto_blkcipher_encrypt_iv() - encrypt plaintext with dedicated IV
+ * @desc: reference to the block cipher handle with meta data
+ * @dst: scatter/gather list that is filled by the cipher operation with the
+ * ciphertext
+ * @src: scatter/gather list that holds the plaintext
+ * @nbytes: number of bytes of the plaintext to encrypt.
+ *
+ * Encrypt plaintext data with the use of an IV that is solely used for this
+ * cipher operation. Any previously set IV is not used.
+ *
+ * The blkcipher_desc data structure must be filled by the caller and can
+ * reside on the stack. The caller must fill desc as follows: desc.tfm is filled
+ * with the block cipher handle; desc.info is filled with the IV to be used for
+ * the current operation; desc.flags is filled with either
+ * CRYPTO_TFM_REQ_MAY_SLEEP or 0.
+ *
+ * Return: 0 if the cipher operation was successful; < 0 if an error occurred
+ */
+static inline int crypto_blkcipher_encrypt_iv(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src,
+ unsigned int nbytes)
+{
+ return crypto_blkcipher_crt(desc->tfm)->encrypt(desc, dst, src, nbytes);
+}
+
+/**
+ * crypto_blkcipher_decrypt() - decrypt ciphertext
+ * @desc: reference to the block cipher handle with meta data
+ * @dst: scatter/gather list that is filled by the cipher operation with the
+ * plaintext
+ * @src: scatter/gather list that holds the ciphertext
+ * @nbytes: number of bytes of the ciphertext to decrypt.
+ *
+ * Decrypt ciphertext data using the IV set by the caller with a preceding
+ * call of crypto_blkcipher_set_iv.
+ *
+ * The blkcipher_desc data structure must be filled by the caller as documented
+ * for the crypto_blkcipher_encrypt call above.
+ *
+ * Return: 0 if the cipher operation was successful; < 0 if an error occurred
+ *
+ */
+static inline int crypto_blkcipher_decrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src,
+ unsigned int nbytes)
+{
+ desc->info = crypto_blkcipher_crt(desc->tfm)->iv;
+ return crypto_blkcipher_crt(desc->tfm)->decrypt(desc, dst, src, nbytes);
+}
+
+/**
+ * crypto_blkcipher_decrypt_iv() - decrypt ciphertext with dedicated IV
+ * @desc: reference to the block cipher handle with meta data
+ * @dst: scatter/gather list that is filled by the cipher operation with the
+ * plaintext
+ * @src: scatter/gather list that holds the ciphertext
+ * @nbytes: number of bytes of the ciphertext to decrypt.
+ *
+ * Decrypt ciphertext data with the use of an IV that is solely used for this
+ * cipher operation. Any previously set IV is not used.
+ *
+ * The blkcipher_desc data structure must be filled by the caller as documented
+ * for the crypto_blkcipher_encrypt_iv call above.
+ *
+ * Return: 0 if the cipher operation was successful; < 0 if an error occurred
+ */
+static inline int crypto_blkcipher_decrypt_iv(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src,
+ unsigned int nbytes)
+{
+ return crypto_blkcipher_crt(desc->tfm)->decrypt(desc, dst, src, nbytes);
+}
+
+/**
+ * crypto_blkcipher_set_iv() - set IV for cipher
+ * @tfm: cipher handle
+ * @src: buffer holding the IV
+ * @len: length of the IV in bytes
+ *
+ * The caller provided IV is set for the block cipher referenced by the cipher
+ * handle.
+ */
+static inline void crypto_blkcipher_set_iv(struct crypto_blkcipher *tfm,
+ const u8 *src, unsigned int len)
+{
+ memcpy(crypto_blkcipher_crt(tfm)->iv, src, len);
+}
+
+/**
+ * crypto_blkcipher_get_iv() - obtain IV from cipher
+ * @tfm: cipher handle
+ * @dst: buffer filled with the IV
+ * @len: length of the buffer dst
+ *
+ * The caller can obtain the IV set for the block cipher referenced by the
+ * cipher handle and store it into the user-provided buffer. If the buffer
+ * has insufficient space, the IV is truncated to fit the buffer.
+ */
+static inline void crypto_blkcipher_get_iv(struct crypto_blkcipher *tfm,
+ u8 *dst, unsigned int len)
+{
+ memcpy(dst, crypto_blkcipher_crt(tfm)->iv, len);
+}
+
+/**
+ * DOC: Single Block Cipher API
+ *
+ * The single block cipher API is used with the ciphers of type
+ * CRYPTO_ALG_TYPE_CIPHER (listed as type "cipher" in /proc/crypto).
+ *
+ * Using the single block cipher API calls, operations with the basic cipher
+ * primitive can be implemented. These cipher primitives exclude any block
+ * chaining operations including IV handling.
+ *
+ * The purpose of this single block cipher API is to support the implementation
+ * of templates or other concepts that only need to perform the cipher operation
+ * on one block at a time. Templates invoke the underlying cipher primitive
+ * block-wise and process either the input or the output data of these cipher
+ * operations.
+ */
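+
+/*
+ * Usage sketch (illustrative only; "aes" and the key/dst/src buffers and
+ * keylen are assumptions, error handling omitted). Note that the _one()
+ * helpers process exactly one cipher block per call:
+ *
+ *	struct crypto_cipher *tfm = crypto_alloc_cipher("aes", 0, 0);
+ *
+ *	crypto_cipher_setkey(tfm, key, keylen);
+ *	crypto_cipher_encrypt_one(tfm, dst, src);
+ *	crypto_free_cipher(tfm);
+ */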
+
+static inline struct crypto_cipher *__crypto_cipher_cast(struct crypto_tfm *tfm)
+{
+ return (struct crypto_cipher *)tfm;
+}
+
+static inline struct crypto_cipher *crypto_cipher_cast(struct crypto_tfm *tfm)
+{
+ BUG_ON(crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER);
+ return __crypto_cipher_cast(tfm);
+}
+
+/**
+ * crypto_alloc_cipher() - allocate single block cipher handle
+ * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
+ * single block cipher
+ * @type: specifies the type of the cipher
+ * @mask: specifies the mask for the cipher
+ *
+ * Allocate a cipher handle for a single block cipher. The returned struct
+ * crypto_cipher is the cipher handle that is required for any subsequent API
+ * invocation for that single block cipher.
+ *
+ * Return: allocated cipher handle in case of success; IS_ERR() is true in case
+ * of an error, PTR_ERR() returns the error code.
+ */
+static inline struct crypto_cipher *crypto_alloc_cipher(const char *alg_name,
+ u32 type, u32 mask)
+{
+ type &= ~CRYPTO_ALG_TYPE_MASK;
+ type |= CRYPTO_ALG_TYPE_CIPHER;
+ mask |= CRYPTO_ALG_TYPE_MASK;
+
+ return __crypto_cipher_cast(crypto_alloc_base(alg_name, type, mask));
+}
+
+static inline struct crypto_tfm *crypto_cipher_tfm(struct crypto_cipher *tfm)
+{
+ return &tfm->base;
+}
+
+/**
+ * crypto_free_cipher() - zeroize and free the single block cipher handle
+ * @tfm: cipher handle to be freed
+ */
+static inline void crypto_free_cipher(struct crypto_cipher *tfm)
+{
+ crypto_free_tfm(crypto_cipher_tfm(tfm));
+}
+
+/**
+ * crypto_has_cipher() - Search for the availability of a single block cipher
+ * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
+ * single block cipher
+ * @type: specifies the type of the cipher
+ * @mask: specifies the mask for the cipher
+ *
+ * Return: true when the single block cipher is known to the kernel crypto API;
+ * false otherwise
+ */
+static inline int crypto_has_cipher(const char *alg_name, u32 type, u32 mask)
+{
+ type &= ~CRYPTO_ALG_TYPE_MASK;
+ type |= CRYPTO_ALG_TYPE_CIPHER;
+ mask |= CRYPTO_ALG_TYPE_MASK;
+
+ return crypto_has_alg(alg_name, type, mask);
+}
+
+static inline struct cipher_tfm *crypto_cipher_crt(struct crypto_cipher *tfm)
+{
+ return &crypto_cipher_tfm(tfm)->crt_cipher;
+}
+
+/**
+ * crypto_cipher_blocksize() - obtain block size for cipher
+ * @tfm: cipher handle
+ *
+ * The block size for the single block cipher referenced with the cipher handle
+ * tfm is returned. The caller may use that information to allocate appropriate
+ * memory for the data returned by the encryption or decryption operation
+ *
+ * Return: block size of cipher
+ */
+static inline unsigned int crypto_cipher_blocksize(struct crypto_cipher *tfm)
+{
+ return crypto_tfm_alg_blocksize(crypto_cipher_tfm(tfm));
+}
+
+static inline unsigned int crypto_cipher_alignmask(struct crypto_cipher *tfm)
+{
+ return crypto_tfm_alg_alignmask(crypto_cipher_tfm(tfm));
+}
+
+static inline u32 crypto_cipher_get_flags(struct crypto_cipher *tfm)
+{
+ return crypto_tfm_get_flags(crypto_cipher_tfm(tfm));
+}
+
+static inline void crypto_cipher_set_flags(struct crypto_cipher *tfm,
+ u32 flags)
+{
+ crypto_tfm_set_flags(crypto_cipher_tfm(tfm), flags);
+}
+
+static inline void crypto_cipher_clear_flags(struct crypto_cipher *tfm,
+ u32 flags)
+{
+ crypto_tfm_clear_flags(crypto_cipher_tfm(tfm), flags);
+}
+
+/**
+ * crypto_cipher_setkey() - set key for cipher
+ * @tfm: cipher handle
+ * @key: buffer holding the key
+ * @keylen: length of the key in bytes
+ *
+ * The caller provided key is set for the single block cipher referenced by the
+ * cipher handle.
+ *
+ * Note, the key length determines the cipher variant. Many block ciphers come
+ * in different variants depending on the key size, such as AES-128 vs. AES-192
+ * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
+ * is used.
+ *
+ * Return: 0 if the setting of the key was successful; < 0 if an error occurred
+ */
+static inline int crypto_cipher_setkey(struct crypto_cipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return crypto_cipher_crt(tfm)->cit_setkey(crypto_cipher_tfm(tfm),
+ key, keylen);
+}
+
+/**
+ * crypto_cipher_encrypt_one() - encrypt one block of plaintext
+ * @tfm: cipher handle
+ * @dst: points to the buffer that will be filled with the ciphertext
+ * @src: buffer holding the plaintext to be encrypted
+ *
+ * Invoke the encryption operation of one block. The caller must ensure that
+ * the plaintext and ciphertext buffers are at least one block in size.
+ */
+static inline void crypto_cipher_encrypt_one(struct crypto_cipher *tfm,
+ u8 *dst, const u8 *src)
+{
+ crypto_cipher_crt(tfm)->cit_encrypt_one(crypto_cipher_tfm(tfm),
+ dst, src);
+}
+
+/**
+ * crypto_cipher_decrypt_one() - decrypt one block of ciphertext
+ * @tfm: cipher handle
+ * @dst: points to the buffer that will be filled with the plaintext
+ * @src: buffer holding the ciphertext to be decrypted
+ *
+ * Invoke the decryption operation of one block. The caller must ensure that
+ * the plaintext and ciphertext buffers are at least one block in size.
+ */
+static inline void crypto_cipher_decrypt_one(struct crypto_cipher *tfm,
+ u8 *dst, const u8 *src)
+{
+ crypto_cipher_crt(tfm)->cit_decrypt_one(crypto_cipher_tfm(tfm),
+ dst, src);
+}
+
+#endif /* _LINUX_CRYPTO_H */
+
diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
new file mode 100644
index 0000000..8dfcb83
--- /dev/null
+++ b/include/linux/cryptohash.h
@@ -0,0 +1,20 @@
+#ifndef __CRYPTOHASH_H
+#define __CRYPTOHASH_H
+
+#include <linux/types.h>
+
+#define SHA_DIGEST_WORDS 5
+#define SHA_MESSAGE_BYTES (512 /*bits*/ / 8)
+#define SHA_WORKSPACE_WORDS 16
+
+void sha_init(__u32 *buf);
+void sha_transform(__u32 *digest, const char *data, __u32 *W);
+
+#define MD5_DIGEST_WORDS 4
+#define MD5_MESSAGE_BYTES 64
+
+void md5_transform(__u32 *hash, __u32 const *in);
+
+__u32 half_md4_transform(__u32 buf[4], __u32 const in[8]);
+
+#endif
diff --git a/include/linux/ctype.h b/include/linux/ctype.h
new file mode 100644
index 0000000..26b7de5
--- /dev/null
+++ b/include/linux/ctype.h
@@ -0,0 +1,2 @@
+
+#include <ctype.h>
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
new file mode 100644
index 0000000..b569b2c
--- /dev/null
+++ b/include/linux/dcache.h
@@ -0,0 +1,31 @@
+#ifndef __LINUX_DCACHE_H
+#define __LINUX_DCACHE_H
+
+struct super_block;
+struct inode;
+
+/* The hash is always the low bits of hash_len */
+#ifdef __LITTLE_ENDIAN
+ #define HASH_LEN_DECLARE u32 hash; u32 len
+#else
+ #define HASH_LEN_DECLARE u32 len; u32 hash
+#endif
+
+struct qstr {
+ union {
+ struct {
+ HASH_LEN_DECLARE;
+ };
+ u64 hash_len;
+ };
+ const unsigned char *name;
+};
+
+#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
+
+struct dentry {
+ struct super_block *d_sb;
+ struct inode *d_inode;
+};
+
+#endif /* __LINUX_DCACHE_H */
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
new file mode 100644
index 0000000..4db5b3f
--- /dev/null
+++ b/include/linux/debugfs.h
@@ -0,0 +1,243 @@
+/*
+ * debugfs.h - a tiny little debug file system
+ *
+ * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
+ * Copyright (C) 2004 IBM Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * debugfs is for people to use instead of /proc or /sys.
+ * See Documentation/DocBook/filesystems for more details.
+ */
+
+#ifndef _DEBUGFS_H_
+#define _DEBUGFS_H_
+
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+struct device;
+struct file_operations;
+struct vfsmount;
+struct srcu_struct;
+
+struct debugfs_blob_wrapper {
+ void *data;
+ unsigned long size;
+};
+
+struct debugfs_reg32 {
+ char *name;
+ unsigned long offset;
+};
+
+struct debugfs_regset32 {
+ const struct debugfs_reg32 *regs;
+ int nregs;
+ void __iomem *base;
+};
+
+extern struct dentry *arch_debugfs_dir;
+
+extern struct srcu_struct debugfs_srcu;
+
+#include <linux/err.h>
+
+static inline struct dentry *debugfs_create_file(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops,
+ loff_t file_size)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_dir(const char *name,
+ struct dentry *parent)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_symlink(const char *name,
+ struct dentry *parent,
+ const char *dest)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_automount(const char *name,
+ struct dentry *parent,
+ struct vfsmount *(*f)(void *),
+ void *data)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline void debugfs_remove(struct dentry *dentry)
+{ }
+
+static inline void debugfs_remove_recursive(struct dentry *dentry)
+{ }
+
+static inline int debugfs_use_file_start(const struct dentry *dentry,
+ int *srcu_idx)
+ __acquires(&debugfs_srcu)
+{
+ return 0;
+}
+
+static inline void debugfs_use_file_finish(int srcu_idx)
+ __releases(&debugfs_srcu)
+{ }
+
+#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \
+ static const struct file_operations __fops = { 0 }
+
+static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
+ struct dentry *new_dir, char *new_name)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_u8(const char *name, umode_t mode,
+ struct dentry *parent,
+ u8 *value)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_u16(const char *name, umode_t mode,
+ struct dentry *parent,
+ u16 *value)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_u32(const char *name, umode_t mode,
+ struct dentry *parent,
+ u32 *value)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_u64(const char *name, umode_t mode,
+ struct dentry *parent,
+ u64 *value)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_x8(const char *name, umode_t mode,
+ struct dentry *parent,
+ u8 *value)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_x16(const char *name, umode_t mode,
+ struct dentry *parent,
+ u16 *value)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_x32(const char *name, umode_t mode,
+ struct dentry *parent,
+ u32 *value)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_x64(const char *name, umode_t mode,
+ struct dentry *parent,
+ u64 *value)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
+ struct dentry *parent,
+ size_t *value)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
+ struct dentry *parent, atomic_t *value)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_bool(const char *name, umode_t mode,
+ struct dentry *parent,
+ bool *value)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode,
+ struct dentry *parent,
+ struct debugfs_blob_wrapper *blob)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_regset32(const char *name,
+ umode_t mode, struct dentry *parent,
+ struct debugfs_regset32 *regset)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
+ int nregs, void __iomem *base, char *prefix)
+{
+}
+
+static inline bool debugfs_initialized(void)
+{
+ return false;
+}
+
+static inline struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+ struct dentry *parent,
+ u32 *array, u32 elements)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline struct dentry *debugfs_create_devm_seqfile(struct device *dev,
+ const char *name,
+ struct dentry *parent,
+ int (*read_fn)(struct seq_file *s,
+ void *data))
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline ssize_t debugfs_read_file_bool(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ return -ENODEV;
+}
+
+static inline ssize_t debugfs_write_file_bool(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ return -ENODEV;
+}
+
+#endif
diff --git a/include/linux/device.h b/include/linux/device.h
new file mode 100644
index 0000000..2b2b849
--- /dev/null
+++ b/include/linux/device.h
@@ -0,0 +1,40 @@
+#ifndef _DEVICE_H_
+#define _DEVICE_H_
+
+#include <linux/slab.h>
+#include <linux/types.h>
+
+struct module;
+
+struct class {
+};
+
+static inline void class_destroy(struct class *class)
+{
+ kfree(class);
+}
+
+static inline struct class * __must_check class_create(struct module *owner,
+ const char *name)
+{
+ return kzalloc(sizeof(struct class), GFP_KERNEL);
+}
+
+struct device {
+};
+
+static inline void device_unregister(struct device *dev)
+{
+ kfree(dev);
+}
+
+static inline void device_destroy(struct class *cls, dev_t devt) {}
+
+static inline struct device *device_create(struct class *cls, struct device *parent,
+ dev_t devt, void *drvdata,
+ const char *fmt, ...)
+{
+ return kzalloc(sizeof(struct device), GFP_KERNEL);
+}
+
+#endif /* _DEVICE_H_ */
diff --git a/include/linux/dynamic_fault.h b/include/linux/dynamic_fault.h
new file mode 100644
index 0000000..dd215dc
--- /dev/null
+++ b/include/linux/dynamic_fault.h
@@ -0,0 +1,7 @@
+#ifndef __TOOLS_LINUX_DYNAMIC_FAULT_H
+#define __TOOLS_LINUX_DYNAMIC_FAULT_H
+
+#define dynamic_fault(_class) 0
+#define race_fault() 0
+
+#endif /* __TOOLS_LINUX_DYNAMIC_FAULT_H */
diff --git a/include/linux/err.h b/include/linux/err.h
new file mode 100644
index 0000000..e94bdff
--- /dev/null
+++ b/include/linux/err.h
@@ -0,0 +1,68 @@
+#ifndef __TOOLS_LINUX_ERR_H
+#define __TOOLS_LINUX_ERR_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+#include <asm/errno.h>
+
+/*
+ * Original kernel header comment:
+ *
+ * Kernel pointers have redundant information, so we can use a
+ * scheme where we can return either an error code or a normal
+ * pointer with the same return value.
+ *
+ * This should be a per-architecture thing, to allow different
+ * error and pointer decisions.
+ *
+ * Userspace note:
+ * The same principle works for userspace, because 'error' pointers
+ * fall down to the unused hole far from user space, as described
+ * in Documentation/x86/x86_64/mm.txt for x86_64 arch:
+ *
+ * 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm hole caused by [48:63] sign extension
+ * ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
+ *
+ * It should be the same case for other architectures, because
+ * this code is used in generic kernel code.
+ */
+#define MAX_ERRNO 4095
+
+#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
+
+static inline void * __must_check ERR_PTR(long error_)
+{
+ return (void *) error_;
+}
+
+static inline long __must_check PTR_ERR(__force const void *ptr)
+{
+ return (long) ptr;
+}
+
+static inline bool __must_check IS_ERR(__force const void *ptr)
+{
+ return IS_ERR_VALUE((unsigned long)ptr);
+}
+
+static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
+{
+ return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr);
+}
+
+static inline void * __must_check ERR_CAST(__force const void *ptr)
+{
+ /* cast away the const */
+ return (void *) ptr;
+}
+
+static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
+{
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ else
+ return 0;
+}
+
+#endif /* __TOOLS_LINUX_ERR_H */
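
These helpers work unchanged in userspace because the last 4095 addresses are never valid pointers, so an errno fits in the pointer itself. A minimal sketch (function names are hypothetical):

#include <linux/err.h>

static int backing_object;

/* Failure is encoded directly in the returned pointer. */
static void *open_backing(const char *path)
{
	if (!path)
		return ERR_PTR(-EINVAL);

	return &backing_object;		/* stand-in for a real object */
}

static int use_backing(const char *path)
{
	void *b = open_backing(path);

	if (IS_ERR(b))
		return PTR_ERR(b);	/* e.g. -EINVAL */

	return 0;
}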
diff --git a/include/linux/export.h b/include/linux/export.h
new file mode 100644
index 0000000..af9da96
--- /dev/null
+++ b/include/linux/export.h
@@ -0,0 +1,13 @@
+#ifndef _TOOLS_LINUX_EXPORT_H_
+#define _TOOLS_LINUX_EXPORT_H_
+
+#define EXPORT_SYMBOL(sym)
+#define EXPORT_SYMBOL_GPL(sym)
+#define EXPORT_SYMBOL_GPL_FUTURE(sym)
+#define EXPORT_UNUSED_SYMBOL(sym)
+#define EXPORT_UNUSED_SYMBOL_GPL(sym)
+
+#define THIS_MODULE ((struct module *)0)
+#define KBUILD_MODNAME
+
+#endif
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
new file mode 100644
index 0000000..2b76d8c
--- /dev/null
+++ b/include/linux/freezer.h
@@ -0,0 +1,7 @@
+#ifndef __TOOLS_LINUX_FREEZER_H
+#define __TOOLS_LINUX_FREEZER_H
+
+#define try_to_freeze()
+#define set_freezable()
+
+#endif /* __TOOLS_LINUX_FREEZER_H */
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
new file mode 100644
index 0000000..1a951e9
--- /dev/null
+++ b/include/linux/generic-radix-tree.h
@@ -0,0 +1,137 @@
+#ifndef _LINUX_GENERIC_RADIX_TREE_H
+#define _LINUX_GENERIC_RADIX_TREE_H
+
+/*
+ * Generic radix trees/sparse arrays:
+ *
+ * A generic radix tree has all nodes of size PAGE_SIZE - both leaves and
+ * interior nodes.
+ */
+
+#include <linux/page.h>
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+
+struct genradix_node;
+
+struct __genradix {
+ struct genradix_node *root;
+ size_t depth;
+};
+
+/*
+ * NOTE: currently, sizeof(_type) must be a power of two and not larger than
+ * PAGE_SIZE:
+ */
+
+#define __GENRADIX_INITIALIZER \
+ { \
+ .tree = { \
+ .root = NULL, \
+ .depth = 0, \
+ } \
+ }
+
+/*
+ * We use a 0 size array to stash the type we're storing without taking any
+ * space at runtime - then the various accessor macros can use typeof() to get
+ * to it for casts/sizeof - we also force the alignment so that storing a type
+ * with a ridiculous alignment doesn't blow up the alignment or size of the
+ * genradix.
+ */
+
+#define DECLARE_GENRADIX_TYPE(_name, _type) \
+struct _name { \
+ struct __genradix tree; \
+ _type type[0] __aligned(1); \
+}
+
+#define DECLARE_GENRADIX(_name, _type) \
+struct { \
+ struct __genradix tree; \
+ _type type[0] __aligned(1); \
+} _name
+
+#define DEFINE_GENRADIX(_name, _type) \
+ DECLARE_GENRADIX(_name, _type) = __GENRADIX_INITIALIZER
+
+#define genradix_init(_radix) \
+do { \
+ *(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER; \
+} while (0)
+
+void __genradix_free(struct __genradix *);
+
+#define genradix_free(_radix) __genradix_free(&(_radix)->tree)
+
+static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
+{
+ BUILD_BUG_ON(obj_size > PAGE_SIZE);
+
+ if (!is_power_of_2(obj_size)) {
+ size_t objs_per_page = PAGE_SIZE / obj_size;
+
+ return (idx / objs_per_page) * PAGE_SIZE +
+ (idx % objs_per_page) * obj_size;
+ } else {
+ return idx * obj_size;
+ }
+}
+
+#define __genradix_cast(_radix) (typeof((_radix)->type[0]) *)
+#define __genradix_obj_size(_radix) sizeof((_radix)->type[0])
+#define __genradix_idx_to_offset(_radix, _idx) \
+ __idx_to_offset(_idx, __genradix_obj_size(_radix))
+
+void *__genradix_ptr(struct __genradix *, size_t);
+
+/* Returns a pointer to element at @_idx */
+#define genradix_ptr(_radix, _idx) \
+ (__genradix_cast(_radix) \
+ __genradix_ptr(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)))
+
+void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
+
+/* Returns a pointer to element at @_idx, allocating it if necessary */
+#define genradix_ptr_alloc(_radix, _idx, _gfp) \
+ (__genradix_cast(_radix) \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ _gfp))
+
+struct genradix_iter {
+ size_t offset;
+ size_t pos;
+};
+
+static inline void genradix_iter_init(struct genradix_iter *iter)
+{
+ iter->offset = 0;
+ iter->pos = 0;
+}
+
+void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
+
+#define genradix_iter_peek(_iter, _radix) \
+ (__genradix_cast(_radix) \
+ __genradix_iter_peek(_iter, &(_radix)->tree, \
+ PAGE_SIZE / __genradix_obj_size(_radix)))
+
+static inline void __genradix_iter_advance(struct genradix_iter *iter,
+ size_t obj_size)
+{
+ iter->offset += obj_size;
+
+ if (!is_power_of_2(obj_size) &&
+ (iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE)
+ iter->offset = round_up(iter->offset, PAGE_SIZE);
+
+ iter->pos++;
+}
+
+#define genradix_iter_advance(_iter, _radix) \
+ __genradix_iter_advance(_iter, __genradix_obj_size(_radix))
+
+#endif /* _LINUX_GENERIC_RADIX_TREE_H */
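
A usage sketch with a hypothetical element type; it assumes the out-of-line __genradix_* functions declared above are implemented in a companion .c file in this patch, and that the shim's <linux/slab.h> provides GFP_KERNEL. Note that sizeof(struct entry) is a power of two, as required by the header.

#include <linux/generic-radix-tree.h>
#include <linux/slab.h>		/* GFP_KERNEL (shim) */
#include <errno.h>

struct entry {
	u64	seq;
	u64	flags;
};

static DEFINE_GENRADIX(entries, struct entry);

static int entry_set(size_t idx, u64 seq)
{
	/* Allocates interior nodes and the leaf page on demand. */
	struct entry *e = genradix_ptr_alloc(&entries, idx, GFP_KERNEL);

	if (!e)
		return -ENOMEM;

	e->seq = seq;
	return 0;
}

static struct entry *entry_lookup(size_t idx)
{
	/* Assumed to return NULL when nothing was allocated at @idx. */
	return genradix_ptr(&entries, idx);
}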
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/include/linux/genhd.h
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
new file mode 100644
index 0000000..3830bc2
--- /dev/null
+++ b/include/linux/gfp.h
@@ -0,0 +1 @@
+#include <linux/slab.h>
diff --git a/include/linux/hash.h b/include/linux/hash.h
new file mode 100644
index 0000000..ad6fa21
--- /dev/null
+++ b/include/linux/hash.h
@@ -0,0 +1,104 @@
+#ifndef _LINUX_HASH_H
+#define _LINUX_HASH_H
+/* Fast hashing routine for ints, longs and pointers.
+ (C) 2002 Nadia Yvette Chambers, IBM */
+
+#include <asm/types.h>
+#include <linux/compiler.h>
+
+/*
+ * The "GOLDEN_RATIO_PRIME" is used in ifs/btrfs/brtfs_inode.h and
+ * fs/inode.c. It's not actually prime any more (the previous primes
+ * were actively bad for hashing), but the name remains.
+ */
+#if BITS_PER_LONG == 32
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
+#define hash_long(val, bits) hash_32(val, bits)
+#elif BITS_PER_LONG == 64
+#define hash_long(val, bits) hash_64(val, bits)
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
+#else
+#error Wordsize not 32 or 64
+#endif
+
+/*
+ * This hash multiplies the input by a large odd number and takes the
+ * high bits. Since multiplication propagates changes to the most
+ * significant end only, it is essential that the high bits of the
+ * product be used for the hash value.
+ *
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * Although a random odd number will do, it turns out that the golden
+ * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
+ * properties. (See Knuth vol 3, section 6.4, exercise 9.)
+ *
+ * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
+ * which is very slightly easier to multiply by and makes no
+ * difference to the hash distribution.
+ */
+#define GOLDEN_RATIO_32 0x61C88647
+#define GOLDEN_RATIO_64 0x61C8864680B583EBull
+
+#ifdef CONFIG_HAVE_ARCH_HASH
+/* This header may use the GOLDEN_RATIO_xx constants */
+#include <asm/hash.h>
+#endif
+
+/*
+ * The _generic versions exist only so lib/test_hash.c can compare
+ * the arch-optimized versions with the generic.
+ *
+ * Note that if you change these, any <asm/hash.h> that aren't updated
+ * to match need to have their HAVE_ARCH_* define values updated so the
+ * self-test will not false-positive.
+ */
+#ifndef HAVE_ARCH__HASH_32
+#define __hash_32 __hash_32_generic
+#endif
+static inline u32 __hash_32_generic(u32 val)
+{
+ return val * GOLDEN_RATIO_32;
+}
+
+#ifndef HAVE_ARCH_HASH_32
+#define hash_32 hash_32_generic
+#endif
+static inline u32 hash_32_generic(u32 val, unsigned int bits)
+{
+ /* High bits are more random, so use them. */
+ return __hash_32(val) >> (32 - bits);
+}
+
+#ifndef HAVE_ARCH_HASH_64
+#define hash_64 hash_64_generic
+#endif
+static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
+{
+#if BITS_PER_LONG == 64
+ /* 64x64-bit multiply is efficient on all 64-bit processors */
+ return val * GOLDEN_RATIO_64 >> (64 - bits);
+#else
+ /* Hash 64 bits using only 32x32-bit multiply. */
+ return hash_32((u32)val ^ __hash_32(val >> 32), bits);
+#endif
+}
+
+static inline u32 hash_ptr(const void *ptr, unsigned int bits)
+{
+ return hash_long((unsigned long)ptr, bits);
+}
+
+/* This really should be called fold32_ptr; it does no hashing to speak of. */
+static inline u32 hash32_ptr(const void *ptr)
+{
+ unsigned long val = (unsigned long)ptr;
+
+#if BITS_PER_LONG == 64
+ val ^= (val >> 32);
+#endif
+ return (u32)val;
+}
+
+#endif /* _LINUX_HASH_H */
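
Typical use is bucketing into a table of 2^bits entries; a tiny sketch (names are hypothetical):

#include <linux/hash.h>

#define CACHE_HASH_BITS	10			/* 1024 buckets */

static unsigned inode_hash_bucket(u64 inum)
{
	return hash_64(inum, CACHE_HASH_BITS);	/* 0 .. 1023 */
}

static unsigned object_hash_bucket(const void *obj)
{
	return hash_ptr(obj, CACHE_HASH_BITS);
}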
diff --git a/include/linux/idr.h b/include/linux/idr.h
new file mode 100644
index 0000000..6f92825
--- /dev/null
+++ b/include/linux/idr.h
@@ -0,0 +1,208 @@
+/*
+ * include/linux/idr.h
+ *
+ * 2002-10-18 written by Jim Houston jim.houston@ccur.com
+ * Copyright (C) 2002 by Concurrent Computer Corporation
+ * Distributed under the GNU GPL license version 2.
+ *
+ * Small id to pointer translation service avoiding fixed sized
+ * tables.
+ */
+
+#ifndef __IDR_H__
+#define __IDR_H__
+
+#include <linux/types.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+
+/*
+ * We want shallower trees and thus more bits covered at each layer. 8
+ * bits gives us a large enough first layer for most use cases and a maximum
+ * tree depth of 4. Each idr_layer is slightly larger than 2k on 64bit and
+ * 1k on 32bit.
+ */
+#define IDR_BITS 8
+#define IDR_SIZE (1 << IDR_BITS)
+#define IDR_MASK ((1 << IDR_BITS)-1)
+
+struct idr_layer {
+ int prefix; /* the ID prefix of this idr_layer */
+ int layer; /* distance from leaf */
+ struct idr_layer __rcu *ary[1<<IDR_BITS];
+ int count; /* When zero, we can release it */
+ union {
+ /* A zero bit means "space here" */
+ DECLARE_BITMAP(bitmap, IDR_SIZE);
+ struct rcu_head rcu_head;
+ };
+};
+
+struct idr {
+ struct idr_layer __rcu *hint; /* the last layer allocated from */
+ struct idr_layer __rcu *top;
+ int layers; /* only valid w/o concurrent changes */
+ int cur; /* current pos for cyclic allocation */
+ spinlock_t lock;
+ int id_free_cnt;
+ struct idr_layer *id_free;
+};
+
+#define IDR_INIT(name) \
+{ \
+ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
+}
+#define DEFINE_IDR(name) struct idr name = IDR_INIT(name)
+
+/**
+ * DOC: idr sync
+ * idr synchronization (stolen from radix-tree.h)
+ *
+ * idr_find() can be called locklessly, using RCU. The caller must
+ * ensure calls to this function are made within rcu_read_lock() regions.
+ * Other readers (lock-free or otherwise) and modifications may be running
+ * concurrently.
+ *
+ * It is still required that the caller manage the synchronization and
+ * lifetimes of the items. So if RCU lock-free lookups are used, typically
+ * this would mean that the items have their own locks, or are amenable to
+ * lock-free access; and that the items are freed by RCU (or only freed after
+ * having been deleted from the idr tree *and* a synchronize_rcu() grace
+ * period).
+ */
+
+/*
+ * This is what we export.
+ */
+
+void *idr_find_slowpath(struct idr *idp, int id);
+void idr_preload(gfp_t gfp_mask);
+
+static inline int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask)
+{
+ return 0;
+}
+
+static inline void idr_remove(struct idr *idp, int id) {}
+
+int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask);
+int idr_for_each(struct idr *idp,
+ int (*fn)(int id, void *p, void *data), void *data);
+void *idr_get_next(struct idr *idp, int *nextid);
+void *idr_replace(struct idr *idp, void *ptr, int id);
+void idr_destroy(struct idr *idp);
+void idr_init(struct idr *idp);
+bool idr_is_empty(struct idr *idp);
+
+/**
+ * idr_preload_end - end preload section started with idr_preload()
+ *
+ * Each idr_preload() should be matched with an invocation of this
+ * function. See idr_preload() for details.
+ */
+static inline void idr_preload_end(void)
+{
+ preempt_enable();
+}
+
+/**
+ * idr_find - return pointer for given id
+ * @idr: idr handle
+ * @id: lookup key
+ *
+ * Return the pointer given the id it has been registered with. A %NULL
+ * return indicates that @id is not valid or you passed %NULL in
+ * idr_get_new().
+ *
+ * This function can be called under rcu_read_lock(), given that the leaf
+ * pointers' lifetimes are correctly managed.
+ */
+static inline void *idr_find(struct idr *idr, int id)
+{
+ struct idr_layer *hint = rcu_dereference_raw(idr->hint);
+
+ if (hint && (id & ~IDR_MASK) == hint->prefix)
+ return rcu_dereference_raw(hint->ary[id & IDR_MASK]);
+
+ return idr_find_slowpath(idr, id);
+}
+
+/**
+ * idr_for_each_entry - iterate over an idr's elements of a given type
+ * @idp: idr handle
+ * @entry: the type * to use as cursor
+ * @id: id entry's key
+ *
+ * @entry and @id do not need to be initialized before the loop, and
+ * after normal termination @entry is left with the value NULL. This
+ * is convenient for a "not found" value.
+ */
+#define idr_for_each_entry(idp, entry, id) \
+ for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
+
+/**
+ * idr_for_each_entry_continue - continue iteration over an idr's elements of a given type
+ * @idp: idr handle
+ * @entry: the type * to use as cursor
+ * @id: id entry's key
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define idr_for_each_entry_continue(idp, entry, id) \
+ for ((entry) = idr_get_next((idp), &(id)); \
+ entry; \
+ ++id, (entry) = idr_get_next((idp), &(id)))
+
+/*
+ * IDA - IDR based id allocator, use when translation from id to
+ * pointer isn't necessary.
+ *
+ * IDA_BITMAP_LONGS is calculated to be one less to accommodate
+ * ida_bitmap->nr_busy so that the whole struct fits in 128 bytes.
+ */
+#define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */
+#define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long) - 1)
+#define IDA_BITMAP_BITS (IDA_BITMAP_LONGS * sizeof(long) * 8)
+
+struct ida_bitmap {
+ long nr_busy;
+ unsigned long bitmap[IDA_BITMAP_LONGS];
+};
+
+struct ida {
+ struct idr idr;
+ struct ida_bitmap *free_bitmap;
+};
+
+#define IDA_INIT(name) { .idr = IDR_INIT((name).idr), .free_bitmap = NULL, }
+#define DEFINE_IDA(name) struct ida name = IDA_INIT(name)
+
+int ida_pre_get(struct ida *ida, gfp_t gfp_mask);
+int ida_get_new_above(struct ida *ida, int starting_id, int *p_id);
+void ida_remove(struct ida *ida, int id);
+void ida_destroy(struct ida *ida);
+void ida_init(struct ida *ida);
+
+int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
+ gfp_t gfp_mask);
+void ida_simple_remove(struct ida *ida, unsigned int id);
+
+/**
+ * ida_get_new - allocate new ID
+ * @ida: idr handle
+ * @p_id: pointer to the allocated handle
+ *
+ * Simple wrapper around ida_get_new_above() w/ @starting_id of zero.
+ */
+static inline int ida_get_new(struct ida *ida, int *p_id)
+{
+ return ida_get_new_above(ida, 0, p_id);
+}
+
+void __init idr_init_cache(void);
+
+#endif /* __IDR_H__ */
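
For the userspace build the IDA side is the useful half; a sketch with hypothetical names, assuming the ida_* functions declared above are implemented in a companion .c file and that the shim's <linux/slab.h> provides GFP_KERNEL:

#include <linux/idr.h>
#include <linux/slab.h>

static DEFINE_IDA(bch_minor_ida);

/* Returns a unique id in [0, 256), or a negative errno. */
static int bch_minor_get(void)
{
	return ida_simple_get(&bch_minor_ida, 0, 256, GFP_KERNEL);
}

static void bch_minor_put(int minor)
{
	ida_simple_remove(&bch_minor_ida, minor);
}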
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
new file mode 100644
index 0000000..822c64a
--- /dev/null
+++ b/include/linux/ioprio.h
@@ -0,0 +1,46 @@
+#ifndef IOPRIO_H
+#define IOPRIO_H
+
+/*
+ * Gives us 8 prio classes with 13 bits of data for each class
+ */
+#define IOPRIO_BITS (16)
+#define IOPRIO_CLASS_SHIFT (13)
+#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1)
+
+#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT)
+#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK)
+#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data)
+
+#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE)
+
+/*
+ * These are the io priority groups as implemented by CFQ. RT is the realtime
+ * class, it always gets premium service. BE is the best-effort scheduling
+ * class, the default for any process. IDLE is the idle scheduling class, it
+ * is only served when no one else is using the disk.
+ */
+enum {
+ IOPRIO_CLASS_NONE,
+ IOPRIO_CLASS_RT,
+ IOPRIO_CLASS_BE,
+ IOPRIO_CLASS_IDLE,
+};
+
+/*
+ * 8 best effort priority levels are supported
+ */
+#define IOPRIO_BE_NR (8)
+
+enum {
+ IOPRIO_WHO_PROCESS = 1,
+ IOPRIO_WHO_PGRP,
+ IOPRIO_WHO_USER,
+};
+
+/*
+ * Fallback BE priority
+ */
+#define IOPRIO_NORM (4)
+
+#endif
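
The macros pack a 3-bit class and 13 bits of data into one integer; a small sketch (helper name hypothetical) showing that the value round-trips:

#include <linux/ioprio.h>
#include <linux/bug.h>

static inline int default_write_ioprio(void)
{
	int prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 2);

	/* Class and data come back out of the packed value unchanged. */
	BUG_ON(IOPRIO_PRIO_CLASS(prio) != IOPRIO_CLASS_BE);
	BUG_ON(IOPRIO_PRIO_DATA(prio) != 2);

	return prio;
}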
diff --git a/include/linux/jhash.h b/include/linux/jhash.h
new file mode 100644
index 0000000..348c6f4
--- /dev/null
+++ b/include/linux/jhash.h
@@ -0,0 +1,175 @@
+#ifndef _LINUX_JHASH_H
+#define _LINUX_JHASH_H
+
+/* jhash.h: Jenkins hash support.
+ *
+ * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
+ *
+ * http://burtleburtle.net/bob/hash/
+ *
+ * These are the credits from Bob's sources:
+ *
+ * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ *
+ * These are functions for producing 32-bit hashes for hash table lookup.
+ * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+ * are externally useful functions. Routines to test the hash are included
+ * if SELF_TEST is defined. You can use this free for any purpose. It's in
+ * the public domain. It has no warranty.
+ *
+ * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
+ *
+ * I've modified Bob's hash to be useful in the Linux kernel, and
+ * any bugs present are my fault.
+ * Jozsef
+ */
+#include <linux/bitops.h>
+#include <linux/unaligned/packed_struct.h>
+
+/* Best hash sizes are powers of two */
+#define jhash_size(n) ((u32)1<<(n))
+/* Mask the hash value, i.e. (value & jhash_mask(n)) instead of (value % n) */
+#define jhash_mask(n) (jhash_size(n)-1)
+
+/* __jhash_mix -- mix 3 32-bit values reversibly. */
+#define __jhash_mix(a, b, c) \
+{ \
+ a -= c; a ^= rol32(c, 4); c += b; \
+ b -= a; b ^= rol32(a, 6); a += c; \
+ c -= b; c ^= rol32(b, 8); b += a; \
+ a -= c; a ^= rol32(c, 16); c += b; \
+ b -= a; b ^= rol32(a, 19); a += c; \
+ c -= b; c ^= rol32(b, 4); b += a; \
+}
+
+/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
+#define __jhash_final(a, b, c) \
+{ \
+ c ^= b; c -= rol32(b, 14); \
+ a ^= c; a -= rol32(c, 11); \
+ b ^= a; b -= rol32(a, 25); \
+ c ^= b; c -= rol32(b, 16); \
+ a ^= c; a -= rol32(c, 4); \
+ b ^= a; b -= rol32(a, 14); \
+ c ^= b; c -= rol32(b, 24); \
+}
+
+/* An arbitrary initial parameter */
+#define JHASH_INITVAL 0xdeadbeef
+
+/* jhash - hash an arbitrary key
+ * @k: sequence of bytes as key
+ * @length: the length of the key
+ * @initval: the previous hash, or an arbitrary value
+ *
+ * The generic version, hashes an arbitrary sequence of bytes.
+ * No alignment or length assumptions are made about the input key.
+ *
+ * Returns the hash value of the key. The result depends on endianness.
+ */
+static inline u32 jhash(const void *key, u32 length, u32 initval)
+{
+ u32 a, b, c;
+ const u8 *k = key;
+
+ /* Set up the internal state */
+ a = b = c = JHASH_INITVAL + length + initval;
+
+ /* All but the last block: affect some 32 bits of (a,b,c) */
+ while (length > 12) {
+ a += __get_unaligned_cpu32(k);
+ b += __get_unaligned_cpu32(k + 4);
+ c += __get_unaligned_cpu32(k + 8);
+ __jhash_mix(a, b, c);
+ length -= 12;
+ k += 12;
+ }
+ /* Last block: affect all 32 bits of (c) */
+ /* All the case statements fall through */
+ switch (length) {
+ case 12: c += (u32)k[11]<<24;
+ case 11: c += (u32)k[10]<<16;
+ case 10: c += (u32)k[9]<<8;
+ case 9: c += k[8];
+ case 8: b += (u32)k[7]<<24;
+ case 7: b += (u32)k[6]<<16;
+ case 6: b += (u32)k[5]<<8;
+ case 5: b += k[4];
+ case 4: a += (u32)k[3]<<24;
+ case 3: a += (u32)k[2]<<16;
+ case 2: a += (u32)k[1]<<8;
+ case 1: a += k[0];
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break;
+ }
+
+ return c;
+}
+
+/* jhash2 - hash an array of u32's
+ * @k: the key which must be an array of u32's
+ * @length: the number of u32's in the key
+ * @initval: the previous hash, or an arbitrary value
+ *
+ * Returns the hash value of the key.
+ */
+static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
+{
+ u32 a, b, c;
+
+ /* Set up the internal state */
+ a = b = c = JHASH_INITVAL + (length<<2) + initval;
+
+ /* Handle most of the key */
+ while (length > 3) {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+ __jhash_mix(a, b, c);
+ length -= 3;
+ k += 3;
+ }
+
+ /* Handle the last 3 u32's: all the case statements fall through */
+ switch (length) {
+ case 3: c += k[2];
+ case 2: b += k[1];
+ case 1: a += k[0];
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break;
+ }
+
+ return c;
+}
+
+
+/* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
+static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
+{
+ a += initval;
+ b += initval;
+ c += initval;
+
+ __jhash_final(a, b, c);
+
+ return c;
+}
+
+static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
+{
+ return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2));
+}
+
+static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{
+ return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
+}
+
+static inline u32 jhash_1word(u32 a, u32 initval)
+{
+ return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2));
+}
+
+#endif /* _LINUX_JHASH_H */
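
A short sketch of both entry points (struct and names hypothetical): jhash() treats the key as an opaque byte string, while the fixed-width helpers skip the length loop for small keys.

#include <linux/jhash.h>

struct extent_key {
	u32	inode;
	u32	offset;
	u32	gen;
};

static u32 extent_key_hash(const struct extent_key *k, u32 seed)
{
	return jhash(k, sizeof(*k), seed);		/* arbitrary bytes */
}

static u32 inode_offset_hash(u32 inode, u32 offset, u32 seed)
{
	return jhash_2words(inode, offset, seed);	/* exactly two words */
}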
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
new file mode 100644
index 0000000..00abaee
--- /dev/null
+++ b/include/linux/jiffies.h
@@ -0,0 +1,451 @@
+#ifndef _LINUX_JIFFIES_H
+#define _LINUX_JIFFIES_H
+
+#include <linux/kernel.h>
+#include <linux/time64.h>
+#include <linux/typecheck.h>
+#include <linux/types.h>
+
+#define HZ 100
+
+#define MSEC_PER_SEC 1000L
+#define USEC_PER_MSEC 1000L
+#define NSEC_PER_USEC 1000L
+#define NSEC_PER_MSEC 1000000L
+#define USEC_PER_SEC 1000000L
+#define NSEC_PER_SEC 1000000000L
+#define FSEC_PER_SEC 1000000000000000LL
+
+/*
+ * The following defines establish the engineering parameters of the PLL
+ * model. The HZ variable establishes the timer interrupt frequency, 100 Hz
+ * for the SunOS kernel, 256 Hz for the Ultrix kernel and 1024 Hz for the
+ * OSF/1 kernel. The SHIFT_HZ define expresses the same value as the
+ * nearest power of two in order to avoid hardware multiply operations.
+ */
+#if HZ >= 12 && HZ < 24
+# define SHIFT_HZ 4
+#elif HZ >= 24 && HZ < 48
+# define SHIFT_HZ 5
+#elif HZ >= 48 && HZ < 96
+# define SHIFT_HZ 6
+#elif HZ >= 96 && HZ < 192
+# define SHIFT_HZ 7
+#elif HZ >= 192 && HZ < 384
+# define SHIFT_HZ 8
+#elif HZ >= 384 && HZ < 768
+# define SHIFT_HZ 9
+#elif HZ >= 768 && HZ < 1536
+# define SHIFT_HZ 10
+#elif HZ >= 1536 && HZ < 3072
+# define SHIFT_HZ 11
+#elif HZ >= 3072 && HZ < 6144
+# define SHIFT_HZ 12
+#elif HZ >= 6144 && HZ < 12288
+# define SHIFT_HZ 13
+#else
+# error Invalid value of HZ.
+#endif
+
+/* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can
+ * improve accuracy by shifting LSH bits, hence calculating:
+ * (NOM << LSH) / DEN
+ * This however means trouble for large NOM, because (NOM << LSH) may no
+ * longer fit in 32 bits. The following way of calculating this gives us
+ * some slack, under the following conditions:
+ * - (NOM / DEN) fits in (32 - LSH) bits.
+ * - (NOM % DEN) fits in (32 - LSH) bits.
+ */
+#define SH_DIV(NOM,DEN,LSH) ( (((NOM) / (DEN)) << (LSH)) \
+ + ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN))
+
+/* LATCH is used in the interval timer and ftape setup. */
+#define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */
+
+extern int register_refined_jiffies(long clock_tick_rate);
+
+/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */
+#define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ)
+
+/* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
+#define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)
+
+static inline u64 local_clock(void)
+{
+ struct timespec ts;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+
+ return ((s64) ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
+}
+
+extern unsigned long clock_t_to_jiffies(unsigned long x);
+extern u64 jiffies_64_to_clock_t(u64 x);
+extern u64 nsec_to_clock_t(u64 x);
+extern u64 nsecs_to_jiffies64(u64 n);
+extern unsigned long nsecs_to_jiffies(u64 n);
+
+static inline u64 get_jiffies_64(void)
+{
+ return nsecs_to_jiffies64(local_clock());
+}
+
+#define jiffies_64 get_jiffies_64()
+#define jiffies ((unsigned long) get_jiffies_64())
+
+/*
+ * These inlines deal with timer wrapping correctly. You are
+ * strongly encouraged to use them:
+ * 1. Because people otherwise forget
+ * 2. Because if the timer wrap changes in future you won't have to
+ * alter your driver code.
+ *
+ * time_after(a,b) returns true if the time a is after time b.
+ *
+ * Do this with "<0" and ">=0" to only test the sign of the result. A
+ * good compiler would generate better code (and a really good compiler
+ * wouldn't care). Gcc is currently neither.
+ */
+#define time_after(a,b) \
+ (typecheck(unsigned long, a) && \
+ typecheck(unsigned long, b) && \
+ ((long)((b) - (a)) < 0))
+#define time_before(a,b) time_after(b,a)
+
+#define time_after_eq(a,b) \
+ (typecheck(unsigned long, a) && \
+ typecheck(unsigned long, b) && \
+ ((long)((a) - (b)) >= 0))
+#define time_before_eq(a,b) time_after_eq(b,a)
+
+/*
+ * Calculate whether a is in the range of [b, c].
+ */
+#define time_in_range(a,b,c) \
+ (time_after_eq(a,b) && \
+ time_before_eq(a,c))
+
+/*
+ * Calculate whether a is in the range of [b, c).
+ */
+#define time_in_range_open(a,b,c) \
+ (time_after_eq(a,b) && \
+ time_before(a,c))
+
+/* Same as above, but does so with platform-independent 64bit types.
+ * These must be used when utilizing jiffies_64 (i.e. the return value of
+ * get_jiffies_64()). */
+#define time_after64(a,b) \
+ (typecheck(__u64, a) && \
+ typecheck(__u64, b) && \
+ ((__s64)((b) - (a)) < 0))
+#define time_before64(a,b) time_after64(b,a)
+
+#define time_after_eq64(a,b) \
+ (typecheck(__u64, a) && \
+ typecheck(__u64, b) && \
+ ((__s64)((a) - (b)) >= 0))
+#define time_before_eq64(a,b) time_after_eq64(b,a)
+
+#define time_in_range64(a, b, c) \
+ (time_after_eq64(a, b) && \
+ time_before_eq64(a, c))
+
+/*
+ * These four macros compare jiffies and 'a' for convenience.
+ */
+
+/* time_is_before_jiffies(a) return true if a is before jiffies */
+#define time_is_before_jiffies(a) time_after(jiffies, a)
+
+/* time_is_after_jiffies(a) return true if a is after jiffies */
+#define time_is_after_jiffies(a) time_before(jiffies, a)
+
+/* time_is_before_eq_jiffies(a) return true if a is before or equal to jiffies*/
+#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)
+
+/* time_is_after_eq_jiffies(a) return true if a is after or equal to jiffies*/
+#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)
+
+/*
+ * Have the 32 bit jiffies value wrap 5 minutes after boot
+ * so jiffies wrap bugs show up earlier.
+ */
+#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
+
+/*
+ * Change timeval to jiffies, trying to avoid the
+ * most obvious overflows..
+ *
+ * And some not so obvious.
+ *
+ * Note that we don't want to return LONG_MAX, because
+ * for various timeout reasons we often end up having
+ * to wait "jiffies+1" in order to guarantee that we wait
+ * at _least_ "jiffies" - so "jiffies+1" had better still
+ * be positive.
+ */
+#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1)
+
+extern unsigned long preset_lpj;
+
+/*
+ * We want to do realistic conversions of time so we need to use the same
+ * values the update wall clock code uses as the jiffies size. This value
+ * is: TICK_NSEC (which is defined in timex.h). This
+ * is a constant and is in nanoseconds. We will use scaled math
+ * with a set of scales defined here as SEC_JIFFIE_SC, USEC_JIFFIE_SC and
+ * NSEC_JIFFIE_SC. Note that these defines contain nothing but
+ * constants and so are computed at compile time. SHIFT_HZ (computed in
+ * timex.h) adjusts the scaling for different HZ values.
+
+ * Scaled math??? What is that?
+ *
+ * Scaled math is a way to do integer math on values that would,
+ * otherwise, either overflow, underflow, or cause undesired div
+ * instructions to appear in the execution path. In short, we "scale"
+ * up the operands so they take more bits (more precision, less
+ * underflow), do the desired operation and then "scale" the result back
+ * by the same amount. If we do the scaling by shifting we avoid the
+ * costly mpy and the dastardly div instructions.
+
+ * Suppose, for example, we want to convert from seconds to jiffies
+ * where jiffies is defined in nanoseconds as NSEC_PER_JIFFIE. The
+ * simple math is: jiff = (sec * NSEC_PER_SEC) / NSEC_PER_JIFFIE; We
+ * observe that (NSEC_PER_SEC / NSEC_PER_JIFFIE) is a constant which we
+ * might calculate at compile time, however, the result will only have
+ * about 3-4 bits of precision (less for smaller values of HZ).
+ *
+ * So, we scale as follows:
+ * jiff = (sec) * (NSEC_PER_SEC / NSEC_PER_JIFFIE);
+ * jiff = ((sec) * ((NSEC_PER_SEC * SCALE)/ NSEC_PER_JIFFIE)) / SCALE;
+ * Then we make SCALE a power of two so:
+ * jiff = ((sec) * ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) >> SCALE;
+ * Now we define:
+ * #define SEC_CONV = ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE))
+ * jiff = (sec * SEC_CONV) >> SCALE;
+ *
+ * Often the math we use will expand beyond 32-bits so we tell C how to
+ * do this and pass the 64-bit result of the mpy through the ">> SCALE"
+ * which should take the result back to 32-bits. We want this expansion
+ * to capture as much precision as possible. At the same time we don't
+ * want to overflow so we pick the SCALE to avoid this. In this file,
+ * that means using a different scale for each range of HZ values (as
+ * defined in timex.h).
+ *
+ * For those who want to know, gcc will give a 64-bit result from a "*"
+ * operator if the result is a long long AND at least one of the
+ * operands is cast to long long (usually just prior to the "*" so as
+ * not to confuse it into thinking it really has a 64-bit operand,
+ * which, by the way, it can do, but it takes more code and at least 2
+ * mpys).
+
+ * We also need to be aware that one second in nanoseconds is only a
+ * couple of bits away from overflowing a 32-bit word, so we MUST use
+ * 64-bits to get the full range time in nanoseconds.
+
+ */
+
+/*
+ * Here are the scales we will use. One for seconds, nanoseconds and
+ * microseconds.
+ *
+ * Within the limits of cpp we do a rough cut at the SEC_JIFFIE_SC and
+ * check if the sign bit is set. If not, we bump the shift count by 1.
+ * (Gets an extra bit of precision where we can use it.)
+ * We know it is set for HZ = 1024 and HZ = 100 not for 1000.
+ * Haven't tested others.
+
+ * Limits of cpp (for #if expressions) only long (no long long), but
+ * then we only need the most significant bit.
+ */
+
+#define SEC_JIFFIE_SC (31 - SHIFT_HZ)
+#if !((((NSEC_PER_SEC << 2) / TICK_NSEC) << (SEC_JIFFIE_SC - 2)) & 0x80000000)
+#undef SEC_JIFFIE_SC
+#define SEC_JIFFIE_SC (32 - SHIFT_HZ)
+#endif
+#define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29)
+#define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\
+ TICK_NSEC -1) / (u64)TICK_NSEC))
+
+#define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\
+ TICK_NSEC -1) / (u64)TICK_NSEC))
+/*
+ * The maximum jiffie value is (MAX_INT >> 1). Here we translate that
+ * into seconds. The 64-bit case will overflow if we are not careful,
+ * so use the messy SH_DIV macro to do it. Still all constants.
+ */
+#if BITS_PER_LONG < 64
+# define MAX_SEC_IN_JIFFIES \
+ (long)((u64)((u64)MAX_JIFFY_OFFSET * TICK_NSEC) / NSEC_PER_SEC)
+#else /* take care of overflow on 64 bits machines */
+# define MAX_SEC_IN_JIFFIES \
+ (SH_DIV((MAX_JIFFY_OFFSET >> SEC_JIFFIE_SC) * TICK_NSEC, NSEC_PER_SEC, 1) - 1)
+
+#endif
+
+/*
+ * Convert various time units to each other:
+ */
+extern unsigned int jiffies_to_msecs(const unsigned long j);
+extern unsigned int jiffies_to_usecs(const unsigned long j);
+
+static inline u64 jiffies_to_nsecs(const unsigned long j)
+{
+ return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
+}
+
+extern unsigned long __msecs_to_jiffies(const unsigned int m);
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+/*
+ * HZ is equal to or smaller than 1000, and 1000 is a nice round
+ * multiple of HZ, divide with the factor between them, but round
+ * upwards:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+ return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
+}
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+/*
+ * HZ is larger than 1000, and HZ is a nice round multiple of 1000 -
+ * simply multiply with the factor between them.
+ *
+ * But first make sure the multiplication result cannot overflow:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+ if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+ return m * (HZ / MSEC_PER_SEC);
+}
+#else
+/*
+ * Generic case - multiply, round and divide. But first check that if
+ * we are doing a net multiplication, that we wouldn't overflow:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+ if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+
+ return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32;
+}
+#endif
+/**
+ * msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m: time in milliseconds
+ *
+ * conversion is done as follows:
+ *
+ * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
+ *
+ * - 'too large' values [that would result in larger than
+ * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
+ *
+ * - all other values are converted to jiffies by either multiplying
+ * the input value by a factor or dividing it with a factor and
+ * handling any 32-bit overflows.
+ * for the details see __msecs_to_jiffies()
+ *
+ * msecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __msecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * the HZ range specific helpers _msecs_to_jiffies() are called both
+ * directly here and from __msecs_to_jiffies() in the case where
+ * constant folding is not possible.
+ */
+static __always_inline unsigned long msecs_to_jiffies(const unsigned int m)
+{
+ if (__builtin_constant_p(m)) {
+ if ((int)m < 0)
+ return MAX_JIFFY_OFFSET;
+ return _msecs_to_jiffies(m);
+ } else {
+ return __msecs_to_jiffies(m);
+ }
+}
+
+extern unsigned long __usecs_to_jiffies(const unsigned int u);
+#if !(USEC_PER_SEC % HZ)
+static inline unsigned long _usecs_to_jiffies(const unsigned int u)
+{
+ return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
+}
+#else
+static inline unsigned long _usecs_to_jiffies(const unsigned int u)
+{
+ return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
+ >> USEC_TO_HZ_SHR32;
+}
+#endif
+
+/**
+ * usecs_to_jiffies: - convert microseconds to jiffies
+ * @u: time in microseconds
+ *
+ * conversion is done as follows:
+ *
+ * - 'too large' values [that would result in larger than
+ * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
+ *
+ * - all other values are converted to jiffies by either multiplying
+ * the input value by a factor or dividing it with a factor and
+ * handling any 32-bit overflows as for msecs_to_jiffies.
+ *
+ * usecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __usecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * the HZ range specific helpers _usecs_to_jiffies() are called both
+ * directly here and from __usecs_to_jiffies() in the case where
+ * constant folding is not possible.
+ */
+static __always_inline unsigned long usecs_to_jiffies(const unsigned int u)
+{
+ if (__builtin_constant_p(u)) {
+ if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+ return _usecs_to_jiffies(u);
+ } else {
+ return __usecs_to_jiffies(u);
+ }
+}
+
+extern unsigned long timespec64_to_jiffies(const struct timespec64 *value);
+
+extern void jiffies_to_timespec64(const unsigned long,
+ struct timespec64 *value);
+static inline unsigned long timespec_to_jiffies(const struct timespec *value)
+{
+ struct timespec64 ts = timespec_to_timespec64(*value);
+
+ return timespec64_to_jiffies(&ts);
+}
+
+static inline void jiffies_to_timespec(const unsigned long j,
+ struct timespec *value)
+{
+ struct timespec64 ts;
+
+ jiffies_to_timespec64(j, &ts);
+ *value = timespec64_to_timespec(ts);
+}
+
+extern unsigned long timeval_to_jiffies(const struct timeval *value);
+extern void jiffies_to_timeval(const unsigned long j,
+ struct timeval *value);
+
+extern clock_t jiffies_to_clock_t(unsigned long x);
+static inline clock_t jiffies_delta_to_clock_t(long delta)
+{
+ return jiffies_to_clock_t(max(0L, delta));
+}
+
+#define TIMESTAMP_SIZE 30
+
+#endif
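
Because jiffies here is just CLOCK_MONOTONIC scaled to HZ = 100, the usual wrap-safe timeout idiom carries over unchanged. A sketch (hypothetical callback; __msecs_to_jiffies() is assumed to be implemented elsewhere in this patch for the non-constant case):

#include <linux/jiffies.h>
#include <linux/kernel.h>	/* cpu_relax() */

static bool poll_until(bool (*done)(void *), void *arg, unsigned msec)
{
	unsigned long timeout = jiffies + msecs_to_jiffies(msec);

	while (!done(arg)) {
		if (time_after(jiffies, timeout))
			return false;	/* timed out */
		cpu_relax();
	}

	return true;
}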
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
new file mode 100644
index 0000000..e4ffa86
--- /dev/null
+++ b/include/linux/kernel.h
@@ -0,0 +1,211 @@
+#ifndef __TOOLS_LINUX_KERNEL_H
+#define __TOOLS_LINUX_KERNEL_H
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+
+#define IS_ENABLED(opt) 0
+#define EXPORT_SYMBOL(sym)
+
+#define U8_MAX ((u8)~0U)
+#define S8_MAX ((s8)(U8_MAX>>1))
+#define S8_MIN ((s8)(-S8_MAX - 1))
+#define U16_MAX ((u16)~0U)
+#define S16_MAX ((s16)(U16_MAX>>1))
+#define S16_MIN ((s16)(-S16_MAX - 1))
+#define U32_MAX ((u32)~0U)
+#define S32_MAX ((s32)(U32_MAX>>1))
+#define S32_MIN ((s32)(-S32_MAX - 1))
+#define U64_MAX ((u64)~0ULL)
+#define S64_MAX ((s64)(U64_MAX>>1))
+#define S64_MIN ((s64)(-S64_MAX - 1))
+
+#define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1)
+#define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask))
+
+#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a)))
+#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0)
+
+#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#define mult_frac(x, numer, denom)( \
+{ \
+ typeof(x) quot = (x) / (denom); \
+ typeof(x) rem = (x) % (denom); \
+ (quot * (numer)) + ((rem * (numer)) / (denom)); \
+} \
+)
+
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+#ifndef container_of
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr: the pointer to the member.
+ * @type: the type of the container struct this is embedded in.
+ * @member: the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({ \
+ const typeof(((type *)0)->member) * __mptr = (ptr); \
+ (type *)((char *)__mptr - offsetof(type, member)); })
+#endif
+
+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+#define round_down(x, y) ((x) & ~__round_mask(x, y))
+
+#define roundup(x, y) \
+({ \
+ const typeof(y) __y = y; \
+ (((x) + (__y - 1)) / __y) * __y; \
+})
+
+#define max(x, y) ({ \
+ typeof(x) _max1 = (x); \
+ typeof(y) _max2 = (y); \
+ (void) (&_max1 == &_max2); \
+ _max1 > _max2 ? _max1 : _max2; })
+
+#define min(x, y) ({ \
+ typeof(x) _min1 = (x); \
+ typeof(y) _min2 = (y); \
+ (void) (&_min1 == &_min2); \
+ _min1 < _min2 ? _min1 : _min2; })
+
+#define min_t(type, x, y) ({ \
+ type __min1 = (x); \
+ type __min2 = (y); \
+ __min1 < __min2 ? __min1: __min2; })
+
+#define max_t(type, x, y) ({ \
+ type __max1 = (x); \
+ type __max2 = (y); \
+ __max1 > __max2 ? __max1: __max2; })
+
+#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
+
+#define swap(a, b) \
+ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
+#define _RET_IP_ (unsigned long)__builtin_return_address(0)
+#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })
+
+#define might_sleep()
+
+#define NR_CPUS 32
+
+#define cpu_relax() do {} while (0)
+#define cpu_relax_lowlatency() do {} while (0)
+
+__printf(1, 2)
+static inline void panic(const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vprintf(fmt, args);
+ va_end(args);
+
+ BUG();
+}
+
+unsigned long simple_strtoul(const char *,char **,unsigned int);
+long simple_strtol(const char *,char **,unsigned int);
+unsigned long long simple_strtoull(const char *,char **,unsigned int);
+long long simple_strtoll(const char *,char **,unsigned int);
+
+int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
+int __must_check _kstrtol(const char *s, unsigned int base, long *res);
+
+int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res);
+int __must_check kstrtoll(const char *s, unsigned int base, long long *res);
+
+/**
+ * kstrtoul - convert a string to an unsigned long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ * include a single newline before its terminating null. The first character
+ * may also be a plus sign, but not a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ * given as 0, then the base of the string is automatically detected with the
+ * conventional semantics - If it begins with 0x the number will be parsed as a
+ * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ * parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Used as a replacement for the obsolete simple_strtoul. Return code must
+ * be checked.
+*/
+static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res)
+{
+ /*
+ * We want to shortcut function call, but
+ * __builtin_types_compatible_p(unsigned long, unsigned long long) = 0.
+ */
+ if (sizeof(unsigned long) == sizeof(unsigned long long) &&
+ __alignof__(unsigned long) == __alignof__(unsigned long long))
+ return kstrtoull(s, base, (unsigned long long *)res);
+ else
+ return _kstrtoul(s, base, res);
+}
+
+/**
+ * kstrtol - convert a string to a long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ * include a single newline before its terminating null. The first character
+ * may also be a plus sign or a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ * given as 0, then the base of the string is automatically detected with the
+ * conventional semantics - If it begins with 0x the number will be parsed as a
+ * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ * parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Used as a replacement for the obsolete simple_strtol. Return code must
+ * be checked.
+ */
+static inline int __must_check kstrtol(const char *s, unsigned int base, long *res)
+{
+ /*
+ * We want to shortcut function call, but
+ * __builtin_types_compatible_p(long, long long) = 0.
+ */
+ if (sizeof(long) == sizeof(long long) &&
+ __alignof__(long) == __alignof__(long long))
+ return kstrtoll(s, base, (long long *)res);
+ else
+ return _kstrtol(s, base, res);
+}
+
+int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res);
+int __must_check kstrtoint(const char *s, unsigned int base, int *res);
+
+/* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */
+#define VERIFY_OCTAL_PERMISSIONS(perms) \
+ (BUILD_BUG_ON_ZERO((perms) < 0) + \
+ BUILD_BUG_ON_ZERO((perms) > 0777) + \
+ /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \
+ BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \
+ BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \
+ /* USER_WRITABLE >= GROUP_WRITABLE */ \
+ BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \
+ /* OTHER_WRITABLE? Generally considered a bad idea. */ \
+ BUILD_BUG_ON_ZERO((perms) & 2) + \
+ (perms))
+
+#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
+
+#endif
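
Most of this header is the usual macro toolbox; container_of() and the rounding/clamping helpers are what the bcache code leans on. A tiny sketch (hypothetical struct):

#include <linux/kernel.h>
#include <linux/list.h>

struct open_bucket {
	unsigned		sectors_free;
	struct list_head	list;
};

static unsigned ob_usable_sectors(struct list_head *node)
{
	/* Recover the containing struct from its embedded list_head. */
	struct open_bucket *ob = container_of(node, struct open_bucket, list);

	/* Whole 8-sector blocks only, capped at 1 << 15 sectors. */
	return min_t(unsigned, round_down(ob->sectors_free, 8), 1U << 15);
}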
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
new file mode 100644
index 0000000..2ec53f8
--- /dev/null
+++ b/include/linux/kobject.h
@@ -0,0 +1,142 @@
+/*
+ * kobject.h - generic kernel object infrastructure.
+ *
+ * Copyright (c) 2002-2003 Patrick Mochel
+ * Copyright (c) 2002-2003 Open Source Development Labs
+ * Copyright (c) 2006-2008 Greg Kroah-Hartman <greg@kroah.com>
+ * Copyright (c) 2006-2008 Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ *
+ * Please read Documentation/kobject.txt before using the kobject
+ * interface, ESPECIALLY the parts about reference counts and object
+ * destructors.
+ */
+
+#ifndef _KOBJECT_H_
+#define _KOBJECT_H_
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+
+struct kset;
+
+struct kobj_type {
+ void (*release)(struct kobject *kobj);
+ const struct sysfs_ops *sysfs_ops;
+ struct attribute **default_attrs;
+ const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
+ const void *(*namespace)(struct kobject *kobj);
+};
+
+struct kobj_uevent_env {
+};
+
+struct kobj_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count);
+};
+
+struct kobject {
+ struct kobject *parent;
+ struct kset *kset;
+ struct kobj_type *ktype;
+ struct kernfs_node *sd; /* sysfs directory entry */
+ struct kref kref;
+ unsigned int state_initialized:1;
+ unsigned int state_in_sysfs:1;
+ unsigned int state_add_uevent_sent:1;
+ unsigned int state_remove_uevent_sent:1;
+ unsigned int uevent_suppress:1;
+};
+
+struct kset {
+ struct kobject kobj;
+};
+
+static inline struct kobj_type *get_ktype(struct kobject *kobj)
+{
+ return kobj->ktype;
+}
+
+#define kobject_add(...) 0
+
+static inline void kobject_init(struct kobject *kobj, struct kobj_type *ktype)
+{
+ memset(kobj, 0, sizeof(*kobj));
+
+ kref_init(&kobj->kref);
+ kobj->ktype = ktype;
+ kobj->state_initialized = 1;
+}
+
+static inline void kobject_del(struct kobject *kobj);
+
+static inline void kobject_cleanup(struct kobject *kobj)
+{
+ struct kobj_type *t = get_ktype(kobj);
+
+ /* remove from sysfs if the caller did not do it */
+ if (kobj->state_in_sysfs)
+ kobject_del(kobj);
+
+ if (t && t->release)
+ t->release(kobj);
+}
+
+static inline void kobject_release(struct kref *kref)
+{
+ struct kobject *kobj = container_of(kref, struct kobject, kref);
+
+ kobject_cleanup(kobj);
+}
+
+static inline void kobject_put(struct kobject *kobj)
+{
+ BUG_ON(!kobj);
+ BUG_ON(!kobj->state_initialized);
+
+ kref_put(&kobj->kref, kobject_release);
+}
+
+static inline void kobject_del(struct kobject *kobj)
+{
+ struct kernfs_node *sd;
+
+ if (!kobj)
+ return;
+
+ sd = kobj->sd;
+ kobj->state_in_sysfs = 0;
+#if 0
+ kobj_kset_leave(kobj);
+#endif
+ kobject_put(kobj->parent);
+ kobj->parent = NULL;
+}
+
+static inline struct kobject *kobject_get(struct kobject *kobj)
+{
+ BUG_ON(!kobj);
+ BUG_ON(!kobj->state_initialized);
+
+ kref_get(&kobj->kref);
+ return kobj;
+}
+
+static inline void kset_unregister(struct kset *kset) {}
+
+#define kset_create_and_add(_name, _u, _parent) \
+ ((struct kset *) kzalloc(sizeof(struct kset), GFP_KERNEL))
+
+#endif /* _KOBJECT_H_ */
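
Lifetime sketch under this shim (type and names hypothetical; kzalloc/GFP_KERNEL are assumed to come from the shim's <linux/slab.h>): kobject_init() takes a ktype whose release() frees the object, and the final kobject_put() calls it through the embedded kref.

#include <linux/kobject.h>
#include <linux/slab.h>

struct cached_dev_obj {
	struct kobject	kobj;
	unsigned	id;
};

static void cached_dev_obj_release(struct kobject *kobj)
{
	kfree(container_of(kobj, struct cached_dev_obj, kobj));
}

static struct kobj_type cached_dev_obj_ktype = {
	.release	= cached_dev_obj_release,
};

static struct cached_dev_obj *cached_dev_obj_alloc(unsigned id)
{
	struct cached_dev_obj *d = kzalloc(sizeof(*d), GFP_KERNEL);

	if (d) {
		kobject_init(&d->kobj, &cached_dev_obj_ktype);
		d->id = id;
	}

	/* The last kobject_put(&d->kobj) runs cached_dev_obj_release(). */
	return d;
}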
diff --git a/include/linux/kref.h b/include/linux/kref.h
new file mode 100644
index 0000000..e15828f
--- /dev/null
+++ b/include/linux/kref.h
@@ -0,0 +1,138 @@
+/*
+ * kref.h - library routines for handling generic reference counted objects
+ *
+ * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
+ * Copyright (C) 2004 IBM Corp.
+ *
+ * based on kobject.h which was:
+ * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org>
+ * Copyright (C) 2002-2003 Open Source Development Labs
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#ifndef _KREF_H_
+#define _KREF_H_
+
+#include <linux/bug.h>
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+
+struct kref {
+ atomic_t refcount;
+};
+
+/**
+ * kref_init - initialize object.
+ * @kref: object in question.
+ */
+static inline void kref_init(struct kref *kref)
+{
+ atomic_set(&kref->refcount, 1);
+}
+
+/**
+ * kref_get - increment refcount for object.
+ * @kref: object.
+ */
+static inline void kref_get(struct kref *kref)
+{
+ /* If the refcount was 0 before incrementing, we have a race:
+ * this kref is being freed by some other thread right now.
+ * In that case, use kref_get_unless_zero() instead.
+ */
+ WARN_ON_ONCE(atomic_inc_return(&kref->refcount) < 2);
+}
+
+/**
+ * kref_sub - subtract a number of refcounts for object.
+ * @kref: object.
+ * @count: Number of refcounts to subtract.
+ * @release: pointer to the function that will clean up the object when the
+ * last reference to the object is released.
+ * This pointer is required, and it is not acceptable to pass kfree
+ * in as this function. If the caller does pass kfree to this
+ * function, you will be publicly mocked mercilessly by the kref
+ * maintainer, and anyone else who happens to notice it. You have
+ * been warned.
+ *
+ * Subtract @count from the refcount, and if 0, call release().
+ * Return 1 if the object was removed, otherwise return 0. Beware: even if this
+ * function returns 0, you cannot count on the kref still being in memory.
+ * Only use the return value to learn that the kref is now gone, not to
+ * conclude that it is still present.
+ */
+static inline int kref_sub(struct kref *kref, unsigned int count,
+ void (*release)(struct kref *kref))
+{
+ WARN_ON(release == NULL);
+
+ if (atomic_sub_and_test((int) count, &kref->refcount)) {
+ release(kref);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * kref_put - decrement refcount for object.
+ * @kref: object.
+ * @release: pointer to the function that will clean up the object when the
+ * last reference to the object is released.
+ * This pointer is required, and it is not acceptable to pass kfree
+ * in as this function. If the caller does pass kfree to this
+ * function, you will be publicly mocked mercilessly by the kref
+ * maintainer, and anyone else who happens to notice it. You have
+ * been warned.
+ *
+ * Decrement the refcount, and if 0, call release().
+ * Return 1 if the object was removed, otherwise return 0. Beware: even if this
+ * function returns 0, you cannot count on the kref still being in memory.
+ * Only use the return value to learn that the kref is now gone, not to
+ * conclude that it is still present.
+ */
+static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
+{
+ return kref_sub(kref, 1, release);
+}
+
+static inline int kref_put_mutex(struct kref *kref,
+ void (*release)(struct kref *kref),
+ struct mutex *lock)
+{
+ WARN_ON(release == NULL);
+ if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
+ mutex_lock(lock);
+ if (unlikely(!atomic_dec_and_test(&kref->refcount))) {
+ mutex_unlock(lock);
+ return 0;
+ }
+ release(kref);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * kref_get_unless_zero - Increment refcount for object unless it is zero.
+ * @kref: object.
+ *
+ * Return non-zero if the increment succeeded. Otherwise return 0.
+ *
+ * This function is intended to simplify locking around refcounting for
+ * objects that can be looked up from a lookup structure, and which are
+ * removed from that lookup structure in the object destructor.
+ * Operations on such objects require at least a read lock around
+ * lookup + kref_get, and a write lock around kref_put + remove from lookup
+ * structure. Furthermore, RCU implementations become extremely tricky.
+ * With a lookup followed by a kref_get_unless_zero *with return value check*
+ * locking in the kref_put path can be deferred to the actual removal from
+ * the lookup structure and RCU lookups become trivial.
+ */
+static inline int __must_check kref_get_unless_zero(struct kref *kref)
+{
+ return atomic_add_unless(&kref->refcount, 1, 0);
+}
+#endif /* _KREF_H_ */
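
The usual embed-and-release pattern, as a sketch (struct and names hypothetical; kzalloc is assumed to come from the shim's <linux/slab.h>):

#include <linux/kref.h>
#include <linux/slab.h>

struct write_point {
	struct kref	ref;
	u64		last_used;
};

static void write_point_release(struct kref *ref)
{
	kfree(container_of(ref, struct write_point, ref));
}

static struct write_point *write_point_alloc(void)
{
	struct write_point *wp = kzalloc(sizeof(*wp), GFP_KERNEL);

	if (wp)
		kref_init(&wp->ref);	/* refcount starts at 1 */
	return wp;
}

static void write_point_put(struct write_point *wp)
{
	/* Frees the object when the last reference is dropped. */
	kref_put(&wp->ref, write_point_release);
}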
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
new file mode 100644
index 0000000..3a8cf10
--- /dev/null
+++ b/include/linux/kthread.h
@@ -0,0 +1,118 @@
+#ifndef _LINUX_KTHREAD_H
+#define _LINUX_KTHREAD_H
+
+/* Simple interface for creating and stopping kernel threads without mess. */
+#include <linux/err.h>
+#include <linux/lockdep.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+__printf(3, 4)
+struct task_struct *kthread_create(int (*threadfn)(void *data),
+ void *data,
+ const char namefmt[], ...);
+
+
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+ void *data,
+ unsigned int cpu,
+ const char *namefmt);
+
+/**
+ * kthread_run - create and wake a thread.
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @namefmt: printf-style name for the thread.
+ *
+ * Description: Convenient wrapper for kthread_create() followed by
+ * wake_up_process(). Returns the kthread or ERR_PTR(-ENOMEM).
+ */
+#define kthread_run(threadfn, data, namefmt, ...) \
+({ \
+ struct task_struct *__k \
+ = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
+ if (!IS_ERR(__k)) \
+ wake_up_process(__k); \
+ __k; \
+})
+
+int kthread_stop(struct task_struct *k);
+bool kthread_should_stop(void);
+bool kthread_should_park(void);
+bool kthread_freezable_should_stop(bool *was_frozen);
+void *kthread_data(struct task_struct *k);
+void *probe_kthread_data(struct task_struct *k);
+int kthread_park(struct task_struct *k);
+void kthread_unpark(struct task_struct *k);
+void kthread_parkme(void);
+
+int kthreadd(void *unused);
+extern struct task_struct *kthreadd_task;
+extern int tsk_fork_get_node(struct task_struct *tsk);
+
+/*
+ * Simple work processor based on kthread.
+ *
+ * This provides an easier way to make use of kthreads. A kthread_work
+ * can be queued and flushed using queue/flush_kthread_work()
+ * respectively. Queued kthread_works are processed by a kthread
+ * running kthread_worker_fn().
+ */
+struct kthread_work;
+typedef void (*kthread_work_func_t)(struct kthread_work *work);
+
+struct kthread_worker {
+ spinlock_t lock;
+ struct list_head work_list;
+ struct task_struct *task;
+ struct kthread_work *current_work;
+};
+
+struct kthread_work {
+ struct list_head node;
+ kthread_work_func_t func;
+ struct kthread_worker *worker;
+};
+
+#define KTHREAD_WORKER_INIT(worker) { \
+ .lock = __SPIN_LOCK_UNLOCKED((worker).lock), \
+ .work_list = LIST_HEAD_INIT((worker).work_list), \
+ }
+
+#define KTHREAD_WORK_INIT(work, fn) { \
+ .node = LIST_HEAD_INIT((work).node), \
+ .func = (fn), \
+ }
+
+#define DEFINE_KTHREAD_WORKER(worker) \
+ struct kthread_worker worker = KTHREAD_WORKER_INIT(worker)
+
+#define DEFINE_KTHREAD_WORK(work, fn) \
+ struct kthread_work work = KTHREAD_WORK_INIT(work, fn)
+
+#define DEFINE_KTHREAD_WORKER_ONSTACK(worker) DEFINE_KTHREAD_WORKER(worker)
+
+extern void __init_kthread_worker(struct kthread_worker *worker,
+ const char *name, struct lock_class_key *key);
+
+#define init_kthread_worker(worker) \
+ do { \
+ static struct lock_class_key __key; \
+ __init_kthread_worker((worker), "("#worker")->lock", &__key); \
+ } while (0)
+
+#define init_kthread_work(work, fn) \
+ do { \
+ memset((work), 0, sizeof(struct kthread_work)); \
+ INIT_LIST_HEAD(&(work)->node); \
+ (work)->func = (fn); \
+ } while (0)
+
+int kthread_worker_fn(void *worker_ptr);
+
+bool queue_kthread_work(struct kthread_worker *worker,
+ struct kthread_work *work);
+void flush_kthread_work(struct kthread_work *work);
+void flush_kthread_worker(struct kthread_worker *worker);
+
+#endif /* _LINUX_KTHREAD_H */
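
A minimal usage sketch for the kthread_worker interface declared above, assuming the shim supplies kthread_run(), kthread_worker_fn(), queue_kthread_work() and kthread_stop() at link time; the example_* names are illustrative only, not part of this patch:

#include <linux/kthread.h>
#include <linux/err.h>
#include <stdio.h>

static void example_fn(struct kthread_work *work)
{
	printf("work item ran\n");		/* runs in the worker thread */
}

static struct kthread_worker example_worker;
static struct kthread_work example_work;

static int start_worker(void)
{
	struct task_struct *t;

	init_kthread_worker(&example_worker);
	init_kthread_work(&example_work, example_fn);

	/* spawn a thread running kthread_worker_fn(), then hand it work */
	t = kthread_run(kthread_worker_fn, &example_worker, "example-worker");
	if (IS_ERR(t))
		return PTR_ERR(t);

	queue_kthread_work(&example_worker, &example_work);
	flush_kthread_work(&example_work);	/* wait for it to complete */
	kthread_stop(t);			/* then shut the worker down */
	return 0;
}
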
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
new file mode 100644
index 0000000..a9108bc
--- /dev/null
+++ b/include/linux/lglock.h
@@ -0,0 +1,18 @@
+#ifndef __TOOLS_LINUX_LGLOCK_H
+#define __TOOLS_LINUX_LGLOCK_H
+
+#include <pthread.h>
+
+struct lglock {
+ pthread_mutex_t lock;
+};
+
+#define lg_lock_free(l) do {} while (0)
+#define lg_lock_init(l) pthread_mutex_init(&(l)->lock, NULL)
+
+#define lg_local_lock(l) pthread_mutex_lock(&(l)->lock)
+#define lg_local_unlock(l) pthread_mutex_unlock(&(l)->lock)
+#define lg_global_lock(l) pthread_mutex_lock(&(l)->lock)
+#define lg_global_unlock(l) pthread_mutex_unlock(&(l)->lock)
+
+#endif /* __TOOLS_LINUX_LGLOCK_H */
diff --git a/include/linux/list.h b/include/linux/list.h
new file mode 100644
index 0000000..1da4238
--- /dev/null
+++ b/include/linux/list.h
@@ -0,0 +1,771 @@
+#ifndef __TOOLS_LINUX_LIST_H
+#define __TOOLS_LINUX_LIST_H
+
+#include <linux/types.h>
+#include <linux/poison.h>
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+ struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+ list->next = list;
+ list->prev = list;
+}
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+#ifndef CONFIG_DEBUG_LIST
+static inline void __list_add(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+#else
+extern void __list_add(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next);
+#endif
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+ __list_add(new, head, head->next);
+}
+
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+ __list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+ next->prev = prev;
+ WRITE_ONCE(prev->next, next);
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty() on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+#ifndef CONFIG_DEBUG_LIST
+static inline void __list_del_entry(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+}
+
+static inline void list_del(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+ entry->next = LIST_POISON1;
+ entry->prev = LIST_POISON2;
+}
+#else
+extern void __list_del_entry(struct list_head *entry);
+extern void list_del(struct list_head *entry);
+#endif
+
+/**
+ * list_replace - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ *
+ * If @old was empty, it will be overwritten.
+ */
+static inline void list_replace(struct list_head *old,
+ struct list_head *new)
+{
+ new->next = old->next;
+ new->next->prev = new;
+ new->prev = old->prev;
+ new->prev->next = new;
+}
+
+static inline void list_replace_init(struct list_head *old,
+ struct list_head *new)
+{
+ list_replace(old, new);
+ INIT_LIST_HEAD(old);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+ __list_del_entry(entry);
+ INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+ __list_del_entry(list);
+ list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+ struct list_head *head)
+{
+ __list_del_entry(list);
+ list_add_tail(list, head);
+}
+
+/**
+ * list_is_last - tests whether @list is the last entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_last(const struct list_head *list,
+ const struct list_head *head)
+{
+ return list->next == head;
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+ return head->next == head;
+}
+
+/**
+ * list_empty_careful - tests whether a list is empty and not being modified
+ * @head: the list to test
+ *
+ * Description:
+ * tests whether a list is empty _and_ checks that no other CPU might be
+ * in the process of modifying either member (next or prev)
+ *
+ * NOTE: using list_empty_careful() without synchronization
+ * can only be safe if the only activity that can happen
+ * to the list entry is list_del_init(). Eg. it cannot be used
+ * if another CPU could re-list_add() it.
+ */
+static inline int list_empty_careful(const struct list_head *head)
+{
+ struct list_head *next = head->next;
+ return (next == head) && (next == head->prev);
+}
+
+/**
+ * list_rotate_left - rotate the list to the left
+ * @head: the head of the list
+ */
+static inline void list_rotate_left(struct list_head *head)
+{
+ struct list_head *first;
+
+ if (!list_empty(head)) {
+ first = head->next;
+ list_move_tail(first, head);
+ }
+}
+
+/**
+ * list_is_singular - tests whether a list has just one entry.
+ * @head: the list to test.
+ */
+static inline int list_is_singular(const struct list_head *head)
+{
+ return !list_empty(head) && (head->next == head->prev);
+}
+
+static inline void __list_cut_position(struct list_head *list,
+ struct list_head *head, struct list_head *entry)
+{
+ struct list_head *new_first = entry->next;
+ list->next = head->next;
+ list->next->prev = list;
+ list->prev = entry;
+ entry->next = list;
+ head->next = new_first;
+ new_first->prev = head;
+}
+
+/**
+ * list_cut_position - cut a list into two
+ * @list: a new list to add all removed entries
+ * @head: a list with entries
+ * @entry: an entry within head, could be the head itself
+ * and if so we won't cut the list
+ *
+ * This helper moves the initial part of @head, up to and
+ * including @entry, from @head to @list. You should
+ * pass on @entry an element you know is on @head. @list
+ * should be an empty list or a list you do not care about
+ * losing its data.
+ *
+ */
+static inline void list_cut_position(struct list_head *list,
+ struct list_head *head, struct list_head *entry)
+{
+ if (list_empty(head))
+ return;
+ if (list_is_singular(head) &&
+ (head->next != entry && head != entry))
+ return;
+ if (entry == head)
+ INIT_LIST_HEAD(list);
+ else
+ __list_cut_position(list, head, entry);
+}
+
+static inline void __list_splice(const struct list_head *list,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ struct list_head *first = list->next;
+ struct list_head *last = list->prev;
+
+ first->prev = prev;
+ prev->next = first;
+
+ last->next = next;
+ next->prev = last;
+}
+
+/**
+ * list_splice - join two lists, this is designed for stacks
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(const struct list_head *list,
+ struct list_head *head)
+{
+ if (!list_empty(list))
+ __list_splice(list, head, head->next);
+}
+
+/**
+ * list_splice_tail - join two lists, each list being a queue
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice_tail(struct list_head *list,
+ struct list_head *head)
+{
+ if (!list_empty(list))
+ __list_splice(list, head->prev, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+ struct list_head *head)
+{
+ if (!list_empty(list)) {
+ __list_splice(list, head, head->next);
+ INIT_LIST_HEAD(list);
+ }
+}
+
+/**
+ * list_splice_tail_init - join two lists and reinitialise the emptied list
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * Each of the lists is a queue.
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_tail_init(struct list_head *list,
+ struct list_head *head)
+{
+ if (!list_empty(list)) {
+ __list_splice(list, head->prev, head);
+ INIT_LIST_HEAD(list);
+ }
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr: the &struct list_head pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_head within the struct.
+ */
+#define list_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+/**
+ * list_first_entry - get the first element from a list
+ * @ptr: the list head to take the element from.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_head within the struct.
+ *
+ * Note that the list is expected to be non-empty.
+ */
+#define list_first_entry(ptr, type, member) \
+ list_entry((ptr)->next, type, member)
+
+/**
+ * list_last_entry - get the last element from a list
+ * @ptr: the list head to take the element from.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_head within the struct.
+ *
+ * Note that the list is expected to be non-empty.
+ */
+#define list_last_entry(ptr, type, member) \
+ list_entry((ptr)->prev, type, member)
+
+/**
+ * list_first_entry_or_null - get the first element from a list
+ * @ptr: the list head to take the element from.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_head within the struct.
+ *
+ * Note that if the list is empty, it returns NULL.
+ */
+#define list_first_entry_or_null(ptr, type, member) \
+ (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
+
+/**
+ * list_next_entry - get the next element in list
+ * @pos: the type * to cursor
+ * @member: the name of the list_head within the struct.
+ */
+#define list_next_entry(pos, member) \
+ list_entry((pos)->member.next, typeof(*(pos)), member)
+
+/**
+ * list_prev_entry - get the prev element in list
+ * @pos: the type * to cursor
+ * @member: the name of the list_head within the struct.
+ */
+#define list_prev_entry(pos, member) \
+ list_entry((pos)->member.prev, typeof(*(pos)), member)
+
+/**
+ * list_for_each - iterate over a list
+ * @pos: the &struct list_head to use as a loop cursor.
+ * @head: the head for your list.
+ */
+#define list_for_each(pos, head) \
+ for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev - iterate over a list backwards
+ * @pos: the &struct list_head to use as a loop cursor.
+ * @head: the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+ for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos: the &struct list_head to use as a loop cursor.
+ * @n: another &struct list_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+ for (pos = (head)->next, n = pos->next; pos != (head); \
+ pos = n, n = pos->next)
+
+/**
+ * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
+ * @pos: the &struct list_head to use as a loop cursor.
+ * @n: another &struct list_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define list_for_each_prev_safe(pos, n, head) \
+ for (pos = (head)->prev, n = pos->prev; \
+ pos != (head); \
+ pos = n, n = pos->prev)
+
+/**
+ * list_for_each_entry - iterate over list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ */
+#define list_for_each_entry(pos, head, member) \
+ for (pos = list_first_entry(head, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_next_entry(pos, member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member) \
+ for (pos = list_last_entry(head, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_prev_entry(pos, member))
+
+/**
+ * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue()
+ * @pos: the type * to use as a start point
+ * @head: the head of the list
+ * @member: the name of the list_head within the struct.
+ *
+ * Prepares a pos entry for use as a start point in list_for_each_entry_continue().
+ */
+#define list_prepare_entry(pos, head, member) \
+ ((pos) ? : list_entry(head, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_continue - continue iteration over list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue(pos, head, member) \
+ for (pos = list_next_entry(pos, member); \
+ &pos->member != (head); \
+ pos = list_next_entry(pos, member))
+
+/**
+ * list_for_each_entry_continue_reverse - iterate backwards from the given point
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ *
+ * Start to iterate over list of given type backwards, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue_reverse(pos, head, member) \
+ for (pos = list_prev_entry(pos, member); \
+ &pos->member != (head); \
+ pos = list_prev_entry(pos, member))
+
+/**
+ * list_for_each_entry_from - iterate over list of given type from the current point
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ *
+ * Iterate over list of given type, continuing from current position.
+ */
+#define list_for_each_entry_from(pos, head, member) \
+ for (; &pos->member != (head); \
+ pos = list_next_entry(pos, member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos: the type * to use as a loop cursor.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member) \
+ for (pos = list_first_entry(head, typeof(*pos), member), \
+ n = list_next_entry(pos, member); \
+ &pos->member != (head); \
+ pos = n, n = list_next_entry(n, member))
+
+/**
+ * list_for_each_entry_safe_continue - continue list iteration safe against removal
+ * @pos: the type * to use as a loop cursor.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ *
+ * Iterate over list of given type, continuing after current point,
+ * safe against removal of list entry.
+ */
+#define list_for_each_entry_safe_continue(pos, n, head, member) \
+ for (pos = list_next_entry(pos, member), \
+ n = list_next_entry(pos, member); \
+ &pos->member != (head); \
+ pos = n, n = list_next_entry(n, member))
+
+/**
+ * list_for_each_entry_safe_from - iterate over list from current point safe against removal
+ * @pos: the type * to use as a loop cursor.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ *
+ * Iterate over list of given type from current point, safe against
+ * removal of list entry.
+ */
+#define list_for_each_entry_safe_from(pos, n, head, member) \
+ for (n = list_next_entry(pos, member); \
+ &pos->member != (head); \
+ pos = n, n = list_next_entry(n, member))
+
+/**
+ * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
+ * @pos: the type * to use as a loop cursor.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ *
+ * Iterate backwards over list of given type, safe against removal
+ * of list entry.
+ */
+#define list_for_each_entry_safe_reverse(pos, n, head, member) \
+ for (pos = list_last_entry(head, typeof(*pos), member), \
+ n = list_prev_entry(pos, member); \
+ &pos->member != (head); \
+ pos = n, n = list_prev_entry(n, member))
+
+/**
+ * list_safe_reset_next - reset a stale list_for_each_entry_safe loop
+ * @pos: the loop cursor used in the list_for_each_entry_safe loop
+ * @n: temporary storage used in list_for_each_entry_safe
+ * @member: the name of the list_head within the struct.
+ *
+ * list_safe_reset_next is not safe to use in general if the list may be
+ * modified concurrently (eg. the lock is dropped in the loop body). An
+ * exception to this is if the cursor element (pos) is pinned in the list,
+ * and list_safe_reset_next is called after re-taking the lock and before
+ * completing the current iteration of the loop body.
+ */
+#define list_safe_reset_next(pos, n, member) \
+ n = list_next_entry(pos, member)
+
+/*
+ * Double linked lists with a single pointer list head.
+ * Mostly useful for hash tables where the two pointer list head is
+ * too wasteful.
+ * You lose the ability to access the tail in O(1).
+ */
+
+#define HLIST_HEAD_INIT { .first = NULL }
+#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL }
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
+static inline void INIT_HLIST_NODE(struct hlist_node *h)
+{
+ h->next = NULL;
+ h->pprev = NULL;
+}
+
+static inline int hlist_unhashed(const struct hlist_node *h)
+{
+ return !h->pprev;
+}
+
+static inline int hlist_empty(const struct hlist_head *h)
+{
+ return !h->first;
+}
+
+static inline void __hlist_del(struct hlist_node *n)
+{
+ struct hlist_node *next = n->next;
+ struct hlist_node **pprev = n->pprev;
+
+ WRITE_ONCE(*pprev, next);
+ if (next)
+ next->pprev = pprev;
+}
+
+static inline void hlist_del(struct hlist_node *n)
+{
+ __hlist_del(n);
+ n->next = LIST_POISON1;
+ n->pprev = LIST_POISON2;
+}
+
+static inline void hlist_del_init(struct hlist_node *n)
+{
+ if (!hlist_unhashed(n)) {
+ __hlist_del(n);
+ INIT_HLIST_NODE(n);
+ }
+}
+
+static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
+ struct hlist_node *first = h->first;
+ n->next = first;
+ if (first)
+ first->pprev = &n->next;
+ h->first = n;
+ n->pprev = &h->first;
+}
+
+/* next must be != NULL */
+static inline void hlist_add_before(struct hlist_node *n,
+ struct hlist_node *next)
+{
+ n->pprev = next->pprev;
+ n->next = next;
+ next->pprev = &n->next;
+ *(n->pprev) = n;
+}
+
+static inline void hlist_add_behind(struct hlist_node *n,
+ struct hlist_node *prev)
+{
+ n->next = prev->next;
+ prev->next = n;
+ n->pprev = &prev->next;
+
+ if (n->next)
+ n->next->pprev = &n->next;
+}
+
+/* after that we'll appear to be on some hlist and hlist_del will work */
+static inline void hlist_add_fake(struct hlist_node *n)
+{
+ n->pprev = &n->next;
+}
+
+static inline bool hlist_fake(struct hlist_node *h)
+{
+ return h->pprev == &h->next;
+}
+
+/*
+ * Move a list from one list head to another. Fixup the pprev
+ * reference of the first entry if it exists.
+ */
+static inline void hlist_move_list(struct hlist_head *old,
+ struct hlist_head *new)
+{
+ new->first = old->first;
+ if (new->first)
+ new->first->pprev = &new->first;
+ old->first = NULL;
+}
+
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_for_each(pos, head) \
+ for (pos = (head)->first; pos ; pos = pos->next)
+
+#define hlist_for_each_safe(pos, n, head) \
+ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
+ pos = n)
+
+#define hlist_entry_safe(ptr, type, member) \
+ ({ typeof(ptr) ____ptr = (ptr); \
+ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
+ })
+
+/**
+ * hlist_for_each_entry - iterate over list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry(pos, head, member) \
+ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
+ pos; \
+ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+
+/**
+ * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
+ * @pos: the type * to use as a loop cursor.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue(pos, member) \
+ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\
+ pos; \
+ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+
+/**
+ * hlist_for_each_entry_from - iterate over a hlist continuing from current point
+ * @pos: the type * to use as a loop cursor.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from(pos, member) \
+ for (; pos; \
+ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+
+/**
+ * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos: the type * to use as a loop cursor.
+ * @n: another &struct hlist_node to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe(pos, n, head, member) \
+ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\
+ pos && ({ n = pos->member.next; 1; }); \
+ pos = hlist_entry_safe(n, typeof(*pos), member))
+
+/**
+ * list_del_range - deletes range of entries from list.
+ * @begin: first element in the range to delete from the list.
+ * @end: last element in the range to delete from the list.
+ * Note: list_empty on the range of entries does not return true after this,
+ * the entries are in an undefined state.
+ */
+static inline void list_del_range(struct list_head *begin,
+ struct list_head *end)
+{
+ begin->prev->next = end->next;
+ end->next->prev = begin->prev;
+}
+
+/**
+ * list_for_each_from - iterate over a list from one of its nodes
+ * @pos: the &struct list_head to use as a loop cursor, from where to start
+ * @head: the head for your list.
+ */
+#define list_for_each_from(pos, head) \
+ for (; pos != (head); pos = pos->next)
+
+#endif /* __TOOLS_LINUX_LIST_H */
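
A short, self-contained sketch of the doubly linked list API above, written as plain userspace C against the shim header and assuming only malloc()/free():

#include <linux/list.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	int			val;
	struct list_head	list;
};

int main(void)
{
	LIST_HEAD(items);		/* head points at itself: empty list */
	struct item *i, *tmp;
	int n;

	for (n = 0; n < 3; n++) {
		i = malloc(sizeof(*i));
		i->val = n;
		list_add_tail(&i->list, &items);	/* append: queue order */
	}

	list_for_each_entry(i, &items, list)
		printf("%d\n", i->val);			/* prints 0 1 2 */

	list_for_each_entry_safe(i, tmp, &items, list) {
		list_del(&i->list);	/* safe: tmp already holds the next node */
		free(i);
	}
	return 0;
}
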
diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
new file mode 100644
index 0000000..b01fe10
--- /dev/null
+++ b/include/linux/list_nulls.h
@@ -0,0 +1,117 @@
+#ifndef _LINUX_LIST_NULLS_H
+#define _LINUX_LIST_NULLS_H
+
+#include <linux/poison.h>
+#include <linux/const.h>
+
+/*
+ * Special version of lists, where end of list is not a NULL pointer,
+ * but a 'nulls' marker, which can have many different values.
+ * (up to 2^31 different values guaranteed on all platforms)
+ *
+ * In the standard hlist, termination of a list is the NULL pointer.
+ * In this special 'nulls' variant, we use the fact that objects stored in
+ * a list are aligned on a word (4 or 8 bytes alignment).
+ * We therefore use the least significant bit of 'ptr':
+ * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
+ * Set to 0 : This is a pointer to some object (ptr)
+ */
+
+struct hlist_nulls_head {
+ struct hlist_nulls_node *first;
+};
+
+struct hlist_nulls_node {
+ struct hlist_nulls_node *next, **pprev;
+};
+#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
+#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
+ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
+
+#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+/**
+ * is_a_nulls - Test if a ptr is a nulls marker
+ * @ptr: ptr to be tested
+ *
+ */
+static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
+{
+ return ((unsigned long)ptr & 1);
+}
+
+/**
+ * get_nulls_value - Get the 'nulls' value of the end of chain
+ * @ptr: end of chain
+ *
+ * Should be called only if is_a_nulls(ptr);
+ */
+static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
+{
+ return ((unsigned long)ptr) >> 1;
+}
+
+static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
+{
+ return !h->pprev;
+}
+
+static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
+{
+ return is_a_nulls(READ_ONCE(h->first));
+}
+
+static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
+ struct hlist_nulls_head *h)
+{
+ struct hlist_nulls_node *first = h->first;
+
+ n->next = first;
+ n->pprev = &h->first;
+ h->first = n;
+ if (!is_a_nulls(first))
+ first->pprev = &n->next;
+}
+
+static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
+{
+ struct hlist_nulls_node *next = n->next;
+ struct hlist_nulls_node **pprev = n->pprev;
+
+ WRITE_ONCE(*pprev, next);
+ if (!is_a_nulls(next))
+ next->pprev = pprev;
+}
+
+static inline void hlist_nulls_del(struct hlist_nulls_node *n)
+{
+ __hlist_nulls_del(n);
+ n->pprev = LIST_POISON2;
+}
+
+/**
+ * hlist_nulls_for_each_entry - iterate over list of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct hlist_node to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry(tpos, pos, head, member) \
+ for (pos = (head)->first; \
+ (!is_a_nulls(pos)) && \
+ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = pos->next)
+
+/**
+ * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct hlist_node to use as a loop cursor.
+ * @member: the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_from(tpos, pos, member) \
+ for (; (!is_a_nulls(pos)) && \
+ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = pos->next)
+
+#endif
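
A hedged sketch of the nulls-marker convention above; it assumes the shim's kernel.h/compiler.h provide container_of() and READ_ONCE(), and the struct/field names are made up for illustration:

#include <linux/list_nulls.h>
#include <linux/kernel.h>
#include <stdio.h>

struct obj {
	int			key;
	struct hlist_nulls_node	node;
};

int main(void)
{
	struct hlist_nulls_head bucket;
	struct hlist_nulls_node *pos;
	struct obj *o, items[2] = { { .key = 1 }, { .key = 2 } };

	INIT_HLIST_NULLS_HEAD(&bucket, 7);	/* end marker encodes the value 7 */
	hlist_nulls_add_head(&items[0].node, &bucket);
	hlist_nulls_add_head(&items[1].node, &bucket);

	hlist_nulls_for_each_entry(o, pos, &bucket, node)
		printf("key %d\n", o->key);	/* 2 then 1 (both added at head) */

	/* after the loop, pos is the nulls marker rather than a real pointer */
	printf("bucket id %lu\n", get_nulls_value(pos));	/* prints 7 */
	return 0;
}
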
diff --git a/include/linux/llist.h b/include/linux/llist.h
new file mode 100644
index 0000000..8abc2e0
--- /dev/null
+++ b/include/linux/llist.h
@@ -0,0 +1,201 @@
+#ifndef __TOOLS_LINUX_LLIST_H
+#define __TOOLS_LINUX_LLIST_H
+
+/*
+ * Lock-less NULL terminated single linked list
+ *
+ * If there are multiple producers and multiple consumers, llist_add
+ * can be used in producers and llist_del_all can be used in
+ * consumers. They can work simultaneously without lock. But
+ * llist_del_first cannot be used here, because llist_del_first
+ * depends on list->first->next not changing while list->first stays
+ * unchanged during its operation; an llist_del_first, llist_add,
+ * llist_add (or llist_del_all, llist_add, llist_add) sequence in
+ * another consumer may violate that assumption.
+ *
+ * If there are multiple producers and one consumer, llist_add can be
+ * used in producers and llist_del_all or llist_del_first can be used
+ * in the consumer.
+ *
+ * This can be summarized as follows:
+ *
+ * | add | del_first | del_all
+ * add | - | - | -
+ * del_first | | L | L
+ * del_all | | | -
+ *
+ * Where "-" stands for no lock is needed, while "L" stands for lock
+ * is needed.
+ *
+ * The list entries deleted via llist_del_all can be traversed with
+ * traversal functions such as llist_for_each etc. But the list
+ * entries cannot be traversed safely before being deleted from the list.
+ * The order of deleted entries is from the newest to the oldest added
+ * one. If you want to traverse from the oldest to the newest, you
+ * must reverse the order by yourself before traversing.
+ *
+ * The basic atomic operation of this list is cmpxchg on long. On
+ * architectures that don't have NMI-safe cmpxchg implementation, the
+ * list can NOT be used in NMI handlers. So code that uses the list in
+ * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ *
+ * Copyright 2010,2011 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+
+struct llist_head {
+ struct llist_node *first;
+};
+
+struct llist_node {
+ struct llist_node *next;
+};
+
+#define LLIST_HEAD_INIT(name) { NULL }
+#define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name)
+
+/**
+ * init_llist_head - initialize lock-less list head
+ * @head: the head for your lock-less list
+ */
+static inline void init_llist_head(struct llist_head *list)
+{
+ list->first = NULL;
+}
+
+/**
+ * llist_entry - get the struct of this entry
+ * @ptr: the &struct llist_node pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the llist_node within the struct.
+ */
+#define llist_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+/**
+ * llist_for_each - iterate over some deleted entries of a lock-less list
+ * @pos: the &struct llist_node to use as a loop cursor
+ * @node: the first entry of deleted list entries
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being deleted from list, so start with an entry
+ * instead of list head.
+ *
+ * If being used on entries deleted from lock-less list directly, the
+ * traverse order is from the newest to the oldest added entry. If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each(pos, node) \
+ for ((pos) = (node); pos; (pos) = (pos)->next)
+
+/**
+ * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @node:	the first entry of deleted list entries.
+ * @member:	the name of the llist_node within the struct.
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being removed from list, so start with an entry
+ * instead of list head.
+ *
+ * If being used on entries deleted from lock-less list directly, the
+ * traverse order is from the newest to the oldest added entry. If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each_entry(pos, node, member) \
+ for ((pos) = llist_entry((node), typeof(*(pos)), member); \
+ &(pos)->member != NULL; \
+ (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member))
+
+/**
+ * llist_for_each_entry_safe - iterate over some deleted entries of lock-less list of given type
+ * safe against removal of list entry
+ * @pos: the type * to use as a loop cursor.
+ * @n: another type * to use as temporary storage
+ * @node: the first entry of deleted list entries.
+ * @member:	the name of the llist_node within the struct.
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being removed from list, so start with an entry
+ * instead of list head.
+ *
+ * If being used on entries deleted from lock-less list directly, the
+ * traverse order is from the newest to the oldest added entry. If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each_entry_safe(pos, n, node, member) \
+ for (pos = llist_entry((node), typeof(*pos), member); \
+ &pos->member != NULL && \
+ (n = llist_entry(pos->member.next, typeof(*n), member), true); \
+ pos = n)
+
+/**
+ * llist_empty - tests whether a lock-less list is empty
+ * @head: the list to test
+ *
+ * Not guaranteed to be accurate or up to date. Just a quick way to
+ * test whether the list is empty without deleting something from the
+ * list.
+ */
+static inline bool llist_empty(const struct llist_head *head)
+{
+ return ACCESS_ONCE(head->first) == NULL;
+}
+
+static inline struct llist_node *llist_next(struct llist_node *node)
+{
+ return node->next;
+}
+
+extern bool llist_add_batch(struct llist_node *new_first,
+ struct llist_node *new_last,
+ struct llist_head *head);
+/**
+ * llist_add - add a new entry
+ * @new: new entry to be added
+ * @head: the head for your lock-less list
+ *
+ * Returns true if the list was empty prior to adding this entry.
+ */
+static inline bool llist_add(struct llist_node *new, struct llist_head *head)
+{
+ return llist_add_batch(new, new, head);
+}
+
+/**
+ * llist_del_all - delete all entries from lock-less list
+ * @head: the head of lock-less list to delete all entries
+ *
+ * If list is empty, return NULL, otherwise, delete all entries and
+ * return the pointer to the first entry. The order of entries
+ * deleted is from the newest to the oldest added one.
+ */
+static inline struct llist_node *llist_del_all(struct llist_head *head)
+{
+ return xchg(&head->first, NULL);
+}
+
+extern struct llist_node *llist_del_first(struct llist_head *head);
+
+struct llist_node *llist_reverse_order(struct llist_node *head);
+
+#endif /* __TOOLS_LINUX_LLIST_H */
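
A hedged producer/consumer sketch for the lock-less list above; llist_add_batch() and llist_reverse_order() are only declared here, so this assumes their implementations are linked in, and the msg/post/drain names are illustrative:

#include <linux/llist.h>
#include <stdio.h>

struct msg {
	int			val;
	struct llist_node	node;
};

static LLIST_HEAD(inbox);

/* producer side: lock-less push, safe from many threads at once */
static void post(struct msg *m)
{
	llist_add(&m->node, &inbox);
}

/* single consumer: detach the whole batch, then restore FIFO order */
static void drain(void)
{
	struct llist_node *batch = llist_del_all(&inbox);
	struct msg *m;

	batch = llist_reverse_order(batch);	/* del_all yields newest first */
	llist_for_each_entry(m, batch, node)
		printf("got %d\n", m->val);
}
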
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
new file mode 100644
index 0000000..d95d8da
--- /dev/null
+++ b/include/linux/lockdep.h
@@ -0,0 +1,55 @@
+#ifndef __TOOLS_LINUX_LOCKDEP_H
+#define __TOOLS_LINUX_LOCKDEP_H
+
+struct lock_class_key {};
+struct task_struct;
+
+# define lock_acquire(l, s, t, r, c, n, i) do { } while (0)
+# define lock_release(l, n, i) do { } while (0)
+# define lock_set_class(l, n, k, s, i) do { } while (0)
+# define lock_set_subclass(l, s, i) do { } while (0)
+# define lockdep_set_current_reclaim_state(g) do { } while (0)
+# define lockdep_clear_current_reclaim_state() do { } while (0)
+# define lockdep_trace_alloc(g) do { } while (0)
+# define lockdep_info() do { } while (0)
+# define lockdep_init_map(lock, name, key, sub) \
+ do { (void)(name); (void)(key); } while (0)
+# define lockdep_set_class(lock, key) do { (void)(key); } while (0)
+# define lockdep_set_class_and_name(lock, key, name) \
+ do { (void)(key); (void)(name); } while (0)
+#define lockdep_set_class_and_subclass(lock, key, sub) \
+ do { (void)(key); } while (0)
+#define lockdep_set_subclass(lock, sub) do { } while (0)
+
+#define lockdep_set_novalidate_class(lock) do { } while (0)
+
+#define lockdep_assert_held(l) do { (void)(l); } while (0)
+#define lockdep_assert_held_once(l) do { (void)(l); } while (0)
+
+#define lock_acquire_shared(l, s, t, n, i)
+
+#define lockdep_acquire_shared(lock)
+
+#define lock_contended(lockdep_map, ip) do {} while (0)
+#define lock_acquired(lockdep_map, ip) do {} while (0)
+
+static inline void debug_show_all_locks(void)
+{
+}
+
+static inline void debug_show_held_locks(struct task_struct *task)
+{
+}
+
+static inline void
+debug_check_no_locks_freed(const void *from, unsigned long len)
+{
+}
+
+static inline void
+debug_check_no_locks_held(void)
+{
+}
+
+#endif /* __TOOLS_LINUX_LOCKDEP_H */
+
diff --git a/include/linux/log2.h b/include/linux/log2.h
new file mode 100644
index 0000000..395cda2
--- /dev/null
+++ b/include/linux/log2.h
@@ -0,0 +1,187 @@
+/* Integer base 2 logarithm calculation
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _TOOLS_LINUX_LOG2_H
+#define _TOOLS_LINUX_LOG2_H
+
+#include <linux/bitops.h>
+
+/*
+ * deal with unrepresentable constant logarithms
+ */
+extern __attribute__((const, noreturn))
+int ____ilog2_NaN(void);
+
+/*
+ * non-constant log of base 2 calculators
+ * - the arch may override these in asm/bitops.h if they can be implemented
+ * more efficiently than using fls() and fls64()
+ * - the arch is not required to handle n==0 if implementing the fallback
+ */
+static inline __attribute__((const))
+int __ilog2_u32(u32 n)
+{
+ return fls(n) - 1;
+}
+
+static inline __attribute__((const))
+int __ilog2_u64(u64 n)
+{
+ return fls64(n) - 1;
+}
+
+/*
+ * Determine whether some value is a power of two, where zero is
+ * *not* considered a power of two.
+ */
+
+static inline __attribute__((const))
+bool is_power_of_2(unsigned long n)
+{
+ return (n != 0 && ((n & (n - 1)) == 0));
+}
+
+/*
+ * round up to nearest power of two
+ */
+static inline __attribute__((const))
+unsigned long __roundup_pow_of_two(unsigned long n)
+{
+ return 1UL << fls_long(n - 1);
+}
+
+/*
+ * round down to nearest power of two
+ */
+static inline __attribute__((const))
+unsigned long __rounddown_pow_of_two(unsigned long n)
+{
+ return 1UL << (fls_long(n) - 1);
+}
+
+/**
+ * ilog2 - log of base 2 of 32-bit or a 64-bit unsigned value
+ * @n - parameter
+ *
+ * constant-capable log of base 2 calculation
+ * - this can be used to initialise global variables from constant data, hence
+ * the massive ternary operator construction
+ *
+ * selects the appropriately-sized optimised version depending on sizeof(n)
+ */
+#define ilog2(n) \
+( \
+ __builtin_constant_p(n) ? ( \
+ (n) < 1 ? ____ilog2_NaN() : \
+ (n) & (1ULL << 63) ? 63 : \
+ (n) & (1ULL << 62) ? 62 : \
+ (n) & (1ULL << 61) ? 61 : \
+ (n) & (1ULL << 60) ? 60 : \
+ (n) & (1ULL << 59) ? 59 : \
+ (n) & (1ULL << 58) ? 58 : \
+ (n) & (1ULL << 57) ? 57 : \
+ (n) & (1ULL << 56) ? 56 : \
+ (n) & (1ULL << 55) ? 55 : \
+ (n) & (1ULL << 54) ? 54 : \
+ (n) & (1ULL << 53) ? 53 : \
+ (n) & (1ULL << 52) ? 52 : \
+ (n) & (1ULL << 51) ? 51 : \
+ (n) & (1ULL << 50) ? 50 : \
+ (n) & (1ULL << 49) ? 49 : \
+ (n) & (1ULL << 48) ? 48 : \
+ (n) & (1ULL << 47) ? 47 : \
+ (n) & (1ULL << 46) ? 46 : \
+ (n) & (1ULL << 45) ? 45 : \
+ (n) & (1ULL << 44) ? 44 : \
+ (n) & (1ULL << 43) ? 43 : \
+ (n) & (1ULL << 42) ? 42 : \
+ (n) & (1ULL << 41) ? 41 : \
+ (n) & (1ULL << 40) ? 40 : \
+ (n) & (1ULL << 39) ? 39 : \
+ (n) & (1ULL << 38) ? 38 : \
+ (n) & (1ULL << 37) ? 37 : \
+ (n) & (1ULL << 36) ? 36 : \
+ (n) & (1ULL << 35) ? 35 : \
+ (n) & (1ULL << 34) ? 34 : \
+ (n) & (1ULL << 33) ? 33 : \
+ (n) & (1ULL << 32) ? 32 : \
+ (n) & (1ULL << 31) ? 31 : \
+ (n) & (1ULL << 30) ? 30 : \
+ (n) & (1ULL << 29) ? 29 : \
+ (n) & (1ULL << 28) ? 28 : \
+ (n) & (1ULL << 27) ? 27 : \
+ (n) & (1ULL << 26) ? 26 : \
+ (n) & (1ULL << 25) ? 25 : \
+ (n) & (1ULL << 24) ? 24 : \
+ (n) & (1ULL << 23) ? 23 : \
+ (n) & (1ULL << 22) ? 22 : \
+ (n) & (1ULL << 21) ? 21 : \
+ (n) & (1ULL << 20) ? 20 : \
+ (n) & (1ULL << 19) ? 19 : \
+ (n) & (1ULL << 18) ? 18 : \
+ (n) & (1ULL << 17) ? 17 : \
+ (n) & (1ULL << 16) ? 16 : \
+ (n) & (1ULL << 15) ? 15 : \
+ (n) & (1ULL << 14) ? 14 : \
+ (n) & (1ULL << 13) ? 13 : \
+ (n) & (1ULL << 12) ? 12 : \
+ (n) & (1ULL << 11) ? 11 : \
+ (n) & (1ULL << 10) ? 10 : \
+ (n) & (1ULL << 9) ? 9 : \
+ (n) & (1ULL << 8) ? 8 : \
+ (n) & (1ULL << 7) ? 7 : \
+ (n) & (1ULL << 6) ? 6 : \
+ (n) & (1ULL << 5) ? 5 : \
+ (n) & (1ULL << 4) ? 4 : \
+ (n) & (1ULL << 3) ? 3 : \
+ (n) & (1ULL << 2) ? 2 : \
+ (n) & (1ULL << 1) ? 1 : \
+ (n) & (1ULL << 0) ? 0 : \
+ ____ilog2_NaN() \
+ ) : \
+ (sizeof(n) <= 4) ? \
+ __ilog2_u32(n) : \
+ __ilog2_u64(n) \
+ )
+
+/**
+ * roundup_pow_of_two - round the given value up to nearest power of two
+ * @n - parameter
+ *
+ * round the given value up to the nearest power of two
+ * - the result is undefined when n == 0
+ * - this can be used to initialise global variables from constant data
+ */
+#define roundup_pow_of_two(n) \
+( \
+ __builtin_constant_p(n) ? ( \
+ (n == 1) ? 1 : \
+ (1UL << (ilog2((n) - 1) + 1)) \
+ ) : \
+ __roundup_pow_of_two(n) \
+ )
+
+/**
+ * rounddown_pow_of_two - round the given value down to nearest power of two
+ * @n - parameter
+ *
+ * round the given value down to the nearest power of two
+ * - the result is undefined when n == 0
+ * - this can be used to initialise global variables from constant data
+ */
+#define rounddown_pow_of_two(n) \
+( \
+ __builtin_constant_p(n) ? ( \
+ (1UL << ilog2(n))) : \
+ __rounddown_pow_of_two(n) \
+ )
+
+#endif /* _TOOLS_LINUX_LOG2_H */
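
A quick sketch of the helpers above: ilog2() folds to a constant for constant arguments and falls back to fls()/fls64() otherwise, while the rounding macros defer to the __roundup/__rounddown helpers for non-constant values.

#include <linux/log2.h>
#include <stdio.h>

int main(void)
{
	unsigned long n = 1000;

	printf("%d\n", ilog2(4096));			/* constant-folded: 12 */
	printf("%lu\n", roundup_pow_of_two(n));		/* 1024 */
	printf("%lu\n", rounddown_pow_of_two(n));	/* 512 */
	printf("%d\n", is_power_of_2(n));		/* 0 */
	return 0;
}
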
diff --git a/include/linux/lz4.h b/include/linux/lz4.h
new file mode 100644
index 0000000..6b784c5
--- /dev/null
+++ b/include/linux/lz4.h
@@ -0,0 +1,87 @@
+#ifndef __LZ4_H__
+#define __LZ4_H__
+/*
+ * LZ4 Kernel Interface
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define LZ4_MEM_COMPRESS (16384)
+#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *)))
+
+/*
+ * lz4_compressbound()
+ * Provides the maximum size that LZ4 may output in a "worst case" scenario
+ * (input data not compressible)
+ */
+static inline size_t lz4_compressbound(size_t isize)
+{
+ return isize + (isize / 255) + 16;
+}
+
+/*
+ * lz4_compress()
+ * src : source address of the original data
+ * src_len : size of the original data
+ * dst : output buffer address of the compressed data
+ * This requires 'dst' of size LZ4_COMPRESSBOUND.
+ * dst_len : is the output size, which is returned after compress done
+ *	wrkmem  : address of the working memory.
+ *		This requires 'wrkmem' of size LZ4_MEM_COMPRESS.
+ * return : Success if return 0
+ * Error if return (< 0)
+ * note : Destination buffer and workmem must be already allocated with
+ * the defined size.
+ */
+int lz4_compress(const unsigned char *src, size_t src_len,
+ unsigned char *dst, size_t *dst_len, void *wrkmem);
+
+ /*
+ * lz4hc_compress()
+ * src : source address of the original data
+ * src_len : size of the original data
+ * dst : output buffer address of the compressed data
+ * This requires 'dst' of size LZ4_COMPRESSBOUND.
+ * dst_len : is the output size, which is returned after compress done
+ *	wrkmem  : address of the working memory.
+ *		This requires 'wrkmem' of size LZ4HC_MEM_COMPRESS.
+ * return : Success if return 0
+ * Error if return (< 0)
+ * note : Destination buffer and workmem must be already allocated with
+ * the defined size.
+ */
+int lz4hc_compress(const unsigned char *src, size_t src_len,
+ unsigned char *dst, size_t *dst_len, void *wrkmem);
+
+/*
+ * lz4_decompress()
+ * src : source address of the compressed data
+ *	src_len : is the input size, which is returned after decompress done
+ * dest : output buffer address of the decompressed data
+ * actual_dest_len: is the size of uncompressed data, supposing it's known
+ * return : Success if return 0
+ * Error if return (< 0)
+ * note : Destination buffer must be already allocated.
+ * slightly faster than lz4_decompress_unknownoutputsize()
+ */
+int lz4_decompress(const unsigned char *src, size_t *src_len,
+ unsigned char *dest, size_t actual_dest_len);
+
+/*
+ * lz4_decompress_unknownoutputsize()
+ * src : source address of the compressed data
+ * src_len : is the input size, therefore the compressed size
+ * dest : output buffer address of the decompressed data
+ * dest_len: is the max size of the destination buffer, which is
+ * returned with actual size of decompressed data after
+ * decompress done
+ * return : Success if return 0
+ * Error if return (< 0)
+ * note : Destination buffer must be already allocated.
+ */
+int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
+ unsigned char *dest, size_t *dest_len);
+#endif
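
A hedged round-trip sketch for the interface above; the compressor itself is not in this header, so this assumes the LZ4 objects are linked in, and lz4_roundtrip() is an invented helper name:

#include <linux/lz4.h>
#include <stdlib.h>
#include <string.h>

static int lz4_roundtrip(const unsigned char *in, size_t in_len)
{
	size_t out_len = lz4_compressbound(in_len);	/* worst-case output size */
	size_t src_used;
	unsigned char *out = malloc(out_len);
	unsigned char *back = malloc(in_len);
	void *wrkmem = malloc(LZ4_MEM_COMPRESS);
	int ret = -1;

	if (!out || !back || !wrkmem)
		goto err;

	if (lz4_compress(in, in_len, out, &out_len, wrkmem))
		goto err;		/* on success out_len holds the compressed size */

	src_used = out_len;
	if (lz4_decompress(out, &src_used, back, in_len))
		goto err;		/* src_used returns the compressed bytes consumed */

	ret = memcmp(in, back, in_len) ? -1 : 0;
err:
	free(out);
	free(back);
	free(wrkmem);
	return ret;
}
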
diff --git a/include/linux/math64.h b/include/linux/math64.h
new file mode 100644
index 0000000..5eb6f06
--- /dev/null
+++ b/include/linux/math64.h
@@ -0,0 +1,85 @@
+#ifndef _LINUX_MATH64_H
+#define _LINUX_MATH64_H
+
+#include <linux/types.h>
+
+#define do_div(n,base) ({ \
+ u32 __base = (base); \
+ u32 __rem; \
+ __rem = ((u64)(n)) % __base; \
+ (n) = ((u64)(n)) / __base; \
+ __rem; \
+ })
+
+#define div64_long(x, y) div64_s64((x), (y))
+#define div64_ul(x, y) div64_u64((x), (y))
+
+/**
+ * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder
+ *
+ * This is commonly provided by 32bit archs to provide an optimized 64bit
+ * divide.
+ */
+static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
+{
+ *remainder = dividend % divisor;
+ return dividend / divisor;
+}
+
+/**
+ * div_s64_rem - signed 64bit divide with 32bit divisor with remainder
+ */
+static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder)
+{
+ *remainder = dividend % divisor;
+ return dividend / divisor;
+}
+
+/**
+ * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder
+ */
+static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
+{
+ *remainder = dividend % divisor;
+ return dividend / divisor;
+}
+
+/**
+ * div64_u64 - unsigned 64bit divide with 64bit divisor
+ */
+static inline u64 div64_u64(u64 dividend, u64 divisor)
+{
+ return dividend / divisor;
+}
+
+/**
+ * div64_s64 - signed 64bit divide with 64bit divisor
+ */
+static inline s64 div64_s64(s64 dividend, s64 divisor)
+{
+ return dividend / divisor;
+}
+
+/**
+ * div_u64 - unsigned 64bit divide with 32bit divisor
+ *
+ * This is the most common 64bit divide and should be used if possible,
+ * as many 32bit archs can optimize this variant better than a full 64bit
+ * divide.
+ */
+static inline u64 div_u64(u64 dividend, u32 divisor)
+{
+ u32 remainder;
+ return div_u64_rem(dividend, divisor, &remainder);
+}
+
+/**
+ * div_s64 - signed 64bit divide with 32bit divisor
+ */
+static inline s64 div_s64(s64 dividend, s32 divisor)
+{
+ s32 remainder;
+ return div_s64_rem(dividend, divisor, &remainder);
+}
+
+#endif /* _LINUX_MATH64_H */
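
A small sketch of the division helpers above; the main gotcha is that do_div() divides its first argument in place and the macro itself evaluates to the remainder:

#include <linux/math64.h>
#include <stdio.h>

int main(void)
{
	u64 bytes = 1000003;
	u32 rem;

	rem = do_div(bytes, 512);	/* bytes becomes the quotient */
	printf("%llu sectors, %u bytes left\n",
	       (unsigned long long) bytes, rem);	/* 1953 sectors, 67 bytes left */

	printf("%llu\n", (unsigned long long) div_u64(1ULL << 40, 3));
	return 0;
}
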
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
new file mode 100644
index 0000000..c2789f9
--- /dev/null
+++ b/include/linux/mempool.h
@@ -0,0 +1,78 @@
+/*
+ * memory buffer pool support
+ */
+#ifndef _LINUX_MEMPOOL_H
+#define _LINUX_MEMPOOL_H
+
+#include <linux/compiler.h>
+#include <linux/bug.h>
+#include <linux/slab.h>
+
+struct kmem_cache;
+
+typedef struct mempool_s {
+ size_t elem_size;
+} mempool_t;
+
+extern int mempool_resize(mempool_t *pool, int new_min_nr);
+
+static inline void mempool_free(void *element, mempool_t *pool)
+{
+ free(element);
+}
+
+static inline void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc
+{
+ BUG_ON(!pool->elem_size);
+ return kmalloc(pool->elem_size, gfp_mask);
+}
+
+static inline void mempool_exit(mempool_t *pool) {}
+
+static inline void mempool_destroy(mempool_t *pool)
+{
+ free(pool);
+}
+
+static inline int
+mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc)
+{
+ pool->elem_size = 0;
+ return 0;
+}
+
+static inline mempool_t *
+mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
+{
+ mempool_t *pool = malloc(sizeof(*pool));
+ pool->elem_size = 0;
+ return pool;
+}
+
+static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+ pool->elem_size = size;
+ return 0;
+}
+
+static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
+{
+ mempool_t *pool = malloc(sizeof(*pool));
+ pool->elem_size = size;
+ return pool;
+}
+
+static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order)
+{
+ pool->elem_size = PAGE_SIZE << order;
+ return 0;
+}
+
+static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
+{
+ mempool_t *pool = malloc(sizeof(*pool));
+ pool->elem_size = PAGE_SIZE << order;
+ return pool;
+}
+
+#endif /* _LINUX_MEMPOOL_H */
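
In this shim a mempool is just a recorded element size backed by kmalloc(), so a usage sketch is short; it assumes GFP_KERNEL and kmalloc() come in via the shim's slab.h:

#include <linux/mempool.h>
#include <stdio.h>

int main(void)
{
	mempool_t *pool = mempool_create_kmalloc_pool(16, 128);
	void *buf;

	if (!pool)
		return 1;

	buf = mempool_alloc(pool, GFP_KERNEL);	/* kmalloc(128, GFP_KERNEL) underneath */
	if (!buf)
		return 1;

	mempool_free(buf, pool);	/* plain free() in this shim */
	mempool_destroy(pool);
	return 0;
}
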
diff --git a/include/linux/mm.h b/include/linux/mm.h
new file mode 100644
index 0000000..3830bc2
--- /dev/null
+++ b/include/linux/mm.h
@@ -0,0 +1 @@
+#include <linux/slab.h>
diff --git a/include/linux/module.h b/include/linux/module.h
new file mode 100644
index 0000000..3d988c1
--- /dev/null
+++ b/include/linux/module.h
@@ -0,0 +1,46 @@
+#ifndef _LINUX_MODULE_H
+#define _LINUX_MODULE_H
+
+#include <linux/stat.h>
+#include <linux/compiler.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+
+struct module;
+
+#define module_init(initfn) \
+ __attribute__((constructor(109))) \
+ static void __call_##initfn(void) { BUG_ON(initfn()); }
+
+#if 0
+#define module_exit(exitfn) \
+ __attribute__((destructor(109))) \
+ static void __call_##exitfn(void) { exitfn(); }
+#endif
+
+#define module_exit(exitfn) \
+ __attribute__((unused)) \
+ static void __call_##exitfn(void) { exitfn(); }
+
+#define MODULE_INFO(tag, info)
+#define MODULE_ALIAS(_alias)
+#define MODULE_SOFTDEP(_softdep)
+#define MODULE_LICENSE(_license)
+#define MODULE_AUTHOR(_author)
+#define MODULE_DESCRIPTION(_description)
+#define MODULE_VERSION(_version)
+
+static inline void __module_get(struct module *module)
+{
+}
+
+static inline int try_module_get(struct module *module)
+{
+ return 1;
+}
+
+static inline void module_put(struct module *module)
+{
+}
+
+#endif /* _LINUX_MODULE_H */
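
module_init() above turns into a GCC constructor that runs before main() and BUG_ON()s a non-zero return, while module_exit() compiles to an unused stub. A hedged sketch follows; the example_* names are made up, and BUG_ON() is assumed to be visible via the shim's bug.h:

#include <linux/module.h>
#include <linux/bug.h>
#include <stdio.h>

static int example_init(void)
{
	printf("runs before main(), via a constructor\n");
	return 0;			/* non-zero would trip the BUG_ON() */
}

static void example_exit(void)
{
	/* referenced only by the unused __call_example_exit() stub, never runs */
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

int main(void)
{
	return 0;
}
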
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
new file mode 100644
index 0000000..6002673
--- /dev/null
+++ b/include/linux/moduleparam.h
@@ -0,0 +1,7 @@
+#ifndef _LINUX_MODULE_PARAMS_H
+#define _LINUX_MODULE_PARAMS_H
+
+#define module_param_named(name, value, type, perm)
+#define MODULE_PARM_DESC(_parm, desc)
+
+#endif /* _LINUX_MODULE_PARAMS_H */
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
new file mode 100644
index 0000000..964bd33
--- /dev/null
+++ b/include/linux/mutex.h
@@ -0,0 +1,15 @@
+#ifndef __TOOLS_LINUX_MUTEX_H
+#define __TOOLS_LINUX_MUTEX_H
+
+#include <pthread.h>
+
+struct mutex {
+ pthread_mutex_t lock;
+};
+
+#define mutex_init(l) pthread_mutex_init(&(l)->lock, NULL)
+#define mutex_lock(l) pthread_mutex_lock(&(l)->lock)
+#define mutex_trylock(l) (!pthread_mutex_trylock(&(l)->lock))
+#define mutex_unlock(l) pthread_mutex_unlock(&(l)->lock)
+
+#endif /* __TOOLS_LINUX_MUTEX_H */
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
new file mode 100644
index 0000000..29bd2e1
--- /dev/null
+++ b/include/linux/notifier.h
@@ -0,0 +1,197 @@
+/*
+ * Routines to manage notifier chains for passing status changes to any
+ * interested routines. We need this instead of hard coded call lists so
+ * that modules can poke their nose into the innards. The network devices
+ * needed them so here they are for the rest of you.
+ *
+ * Alan Cox <Alan.Cox@linux.org>
+ */
+
+#ifndef _LINUX_NOTIFIER_H
+#define _LINUX_NOTIFIER_H
+#include <linux/errno.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+//#include <linux/srcu.h>
+
+/*
+ * Notifier chains are of four types:
+ *
+ * Atomic notifier chains: Chain callbacks run in interrupt/atomic
+ * context. Callouts are not allowed to block.
+ * Blocking notifier chains: Chain callbacks run in process context.
+ * Callouts are allowed to block.
+ * Raw notifier chains: There are no restrictions on callbacks,
+ * registration, or unregistration. All locking and protection
+ * must be provided by the caller.
+ * SRCU notifier chains: A variant of blocking notifier chains, with
+ * the same restrictions.
+ *
+ * atomic_notifier_chain_register() may be called from an atomic context,
+ * but blocking_notifier_chain_register() and srcu_notifier_chain_register()
+ * must be called from a process context. Ditto for the corresponding
+ * _unregister() routines.
+ *
+ * atomic_notifier_chain_unregister(), blocking_notifier_chain_unregister(),
+ * and srcu_notifier_chain_unregister() _must not_ be called from within
+ * the call chain.
+ *
+ * SRCU notifier chains are an alternative form of blocking notifier chains.
+ * They use SRCU (Sleepable Read-Copy Update) instead of rw-semaphores for
+ * protection of the chain links. This means there is _very_ low overhead
+ * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
+ * As compensation, srcu_notifier_chain_unregister() is rather expensive.
+ * SRCU notifier chains should be used when the chain will be called very
+ * often but notifier_blocks will seldom be removed. Also, SRCU notifier
+ * chains are slightly more difficult to use because they require special
+ * runtime initialization.
+ */
+
+struct notifier_block;
+
+typedef int (*notifier_fn_t)(struct notifier_block *nb,
+ unsigned long action, void *data);
+
+struct notifier_block {
+ notifier_fn_t notifier_call;
+ struct notifier_block __rcu *next;
+ int priority;
+};
+
+struct atomic_notifier_head {
+ spinlock_t lock;
+ struct notifier_block __rcu *head;
+};
+
+struct blocking_notifier_head {
+ struct rw_semaphore rwsem;
+ struct notifier_block __rcu *head;
+};
+
+struct raw_notifier_head {
+ struct notifier_block __rcu *head;
+};
+
+#define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \
+ spin_lock_init(&(name)->lock); \
+ (name)->head = NULL; \
+ } while (0)
+#define BLOCKING_INIT_NOTIFIER_HEAD(name) do { \
+ init_rwsem(&(name)->rwsem); \
+ (name)->head = NULL; \
+ } while (0)
+#define RAW_INIT_NOTIFIER_HEAD(name) do { \
+ (name)->head = NULL; \
+ } while (0)
+
+#define ATOMIC_NOTIFIER_INIT(name) { \
+ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
+ .head = NULL }
+#define BLOCKING_NOTIFIER_INIT(name) { \
+ .rwsem = __RWSEM_INITIALIZER((name).rwsem), \
+ .head = NULL }
+#define RAW_NOTIFIER_INIT(name) { \
+ .head = NULL }
+
+#define ATOMIC_NOTIFIER_HEAD(name) \
+ struct atomic_notifier_head name = \
+ ATOMIC_NOTIFIER_INIT(name)
+#define BLOCKING_NOTIFIER_HEAD(name) \
+ struct blocking_notifier_head name = \
+ BLOCKING_NOTIFIER_INIT(name)
+#define RAW_NOTIFIER_HEAD(name) \
+ struct raw_notifier_head name = \
+ RAW_NOTIFIER_INIT(name)
+
+#define NOTIFY_DONE 0x0000 /* Don't care */
+#define NOTIFY_OK 0x0001 /* Suits me */
+#define NOTIFY_STOP_MASK 0x8000 /* Don't call further */
+#define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002)
+ /* Bad/Veto action */
+/*
+ * Clean way to return from the notifier and stop further calls.
+ */
+#define NOTIFY_STOP (NOTIFY_OK|NOTIFY_STOP_MASK)
+
+#ifdef __KERNEL__
+
+extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
+ struct notifier_block *nb);
+extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *nb);
+extern int raw_notifier_chain_register(struct raw_notifier_head *nh,
+ struct notifier_block *nb);
+
+extern int blocking_notifier_chain_cond_register(
+ struct blocking_notifier_head *nh,
+ struct notifier_block *nb);
+
+extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
+ struct notifier_block *nb);
+extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
+ struct notifier_block *nb);
+extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
+ struct notifier_block *nb);
+
+extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v);
+extern int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v, int nr_to_call, int *nr_calls);
+extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v);
+extern int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v, int nr_to_call, int *nr_calls);
+extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
+ unsigned long val, void *v);
+extern int __raw_notifier_call_chain(struct raw_notifier_head *nh,
+ unsigned long val, void *v, int nr_to_call, int *nr_calls);
+
+/* Encapsulate (negative) errno value (in particular, NOTIFY_BAD <=> EPERM). */
+static inline int notifier_from_errno(int err)
+{
+ if (err)
+ return NOTIFY_STOP_MASK | (NOTIFY_OK - err);
+
+ return NOTIFY_OK;
+}
+
+/* Restore (negative) errno value from notify return value. */
+static inline int notifier_to_errno(int ret)
+{
+ ret &= ~NOTIFY_STOP_MASK;
+ return ret > NOTIFY_OK ? NOTIFY_OK - ret : 0;
+}
+
+/*
+ * Declared notifiers so far. I can imagine quite a few more chains
+ * over time (eg laptop power reset chains, reboot chain (to clean
+ * device units up), device [un]mount chain, module load/unload chain,
+ * low memory chain, screenblank chain (for plug in modular screenblankers)
+ * VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
+ */
+
+/* CPU notifiers are defined in include/linux/cpu.h. */
+
+/* netdevice notifiers are defined in include/linux/netdevice.h */
+
+/* reboot notifiers are defined in include/linux/reboot.h. */
+
+/* Hibernation and suspend events are defined in include/linux/suspend.h. */
+
+/* Virtual Terminal events are defined in include/linux/vt.h. */
+
+#define NETLINK_URELEASE 0x0001 /* Unicast netlink socket released */
+
+/* Console keyboard events.
+ * Note: KBD_KEYCODE is always sent before KBD_UNBOUND_KEYCODE, KBD_UNICODE and
+ * KBD_KEYSYM. */
+#define KBD_KEYCODE 0x0001 /* Keyboard keycode, called before any other */
+#define KBD_UNBOUND_KEYCODE 0x0002 /* Keyboard keycode which is not bound to any other */
+#define KBD_UNICODE 0x0003 /* Keyboard unicode */
+#define KBD_KEYSYM 0x0004 /* Keyboard keysym */
+#define KBD_POST_KEYSYM 0x0005 /* Called after keyboard keysym interpretation */
+
+extern struct blocking_notifier_head reboot_notifier_list;
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_NOTIFIER_H */
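
Not part of the patch: a minimal usage sketch of the blocking-notifier API declared above, assuming the chain implementations (blocking_notifier_chain_register() and friends) are linked in from elsewhere in the tree. my_events, my_nb and example() are hypothetical names.

#include <errno.h>
#include <linux/notifier.h>

static int my_notifier_cb(struct notifier_block *nb,
			  unsigned long action, void *data)
{
	/* Veto action 42, accept everything else. */
	return action == 42 ? notifier_from_errno(-EPERM) : NOTIFY_OK;
}

static struct notifier_block my_nb = {
	.notifier_call	= my_notifier_cb,
};

static struct blocking_notifier_head my_events;

static int example(void)
{
	int ret;

	BLOCKING_INIT_NOTIFIER_HEAD(&my_events);
	blocking_notifier_chain_register(&my_events, &my_nb);
	ret = blocking_notifier_call_chain(&my_events, 42, NULL);
	blocking_notifier_chain_unregister(&my_events, &my_nb);

	/* notifier_to_errno() recovers -EPERM from the NOTIFY_BAD return. */
	return notifier_to_errno(ret);
}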
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
new file mode 100644
index 0000000..bde9f0d
--- /dev/null
+++ b/include/linux/osq_lock.h
@@ -0,0 +1,44 @@
+#ifndef __LINUX_OSQ_LOCK_H
+#define __LINUX_OSQ_LOCK_H
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ */
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # + 1 value */
+};
+
+struct optimistic_spin_queue {
+ /*
+ * Stores an encoded value of the CPU # of the tail node in the queue.
+ * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
+ */
+ atomic_t tail;
+};
+
+#define OSQ_UNLOCKED_VAL (0)
+
+/* Init macro and function. */
+#define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) }
+
+static inline void osq_lock_init(struct optimistic_spin_queue *lock)
+{
+ atomic_set(&lock->tail, OSQ_UNLOCKED_VAL);
+}
+
+static inline bool osq_lock(struct optimistic_spin_queue *lock)
+{
+ return false;
+}
+
+static inline void osq_unlock(struct optimistic_spin_queue *lock) {}
+
+static inline bool osq_is_locked(struct optimistic_spin_queue *lock)
+{
+ return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;
+}
+
+#endif
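
Not part of the patch: in this userspace shim osq_lock() never succeeds, so any optimistic-spin path built on it immediately falls back to the sleeping slow path. A hedged sketch of the calling pattern (the spin body is elided):

static bool try_optimistic_spin(struct optimistic_spin_queue *osq)
{
	if (!osq_lock(osq))		/* always false in userspace */
		return false;

	/* ...spin waiting for the lock owner in a real kernel build... */

	osq_unlock(osq);
	return true;
}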
diff --git a/include/linux/page.h b/include/linux/page.h
new file mode 100644
index 0000000..c99d9de
--- /dev/null
+++ b/include/linux/page.h
@@ -0,0 +1,18 @@
+#ifndef _LINUX_PAGE_H
+#define _LINUX_PAGE_H
+
+#include <sys/user.h>
+
+struct page;
+
+#define virt_to_page(kaddr) ((struct page *) (kaddr))
+#define page_address(kaddr) ((void *) (kaddr))
+
+#define kmap_atomic(page) page_address(page)
+#define kunmap_atomic(addr) do {} while (0)
+
+static const char zero_page[PAGE_SIZE];
+
+#define ZERO_PAGE(o) ((struct page *) &zero_page[0])
+
+#endif /* _LINUX_PAGE_H */
diff --git a/include/linux/path.h b/include/linux/path.h
new file mode 100644
index 0000000..d137218
--- /dev/null
+++ b/include/linux/path.h
@@ -0,0 +1,20 @@
+#ifndef _LINUX_PATH_H
+#define _LINUX_PATH_H
+
+struct dentry;
+struct vfsmount;
+
+struct path {
+ struct vfsmount *mnt;
+ struct dentry *dentry;
+};
+
+extern void path_get(const struct path *);
+extern void path_put(const struct path *);
+
+static inline int path_equal(const struct path *path1, const struct path *path2)
+{
+ return path1->mnt == path2->mnt && path1->dentry == path2->dentry;
+}
+
+#endif /* _LINUX_PATH_H */
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
new file mode 100644
index 0000000..5a98618
--- /dev/null
+++ b/include/linux/percpu-refcount.h
@@ -0,0 +1,183 @@
+#ifndef __TOOLS_LINUX_PERCPU_REFCOUNT_H
+#define __TOOLS_LINUX_PERCPU_REFCOUNT_H
+
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+
+struct percpu_ref;
+typedef void (percpu_ref_func_t)(struct percpu_ref *);
+
+/* flags set in the lower bits of percpu_ref->percpu_count_ptr */
+enum {
+ __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */
+ __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */
+ __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,
+
+ __PERCPU_REF_FLAG_BITS = 2,
+};
+
+/* @flags for percpu_ref_init() */
+enum {
+ /*
+ * Start w/ ref == 1 in atomic mode. Can be switched to percpu
+ * operation using percpu_ref_switch_to_percpu(). If initialized
+ * with this flag, the ref will stay in atomic mode until
+ * percpu_ref_switch_to_percpu() is invoked on it.
+ */
+ PERCPU_REF_INIT_ATOMIC = 1 << 0,
+
+ /*
+ * Start dead w/ ref == 0 in atomic mode. Must be revived with
+ * percpu_ref_reinit() before used. Implies INIT_ATOMIC.
+ */
+ PERCPU_REF_INIT_DEAD = 1 << 1,
+};
+
+struct percpu_ref {
+ atomic_long_t count;
+ percpu_ref_func_t *release;
+ percpu_ref_func_t *confirm_switch;
+};
+
+static inline void percpu_ref_exit(struct percpu_ref *ref) {}
+
+static inline int __must_check percpu_ref_init(struct percpu_ref *ref,
+ percpu_ref_func_t *release, unsigned int flags,
+ gfp_t gfp)
+{
+ unsigned long start_count = 0;
+
+ if (!(flags & PERCPU_REF_INIT_DEAD))
+ start_count++;
+
+ atomic_long_set(&ref->count, start_count);
+
+ ref->release = release;
+ return 0;
+}
+
+static inline void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
+ percpu_ref_func_t *confirm_switch) {}
+
+static inline void percpu_ref_switch_to_percpu(struct percpu_ref *ref) {}
+
+static inline void percpu_ref_reinit(struct percpu_ref *ref) {}
+
+/**
+ * percpu_ref_get_many - increment a percpu refcount
+ * @ref: percpu_ref to get
+ * @nr: number of references to get
+ *
+ * Analogous to atomic_long_add().
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
+{
+ atomic_long_add(nr, &ref->count);
+}
+
+/**
+ * percpu_ref_get - increment a percpu refcount
+ * @ref: percpu_ref to get
+ *
+ * Analogous to atomic_long_inc().
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline void percpu_ref_get(struct percpu_ref *ref)
+{
+ percpu_ref_get_many(ref, 1);
+}
+
+/**
+ * percpu_ref_tryget - try to increment a percpu refcount
+ * @ref: percpu_ref to try-get
+ *
+ * Increment a percpu refcount unless its count already reached zero.
+ * Returns %true on success; %false on failure.
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline bool percpu_ref_tryget(struct percpu_ref *ref)
+{
+ return atomic_long_inc_not_zero(&ref->count);
+}
+
+/**
+ * percpu_ref_tryget_live - try to increment a live percpu refcount
+ * @ref: percpu_ref to try-get
+ *
+ * Increment a percpu refcount unless it has already been killed. Returns
+ * %true on success; %false on failure.
+ *
+ * Completion of percpu_ref_kill() in itself doesn't guarantee that this
+ * function will fail. For such guarantee, percpu_ref_kill_and_confirm()
+ * should be used. After the confirm_kill callback is invoked, it's
+ * guaranteed that no new reference will be given out by
+ * percpu_ref_tryget_live().
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
+{
+ return atomic_long_inc_not_zero(&ref->count);
+}
+
+/**
+ * percpu_ref_put_many - decrement a percpu refcount
+ * @ref: percpu_ref to put
+ * @nr: number of references to put
+ *
+ * Decrement the refcount, and if 0, call the release function (which was passed
+ * to percpu_ref_init())
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
+{
+ if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
+ ref->release(ref);
+}
+
+/**
+ * percpu_ref_put - decrement a percpu refcount
+ * @ref: percpu_ref to put
+ *
+ * Decrement the refcount, and if 0, call the release function (which was passed
+ * to percpu_ref_init())
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline void percpu_ref_put(struct percpu_ref *ref)
+{
+ percpu_ref_put_many(ref, 1);
+}
+
+/**
+ * percpu_ref_kill - drop the initial ref
+ * @ref: percpu_ref to kill
+ *
+ * Must be used to drop the initial ref on a percpu refcount; must be called
+ * precisely once before shutdown.
+ */
+static inline void percpu_ref_kill(struct percpu_ref *ref)
+{
+ percpu_ref_put(ref);
+}
+
+/**
+ * percpu_ref_is_zero - test whether a percpu refcount reached zero
+ * @ref: percpu_ref to test
+ *
+ * Returns %true if @ref reached zero.
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
+{
+ return !atomic_long_read(&ref->count);
+}
+
+#endif /* __TOOLS_LINUX_PERCPU_REFCOUNT_H */
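
Not part of the patch: a lifecycle sketch for the shim above, where the percpu ref degenerates to a plain atomic_long_t. struct my_obj and my_release() are hypothetical; container_of() and GFP_KERNEL are assumed to come from the other shim headers.

#include <stdlib.h>
#include <linux/percpu-refcount.h>

struct my_obj {
	struct percpu_ref	ref;
	/* ...payload... */
};

static void my_release(struct percpu_ref *ref)
{
	free(container_of(ref, struct my_obj, ref));
}

static struct my_obj *my_obj_new(void)
{
	struct my_obj *obj = calloc(1, sizeof(*obj));

	if (obj && percpu_ref_init(&obj->ref, my_release, 0, GFP_KERNEL)) {
		free(obj);
		return NULL;
	}
	return obj;
}

/*
 * Users pair percpu_ref_get()/percpu_ref_put(); percpu_ref_kill() drops the
 * initial reference taken by percpu_ref_init(), and my_release() runs once
 * the count reaches zero.
 */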
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
new file mode 100644
index 0000000..ad24977
--- /dev/null
+++ b/include/linux/percpu.h
@@ -0,0 +1,189 @@
+#ifndef __TOOLS_LINUX_PERCPU_H
+#define __TOOLS_LINUX_PERCPU_H
+
+#define __percpu
+
+#define free_percpu(percpu) free(percpu)
+
+#define __alloc_percpu_gfp(size, align, gfp) calloc(1, size)
+#define __alloc_percpu(size, align) calloc(1, size)
+
+#define alloc_percpu_gfp(type, gfp) \
+ (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \
+ __alignof__(type), gfp)
+#define alloc_percpu(type) \
+ (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
+ __alignof__(type))
+
+#define __verify_pcpu_ptr(ptr)
+
+#define per_cpu_ptr(ptr, cpu) (ptr)
+#define raw_cpu_ptr(ptr) (ptr)
+#define this_cpu_ptr(ptr) raw_cpu_ptr(ptr)
+
+#define __pcpu_size_call_return(stem, variable) \
+({ \
+ typeof(variable) pscr_ret__; \
+ __verify_pcpu_ptr(&(variable)); \
+ switch(sizeof(variable)) { \
+ case 1: pscr_ret__ = stem##1(variable); break; \
+ case 2: pscr_ret__ = stem##2(variable); break; \
+ case 4: pscr_ret__ = stem##4(variable); break; \
+ case 8: pscr_ret__ = stem##8(variable); break; \
+ default: \
+ __bad_size_call_parameter(); break; \
+ } \
+ pscr_ret__; \
+})
+
+#define __pcpu_size_call_return2(stem, variable, ...) \
+({ \
+ typeof(variable) pscr2_ret__; \
+ __verify_pcpu_ptr(&(variable)); \
+ switch(sizeof(variable)) { \
+ case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \
+ case 2: pscr2_ret__ = stem##2(variable, __VA_ARGS__); break; \
+ case 4: pscr2_ret__ = stem##4(variable, __VA_ARGS__); break; \
+ case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
+ default: \
+ __bad_size_call_parameter(); break; \
+ } \
+ pscr2_ret__; \
+})
+
+/*
+ * Special handling for cmpxchg_double. cmpxchg_double is passed two
+ * percpu variables. The first has to be aligned to a double word
+ * boundary and the second has to follow directly thereafter.
+ * We enforce this on all architectures even if they don't support
+ * a double cmpxchg instruction, since it's a cheap requirement, and it
+ * avoids breaking the requirement for architectures with the instruction.
+ */
+#define __pcpu_double_call_return_bool(stem, pcp1, pcp2, ...) \
+({ \
+ bool pdcrb_ret__; \
+ __verify_pcpu_ptr(&(pcp1)); \
+ BUILD_BUG_ON(sizeof(pcp1) != sizeof(pcp2)); \
+ VM_BUG_ON((unsigned long)(&(pcp1)) % (2 * sizeof(pcp1))); \
+ VM_BUG_ON((unsigned long)(&(pcp2)) != \
+ (unsigned long)(&(pcp1)) + sizeof(pcp1)); \
+ switch(sizeof(pcp1)) { \
+ case 1: pdcrb_ret__ = stem##1(pcp1, pcp2, __VA_ARGS__); break; \
+ case 2: pdcrb_ret__ = stem##2(pcp1, pcp2, __VA_ARGS__); break; \
+ case 4: pdcrb_ret__ = stem##4(pcp1, pcp2, __VA_ARGS__); break; \
+ case 8: pdcrb_ret__ = stem##8(pcp1, pcp2, __VA_ARGS__); break; \
+ default: \
+ __bad_size_call_parameter(); break; \
+ } \
+ pdcrb_ret__; \
+})
+
+#define __pcpu_size_call(stem, variable, ...) \
+do { \
+ __verify_pcpu_ptr(&(variable)); \
+ switch(sizeof(variable)) { \
+ case 1: stem##1(variable, __VA_ARGS__);break; \
+ case 2: stem##2(variable, __VA_ARGS__);break; \
+ case 4: stem##4(variable, __VA_ARGS__);break; \
+ case 8: stem##8(variable, __VA_ARGS__);break; \
+ default: \
+ __bad_size_call_parameter();break; \
+ } \
+} while (0)
+
+#define raw_cpu_read(pcp) __pcpu_size_call_return(raw_cpu_read_, pcp)
+#define raw_cpu_write(pcp, val) __pcpu_size_call(raw_cpu_write_, pcp, val)
+#define raw_cpu_add(pcp, val) __pcpu_size_call(raw_cpu_add_, pcp, val)
+#define raw_cpu_and(pcp, val) __pcpu_size_call(raw_cpu_and_, pcp, val)
+#define raw_cpu_or(pcp, val) __pcpu_size_call(raw_cpu_or_, pcp, val)
+#define raw_cpu_add_return(pcp, val) __pcpu_size_call_return2(raw_cpu_add_return_, pcp, val)
+#define raw_cpu_xchg(pcp, nval) __pcpu_size_call_return2(raw_cpu_xchg_, pcp, nval)
+#define raw_cpu_cmpxchg(pcp, oval, nval) \
+ __pcpu_size_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval)
+#define raw_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
+ __pcpu_double_call_return_bool(raw_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2)
+
+#define raw_cpu_sub(pcp, val) raw_cpu_add(pcp, -(val))
+#define raw_cpu_inc(pcp) raw_cpu_add(pcp, 1)
+#define raw_cpu_dec(pcp) raw_cpu_sub(pcp, 1)
+#define raw_cpu_sub_return(pcp, val) raw_cpu_add_return(pcp, -(typeof(pcp))(val))
+#define raw_cpu_inc_return(pcp) raw_cpu_add_return(pcp, 1)
+#define raw_cpu_dec_return(pcp) raw_cpu_add_return(pcp, -1)
+
+#define __this_cpu_read(pcp) \
+({ \
+ raw_cpu_read(pcp); \
+})
+
+#define __this_cpu_write(pcp, val) \
+({ \
+ raw_cpu_write(pcp, val); \
+})
+
+#define __this_cpu_add(pcp, val) \
+({ \
+ raw_cpu_add(pcp, val); \
+})
+
+#define __this_cpu_and(pcp, val) \
+({ \
+ raw_cpu_and(pcp, val); \
+})
+
+#define __this_cpu_or(pcp, val) \
+({ \
+ raw_cpu_or(pcp, val); \
+})
+
+#define __this_cpu_add_return(pcp, val) \
+({ \
+ raw_cpu_add_return(pcp, val); \
+})
+
+#define __this_cpu_xchg(pcp, nval) \
+({ \
+ raw_cpu_xchg(pcp, nval); \
+})
+
+#define __this_cpu_cmpxchg(pcp, oval, nval) \
+({ \
+ raw_cpu_cmpxchg(pcp, oval, nval); \
+})
+
+#define __this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
+({ raw_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, \
+ nval1, nval2); })
+
+#define __this_cpu_sub(pcp, val) __this_cpu_add(pcp, -(typeof(pcp))(val))
+#define __this_cpu_inc(pcp) __this_cpu_add(pcp, 1)
+#define __this_cpu_dec(pcp) __this_cpu_sub(pcp, 1)
+#define __this_cpu_sub_return(pcp, val) __this_cpu_add_return(pcp, -(typeof(pcp))(val))
+#define __this_cpu_inc_return(pcp) __this_cpu_add_return(pcp, 1)
+#define __this_cpu_dec_return(pcp) __this_cpu_add_return(pcp, -1)
+
+#define this_cpu_read(pcp) ((pcp))
+#define this_cpu_write(pcp, val) ((pcp) = val)
+#define this_cpu_add(pcp, val) ((pcp) += val)
+#define this_cpu_and(pcp, val) ((pcp) &= val)
+#define this_cpu_or(pcp, val) ((pcp) |= val)
+#define this_cpu_add_return(pcp, val) ((pcp) += val)
+#define this_cpu_xchg(pcp, nval) \
+({ \
+ typeof(pcp) _r = (pcp); \
+ (pcp) = (nval); \
+ _r; \
+})
+
+#define this_cpu_cmpxchg(pcp, oval, nval) \
+ __pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
+#define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
+ __pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2)
+
+#define this_cpu_sub(pcp, val) this_cpu_add(pcp, -(typeof(pcp))(val))
+#define this_cpu_inc(pcp) this_cpu_add(pcp, 1)
+#define this_cpu_dec(pcp) this_cpu_sub(pcp, 1)
+#define this_cpu_sub_return(pcp, val) this_cpu_add_return(pcp, -(typeof(pcp))(val))
+#define this_cpu_inc_return(pcp) this_cpu_add_return(pcp, 1)
+#define this_cpu_dec_return(pcp) this_cpu_add_return(pcp, -1)
+
+#endif /* __TOOLS_LINUX_PERCPU_H */
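
Not part of the patch: because per_cpu_ptr() and the this_cpu_*() operations all collapse onto a single shared copy in this shim, a per-cpu counter degenerates to a plain variable. A hedged sketch; u64 and ENOMEM are assumed available from the other shim headers.

#include <linux/percpu.h>

static u64 __percpu *nr_ops;

static int counters_init(void)
{
	nr_ops = alloc_percpu(u64);	/* calloc(1, sizeof(u64)) here */
	return nr_ops ? 0 : -ENOMEM;
}

static void count_op(void)
{
	this_cpu_inc(*nr_ops);		/* plain increment in userspace */
}

static u64 counters_sum(void)
{
	/* Only one "CPU" copy exists in this shim. */
	return *per_cpu_ptr(nr_ops, 0);
}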
diff --git a/include/linux/poison.h b/include/linux/poison.h
new file mode 100644
index 0000000..51334ed
--- /dev/null
+++ b/include/linux/poison.h
@@ -0,0 +1,90 @@
+#ifndef _LINUX_POISON_H
+#define _LINUX_POISON_H
+
+/********** include/linux/list.h **********/
+
+/*
+ * Architectures might want to move the poison pointer offset
+ * into some well-recognized area such as 0xdead000000000000,
+ * that is also not mappable by user-space exploits:
+ */
+#ifdef CONFIG_ILLEGAL_POINTER_VALUE
+# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL)
+#else
+# define POISON_POINTER_DELTA 0
+#endif
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2 ((void *) 0x200 + POISON_POINTER_DELTA)
+
+/********** include/linux/timer.h **********/
+/*
+ * Magic number "tsta" to indicate a static timer initializer
+ * for the object debugging code.
+ */
+#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
+
+/********** mm/debug-pagealloc.c **********/
+#ifdef CONFIG_PAGE_POISONING_ZERO
+#define PAGE_POISON 0x00
+#else
+#define PAGE_POISON 0xaa
+#endif
+
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA)
+
+/********** mm/slab.c **********/
+/*
+ * Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */
+#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */
+
+#define SLUB_RED_INACTIVE 0xbb
+#define SLUB_RED_ACTIVE 0xcc
+
+/* ...and for poisoning */
+#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */
+#define POISON_FREE 0x6b /* for use-after-free poisoning */
+#define POISON_END 0xa5 /* end-byte of poisoning */
+
+/********** arch/$ARCH/mm/init.c **********/
+#define POISON_FREE_INITMEM 0xcc
+
+/********** arch/ia64/hp/common/sba_iommu.c **********/
+/*
+ * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
+ * value of "SBAIOMMU POISON\0" for spill-over poisoning.
+ */
+
+/********** fs/jbd/journal.c **********/
+#define JBD_POISON_FREE 0x5b
+#define JBD2_POISON_FREE 0x5c
+
+/********** drivers/base/dmapool.c **********/
+#define POOL_POISON_FREED 0xa7 /* !inuse */
+#define POOL_POISON_ALLOCATED 0xa9 /* !initted */
+
+/********** drivers/atm/ **********/
+#define ATM_POISON_FREE 0x12
+#define ATM_POISON 0xdeadbeef
+
+/********** kernel/mutexes **********/
+#define MUTEX_DEBUG_INIT 0x11
+#define MUTEX_DEBUG_FREE 0x22
+
+/********** lib/flex_array.c **********/
+#define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */
+
+/********** security/ **********/
+#define KEY_DESTROY 0xbd
+
+#endif
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
new file mode 100644
index 0000000..1d21bfe
--- /dev/null
+++ b/include/linux/posix_acl.h
@@ -0,0 +1,49 @@
+/*
+ File: linux/posix_acl.h
+
+ (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+
+#ifndef __LINUX_POSIX_ACL_H
+#define __LINUX_POSIX_ACL_H
+
+#include <linux/bug.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+
+#define ACL_UNDEFINED_ID (-1)
+
+/* a_type field in acl_user_posix_entry_t */
+#define ACL_TYPE_ACCESS (0x8000)
+#define ACL_TYPE_DEFAULT (0x4000)
+
+/* e_tag entry in struct posix_acl_entry */
+#define ACL_USER_OBJ (0x01)
+#define ACL_USER (0x02)
+#define ACL_GROUP_OBJ (0x04)
+#define ACL_GROUP (0x08)
+#define ACL_MASK (0x10)
+#define ACL_OTHER (0x20)
+
+/* permissions in the e_perm field */
+#define ACL_READ (0x04)
+#define ACL_WRITE (0x02)
+#define ACL_EXECUTE (0x01)
+
+struct posix_acl_entry {
+ short e_tag;
+ unsigned short e_perm;
+ union {
+ uid_t e_uid;
+ gid_t e_gid;
+ };
+};
+
+struct posix_acl {
+ struct rcu_head a_rcu;
+ unsigned int a_count;
+ struct posix_acl_entry a_entries[0];
+};
+
+#endif /* __LINUX_POSIX_ACL_H */
diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
new file mode 100644
index 0000000..65beeb1
--- /dev/null
+++ b/include/linux/posix_acl_xattr.h
@@ -0,0 +1,34 @@
+/*
+ File: linux/posix_acl_xattr.h
+
+ Extended attribute system call representation of Access Control Lists.
+
+ Copyright (C) 2000 by Andreas Gruenbacher <a.gruenbacher@computer.org>
+ Copyright (C) 2002 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
+ */
+#ifndef _POSIX_ACL_XATTR_H
+#define _POSIX_ACL_XATTR_H
+
+#include <uapi/linux/xattr.h>
+
+/* Supported ACL a_version fields */
+#define POSIX_ACL_XATTR_VERSION 0x0002
+
+/* An undefined entry e_id value */
+#define ACL_UNDEFINED_ID (-1)
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+} posix_acl_xattr_entry;
+
+typedef struct {
+ __le32 a_version;
+ posix_acl_xattr_entry a_entries[0];
+} posix_acl_xattr_header;
+
+extern const struct xattr_handler posix_acl_access_xattr_handler;
+extern const struct xattr_handler posix_acl_default_xattr_handler;
+
+#endif /* _POSIX_ACL_XATTR_H */
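
Not part of the patch: the xattr representation above is a fixed little-endian header followed by packed entries, so buffer sizes can be computed directly. A hedged sketch with hypothetical helper names (the misalignment check a full implementation would do is omitted):

static inline size_t my_acl_xattr_size(unsigned int count)
{
	return sizeof(posix_acl_xattr_header) +
		count * sizeof(posix_acl_xattr_entry);
}

static inline unsigned int my_acl_xattr_count(size_t size)
{
	if (size < sizeof(posix_acl_xattr_header))
		return 0;
	return (size - sizeof(posix_acl_xattr_header)) /
		sizeof(posix_acl_xattr_entry);
}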
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
new file mode 100644
index 0000000..0618601
--- /dev/null
+++ b/include/linux/preempt.h
@@ -0,0 +1,15 @@
+#ifndef __LINUX_PREEMPT_H
+#define __LINUX_PREEMPT_H
+
+#define preempt_disable() barrier()
+#define sched_preempt_enable_no_resched() barrier()
+#define preempt_enable_no_resched() barrier()
+#define preempt_enable() barrier()
+#define preempt_check_resched() do { } while (0)
+
+#define preempt_disable_notrace() barrier()
+#define preempt_enable_no_resched_notrace() barrier()
+#define preempt_enable_notrace() barrier()
+#define preemptible() 0
+
+#endif /* __LINUX_PREEMPT_H */
diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h
new file mode 100644
index 0000000..13cb826
--- /dev/null
+++ b/include/linux/prefetch.h
@@ -0,0 +1,7 @@
+#ifndef _LINUX_PREFETCH_H
+#define _LINUX_PREFETCH_H
+
+#define prefetch(p) \
+ ({ __maybe_unused typeof(p) __var = (p); })
+
+#endif /* _LINUX_PREFETCH_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
new file mode 100644
index 0000000..4e29af4
--- /dev/null
+++ b/include/linux/printk.h
@@ -0,0 +1,205 @@
+#ifndef __TOOLS_LINUX_PRINTK_H
+#define __TOOLS_LINUX_PRINTK_H
+
+#ifndef pr_fmt
+#define pr_fmt(fmt) fmt
+#endif
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#define KERN_EMERG ""
+#define KERN_ALERT ""
+#define KERN_CRIT ""
+#define KERN_ERR ""
+#define KERN_WARNING ""
+#define KERN_NOTICE ""
+#define KERN_INFO ""
+#define KERN_DEBUG ""
+#define KERN_DEFAULT ""
+#define KERN_CONT ""
+
+static inline int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
+{
+ int i = vsnprintf(buf, size, fmt, args);
+ ssize_t ssize = size;
+
+ return (i >= ssize) ? (ssize - 1) : i;
+}
+
+static inline int scnprintf(char * buf, size_t size, const char * fmt, ...)
+{
+ ssize_t ssize = size;
+ va_list args;
+ int i;
+
+ va_start(args, fmt);
+ i = vsnprintf(buf, size, fmt, args);
+ va_end(args);
+
+ return (i >= ssize) ? (ssize - 1) : i;
+}
+
+#define printk(...) printf(__VA_ARGS__)
+
+#define no_printk(fmt, ...) \
+({ \
+ do { \
+ if (0) \
+ printk(fmt, ##__VA_ARGS__); \
+ } while (0); \
+ 0; \
+})
+
+#define pr_emerg(fmt, ...) \
+ printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_alert(fmt, ...) \
+ printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_crit(fmt, ...) \
+ printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_err(fmt, ...) \
+ printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warning(fmt, ...) \
+ printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warn pr_warning
+#define pr_notice(fmt, ...) \
+ printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_info(fmt, ...) \
+ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
+/*
+ * Like KERN_CONT, pr_cont() should only be used when continuing
+ * a line with no newline ('\n') enclosed. Otherwise it defaults
+ * back to KERN_DEFAULT.
+ */
+#define pr_cont(fmt, ...) \
+ printk(KERN_CONT fmt, ##__VA_ARGS__)
+
+/* pr_devel() should produce zero code unless DEBUG is defined */
+#ifdef DEBUG
+#define pr_devel(fmt, ...) \
+ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#else
+#define pr_devel(fmt, ...) \
+ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#endif
+
+
+/* If you are writing a driver, please use dev_dbg instead */
+#if defined(CONFIG_DYNAMIC_DEBUG)
+#include <linux/dynamic_debug.h>
+
+/* dynamic_pr_debug() uses pr_fmt() internally so we don't need it here */
+#define pr_debug(fmt, ...) \
+ dynamic_pr_debug(fmt, ##__VA_ARGS__)
+#elif defined(DEBUG)
+#define pr_debug(fmt, ...) \
+ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#else
+#define pr_debug(fmt, ...) \
+ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#endif
+
+/*
+ * Print a one-time message (analogous to WARN_ONCE() et al):
+ */
+
+#define printk_once(fmt, ...) \
+({ \
+ static bool __print_once __read_mostly; \
+ bool __ret_print_once = !__print_once; \
+ \
+ if (!__print_once) { \
+ __print_once = true; \
+ printk(fmt, ##__VA_ARGS__); \
+ } \
+ unlikely(__ret_print_once); \
+})
+#define printk_deferred_once(fmt, ...) \
+({ \
+ static bool __print_once __read_mostly; \
+ bool __ret_print_once = !__print_once; \
+ \
+ if (!__print_once) { \
+ __print_once = true; \
+ printk_deferred(fmt, ##__VA_ARGS__); \
+ } \
+ unlikely(__ret_print_once); \
+})
+
+#define pr_emerg_once(fmt, ...) \
+ printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_alert_once(fmt, ...) \
+ printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_crit_once(fmt, ...) \
+ printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_err_once(fmt, ...) \
+ printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warn_once(fmt, ...) \
+ printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_notice_once(fmt, ...) \
+ printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_info_once(fmt, ...) \
+ printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_cont_once(fmt, ...) \
+ printk_once(KERN_CONT pr_fmt(fmt), ##__VA_ARGS__)
+
+#if defined(DEBUG)
+#define pr_devel_once(fmt, ...) \
+ printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#else
+#define pr_devel_once(fmt, ...) \
+ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#endif
+
+/* If you are writing a driver, please use dev_dbg instead */
+#if defined(DEBUG)
+#define pr_debug_once(fmt, ...) \
+ printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#else
+#define pr_debug_once(fmt, ...) \
+ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#endif
+
+/*
+ * ratelimited messages with local ratelimit_state,
+ * no local ratelimit_state used in the !PRINTK case
+ */
+#ifdef CONFIG_PRINTK
+#define printk_ratelimited(fmt, ...) \
+({ \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ \
+ if (__ratelimit(&_rs)) \
+ printk(fmt, ##__VA_ARGS__); \
+})
+#else
+#define printk_ratelimited(fmt, ...) \
+ no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+#define pr_emerg_ratelimited(fmt, ...) \
+ printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_alert_ratelimited(fmt, ...) \
+ printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_crit_ratelimited(fmt, ...) \
+ printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_err_ratelimited(fmt, ...) \
+ printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warn_ratelimited(fmt, ...) \
+ printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_notice_ratelimited(fmt, ...) \
+ printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_info_ratelimited(fmt, ...) \
+ printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
+/* no pr_cont_ratelimited, don't do that... */
+
+#if defined(DEBUG)
+#define pr_devel_ratelimited(fmt, ...) \
+ printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#else
+#define pr_devel_ratelimited(fmt, ...) \
+ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#endif
+#endif /* __TOOLS_LINUX_PRINTK_H */
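
Not part of the patch: unlike snprintf(), the scnprintf() above returns the number of bytes actually written (at most size - 1), which makes incremental buffer filling safe. A hedged sketch; fill_status() is hypothetical and len must be non-zero.

static void fill_status(char *buf, size_t len, int nr_errors)
{
	size_t pos = 0;

	pos += scnprintf(buf + pos, len - pos, "status: ");
	pos += scnprintf(buf + pos, len - pos, "%d errors", nr_errors);

	pr_info("%s\n", buf);		/* plain printf() in this shim */
}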
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
new file mode 100644
index 0000000..ae5cc6d
--- /dev/null
+++ b/include/linux/radix-tree.h
@@ -0,0 +1,14 @@
+#ifndef _LINUX_RADIX_TREE_H
+#define _LINUX_RADIX_TREE_H
+
+struct radix_tree_root {
+};
+
+#define INIT_RADIX_TREE(root, mask) do {} while (0)
+
+static inline void *radix_tree_lookup(struct radix_tree_root *r, unsigned long i)
+{
+ return NULL;
+}
+
+#endif /* _LINUX_RADIX_TREE_H */
diff --git a/include/linux/random.h b/include/linux/random.h
new file mode 100644
index 0000000..bd3dc61
--- /dev/null
+++ b/include/linux/random.h
@@ -0,0 +1,31 @@
+/*
+ * include/linux/random.h
+ *
+ * Include file for the random number generator.
+ */
+#ifndef _LINUX_RANDOM_H
+#define _LINUX_RANDOM_H
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/bug.h>
+
+static inline int getrandom(void *buf, size_t buflen, unsigned int flags)
+{
+ return syscall(SYS_getrandom, buf, buflen, flags);
+}
+
+static inline void get_random_bytes(void *buf, int nbytes)
+{
+ BUG_ON(getrandom(buf, nbytes, 0) != nbytes);
+}
+
+static inline int get_random_int(void)
+{
+ int v;
+
+ get_random_bytes(&v, sizeof(v));
+ return v;
+}
+
+#endif /* _LINUX_RANDOM_H */
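
Not part of the patch: a tiny usage sketch of the wrappers above, generating a random 64-bit nonce (u64 is assumed from the other shim headers; get_random_bytes() aborts via BUG_ON() if the getrandom syscall comes up short).

static u64 random_nonce(void)
{
	u64 v;

	get_random_bytes(&v, sizeof(v));
	return v;
}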
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
new file mode 100644
index 0000000..680181d
--- /dev/null
+++ b/include/linux/ratelimit.h
@@ -0,0 +1,109 @@
+#ifndef _LINUX_RATELIMIT_H
+#define _LINUX_RATELIMIT_H
+
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+#define DEFAULT_RATELIMIT_INTERVAL (5 * HZ)
+#define DEFAULT_RATELIMIT_BURST 10
+
+/* issue the number of suppressed messages on exit */
+#define RATELIMIT_MSG_ON_RELEASE 1
+
+struct ratelimit_state {
+ raw_spinlock_t lock; /* protect the state */
+
+ int interval;
+ int burst;
+ int printed;
+ int missed;
+ unsigned long begin;
+ unsigned long flags;
+};
+
+#define RATELIMIT_STATE_INIT(name, interval_init, burst_init) { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
+ .interval = interval_init, \
+ .burst = burst_init, \
+ }
+
+#define RATELIMIT_STATE_INIT_DISABLED \
+ RATELIMIT_STATE_INIT(ratelimit_state, 0, DEFAULT_RATELIMIT_BURST)
+
+#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \
+ \
+ struct ratelimit_state name = \
+ RATELIMIT_STATE_INIT(name, interval_init, burst_init) \
+
+static inline void ratelimit_state_init(struct ratelimit_state *rs,
+ int interval, int burst)
+{
+ memset(rs, 0, sizeof(*rs));
+
+ raw_spin_lock_init(&rs->lock);
+ rs->interval = interval;
+ rs->burst = burst;
+}
+
+static inline void ratelimit_default_init(struct ratelimit_state *rs)
+{
+ return ratelimit_state_init(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+}
+
+static inline void ratelimit_state_exit(struct ratelimit_state *rs)
+{
+ if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE))
+ return;
+
+ if (rs->missed) {
+ pr_warn("%s: %d output lines suppressed due to ratelimiting\n",
+ current->comm, rs->missed);
+ rs->missed = 0;
+ }
+}
+
+static inline void
+ratelimit_set_flags(struct ratelimit_state *rs, unsigned long flags)
+{
+ rs->flags = flags;
+}
+
+extern struct ratelimit_state printk_ratelimit_state;
+
+extern int ___ratelimit(struct ratelimit_state *rs, const char *func);
+#define __ratelimit(state) ___ratelimit(state, __func__)
+
+#ifdef CONFIG_PRINTK
+
+#define WARN_ON_RATELIMIT(condition, state) \
+ WARN_ON((condition) && __ratelimit(state))
+
+#define WARN_RATELIMIT(condition, format, ...) \
+({ \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ int rtn = !!(condition); \
+ \
+ if (unlikely(rtn && __ratelimit(&_rs))) \
+ WARN(rtn, format, ##__VA_ARGS__); \
+ \
+ rtn; \
+})
+
+#else
+
+#define WARN_ON_RATELIMIT(condition, state) \
+ WARN_ON(condition)
+
+#define WARN_RATELIMIT(condition, format, ...) \
+({ \
+ int rtn = WARN(condition, format, ##__VA_ARGS__); \
+ rtn; \
+})
+
+#endif
+
+#endif /* _LINUX_RATELIMIT_H */
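
Not part of the patch: the usual pattern is one static ratelimit_state per warning site, shared by all callers; ___ratelimit() itself is assumed to be implemented elsewhere in the tree. report_io_error() is hypothetical.

static void report_io_error(int err)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (__ratelimit(&rs))
		pr_err("I/O error %d\n", err);
}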
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
new file mode 100644
index 0000000..68ba8ce
--- /dev/null
+++ b/include/linux/rbtree.h
@@ -0,0 +1,127 @@
+/*
+ Red Black Trees
+ (C) 1999 Andrea Arcangeli <andrea@suse.de>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ linux/include/linux/rbtree.h
+
+ To use rbtrees you'll have to implement your own insert and search cores.
+ This avoids the use of callbacks, which would hurt performance dramatically.
+ It's not the cleanest approach, but it's what it takes to get both
+ performance and genericity in C (as opposed to C++)...
+
+ See Documentation/rbtree.txt for documentation and samples.
+*/
+
+#ifndef _LINUX_RBTREE_H
+#define _LINUX_RBTREE_H
+
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+
+struct rb_node {
+ unsigned long __rb_parent_color;
+ struct rb_node *rb_right;
+ struct rb_node *rb_left;
+} __attribute__((aligned(sizeof(long))));
+ /* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct rb_root {
+ struct rb_node *rb_node;
+};
+
+
+#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
+
+#define RB_ROOT (struct rb_root) { NULL, }
+#define rb_entry(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
+
+/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
+#define RB_EMPTY_NODE(node) \
+ ((node)->__rb_parent_color == (unsigned long)(node))
+#define RB_CLEAR_NODE(node) \
+ ((node)->__rb_parent_color = (unsigned long)(node))
+
+
+extern void rb_insert_color(struct rb_node *, struct rb_root *);
+extern void rb_erase(struct rb_node *, struct rb_root *);
+
+
+/* Find logical next and previous nodes in a tree */
+extern struct rb_node *rb_next(const struct rb_node *);
+extern struct rb_node *rb_prev(const struct rb_node *);
+extern struct rb_node *rb_first(const struct rb_root *);
+extern struct rb_node *rb_last(const struct rb_root *);
+
+/* Postorder iteration - always visit the parent after its children */
+extern struct rb_node *rb_first_postorder(const struct rb_root *);
+extern struct rb_node *rb_next_postorder(const struct rb_node *);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root);
+extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root);
+
+static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
+ struct rb_node **rb_link)
+{
+ node->__rb_parent_color = (unsigned long)parent;
+ node->rb_left = node->rb_right = NULL;
+
+ *rb_link = node;
+}
+
+static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
+ struct rb_node **rb_link)
+{
+ node->__rb_parent_color = (unsigned long)parent;
+ node->rb_left = node->rb_right = NULL;
+
+ rcu_assign_pointer(*rb_link, node);
+}
+
+#define rb_entry_safe(ptr, type, member) \
+ ({ typeof(ptr) ____ptr = (ptr); \
+ ____ptr ? rb_entry(____ptr, type, member) : NULL; \
+ })
+
+/**
+ * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
+ * given type allowing the backing memory of @pos to be invalidated
+ *
+ * @pos: the 'type *' to use as a loop cursor.
+ * @n: another 'type *' to use as temporary storage
+ * @root: 'rb_root *' of the rbtree.
+ * @field: the name of the rb_node field within 'type'.
+ *
+ * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
+ * list_for_each_entry_safe() and allows the iteration to continue independent
+ * of changes to @pos by the body of the loop.
+ *
+ * Note, however, that it cannot handle other modifications that re-order the
+ * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
+ * rb_erase() may rebalance the tree, causing us to miss some nodes.
+ */
+#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
+ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
+ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
+ typeof(*pos), field); 1; }); \
+ pos = n)
+
+#endif /* _LINUX_RBTREE_H */
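
Not part of the patch: since the header leaves the insert and search cores to the caller, here is a minimal hedged sketch keyed by an integer; struct thing is hypothetical and u64 is assumed from the other shim headers.

struct thing {
	struct rb_node	node;
	u64		key;
};

static struct thing *thing_search(struct rb_root *root, u64 key)
{
	struct rb_node *n = root->rb_node;

	while (n) {
		struct thing *t = rb_entry(n, struct thing, node);

		if (key < t->key)
			n = n->rb_left;
		else if (key > t->key)
			n = n->rb_right;
		else
			return t;
	}
	return NULL;
}

static void thing_insert(struct rb_root *root, struct thing *new)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;

	while (*p) {
		struct thing *t = rb_entry(*p, struct thing, node);

		parent = *p;
		p = new->key < t->key ? &(*p)->rb_left : &(*p)->rb_right;
	}
	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, root);
}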
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
new file mode 100644
index 0000000..d076183
--- /dev/null
+++ b/include/linux/rbtree_augmented.h
@@ -0,0 +1,262 @@
+/*
+ Red Black Trees
+ (C) 1999 Andrea Arcangeli <andrea@suse.de>
+ (C) 2002 David Woodhouse <dwmw2@infradead.org>
+ (C) 2012 Michel Lespinasse <walken@google.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ linux/include/linux/rbtree_augmented.h
+*/
+
+#ifndef _LINUX_RBTREE_AUGMENTED_H
+#define _LINUX_RBTREE_AUGMENTED_H
+
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+
+/*
+ * Please note - only struct rb_augment_callbacks and the prototypes for
+ * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
+ * The rest are implementation details you are not expected to depend on.
+ *
+ * See Documentation/rbtree.txt for documentation and samples.
+ */
+
+struct rb_augment_callbacks {
+ void (*propagate)(struct rb_node *node, struct rb_node *stop);
+ void (*copy)(struct rb_node *old, struct rb_node *new);
+ void (*rotate)(struct rb_node *old, struct rb_node *new);
+};
+
+extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+ void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+/*
+ * Fixup the rbtree and update the augmented information when rebalancing.
+ *
+ * On insertion, the user must update the augmented information on the path
+ * leading to the inserted node, then call rb_link_node() as usual and
+ * rb_augment_inserted() instead of the usual rb_insert_color() call.
+ * If rb_augment_inserted() rebalances the rbtree, it will callback into
+ * a user provided function to update the augmented information on the
+ * affected subtrees.
+ */
+static inline void
+rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+ const struct rb_augment_callbacks *augment)
+{
+ __rb_insert_augmented(node, root, augment->rotate);
+}
+
+#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \
+ rbtype, rbaugmented, rbcompute) \
+static inline void \
+rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \
+{ \
+ while (rb != stop) { \
+ rbstruct *node = rb_entry(rb, rbstruct, rbfield); \
+ rbtype augmented = rbcompute(node); \
+ if (node->rbaugmented == augmented) \
+ break; \
+ node->rbaugmented = augmented; \
+ rb = rb_parent(&node->rbfield); \
+ } \
+} \
+static inline void \
+rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
+{ \
+ rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
+ rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
+ new->rbaugmented = old->rbaugmented; \
+} \
+static void \
+rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
+{ \
+ rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
+ rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
+ new->rbaugmented = old->rbaugmented; \
+ old->rbaugmented = rbcompute(old); \
+} \
+rbstatic const struct rb_augment_callbacks rbname = { \
+ rbname ## _propagate, rbname ## _copy, rbname ## _rotate \
+};
+
+
+#define RB_RED 0
+#define RB_BLACK 1
+
+#define __rb_parent(pc) ((struct rb_node *)(pc & ~3))
+
+#define __rb_color(pc) ((pc) & 1)
+#define __rb_is_black(pc) __rb_color(pc)
+#define __rb_is_red(pc) (!__rb_color(pc))
+#define rb_color(rb) __rb_color((rb)->__rb_parent_color)
+#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color)
+#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color)
+
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+ rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
+}
+
+static inline void rb_set_parent_color(struct rb_node *rb,
+ struct rb_node *p, int color)
+{
+ rb->__rb_parent_color = (unsigned long)p | color;
+}
+
+static inline void
+__rb_change_child(struct rb_node *old, struct rb_node *new,
+ struct rb_node *parent, struct rb_root *root)
+{
+ if (parent) {
+ if (parent->rb_left == old)
+ WRITE_ONCE(parent->rb_left, new);
+ else
+ WRITE_ONCE(parent->rb_right, new);
+ } else
+ WRITE_ONCE(root->rb_node, new);
+}
+
+static inline void
+__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
+ struct rb_node *parent, struct rb_root *root)
+{
+ if (parent) {
+ if (parent->rb_left == old)
+ rcu_assign_pointer(parent->rb_left, new);
+ else
+ rcu_assign_pointer(parent->rb_right, new);
+ } else
+ rcu_assign_pointer(root->rb_node, new);
+}
+
+extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
+ void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+
+static __always_inline struct rb_node *
+__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
+ const struct rb_augment_callbacks *augment)
+{
+ struct rb_node *child = node->rb_right;
+ struct rb_node *tmp = node->rb_left;
+ struct rb_node *parent, *rebalance;
+ unsigned long pc;
+
+ if (!tmp) {
+ /*
+ * Case 1: node to erase has no more than 1 child (easy!)
+ *
+ * Note that if there is one child it must be red due to 5)
+ * and node must be black due to 4). We adjust colors locally
+ * so as to bypass __rb_erase_color() later on.
+ */
+ pc = node->__rb_parent_color;
+ parent = __rb_parent(pc);
+ __rb_change_child(node, child, parent, root);
+ if (child) {
+ child->__rb_parent_color = pc;
+ rebalance = NULL;
+ } else
+ rebalance = __rb_is_black(pc) ? parent : NULL;
+ tmp = parent;
+ } else if (!child) {
+ /* Still case 1, but this time the child is node->rb_left */
+ tmp->__rb_parent_color = pc = node->__rb_parent_color;
+ parent = __rb_parent(pc);
+ __rb_change_child(node, tmp, parent, root);
+ rebalance = NULL;
+ tmp = parent;
+ } else {
+ struct rb_node *successor = child, *child2;
+
+ tmp = child->rb_left;
+ if (!tmp) {
+ /*
+ * Case 2: node's successor is its right child
+ *
+ * (n) (s)
+ * / \ / \
+ * (x) (s) -> (x) (c)
+ * \
+ * (c)
+ */
+ parent = successor;
+ child2 = successor->rb_right;
+
+ augment->copy(node, successor);
+ } else {
+ /*
+ * Case 3: node's successor is leftmost under
+ * node's right child subtree
+ *
+ * (n) (s)
+ * / \ / \
+ * (x) (y) -> (x) (y)
+ * / /
+ * (p) (p)
+ * / /
+ * (s) (c)
+ * \
+ * (c)
+ */
+ do {
+ parent = successor;
+ successor = tmp;
+ tmp = tmp->rb_left;
+ } while (tmp);
+ child2 = successor->rb_right;
+ WRITE_ONCE(parent->rb_left, child2);
+ WRITE_ONCE(successor->rb_right, child);
+ rb_set_parent(child, successor);
+
+ augment->copy(node, successor);
+ augment->propagate(parent, successor);
+ }
+
+ tmp = node->rb_left;
+ WRITE_ONCE(successor->rb_left, tmp);
+ rb_set_parent(tmp, successor);
+
+ pc = node->__rb_parent_color;
+ tmp = __rb_parent(pc);
+ __rb_change_child(node, successor, tmp, root);
+
+ if (child2) {
+ successor->__rb_parent_color = pc;
+ rb_set_parent_color(child2, parent, RB_BLACK);
+ rebalance = NULL;
+ } else {
+ unsigned long pc2 = successor->__rb_parent_color;
+ successor->__rb_parent_color = pc;
+ rebalance = __rb_is_black(pc2) ? parent : NULL;
+ }
+ tmp = successor;
+ }
+
+ augment->propagate(tmp, NULL);
+ return rebalance;
+}
+
+static __always_inline void
+rb_erase_augmented(struct rb_node *node, struct rb_root *root,
+ const struct rb_augment_callbacks *augment)
+{
+ struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
+ if (rebalance)
+ __rb_erase_color(rebalance, root, augment->rotate);
+}
+
+#endif /* _LINUX_RBTREE_AUGMENTED_H */
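
Not part of the patch: a hedged sketch of RB_DECLARE_CALLBACKS(), augmenting each node with the maximum value in its subtree. struct anode and anode_compute_max() are hypothetical; insertion updates the augmented data on the path, links the node, then calls rb_insert_augmented(&a->rb, root, &anode_cb), and removal goes through rb_erase_augmented().

struct anode {
	struct rb_node	rb;
	u64		val;
	u64		max_in_subtree;		/* augmented data */
};

static u64 anode_compute_max(struct anode *a)
{
	u64 max = a->val;

	if (a->rb.rb_left) {
		struct anode *l = rb_entry(a->rb.rb_left, struct anode, rb);

		if (l->max_in_subtree > max)
			max = l->max_in_subtree;
	}
	if (a->rb.rb_right) {
		struct anode *r = rb_entry(a->rb.rb_right, struct anode, rb);

		if (r->max_in_subtree > max)
			max = r->max_in_subtree;
	}
	return max;
}

RB_DECLARE_CALLBACKS(static, anode_cb, struct anode, rb,
		     u64, max_in_subtree, anode_compute_max);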
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
new file mode 100644
index 0000000..8beb98d
--- /dev/null
+++ b/include/linux/rculist.h
@@ -0,0 +1,675 @@
+#ifndef _LINUX_RCULIST_H
+#define _LINUX_RCULIST_H
+
+#ifdef __KERNEL__
+
+/*
+ * RCU-protected list version
+ */
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+
+/*
+ * Why is there no list_empty_rcu()? Because list_empty() serves this
+ * purpose. The list_empty() function fetches the RCU-protected pointer
+ * and compares it to the address of the list head, but neither dereferences
+ * this pointer itself nor provides this pointer to the caller. Therefore,
+ * it is not necessary to use rcu_dereference(), so that list_empty() can
+ * be used anywhere you would want to use a list_empty_rcu().
+ */
+
+/*
+ * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers
+ * @list: list to be initialized
+ *
+ * You should instead use INIT_LIST_HEAD() for normal initialization and
+ * cleanup tasks, when readers have no access to the list being initialized.
+ * However, if the list being initialized is visible to readers, you
+ * need to keep the compiler from being too mischievous.
+ */
+static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
+{
+ WRITE_ONCE(list->next, list);
+ WRITE_ONCE(list->prev, list);
+}
+
+/*
+ * return the ->next pointer of a list_head in an rcu safe
+ * way, we must not access it directly
+ */
+#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next)))
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+#ifndef CONFIG_DEBUG_LIST
+static inline void __list_add_rcu(struct list_head *new,
+ struct list_head *prev, struct list_head *next)
+{
+ new->next = next;
+ new->prev = prev;
+ rcu_assign_pointer(list_next_rcu(prev), new);
+ next->prev = new;
+}
+#else
+void __list_add_rcu(struct list_head *new,
+ struct list_head *prev, struct list_head *next);
+#endif
+
+/**
+ * list_add_rcu - add a new entry to rcu-protected list
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as list_add_rcu()
+ * or list_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
+ */
+static inline void list_add_rcu(struct list_head *new, struct list_head *head)
+{
+ __list_add_rcu(new, head, head->next);
+}
+
+/**
+ * list_add_tail_rcu - add a new entry to rcu-protected list
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as list_add_tail_rcu()
+ * or list_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
+ */
+static inline void list_add_tail_rcu(struct list_head *new,
+ struct list_head *head)
+{
+ __list_add_rcu(new, head->prev, head);
+}
+
+/**
+ * list_del_rcu - deletes entry from list without re-initialization
+ * @entry: the element to delete from the list.
+ *
+ * Note: list_empty() on entry does not return true after this,
+ * the entry is in an undefined state. It is useful for RCU based
+ * lockfree traversal.
+ *
+ * In particular, it means that we can not poison the forward
+ * pointers that may still be used for walking the list.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as list_del_rcu()
+ * or list_add_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
+ *
+ * Note that the caller is not permitted to immediately free
+ * the newly deleted entry. Instead, either synchronize_rcu()
+ * or call_rcu() must be used to defer freeing until an RCU
+ * grace period has elapsed.
+ */
+static inline void list_del_rcu(struct list_head *entry)
+{
+ __list_del_entry(entry);
+ entry->prev = LIST_POISON2;
+}
+
+/**
+ * hlist_del_init_rcu - deletes entry from hash list with re-initialization
+ * @n: the element to delete from the hash list.
+ *
+ * Note: list_unhashed() on the node returns true after this. It is
+ * useful for RCU based read lockfree traversal if the writer side
+ * must know if the list entry is still hashed or already unhashed.
+ *
+ * In particular, it means that we can not poison the forward pointers
+ * that may still be used for walking the hash list and we can only
+ * zero the pprev pointer so list_unhashed() will return true after
+ * this.
+ *
+ * The caller must take whatever precautions are necessary (such as
+ * holding appropriate locks) to avoid racing with another
+ * list-mutation primitive, such as hlist_add_head_rcu() or
+ * hlist_del_rcu(), running on this same list. However, it is
+ * perfectly legal to run concurrently with the _rcu list-traversal
+ * primitives, such as hlist_for_each_entry_rcu().
+ */
+static inline void hlist_del_init_rcu(struct hlist_node *n)
+{
+ if (!hlist_unhashed(n)) {
+ __hlist_del(n);
+ n->pprev = NULL;
+ }
+}
+
+/**
+ * list_replace_rcu - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ *
+ * The @old entry will be replaced with the @new entry atomically.
+ * Note: @old should not be empty.
+ */
+static inline void list_replace_rcu(struct list_head *old,
+ struct list_head *new)
+{
+ new->next = old->next;
+ new->prev = old->prev;
+ rcu_assign_pointer(list_next_rcu(new->prev), new);
+ new->next->prev = new;
+ old->prev = LIST_POISON2;
+}
+
+/**
+ * __list_splice_init_rcu - join an RCU-protected list into an existing list.
+ * @list: the RCU-protected list to splice
+ * @prev: points to the last element of the existing list
+ * @next: points to the first element of the existing list
+ * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ...
+ *
+ * The list pointed to by @prev and @next can be RCU-read traversed
+ * concurrently with this function.
+ *
+ * Note that this function blocks.
+ *
+ * Important note: the caller must take whatever action is necessary to prevent
+ * any other updates to the existing list. In principle, it is possible to
+ * modify the list as soon as sync() begins execution. If this sort of thing
+ * becomes necessary, an alternative version based on call_rcu() could be
+ * created. But only if -really- needed -- there is no shortage of RCU API
+ * members.
+ */
+static inline void __list_splice_init_rcu(struct list_head *list,
+ struct list_head *prev,
+ struct list_head *next,
+ void (*sync)(void))
+{
+ struct list_head *first = list->next;
+ struct list_head *last = list->prev;
+
+ /*
+ * "first" and "last" tracking list, so initialize it. RCU readers
+ * have access to this list, so we must use INIT_LIST_HEAD_RCU()
+ * instead of INIT_LIST_HEAD().
+ */
+
+ INIT_LIST_HEAD_RCU(list);
+
+ /*
+ * At this point, the list body still points to the source list.
+ * Wait for any readers to finish using the list before splicing
+ * the list body into the new list. Any new readers will see
+ * an empty list.
+ */
+
+ sync();
+
+ /*
+ * Readers are finished with the source list, so perform splice.
+ * The order is important if the new list is global and accessible
+ * to concurrent RCU readers. Note that RCU readers are not
+ * permitted to traverse the prev pointers without excluding
+ * this function.
+ */
+
+ last->next = next;
+ rcu_assign_pointer(list_next_rcu(prev), first);
+ first->prev = prev;
+ next->prev = last;
+}
+
+/**
+ * list_splice_init_rcu - splice an RCU-protected list into an existing list,
+ * designed for stacks.
+ * @list: the RCU-protected list to splice
+ * @head: the place in the existing list to splice the first list into
+ * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ...
+ */
+static inline void list_splice_init_rcu(struct list_head *list,
+ struct list_head *head,
+ void (*sync)(void))
+{
+ if (!list_empty(list))
+ __list_splice_init_rcu(list, head, head->next, sync);
+}
+
+/**
+ * list_splice_tail_init_rcu - splice an RCU-protected list into an existing
+ * list, designed for queues.
+ * @list: the RCU-protected list to splice
+ * @head: the place in the existing list to splice the first list into
+ * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ...
+ */
+static inline void list_splice_tail_init_rcu(struct list_head *list,
+ struct list_head *head,
+ void (*sync)(void))
+{
+ if (!list_empty(list))
+ __list_splice_init_rcu(list, head->prev, head, sync);
+}
+
+/**
+ * list_entry_rcu - get the struct for this entry
+ * @ptr: the &struct list_head pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_head within the struct.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_entry_rcu(ptr, type, member) \
+ container_of(lockless_dereference(ptr), type, member)
+
+/**
+ * Where are list_empty_rcu() and list_first_entry_rcu()?
+ *
+ * Implementing those functions following their counterparts list_empty() and
+ * list_first_entry() is not advisable because they lead to subtle race
+ * conditions as the following snippet shows:
+ *
+ * if (!list_empty_rcu(mylist)) {
+ * struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member);
+ * do_something(bar);
+ * }
+ *
+ * The list may not be empty when list_empty_rcu checks it, but it may be when
+ * list_first_entry_rcu rereads the ->next pointer.
+ *
+ * Rereading the ->next pointer is not a problem for list_empty() and
+ * list_first_entry() because they would be protected by a lock that blocks
+ * writers.
+ *
+ * See list_first_or_null_rcu for an alternative.
+ */
+
+/**
+ * list_first_or_null_rcu - get the first element from a list
+ * @ptr: the list head to take the element from.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_head within the struct.
+ *
+ * Note that if the list is empty, it returns NULL.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_first_or_null_rcu(ptr, type, member) \
+({ \
+ struct list_head *__ptr = (ptr); \
+ struct list_head *__next = READ_ONCE(__ptr->next); \
+ likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
+})
+
+/**
+ * list_next_or_null_rcu - get the next element from a list
+ * @head: the head for the list.
+ * @ptr: the list head to take the next element from.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_head within the struct.
+ *
+ * Note that if the ptr is at the end of the list, NULL is returned.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_next_or_null_rcu(head, ptr, type, member) \
+({ \
+ struct list_head *__head = (head); \
+ struct list_head *__ptr = (ptr); \
+ struct list_head *__next = READ_ONCE(__ptr->next); \
+ likely(__next != __head) ? list_entry_rcu(__next, type, \
+ member) : NULL; \
+})
+
+/**
+ * list_for_each_entry_rcu - iterate over rcu list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as list_add_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+#define list_for_each_entry_rcu(pos, head, member) \
+ for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
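+
+/*
+ * Illustrative sketch (not part of the original header): a reader
+ * walking an RCU-protected list with the iterator above ("struct foo",
+ * "mylist" and "list_member" are the same hypothetical names used
+ * earlier in this file).
+ *
+ *	struct foo *pos;
+ *
+ *	rcu_read_lock();
+ *	list_for_each_entry_rcu(pos, mylist, list_member)
+ *		do_something(pos);
+ *	rcu_read_unlock();
+ */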
+
+/**
+ * list_entry_lockless - get the struct for this entry
+ * @ptr: the &struct list_head pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_head within the struct.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu(), but requires some implicit RCU
+ * read-side guarding. One example is running within a special
+ * exception-time environment where preemption is disabled and where
+ * lockdep cannot be invoked (in which case updaters must use RCU-sched,
+ * as in synchronize_sched(), call_rcu_sched(), and friends). Another
+ * example is when items are added to the list, but never deleted.
+ */
+#define list_entry_lockless(ptr, type, member) \
+ container_of((typeof(ptr))lockless_dereference(ptr), type, member)
+
+/**
+ * list_for_each_entry_lockless - iterate over rcu list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu(), but requires some implicit RCU
+ * read-side guarding. One example is running within a special
+ * exception-time environment where preemption is disabled and where
+ * lockdep cannot be invoked (in which case updaters must use RCU-sched,
+ * as in synchronize_sched(), call_rcu_sched(), and friends). Another
+ * example is when items are added to the list, but never deleted.
+ */
+#define list_for_each_entry_lockless(pos, head, member) \
+ for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry_lockless(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_continue_rcu - continue iteration over list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue_rcu(pos, head, member) \
+ for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
+
+/**
+ * hlist_del_rcu - deletes entry from hash list without re-initialization
+ * @n: the element to delete from the hash list.
+ *
+ * Note: hlist_unhashed() on the entry does not return true after this;
+ * the entry is in an undefined state. It is useful for RCU-based
+ * lockfree traversal.
+ *
+ * In particular, it means that we can not poison the forward
+ * pointers that may still be used for walking the hash list.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_add_head_rcu()
+ * or hlist_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_for_each_entry_rcu().
+ */
+static inline void hlist_del_rcu(struct hlist_node *n)
+{
+ __hlist_del(n);
+ n->pprev = LIST_POISON2;
+}
+
+/**
+ * hlist_replace_rcu - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ *
+ * The @old entry will be replaced with the @new entry atomically.
+ */
+static inline void hlist_replace_rcu(struct hlist_node *old,
+ struct hlist_node *new)
+{
+ struct hlist_node *next = old->next;
+
+ new->next = next;
+ new->pprev = old->pprev;
+ rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
+ if (next)
+ new->next->pprev = &new->next;
+ old->pprev = LIST_POISON2;
+}
+
+/*
+ * return the first or the next element in an RCU-protected hlist
+ */
+#define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first)))
+#define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next)))
+#define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev)))
+
+/**
+ * hlist_add_head_rcu
+ * @n: the element to add to the hash list.
+ * @h: the list to add to.
+ *
+ * Description:
+ * Adds the specified element to the specified hlist,
+ * while permitting racing traversals.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_add_head_rcu()
+ * or hlist_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_for_each_entry_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs. Regardless of the type of CPU, the
+ * list-traversal primitive must be guarded by rcu_read_lock().
+ */
+static inline void hlist_add_head_rcu(struct hlist_node *n,
+ struct hlist_head *h)
+{
+ struct hlist_node *first = h->first;
+
+ n->next = first;
+ n->pprev = &h->first;
+ rcu_assign_pointer(hlist_first_rcu(h), n);
+ if (first)
+ first->pprev = &n->next;
+}
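+
+/*
+ * Illustrative sketch (not part of the original header): a writer
+ * publishing a new element into a hash bucket; "table", "bucket", "obj",
+ * "hash_node" and "table_lock" are hypothetical names, and the spinlock
+ * is whatever the caller already uses to serialize updates.
+ *
+ *	spin_lock(&table_lock);
+ *	hlist_add_head_rcu(&obj->hash_node, &table[bucket]);
+ *	spin_unlock(&table_lock);
+ */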
+
+/**
+ * hlist_add_tail_rcu
+ * @n: the element to add to the hash list.
+ * @h: the list to add to.
+ *
+ * Description:
+ * Adds the specified element to the specified hlist,
+ * while permitting racing traversals.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_add_head_rcu()
+ * or hlist_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_for_each_entry_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs. Regardless of the type of CPU, the
+ * list-traversal primitive must be guarded by rcu_read_lock().
+ */
+static inline void hlist_add_tail_rcu(struct hlist_node *n,
+ struct hlist_head *h)
+{
+ struct hlist_node *i, *last = NULL;
+
+ for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i))
+ last = i;
+
+ if (last) {
+ n->next = last->next;
+ n->pprev = &last->next;
+ rcu_assign_pointer(hlist_next_rcu(last), n);
+ } else {
+ hlist_add_head_rcu(n, h);
+ }
+}
+
+/**
+ * hlist_add_before_rcu
+ * @n: the new element to add to the hash list.
+ * @next: the existing element to add the new element before.
+ *
+ * Description:
+ * Adds the specified element to the specified hlist
+ * before the specified node while permitting racing traversals.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_add_head_rcu()
+ * or hlist_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_for_each_entry_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs.
+ */
+static inline void hlist_add_before_rcu(struct hlist_node *n,
+ struct hlist_node *next)
+{
+ n->pprev = next->pprev;
+ n->next = next;
+ rcu_assign_pointer(hlist_pprev_rcu(n), n);
+ next->pprev = &n->next;
+}
+
+/**
+ * hlist_add_behind_rcu
+ * @n: the new element to add to the hash list.
+ * @prev: the existing element to add the new element after.
+ *
+ * Description:
+ * Adds the specified element to the specified hlist
+ * after the specified node while permitting racing traversals.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_add_head_rcu()
+ * or hlist_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_for_each_entry_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs.
+ */
+static inline void hlist_add_behind_rcu(struct hlist_node *n,
+ struct hlist_node *prev)
+{
+ n->next = prev->next;
+ n->pprev = &prev->next;
+ rcu_assign_pointer(hlist_next_rcu(prev), n);
+ if (n->next)
+ n->next->pprev = &n->next;
+}
+
+#define __hlist_for_each_rcu(pos, head) \
+ for (pos = rcu_dereference(hlist_first_rcu(head)); \
+ pos; \
+ pos = rcu_dereference(hlist_next_rcu(pos)))
+
+/**
+ * hlist_for_each_entry_rcu - iterate over rcu list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the hlist_node within the struct.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+#define hlist_for_each_entry_rcu(pos, head, member) \
+	for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
+ typeof(*(pos)), member); \
+ pos; \
+ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
+ &(pos)->member)), typeof(*(pos)), member))
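+
+/*
+ * Illustrative sketch (not part of the original header): an RCU reader
+ * searching a hash bucket with the iterator above; "struct foo",
+ * "table", "bucket", "hash_node" and "key" are hypothetical names.
+ *
+ *	struct foo *pos;
+ *
+ *	rcu_read_lock();
+ *	hlist_for_each_entry_rcu(pos, &table[bucket], hash_node)
+ *		if (pos->key == key)
+ *			do_something(pos);
+ *	rcu_read_unlock();
+ */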
+
+/**
+ * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing)
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the hlist_node within the struct.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ *
+ * This is the same as hlist_for_each_entry_rcu() except that it does
+ * not do any RCU debugging or tracing.
+ */
+#define hlist_for_each_entry_rcu_notrace(pos, head, member) \
+	for (pos = hlist_entry_safe(rcu_dereference_raw_notrace(hlist_first_rcu(head)),\
+ typeof(*(pos)), member); \
+ pos; \
+ pos = hlist_entry_safe(rcu_dereference_raw_notrace(hlist_next_rcu(\
+ &(pos)->member)), typeof(*(pos)), member))
+
+/**
+ * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the hlist_node within the struct.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+#define hlist_for_each_entry_rcu_bh(pos, head, member) \
+ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\
+ typeof(*(pos)), member); \
+ pos; \
+ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\
+ &(pos)->member)), typeof(*(pos)), member))
+
+/**
+ * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point
+ * @pos: the type * to use as a loop cursor.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue_rcu(pos, member) \
+ for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
+ &(pos)->member)), typeof(*(pos)), member); \
+ pos; \
+ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
+ &(pos)->member)), typeof(*(pos)), member))
+
+/**
+ * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
+ * @pos: the type * to use as a loop cursor.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue_rcu_bh(pos, member) \
+ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \
+ &(pos)->member)), typeof(*(pos)), member); \
+ pos; \
+ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \
+ &(pos)->member)), typeof(*(pos)), member))
+
+/**
+ * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
+ * @pos: the type * to use as a loop cursor.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from_rcu(pos, member) \
+ for (; pos; \
+ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
+ &(pos)->member)), typeof(*(pos)), member))
+
+#endif /* __KERNEL__ */
+#endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
new file mode 100644
index 0000000..c99d78a
--- /dev/null
+++ b/include/linux/rcupdate.h
@@ -0,0 +1,16 @@
+#ifndef __TOOLS_LINUX_RCUPDATE_H
+#define __TOOLS_LINUX_RCUPDATE_H
+
+#include <urcu.h>
+#include <linux/compiler.h>
+
+#define rcu_dereference_check(p, c) rcu_dereference(p)
+#define rcu_dereference_raw(p) rcu_dereference(p)
+#define rcu_dereference_protected(p, c) rcu_dereference(p)
+#define rcu_access_pointer(p) READ_ONCE(p)
+
+#define kfree_rcu(ptr, rcu_head) kfree(ptr) /* XXX */
+
+#define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v)
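+
+/*
+ * Illustrative note (not part of the original header): with this shim,
+ * kernel-style read-side code compiles unchanged against liburcu, e.g.
+ * (using the hypothetical names from the rculist examples earlier in
+ * this patch):
+ *
+ *	rcu_read_lock();
+ *	list_for_each_entry_rcu(pos, mylist, list_member)
+ *		do_something(pos);
+ *	rcu_read_unlock();
+ *
+ * rcu_read_lock(), rcu_read_unlock() and synchronize_rcu() come from
+ * <urcu.h>. Note that kfree_rcu() above is mapped straight to kfree(),
+ * so callers cannot rely on a grace period before the object is freed.
+ */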
+
+#endif /* __TOOLS_LINUX_RCUPDATE_H */
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
new file mode 100644
index 0000000..4c67a15
--- /dev/null
+++ b/include/linux/reboot.h
@@ -0,0 +1,74 @@
+#ifndef _LINUX_REBOOT_H
+#define _LINUX_REBOOT_H
+
+#include <linux/notifier.h>
+
+#define SYS_DOWN 0x0001 /* Notify of system down */
+#define SYS_RESTART SYS_DOWN
+#define SYS_HALT 0x0002 /* Notify of system halt */
+#define SYS_POWER_OFF 0x0003 /* Notify of system power off */
+
+enum reboot_mode {
+ REBOOT_COLD = 0,
+ REBOOT_WARM,
+ REBOOT_HARD,
+ REBOOT_SOFT,
+ REBOOT_GPIO,
+};
+extern enum reboot_mode reboot_mode;
+
+enum reboot_type {
+ BOOT_TRIPLE = 't',
+ BOOT_KBD = 'k',
+ BOOT_BIOS = 'b',
+ BOOT_ACPI = 'a',
+ BOOT_EFI = 'e',
+ BOOT_CF9_FORCE = 'p',
+ BOOT_CF9_SAFE = 'q',
+};
+extern enum reboot_type reboot_type;
+
+extern int reboot_default;
+extern int reboot_cpu;
+extern int reboot_force;
+
+
+static inline int register_reboot_notifier(struct notifier_block *n) { return 0; }
+static inline int unregister_reboot_notifier(struct notifier_block *n) { return 0; }
+
+extern int register_restart_handler(struct notifier_block *);
+extern int unregister_restart_handler(struct notifier_block *);
+extern void do_kernel_restart(char *cmd);
+
+/*
+ * Architecture-specific implementations of sys_reboot commands.
+ */
+
+extern void migrate_to_reboot_cpu(void);
+extern void machine_restart(char *cmd);
+extern void machine_halt(void);
+extern void machine_power_off(void);
+
+extern void machine_shutdown(void);
+struct pt_regs;
+extern void machine_crash_shutdown(struct pt_regs *);
+
+/*
+ * Architecture-independent implementations of sys_reboot commands.
+ */
+
+extern void kernel_restart_prepare(char *cmd);
+extern void kernel_restart(char *cmd);
+extern void kernel_halt(void);
+extern void kernel_power_off(void);
+
+extern int C_A_D; /* for sysctl */
+void ctrl_alt_del(void);
+
+#define POWEROFF_CMD_PATH_LEN 256
+extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
+
+extern void orderly_poweroff(bool force);
+extern void orderly_reboot(void);
+
+#endif /* _LINUX_REBOOT_H */
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
new file mode 100644
index 0000000..e5b35ed
--- /dev/null
+++ b/include/linux/rhashtable.h
@@ -0,0 +1,912 @@
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * Code partially derived from nft_hash
+ * Rewritten with rehash code from br_multicast plus single list
+ * pointer as suggested by Josh Triplett
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _LINUX_RHASHTABLE_H
+#define _LINUX_RHASHTABLE_H
+
+#include <linux/atomic.h>
+#include <linux/cache.h>
+#include <linux/compiler.h>
+#include <linux/cpumask.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/jhash.h>
+#include <linux/list_nulls.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+
+/*
+ * The end of the chain is marked with a special nulls marks which has
+ * the following format:
+ *
+ * +-------+-----------------------------------------------------+-+
+ * | Base | Hash |1|
+ * +-------+-----------------------------------------------------+-+
+ *
+ * Base (4 bits) : Reserved to distinguish between multiple tables.
+ * Specified via &struct rhashtable_params.nulls_base.
+ * Hash (27 bits): Full hash (unmasked) of first element added to bucket
+ * 1 (1 bit) : Nulls marker (always set)
+ *
+ * The remaining bits of the next pointer remain unused for now.
+ */
+#define RHT_BASE_BITS 4
+#define RHT_HASH_BITS 27
+#define RHT_BASE_SHIFT RHT_HASH_BITS
+
+/* Base bits plus 1 bit for nulls marker */
+#define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1)
+
+struct rhash_head {
+ struct rhash_head __rcu *next;
+};
+
+/**
+ * struct bucket_table - Table of hash buckets
+ * @size: Number of hash buckets
+ * @rehash: Current bucket being rehashed
+ * @hash_rnd: Random seed to fold into hash
+ * @locks_mask: Mask to apply before accessing locks[]
+ * @locks: Array of spinlocks protecting individual buckets
+ * @walkers: List of active walkers
+ * @rcu: RCU structure for freeing the table
+ * @future_tbl: Table under construction during rehashing
+ * @buckets: size * hash buckets
+ */
+struct bucket_table {
+ unsigned int size;
+ unsigned int rehash;
+ u32 hash_rnd;
+ unsigned int locks_mask;
+ spinlock_t *locks;
+ struct list_head walkers;
+ struct rcu_head rcu;
+
+ struct bucket_table __rcu *future_tbl;
+
+ struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
+};
+
+/**
+ * struct rhashtable_compare_arg - Key for the function rhashtable_compare
+ * @ht: Hash table
+ * @key: Key to compare against
+ */
+struct rhashtable_compare_arg {
+ struct rhashtable *ht;
+ const void *key;
+};
+
+typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
+ const void *obj);
+
+struct rhashtable;
+
+/**
+ * struct rhashtable_params - Hash table construction parameters
+ * @nelem_hint: Hint on number of elements, should be 75% of desired size
+ * @key_len: Length of key
+ * @key_offset: Offset of key in struct to be hashed
+ * @head_offset: Offset of rhash_head in struct to be hashed
+ * @insecure_max_entries: Maximum number of entries (may be exceeded)
+ * @max_size: Maximum size while expanding
+ * @min_size: Minimum size while shrinking
+ * @nulls_base: Base value to generate nulls marker
+ * @insecure_elasticity: Set to true to disable chain length checks
+ * @automatic_shrinking: Enable automatic shrinking of tables
+ * @locks_mul: Number of bucket locks to allocate per cpu (default: 128)
+ * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
+ * @obj_hashfn: Function to hash object
+ * @obj_cmpfn: Function to compare key with object
+ */
+struct rhashtable_params {
+ size_t nelem_hint;
+ size_t key_len;
+ size_t key_offset;
+ size_t head_offset;
+ unsigned int insecure_max_entries;
+ unsigned int max_size;
+ unsigned int min_size;
+ u32 nulls_base;
+ bool insecure_elasticity;
+ bool automatic_shrinking;
+ size_t locks_mul;
+ rht_hashfn_t hashfn;
+ rht_obj_hashfn_t obj_hashfn;
+ rht_obj_cmpfn_t obj_cmpfn;
+};
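+
+/*
+ * Illustrative sketch (not part of the original header): a typical
+ * compile-time parameter block for a table keyed on a u32 id; the
+ * "struct foo", "id", "node" and "foo_params" names are hypothetical.
+ *
+ *	struct foo {
+ *		u32			id;
+ *		struct rhash_head	node;
+ *	};
+ *
+ *	static const struct rhashtable_params foo_params = {
+ *		.key_len		= sizeof(u32),
+ *		.key_offset		= offsetof(struct foo, id),
+ *		.head_offset		= offsetof(struct foo, node),
+ *		.automatic_shrinking	= true,
+ *	};
+ */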
+
+/**
+ * struct rhashtable - Hash table handle
+ * @tbl: Bucket table
+ * @nelems: Number of elements in table
+ * @key_len: Key length for hashfn
+ * @elasticity: Maximum chain length before rehash
+ * @p: Configuration parameters
+ * @run_work: Deferred worker to expand/shrink asynchronously
+ * @mutex: Mutex to protect current/future table swapping
+ * @lock: Spin lock to protect walker list
+ */
+struct rhashtable {
+ struct bucket_table __rcu *tbl;
+ atomic_t nelems;
+ unsigned int key_len;
+ unsigned int elasticity;
+ struct rhashtable_params p;
+ struct work_struct run_work;
+ struct mutex mutex;
+ spinlock_t lock;
+};
+
+/**
+ * struct rhashtable_walker - Hash table walker
+ * @list: List entry on list of walkers
+ * @tbl: The table that we were walking over
+ */
+struct rhashtable_walker {
+ struct list_head list;
+ struct bucket_table *tbl;
+};
+
+/**
+ * struct rhashtable_iter - Hash table iterator, fits into netlink cb
+ * @ht: Table to iterate through
+ * @p: Current pointer
+ * @walker: Associated rhashtable walker
+ * @slot: Current slot
+ * @skip: Number of entries to skip in slot
+ */
+struct rhashtable_iter {
+ struct rhashtable *ht;
+ struct rhash_head *p;
+ struct rhashtable_walker *walker;
+ unsigned int slot;
+ unsigned int skip;
+};
+
+static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
+{
+ return NULLS_MARKER(ht->p.nulls_base + hash);
+}
+
+#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
+ ((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+
+static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
+{
+ return ((unsigned long) ptr & 1);
+}
+
+static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr)
+{
+ return ((unsigned long) ptr) >> 1;
+}
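+
+/*
+ * Illustrative sketch (not part of the original header): a lockless
+ * walker uses the two helpers above to tell "end of chain" apart from
+ * a real pointer; "he", "handle_entry" and "handle_end_of_bucket" are
+ * hypothetical names.
+ *
+ *	if (rht_is_a_nulls(he))
+ *		handle_end_of_bucket(rht_get_nulls_value(he));
+ *	else
+ *		handle_entry(he);
+ *
+ * The recovered value is the nulls_base + hash encoded by
+ * INIT_RHT_NULLS_HEAD() when the bucket was initialized.
+ */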
+
+static inline void *rht_obj(const struct rhashtable *ht,
+ const struct rhash_head *he)
+{
+ return (char *)he - ht->p.head_offset;
+}
+
+static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
+ unsigned int hash)
+{
+ return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+}
+
+static inline unsigned int rht_key_hashfn(
+ struct rhashtable *ht, const struct bucket_table *tbl,
+ const void *key, const struct rhashtable_params params)
+{
+ unsigned int hash;
+
+ /* params must be equal to ht->p if it isn't constant. */
+ if (!__builtin_constant_p(params.key_len))
+ hash = ht->p.hashfn(key, ht->key_len, tbl->hash_rnd);
+ else if (params.key_len) {
+ unsigned int key_len = params.key_len;
+
+ if (params.hashfn)
+ hash = params.hashfn(key, key_len, tbl->hash_rnd);
+ else if (key_len & (sizeof(u32) - 1))
+ hash = jhash(key, key_len, tbl->hash_rnd);
+ else
+ hash = jhash2(key, key_len / sizeof(u32),
+ tbl->hash_rnd);
+ } else {
+ unsigned int key_len = ht->p.key_len;
+
+ if (params.hashfn)
+ hash = params.hashfn(key, key_len, tbl->hash_rnd);
+ else
+ hash = jhash(key, key_len, tbl->hash_rnd);
+ }
+
+ return rht_bucket_index(tbl, hash);
+}
+
+static inline unsigned int rht_head_hashfn(
+ struct rhashtable *ht, const struct bucket_table *tbl,
+ const struct rhash_head *he, const struct rhashtable_params params)
+{
+ const char *ptr = rht_obj(ht, he);
+
+ return likely(params.obj_hashfn) ?
+ rht_bucket_index(tbl, params.obj_hashfn(ptr, params.key_len ?:
+ ht->p.key_len,
+ tbl->hash_rnd)) :
+ rht_key_hashfn(ht, tbl, ptr + params.key_offset, params);
+}
+
+/**
+ * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
+static inline bool rht_grow_above_75(const struct rhashtable *ht,
+ const struct bucket_table *tbl)
+{
+ /* Expand table when exceeding 75% load */
+ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) &&
+ (!ht->p.max_size || tbl->size < ht->p.max_size);
+}
+
+/**
+ * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
+static inline bool rht_shrink_below_30(const struct rhashtable *ht,
+ const struct bucket_table *tbl)
+{
+ /* Shrink table beneath 30% load */
+ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) &&
+ tbl->size > ht->p.min_size;
+}
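+
+/*
+ * Worked example (illustrative): with tbl->size == 1024, the table is
+ * grown once nelems exceeds 768 (75%) and shrunk once nelems drops
+ * below 307 (30%), subject to the max_size/min_size limits checked
+ * above.
+ */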
+
+/**
+ * rht_grow_above_100 - returns true if nelems > table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
+static inline bool rht_grow_above_100(const struct rhashtable *ht,
+ const struct bucket_table *tbl)
+{
+ return atomic_read(&ht->nelems) > tbl->size &&
+ (!ht->p.max_size || tbl->size < ht->p.max_size);
+}
+
+/**
+ * rht_grow_above_max - returns true if table is above maximum
+ * @ht: hash table
+ * @tbl: current table
+ */
+static inline bool rht_grow_above_max(const struct rhashtable *ht,
+ const struct bucket_table *tbl)
+{
+ return ht->p.insecure_max_entries &&
+ atomic_read(&ht->nelems) >= ht->p.insecure_max_entries;
+}
+
+/* The bucket lock is selected based on the hash and protects mutations
+ * on a group of hash buckets.
+ *
+ * A maximum of tbl->size/2 bucket locks is allocated. This ensures that
+ * a single lock always covers both buckets which may both contain
+ * entries that link to the same bucket of the old table during resizing.
+ * This simplifies the locking, since locking the bucket in both
+ * tables during a resize always guarantees protection.
+ *
+ * IMPORTANT: When holding the bucket lock of both the old and new table
+ * during expansions and shrinking, the old bucket lock must always be
+ * acquired first.
+ */
+static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl,
+ unsigned int hash)
+{
+ return &tbl->locks[hash & tbl->locks_mask];
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rht_mutex_is_held(struct rhashtable *ht);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
+#else
+static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+ return 1;
+}
+
+static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
+ u32 hash)
+{
+ return 1;
+}
+#endif /* CONFIG_PROVE_LOCKING */
+
+int rhashtable_init(struct rhashtable *ht,
+ const struct rhashtable_params *params);
+
+struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
+ const void *key,
+ struct rhash_head *obj,
+ struct bucket_table *old_tbl);
+int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
+
+int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
+ gfp_t gfp);
+void rhashtable_walk_exit(struct rhashtable_iter *iter);
+int rhashtable_walk_start(struct rhashtable_iter *iter) __acquires(RCU);
+void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
+
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg);
+void rhashtable_destroy(struct rhashtable *ht);
+
+#define rht_dereference(p, ht) \
+ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))
+
+#define rht_dereference_rcu(p, ht) \
+ rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht))
+
+#define rht_dereference_bucket(p, tbl, hash) \
+ rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash))
+
+#define rht_dereference_bucket_rcu(p, tbl, hash) \
+ rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash))
+
+#define rht_entry(tpos, pos, member) \
+ ({ tpos = container_of(pos, typeof(*tpos), member); 1; })
+
+/**
+ * rht_for_each_continue - continue iterating over hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the previous &struct rhash_head to continue from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ */
+#define rht_for_each_continue(pos, head, tbl, hash) \
+ for (pos = rht_dereference_bucket(head, tbl, hash); \
+ !rht_is_a_nulls(pos); \
+ pos = rht_dereference_bucket((pos)->next, tbl, hash))
+
+/**
+ * rht_for_each - iterate over hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ */
+#define rht_for_each(pos, tbl, hash) \
+ rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash)
+
+/**
+ * rht_for_each_entry_continue - continue iterating over hash chain
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the previous &struct rhash_head to continue from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry_continue(tpos, pos, head, tbl, hash, member) \
+ for (pos = rht_dereference_bucket(head, tbl, hash); \
+ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
+ pos = rht_dereference_bucket((pos)->next, tbl, hash))
+
+/**
+ * rht_for_each_entry - iterate over hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry(tpos, pos, tbl, hash, member) \
+ rht_for_each_entry_continue(tpos, pos, (tbl)->buckets[hash], \
+ tbl, hash, member)
+
+/**
+ * rht_for_each_entry_safe - safely iterate over hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @next: the &struct rhash_head to use as next in loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive allows the looped code to
+ * remove the loop cursor from the list.
+ */
+#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \
+ for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \
+ next = !rht_is_a_nulls(pos) ? \
+ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \
+ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
+ pos = next, \
+ next = !rht_is_a_nulls(pos) ? \
+ rht_dereference_bucket(pos->next, tbl, hash) : NULL)
+
+/**
+ * rht_for_each_rcu_continue - continue iterating over rcu hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the previous &struct rhash_head to continue from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu_continue(pos, head, tbl, hash) \
+ for (({barrier(); }), \
+ pos = rht_dereference_bucket_rcu(head, tbl, hash); \
+ !rht_is_a_nulls(pos); \
+ pos = rcu_dereference_raw(pos->next))
+
+/**
+ * rht_for_each_rcu - iterate over rcu hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu(pos, tbl, hash) \
+ rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash)
+
+/**
+ * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the previous &struct rhash_head to continue from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \
+ for (({barrier(); }), \
+ pos = rht_dereference_bucket_rcu(head, tbl, hash); \
+ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
+ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash))
+
+/**
+ * rht_for_each_entry_rcu - iterate over rcu hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \
+ rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\
+ tbl, hash, member)
+
+static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ struct rhashtable *ht = arg->ht;
+ const char *ptr = obj;
+
+ return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
+}
+
+/**
+ * rhashtable_lookup_fast - search hash table, inlined version
+ * @ht: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup_fast(
+ struct rhashtable *ht, const void *key,
+ const struct rhashtable_params params)
+{
+ struct rhashtable_compare_arg arg = {
+ .ht = ht,
+ .key = key,
+ };
+ const struct bucket_table *tbl;
+ struct rhash_head *he;
+ unsigned int hash;
+
+ rcu_read_lock();
+
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+restart:
+ hash = rht_key_hashfn(ht, tbl, key, params);
+ rht_for_each_rcu(he, tbl, hash) {
+ if (params.obj_cmpfn ?
+ params.obj_cmpfn(&arg, rht_obj(ht, he)) :
+ rhashtable_compare(&arg, rht_obj(ht, he)))
+ continue;
+ rcu_read_unlock();
+ return rht_obj(ht, he);
+ }
+
+ /* Ensure we see any new tables. */
+ smp_rmb();
+
+ tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (unlikely(tbl))
+ goto restart;
+ rcu_read_unlock();
+
+ return NULL;
+}
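+
+/*
+ * Illustrative sketch (not part of the original header): a lookup using
+ * the hypothetical "foo_table"/"foo_params"/"struct foo" names from the
+ * parameter example above.
+ *
+ *	struct foo *f;
+ *
+ *	rcu_read_lock();
+ *	f = rhashtable_lookup_fast(&foo_table, &id, foo_params);
+ *	if (f)
+ *		do_something(f);
+ *	rcu_read_unlock();
+ *
+ * The caller's own rcu_read_lock() is what keeps @f valid after the
+ * lookup returns, assuming objects are freed only after a grace period.
+ */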
+
+/* Internal function, please use rhashtable_insert_fast() instead */
+static inline int __rhashtable_insert_fast(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ struct rhashtable_compare_arg arg = {
+ .ht = ht,
+ .key = key,
+ };
+ struct bucket_table *tbl, *new_tbl;
+ struct rhash_head *head;
+ spinlock_t *lock;
+ unsigned int elasticity;
+ unsigned int hash;
+ int err;
+
+restart:
+ rcu_read_lock();
+
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+
+ /* All insertions must grab the oldest table containing
+ * the hashed bucket that is yet to be rehashed.
+ */
+ for (;;) {
+ hash = rht_head_hashfn(ht, tbl, obj, params);
+ lock = rht_bucket_lock(tbl, hash);
+ spin_lock_bh(lock);
+
+ if (tbl->rehash <= hash)
+ break;
+
+ spin_unlock_bh(lock);
+ tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ }
+
+ new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (unlikely(new_tbl)) {
+ tbl = rhashtable_insert_slow(ht, key, obj, new_tbl);
+ if (!IS_ERR_OR_NULL(tbl))
+ goto slow_path;
+
+ err = PTR_ERR(tbl);
+ goto out;
+ }
+
+ err = -E2BIG;
+ if (unlikely(rht_grow_above_max(ht, tbl)))
+ goto out;
+
+ if (unlikely(rht_grow_above_100(ht, tbl))) {
+slow_path:
+ spin_unlock_bh(lock);
+ err = rhashtable_insert_rehash(ht, tbl);
+ rcu_read_unlock();
+ if (err)
+ return err;
+
+ goto restart;
+ }
+
+ err = -EEXIST;
+ elasticity = ht->elasticity;
+ rht_for_each(head, tbl, hash) {
+ if (key &&
+ unlikely(!(params.obj_cmpfn ?
+ params.obj_cmpfn(&arg, rht_obj(ht, head)) :
+ rhashtable_compare(&arg, rht_obj(ht, head)))))
+ goto out;
+ if (!--elasticity)
+ goto slow_path;
+ }
+
+ err = 0;
+
+ head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+
+ RCU_INIT_POINTER(obj->next, head);
+
+ rcu_assign_pointer(tbl->buckets[hash], obj);
+
+ atomic_inc(&ht->nelems);
+ if (rht_grow_above_75(ht, tbl))
+ schedule_work(&ht->run_work);
+
+out:
+ spin_unlock_bh(lock);
+ rcu_read_unlock();
+
+ return err;
+}
+
+/**
+ * rhashtable_insert_fast - insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Will take a per bucket spinlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket lock.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resize if the table grows
+ * beyond the 75% utilization watermark (see rht_grow_above_75()).
+ */
+static inline int rhashtable_insert_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ return __rhashtable_insert_fast(ht, NULL, obj, params);
+}
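+
+/*
+ * Illustrative sketch (not part of the original header): inserting a
+ * caller-allocated object, again using the hypothetical "foo_table",
+ * "foo_params" and "struct foo" names.
+ *
+ *	int err;
+ *
+ *	err = rhashtable_insert_fast(&foo_table, &f->node, foo_params);
+ *	if (err)
+ *		handle_insert_error(f, err);	// e.g. -E2BIG from above
+ */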
+
+/**
+ * rhashtable_lookup_insert_fast - lookup and insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Locks down the bucket chain in both the old and new table if a resize
+ * is in progress to ensure that writers can't remove from the old table
+ * and can't insert to the new table during the atomic operation of search
+ * and insertion. Searches for duplicates in both the old and new table if
+ * a resize is in progress.
+ *
+ * This lookup function may only be used with fixed-key hash tables (key_len
+ * parameter set). It will BUG() if used inappropriately.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resize if the table grows
+ * beyond the 75% utilization watermark (see rht_grow_above_75()).
+ */
+static inline int rhashtable_lookup_insert_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ const char *key = rht_obj(ht, obj);
+
+ BUG_ON(ht->p.obj_hashfn);
+
+ return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj,
+ params);
+}
+
+/**
+ * rhashtable_lookup_insert_key - search and insert object to hash table
+ * with explicit key
+ * @ht: hash table
+ * @key: key
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Locks down the bucket chain in both the old and new table if a resize
+ * is in progress to ensure that writers can't remove from the old table
+ * and can't insert to the new table during the atomic operation of search
+ * and insertion. Searches for duplicates in both the old and new table if
+ * a resize is in progress.
+ *
+ * Lookups may occur in parallel with hashtable mutations and resizing.
+ *
+ * Will trigger an automatic deferred table resize if the table grows
+ * beyond the 75% utilization watermark (see rht_grow_above_75()).
+ *
+ * Returns zero on success.
+ */
+static inline int rhashtable_lookup_insert_key(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ BUG_ON(!ht->p.obj_hashfn || !key);
+
+ return __rhashtable_insert_fast(ht, key, obj, params);
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast(
+ struct rhashtable *ht, struct bucket_table *tbl,
+ struct rhash_head *obj, const struct rhashtable_params params)
+{
+ struct rhash_head __rcu **pprev;
+ struct rhash_head *he;
+	spinlock_t *lock;
+ unsigned int hash;
+ int err = -ENOENT;
+
+ hash = rht_head_hashfn(ht, tbl, obj, params);
+ lock = rht_bucket_lock(tbl, hash);
+
+ spin_lock_bh(lock);
+
+ pprev = &tbl->buckets[hash];
+ rht_for_each(he, tbl, hash) {
+ if (he != obj) {
+ pprev = &he->next;
+ continue;
+ }
+
+ rcu_assign_pointer(*pprev, obj->next);
+ err = 0;
+ break;
+ }
+
+ spin_unlock_bh(lock);
+
+ return err;
+}
+
+/**
+ * rhashtable_remove_fast - remove object from hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slow if the hash table is not correctly sized.
+ *
+ * Will schedule a deferred shrink of the table if automatic shrinking is
+ * enabled and the table drops below the 30% utilization watermark
+ * (see rht_shrink_below_30()).
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
+static inline int rhashtable_remove_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ struct bucket_table *tbl;
+ int err;
+
+ rcu_read_lock();
+
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+
+ /* Because we have already taken (and released) the bucket
+ * lock in old_tbl, if we find that future_tbl is not yet
+	 * visible then that guarantees that the entry is still in
+	 * the old tbl if it exists.
+ */
+ while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) &&
+ (tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
+ ;
+
+ if (err)
+ goto out;
+
+ atomic_dec(&ht->nelems);
+ if (unlikely(ht->p.automatic_shrinking &&
+ rht_shrink_below_30(ht, tbl)))
+ schedule_work(&ht->run_work);
+
+out:
+ rcu_read_unlock();
+
+ return err;
+}
+
+/* Internal function, please use rhashtable_replace_fast() instead */
+static inline int __rhashtable_replace_fast(
+ struct rhashtable *ht, struct bucket_table *tbl,
+ struct rhash_head *obj_old, struct rhash_head *obj_new,
+ const struct rhashtable_params params)
+{
+ struct rhash_head __rcu **pprev;
+ struct rhash_head *he;
+ spinlock_t *lock;
+ unsigned int hash;
+ int err = -ENOENT;
+
+ /* Minimally, the old and new objects must have same hash
+ * (which should mean identifiers are the same).
+ */
+ hash = rht_head_hashfn(ht, tbl, obj_old, params);
+ if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
+ return -EINVAL;
+
+ lock = rht_bucket_lock(tbl, hash);
+
+ spin_lock_bh(lock);
+
+ pprev = &tbl->buckets[hash];
+ rht_for_each(he, tbl, hash) {
+ if (he != obj_old) {
+ pprev = &he->next;
+ continue;
+ }
+
+ rcu_assign_pointer(obj_new->next, obj_old->next);
+ rcu_assign_pointer(*pprev, obj_new);
+ err = 0;
+ break;
+ }
+
+ spin_unlock_bh(lock);
+
+ return err;
+}
+
+/**
+ * rhashtable_replace_fast - replace an object in hash table
+ * @ht: hash table
+ * @obj_old: pointer to hash head inside object being replaced
+ * @obj_new: pointer to hash head inside object which is new
+ * @params: hash table parameters
+ *
+ * Replacing an object doesn't affect the number of elements in the hash table
+ * or bucket, so we don't need to worry about shrinking or expanding the
+ * table here.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found,
+ * -EINVAL if hash is not the same for the old and new objects.
+ */
+static inline int rhashtable_replace_fast(
+ struct rhashtable *ht, struct rhash_head *obj_old,
+ struct rhash_head *obj_new,
+ const struct rhashtable_params params)
+{
+ struct bucket_table *tbl;
+ int err;
+
+ rcu_read_lock();
+
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+
+ /* Because we have already taken (and released) the bucket
+ * lock in old_tbl, if we find that future_tbl is not yet
+	 * visible then that guarantees that the entry is still in
+	 * the old tbl if it exists.
+ */
+ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old,
+ obj_new, params)) &&
+ (tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
+ ;
+
+ rcu_read_unlock();
+
+ return err;
+}
+
+#endif /* _LINUX_RHASHTABLE_H */
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
new file mode 100644
index 0000000..9d70e6e
--- /dev/null
+++ b/include/linux/rwsem.h
@@ -0,0 +1,28 @@
+#ifndef __TOOLS_LINUX_RWSEM_H
+#define __TOOLS_LINUX_RWSEM_H
+
+#include <pthread.h>
+
+struct rw_semaphore {
+ pthread_rwlock_t lock;
+};
+
+#define __RWSEM_INITIALIZER(name) \
+ { .lock = PTHREAD_RWLOCK_INITIALIZER }
+
+#define DECLARE_RWSEM(name) \
+ struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+
+static inline void init_rwsem(struct rw_semaphore *lock)
+{
+ pthread_rwlock_init(&lock->lock, NULL);
+}
+
+#define down_read(l) pthread_rwlock_rdlock(&(l)->lock)
+#define down_read_trylock(l) (!pthread_rwlock_tryrdlock(&(l)->lock))
+#define up_read(l) pthread_rwlock_unlock(&(l)->lock)
+
+#define down_write(l) pthread_rwlock_wrlock(&(l)->lock)
+#define up_write(l) pthread_rwlock_unlock(&(l)->lock)
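+
+/*
+ * Illustrative sketch (not part of the original header): the shim keeps
+ * the kernel calling convention while backing it with a pthread rwlock;
+ * "my_sem" and "read_shared_state" are hypothetical names.
+ *
+ *	static DECLARE_RWSEM(my_sem);
+ *
+ *	down_read(&my_sem);
+ *	read_shared_state();
+ *	up_read(&my_sem);
+ */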
+
+#endif /* __TOOLS_LINUX_RWSEM_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
new file mode 100644
index 0000000..0316f50
--- /dev/null
+++ b/include/linux/sched.h
@@ -0,0 +1,144 @@
+#ifndef __TOOLS_LINUX_SCHED_H
+#define __TOOLS_LINUX_SCHED_H
+
+#include <pthread.h>
+#include <time.h>
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/completion.h>
+#include <linux/jiffies.h>
+#include <linux/time64.h>
+
+#define TASK_RUNNING 0
+#define TASK_INTERRUPTIBLE 1
+#define TASK_UNINTERRUPTIBLE 2
+#define __TASK_STOPPED 4
+#define __TASK_TRACED 8
+/* in tsk->exit_state */
+#define EXIT_DEAD 16
+#define EXIT_ZOMBIE 32
+#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
+/* in tsk->state again */
+#define TASK_DEAD 64
+#define TASK_WAKEKILL 128
+#define TASK_WAKING 256
+#define TASK_PARKED 512
+#define TASK_NOLOAD 1024
+#define TASK_NEW 2048
+#define TASK_IDLE_WORKER 4096
+#define TASK_STATE_MAX 8192
+
+/* Convenience macros for the sake of set_task_state */
+#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
+#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
+#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
+
+#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
+
+/* Convenience macros for the sake of wake_up */
+#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
+#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
+
+#define TASK_COMM_LEN 16
+
+#define PF_EXITING 0x00000004 /* getting shut down */
+#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
+#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
+#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
+#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
+#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
+#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
+#define PF_DUMPCORE 0x00000200 /* dumped core */
+#define PF_SIGNALED 0x00000400 /* killed by a signal */
+#define PF_MEMALLOC 0x00000800 /* Allocating memory */
+#define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */
+#define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
+#define PF_USED_ASYNC 0x00004000 /* used async_schedule*(), used by module init */
+#define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
+#define PF_FROZEN 0x00010000 /* frozen for system suspend */
+#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
+#define PF_KSWAPD 0x00040000 /* I am kswapd */
+#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */
+#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
+#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
+#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
+#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
+#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
+#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
+#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
+#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
+
+struct task_struct {
+ pthread_t thread;
+
+ int (*thread_fn)(void *);
+ void *thread_data;
+
+ pthread_mutex_t lock;
+ pthread_cond_t wait;
+
+ atomic_t usage;
+ volatile long state;
+
+ /* kthread: */
+ unsigned long kthread_flags;
+ struct completion exited;
+
+ unsigned flags;
+
+ bool on_cpu;
+ char comm[TASK_COMM_LEN];
+ struct bio_list *bio_list;
+};
+
+extern __thread struct task_struct *current;
+
+#define __set_task_state(tsk, state_value) \
+ do { (tsk)->state = (state_value); } while (0)
+#define set_task_state(tsk, state_value) \
+ smp_store_mb((tsk)->state, (state_value))
+#define __set_current_state(state_value) \
+ do { current->state = (state_value); } while (0)
+#define set_current_state(state_value) \
+ smp_store_mb(current->state, (state_value))
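+
+/*
+ * Illustrative sketch (not part of the original header): the usual
+ * prepare-to-sleep pattern these macros exist for; "condition" is a
+ * hypothetical predicate set by the waker before wake_up_process().
+ *
+ *	for (;;) {
+ *		set_current_state(TASK_UNINTERRUPTIBLE);
+ *		if (condition)
+ *			break;
+ *		schedule();
+ *	}
+ *	__set_current_state(TASK_RUNNING);
+ */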
+
+#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
+
+extern void __put_task_struct(struct task_struct *t);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+ if (atomic_dec_and_test(&t->usage))
+ __put_task_struct(t);
+}
+
+#define cond_resched()
+#define need_resched() 0
+
+void schedule(void);
+
+#define MAX_SCHEDULE_TIMEOUT LONG_MAX
+long schedule_timeout(long timeout);
+
+static inline void io_schedule(void)
+{
+ schedule();
+}
+
+static inline long io_schedule_timeout(long timeout)
+{
+ return schedule_timeout(timeout);
+}
+
+int wake_up_process(struct task_struct *);
+
+static inline u64 ktime_get_seconds(void)
+{
+ struct timespec ts;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+
+ return ts.tv_sec;
+}
+
+#endif /* __TOOLS_LINUX_SCHED_H */
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
new file mode 100644
index 0000000..ef3040e
--- /dev/null
+++ b/include/linux/sched/rt.h
@@ -0,0 +1,9 @@
+#ifndef _SCHED_RT_H
+#define _SCHED_RT_H
+
+static inline int rt_task(struct task_struct *p)
+{
+ return 0;
+}
+
+#endif /* _SCHED_RT_H */
diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h
new file mode 100644
index 0000000..aeba6eb
--- /dev/null
+++ b/include/linux/semaphore.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2008 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * Distributed under the terms of the GNU GPL, version 2
+ *
+ * Please see kernel/semaphore.c for documentation of these functions
+ */
+#ifndef __LINUX_SEMAPHORE_H
+#define __LINUX_SEMAPHORE_H
+
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/spinlock.h>
+
+/* Please don't access any members of this structure directly */
+struct semaphore {
+ raw_spinlock_t lock;
+ unsigned int count;
+ struct list_head wait_list;
+};
+
+#define __SEMAPHORE_INITIALIZER(name, n) \
+{ \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \
+ .count = n, \
+ .wait_list = LIST_HEAD_INIT((name).wait_list), \
+}
+
+#define DEFINE_SEMAPHORE(name) \
+ struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1)
+
+static inline void sema_init(struct semaphore *sem, int val)
+{
+ static struct lock_class_key __key;
+ *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val);
+ lockdep_init_map(&sem->lock.dep_map, "semaphore->lock", &__key, 0);
+}
+
+extern void down(struct semaphore *sem);
+extern int __must_check down_interruptible(struct semaphore *sem);
+extern int __must_check down_killable(struct semaphore *sem);
+extern int __must_check down_trylock(struct semaphore *sem);
+extern int __must_check down_timeout(struct semaphore *sem, long);
+extern void up(struct semaphore *sem);
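+
+/*
+ * Illustrative sketch (not part of the original header): bounding the
+ * number of concurrent users of a resource to four; "io_sem" and
+ * "do_request" are hypothetical names.
+ *
+ *	static struct semaphore io_sem = __SEMAPHORE_INITIALIZER(io_sem, 4);
+ *
+ *	down(&io_sem);
+ *	do_request();
+ *	up(&io_sem);
+ */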
+
+#endif /* __LINUX_SEMAPHORE_H */
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
new file mode 100644
index 0000000..7047838
--- /dev/null
+++ b/include/linux/seq_file.h
@@ -0,0 +1,25 @@
+#ifndef _LINUX_SEQ_FILE_H
+#define _LINUX_SEQ_FILE_H
+
+#include <linux/types.h>
+#include <linux/fs.h>
+
+struct seq_operations;
+struct path;
+
+struct seq_file {
+ char *buf;
+ size_t size;
+ size_t from;
+ size_t count;
+ size_t pad_until;
+ loff_t index;
+ loff_t read_pos;
+ u64 version;
+ const struct seq_operations *op;
+ int poll_event;
+ const struct file *file;
+ void *private;
+};
+
+#endif
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
new file mode 100644
index 0000000..7a08137
--- /dev/null
+++ b/include/linux/seqlock.h
@@ -0,0 +1,567 @@
+#ifndef __LINUX_SEQLOCK_H
+#define __LINUX_SEQLOCK_H
+/*
+ * Reader/writer consistent mechanism without starving writers. This type of
+ * lock is for data where the reader wants a consistent set of information
+ * and is willing to retry if the information changes. There are two types
+ * of readers:
+ * 1. Sequence readers which never block a writer but they may have to retry
+ * if a writer is in progress by detecting change in sequence number.
+ * Writers do not wait for a sequence reader.
+ * 2. Locking readers which will wait if a writer or another locking reader
+ * is in progress. A locking reader in progress will also block a writer
+ * from going forward. Unlike the regular rwlock, the read lock here is
+ * exclusive so that only one locking reader can get it.
+ *
+ * This is not as cache friendly as brlock. Also, this may not work well
+ * for data that contains pointers, because any writer could
+ * invalidate a pointer that a reader was following.
+ *
+ * Expected non-blocking reader usage:
+ * do {
+ * seq = read_seqbegin(&foo);
+ * ...
+ * } while (read_seqretry(&foo, seq));
+ *
+ *
+ * On non-SMP the spin locks disappear but the writer still needs
+ * to increment the sequence variables because an interrupt routine could
+ * change the state of the data.
+ *
+ * Based on x86_64 vsyscall gettimeofday
+ * by Keith Owens and Andrea Arcangeli
+ */
+
+#include <linux/spinlock.h>
+#include <linux/lockdep.h>
+#include <linux/compiler.h>
+
+/*
+ * Version using sequence counter only.
+ * This can be used when code has its own mutex protecting the
+ * update, which starts before the write_seqcount_begin() and ends
+ * after the write_seqcount_end().
+ */
+typedef struct seqcount {
+ unsigned sequence;
+} seqcount_t;
+
+static inline void __seqcount_init(seqcount_t *s, const char *name,
+ struct lock_class_key *key)
+{
+ s->sequence = 0;
+}
+
+# define SEQCOUNT_DEP_MAP_INIT(lockname)
+# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
+# define seqcount_lockdep_reader_access(x)
+
+#define SEQCNT_ZERO(lockname) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(lockname)}
+
+
+/**
+ * __read_seqcount_begin - begin a seq-read critical section (without barrier)
+ * @s: pointer to seqcount_t
+ * Returns: count to be passed to read_seqcount_retry
+ *
+ * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
+ * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
+ * provided before actually loading any of the variables that are to be
+ * protected in this critical section.
+ *
+ * Use carefully, only in critical code, and comment how the barrier is
+ * provided.
+ */
+static inline unsigned __read_seqcount_begin(const seqcount_t *s)
+{
+ unsigned ret;
+
+repeat:
+ ret = READ_ONCE(s->sequence);
+ if (unlikely(ret & 1)) {
+ cpu_relax();
+ goto repeat;
+ }
+ return ret;
+}
+
+/**
+ * raw_read_seqcount - Read the raw seqcount
+ * @s: pointer to seqcount_t
+ * Returns: count to be passed to read_seqcount_retry
+ *
+ * raw_read_seqcount opens a read critical section of the given
+ * seqcount without any lockdep checking and without checking or
+ * masking the LSB. Calling code is responsible for handling that.
+ */
+static inline unsigned raw_read_seqcount(const seqcount_t *s)
+{
+ unsigned ret = READ_ONCE(s->sequence);
+ smp_rmb();
+ return ret;
+}
+
+/**
+ * raw_read_seqcount_begin - start seq-read critical section w/o lockdep
+ * @s: pointer to seqcount_t
+ * Returns: count to be passed to read_seqcount_retry
+ *
+ * raw_read_seqcount_begin opens a read critical section of the given
+ * seqcount, but without any lockdep checking. Validity of the critical
+ * section is tested by checking read_seqcount_retry function.
+ */
+static inline unsigned raw_read_seqcount_begin(const seqcount_t *s)
+{
+ unsigned ret = __read_seqcount_begin(s);
+ smp_rmb();
+ return ret;
+}
+
+/**
+ * read_seqcount_begin - begin a seq-read critical section
+ * @s: pointer to seqcount_t
+ * Returns: count to be passed to read_seqcount_retry
+ *
+ * read_seqcount_begin opens a read critical section of the given seqcount.
+ * Validity of the critical section is tested by checking read_seqcount_retry
+ * function.
+ */
+static inline unsigned read_seqcount_begin(const seqcount_t *s)
+{
+ seqcount_lockdep_reader_access(s);
+ return raw_read_seqcount_begin(s);
+}
+
+/**
+ * raw_seqcount_begin - begin a seq-read critical section
+ * @s: pointer to seqcount_t
+ * Returns: count to be passed to read_seqcount_retry
+ *
+ * raw_seqcount_begin opens a read critical section of the given seqcount.
+ * Validity of the critical section is tested by calling
+ * read_seqcount_retry().
+ *
+ * Unlike read_seqcount_begin(), this function will not wait for the count
+ * to stabilize. If a writer is active when we begin, we will fail the
+ * read_seqcount_retry() instead of stabilizing at the beginning of the
+ * critical section.
+ */
+static inline unsigned raw_seqcount_begin(const seqcount_t *s)
+{
+ unsigned ret = READ_ONCE(s->sequence);
+ smp_rmb();
+ return ret & ~1;
+}
+
+/**
+ * __read_seqcount_retry - end a seq-read critical section (without barrier)
+ * @s: pointer to seqcount_t
+ * @start: count, from read_seqcount_begin
+ * Returns: 1 if retry is required, else 0
+ *
+ * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb()
+ * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
+ * provided before actually loading any of the variables that are to be
+ * protected in this critical section.
+ *
+ * Use carefully, only in critical code, and comment how the barrier is
+ * provided.
+ */
+static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start)
+{
+ return unlikely(s->sequence != start);
+}
+
+/**
+ * read_seqcount_retry - end a seq-read critical section
+ * @s: pointer to seqcount_t
+ * @start: count, from read_seqcount_begin
+ * Returns: 1 if retry is required, else 0
+ *
+ * read_seqcount_retry closes a read critical section of the given seqcount.
+ * If the critical section was invalid, it must be ignored (and typically
+ * retried).
+ */
+static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
+{
+ smp_rmb();
+ return __read_seqcount_retry(s, start);
+}
+
+
+
+static inline void raw_write_seqcount_begin(seqcount_t *s)
+{
+ s->sequence++;
+ smp_wmb();
+}
+
+static inline void raw_write_seqcount_end(seqcount_t *s)
+{
+ smp_wmb();
+ s->sequence++;
+}
+
+/**
+ * raw_write_seqcount_barrier - do a seq write barrier
+ * @s: pointer to seqcount_t
+ *
+ * This can be used to provide an ordering guarantee instead of the
+ * usual consistency guarantee. It is one wmb cheaper, because we can
+ * collapse the two back-to-back wmb()s.
+ *
+ * seqcount_t seq;
+ * bool X = true, Y = false;
+ *
+ * void read(void)
+ * {
+ * bool x, y;
+ *
+ * do {
+ * int s = read_seqcount_begin(&seq);
+ *
+ * x = X; y = Y;
+ *
+ * } while (read_seqcount_retry(&seq, s));
+ *
+ * BUG_ON(!x && !y);
+ * }
+ *
+ * void write(void)
+ * {
+ * Y = true;
+ *
+ * raw_write_seqcount_barrier(&seq);
+ *
+ * X = false;
+ * }
+ */
+static inline void raw_write_seqcount_barrier(seqcount_t *s)
+{
+ s->sequence++;
+ smp_wmb();
+ s->sequence++;
+}
+
+static inline int raw_read_seqcount_latch(seqcount_t *s)
+{
+ int seq = READ_ONCE(s->sequence);
+ /* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */
+ smp_read_barrier_depends();
+ return seq;
+}
+
+/**
+ * raw_write_seqcount_latch - redirect readers to even/odd copy
+ * @s: pointer to seqcount_t
+ *
+ * The latch technique is a multiversion concurrency control method that allows
+ * queries during non-atomic modifications. If you can guarantee queries never
+ * interrupt the modification -- e.g. the concurrency is strictly between CPUs
+ * -- you most likely do not need this.
+ *
+ * Where the traditional RCU/lockless data structures rely on atomic
+ * modifications to ensure queries observe either the old or the new state the
+ * latch allows the same for non-atomic updates. The trade-off is doubling the
+ * cost of storage; we have to maintain two copies of the entire data
+ * structure.
+ *
+ * Very simply put: we first modify one copy and then the other. This ensures
+ * there is always one copy in a stable state, ready to give us an answer.
+ *
+ * The basic form is a data structure like:
+ *
+ * struct latch_struct {
+ * seqcount_t seq;
+ * struct data_struct data[2];
+ * };
+ *
+ * Where a modification, which is assumed to be externally serialized, does the
+ * following:
+ *
+ * void latch_modify(struct latch_struct *latch, ...)
+ * {
+ * smp_wmb(); <- Ensure that the last data[1] update is visible
+ * latch->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ *
+ * modify(latch->data[0], ...);
+ *
+ * smp_wmb(); <- Ensure that the data[0] update is visible
+ * latch->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ *
+ * modify(latch->data[1], ...);
+ * }
+ *
+ * The query will have a form like:
+ *
+ * struct entry *latch_query(struct latch_struct *latch, ...)
+ * {
+ * struct entry *entry;
+ * unsigned seq, idx;
+ *
+ * do {
+ * seq = raw_read_seqcount_latch(&latch->seq);
+ *
+ * idx = seq & 0x01;
+ * entry = data_query(latch->data[idx], ...);
+ *
+ * smp_rmb();
+ * } while (seq != latch->seq);
+ *
+ * return entry;
+ * }
+ *
+ * So during the modification, queries are first redirected to data[1]. Then we
+ * modify data[0]. When that is complete, we redirect queries back to data[0]
+ * and we can modify data[1].
+ *
+ * NOTE: The non-requirement for atomic modifications does _NOT_ include
+ * the publishing of new entries in the case where data is a dynamic
+ * data structure.
+ *
+ * An iteration might start in data[0] and get suspended long enough
+ * to miss an entire modification sequence; once it resumes it might
+ * observe the new entry.
+ *
+ * NOTE: When data is a dynamic data structure, one should use regular RCU
+ * patterns to manage the lifetimes of the objects within.
+ */
+static inline void raw_write_seqcount_latch(seqcount_t *s)
+{
+ smp_wmb(); /* prior stores before incrementing "sequence" */
+ s->sequence++;
+ smp_wmb(); /* increment "sequence" before following stores */
+}
+
+/*
+ * Sequence counter only version assumes that callers are using their
+ * own mutexing.
+ */
+static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass)
+{
+ raw_write_seqcount_begin(s);
+}
+
+static inline void write_seqcount_begin(seqcount_t *s)
+{
+ write_seqcount_begin_nested(s, 0);
+}
+
+static inline void write_seqcount_end(seqcount_t *s)
+{
+ raw_write_seqcount_end(s);
+}
+
+/**
+ * write_seqcount_invalidate - invalidate in-progress read-side seq operations
+ * @s: pointer to seqcount_t
+ *
+ * After write_seqcount_invalidate, no read-side seq operations will complete
+ * successfully and see data older than this.
+ */
+static inline void write_seqcount_invalidate(seqcount_t *s)
+{
+ smp_wmb();
+ s->sequence += 2;
+}
+
+typedef struct {
+ struct seqcount seqcount;
+ spinlock_t lock;
+} seqlock_t;
+
+/*
+ * These macros triggered gcc-3.x compile-time problems. We think these are
+ * OK now. Be cautious.
+ */
+#define __SEQLOCK_UNLOCKED(lockname) \
+ { \
+ .seqcount = SEQCNT_ZERO(lockname), \
+ .lock = __SPIN_LOCK_UNLOCKED(lockname) \
+ }
+
+#define seqlock_init(x) \
+ do { \
+ seqcount_init(&(x)->seqcount); \
+ spin_lock_init(&(x)->lock); \
+ } while (0)
+
+#define DEFINE_SEQLOCK(x) \
+ seqlock_t x = __SEQLOCK_UNLOCKED(x)
+
+/*
+ * Read side functions for starting and finalizing a read side section.
+ */
+static inline unsigned read_seqbegin(const seqlock_t *sl)
+{
+ return read_seqcount_begin(&sl->seqcount);
+}
+
+static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
+{
+ return read_seqcount_retry(&sl->seqcount, start);
+}
+
+/*
+ * Lock out other writers and update the count.
+ * Acts like a normal spin_lock/unlock.
+ * Don't need preempt_disable() because that is in the spin_lock already.
+ */
+static inline void write_seqlock(seqlock_t *sl)
+{
+ spin_lock(&sl->lock);
+ write_seqcount_begin(&sl->seqcount);
+}
+
+static inline void write_sequnlock(seqlock_t *sl)
+{
+ write_seqcount_end(&sl->seqcount);
+ spin_unlock(&sl->lock);
+}
+
+static inline void write_seqlock_bh(seqlock_t *sl)
+{
+ spin_lock_bh(&sl->lock);
+ write_seqcount_begin(&sl->seqcount);
+}
+
+static inline void write_sequnlock_bh(seqlock_t *sl)
+{
+ write_seqcount_end(&sl->seqcount);
+ spin_unlock_bh(&sl->lock);
+}
+
+static inline void write_seqlock_irq(seqlock_t *sl)
+{
+ spin_lock_irq(&sl->lock);
+ write_seqcount_begin(&sl->seqcount);
+}
+
+static inline void write_sequnlock_irq(seqlock_t *sl)
+{
+ write_seqcount_end(&sl->seqcount);
+ spin_unlock_irq(&sl->lock);
+}
+
+static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&sl->lock, flags);
+ write_seqcount_begin(&sl->seqcount);
+ return flags;
+}
+
+#define write_seqlock_irqsave(lock, flags) \
+ do { flags = __write_seqlock_irqsave(lock); } while (0)
+
+static inline void
+write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
+{
+ write_seqcount_end(&sl->seqcount);
+ spin_unlock_irqrestore(&sl->lock, flags);
+}
+
+/*
+ * A locking reader exclusively locks out other writers and locking readers,
+ * but doesn't update the sequence number. Acts like a normal spin_lock/unlock.
+ * Don't need preempt_disable() because that is in the spin_lock already.
+ */
+static inline void read_seqlock_excl(seqlock_t *sl)
+{
+ spin_lock(&sl->lock);
+}
+
+static inline void read_sequnlock_excl(seqlock_t *sl)
+{
+ spin_unlock(&sl->lock);
+}
+
+/**
+ * read_seqbegin_or_lock - begin a sequence number check or locking block
+ * @lock: sequence lock
+ * @seq : sequence number to be checked
+ *
+ * First try it once optimistically without taking the lock. If that fails,
+ * take the lock. The sequence number is also used as a marker for deciding
+ * whether to be a reader (even) or writer (odd).
+ * N.B. seq must be initialized to an even number to begin with.
+ */
+static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
+{
+ if (!(*seq & 1)) /* Even */
+ *seq = read_seqbegin(lock);
+ else /* Odd */
+ read_seqlock_excl(lock);
+}
+
+static inline int need_seqretry(seqlock_t *lock, int seq)
+{
+ return !(seq & 1) && read_seqretry(lock, seq);
+}
+
+static inline void done_seqretry(seqlock_t *lock, int seq)
+{
+ if (seq & 1)
+ read_sequnlock_excl(lock);
+}
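+
+/*
+ * Sketch of the optimistic-then-locked retry pattern these three helpers
+ * support (names are illustrative, not part of this header):
+ *
+ *	int seq = 0;
+ *
+ * retry:
+ *	read_seqbegin_or_lock(&lock, &seq);
+ *	// ... read the protected data ...
+ *	if (need_seqretry(&lock, seq)) {
+ *		seq = 1;	// second pass takes the lock
+ *		goto retry;
+ *	}
+ *	done_seqretry(&lock, seq);
+ *
+ * The first pass runs locklessly with an even seq; only if it raced with
+ * a writer does the retry, with seq forced odd, take the spinlock.
+ */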
+
+static inline void read_seqlock_excl_bh(seqlock_t *sl)
+{
+ spin_lock_bh(&sl->lock);
+}
+
+static inline void read_sequnlock_excl_bh(seqlock_t *sl)
+{
+ spin_unlock_bh(&sl->lock);
+}
+
+static inline void read_seqlock_excl_irq(seqlock_t *sl)
+{
+ spin_lock_irq(&sl->lock);
+}
+
+static inline void read_sequnlock_excl_irq(seqlock_t *sl)
+{
+ spin_unlock_irq(&sl->lock);
+}
+
+static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&sl->lock, flags);
+ return flags;
+}
+
+#define read_seqlock_excl_irqsave(lock, flags) \
+ do { flags = __read_seqlock_excl_irqsave(lock); } while (0)
+
+static inline void
+read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
+{
+ spin_unlock_irqrestore(&sl->lock, flags);
+}
+
+static inline unsigned long
+read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
+{
+ unsigned long flags = 0;
+
+ if (!(*seq & 1)) /* Even */
+ *seq = read_seqbegin(lock);
+ else /* Odd */
+ read_seqlock_excl_irqsave(lock, flags);
+
+ return flags;
+}
+
+static inline void
+done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
+{
+ if (seq & 1)
+ read_sequnlock_excl_irqrestore(lock, flags);
+}
+#endif /* __LINUX_SEQLOCK_H */
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
new file mode 100644
index 0000000..baa36b1
--- /dev/null
+++ b/include/linux/shrinker.h
@@ -0,0 +1,25 @@
+#ifndef __TOOLS_LINUX_SHRINKER_H
+#define __TOOLS_LINUX_SHRINKER_H
+
+struct shrink_control {
+ gfp_t gfp_mask;
+ unsigned long nr_to_scan;
+};
+
+#define SHRINK_STOP (~0UL)
+
+struct shrinker {
+ unsigned long (*count_objects)(struct shrinker *,
+ struct shrink_control *sc);
+ unsigned long (*scan_objects)(struct shrinker *,
+ struct shrink_control *sc);
+
+ int seeks; /* seeks to recreate an obj */
+ long batch; /* reclaim batch size, 0 = default */
+ struct list_head list;
+};
+
+static inline int register_shrinker(struct shrinker *shrinker) { return 0; }
+static inline void unregister_shrinker(struct shrinker *shrinker) {}
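+
+/*
+ * Illustrative sketch of how a cache would fill this in (the my_cache_*
+ * names are hypothetical; in this userspace shim register_shrinker() is
+ * a no-op, so the callbacks are never invoked):
+ *
+ *	static unsigned long my_cache_count(struct shrinker *s,
+ *					    struct shrink_control *sc)
+ *	{
+ *		return my_cache_nr_objects;
+ *	}
+ *
+ *	static unsigned long my_cache_scan(struct shrinker *s,
+ *					   struct shrink_control *sc)
+ *	{
+ *		return my_cache_evict(sc->nr_to_scan);	// nr freed, or SHRINK_STOP
+ *	}
+ *
+ *	static struct shrinker my_shrinker = {
+ *		.count_objects	= my_cache_count,
+ *		.scan_objects	= my_cache_scan,
+ *		.seeks		= 1,
+ *	};
+ *
+ *	register_shrinker(&my_shrinker);
+ */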
+
+#endif /* __TOOLS_LINUX_SHRINKER_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
new file mode 100644
index 0000000..58fb73e
--- /dev/null
+++ b/include/linux/slab.h
@@ -0,0 +1,106 @@
+#ifndef __TOOLS_LINUX_SLAB_H
+#define __TOOLS_LINUX_SLAB_H
+
+#include <malloc.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <linux/kernel.h>
+#include <linux/page.h>
+#include <linux/types.h>
+
+#define ARCH_KMALLOC_MINALIGN 16
+#define KMALLOC_MAX_SIZE SIZE_MAX
+
+static inline void *kmalloc(size_t size, gfp_t flags)
+{
+ void *p = malloc(size);
+
+ if (p && (flags & __GFP_ZERO))
+ memset(p, 0, size);
+
+ return p;
+}
+
+static inline void *krealloc(void *old, size_t size, gfp_t flags)
+{
+ void *new = kmalloc(size, flags);
+
+ if (new && (flags & __GFP_ZERO))
+ memset(new, 0, size);
+
+ if (new) {
+ memcpy(new, old,
+ min(malloc_usable_size(old),
+ malloc_usable_size(new)));
+ free(old);
+ }
+
+ return new;
+}
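+
+/*
+ * Usage sketch (illustrative): these wrappers map straight onto libc, so
+ * only __GFP_ZERO has an effect here; every other gfp flag is accepted
+ * and ignored.
+ *
+ *	struct foo *f = kmalloc(sizeof(*f), GFP_KERNEL | __GFP_ZERO);
+ *	...
+ *	f = krealloc(f, 2 * sizeof(*f), GFP_KERNEL);
+ *	kfree(f);
+ */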
+
+#define kzalloc(size, flags) calloc(1, size)
+#define kcalloc(n, size, flags) calloc(n, size)
+#define kmalloc_array(n, size, flags) calloc(n, size)
+
+#define vmalloc(size) malloc(size)
+#define vzalloc(size) calloc(1, size)
+
+#define kfree(p) free(p)
+#define kvfree(p) free(p)
+#define kzfree(p) free(p)
+
+static inline struct page *alloc_pages(gfp_t flags, unsigned int order)
+{
+ size_t size = PAGE_SIZE << order;
+ void *p = memalign(PAGE_SIZE, size);
+
+ if (p && (flags & __GFP_ZERO))
+ memset(p, 0, size);
+
+ return p;
+}
+
+#define alloc_page(gfp) alloc_pages(gfp, 0)
+
+#define __get_free_pages(gfp, order) ((unsigned long) alloc_pages(gfp, order))
+#define __get_free_page(gfp) __get_free_pages(gfp, 0)
+
+#define __free_pages(page, order) \
+do { \
+ (void) order; \
+ free(page); \
+} while (0)
+
+#define free_pages(addr, order) \
+do { \
+ (void) order; \
+ free((void *) (addr)); \
+} while (0)
+
+#define __free_page(page) __free_pages((page), 0)
+#define free_page(addr) free_pages((addr), 0)
+
+#define VM_IOREMAP 0x00000001 /* ioremap() and friends */
+#define VM_ALLOC 0x00000002 /* vmalloc() */
+#define VM_MAP 0x00000004 /* vmap()ed pages */
+#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */
+#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
+#define VM_NO_GUARD 0x00000040 /* don't add guard page */
+#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
+
+#define PAGE_KERNEL 0
+
+static inline void vunmap(const void *addr) {}
+
+static inline void *vmap(struct page **pages, unsigned int count,
+ unsigned long flags, unsigned prot)
+{
+ return page_address(pages[0]);
+}
+
+#define is_vmalloc_addr(page) 0
+
+#define vmalloc_to_page(addr) ((struct page *) (addr))
+
+#endif /* __TOOLS_LINUX_SLAB_H */
diff --git a/include/linux/sort.h b/include/linux/sort.h
new file mode 100644
index 0000000..d534da2
--- /dev/null
+++ b/include/linux/sort.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_SORT_H
+#define _LINUX_SORT_H
+
+#include <linux/types.h>
+
+void sort(void *base, size_t num, size_t size,
+ int (*cmp)(const void *, const void *),
+ void (*swap)(void *, void *, int));
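+
+/*
+ * Example comparison callback (illustrative): sort an array of ints in
+ * ascending order; passing NULL for the swap callback normally selects
+ * a generic byte-wise swap.
+ *
+ *	static int cmp_int(const void *a, const void *b)
+ *	{
+ *		return *(const int *)a - *(const int *)b;
+ *	}
+ *
+ *	sort(array, nr, sizeof(array[0]), cmp_int, NULL);
+ */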
+
+#endif
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
new file mode 100644
index 0000000..0fa79a3
--- /dev/null
+++ b/include/linux/spinlock.h
@@ -0,0 +1,60 @@
+#ifndef __TOOLS_LINUX_SPINLOCK_H
+#define __TOOLS_LINUX_SPINLOCK_H
+
+#include <linux/atomic.h>
+
+typedef struct {
+ int count;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .count = 0 }
+
+static inline void raw_spin_lock_init(raw_spinlock_t *lock)
+{
+ smp_store_release(&lock->count, 0);
+}
+
+static inline void raw_spin_lock(raw_spinlock_t *lock)
+{
+ while (xchg_acquire(&lock->count, 1))
+ ;
+}
+
+static inline void raw_spin_unlock(raw_spinlock_t *lock)
+{
+ smp_store_release(&lock->count, 0);
+}
+
+#define raw_spin_lock_irq(lock) raw_spin_lock(lock)
+#define raw_spin_unlock_irq(lock) raw_spin_unlock(lock)
+
+#define raw_spin_lock_irqsave(lock, flags) \
+do { \
+ (void) flags; \
+ raw_spin_lock(lock); \
+} while (0)
+
+#define raw_spin_unlock_irqrestore(lock, flags) raw_spin_unlock(lock)
+
+typedef raw_spinlock_t spinlock_t;
+
+#define __SPIN_LOCK_UNLOCKED(name) __RAW_SPIN_LOCK_UNLOCKED(name)
+
+#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
+
+#define spin_lock_init(lock) raw_spin_lock_init(lock)
+#define spin_lock(lock) raw_spin_lock(lock)
+#define spin_unlock(lock) raw_spin_unlock(lock)
+
+#define spin_lock_nested(lock, n) spin_lock(lock)
+
+#define spin_lock_bh(lock) raw_spin_lock(lock)
+#define spin_unlock_bh(lock) raw_spin_unlock(lock)
+
+#define spin_lock_irq(lock) raw_spin_lock(lock)
+#define spin_unlock_irq(lock) raw_spin_unlock(lock)
+
+#define spin_lock_irqsave(lock, flags) raw_spin_lock_irqsave(lock, flags)
+#define spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags)
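+
+/*
+ * Usage sketch (illustrative): in this userspace shim every variant is
+ * the same test-and-set lock, and the irq/bh flavours do not actually
+ * save or disable anything.
+ *
+ *	static DEFINE_SPINLOCK(foo_lock);
+ *
+ *	spin_lock(&foo_lock);
+ *	// ... touch data shared with other threads ...
+ *	spin_unlock(&foo_lock);
+ */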
+
+#endif /* __TOOLS_LINUX_SPINLOCK_H */
diff --git a/include/linux/stat.h b/include/linux/stat.h
new file mode 100644
index 0000000..d5ded25
--- /dev/null
+++ b/include/linux/stat.h
@@ -0,0 +1,15 @@
+#ifndef _LINUX_STAT_H
+#define _LINUX_STAT_H
+
+#include <sys/stat.h>
+
+#define S_IRWXUGO (S_IRWXU|S_IRWXG|S_IRWXO)
+#define S_IALLUGO (S_ISUID|S_ISGID|S_ISVTX|S_IRWXUGO)
+#define S_IRUGO (S_IRUSR|S_IRGRP|S_IROTH)
+#define S_IWUGO (S_IWUSR|S_IWGRP|S_IWOTH)
+#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
+
+#define UTIME_NOW ((1l << 30) - 1l)
+#define UTIME_OMIT ((1l << 30) - 2l)
+
+#endif
diff --git a/include/linux/string.h b/include/linux/string.h
new file mode 100644
index 0000000..abc191e
--- /dev/null
+++ b/include/linux/string.h
@@ -0,0 +1,15 @@
+#ifndef _TOOLS_LINUX_STRING_H_
+#define _TOOLS_LINUX_STRING_H_
+
+#include <stdlib.h>
+#include <string.h>
+#include <linux/types.h> /* for size_t */
+
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+extern char *skip_spaces(const char *);
+extern char *strim(char *);
+extern void memzero_explicit(void *, size_t);
+
+#define kstrndup(s, n, gfp) strndup(s, n)
+
+#endif /* _TOOLS_LINUX_STRING_H_ */
diff --git a/include/linux/stringify.h b/include/linux/stringify.h
new file mode 100644
index 0000000..841cec8
--- /dev/null
+++ b/include/linux/stringify.h
@@ -0,0 +1,12 @@
+#ifndef __LINUX_STRINGIFY_H
+#define __LINUX_STRINGIFY_H
+
+/* Indirect stringification. Doing two levels allows the parameter to be a
+ * macro itself. For example, compile with -DFOO=bar, __stringify(FOO)
+ * converts to "bar".
+ */
+
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
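+
+/*
+ * For example (illustrative):
+ *
+ *	#define FOO bar
+ *	__stringify_1(FOO)	// expands to "FOO"
+ *	__stringify(FOO)	// expands to "bar"
+ */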
+
+#endif /* !__LINUX_STRINGIFY_H */
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
new file mode 100644
index 0000000..0be85b7
--- /dev/null
+++ b/include/linux/sysfs.h
@@ -0,0 +1,36 @@
+#ifndef _SYSFS_H_
+#define _SYSFS_H_
+
+#include <linux/compiler.h>
+#include <linux/stringify.h>
+
+struct kobject;
+
+struct attribute {
+ const char *name;
+ umode_t mode;
+};
+
+#define __ATTR(_name, _mode, _show, _store) { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+}
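+
+/*
+ * Illustrative use: __ATTR() just builds an initializer, so it works
+ * with any attribute wrapper that has .attr/.show/.store members (the
+ * foo_* names below are hypothetical):
+ *
+ *	struct foo_attribute {
+ *		struct attribute attr;
+ *		ssize_t (*show)(struct foo *, char *);
+ *		ssize_t (*store)(struct foo *, const char *, size_t);
+ *	};
+ *
+ *	static struct foo_attribute foo_size =
+ *		__ATTR(size, 0444, foo_size_show, NULL);
+ */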
+
+static inline int sysfs_create_files(struct kobject *kobj,
+ const struct attribute **attr)
+{
+ return 0;
+}
+
+static inline int sysfs_create_link(struct kobject *kobj,
+ struct kobject *target, const char *name)
+{
+ return 0;
+}
+
+static inline void sysfs_remove_link(struct kobject *kobj, const char *name)
+{
+}
+
+#endif /* _SYSFS_H_ */
diff --git a/include/linux/time64.h b/include/linux/time64.h
new file mode 100644
index 0000000..9d8a3ef
--- /dev/null
+++ b/include/linux/time64.h
@@ -0,0 +1,194 @@
+#ifndef _LINUX_TIME64_H
+#define _LINUX_TIME64_H
+
+#include <linux/types.h>
+
+typedef __s64 time64_t;
+
+/*
+ * This wants to go into uapi/linux/time.h once we have agreed on the
+ * userspace interfaces.
+ */
+#if __BITS_PER_LONG == 64
+# define timespec64 timespec
+#else
+struct timespec64 {
+ time64_t tv_sec; /* seconds */
+ long tv_nsec; /* nanoseconds */
+};
+
+struct itimerspec64 {
+ struct timespec64 it_interval;
+ struct timespec64 it_value;
+};
+
+#endif
+
+/* Parameters used to convert the timespec values: */
+#define MSEC_PER_SEC 1000L
+#define USEC_PER_MSEC 1000L
+#define NSEC_PER_USEC 1000L
+#define NSEC_PER_MSEC 1000000L
+#define USEC_PER_SEC 1000000L
+#define NSEC_PER_SEC 1000000000L
+#define FSEC_PER_SEC 1000000000000000LL
+
+/* Located here for timespec[64]_valid_strict */
+#define TIME64_MAX ((s64)~((u64)1 << 63))
+#define KTIME_MAX ((s64)~((u64)1 << 63))
+#define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC)
+
+#if __BITS_PER_LONG == 64
+
+static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
+{
+ return ts64;
+}
+
+static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
+{
+ return ts;
+}
+
+# define timespec64_equal timespec_equal
+# define timespec64_compare timespec_compare
+# define set_normalized_timespec64 set_normalized_timespec
+# define timespec64_add timespec_add
+# define timespec64_sub timespec_sub
+# define timespec64_valid timespec_valid
+# define timespec64_valid_strict timespec_valid_strict
+# define timespec64_to_ns timespec_to_ns
+# define ns_to_timespec64 ns_to_timespec
+# define timespec64_add_ns timespec_add_ns
+
+#else
+
+static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
+{
+ struct timespec ret;
+
+ ret.tv_sec = (time_t)ts64.tv_sec;
+ ret.tv_nsec = ts64.tv_nsec;
+ return ret;
+}
+
+static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
+{
+ struct timespec64 ret;
+
+ ret.tv_sec = ts.tv_sec;
+ ret.tv_nsec = ts.tv_nsec;
+ return ret;
+}
+
+static inline int timespec64_equal(const struct timespec64 *a,
+ const struct timespec64 *b)
+{
+ return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec);
+}
+
+/*
+ * lhs < rhs: return <0
+ * lhs == rhs: return 0
+ * lhs > rhs: return >0
+ */
+static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs)
+{
+ if (lhs->tv_sec < rhs->tv_sec)
+ return -1;
+ if (lhs->tv_sec > rhs->tv_sec)
+ return 1;
+ return lhs->tv_nsec - rhs->tv_nsec;
+}
+
+extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);
+
+static inline struct timespec64 timespec64_add(struct timespec64 lhs,
+ struct timespec64 rhs)
+{
+ struct timespec64 ts_delta;
+ set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec,
+ lhs.tv_nsec + rhs.tv_nsec);
+ return ts_delta;
+}
+
+/*
+ * sub = lhs - rhs, in normalized form
+ */
+static inline struct timespec64 timespec64_sub(struct timespec64 lhs,
+ struct timespec64 rhs)
+{
+ struct timespec64 ts_delta;
+ set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec,
+ lhs.tv_nsec - rhs.tv_nsec);
+ return ts_delta;
+}
+
+/*
+ * Returns true if the timespec64 is normalized, false if denormalized.
+ */
+static inline bool timespec64_valid(const struct timespec64 *ts)
+{
+ /* Dates before 1970 are bogus */
+ if (ts->tv_sec < 0)
+ return false;
+ /* Can't have more nanoseconds than a second */
+ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+ return false;
+ return true;
+}
+
+static inline bool timespec64_valid_strict(const struct timespec64 *ts)
+{
+ if (!timespec64_valid(ts))
+ return false;
+ /* Disallow values that could overflow ktime_t */
+ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX)
+ return false;
+ return true;
+}
+
+/**
+ * timespec64_to_ns - Convert timespec64 to nanoseconds
+ * @ts: pointer to the timespec64 variable to be converted
+ *
+ * Returns the scalar nanosecond representation of the timespec64
+ * parameter.
+ */
+static inline s64 timespec64_to_ns(const struct timespec64 *ts)
+{
+ return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
+}
+
+/**
+ * ns_to_timespec64 - Convert nanoseconds to timespec64
+ * @nsec: the nanoseconds value to be converted
+ *
+ * Returns the timespec64 representation of the nsec parameter.
+ */
+extern struct timespec64 ns_to_timespec64(const s64 nsec);
+
+/**
+ * timespec64_add_ns - Adds nanoseconds to a timespec64
+ * @a: pointer to timespec64 to be incremented
+ * @ns: unsigned nanoseconds value to be added
+ *
+ * This must always be inlined because it's used from the x86-64 vdso,
+ * which cannot call other kernel functions.
+ */
+static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
+{
+ a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns);
+ a->tv_nsec = ns;
+}
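+
+/*
+ * Worked example: timespec64_add_ns(&ts, 1700000000) on a value of
+ * { .tv_sec = 5, .tv_nsec = 600000000 } has 2300000000 ns to place,
+ * so tv_sec becomes 7 and tv_nsec becomes 300000000.
+ */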
+
+#endif
+
+/*
+ * timespec64_add_safe assumes both values are positive and checks for
+ * overflow. It will return TIME64_MAX in case of overflow.
+ */
+extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+ const struct timespec64 rhs);
+
+#endif /* _LINUX_TIME64_H */
diff --git a/include/linux/timer.h b/include/linux/timer.h
new file mode 100644
index 0000000..363f26a
--- /dev/null
+++ b/include/linux/timer.h
@@ -0,0 +1,50 @@
+#ifndef __TOOLS_LINUX_TIMER_H
+#define __TOOLS_LINUX_TIMER_H
+
+#include <string.h>
+#include <linux/types.h>
+
+struct timer_list {
+ unsigned long expires;
+ void (*function)(unsigned long);
+ unsigned long data;
+ bool pending;
+};
+
+static inline void init_timer(struct timer_list *timer)
+{
+ memset(timer, 0, sizeof(*timer));
+}
+
+#define __init_timer(_timer, _flags) init_timer(_timer)
+
+#define __setup_timer(_timer, _fn, _data, _flags) \
+ do { \
+ __init_timer((_timer), (_flags)); \
+ (_timer)->function = (_fn); \
+ (_timer)->data = (_data); \
+ } while (0)
+
+#define setup_timer(timer, fn, data) \
+ __setup_timer((timer), (fn), (data), 0)
+
+static inline int timer_pending(const struct timer_list *timer)
+{
+ return timer->pending;
+}
+
+int del_timer(struct timer_list *timer);
+int del_timer_sync(struct timer_list *timer);
+
+int mod_timer(struct timer_list *timer, unsigned long expires);
+//extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
+
+static inline void add_timer(struct timer_list *timer)
+{
+ BUG_ON(timer_pending(timer));
+ mod_timer(timer, timer->expires);
+}
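+
+/*
+ * Usage sketch (illustrative): arm a one-shot timer; the foo_* names and
+ * the embedded timer field are hypothetical.
+ *
+ *	static void foo_timeout(unsigned long data)
+ *	{
+ *		struct foo *f = (struct foo *) data;
+ *		// ... handle the expiry ...
+ *	}
+ *
+ *	setup_timer(&f->timer, foo_timeout, (unsigned long) f);
+ *	mod_timer(&f->timer, jiffies + HZ);
+ *	...
+ *	del_timer_sync(&f->timer);
+ */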
+
+void flush_timers(void);
+
+#endif /* __TOOLS_LINUX_TIMER_H */
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
new file mode 100644
index 0000000..1686cb9
--- /dev/null
+++ b/include/linux/tracepoint.h
@@ -0,0 +1,62 @@
+#ifndef __TOOLS_LINUX_TRACEPOINT_H
+#define __TOOLS_LINUX_TRACEPOINT_H
+
+#define PARAMS(args...) args
+
+#define TP_PROTO(args...) args
+#define TP_ARGS(args...) args
+#define TP_CONDITION(args...) args
+
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
+ static inline void trace_##name(proto) \
+ { } \
+ static inline void trace_##name##_rcuidle(proto) \
+ { } \
+ static inline int \
+ register_trace_##name(void (*probe)(data_proto), \
+ void *data) \
+ { \
+ return -ENOSYS; \
+ } \
+ static inline int \
+ unregister_trace_##name(void (*probe)(data_proto), \
+ void *data) \
+ { \
+ return -ENOSYS; \
+ } \
+ static inline void check_trace_callback_type_##name(void (*cb)(data_proto)) \
+ { \
+ } \
+ static inline bool \
+ trace_##name##_enabled(void) \
+ { \
+ return false; \
+ }
+
+#define DEFINE_TRACE_FN(name, reg, unreg)
+#define DEFINE_TRACE(name)
+#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
+#define EXPORT_TRACEPOINT_SYMBOL(name)
+
+#define DECLARE_TRACE_NOARGS(name) \
+ __DECLARE_TRACE(name, void, , \
+ cpu_online(raw_smp_processor_id()), \
+ void *__data, __data)
+
+#define DECLARE_TRACE(name, proto, args) \
+ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
+ cpu_online(raw_smp_processor_id()), \
+ PARAMS(void *__data, proto), \
+ PARAMS(__data, args))
+
+#define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print)
+#define DEFINE_EVENT(template, name, proto, args) \
+ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)\
+ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
+ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define TRACE_EVENT(name, proto, args, struct, assign, print) \
+ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
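+
+/*
+ * Everything above compiles tracepoints down to empty inlines, so a
+ * declaration like the (illustrative) one below still builds and its
+ * call sites become no-ops:
+ *
+ *	DECLARE_TRACE(foo_read,
+ *		TP_PROTO(unsigned sector, unsigned nr),
+ *		TP_ARGS(sector, nr));
+ *
+ *	trace_foo_read(sector, nr);	// expands to an empty inline
+ */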
+
+#endif /* __TOOLS_LINUX_TRACEPOINT_H */
diff --git a/include/linux/typecheck.h b/include/linux/typecheck.h
new file mode 100644
index 0000000..eb5b74a
--- /dev/null
+++ b/include/linux/typecheck.h
@@ -0,0 +1,24 @@
+#ifndef TYPECHECK_H_INCLUDED
+#define TYPECHECK_H_INCLUDED
+
+/*
+ * Check at compile time that something is of a particular type.
+ * Always evaluates to 1 so you may use it easily in comparisons.
+ */
+#define typecheck(type,x) \
+({ type __dummy; \
+ typeof(x) __dummy2; \
+ (void)(&__dummy == &__dummy2); \
+ 1; \
+})
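+
+/*
+ * For example (illustrative), this is how flag-saving macros can insist
+ * on an unsigned long:
+ *
+ *	unsigned long flags;
+ *	typecheck(unsigned long, flags);	// OK, evaluates to 1
+ *	typecheck(unsigned long, 42);		// warns: int vs unsigned long
+ */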
+
+/*
+ * Check at compile time that 'function' is a certain type, or is a pointer
+ * to that type (needs to use typedef for the function type.)
+ */
+#define typecheck_fn(type,function) \
+({ typeof(type) __tmp = function; \
+ (void)__tmp; \
+})
+
+#endif /* TYPECHECK_H_INCLUDED */
diff --git a/include/linux/types.h b/include/linux/types.h
new file mode 100644
index 0000000..ddc8eca
--- /dev/null
+++ b/include/linux/types.h
@@ -0,0 +1,98 @@
+#ifndef _TOOLS_LINUX_TYPES_H_
+#define _TOOLS_LINUX_TYPES_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <sys/types.h>
+
+#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
+#include <asm/types.h>
+
+#define BITS_PER_LONG __BITS_PER_LONG
+
+struct page;
+struct kmem_cache;
+
+typedef unsigned long pgoff_t;
+
+typedef unsigned short umode_t;
+
+typedef unsigned gfp_t;
+
+#define GFP_KERNEL 0
+#define GFP_ATOMIC 0
+#define GFP_NOFS 0
+#define GFP_NOIO 0
+#define GFP_NOWAIT 0
+#define __GFP_IO 0
+#define __GFP_NOWARN 0
+#define __GFP_NORETRY 0
+#define __GFP_ZERO 1
+
+#define PAGE_ALLOC_COSTLY_ORDER 6
+
+typedef __u64 u64;
+typedef __s64 s64;
+typedef __u32 u32;
+typedef __s32 s32;
+typedef __u16 u16;
+typedef __s16 s16;
+typedef __u8 u8;
+typedef __s8 s8;
+
+#ifdef __CHECKER__
+#define __bitwise__ __attribute__((bitwise))
+#else
+#define __bitwise__
+#endif
+#ifdef __CHECK_ENDIAN__
+#define __bitwise __bitwise__
+#else
+#define __bitwise
+#endif
+
+#define __force
+#define __user
+#define __must_check
+#define __cold
+
+typedef __u16 __bitwise __le16;
+typedef __u16 __bitwise __be16;
+typedef __u32 __bitwise __le32;
+typedef __u32 __bitwise __be32;
+typedef __u64 __bitwise __le64;
+typedef __u64 __bitwise __be64;
+
+#ifndef __aligned_u64
+# define __aligned_u64 __u64 __attribute__((aligned(8)))
+#endif
+
+typedef u64 sector_t;
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+struct hlist_head {
+ struct hlist_node *first;
+};
+
+struct hlist_node {
+ struct hlist_node *next, **pprev;
+};
+
+struct callback_head {
+ struct callback_head *next;
+ void (*func)(struct callback_head *head);
+} __attribute__((aligned(sizeof(void *))));
+
+#if 0
+#define rcu_head callback_head
+
+typedef void (*rcu_callback_t)(struct rcu_head *head);
+typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);
+#endif
+
+#endif /* _TOOLS_LINUX_TYPES_H_ */
diff --git a/include/linux/unaligned/access_ok.h b/include/linux/unaligned/access_ok.h
new file mode 100644
index 0000000..33383ca
--- /dev/null
+++ b/include/linux/unaligned/access_ok.h
@@ -0,0 +1,67 @@
+#ifndef _LINUX_UNALIGNED_ACCESS_OK_H
+#define _LINUX_UNALIGNED_ACCESS_OK_H
+
+#include <linux/kernel.h>
+#include <asm/byteorder.h>
+
+static __always_inline u16 get_unaligned_le16(const void *p)
+{
+ return le16_to_cpup((__le16 *)p);
+}
+
+static __always_inline u32 get_unaligned_le32(const void *p)
+{
+ return le32_to_cpup((__le32 *)p);
+}
+
+static __always_inline u64 get_unaligned_le64(const void *p)
+{
+ return le64_to_cpup((__le64 *)p);
+}
+
+static __always_inline u16 get_unaligned_be16(const void *p)
+{
+ return be16_to_cpup((__be16 *)p);
+}
+
+static __always_inline u32 get_unaligned_be32(const void *p)
+{
+ return be32_to_cpup((__be32 *)p);
+}
+
+static __always_inline u64 get_unaligned_be64(const void *p)
+{
+ return be64_to_cpup((__be64 *)p);
+}
+
+static __always_inline void put_unaligned_le16(u16 val, void *p)
+{
+ *((__le16 *)p) = cpu_to_le16(val);
+}
+
+static __always_inline void put_unaligned_le32(u32 val, void *p)
+{
+ *((__le32 *)p) = cpu_to_le32(val);
+}
+
+static __always_inline void put_unaligned_le64(u64 val, void *p)
+{
+ *((__le64 *)p) = cpu_to_le64(val);
+}
+
+static __always_inline void put_unaligned_be16(u16 val, void *p)
+{
+ *((__be16 *)p) = cpu_to_be16(val);
+}
+
+static __always_inline void put_unaligned_be32(u32 val, void *p)
+{
+ *((__be32 *)p) = cpu_to_be32(val);
+}
+
+static __always_inline void put_unaligned_be64(u64 val, void *p)
+{
+ *((__be64 *)p) = cpu_to_be64(val);
+}
+
+#endif /* _LINUX_UNALIGNED_ACCESS_OK_H */
diff --git a/include/linux/unaligned/be_byteshift.h b/include/linux/unaligned/be_byteshift.h
new file mode 100644
index 0000000..9356b24
--- /dev/null
+++ b/include/linux/unaligned/be_byteshift.h
@@ -0,0 +1,70 @@
+#ifndef _LINUX_UNALIGNED_BE_BYTESHIFT_H
+#define _LINUX_UNALIGNED_BE_BYTESHIFT_H
+
+#include <linux/types.h>
+
+static inline u16 __get_unaligned_be16(const u8 *p)
+{
+ return p[0] << 8 | p[1];
+}
+
+static inline u32 __get_unaligned_be32(const u8 *p)
+{
+ return p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
+}
+
+static inline u64 __get_unaligned_be64(const u8 *p)
+{
+ return (u64)__get_unaligned_be32(p) << 32 |
+ __get_unaligned_be32(p + 4);
+}
+
+static inline void __put_unaligned_be16(u16 val, u8 *p)
+{
+ *p++ = val >> 8;
+ *p++ = val;
+}
+
+static inline void __put_unaligned_be32(u32 val, u8 *p)
+{
+ __put_unaligned_be16(val >> 16, p);
+ __put_unaligned_be16(val, p + 2);
+}
+
+static inline void __put_unaligned_be64(u64 val, u8 *p)
+{
+ __put_unaligned_be32(val >> 32, p);
+ __put_unaligned_be32(val, p + 4);
+}
+
+static inline u16 get_unaligned_be16(const void *p)
+{
+ return __get_unaligned_be16((const u8 *)p);
+}
+
+static inline u32 get_unaligned_be32(const void *p)
+{
+ return __get_unaligned_be32((const u8 *)p);
+}
+
+static inline u64 get_unaligned_be64(const void *p)
+{
+ return __get_unaligned_be64((const u8 *)p);
+}
+
+static inline void put_unaligned_be16(u16 val, void *p)
+{
+ __put_unaligned_be16(val, p);
+}
+
+static inline void put_unaligned_be32(u32 val, void *p)
+{
+ __put_unaligned_be32(val, p);
+}
+
+static inline void put_unaligned_be64(u64 val, void *p)
+{
+ __put_unaligned_be64(val, p);
+}
+
+#endif /* _LINUX_UNALIGNED_BE_BYTESHIFT_H */
diff --git a/include/linux/unaligned/be_memmove.h b/include/linux/unaligned/be_memmove.h
new file mode 100644
index 0000000..c2a76c5
--- /dev/null
+++ b/include/linux/unaligned/be_memmove.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_UNALIGNED_BE_MEMMOVE_H
+#define _LINUX_UNALIGNED_BE_MEMMOVE_H
+
+#include <linux/unaligned/memmove.h>
+
+static inline u16 get_unaligned_be16(const void *p)
+{
+ return __get_unaligned_memmove16((const u8 *)p);
+}
+
+static inline u32 get_unaligned_be32(const void *p)
+{
+ return __get_unaligned_memmove32((const u8 *)p);
+}
+
+static inline u64 get_unaligned_be64(const void *p)
+{
+ return __get_unaligned_memmove64((const u8 *)p);
+}
+
+static inline void put_unaligned_be16(u16 val, void *p)
+{
+ __put_unaligned_memmove16(val, p);
+}
+
+static inline void put_unaligned_be32(u32 val, void *p)
+{
+ __put_unaligned_memmove32(val, p);
+}
+
+static inline void put_unaligned_be64(u64 val, void *p)
+{
+ __put_unaligned_memmove64(val, p);
+}
+
+#endif /* _LINUX_UNALIGNED_LE_MEMMOVE_H */
diff --git a/include/linux/unaligned/be_struct.h b/include/linux/unaligned/be_struct.h
new file mode 100644
index 0000000..1324158
--- /dev/null
+++ b/include/linux/unaligned/be_struct.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_UNALIGNED_BE_STRUCT_H
+#define _LINUX_UNALIGNED_BE_STRUCT_H
+
+#include <linux/unaligned/packed_struct.h>
+
+static inline u16 get_unaligned_be16(const void *p)
+{
+ return __get_unaligned_cpu16((const u8 *)p);
+}
+
+static inline u32 get_unaligned_be32(const void *p)
+{
+ return __get_unaligned_cpu32((const u8 *)p);
+}
+
+static inline u64 get_unaligned_be64(const void *p)
+{
+ return __get_unaligned_cpu64((const u8 *)p);
+}
+
+static inline void put_unaligned_be16(u16 val, void *p)
+{
+ __put_unaligned_cpu16(val, p);
+}
+
+static inline void put_unaligned_be32(u32 val, void *p)
+{
+ __put_unaligned_cpu32(val, p);
+}
+
+static inline void put_unaligned_be64(u64 val, void *p)
+{
+ __put_unaligned_cpu64(val, p);
+}
+
+#endif /* _LINUX_UNALIGNED_BE_STRUCT_H */
diff --git a/include/linux/unaligned/generic.h b/include/linux/unaligned/generic.h
new file mode 100644
index 0000000..02d97ff
--- /dev/null
+++ b/include/linux/unaligned/generic.h
@@ -0,0 +1,68 @@
+#ifndef _LINUX_UNALIGNED_GENERIC_H
+#define _LINUX_UNALIGNED_GENERIC_H
+
+/*
+ * Cause a link-time error if we try an unaligned access other than
+ * 1,2,4 or 8 bytes long
+ */
+extern void __bad_unaligned_access_size(void);
+
+#define __get_unaligned_le(ptr) ((__force typeof(*(ptr)))({ \
+ __builtin_choose_expr(sizeof(*(ptr)) == 1, *(ptr), \
+ __builtin_choose_expr(sizeof(*(ptr)) == 2, get_unaligned_le16((ptr)), \
+ __builtin_choose_expr(sizeof(*(ptr)) == 4, get_unaligned_le32((ptr)), \
+ __builtin_choose_expr(sizeof(*(ptr)) == 8, get_unaligned_le64((ptr)), \
+ __bad_unaligned_access_size())))); \
+ }))
+
+#define __get_unaligned_be(ptr) ((__force typeof(*(ptr)))({ \
+ __builtin_choose_expr(sizeof(*(ptr)) == 1, *(ptr), \
+ __builtin_choose_expr(sizeof(*(ptr)) == 2, get_unaligned_be16((ptr)), \
+ __builtin_choose_expr(sizeof(*(ptr)) == 4, get_unaligned_be32((ptr)), \
+ __builtin_choose_expr(sizeof(*(ptr)) == 8, get_unaligned_be64((ptr)), \
+ __bad_unaligned_access_size())))); \
+ }))
+
+#define __put_unaligned_le(val, ptr) ({ \
+ void *__gu_p = (ptr); \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ *(u8 *)__gu_p = (__force u8)(val); \
+ break; \
+ case 2: \
+ put_unaligned_le16((__force u16)(val), __gu_p); \
+ break; \
+ case 4: \
+ put_unaligned_le32((__force u32)(val), __gu_p); \
+ break; \
+ case 8: \
+ put_unaligned_le64((__force u64)(val), __gu_p); \
+ break; \
+ default: \
+ __bad_unaligned_access_size(); \
+ break; \
+ } \
+ (void)0; })
+
+#define __put_unaligned_be(val, ptr) ({ \
+ void *__gu_p = (ptr); \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ *(u8 *)__gu_p = (__force u8)(val); \
+ break; \
+ case 2: \
+ put_unaligned_be16((__force u16)(val), __gu_p); \
+ break; \
+ case 4: \
+ put_unaligned_be32((__force u32)(val), __gu_p); \
+ break; \
+ case 8: \
+ put_unaligned_be64((__force u64)(val), __gu_p); \
+ break; \
+ default: \
+ __bad_unaligned_access_size(); \
+ break; \
+ } \
+ (void)0; })
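+
+/*
+ * Usage sketch (illustrative): the size-generic forms dispatch on the
+ * pointee size, so a possibly misaligned 32-bit little-endian field can
+ * be accessed as:
+ *
+ *	u32 *field = (u32 *) &buf[off];
+ *	u32 v = __get_unaligned_le(field);
+ *
+ *	__put_unaligned_le(v + 1, field);
+ */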
+
+#endif /* _LINUX_UNALIGNED_GENERIC_H */
diff --git a/include/linux/unaligned/le_byteshift.h b/include/linux/unaligned/le_byteshift.h
new file mode 100644
index 0000000..be376fb
--- /dev/null
+++ b/include/linux/unaligned/le_byteshift.h
@@ -0,0 +1,70 @@
+#ifndef _LINUX_UNALIGNED_LE_BYTESHIFT_H
+#define _LINUX_UNALIGNED_LE_BYTESHIFT_H
+
+#include <linux/types.h>
+
+static inline u16 __get_unaligned_le16(const u8 *p)
+{
+ return p[0] | p[1] << 8;
+}
+
+static inline u32 __get_unaligned_le32(const u8 *p)
+{
+ return p[0] | p[1] << 8 | p[2] << 16 | p[3] << 24;
+}
+
+static inline u64 __get_unaligned_le64(const u8 *p)
+{
+ return (u64)__get_unaligned_le32(p + 4) << 32 |
+ __get_unaligned_le32(p);
+}
+
+static inline void __put_unaligned_le16(u16 val, u8 *p)
+{
+ *p++ = val;
+ *p++ = val >> 8;
+}
+
+static inline void __put_unaligned_le32(u32 val, u8 *p)
+{
+ __put_unaligned_le16(val >> 16, p + 2);
+ __put_unaligned_le16(val, p);
+}
+
+static inline void __put_unaligned_le64(u64 val, u8 *p)
+{
+ __put_unaligned_le32(val >> 32, p + 4);
+ __put_unaligned_le32(val, p);
+}
+
+static inline u16 get_unaligned_le16(const void *p)
+{
+ return __get_unaligned_le16((const u8 *)p);
+}
+
+static inline u32 get_unaligned_le32(const void *p)
+{
+ return __get_unaligned_le32((const u8 *)p);
+}
+
+static inline u64 get_unaligned_le64(const void *p)
+{
+ return __get_unaligned_le64((const u8 *)p);
+}
+
+static inline void put_unaligned_le16(u16 val, void *p)
+{
+ __put_unaligned_le16(val, p);
+}
+
+static inline void put_unaligned_le32(u32 val, void *p)
+{
+ __put_unaligned_le32(val, p);
+}
+
+static inline void put_unaligned_le64(u64 val, void *p)
+{
+ __put_unaligned_le64(val, p);
+}
+
+#endif /* _LINUX_UNALIGNED_LE_BYTESHIFT_H */
diff --git a/include/linux/unaligned/le_memmove.h b/include/linux/unaligned/le_memmove.h
new file mode 100644
index 0000000..269849b
--- /dev/null
+++ b/include/linux/unaligned/le_memmove.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_UNALIGNED_LE_MEMMOVE_H
+#define _LINUX_UNALIGNED_LE_MEMMOVE_H
+
+#include <linux/unaligned/memmove.h>
+
+static inline u16 get_unaligned_le16(const void *p)
+{
+ return __get_unaligned_memmove16((const u8 *)p);
+}
+
+static inline u32 get_unaligned_le32(const void *p)
+{
+ return __get_unaligned_memmove32((const u8 *)p);
+}
+
+static inline u64 get_unaligned_le64(const void *p)
+{
+ return __get_unaligned_memmove64((const u8 *)p);
+}
+
+static inline void put_unaligned_le16(u16 val, void *p)
+{
+ __put_unaligned_memmove16(val, p);
+}
+
+static inline void put_unaligned_le32(u32 val, void *p)
+{
+ __put_unaligned_memmove32(val, p);
+}
+
+static inline void put_unaligned_le64(u64 val, void *p)
+{
+ __put_unaligned_memmove64(val, p);
+}
+
+#endif /* _LINUX_UNALIGNED_LE_MEMMOVE_H */
diff --git a/include/linux/unaligned/le_struct.h b/include/linux/unaligned/le_struct.h
new file mode 100644
index 0000000..088c457
--- /dev/null
+++ b/include/linux/unaligned/le_struct.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_UNALIGNED_LE_STRUCT_H
+#define _LINUX_UNALIGNED_LE_STRUCT_H
+
+#include <linux/unaligned/packed_struct.h>
+
+static inline u16 get_unaligned_le16(const void *p)
+{
+ return __get_unaligned_cpu16((const u8 *)p);
+}
+
+static inline u32 get_unaligned_le32(const void *p)
+{
+ return __get_unaligned_cpu32((const u8 *)p);
+}
+
+static inline u64 get_unaligned_le64(const void *p)
+{
+ return __get_unaligned_cpu64((const u8 *)p);
+}
+
+static inline void put_unaligned_le16(u16 val, void *p)
+{
+ __put_unaligned_cpu16(val, p);
+}
+
+static inline void put_unaligned_le32(u32 val, void *p)
+{
+ __put_unaligned_cpu32(val, p);
+}
+
+static inline void put_unaligned_le64(u64 val, void *p)
+{
+ __put_unaligned_cpu64(val, p);
+}
+
+#endif /* _LINUX_UNALIGNED_LE_STRUCT_H */
diff --git a/include/linux/unaligned/memmove.h b/include/linux/unaligned/memmove.h
new file mode 100644
index 0000000..eeb5a77
--- /dev/null
+++ b/include/linux/unaligned/memmove.h
@@ -0,0 +1,45 @@
+#ifndef _LINUX_UNALIGNED_MEMMOVE_H
+#define _LINUX_UNALIGNED_MEMMOVE_H
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+/* Use memmove here, so gcc does not insert a __builtin_memcpy. */
+
+static inline u16 __get_unaligned_memmove16(const void *p)
+{
+ u16 tmp;
+ memmove(&tmp, p, 2);
+ return tmp;
+}
+
+static inline u32 __get_unaligned_memmove32(const void *p)
+{
+ u32 tmp;
+ memmove(&tmp, p, 4);
+ return tmp;
+}
+
+static inline u64 __get_unaligned_memmove64(const void *p)
+{
+ u64 tmp;
+ memmove(&tmp, p, 8);
+ return tmp;
+}
+
+static inline void __put_unaligned_memmove16(u16 val, void *p)
+{
+ memmove(p, &val, 2);
+}
+
+static inline void __put_unaligned_memmove32(u32 val, void *p)
+{
+ memmove(p, &val, 4);
+}
+
+static inline void __put_unaligned_memmove64(u64 val, void *p)
+{
+ memmove(p, &val, 8);
+}
+
+#endif /* _LINUX_UNALIGNED_MEMMOVE_H */
diff --git a/include/linux/unaligned/packed_struct.h b/include/linux/unaligned/packed_struct.h
new file mode 100644
index 0000000..c0d817d
--- /dev/null
+++ b/include/linux/unaligned/packed_struct.h
@@ -0,0 +1,46 @@
+#ifndef _LINUX_UNALIGNED_PACKED_STRUCT_H
+#define _LINUX_UNALIGNED_PACKED_STRUCT_H
+
+#include <linux/kernel.h>
+
+struct __una_u16 { u16 x; } __packed;
+struct __una_u32 { u32 x; } __packed;
+struct __una_u64 { u64 x; } __packed;
+
+static inline u16 __get_unaligned_cpu16(const void *p)
+{
+ const struct __una_u16 *ptr = (const struct __una_u16 *)p;
+ return ptr->x;
+}
+
+static inline u32 __get_unaligned_cpu32(const void *p)
+{
+ const struct __una_u32 *ptr = (const struct __una_u32 *)p;
+ return ptr->x;
+}
+
+static inline u64 __get_unaligned_cpu64(const void *p)
+{
+ const struct __una_u64 *ptr = (const struct __una_u64 *)p;
+ return ptr->x;
+}
+
+static inline void __put_unaligned_cpu16(u16 val, void *p)
+{
+ struct __una_u16 *ptr = (struct __una_u16 *)p;
+ ptr->x = val;
+}
+
+static inline void __put_unaligned_cpu32(u32 val, void *p)
+{
+ struct __una_u32 *ptr = (struct __una_u32 *)p;
+ ptr->x = val;
+}
+
+static inline void __put_unaligned_cpu64(u64 val, void *p)
+{
+ struct __una_u64 *ptr = (struct __una_u64 *)p;
+ ptr->x = val;
+}
+
+#endif /* _LINUX_UNALIGNED_PACKED_STRUCT_H */
diff --git a/include/linux/uuid.h b/include/linux/uuid.h
new file mode 100644
index 0000000..b81992d
--- /dev/null
+++ b/include/linux/uuid.h
@@ -0,0 +1,27 @@
+/*
+ * UUID/GUID definition
+ *
+ * Copyright (C) 2010, 2016 Intel Corp.
+ * Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#ifndef _LINUX_UUID_H_
+#define _LINUX_UUID_H_
+
+#include <uapi/linux/uuid.h>
+#include <string.h>
+
+static inline int uuid_le_cmp(const uuid_le u1, const uuid_le u2)
+{
+ return memcmp(&u1, &u2, sizeof(uuid_le));
+}
+
+#endif
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
new file mode 100644
index 0000000..eb6284d
--- /dev/null
+++ b/include/linux/vmalloc.h
@@ -0,0 +1,8 @@
+#ifndef __TOOLS_LINUX_VMALLOC_H
+#define __TOOLS_LINUX_VMALLOC_H
+
+#define vmalloc(size) malloc(size)
+#define __vmalloc(size, flags, prot) malloc(size)
+#define vfree(p) free(p)
+
+#endif /* __TOOLS_LINUX_VMALLOC_H */
diff --git a/include/linux/wait.h b/include/linux/wait.h
new file mode 100644
index 0000000..77cba05
--- /dev/null
+++ b/include/linux/wait.h
@@ -0,0 +1,1235 @@
+#ifndef _LINUX_WAIT_H
+#define _LINUX_WAIT_H
+
+#include <pthread.h>
+
+#include <linux/bitmap.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/spinlock.h>
+//#include <uapi/linux/wait.h>
+
+typedef struct __wait_queue wait_queue_t;
+typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
+int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
+
+/* __wait_queue::flags */
+#define WQ_FLAG_EXCLUSIVE 0x01
+#define WQ_FLAG_WOKEN 0x02
+
+struct __wait_queue {
+ unsigned int flags;
+ void *private;
+ wait_queue_func_t func;
+ struct list_head task_list;
+};
+
+struct wait_bit_key {
+ void *flags;
+ int bit_nr;
+#define WAIT_ATOMIC_T_BIT_NR -1
+ unsigned long timeout;
+};
+
+struct wait_bit_queue {
+ struct wait_bit_key key;
+ wait_queue_t wait;
+};
+
+struct __wait_queue_head {
+ spinlock_t lock;
+ struct list_head task_list;
+};
+typedef struct __wait_queue_head wait_queue_head_t;
+
+struct task_struct;
+
+/*
+ * Macros for declaration and initialisation of the datatypes
+ */
+
+#define __WAITQUEUE_INITIALIZER(name, tsk) { \
+ .private = tsk, \
+ .func = default_wake_function, \
+ .task_list = { NULL, NULL } }
+
+#define DECLARE_WAITQUEUE(name, tsk) \
+ wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
+
+#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
+ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
+ .task_list = { &(name).task_list, &(name).task_list } }
+
+#define DECLARE_WAIT_QUEUE_HEAD(name) \
+ wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
+
+#define __WAIT_BIT_KEY_INITIALIZER(word, bit) \
+ { .flags = word, .bit_nr = bit, }
+
+#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p) \
+ { .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, }
+
+extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *);
+
+#define init_waitqueue_head(q) \
+ do { \
+ static struct lock_class_key __key; \
+ \
+ __init_waitqueue_head((q), #q, &__key); \
+ } while (0)
+
+#ifdef CONFIG_LOCKDEP
+# define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
+ ({ init_waitqueue_head(&name); name; })
+# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \
+ wait_queue_head_t name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
+#else
+# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
+#endif
+
+static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
+{
+ q->flags = 0;
+ q->private = p;
+ q->func = default_wake_function;
+}
+
+static inline void
+init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
+{
+ q->flags = 0;
+ q->private = NULL;
+ q->func = func;
+}
+
+/**
+ * waitqueue_active -- locklessly test for waiters on the queue
+ * @q: the waitqueue to test for waiters
+ *
+ * returns true if the wait list is not empty
+ *
+ * NOTE: this function is lockless and requires care; incorrect usage _will_
+ * lead to sporadic and non-obvious failure.
+ *
+ * Use either while holding wait_queue_head_t::lock or when used for wakeups
+ * with an extra smp_mb() like:
+ *
+ * CPU0 - waker CPU1 - waiter
+ *
+ * for (;;) {
+ * @cond = true; prepare_to_wait(&wq, &wait, state);
+ * smp_mb(); // smp_mb() from set_current_state()
+ * if (waitqueue_active(wq)) if (@cond)
+ * wake_up(wq); break;
+ * schedule();
+ * }
+ * finish_wait(&wq, &wait);
+ *
+ * Because without the explicit smp_mb() it's possible for the
+ * waitqueue_active() load to get hoisted over the @cond store such that we'll
+ * observe an empty wait list while the waiter might not observe @cond.
+ *
+ * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
+ * which (when the lock is uncontended) are of roughly equal cost.
+ */
+static inline int waitqueue_active(wait_queue_head_t *q)
+{
+ return !list_empty(&q->task_list);
+}
+
+/**
+ * wq_has_sleeper - check if there are any waiting processes
+ * @wq: wait queue head
+ *
+ * Returns true if wq has waiting processes
+ *
+ * Please refer to the comment for waitqueue_active.
+ */
+static inline bool wq_has_sleeper(wait_queue_head_t *wq)
+{
+ /*
+ * We need to be sure we are in sync with the
+ * add_wait_queue modifications to the wait queue.
+ *
+ * This memory barrier should be paired with one on the
+ * waiting side.
+ */
+ smp_mb();
+ return waitqueue_active(wq);
+}
+
+extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
+extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait);
+extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
+
+static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
+{
+ list_add(&new->task_list, &head->task_list);
+}
+
+/*
+ * Used for wake-one threads:
+ */
+static inline void
+__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue(q, wait);
+}
+
+static inline void __add_wait_queue_tail(wait_queue_head_t *head,
+ wait_queue_t *new)
+{
+ list_add_tail(&new->task_list, &head->task_list);
+}
+
+static inline void
+__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue_tail(q, wait);
+}
+
+static inline void
+__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
+{
+ list_del(&old->task_list);
+}
+
+typedef int wait_bit_action_f(struct wait_bit_key *, int mode);
+void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
+void __wake_up_bit(wait_queue_head_t *, void *, int);
+int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
+int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
+void wake_up_bit(void *, int);
+void wake_up_atomic_t(atomic_t *);
+int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned);
+int out_of_line_wait_on_bit_timeout(void *, int, wait_bit_action_f *, unsigned, unsigned long);
+int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned);
+int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned);
+wait_queue_head_t *bit_waitqueue(void *, int);
+
+#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
+#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
+#define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
+#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1)
+#define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0)
+
+#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
+#define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
+#define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
+#define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE, 1)
+
+/*
+ * Wakeup macros to be used to report events to the targets.
+ */
+#define wake_up_poll(x, m) \
+ __wake_up(x, TASK_NORMAL, 1, (void *) (m))
+#define wake_up_locked_poll(x, m) \
+ __wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
+#define wake_up_interruptible_poll(x, m) \
+ __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
+#define wake_up_interruptible_sync_poll(x, m) \
+ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))
+
+#define ___wait_cond_timeout(condition) \
+({ \
+ bool __cond = (condition); \
+ if (__cond && !__ret) \
+ __ret = 1; \
+ __cond || !__ret; \
+})
+
+#define ___wait_is_interruptible(state) \
+ (!__builtin_constant_p(state) || \
+ state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \
+
+/*
+ * The below macro ___wait_event() has an explicit shadow of the __ret
+ * variable when used from the wait_event_*() macros.
+ *
+ * This is so that both can use the ___wait_cond_timeout() construct
+ * to wrap the condition.
+ *
+ * The type inconsistency of the wait_event_*() __ret variable is also
+ * on purpose; we use long where we can return timeout values and int
+ * otherwise.
+ */
+
+#define ___wait_event(wq, condition, state, exclusive, ret, cmd) \
+({ \
+ __label__ __out; \
+ wait_queue_t __wait; \
+ long __ret = ret; /* explicit shadow */ \
+ \
+ INIT_LIST_HEAD(&__wait.task_list); \
+ if (exclusive) \
+ __wait.flags = WQ_FLAG_EXCLUSIVE; \
+ else \
+ __wait.flags = 0; \
+ \
+ for (;;) { \
+ long __int = prepare_to_wait_event(&wq, &__wait, state);\
+ \
+ if (condition) \
+ break; \
+ \
+ if (___wait_is_interruptible(state) && __int) { \
+ __ret = __int; \
+ if (exclusive) { \
+ abort_exclusive_wait(&wq, &__wait, \
+ state, NULL); \
+ goto __out; \
+ } \
+ break; \
+ } \
+ \
+ cmd; \
+ } \
+ finish_wait(&wq, &__wait); \
+__out: __ret; \
+})
+
+#define __wait_event(wq, condition) \
+ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+ schedule())
+
+/**
+ * wait_event - sleep until a condition gets true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ */
+#define wait_event(wq, condition) \
+do { \
+ might_sleep(); \
+ if (condition) \
+ break; \
+ __wait_event(wq, condition); \
+} while (0)
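+
+/*
+ * Typical producer/consumer pairing (illustrative names):
+ *
+ *	static DECLARE_WAIT_QUEUE_HEAD(req_wait);
+ *	static bool req_done;
+ *
+ *	// consumer
+ *	wait_event(req_wait, req_done);
+ *
+ *	// producer
+ *	req_done = true;
+ *	wake_up(&req_wait);
+ */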
+
+#define __io_wait_event(wq, condition) \
+ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+ io_schedule())
+
+/*
+ * io_wait_event() -- like wait_event() but with io_schedule()
+ */
+#define io_wait_event(wq, condition) \
+do { \
+ might_sleep(); \
+ if (condition) \
+ break; \
+ __io_wait_event(wq, condition); \
+} while (0)
+
+#define __wait_event_freezable(wq, condition) \
+ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
+ schedule(); try_to_freeze())
+
+/**
+ * wait_event_freezable - sleep (or freeze) until a condition gets true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute
+ * to system load) until the @condition evaluates to true. The
+ * @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ */
+#define wait_event_freezable(wq, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_freezable(wq, condition); \
+ __ret; \
+})
+
+#define __wait_event_timeout(wq, condition, timeout) \
+ ___wait_event(wq, ___wait_cond_timeout(condition), \
+ TASK_UNINTERRUPTIBLE, 0, timeout, \
+ __ret = schedule_timeout(__ret))
+
+/**
+ * wait_event_timeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, in jiffies
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * or the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed.
+ */
+#define wait_event_timeout(wq, condition, timeout) \
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_timeout(wq, condition, timeout); \
+ __ret; \
+})
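+
+/*
+ * Example (sketch; "my_wq" and "my_done" are placeholder names): handling
+ * the possible return values of wait_event_timeout().
+ *
+ *	long ret = wait_event_timeout(my_wq, my_done, HZ);
+ *
+ *	if (!ret)
+ *		pr_debug("timed out, condition still false\n");
+ *	else
+ *		pr_debug("condition true with %ld jiffies to spare\n", ret);
+ */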
+
+#define __wait_event_freezable_timeout(wq, condition, timeout) \
+ ___wait_event(wq, ___wait_cond_timeout(condition), \
+ TASK_INTERRUPTIBLE, 0, timeout, \
+ __ret = schedule_timeout(__ret); try_to_freeze())
+
+/*
+ * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
+ * increasing load and is freezable.
+ */
+#define wait_event_freezable_timeout(wq, condition, timeout) \
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_freezable_timeout(wq, condition, timeout); \
+ __ret; \
+})
+
+#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \
+ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0, \
+ cmd1; schedule(); cmd2)
+/*
+ * Just like wait_event_cmd(), except it sets exclusive flag
+ */
+#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2); \
+} while (0)
+
+#define __wait_event_cmd(wq, condition, cmd1, cmd2) \
+ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+ cmd1; schedule(); cmd2)
+
+/**
+ * wait_event_cmd - sleep until a condition gets true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @cmd1: the command will be executed before sleep
+ * @cmd2: the command will be executed after sleep
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ */
+#define wait_event_cmd(wq, condition, cmd1, cmd2) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_cmd(wq, condition, cmd1, cmd2); \
+} while (0)
+
+#define __wait_event_interruptible(wq, condition) \
+ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
+ schedule())
+
+/**
+ * wait_event_interruptible - sleep until a condition gets true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_interruptible(wq, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_interruptible(wq, condition); \
+ __ret; \
+})
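+
+/*
+ * Example (sketch; placeholder names): propagating -ERESTARTSYS so that a
+ * signal restarts the interrupted system call.
+ *
+ *	int ret = wait_event_interruptible(my_wq, my_done);
+ *
+ *	if (ret)
+ *		return ret;	// -ERESTARTSYS: interrupted by a signal
+ *	// my_done was observed true
+ */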
+
+#define __wait_event_interruptible_timeout(wq, condition, timeout) \
+ ___wait_event(wq, ___wait_cond_timeout(condition), \
+ TASK_INTERRUPTIBLE, 0, timeout, \
+ __ret = schedule_timeout(__ret))
+
+/**
+ * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, in jiffies
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
+ * interrupted by a signal.
+ */
+#define wait_event_interruptible_timeout(wq, condition, timeout) \
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_interruptible_timeout(wq, \
+ condition, timeout); \
+ __ret; \
+})
+
+#define __wait_event_hrtimeout(wq, condition, timeout, state) \
+({ \
+ int __ret = 0; \
+ struct hrtimer_sleeper __t; \
+ \
+ hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \
+ HRTIMER_MODE_REL); \
+ hrtimer_init_sleeper(&__t, current); \
+ if ((timeout).tv64 != KTIME_MAX) \
+ hrtimer_start_range_ns(&__t.timer, timeout, \
+ current->timer_slack_ns, \
+ HRTIMER_MODE_REL); \
+ \
+ __ret = ___wait_event(wq, condition, state, 0, 0, \
+ if (!__t.task) { \
+ __ret = -ETIME; \
+ break; \
+ } \
+ schedule()); \
+ \
+ hrtimer_cancel(&__t.timer); \
+ destroy_hrtimer_on_stack(&__t.timer); \
+ __ret; \
+})
+
+/**
+ * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, as a ktime_t
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true or the @timeout elapses.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function returns 0 if @condition became true, or -ETIME if the timeout
+ * elapsed.
+ */
+#define wait_event_hrtimeout(wq, condition, timeout) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_hrtimeout(wq, condition, timeout, \
+ TASK_UNINTERRUPTIBLE); \
+ __ret; \
+})
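+
+/*
+ * Example (sketch; placeholder names, assuming the usual ktime helpers): a
+ * high-resolution 10ms wait instead of a jiffies-granularity one.
+ *
+ *	int ret = wait_event_hrtimeout(my_wq, my_done, ms_to_ktime(10));
+ *
+ *	if (ret == -ETIME)
+ *		pr_debug("10ms elapsed before the condition became true\n");
+ */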
+
+/**
+ * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, as a ktime_t
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function returns 0 if @condition became true, -ERESTARTSYS if it was
+ * interrupted by a signal, or -ETIME if the timeout elapsed.
+ */
+#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \
+({ \
+ long __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_hrtimeout(wq, condition, timeout, \
+ TASK_INTERRUPTIBLE); \
+ __ret; \
+})
+
+#define __wait_event_interruptible_exclusive(wq, condition) \
+ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
+ schedule())
+
+#define wait_event_interruptible_exclusive(wq, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_interruptible_exclusive(wq, condition);\
+ __ret; \
+})
+
+#define __wait_event_killable_exclusive(wq, condition) \
+ ___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \
+ schedule())
+
+#define wait_event_killable_exclusive(wq, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_killable_exclusive(wq, condition); \
+ __ret; \
+})
+
+
+#define __wait_event_freezable_exclusive(wq, condition) \
+ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
+ schedule(); try_to_freeze())
+
+#define wait_event_freezable_exclusive(wq, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_freezable_exclusive(wq, condition);\
+ __ret; \
+})
+
+
+#define __wait_event_interruptible_locked(wq, condition, exclusive, irq) \
+({ \
+ int __ret = 0; \
+ DEFINE_WAIT(__wait); \
+ if (exclusive) \
+ __wait.flags |= WQ_FLAG_EXCLUSIVE; \
+ do { \
+ if (likely(list_empty(&__wait.task_list))) \
+ __add_wait_queue_tail(&(wq), &__wait); \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (signal_pending(current)) { \
+ __ret = -ERESTARTSYS; \
+ break; \
+ } \
+ if (irq) \
+ spin_unlock_irq(&(wq).lock); \
+ else \
+ spin_unlock(&(wq).lock); \
+ schedule(); \
+ if (irq) \
+ spin_lock_irq(&(wq).lock); \
+ else \
+ spin_lock(&(wq).lock); \
+ } while (!(condition)); \
+ __remove_wait_queue(&(wq), &__wait); \
+ __set_current_state(TASK_RUNNING); \
+ __ret; \
+})
+
+
+/**
+ * wait_event_interruptible_locked - sleep until a condition gets true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * It must be called with wq.lock being held. This spinlock is
+ * unlocked while sleeping but @condition testing is done while lock
+ * is held and when this macro exits the lock is held.
+ *
+ * The lock is locked/unlocked using spin_lock()/spin_unlock()
+ * functions which must match the way they are locked/unlocked outside
+ * of this macro.
+ *
+ * wake_up_locked() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_interruptible_locked(wq, condition) \
+ ((condition) \
+ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, 0))
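+
+/*
+ * Example (sketch; placeholder names): wq.lock is held on entry, dropped
+ * around the sleep, and held again both for the condition check and on
+ * return.
+ *
+ *	int ret;
+ *
+ *	spin_lock(&my_wq.lock);
+ *	ret = wait_event_interruptible_locked(my_wq, my_done);
+ *	if (!ret)
+ *		my_consume_event();	// still under my_wq.lock
+ *	spin_unlock(&my_wq.lock);
+ */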
+
+/**
+ * wait_event_interruptible_locked_irq - sleep until a condition gets true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * It must be called with wq.lock being held. This spinlock is
+ * unlocked while sleeping but @condition testing is done while lock
+ * is held and when this macro exits the lock is held.
+ *
+ * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
+ * functions which must match the way they are locked/unlocked outside
+ * of this macro.
+ *
+ * wake_up_locked() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_interruptible_locked_irq(wq, condition) \
+ ((condition) \
+ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, 1))
+
+/**
+ * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * It must be called with wq.lock being held. This spinlock is
+ * unlocked while sleeping but @condition testing is done while lock
+ * is held and when this macro exits the lock is held.
+ *
+ * The lock is locked/unlocked using spin_lock()/spin_unlock()
+ * functions which must match the way they are locked/unlocked outside
+ * of this macro.
+ *
+ * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
+ * set, so when it is woken up, other exclusive waiters on the list are
+ * not considered for wakeup.
+ *
+ * wake_up_locked() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_interruptible_exclusive_locked(wq, condition) \
+ ((condition) \
+ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 0))
+
+/**
+ * wait_event_interruptible_exclusive_locked_irq - sleep exclusively until a condition gets true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * It must be called with wq.lock being held. This spinlock is
+ * unlocked while sleeping but @condition testing is done while lock
+ * is held and when this macro exits the lock is held.
+ *
+ * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
+ * functions which must match the way they are locked/unlocked outside
+ * of this macro.
+ *
+ * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
+ * set, so when it is woken up, other exclusive waiters on the list are
+ * not considered for wakeup.
+ *
+ * wake_up_locked() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_interruptible_exclusive_locked_irq(wq, condition) \
+ ((condition) \
+ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1))
+
+
+#define __wait_event_killable(wq, condition) \
+ ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())
+
+/**
+ * wait_event_killable - sleep until a condition gets true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_KILLABLE) until the
+ * @condition evaluates to true or a fatal signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_killable(wq, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_killable(wq, condition); \
+ __ret; \
+})
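+
+/*
+ * Example (sketch; placeholder names): like wait_event_interruptible(), but
+ * only a fatal signal can interrupt the wait.
+ *
+ *	if (wait_event_killable(my_wq, my_done))
+ *		return -ERESTARTSYS;	// fatal signal while waiting
+ */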
+
+
+#define __wait_event_lock_irq(wq, condition, lock, cmd) \
+ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+ spin_unlock_irq(&lock); \
+ cmd; \
+ schedule(); \
+ spin_lock_irq(&lock))
+
+/**
+ * wait_event_lock_irq_cmd - sleep until a condition gets true. The
+ * condition is checked under the lock. This
+ * is expected to be called with the lock
+ * taken.
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @lock: a locked spinlock_t, which will be released before cmd
+ * and schedule() and reacquired afterwards.
+ * @cmd: a command which is invoked outside the critical section before
+ * sleep
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * This is supposed to be called while holding the lock. The lock is
+ * dropped before invoking the cmd and going to sleep and is reacquired
+ * afterwards.
+ */
+#define wait_event_lock_irq_cmd(wq, condition, lock, cmd) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_lock_irq(wq, condition, lock, cmd); \
+} while (0)
+
+/**
+ * wait_event_lock_irq - sleep until a condition gets true. The
+ * condition is checked under the lock. This
+ * is expected to be called with the lock
+ * taken.
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @lock: a locked spinlock_t, which will be released before schedule()
+ * and reacquired afterwards.
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * This is supposed to be called while holding the lock. The lock is
+ * dropped before going to sleep and is reacquired afterwards.
+ */
+#define wait_event_lock_irq(wq, condition, lock) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_lock_irq(wq, condition, lock, ); \
+} while (0)
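+
+/*
+ * Example (sketch; placeholder names): waiting for a spinlock-protected list
+ * to become non-empty; the lock is passed by name, not by address, and is
+ * held again when the condition is rechecked and when the macro returns.
+ *
+ *	struct my_item *item;
+ *
+ *	spin_lock_irq(&my_lock);
+ *	wait_event_lock_irq(my_wq, !list_empty(&my_list), my_lock);
+ *	item = list_first_entry(&my_list, struct my_item, list);
+ *	spin_unlock_irq(&my_lock);
+ */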
+
+
+#define __wait_event_interruptible_lock_irq(wq, condition, lock, cmd) \
+ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
+ spin_unlock_irq(&lock); \
+ cmd; \
+ schedule(); \
+ spin_lock_irq(&lock))
+
+/**
+ * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
+ * The condition is checked under the lock. This is expected to
+ * be called with the lock taken.
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @lock: a locked spinlock_t, which will be released before cmd and
+ * schedule() and reacquired afterwards.
+ * @cmd: a command which is invoked outside the critical section before
+ * sleep
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received. The @condition is
+ * checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * This is supposed to be called while holding the lock. The lock is
+ * dropped before invoking the cmd and going to sleep and is reacquired
+ * afterwards.
+ *
+ * The macro will return -ERESTARTSYS if it was interrupted by a signal
+ * and 0 if @condition evaluated to true.
+ */
+#define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \
+({ \
+ int __ret = 0; \
+ if (!(condition)) \
+ __ret = __wait_event_interruptible_lock_irq(wq, \
+ condition, lock, cmd); \
+ __ret; \
+})
+
+/**
+ * wait_event_interruptible_lock_irq - sleep until a condition gets true.
+ * The condition is checked under the lock. This is expected
+ * to be called with the lock taken.
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @lock: a locked spinlock_t, which will be released before schedule()
+ * and reacquired afterwards.
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received. The @condition is
+ * checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * This is supposed to be called while holding the lock. The lock is
+ * dropped before going to sleep and is reacquired afterwards.
+ *
+ * The macro will return -ERESTARTSYS if it was interrupted by a signal
+ * and 0 if @condition evaluated to true.
+ */
+#define wait_event_interruptible_lock_irq(wq, condition, lock) \
+({ \
+ int __ret = 0; \
+ if (!(condition)) \
+ __ret = __wait_event_interruptible_lock_irq(wq, \
+ condition, lock,); \
+ __ret; \
+})
+
+#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \
+ lock, timeout) \
+ ___wait_event(wq, ___wait_cond_timeout(condition), \
+ TASK_INTERRUPTIBLE, 0, timeout, \
+ spin_unlock_irq(&lock); \
+ __ret = schedule_timeout(__ret); \
+		      spin_lock_irq(&lock))
+
+/**
+ * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
+ * true or a timeout elapses. The condition is checked under
+ * the lock. This is expected to be called with the lock taken.
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @lock: a locked spinlock_t, which will be released before schedule()
+ * and reacquired afterwards.
+ * @timeout: timeout, in jiffies
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received. The @condition is
+ * checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * This is supposed to be called while holding the lock. The lock is
+ * dropped before going to sleep and is reacquired afterwards.
+ *
+ * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it
+ * was interrupted by a signal, or the remaining jiffies if the
+ * @condition evaluated to true before the @timeout elapsed.
+ */
+#define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \
+ timeout) \
+({ \
+ long __ret = timeout; \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_interruptible_lock_irq_timeout( \
+ wq, condition, lock, timeout); \
+ __ret; \
+})
+
+/*
+ * Waitqueues which are removed from the waitqueue_head at wakeup time
+ */
+void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
+void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
+void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key);
+long wait_woken(wait_queue_t *wait, unsigned mode, long timeout);
+int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
+int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
+int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
+
+#define DEFINE_WAIT_FUNC(name, function) \
+ wait_queue_t name = { \
+ .private = current, \
+ .func = function, \
+ .task_list = LIST_HEAD_INIT((name).task_list), \
+ }
+
+#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
+
+#define DEFINE_WAIT_BIT(name, word, bit) \
+ struct wait_bit_queue name = { \
+ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \
+ .wait = { \
+ .private = current, \
+ .func = wake_bit_function, \
+ .task_list = \
+ LIST_HEAD_INIT((name).wait.task_list), \
+ }, \
+ }
+
+#define init_wait(wait) \
+ do { \
+ (wait)->private = current; \
+ (wait)->func = autoremove_wake_function; \
+ INIT_LIST_HEAD(&(wait)->task_list); \
+ (wait)->flags = 0; \
+ } while (0)
+
+
+extern int bit_wait(struct wait_bit_key *, int);
+extern int bit_wait_io(struct wait_bit_key *, int);
+extern int bit_wait_timeout(struct wait_bit_key *, int);
+extern int bit_wait_io_timeout(struct wait_bit_key *, int);
+
+/**
+ * wait_on_bit - wait for a bit to be cleared
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that waits on a bit.
+ * For instance, if one were to have waiters on a bitflag, one would
+ * call wait_on_bit() in threads waiting for the bit to clear.
+ * One uses wait_on_bit() where one is waiting for the bit to clear,
+ * but has no intention of setting it.
+ * Returned value will be zero if the bit was cleared, or non-zero
+ * if the process received a signal and the mode permitted wakeup
+ * on that signal.
+ */
+static inline int
+wait_on_bit(unsigned long *word, int bit, unsigned mode)
+{
+ might_sleep();
+ if (!test_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit(word, bit,
+ bit_wait,
+ mode);
+}
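+
+/*
+ * Example (sketch; "my_flags" and bit 0 are placeholders): waiting for a
+ * "busy" bit to clear, paired with the usual clear-and-wake sequence
+ * (wake_up_bit()) on the clearing side.
+ *
+ *	// waiter
+ *	if (wait_on_bit(&my_flags, 0, TASK_INTERRUPTIBLE))
+ *		return -EINTR;		// a signal arrived first
+ *
+ *	// clearing side
+ *	clear_bit(0, &my_flags);
+ *	smp_mb__after_atomic();
+ *	wake_up_bit(&my_flags, 0);
+ */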
+
+/**
+ * wait_on_bit_io - wait for a bit to be cleared
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared. This is similar to wait_on_bit(), but calls
+ * io_schedule() instead of schedule() for the actual waiting.
+ *
+ * Returned value will be zero if the bit was cleared, or non-zero
+ * if the process received a signal and the mode permitted wakeup
+ * on that signal.
+ */
+static inline int
+wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
+{
+ might_sleep();
+ if (!test_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit(word, bit,
+ bit_wait_io,
+ mode);
+}
+
+/**
+ * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ * @timeout: timeout, in jiffies
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared. This is similar to wait_on_bit(), except also takes a
+ * timeout parameter.
+ *
+ * Returned value will be zero if the bit was cleared before the
+ * @timeout elapsed, or non-zero if the @timeout elapsed or process
+ * received a signal and the mode permitted wakeup on that signal.
+ */
+static inline int
+wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
+ unsigned long timeout)
+{
+ might_sleep();
+ if (!test_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit_timeout(word, bit,
+ bit_wait_timeout,
+ mode, timeout);
+}
+
+/**
+ * wait_on_bit_action - wait for a bit to be cleared
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @action: the function used to sleep, which may take special actions
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared, and allow the waiting action to be specified.
+ * This is like wait_on_bit() but allows fine control of how the waiting
+ * is done.
+ *
+ * Returned value will be zero if the bit was cleared, or non-zero
+ * if the process received a signal and the mode permitted wakeup
+ * on that signal.
+ */
+static inline int
+wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
+ unsigned mode)
+{
+ might_sleep();
+ if (!test_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit(word, bit, action, mode);
+}
+
+/**
+ * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that waits on a bit
+ * when one intends to set it, for instance when trying to lock a bitflag.
+ * If several threads each want to set the bit and must wait for it to
+ * clear first, each of them calls wait_on_bit_lock() rather than
+ * wait_on_bit().
+ * One uses wait_on_bit_lock() where one is waiting for the bit to
+ * clear with the intention of setting it, and when done, clearing it.
+ *
+ * Returns zero if the bit was (eventually) found to be clear and was
+ * set. Returns non-zero if a signal was delivered to the process and
+ * the @mode allows that signal to wake the process.
+ */
+static inline int
+wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
+{
+ might_sleep();
+ if (!test_and_set_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);
+}
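+
+/*
+ * Example (sketch; placeholder names): using bit 0 of a flags word as a
+ * sleeping lock.
+ *
+ *	// acquire: sleep until bit 0 is clear, then set it atomically
+ *	if (wait_on_bit_lock(&my_flags, 0, TASK_KILLABLE))
+ *		return -EINTR;
+ *
+ *	// release: clear the bit and wake the next waiter
+ *	clear_bit_unlock(0, &my_flags);
+ *	smp_mb__after_atomic();
+ *	wake_up_bit(&my_flags, 0);
+ */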
+
+/**
+ * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared and then to atomically set it. This is similar
+ * to wait_on_bit(), but calls io_schedule() instead of schedule()
+ * for the actual waiting.
+ *
+ * Returns zero if the bit was (eventually) found to be clear and was
+ * set. Returns non-zero if a signal was delivered to the process and
+ * the @mode allows that signal to wake the process.
+ */
+static inline int
+wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
+{
+ might_sleep();
+ if (!test_and_set_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode);
+}
+
+/**
+ * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @action: the function used to sleep, which may take special actions
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared and then to set it, and allow the waiting action
+ * to be specified.
+ * This is like wait_on_bit() but allows fine control of how the waiting
+ * is done.
+ *
+ * Returns zero if the bit was (eventually) found to be clear and was
+ * set. Returns non-zero if a signal was delivered to the process and
+ * the @mode allows that signal to wake the process.
+ */
+static inline int
+wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
+ unsigned mode)
+{
+ might_sleep();
+ if (!test_and_set_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit_lock(word, bit, action, mode);
+}
+
+/**
+ * wait_on_atomic_t - Wait for an atomic_t to become 0
+ * @val: The atomic value being waited on, a kernel virtual address
+ * @action: the function used to sleep, which may take special actions
+ * @mode: the task state to sleep in
+ *
+ * Wait for an atomic_t to become 0. We abuse the bit-wait waitqueue table for
+ * the purpose of getting a waitqueue, but we set the key to a bit number
+ * outside of the target 'word'.
+ */
+static inline
+int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
+{
+ might_sleep();
+ if (atomic_read(val) == 0)
+ return 0;
+ return out_of_line_wait_on_atomic_t(val, action, mode);
+}
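+
+/*
+ * Example (sketch; placeholder names, assuming a wake_up_atomic_t() helper
+ * as in the upstream kernel): waiting for a refcount held in an atomic_t to
+ * drop to zero.
+ *
+ *	static atomic_t my_users;
+ *
+ *	static int my_wait_action(atomic_t *p)
+ *	{
+ *		schedule();
+ *		return 0;
+ *	}
+ *
+ *	// waiter
+ *	wait_on_atomic_t(&my_users, my_wait_action, TASK_UNINTERRUPTIBLE);
+ *
+ *	// last user
+ *	if (atomic_dec_and_test(&my_users))
+ *		wake_up_atomic_t(&my_users);
+ */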
+
+#endif /* _LINUX_WAIT_H */
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
new file mode 100644
index 0000000..4bad3d3
--- /dev/null
+++ b/include/linux/workqueue.h
@@ -0,0 +1,189 @@
+#ifndef __TOOLS_LINUX_WORKQUEUE_H
+#define __TOOLS_LINUX_WORKQUEUE_H
+
+#include <linux/list.h>
+#include <linux/timer.h>
+
+struct task_struct;
+struct workqueue_struct;
+struct work_struct;
+typedef void (*work_func_t)(struct work_struct *work);
+void delayed_work_timer_fn(unsigned long __data);
+
+#define work_data_bits(work) ((unsigned long *)(&(work)->data))
+
+#if 0
+enum {
+ //WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */
+ //WORK_STRUCT_DELAYED_BIT = 1, /* work item is delayed */
+ //
+ //WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT,
+ //WORK_STRUCT_DELAYED = 1 << WORK_STRUCT_DELAYED_BIT,
+};
+#endif
+
+struct work_struct {
+ atomic_long_t data;
+ struct list_head entry;
+ work_func_t func;
+};
+
+#define INIT_WORK(_work, _func) \
+do { \
+ (_work)->data.counter = 0; \
+ INIT_LIST_HEAD(&(_work)->entry); \
+ (_work)->func = (_func); \
+} while (0)
+
+struct delayed_work {
+ struct work_struct work;
+ struct timer_list timer;
+ struct workqueue_struct *wq;
+};
+
+#define INIT_DELAYED_WORK(_work, _func) \
+ do { \
+ INIT_WORK(&(_work)->work, (_func)); \
+ __setup_timer(&(_work)->timer, delayed_work_timer_fn, \
+ (unsigned long)(_work), \
+ TIMER_IRQSAFE); \
+ } while (0)
+
+static inline struct delayed_work *to_delayed_work(struct work_struct *work)
+{
+ return container_of(work, struct delayed_work, work);
+}
+
+enum {
+ WQ_UNBOUND = 1 << 1, /* not bound to any cpu */
+ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */
+ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */
+ WQ_HIGHPRI = 1 << 4, /* high priority */
+ WQ_CPU_INTENSIVE = 1 << 5, /* cpu intensive workqueue */
+ WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */
+
+ /*
+ * Per-cpu workqueues are generally preferred because they tend to
+ * show better performance thanks to cache locality. Per-cpu
+ * workqueues exclude the scheduler from choosing the CPU to
+ * execute the worker threads, which has an unfortunate side effect
+ * of increasing power consumption.
+ *
+ * The scheduler considers a CPU idle if it doesn't have any task
+ * to execute and tries to keep idle cores idle to conserve power;
+ * however, for example, a per-cpu work item scheduled from an
+ * interrupt handler on an idle CPU will force the scheduler to
+	 * execute the work item on that CPU, breaking the idleness, which in
+ * turn may lead to more scheduling choices which are sub-optimal
+ * in terms of power consumption.
+ *
+ * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default
+ * but become unbound if workqueue.power_efficient kernel param is
+	 * specified. Per-cpu workqueues which are identified as contributing
+	 * significantly to power consumption are marked with this flag, and
+	 * enabling the power_efficient mode leads to noticeable power saving
+	 * at the cost of a small performance disadvantage.
+ *
+ * http://thread.gmane.org/gmane.linux.kernel/1480396
+ */
+ WQ_POWER_EFFICIENT = 1 << 7,
+
+ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */
+ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */
+ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */
+
+ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */
+ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */
+ WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2,
+};
+
+/* unbound wq's aren't per-cpu, scale max_active according to #cpus */
+#define WQ_UNBOUND_MAX_ACTIVE WQ_MAX_ACTIVE
+
+extern struct workqueue_struct *system_wq;
+extern struct workqueue_struct *system_highpri_wq;
+extern struct workqueue_struct *system_long_wq;
+extern struct workqueue_struct *system_unbound_wq;
+extern struct workqueue_struct *system_freezable_wq;
+extern struct workqueue_struct *system_power_efficient_wq;
+extern struct workqueue_struct *system_freezable_power_efficient_wq;
+
+extern struct workqueue_struct *
+alloc_workqueue(const char *fmt, unsigned int flags,
+ int max_active, ...) __printf(1, 4);
+
+#define alloc_ordered_workqueue(fmt, flags, args...) \
+ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)
+
+#define create_workqueue(name) \
+ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
+#define create_freezable_workqueue(name) \
+ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \
+ WQ_MEM_RECLAIM, 1, (name))
+#define create_singlethread_workqueue(name) \
+ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)
+
+extern void destroy_workqueue(struct workqueue_struct *wq);
+
+struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask);
+void free_workqueue_attrs(struct workqueue_attrs *attrs);
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs);
+
+extern bool queue_work(struct workqueue_struct *wq,
+ struct work_struct *work);
+extern bool queue_delayed_work(struct workqueue_struct *wq,
+ struct delayed_work *work, unsigned long delay);
+extern bool mod_delayed_work(struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay);
+
+extern void flush_workqueue(struct workqueue_struct *wq);
+extern void drain_workqueue(struct workqueue_struct *wq);
+
+extern int schedule_on_each_cpu(work_func_t func);
+
+extern bool flush_work(struct work_struct *work);
+extern bool cancel_work_sync(struct work_struct *work);
+
+extern bool flush_delayed_work(struct delayed_work *dwork);
+extern bool cancel_delayed_work(struct delayed_work *dwork);
+extern bool cancel_delayed_work_sync(struct delayed_work *dwork);
+
+extern void workqueue_set_max_active(struct workqueue_struct *wq,
+ int max_active);
+extern bool current_is_workqueue_rescuer(void);
+extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
+extern unsigned int work_busy(struct work_struct *work);
+extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
+extern void print_worker_info(const char *log_lvl, struct task_struct *task);
+extern void show_workqueue_state(void);
+
+static inline bool schedule_work_on(int cpu, struct work_struct *work)
+{
+ return queue_work(system_wq, work);
+}
+
+static inline bool schedule_work(struct work_struct *work)
+{
+ return queue_work(system_wq, work);
+}
+
+static inline void flush_scheduled_work(void)
+{
+ flush_workqueue(system_wq);
+}
+
+static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
+ unsigned long delay)
+{
+ return queue_delayed_work(system_wq, dwork, delay);
+}
+
+static inline bool schedule_delayed_work(struct delayed_work *dwork,
+ unsigned long delay)
+{
+ return queue_delayed_work(system_wq, dwork, delay);
+}
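+
+/*
+ * Example (sketch; placeholder names): a private workqueue with one plain
+ * and one delayed work item, and the matching teardown sequence.
+ *
+ *	static struct workqueue_struct *my_wq;
+ *	static struct work_struct my_work;
+ *	static struct delayed_work my_dwork;
+ *
+ *	static void my_work_fn(struct work_struct *work)
+ *	{
+ *		// runs in process context on a workqueue worker
+ *	}
+ *
+ *	my_wq = alloc_ordered_workqueue("my_wq", WQ_MEM_RECLAIM);
+ *	INIT_WORK(&my_work, my_work_fn);
+ *	INIT_DELAYED_WORK(&my_dwork, my_work_fn);
+ *
+ *	queue_work(my_wq, &my_work);
+ *	queue_delayed_work(my_wq, &my_dwork, HZ);	// run after ~1 second
+ *
+ *	cancel_delayed_work_sync(&my_dwork);
+ *	flush_workqueue(my_wq);
+ *	destroy_workqueue(my_wq);
+ */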
+
+#endif /* __TOOLS_LINUX_WORKQUEUE_H */
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
new file mode 100644
index 0000000..d7fade7
--- /dev/null
+++ b/include/linux/xattr.h
@@ -0,0 +1,68 @@
+/*
+ File: linux/xattr.h
+
+ Extended attributes handling.
+
+ Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
+ Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved.
+ Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
+*/
+#ifndef _LINUX_XATTR_H
+#define _LINUX_XATTR_H
+
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <uapi/linux/xattr.h>
+
+struct inode;
+struct dentry;
+
+/*
+ * struct xattr_handler: When @name is set, match attributes with exactly that
+ * name. When @prefix is set instead, match attributes with that prefix and
+ * with a non-empty suffix.
+ */
+struct xattr_handler {
+ const char *name;
+ const char *prefix;
+ int flags; /* fs private flags */
+ bool (*list)(struct dentry *dentry);
+ int (*get)(const struct xattr_handler *, struct dentry *dentry,
+ struct inode *inode, const char *name, void *buffer,
+ size_t size);
+ int (*set)(const struct xattr_handler *, struct dentry *dentry,
+ struct inode *inode, const char *name, const void *buffer,
+ size_t size, int flags);
+};
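+
+/*
+ * Example (sketch; "my_*" names are placeholders): a handler matching all
+ * attributes under the "user." prefix; ->get and ->set see the name with
+ * the prefix already stripped.
+ *
+ *	static int my_xattr_get(const struct xattr_handler *handler,
+ *				struct dentry *dentry, struct inode *inode,
+ *				const char *name, void *buffer, size_t size)
+ *	{
+ *		return my_fs_getxattr(inode, name, buffer, size);
+ *	}
+ *
+ *	static const struct xattr_handler my_user_xattr_handler = {
+ *		.prefix	= "user.",
+ *		.get	= my_xattr_get,
+ *	};
+ */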
+
+const char *xattr_full_name(const struct xattr_handler *, const char *);
+
+struct xattr {
+ const char *name;
+ void *value;
+ size_t value_len;
+};
+
+ssize_t xattr_getsecurity(struct inode *, const char *, void *, size_t);
+ssize_t vfs_getxattr(struct dentry *, const char *, void *, size_t);
+ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size);
+int __vfs_setxattr_noperm(struct dentry *, const char *, const void *, size_t, int);
+int vfs_setxattr(struct dentry *, const char *, const void *, size_t, int);
+int vfs_removexattr(struct dentry *, const char *);
+
+ssize_t generic_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size);
+ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
+int generic_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value, size_t size, int flags);
+int generic_removexattr(struct dentry *dentry, const char *name);
+ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name,
+ char **xattr_value, size_t size, gfp_t flags);
+
+static inline const char *xattr_prefix(const struct xattr_handler *handler)
+{
+ return handler->prefix ?: handler->name;
+}
+
+#endif /* _LINUX_XATTR_H */
diff --git a/include/linux/zconf.h b/include/linux/zconf.h
new file mode 100644
index 0000000..0beb75e
--- /dev/null
+++ b/include/linux/zconf.h
@@ -0,0 +1,57 @@
+/* zconf.h -- configuration of the zlib compression library
+ * Copyright (C) 1995-1998 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#ifndef _ZCONF_H
+#define _ZCONF_H
+
+/* The memory requirements for deflate are (in bytes):
+ (1 << (windowBits+2)) + (1 << (memLevel+9))
+ that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values)
+ plus a few kilobytes for small objects. For example, if you want to reduce
+ the default memory requirements from 256K to 128K, compile with
+ make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7"
+ Of course this will generally degrade compression (there's no free lunch).
+
+ The memory requirements for inflate are (in bytes) 1 << windowBits
+ that is, 32K for windowBits=15 (default value) plus a few kilobytes
+ for small objects.
+*/
+
+/* Maximum value for memLevel in deflateInit2 */
+#ifndef MAX_MEM_LEVEL
+# define MAX_MEM_LEVEL 8
+#endif
+
+/* Maximum value for windowBits in deflateInit2 and inflateInit2.
+ * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files
+ * created by gzip. (Files created by minigzip can still be extracted by
+ * gzip.)
+ */
+#ifndef MAX_WBITS
+# define MAX_WBITS 15 /* 32K LZ77 window */
+#endif
+
+/* default windowBits for decompression. MAX_WBITS is for compression only */
+#ifndef DEF_WBITS
+# define DEF_WBITS MAX_WBITS
+#endif
+
+/* default memLevel */
+#if MAX_MEM_LEVEL >= 8
+# define DEF_MEM_LEVEL 8
+#else
+# define DEF_MEM_LEVEL MAX_MEM_LEVEL
+#endif
+
+ /* Type declarations */
+
+typedef unsigned char Byte; /* 8 bits */
+typedef unsigned int uInt; /* 16 bits or more */
+typedef unsigned long uLong; /* 32 bits or more */
+typedef void *voidp;
+
+#endif /* _ZCONF_H */
diff --git a/include/linux/zlib.h b/include/linux/zlib.h
new file mode 100644
index 0000000..92dbbd3
--- /dev/null
+++ b/include/linux/zlib.h
@@ -0,0 +1,593 @@
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+
+ Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+ jloup@gzip.org madler@alumni.caltech.edu
+
+
+ The data format used by the zlib library is described by RFCs (Request for
+ Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
+ (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+*/
+
+#ifndef _ZLIB_H
+#define _ZLIB_H
+
+#include <linux/zconf.h>
+
+/* zlib deflate based on ZLIB_VERSION "1.1.3" */
+/* zlib inflate based on ZLIB_VERSION "1.2.3" */
+
+/*
+ This is a modified version of zlib for use inside the Linux kernel.
+ The main changes are to perform all memory allocation in advance.
+
+ Inflation Changes:
+ * Z_PACKET_FLUSH is added and used by ppp_deflate. Before returning
+ this checks there is no more input data available and the next data
+ is a STORED block. It also resets the mode to be read for the next
+ data, all as per PPP requirements.
+ * Addition of zlib_inflateIncomp which copies incompressible data into
+      the history window and adjusts the accounting without calling
+ zlib_inflate itself to inflate the data.
+*/
+
+/*
+ The 'zlib' compression library provides in-memory compression and
+ decompression functions, including integrity checks of the uncompressed
+ data. This version of the library supports only one compression method
+ (deflation) but other algorithms will be added later and will have the same
+ stream interface.
+
+ Compression can be done in a single step if the buffers are large
+ enough (for example if an input file is mmap'ed), or can be done by
+ repeated calls of the compression function. In the latter case, the
+ application must provide more input and/or consume the output
+ (providing more output space) before each call.
+
+ The compressed data format used by default by the in-memory functions is
+ the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
+ around a deflate stream, which is itself documented in RFC 1951.
+
+ The library also supports reading and writing files in gzip (.gz) format
+ with an interface similar to that of stdio.
+
+ The zlib format was designed to be compact and fast for use in memory
+ and on communications channels. The gzip format was designed for single-
+ file compression on file systems, has a larger header than zlib to maintain
+ directory information, and uses a different, slower check method than zlib.
+
+ The library does not install any signal handler. The decoder checks
+ the consistency of the compressed data, so the library should never
+ crash even in case of corrupted input.
+*/
+
+struct internal_state;
+
+typedef struct z_stream_s {
+ const Byte *next_in; /* next input byte */
+ uLong avail_in; /* number of bytes available at next_in */
+ uLong total_in; /* total nb of input bytes read so far */
+
+ Byte *next_out; /* next output byte should be put there */
+ uLong avail_out; /* remaining free space at next_out */
+ uLong total_out; /* total nb of bytes output so far */
+
+ char *msg; /* last error message, NULL if no error */
+ struct internal_state *state; /* not visible by applications */
+
+ void *workspace; /* memory allocated for this stream */
+
+ int data_type; /* best guess about the data type: ascii or binary */
+ uLong adler; /* adler32 value of the uncompressed data */
+ uLong reserved; /* reserved for future use */
+} z_stream;
+
+typedef z_stream *z_streamp;
+
+/*
+ The application must update next_in and avail_in when avail_in has
+ dropped to zero. It must update next_out and avail_out when avail_out
+ has dropped to zero. The application must initialize zalloc, zfree and
+ opaque before calling the init function. All other fields are set by the
+ compression library and must not be updated by the application.
+
+ The opaque value provided by the application will be passed as the first
+ parameter for calls of zalloc and zfree. This can be useful for custom
+ memory management. The compression library attaches no meaning to the
+ opaque value.
+
+ zalloc must return NULL if there is not enough memory for the object.
+ If zlib is used in a multi-threaded application, zalloc and zfree must be
+ thread safe.
+
+ On 16-bit systems, the functions zalloc and zfree must be able to allocate
+ exactly 65536 bytes, but will not be required to allocate more than this
+ if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS,
+ pointers returned by zalloc for objects of exactly 65536 bytes *must*
+ have their offset normalized to zero. The default allocation function
+ provided by this library ensures this (see zutil.c). To reduce memory
+ requirements and avoid any allocation of 64K objects, at the expense of
+ compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h).
+
+ The fields total_in and total_out can be used for statistics or
+ progress reports. After compression, total_in holds the total size of
+ the uncompressed data and may be saved for use in the decompressor
+ (particularly if the decompressor wants to decompress everything in
+ a single step).
+*/
+
+ /* constants */
+
+#define Z_NO_FLUSH 0
+#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */
+#define Z_PACKET_FLUSH 2
+#define Z_SYNC_FLUSH 3
+#define Z_FULL_FLUSH 4
+#define Z_FINISH 5
+#define Z_BLOCK 6 /* Only for inflate at present */
+/* Allowed flush values; see deflate() and inflate() below for details */
+
+#define Z_OK 0
+#define Z_STREAM_END 1
+#define Z_NEED_DICT 2
+#define Z_ERRNO (-1)
+#define Z_STREAM_ERROR (-2)
+#define Z_DATA_ERROR (-3)
+#define Z_MEM_ERROR (-4)
+#define Z_BUF_ERROR (-5)
+#define Z_VERSION_ERROR (-6)
+/* Return codes for the compression/decompression functions. Negative
+ * values are errors, positive values are used for special but normal events.
+ */
+
+#define Z_NO_COMPRESSION 0
+#define Z_BEST_SPEED 1
+#define Z_BEST_COMPRESSION 9
+#define Z_DEFAULT_COMPRESSION (-1)
+/* compression levels */
+
+#define Z_FILTERED 1
+#define Z_HUFFMAN_ONLY 2
+#define Z_DEFAULT_STRATEGY 0
+/* compression strategy; see deflateInit2() below for details */
+
+#define Z_BINARY 0
+#define Z_ASCII 1
+#define Z_UNKNOWN 2
+/* Possible values of the data_type field */
+
+#define Z_DEFLATED 8
+/* The deflate compression method (the only one supported in this version) */
+
+ /* basic functions */
+
+extern int zlib_deflate_workspacesize (int windowBits, int memLevel);
+/*
+ Returns the number of bytes that needs to be allocated for a per-
+ stream workspace with the specified parameters. A pointer to this
+ number of bytes should be returned in stream->workspace before
+ you call zlib_deflateInit() or zlib_deflateInit2(). If you call
+ zlib_deflateInit(), specify windowBits = MAX_WBITS and memLevel =
+ MAX_MEM_LEVEL here. If you call zlib_deflateInit2(), the windowBits
+ and memLevel parameters passed to zlib_deflateInit2() must not
+ exceed those passed here.
+*/
+
+/*
+extern int deflateInit (z_streamp strm, int level);
+
+ Initializes the internal stream state for compression. The fields
+ zalloc, zfree and opaque must be initialized before by the caller.
+ If zalloc and zfree are set to NULL, deflateInit updates them to
+ use default allocation functions.
+
+ The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
+ 1 gives best speed, 9 gives best compression, 0 gives no compression at
+ all (the input data is simply copied a block at a time).
+ Z_DEFAULT_COMPRESSION requests a default compromise between speed and
+ compression (currently equivalent to level 6).
+
+ deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if level is not a valid compression level,
+ Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
+ with the version assumed by the caller (ZLIB_VERSION).
+ msg is set to null if there is no error message. deflateInit does not
+ perform any compression: this will be done by deflate().
+*/
+
+
+extern int zlib_deflate (z_streamp strm, int flush);
+/*
+ deflate compresses as much data as possible, and stops when the input
+ buffer becomes empty or the output buffer becomes full. It may introduce some
+ output latency (reading input without producing any output) except when
+ forced to flush.
+
+ The detailed semantics are as follows. deflate performs one or both of the
+ following actions:
+
+ - Compress more input starting at next_in and update next_in and avail_in
+ accordingly. If not all input can be processed (because there is not
+ enough room in the output buffer), next_in and avail_in are updated and
+ processing will resume at this point for the next call of deflate().
+
+ - Provide more output starting at next_out and update next_out and avail_out
+ accordingly. This action is forced if the parameter flush is non zero.
+ Forcing flush frequently degrades the compression ratio, so this parameter
+ should be set only when necessary (in interactive applications).
+ Some output may be provided even if flush is not set.
+
+ Before the call of deflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming
+ more output, and updating avail_in or avail_out accordingly; avail_out
+ should never be zero before the call. The application can consume the
+ compressed output when it wants, for example when the output buffer is full
+ (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK
+ and with zero avail_out, it must be called again after making room in the
+ output buffer because there might be more output pending.
+
+ If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
+ flushed to the output buffer and the output is aligned on a byte boundary, so
+ that the decompressor can get all input data available so far. (In particular
+ avail_in is zero after the call if enough output space has been provided
+ before the call.) Flushing may degrade compression for some compression
+ algorithms and so it should be used only when necessary.
+
+ If flush is set to Z_FULL_FLUSH, all output is flushed as with
+ Z_SYNC_FLUSH, and the compression state is reset so that decompression can
+ restart from this point if previous compressed data has been damaged or if
+ random access is desired. Using Z_FULL_FLUSH too often can seriously degrade
+ the compression.
+
+ If deflate returns with avail_out == 0, this function must be called again
+ with the same value of the flush parameter and more output space (updated
+ avail_out), until the flush is complete (deflate returns with non-zero
+ avail_out).
+
+ If the parameter flush is set to Z_FINISH, pending input is processed,
+ pending output is flushed and deflate returns with Z_STREAM_END if there
+ was enough output space; if deflate returns with Z_OK, this function must be
+ called again with Z_FINISH and more output space (updated avail_out) but no
+ more input data, until it returns with Z_STREAM_END or an error. After
+ deflate has returned Z_STREAM_END, the only possible operations on the
+ stream are deflateReset or deflateEnd.
+
+ Z_FINISH can be used immediately after deflateInit if all the compression
+ is to be done in a single step. In this case, avail_out must be at least
+ 0.1% larger than avail_in plus 12 bytes. If deflate does not return
+ Z_STREAM_END, then it must be called again as described above.
+
+ deflate() sets strm->adler to the adler32 checksum of all input read
+ so far (that is, total_in bytes).
+
+ deflate() may update data_type if it can make a good guess about
+ the input data type (Z_ASCII or Z_BINARY). In doubt, the data is considered
+ binary. This field is only for information purposes and does not affect
+ the compression algorithm in any manner.
+
+ deflate() returns Z_OK if some progress has been made (more input
+ processed or more output produced), Z_STREAM_END if all input has been
+ consumed and all output has been produced (only when flush is set to
+ Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
+ if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible
+ (for example avail_in or avail_out was zero).
+*/
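+
+/*
+  Example (sketch; the src/dst buffers are placeholders, and a
+  zlib_deflateInit() convenience wrapper as in the upstream header is
+  assumed): single-shot compression with a preallocated workspace and
+  Z_FINISH, as described above.
+
+	z_stream strm = { NULL };
+	int ret;
+
+	strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS,
+							    MAX_MEM_LEVEL));
+	ret = zlib_deflateInit(&strm, Z_DEFAULT_COMPRESSION);
+
+	strm.next_in	= src;
+	strm.avail_in	= src_len;
+	strm.next_out	= dst;
+	strm.avail_out	= dst_len;	// > src_len + src_len/1000 + 12
+
+	ret = zlib_deflate(&strm, Z_FINISH);	// expect Z_STREAM_END
+	zlib_deflateEnd(&strm);
+	vfree(strm.workspace);
+*/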
+
+
+extern int zlib_deflateEnd (z_streamp strm);
+/*
+ All dynamically allocated data structures for this stream are freed.
+ This function discards any unprocessed input and does not flush any
+ pending output.
+
+ deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
+ stream state was inconsistent, Z_DATA_ERROR if the stream was freed
+ prematurely (some input or output was discarded). In the error case,
+ msg may be set but then points to a static string (which must not be
+ deallocated).
+*/
+
+
+extern int zlib_inflate_workspacesize (void);
+/*
+ Returns the number of bytes that needs to be allocated for a per-
+ stream workspace. A pointer to this number of bytes should be
+ returned in stream->workspace before calling zlib_inflateInit().
+*/
+
+/*
+extern int zlib_inflateInit (z_streamp strm);
+
+ Initializes the internal stream state for decompression. The fields
+ next_in, avail_in, and workspace must be initialized before by
+ the caller. If next_in is not NULL and avail_in is large enough (the exact
+ value depends on the compression method), inflateInit determines the
+ compression method from the zlib header and allocates all data structures
+ accordingly; otherwise the allocation will be deferred to the first call of
+ inflate. If zalloc and zfree are set to NULL, inflateInit updates them to
+ use default allocation functions.
+
+ inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+ version assumed by the caller. msg is set to null if there is no error
+ message. inflateInit does not perform any decompression apart from reading
+ the zlib header if present: this will be done by inflate(). (So next_in and
+ avail_in may be modified, but next_out and avail_out are unchanged.)
+*/
+
+
+extern int zlib_inflate (z_streamp strm, int flush);
+/*
+ inflate decompresses as much data as possible, and stops when the input
+ buffer becomes empty or the output buffer becomes full. It may introduce
+ some output latency (reading input without producing any output) except when
+ forced to flush.
+
+ The detailed semantics are as follows. inflate performs one or both of the
+ following actions:
+
+ - Decompress more input starting at next_in and update next_in and avail_in
+ accordingly. If not all input can be processed (because there is not
+ enough room in the output buffer), next_in is updated and processing
+ will resume at this point for the next call of inflate().
+
+ - Provide more output starting at next_out and update next_out and avail_out
+ accordingly. inflate() provides as much output as possible, until there
+ is no more input data or no more space in the output buffer (see below
+ about the flush parameter).
+
+ Before the call of inflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming
+ more output, and updating the next_* and avail_* values accordingly.
+ The application can consume the uncompressed output when it wants, for
+ example when the output buffer is full (avail_out == 0), or after each
+ call of inflate(). If inflate returns Z_OK and with zero avail_out, it
+ must be called again after making room in the output buffer because there
+ might be more output pending.
+
+ The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH,
+ Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much
+ output as possible to the output buffer. Z_BLOCK requests that inflate() stop
+ if and when it gets to the next deflate block boundary. When decoding the
+ zlib or gzip format, this will cause inflate() to return immediately after
+ the header and before the first block. When doing a raw inflate, inflate()
+ will go ahead and process the first block, and will return when it gets to
+ the end of that block, or when it runs out of data.
+
+ The Z_BLOCK option assists in appending to or combining deflate streams.
+ Also to assist in this, on return inflate() will set strm->data_type to the
+ number of unused bits in the last byte taken from strm->next_in, plus 64
+ if inflate() is currently decoding the last block in the deflate stream,
+ plus 128 if inflate() returned immediately after decoding an end-of-block
+ code or decoding the complete header up to just before the first byte of the
+ deflate stream. The end-of-block will not be indicated until all of the
+ uncompressed data from that block has been written to strm->next_out. The
+ number of unused bits may in general be greater than seven, except when
+ bit 7 of data_type is set, in which case the number of unused bits will be
+ less than eight.
+
+ inflate() should normally be called until it returns Z_STREAM_END or an
+ error. However if all decompression is to be performed in a single step
+ (a single call of inflate), the parameter flush should be set to
+ Z_FINISH. In this case all pending input is processed and all pending
+ output is flushed; avail_out must be large enough to hold all the
+ uncompressed data. (The size of the uncompressed data may have been saved
+ by the compressor for this purpose.) The next operation on this stream must
+ be inflateEnd to deallocate the decompression state. The use of Z_FINISH
+ is never required, but can be used to inform inflate that a faster approach
+ may be used for the single inflate() call.
+
+ In this implementation, inflate() always flushes as much output as
+ possible to the output buffer, and always uses the faster approach on the
+ first call. So the only effect of the flush parameter in this implementation
+ is on the return value of inflate(), as noted below, or when it returns early
+ because Z_BLOCK is used.
+
+ If a preset dictionary is needed after this call (see inflateSetDictionary
+ below), inflate sets strm->adler to the adler32 checksum of the dictionary
+ chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
+ strm->adler to the adler32 checksum of all output produced so far (that is,
+ total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
+ below. At the end of the stream, inflate() checks that its computed adler32
+ checksum is equal to that saved by the compressor and returns Z_STREAM_END
+ only if the checksum is correct.
+
+ inflate() will decompress and check either zlib-wrapped or gzip-wrapped
+ deflate data. The header type is detected automatically. Any information
+ contained in the gzip header is not retained, so applications that need that
+ information should instead use raw inflate, see inflateInit2() below, or
+ inflateBack() and perform their own processing of the gzip header and
+ trailer.
+
+ inflate() returns Z_OK if some progress has been made (more input processed
+ or more output produced), Z_STREAM_END if the end of the compressed data has
+ been reached and all uncompressed output has been produced, Z_NEED_DICT if a
+ preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
+ corrupted (input stream not conforming to the zlib format or incorrect check
+ value), Z_STREAM_ERROR if the stream structure was inconsistent (for example
+ if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory,
+ Z_BUF_ERROR if no progress is possible or if there was not enough room in the
+ output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and
+ inflate() can be called again with more input and more output space to
+ continue decompressing. If Z_DATA_ERROR is returned, the application may then
+ call inflateSync() to look for a good compression block if a partial recovery
+ of the data is desired.
+*/
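+
+/*
+   A minimal streaming decompression sketch (illustrative only; assumes
+   the stream has been set up as above and that fill_input() and
+   drain_output() are caller-provided helpers):
+
+     do {
+             if (strm.avail_in == 0)
+                     strm.avail_in = fill_input(&strm.next_in);
+             strm.next_out  = outbuf;
+             strm.avail_out = sizeof(outbuf);
+             err = zlib_inflate(&strm, Z_NO_FLUSH);
+             drain_output(outbuf, sizeof(outbuf) - strm.avail_out);
+     } while (err == Z_OK);
+
+     zlib_inflateEnd(&strm);
+     return err == Z_STREAM_END ? 0 : -EIO;
+*/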
+
+
+extern int zlib_inflateEnd (z_streamp strm);
+/*
+ All dynamically allocated data structures for this stream are freed.
+ This function discards any unprocessed input and does not flush any
+ pending output.
+
+ inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state
+ was inconsistent. In the error case, msg may be set but then points to a
+ static string (which must not be deallocated).
+*/
+
+ /* Advanced functions */
+
+/*
+ The following functions are needed only in some special applications.
+*/
+
+/*
+extern int deflateInit2 (z_streamp strm,
+ int level,
+ int method,
+ int windowBits,
+ int memLevel,
+ int strategy);
+
+ This is another version of deflateInit with more compression options. The
+ fields next_in, zalloc, zfree and opaque must be initialized before by
+ the caller.
+
+ The method parameter is the compression method. It must be Z_DEFLATED in
+ this version of the library.
+
+ The windowBits parameter is the base two logarithm of the window size
+ (the size of the history buffer). It should be in the range 8..15 for this
+ version of the library. Larger values of this parameter result in better
+ compression at the expense of memory usage. The default value is 15 if
+ deflateInit is used instead.
+
+ The memLevel parameter specifies how much memory should be allocated
+ for the internal compression state. memLevel=1 uses minimum memory but
+ is slow and reduces compression ratio; memLevel=9 uses maximum memory
+ for optimal speed. The default value is 8. See zconf.h for total memory
+ usage as a function of windowBits and memLevel.
+
+ The strategy parameter is used to tune the compression algorithm. Use the
+ value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+ filter (or predictor), or Z_HUFFMAN_ONLY to force Huffman encoding only (no
+ string match). Filtered data consists mostly of small values with a
+ somewhat random distribution. In this case, the compression algorithm is
+ tuned to compress them better. The effect of Z_FILTERED is to force more
+ Huffman coding and less string matching; it is somewhat intermediate
+ between Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. The strategy parameter only affects
+ the compression ratio but not the correctness of the compressed output even
+ if it is not set appropriately.
+
+ deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid
+ method). msg is set to null if there is no error message. deflateInit2 does
+ not perform any compression: this will be done by deflate().
+*/
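+
+/*
+   For reference, a fully spelled out call equivalent to
+   zlib_deflateInit(&strm, Z_DEFAULT_COMPRESSION) using the convenience
+   macro defined later in this header:
+
+     err = zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
+                             MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+*/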
+
+extern int zlib_deflateReset (z_streamp strm);
+/*
+ This function is equivalent to deflateEnd followed by deflateInit,
+ but does not free and reallocate all the internal compression state.
+ The stream will keep the same compression level and any other attributes
+ that may have been set by deflateInit2.
+
+ deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+static inline unsigned long deflateBound(unsigned long s)
+{
+ return s + ((s + 7) >> 3) + ((s + 63) >> 6) + 11;
+}
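+
+/*
+   deflateBound() gives a conservative upper bound on the compressed size
+   of s source bytes, so it can be used to size a single-shot output
+   buffer, for example:
+
+     dst_len = deflateBound(src_len);
+     dst     = kmalloc(dst_len, GFP_KERNEL);
+*/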
+
+/*
+extern int inflateInit2 (z_streamp strm, int windowBits);
+
+ This is another version of inflateInit with an extra parameter. The
+ fields next_in, avail_in, zalloc, zfree and opaque must be initialized
+ before by the caller.
+
+ The windowBits parameter is the base two logarithm of the maximum window
+ size (the size of the history buffer). It should be in the range 8..15 for
+ this version of the library. The default value is 15 if inflateInit is used
+ instead. windowBits must be greater than or equal to the windowBits value
+ provided to deflateInit2() while compressing, or it must be equal to 15 if
+ deflateInit2() was not used. If a compressed stream with a larger window
+ size is given as input, inflate() will return with the error code
+ Z_DATA_ERROR instead of trying to allocate a larger window.
+
+ windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
+ determines the window size. inflate() will then process raw deflate data,
+ not looking for a zlib or gzip header, not generating a check value, and not
+ looking for any check values for comparison at the end of the stream. This
+ is for use with other formats that use the deflate compressed data format
+ such as zip. Those formats provide their own check values. If a custom
+ format is developed using the raw deflate format for compressed data, it is
+ recommended that a check value such as an adler32 or a crc32 be applied to
+ the uncompressed data as is done in the zlib, gzip, and zip formats. For
+ most applications, the zlib format should be used as is. Note that comments
+ above on the use in deflateInit2() apply to the magnitude of windowBits.
+
+ windowBits can also be greater than 15 for optional gzip decoding. Add
+ 32 to windowBits to enable zlib and gzip decoding with automatic header
+ detection, or add 16 to decode only the gzip format (the zlib format will
+ return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is
+ a crc32 instead of an adler32.
+
+ inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg
+ is set to null if there is no error message. inflateInit2 does not perform
+ any decompression apart from reading the zlib header if present: this will
+ be done by inflate(). (So next_in and avail_in may be modified, but next_out
+ and avail_out are unchanged.)
+*/
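+
+/*
+   Example initializations for the windowBits variants described above
+   (illustrative only):
+
+     zlib_inflateInit2(&strm,  MAX_WBITS);       zlib format only
+     zlib_inflateInit2(&strm, -MAX_WBITS);       raw deflate data
+     zlib_inflateInit2(&strm, 16 + MAX_WBITS);   gzip format only
+     zlib_inflateInit2(&strm, 32 + MAX_WBITS);   zlib or gzip, autodetected
+*/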
+
+extern int zlib_inflateReset (z_streamp strm);
+/*
+ This function is equivalent to inflateEnd followed by inflateInit,
+ but does not free and reallocate all the internal decompression state.
+ The stream will keep attributes that may have been set by inflateInit2.
+
+ inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+extern int zlib_inflateIncomp (z_stream *strm);
+/*
+ This function adds the data at next_in (avail_in bytes) to the output
+ history without performing any output. There must be no pending output,
+ and the decompressor must be expecting to see the start of a block.
+ Calling this function is equivalent to decompressing a stored block
+ containing the data at next_in (except that the data is not output).
+*/
+
+#define zlib_deflateInit(strm, level) \
+ zlib_deflateInit2((strm), (level), Z_DEFLATED, MAX_WBITS, \
+ DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY)
+#define zlib_inflateInit(strm) \
+ zlib_inflateInit2((strm), DEF_WBITS)
+
+extern int zlib_deflateInit2(z_streamp strm, int level, int method,
+ int windowBits, int memLevel,
+ int strategy);
+extern int zlib_inflateInit2(z_streamp strm, int windowBits);
+
+#if !defined(_Z_UTIL_H) && !defined(NO_DUMMY_DECL)
+ struct internal_state {int dummy;}; /* hack for buggy compilers */
+#endif
+
+/* Utility function: initialize zlib, unpack binary blob, clean up zlib,
+ * return len or negative error code. */
+extern int zlib_inflate_blob(void *dst, unsigned dst_sz, const void *src, unsigned src_sz);
+
+#endif /* _ZLIB_H */
diff --git a/include/linux/zutil.h b/include/linux/zutil.h
new file mode 100644
index 0000000..8caa7d3
--- /dev/null
+++ b/include/linux/zutil.h
@@ -0,0 +1,108 @@
+/* zutil.h -- internal interface and configuration of the compression library
+ * Copyright (C) 1995-1998 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+ part of the implementation of the compression library and is
+ subject to change. Applications should only use zlib.h.
+ */
+
+/* @(#) $Id: zutil.h,v 1.1 2000/01/01 03:32:23 davem Exp $ */
+
+#ifndef _Z_UTIL_H
+#define _Z_UTIL_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <linux/zlib.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+
+typedef unsigned char uch;
+typedef unsigned short ush;
+typedef unsigned long ulg;
+
+ /* common constants */
+
+#define STORED_BLOCK 0
+#define STATIC_TREES 1
+#define DYN_TREES 2
+/* The three kinds of block type */
+
+#define MIN_MATCH 3
+#define MAX_MATCH 258
+/* The minimum and maximum match lengths */
+
+#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */
+
+ /* target dependencies */
+
+ /* Common defaults */
+
+#ifndef OS_CODE
+# define OS_CODE 0x03 /* assume Unix */
+#endif
+
+ /* functions */
+
+typedef uLong (*check_func) (uLong check, const Byte *buf,
+ uInt len);
+
+
+ /* checksum functions */
+
+#define BASE 65521L /* largest prime smaller than 65536 */
+#define NMAX 5552
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+
+#define DO1(buf,i) {s1 += buf[i]; s2 += s1;}
+#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1);
+#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2);
+#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
+#define DO16(buf) DO8(buf,0); DO8(buf,8);
+
+/* ========================================================================= */
+/*
+ Update a running Adler-32 checksum with the bytes buf[0..len-1] and
+ return the updated checksum. If buf is NULL, this function returns
+ the required initial value for the checksum.
+ An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
+ much faster. Usage example:
+
+ uLong adler = zlib_adler32(0L, NULL, 0);
+
+ while (read_buffer(buffer, length) != EOF) {
+ adler = zlib_adler32(adler, buffer, length);
+ }
+ if (adler != original_adler) error();
+*/
+static inline uLong zlib_adler32(uLong adler,
+ const Byte *buf,
+ uInt len)
+{
+ unsigned long s1 = adler & 0xffff;
+ unsigned long s2 = (adler >> 16) & 0xffff;
+ int k;
+
+ if (buf == NULL) return 1L;
+
+ while (len > 0) {
+ k = len < NMAX ? len : NMAX;
+ len -= k;
+ while (k >= 16) {
+ DO16(buf);
+ buf += 16;
+ k -= 16;
+ }
+ if (k != 0) do {
+ s1 += *buf++;
+ s2 += s1;
+ } while (--k);
+ s1 %= BASE;
+ s2 %= BASE;
+ }
+ return (s2 << 16) | s1;
+}
+
+#endif /* _Z_UTIL_H */
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/include/trace/define_trace.h
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
new file mode 100644
index 0000000..d4968c5
--- /dev/null
+++ b/include/trace/events/bcache.h
@@ -0,0 +1,1177 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bcache
+
+#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BCACHE_H
+
+#include <linux/tracepoint.h>
+
+struct bcache_device;
+struct bio;
+struct bkey;
+struct btree;
+struct cache;
+struct cache_set;
+struct keylist;
+struct moving_queue;
+
+DECLARE_EVENT_CLASS(bcache_request,
+ TP_PROTO(struct bcache_device *d, struct bio *bio),
+ TP_ARGS(d, bio),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(unsigned int, orig_major )
+ __field(unsigned int, orig_minor )
+ __field(sector_t, sector )
+ __field(sector_t, orig_sector )
+ __field(unsigned int, nr_sector )
+ __array(char, rwbs, 6 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->orig_major = d->disk->major;
+ __entry->orig_minor = d->disk->first_minor;
+ __entry->sector = bio->bi_iter.bi_sector;
+ __entry->orig_sector = bio->bi_iter.bi_sector - 16;
+ __entry->nr_sector = bio->bi_iter.bi_size >> 9;
+ blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
+ bio->bi_iter.bi_size);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, (unsigned long long)__entry->sector,
+ __entry->nr_sector, __entry->orig_major, __entry->orig_minor,
+ (unsigned long long)__entry->orig_sector)
+);
+
+DECLARE_EVENT_CLASS(bpos,
+ TP_PROTO(struct bpos p),
+ TP_ARGS(p),
+
+ TP_STRUCT__entry(
+ __field(u64, inode )
+ __field(u64, offset )
+ ),
+
+ TP_fast_assign(
+ __entry->inode = p.inode;
+ __entry->offset = p.offset;
+ ),
+
+ TP_printk("%llu:%llu", __entry->inode, __entry->offset)
+);
+
+DECLARE_EVENT_CLASS(bkey,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k),
+
+ TP_STRUCT__entry(
+ __field(u64, inode )
+ __field(u64, offset )
+ __field(u32, size )
+ ),
+
+ TP_fast_assign(
+ __entry->inode = k->p.inode;
+ __entry->offset = k->p.offset;
+ __entry->size = k->size;
+ ),
+
+ TP_printk("%llu:%llu len %u", __entry->inode,
+ __entry->offset, __entry->size)
+);
+
+/* request.c */
+
+DEFINE_EVENT(bcache_request, bcache_request_start,
+ TP_PROTO(struct bcache_device *d, struct bio *bio),
+ TP_ARGS(d, bio)
+);
+
+DEFINE_EVENT(bcache_request, bcache_request_end,
+ TP_PROTO(struct bcache_device *d, struct bio *bio),
+ TP_ARGS(d, bio)
+);
+
+DECLARE_EVENT_CLASS(bcache_bio,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(sector_t, sector )
+ __field(unsigned int, nr_sector )
+ __array(char, rwbs, 6 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_iter.bi_sector;
+ __entry->nr_sector = bio->bi_iter.bi_size >> 9;
+ blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
+ bio->bi_iter.bi_size);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ (unsigned long long)__entry->sector, __entry->nr_sector)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_bypass_sequential,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_bypass_congested,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_promote,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bkey, bcache_promote_collision,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k)
+);
+
+TRACE_EVENT(bcache_read,
+ TP_PROTO(struct bio *bio, bool hit, bool bypass),
+ TP_ARGS(bio, hit, bypass),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(sector_t, sector )
+ __field(unsigned int, nr_sector )
+ __array(char, rwbs, 6 )
+ __field(bool, cache_hit )
+ __field(bool, bypass )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_iter.bi_sector;
+ __entry->nr_sector = bio->bi_iter.bi_size >> 9;
+ blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
+ bio->bi_iter.bi_size);
+ __entry->cache_hit = hit;
+ __entry->bypass = bypass;
+ ),
+
+ TP_printk("%d,%d %s %llu + %u hit %u bypass %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, (unsigned long long)__entry->sector,
+ __entry->nr_sector, __entry->cache_hit, __entry->bypass)
+);
+
+TRACE_EVENT(bcache_write,
+ TP_PROTO(struct cache_set *c, u64 inode, struct bio *bio,
+ bool writeback, bool bypass),
+ TP_ARGS(c, inode, bio, writeback, bypass),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, inode )
+ __field(sector_t, sector )
+ __field(unsigned int, nr_sector )
+ __array(char, rwbs, 6 )
+ __field(bool, writeback )
+ __field(bool, bypass )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->inode = inode;
+ __entry->sector = bio->bi_iter.bi_sector;
+ __entry->nr_sector = bio->bi_iter.bi_size >> 9;
+ blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
+ bio->bi_iter.bi_size);
+ __entry->writeback = writeback;
+ __entry->bypass = bypass;
+ ),
+
+ TP_printk("%pU inode %llu %s %llu + %u hit %u bypass %u",
+ __entry->uuid, __entry->inode,
+ __entry->rwbs, (unsigned long long)__entry->sector,
+ __entry->nr_sector, __entry->writeback, __entry->bypass)
+);
+
+TRACE_EVENT(bcache_write_throttle,
+ TP_PROTO(struct cache_set *c, u64 inode, struct bio *bio, u64 delay),
+ TP_ARGS(c, inode, bio, delay),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, inode )
+ __field(sector_t, sector )
+ __field(unsigned int, nr_sector )
+ __array(char, rwbs, 6 )
+ __field(u64, delay )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->inode = inode;
+ __entry->sector = bio->bi_iter.bi_sector;
+ __entry->nr_sector = bio->bi_iter.bi_size >> 9;
+ blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
+ bio->bi_iter.bi_size);
+ __entry->delay = delay;
+ ),
+
+ TP_printk("%pU inode %llu %s %llu + %u delay %llu",
+ __entry->uuid, __entry->inode,
+ __entry->rwbs, (unsigned long long)__entry->sector,
+ __entry->nr_sector, __entry->delay)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_read_retry,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DECLARE_EVENT_CLASS(page_alloc_fail,
+ TP_PROTO(struct cache_set *c, u64 size),
+ TP_ARGS(c, size),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, size )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->size = size;
+ ),
+
+ TP_printk("%pU size %llu", __entry->uuid, __entry->size)
+);
+
+/* Journal */
+
+DECLARE_EVENT_CLASS(cache_set,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ ),
+
+ TP_printk("%pU", __entry->uuid)
+);
+
+DEFINE_EVENT(bkey, bcache_journal_replay_key,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k)
+);
+
+TRACE_EVENT(bcache_journal_next_bucket,
+ TP_PROTO(struct cache *ca, unsigned cur_idx, unsigned last_idx),
+ TP_ARGS(ca, cur_idx, last_idx),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(unsigned, cur_idx )
+ __field(unsigned, last_idx )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+ __entry->cur_idx = cur_idx;
+ __entry->last_idx = last_idx;
+ ),
+
+ TP_printk("%pU cur %u last %u", __entry->uuid,
+ __entry->cur_idx, __entry->last_idx)
+);
+
+TRACE_EVENT(bcache_journal_write_oldest,
+ TP_PROTO(struct cache_set *c, u64 seq),
+ TP_ARGS(c, seq),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, seq )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->seq = seq;
+ ),
+
+ TP_printk("%pU seq %llu", __entry->uuid, __entry->seq)
+);
+
+TRACE_EVENT(bcache_journal_write_oldest_done,
+ TP_PROTO(struct cache_set *c, u64 seq, unsigned written),
+ TP_ARGS(c, seq, written),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, seq )
+ __field(unsigned, written )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->seq = seq;
+ __entry->written = written;
+ ),
+
+ TP_printk("%pU seq %llu written %u", __entry->uuid, __entry->seq,
+ __entry->written)
+);
+
+DEFINE_EVENT(cache_set, bcache_journal_full,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(cache_set, bcache_journal_entry_full,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_journal_write,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+/* Device state changes */
+
+DEFINE_EVENT(cache_set, bcache_cache_set_read_only,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(cache_set, bcache_cache_set_read_only_done,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DECLARE_EVENT_CLASS(cache,
+ TP_PROTO(struct cache *ca),
+ TP_ARGS(ca),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(unsigned, tier )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+ __entry->tier = ca->mi.tier;
+ ),
+
+ TP_printk("%pU tier %u", __entry->uuid, __entry->tier)
+);
+
+DEFINE_EVENT(cache, bcache_cache_read_only,
+ TP_PROTO(struct cache *ca),
+ TP_ARGS(ca)
+);
+
+DEFINE_EVENT(cache, bcache_cache_read_only_done,
+ TP_PROTO(struct cache *ca),
+ TP_ARGS(ca)
+);
+
+DEFINE_EVENT(cache, bcache_cache_read_write,
+ TP_PROTO(struct cache *ca),
+ TP_ARGS(ca)
+);
+
+DEFINE_EVENT(cache, bcache_cache_read_write_done,
+ TP_PROTO(struct cache *ca),
+ TP_ARGS(ca)
+);
+
+/* Searching */
+
+DEFINE_EVENT(bpos, bkey_pack_pos_fail,
+ TP_PROTO(struct bpos p),
+ TP_ARGS(p)
+);
+
+/* Btree */
+
+DECLARE_EVENT_CLASS(btree_node,
+ TP_PROTO(struct cache_set *c, struct btree *b),
+ TP_ARGS(c, b),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, bucket )
+ __field(u8, level )
+ __field(u8, id )
+ __field(u32, inode )
+ __field(u64, offset )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
+ __entry->level = b->level;
+ __entry->id = b->btree_id;
+ __entry->inode = b->key.k.p.inode;
+ __entry->offset = b->key.k.p.offset;
+ ),
+
+ TP_printk("%pU bucket %llu(%u) id %u: %u:%llu",
+ __entry->uuid, __entry->bucket, __entry->level, __entry->id,
+ __entry->inode, __entry->offset)
+);
+
+DEFINE_EVENT(btree_node, bcache_btree_read,
+ TP_PROTO(struct cache_set *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(bcache_btree_write,
+ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
+ TP_ARGS(b, bytes, sectors),
+
+ TP_STRUCT__entry(
+ __field(enum bkey_type, type)
+ __field(unsigned, bytes )
+ __field(unsigned, sectors )
+ ),
+
+ TP_fast_assign(
+ __entry->type = btree_node_type(b);
+ __entry->bytes = bytes;
+ __entry->sectors = sectors;
+ ),
+
+ TP_printk("bkey type %u bytes %u sectors %u",
+ __entry->type, __entry->bytes, __entry->sectors)
+);
+
+DEFINE_EVENT(btree_node, bcache_btree_node_alloc,
+ TP_PROTO(struct cache_set *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(bcache_btree_node_alloc_fail,
+ TP_PROTO(struct cache_set *c, enum btree_id id),
+ TP_ARGS(c, id),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(enum btree_id, id )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->id = id;
+ ),
+
+ TP_printk("%pU id %u", __entry->uuid, __entry->id)
+);
+
+DEFINE_EVENT(btree_node, bcache_btree_node_free,
+ TP_PROTO(struct cache_set *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(bcache_mca_reap,
+ TP_PROTO(struct cache_set *c, struct btree *b, int ret),
+ TP_ARGS(c, b, ret),
+
+ TP_STRUCT__entry(
+ __field(u64, bucket )
+ __field(int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
+ __entry->ret = ret;
+ ),
+
+ TP_printk("bucket %llu ret %d", __entry->bucket, __entry->ret)
+);
+
+TRACE_EVENT(bcache_mca_scan,
+ TP_PROTO(struct cache_set *c, unsigned touched, unsigned freed,
+ unsigned can_free, unsigned long nr),
+ TP_ARGS(c, touched, freed, can_free, nr),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(unsigned long, touched )
+ __field(unsigned long, freed )
+ __field(unsigned long, can_free )
+ __field(unsigned long, nr )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->touched = touched;
+ __entry->freed = freed;
+ __entry->can_free = can_free;
+ __entry->nr = nr;
+ ),
+
+ TP_printk("%pU touched %lu freed %lu can_free %lu nr %lu",
+ __entry->uuid, __entry->touched, __entry->freed,
+ __entry->can_free, __entry->nr)
+);
+
+DECLARE_EVENT_CLASS(mca_cannibalize_lock,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ ),
+
+ TP_printk("%pU", __entry->uuid)
+);
+
+DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize_lock_fail,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize_lock,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(cache_set, bcache_mca_cannibalize_unlock,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+TRACE_EVENT(bcache_btree_insert_key,
+ TP_PROTO(struct cache_set *c, struct btree *b, struct bkey_i *k),
+ TP_ARGS(c, b, k),
+
+ TP_STRUCT__entry(
+ __field(u64, b_bucket )
+ __field(u64, b_offset )
+ __field(u64, offset )
+ __field(u32, b_inode )
+ __field(u32, inode )
+ __field(u32, size )
+ __field(u8, level )
+ __field(u8, id )
+ ),
+
+ TP_fast_assign(
+ __entry->b_bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
+ __entry->level = b->level;
+ __entry->id = b->btree_id;
+ __entry->b_inode = b->key.k.p.inode;
+ __entry->b_offset = b->key.k.p.offset;
+ __entry->inode = k->k.p.inode;
+ __entry->offset = k->k.p.offset;
+ __entry->size = k->k.size;
+ ),
+
+ TP_printk("bucket %llu(%u) id %u: %u:%llu %u:%llu len %u",
+ __entry->b_bucket, __entry->level, __entry->id,
+ __entry->b_inode, __entry->b_offset,
+ __entry->inode, __entry->offset, __entry->size)
+);
+
+DECLARE_EVENT_CLASS(btree_split,
+ TP_PROTO(struct cache_set *c, struct btree *b, unsigned keys),
+ TP_ARGS(c, b, keys),
+
+ TP_STRUCT__entry(
+ __field(u64, bucket )
+ __field(u8, level )
+ __field(u8, id )
+ __field(u32, inode )
+ __field(u64, offset )
+ __field(u32, keys )
+ ),
+
+ TP_fast_assign(
+ __entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
+ __entry->level = b->level;
+ __entry->id = b->btree_id;
+ __entry->inode = b->key.k.p.inode;
+ __entry->offset = b->key.k.p.offset;
+ __entry->keys = keys;
+ ),
+
+ TP_printk("bucket %llu(%u) id %u: %u:%llu keys %u",
+ __entry->bucket, __entry->level, __entry->id,
+ __entry->inode, __entry->offset, __entry->keys)
+);
+
+DEFINE_EVENT(btree_split, bcache_btree_node_split,
+ TP_PROTO(struct cache_set *c, struct btree *b, unsigned keys),
+ TP_ARGS(c, b, keys)
+);
+
+DEFINE_EVENT(btree_split, bcache_btree_node_compact,
+ TP_PROTO(struct cache_set *c, struct btree *b, unsigned keys),
+ TP_ARGS(c, b, keys)
+);
+
+DEFINE_EVENT(btree_node, bcache_btree_set_root,
+ TP_PROTO(struct cache_set *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+/* Garbage collection */
+
+TRACE_EVENT(bcache_btree_gc_coalesce,
+ TP_PROTO(struct cache_set *c, struct btree *b, unsigned nodes),
+ TP_ARGS(c, b, nodes),
+
+ TP_STRUCT__entry(
+ __field(u64, bucket )
+ __field(u8, level )
+ __field(u8, id )
+ __field(u32, inode )
+ __field(u64, offset )
+ __field(unsigned, nodes )
+ ),
+
+ TP_fast_assign(
+ __entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
+ __entry->level = b->level;
+ __entry->id = b->btree_id;
+ __entry->inode = b->key.k.p.inode;
+ __entry->offset = b->key.k.p.offset;
+ __entry->nodes = nodes;
+ ),
+
+ TP_printk("bucket %llu(%u) id %u: %u:%llu nodes %u",
+ __entry->bucket, __entry->level, __entry->id,
+ __entry->inode, __entry->offset, __entry->nodes)
+);
+
+TRACE_EVENT(bcache_btree_gc_coalesce_fail,
+ TP_PROTO(struct cache_set *c, int reason),
+ TP_ARGS(c, reason),
+
+ TP_STRUCT__entry(
+ __field(u8, reason )
+ __array(char, uuid, 16 )
+ ),
+
+ TP_fast_assign(
+ __entry->reason = reason;
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ ),
+
+ TP_printk("%pU: %u", __entry->uuid, __entry->reason)
+);
+
+TRACE_EVENT(bcache_btree_node_alloc_replacement,
+ TP_PROTO(struct cache_set *c, struct btree *old, struct btree *b),
+ TP_ARGS(c, old, b),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, bucket )
+ __field(u64, old_bucket )
+ __field(u8, level )
+ __field(u8, id )
+ __field(u32, inode )
+ __field(u64, offset )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->old_bucket = PTR_BUCKET_NR_TRACE(c,
+ &old->key, 0);
+ __entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
+ __entry->level = b->level;
+ __entry->id = b->btree_id;
+ __entry->inode = b->key.k.p.inode;
+ __entry->offset = b->key.k.p.offset;
+ ),
+
+ TP_printk("%pU for %llu bucket %llu(%u) id %u: %u:%llu",
+ __entry->uuid, __entry->old_bucket, __entry->bucket,
+ __entry->level, __entry->id,
+ __entry->inode, __entry->offset)
+);
+
+DEFINE_EVENT(btree_node, bcache_btree_gc_rewrite_node,
+ TP_PROTO(struct cache_set *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, bcache_btree_gc_rewrite_node_fail,
+ TP_PROTO(struct cache_set *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(cache_set, bcache_gc_start,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(cache_set, bcache_gc_end,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(cache_set, bcache_gc_coalesce_start,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(cache_set, bcache_gc_coalesce_end,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(cache, bcache_sectors_saturated,
+ TP_PROTO(struct cache *ca),
+ TP_ARGS(ca)
+);
+
+DEFINE_EVENT(cache_set, bcache_gc_sectors_saturated,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(cache_set, bcache_gc_cannot_inc_gens,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(cache_set, bcache_gc_periodic,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+TRACE_EVENT(bcache_mark_bucket,
+ TP_PROTO(struct cache *ca, const struct bkey *k,
+ const struct bch_extent_ptr *ptr,
+ int sectors, bool dirty),
+ TP_ARGS(ca, k, ptr, sectors, dirty),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u32, inode )
+ __field(u64, offset )
+ __field(u32, sectors )
+ __field(u64, bucket )
+ __field(bool, dirty )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+ __entry->inode = k->p.inode;
+ __entry->offset = k->p.offset;
+ __entry->sectors = sectors;
+ __entry->bucket = PTR_BUCKET_NR(ca, ptr);
+ __entry->dirty = dirty;
+ ),
+
+ TP_printk("%pU %u:%llu sectors %i bucket %llu dirty %i",
+ __entry->uuid, __entry->inode, __entry->offset,
+ __entry->sectors, __entry->bucket, __entry->dirty)
+);
+
+/* Allocator */
+
+TRACE_EVENT(bcache_alloc_batch,
+ TP_PROTO(struct cache *ca, size_t free, size_t total),
+ TP_ARGS(ca, free, total),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(size_t, free )
+ __field(size_t, total )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+ __entry->free = free;
+ __entry->total = total;
+ ),
+
+ TP_printk("%pU free %zu total %zu",
+ __entry->uuid, __entry->free, __entry->total)
+);
+
+TRACE_EVENT(bcache_btree_reserve_get_fail,
+ TP_PROTO(struct cache_set *c, size_t required, struct closure *cl),
+ TP_ARGS(c, required, cl),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(size_t, required )
+ __field(struct closure *, cl )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->required = required;
+ __entry->cl = cl;
+ ),
+
+ TP_printk("%pU required %zu by %p", __entry->uuid,
+ __entry->required, __entry->cl)
+);
+
+DEFINE_EVENT(cache, bcache_prio_write_start,
+ TP_PROTO(struct cache *ca),
+ TP_ARGS(ca)
+);
+
+DEFINE_EVENT(cache, bcache_prio_write_end,
+ TP_PROTO(struct cache *ca),
+ TP_ARGS(ca)
+);
+
+TRACE_EVENT(bcache_invalidate,
+ TP_PROTO(struct cache *ca, size_t bucket, unsigned sectors),
+ TP_ARGS(ca, bucket, sectors),
+
+ TP_STRUCT__entry(
+ __field(unsigned, sectors )
+ __field(dev_t, dev )
+ __field(__u64, offset )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = ca->disk_sb.bdev->bd_dev;
+ __entry->offset = bucket << ca->bucket_bits;
+ __entry->sectors = sectors;
+ ),
+
+ TP_printk("invalidated %u sectors at %d,%d sector=%llu",
+ __entry->sectors, MAJOR(__entry->dev),
+ MINOR(__entry->dev), __entry->offset)
+);
+
+DEFINE_EVENT(cache_set, bcache_rescale_prios,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DECLARE_EVENT_CLASS(cache_bucket_alloc,
+ TP_PROTO(struct cache *ca, enum alloc_reserve reserve),
+ TP_ARGS(ca, reserve),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16)
+ __field(enum alloc_reserve, reserve )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+ __entry->reserve = reserve;
+ ),
+
+ TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve)
+);
+
+DEFINE_EVENT(cache_bucket_alloc, bcache_bucket_alloc,
+ TP_PROTO(struct cache *ca, enum alloc_reserve reserve),
+ TP_ARGS(ca, reserve)
+);
+
+DEFINE_EVENT(cache_bucket_alloc, bcache_bucket_alloc_fail,
+ TP_PROTO(struct cache *ca, enum alloc_reserve reserve),
+ TP_ARGS(ca, reserve)
+);
+
+DECLARE_EVENT_CLASS(cache_set_bucket_alloc,
+ TP_PROTO(struct cache_set *c, enum alloc_reserve reserve,
+ struct closure *cl),
+ TP_ARGS(c, reserve, cl),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(enum alloc_reserve, reserve )
+ __field(struct closure *, cl )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->reserve = reserve;
+ __entry->cl = cl;
+ ),
+
+ TP_printk("%pU reserve %d cl %p", __entry->uuid, __entry->reserve,
+ __entry->cl)
+);
+
+DEFINE_EVENT(cache_set_bucket_alloc, bcache_freelist_empty_fail,
+ TP_PROTO(struct cache_set *c, enum alloc_reserve reserve,
+ struct closure *cl),
+ TP_ARGS(c, reserve, cl)
+);
+
+DECLARE_EVENT_CLASS(open_bucket_alloc,
+ TP_PROTO(struct cache_set *c, struct closure *cl),
+ TP_ARGS(c, cl),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(struct closure *, cl )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->cl = cl;
+ ),
+
+ TP_printk("%pU cl %p",
+ __entry->uuid, __entry->cl)
+);
+
+DEFINE_EVENT(open_bucket_alloc, bcache_open_bucket_alloc,
+ TP_PROTO(struct cache_set *c, struct closure *cl),
+ TP_ARGS(c, cl)
+);
+
+DEFINE_EVENT(open_bucket_alloc, bcache_open_bucket_alloc_fail,
+ TP_PROTO(struct cache_set *c, struct closure *cl),
+ TP_ARGS(c, cl)
+);
+
+/* Keylists */
+
+TRACE_EVENT(bcache_keyscan,
+ TP_PROTO(unsigned nr_found,
+ unsigned start_inode, u64 start_offset,
+ unsigned end_inode, u64 end_offset),
+ TP_ARGS(nr_found,
+ start_inode, start_offset,
+ end_inode, end_offset),
+
+ TP_STRUCT__entry(
+ __field(__u32, nr_found )
+ __field(__u32, start_inode )
+ __field(__u64, start_offset )
+ __field(__u32, end_inode )
+ __field(__u64, end_offset )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_found = nr_found;
+ __entry->start_inode = start_inode;
+ __entry->start_offset = start_offset;
+ __entry->end_inode = end_inode;
+ __entry->end_offset = end_offset;
+ ),
+
+ TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found,
+ __entry->start_inode, __entry->start_offset,
+ __entry->end_inode, __entry->end_offset)
+);
+
+/* Moving IO */
+
+DECLARE_EVENT_CLASS(moving_io,
+ TP_PROTO(struct bkey *k),
+ TP_ARGS(k),
+
+ TP_STRUCT__entry(
+ __field(__u32, inode )
+ __field(__u64, offset )
+ __field(__u32, sectors )
+ ),
+
+ TP_fast_assign(
+ __entry->inode = k->p.inode;
+ __entry->offset = k->p.offset;
+ __entry->sectors = k->size;
+ ),
+
+ TP_printk("%u:%llu sectors %u",
+ __entry->inode, __entry->offset, __entry->sectors)
+);
+
+DEFINE_EVENT(moving_io, bcache_move_read,
+ TP_PROTO(struct bkey *k),
+ TP_ARGS(k)
+);
+
+DEFINE_EVENT(moving_io, bcache_move_read_done,
+ TP_PROTO(struct bkey *k),
+ TP_ARGS(k)
+);
+
+DEFINE_EVENT(moving_io, bcache_move_write,
+ TP_PROTO(struct bkey *k),
+ TP_ARGS(k)
+);
+
+DEFINE_EVENT(moving_io, bcache_move_write_done,
+ TP_PROTO(struct bkey *k),
+ TP_ARGS(k)
+);
+
+DEFINE_EVENT(moving_io, bcache_copy_collision,
+ TP_PROTO(struct bkey *k),
+ TP_ARGS(k)
+);
+
+/* Copy GC */
+
+DEFINE_EVENT(page_alloc_fail, bcache_moving_gc_alloc_fail,
+ TP_PROTO(struct cache_set *c, u64 size),
+ TP_ARGS(c, size)
+);
+
+DEFINE_EVENT(cache, bcache_moving_gc_start,
+ TP_PROTO(struct cache *ca),
+ TP_ARGS(ca)
+);
+
+TRACE_EVENT(bcache_moving_gc_end,
+ TP_PROTO(struct cache *ca, u64 sectors_moved, u64 keys_moved,
+ u64 buckets_moved),
+ TP_ARGS(ca, sectors_moved, keys_moved, buckets_moved),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, sectors_moved )
+ __field(u64, keys_moved )
+ __field(u64, buckets_moved )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+ __entry->sectors_moved = sectors_moved;
+ __entry->keys_moved = keys_moved;
+ __entry->buckets_moved = buckets_moved;
+ ),
+
+ TP_printk("%pU sectors_moved %llu keys_moved %llu buckets_moved %llu",
+ __entry->uuid, __entry->sectors_moved, __entry->keys_moved,
+ __entry->buckets_moved)
+);
+
+DEFINE_EVENT(cache, bcache_moving_gc_reserve_empty,
+ TP_PROTO(struct cache *ca),
+ TP_ARGS(ca)
+);
+
+DEFINE_EVENT(cache, bcache_moving_gc_no_work,
+ TP_PROTO(struct cache *ca),
+ TP_ARGS(ca)
+);
+
+DEFINE_EVENT(bkey, bcache_gc_copy,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k)
+);
+
+/* Tiering */
+
+DEFINE_EVENT(cache_set, bcache_tiering_refill_start,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(cache_set, bcache_tiering_refill_end,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(page_alloc_fail, bcache_tiering_alloc_fail,
+ TP_PROTO(struct cache_set *c, u64 size),
+ TP_ARGS(c, size)
+);
+
+DEFINE_EVENT(cache_set, bcache_tiering_start,
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
+);
+
+TRACE_EVENT(bcache_tiering_end,
+ TP_PROTO(struct cache_set *c, u64 sectors_moved,
+ u64 keys_moved),
+ TP_ARGS(c, sectors_moved, keys_moved),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, sectors_moved )
+ __field(u64, keys_moved )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+ __entry->sectors_moved = sectors_moved;
+ __entry->keys_moved = keys_moved;
+ ),
+
+ TP_printk("%pU sectors_moved %llu keys_moved %llu",
+ __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
+);
+
+DEFINE_EVENT(bkey, bcache_tiering_copy,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k)
+);
+
+/* Background writeback */
+
+DEFINE_EVENT(bkey, bcache_writeback,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k)
+);
+
+DEFINE_EVENT(bkey, bcache_writeback_collision,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k)
+);
+
+TRACE_EVENT(bcache_writeback_error,
+ TP_PROTO(struct bkey *k, bool write, int error),
+ TP_ARGS(k, write, error),
+
+ TP_STRUCT__entry(
+ __field(u32, size )
+ __field(u32, inode )
+ __field(u64, offset )
+ __field(bool, write )
+ __field(int, error )
+ ),
+
+ TP_fast_assign(
+ __entry->inode = k->p.inode;
+ __entry->offset = k->p.offset;
+ __entry->size = k->size;
+ __entry->write = write;
+ __entry->error = error;
+ ),
+
+ TP_printk("%u:%llu len %u %s error %d", __entry->inode,
+ __entry->offset, __entry->size,
+ __entry->write ? "write" : "read",
+ __entry->error)
+);
+
+DEFINE_EVENT(page_alloc_fail, bcache_writeback_alloc_fail,
+ TP_PROTO(struct cache_set *c, u64 size),
+ TP_ARGS(c, size)
+);
+
+#endif /* _TRACE_BCACHE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h
new file mode 100644
index 0000000..6e97810
--- /dev/null
+++ b/include/uapi/linux/uuid.h
@@ -0,0 +1,53 @@
+/*
+ * UUID/GUID definition
+ *
+ * Copyright (C) 2010, Intel Corp.
+ * Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_LINUX_UUID_H_
+#define _UAPI_LINUX_UUID_H_
+
+#include <asm/types.h>
+
+typedef struct {
+ __u8 b[16];
+} uuid_le;
+
+typedef struct {
+ __u8 b[16];
+} uuid_be;
+
+#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
+((uuid_le) \
+{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+ (b) & 0xff, ((b) >> 8) & 0xff, \
+ (c) & 0xff, ((c) >> 8) & 0xff, \
+ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
+
+#define UUID_BE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
+((uuid_be) \
+{{ ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, ((a) >> 8) & 0xff, (a) & 0xff, \
+ ((b) >> 8) & 0xff, (b) & 0xff, \
+ ((c) >> 8) & 0xff, (c) & 0xff, \
+ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
+
+#define NULL_UUID_LE \
+ UUID_LE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \
+ 0x00, 0x00, 0x00, 0x00)
+
+#define NULL_UUID_BE \
+ UUID_BE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \
+ 0x00, 0x00, 0x00, 0x00)
+
+
+#endif /* _UAPI_LINUX_UUID_H_ */
diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
new file mode 100644
index 0000000..1590c49
--- /dev/null
+++ b/include/uapi/linux/xattr.h
@@ -0,0 +1,77 @@
+/*
+ File: linux/xattr.h
+
+ Extended attributes handling.
+
+ Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
+ Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved.
+ Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
+*/
+
+#include <linux/libc-compat.h>
+
+#ifndef _UAPI_LINUX_XATTR_H
+#define _UAPI_LINUX_XATTR_H
+
+#if __UAPI_DEF_XATTR
+#define __USE_KERNEL_XATTR_DEFS
+
+#define XATTR_CREATE 0x1 /* set value, fail if attr already exists */
+#define XATTR_REPLACE 0x2 /* set value, fail if attr does not exist */
+#endif
+
+/* Namespaces */
+#define XATTR_OS2_PREFIX "os2."
+#define XATTR_OS2_PREFIX_LEN (sizeof(XATTR_OS2_PREFIX) - 1)
+
+#define XATTR_MAC_OSX_PREFIX "osx."
+#define XATTR_MAC_OSX_PREFIX_LEN (sizeof(XATTR_MAC_OSX_PREFIX) - 1)
+
+#define XATTR_BTRFS_PREFIX "btrfs."
+#define XATTR_BTRFS_PREFIX_LEN (sizeof(XATTR_BTRFS_PREFIX) - 1)
+
+#define XATTR_SECURITY_PREFIX "security."
+#define XATTR_SECURITY_PREFIX_LEN (sizeof(XATTR_SECURITY_PREFIX) - 1)
+
+#define XATTR_SYSTEM_PREFIX "system."
+#define XATTR_SYSTEM_PREFIX_LEN (sizeof(XATTR_SYSTEM_PREFIX) - 1)
+
+#define XATTR_TRUSTED_PREFIX "trusted."
+#define XATTR_TRUSTED_PREFIX_LEN (sizeof(XATTR_TRUSTED_PREFIX) - 1)
+
+#define XATTR_USER_PREFIX "user."
+#define XATTR_USER_PREFIX_LEN (sizeof(XATTR_USER_PREFIX) - 1)
+
+/* Security namespace */
+#define XATTR_EVM_SUFFIX "evm"
+#define XATTR_NAME_EVM XATTR_SECURITY_PREFIX XATTR_EVM_SUFFIX
+
+#define XATTR_IMA_SUFFIX "ima"
+#define XATTR_NAME_IMA XATTR_SECURITY_PREFIX XATTR_IMA_SUFFIX
+
+#define XATTR_SELINUX_SUFFIX "selinux"
+#define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX
+
+#define XATTR_SMACK_SUFFIX "SMACK64"
+#define XATTR_SMACK_IPIN "SMACK64IPIN"
+#define XATTR_SMACK_IPOUT "SMACK64IPOUT"
+#define XATTR_SMACK_EXEC "SMACK64EXEC"
+#define XATTR_SMACK_TRANSMUTE "SMACK64TRANSMUTE"
+#define XATTR_SMACK_MMAP "SMACK64MMAP"
+#define XATTR_NAME_SMACK XATTR_SECURITY_PREFIX XATTR_SMACK_SUFFIX
+#define XATTR_NAME_SMACKIPIN XATTR_SECURITY_PREFIX XATTR_SMACK_IPIN
+#define XATTR_NAME_SMACKIPOUT XATTR_SECURITY_PREFIX XATTR_SMACK_IPOUT
+#define XATTR_NAME_SMACKEXEC XATTR_SECURITY_PREFIX XATTR_SMACK_EXEC
+#define XATTR_NAME_SMACKTRANSMUTE XATTR_SECURITY_PREFIX XATTR_SMACK_TRANSMUTE
+#define XATTR_NAME_SMACKMMAP XATTR_SECURITY_PREFIX XATTR_SMACK_MMAP
+
+#define XATTR_CAPS_SUFFIX "capability"
+#define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
+
+#define XATTR_POSIX_ACL_ACCESS "posix_acl_access"
+#define XATTR_NAME_POSIX_ACL_ACCESS XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS
+#define XATTR_POSIX_ACL_DEFAULT "posix_acl_default"
+#define XATTR_NAME_POSIX_ACL_DEFAULT XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_DEFAULT
+
+
+#endif /* _UAPI_LINUX_XATTR_H */
diff --git a/libbcache.c b/libbcache.c
index 5b2cac5..081fd42 100644
--- a/libbcache.c
+++ b/libbcache.c
@@ -11,10 +11,9 @@
#include <uuid/uuid.h>
-#include "ccan/ilog/ilog.h"
-
-#include "bcache-ondisk.h"
+#include "linux/bcache.h"
#include "libbcache.h"
+#include "checksum.h"
const char * const cache_state[] = {
"active",
@@ -154,7 +153,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
i->bucket_size = max(block_size, 256U);
if (i->size >= min_size(i->bucket_size)) {
- unsigned scale = max(1U,
+ unsigned scale = max(1,
ilog2(i->size / min_size(i->bucket_size)) / 4);
scale = rounddown_pow_of_two(scale);
@@ -216,8 +215,8 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
* crc64
*/
SET_CACHE_SB_CSUM_TYPE(sb, BCH_CSUM_CRC64);
- SET_CACHE_SET_META_CSUM_TYPE(sb, meta_csum_type);
- SET_CACHE_SET_DATA_CSUM_TYPE(sb, data_csum_type);
+ SET_CACHE_SET_META_PREFERRED_CSUM_TYPE(sb, meta_csum_type);
+ SET_CACHE_SET_DATA_PREFERRED_CSUM_TYPE(sb, data_csum_type);
SET_CACHE_SET_COMPRESSION_TYPE(sb, compression_type);
SET_CACHE_SET_BTREE_NODE_SIZE(sb, btree_node_size);
@@ -313,12 +312,12 @@ void bcache_super_print(struct cache_sb *sb, int units)
CACHE_SET_DATA_REPLICAS_HAVE(sb),
CACHE_SET_DATA_REPLICAS_WANT(sb),
- CACHE_SET_META_CSUM_TYPE(sb) < BCH_CSUM_NR
- ? csum_types[CACHE_SET_META_CSUM_TYPE(sb)]
+ CACHE_SET_META_PREFERRED_CSUM_TYPE(sb) < BCH_CSUM_NR
+ ? csum_types[CACHE_SET_META_PREFERRED_CSUM_TYPE(sb)]
: "unknown",
- CACHE_SET_DATA_CSUM_TYPE(sb) < BCH_CSUM_NR
- ? csum_types[CACHE_SET_DATA_CSUM_TYPE(sb)]
+ CACHE_SET_DATA_PREFERRED_CSUM_TYPE(sb) < BCH_CSUM_NR
+ ? csum_types[CACHE_SET_DATA_PREFERRED_CSUM_TYPE(sb)]
: "unknown",
CACHE_SET_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_NR
diff --git a/libbcache.h b/libbcache.h
index 9dfc647..e4600d3 100644
--- a/libbcache.h
+++ b/libbcache.h
@@ -1,7 +1,7 @@
#ifndef _LIBBCACHE_H
#define _LIBBCACHE_H
-#include "util.h"
+#include "tools-util.h"
#include "stdbool.h"
extern const char * const cache_state[];
diff --git a/libbcache/acl.c b/libbcache/acl.c
new file mode 100644
index 0000000..64d5616
--- /dev/null
+++ b/libbcache/acl.c
@@ -0,0 +1,225 @@
+#include "bcache.h"
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *bch_acl_from_disk(const void *value, size_t size)
+{
+ const char *end = (char *)value + size;
+ int n, count;
+ struct posix_acl *acl;
+
+ if (!value)
+ return NULL;
+ if (size < sizeof(bch_acl_header))
+ return ERR_PTR(-EINVAL);
+ if (((bch_acl_header *)value)->a_version !=
+ cpu_to_le32(BCH_ACL_VERSION))
+ return ERR_PTR(-EINVAL);
+ value = (char *)value + sizeof(bch_acl_header);
+ count = bch_acl_count(size);
+ if (count < 0)
+ return ERR_PTR(-EINVAL);
+ if (count == 0)
+ return NULL;
+ acl = posix_acl_alloc(count, GFP_KERNEL);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+ for (n = 0; n < count; n++) {
+ bch_acl_entry *entry =
+ (bch_acl_entry *)value;
+ if ((char *)value + sizeof(bch_acl_entry_short) > end)
+ goto fail;
+ acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
+ acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+ switch (acl->a_entries[n].e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ value = (char *)value +
+ sizeof(bch_acl_entry_short);
+ break;
+
+ case ACL_USER:
+ value = (char *)value + sizeof(bch_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ case ACL_GROUP:
+ value = (char *)value + sizeof(bch_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+
+ default:
+ goto fail;
+ }
+ }
+ if (value != end)
+ goto fail;
+ return acl;
+
+fail:
+ posix_acl_release(acl);
+ return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static void *bch_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+ bch_acl_header *ext_acl;
+ char *e;
+ size_t n;
+
+ *size = bch_acl_size(acl->a_count);
+ ext_acl = kmalloc(sizeof(bch_acl_header) + acl->a_count *
+ sizeof(bch_acl_entry), GFP_KERNEL);
+ if (!ext_acl)
+ return ERR_PTR(-ENOMEM);
+ ext_acl->a_version = cpu_to_le32(BCH_ACL_VERSION);
+ e = (char *)ext_acl + sizeof(bch_acl_header);
+ for (n = 0; n < acl->a_count; n++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+ bch_acl_entry *entry = (bch_acl_entry *)e;
+
+ entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
+ e += sizeof(bch_acl_entry);
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
+ e += sizeof(bch_acl_entry);
+ break;
+
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ e += sizeof(bch_acl_entry_short);
+ break;
+
+ default:
+ goto fail;
+ }
+ }
+ return (char *)ext_acl;
+
+fail:
+ kfree(ext_acl);
+ return ERR_PTR(-EINVAL);
+}
+
+struct posix_acl *bch_get_acl(struct inode *inode, int type)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int name_index;
+ char *value = NULL;
+ struct posix_acl *acl;
+ int ret;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+ ret = bch_xattr_get(c, inode, "", NULL, 0, name_index);
+ if (ret > 0) {
+ value = kmalloc(ret, GFP_KERNEL);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ ret = bch_xattr_get(c, inode, "", value,
+ ret, name_index);
+ }
+ if (ret > 0)
+ acl = bch_acl_from_disk(value, ret);
+ else if (ret == -ENODATA || ret == -ENOSYS)
+ acl = NULL;
+ else
+ acl = ERR_PTR(ret);
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
+
+ return acl;
+}
+
+int bch_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int name_index;
+ void *value = NULL;
+ size_t size = 0;
+ int ret;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ ret = posix_acl_equiv_mode(acl, &inode->i_mode);
+ if (ret < 0)
+ return ret;
+ else {
+ inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+ if (ret == 0)
+ acl = NULL;
+ }
+ }
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ value = bch_acl_to_disk(acl, &size);
+ if (IS_ERR(value))
+ return (int)PTR_ERR(value);
+ }
+
+ ret = bch_xattr_set(c, inode, "", value, size, 0, name_index);
+
+ kfree(value);
+
+ if (ret == -ERANGE)
+ ret = -E2BIG;
+
+ if (!ret)
+ set_cached_acl(inode, type, acl);
+
+ return ret;
+}
diff --git a/libbcache/acl.h b/libbcache/acl.h
new file mode 100644
index 0000000..079e568
--- /dev/null
+++ b/libbcache/acl.h
@@ -0,0 +1,56 @@
+/*
+ File: fs/bch/acl.h
+
+ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/posix_acl_xattr.h>
+
+#define BCH_ACL_VERSION 0x0001
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+} bch_acl_entry;
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+} bch_acl_entry_short;
+
+typedef struct {
+ __le32 a_version;
+} bch_acl_header;
+
+static inline size_t bch_acl_size(int count)
+{
+ if (count <= 4) {
+ return sizeof(bch_acl_header) +
+ count * sizeof(bch_acl_entry_short);
+ } else {
+ return sizeof(bch_acl_header) +
+ 4 * sizeof(bch_acl_entry_short) +
+ (count - 4) * sizeof(bch_acl_entry);
+ }
+}
+
+static inline int bch_acl_count(size_t size)
+{
+ ssize_t s;
+
+ size -= sizeof(bch_acl_header);
+ s = size - 4 * sizeof(bch_acl_entry_short);
+ if (s < 0) {
+ if (size % sizeof(bch_acl_entry_short))
+ return -1;
+ return size / sizeof(bch_acl_entry_short);
+ } else {
+ if (s % sizeof(bch_acl_entry))
+ return -1;
+ return s / sizeof(bch_acl_entry) + 4;
+ }
+}
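+
+/*
+ * Example of the size <-> count round trip, assuming a 4-byte header,
+ * 4-byte short entries and 8-byte full entries:
+ *
+ *   bch_acl_size(6)   = 4 + 4 * 4 + 2 * 8 = 36
+ *   bch_acl_count(36) = (36 - 4 - 4 * 4) / 8 + 4 = 6
+ */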
+
+extern struct posix_acl *bch_get_acl(struct inode *, int);
+extern int bch_set_acl(struct inode *, struct posix_acl *, int);
diff --git a/libbcache/alloc.c b/libbcache/alloc.c
new file mode 100644
index 0000000..cff750c
--- /dev/null
+++ b/libbcache/alloc.c
@@ -0,0 +1,1861 @@
+/*
+ * Primary bucket allocation code
+ *
+ * Copyright 2012 Google, Inc.
+ *
+ * Allocation in bcache is done in terms of buckets:
+ *
+ * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
+ * btree pointers - they must match for the pointer to be considered valid.
+ *
+ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
+ * bucket simply by incrementing its gen.
+ *
+ * The gens (along with the priorities; it's really the gens are important but
+ * the code is named as if it's the priorities) are written in an arbitrary list
+ * of buckets on disk, with a pointer to them in the journal header.
+ *
+ * When we invalidate a bucket, we have to write its new gen to disk and wait
+ * for that write to complete before we use it - otherwise after a crash we
+ * could have pointers that appeared to be good but pointed to data that had
+ * been overwritten.
+ *
+ * Since the gens and priorities are all stored contiguously on disk, we can
+ * batch this up: We fill up the free_inc list with freshly invalidated buckets,
+ * call prio_write(), and when prio_write() finishes we pull buckets off the
+ * free_inc list and optionally discard them.
+ *
+ * free_inc isn't the only freelist - if it was, we'd often have to sleep while
+ * priorities and gens were being written before we could allocate. c->free is a
+ * smaller freelist, and buckets on that list are always ready to be used.
+ *
+ * If we've got discards enabled, that happens when a bucket moves from the
+ * free_inc list to the free list.
+ *
+ * It's important to ensure that gens don't wrap around - with respect to
+ * either the oldest gen in the btree or the gen on disk. This is quite
+ * difficult to do in practice, but we explicitly guard against it anyways - if
+ * a bucket is in danger of wrapping around we simply skip invalidating it that
+ * time around, and we garbage collect or rewrite the priorities sooner than we
+ * would have otherwise.
+ *
+ * bch_bucket_alloc() allocates a single bucket from a specific cache.
+ *
+ * bch_bucket_alloc_set() allocates one or more buckets from different caches
+ * out of a cache set.
+ *
+ * invalidate_buckets() drives all the processes described above. It's called
+ * from bch_bucket_alloc() and a few other places that need to make sure free
+ * buckets are ready.
+ *
+ * invalidate_buckets_(lru|fifo)() find buckets that are available to be
+ * invalidated, and then invalidate them and stick them on the free_inc list -
+ * in either lru or fifo order.
+ */
+
+#include "bcache.h"
+#include "alloc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "super.h"
+
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <trace/events/bcache.h>
+
+static size_t bch_bucket_alloc(struct cache *, enum alloc_reserve);
+static void __bch_bucket_free(struct cache *, struct bucket *);
+
+/* Allocation groups: */
+
+void bch_cache_group_remove_cache(struct cache_group *grp, struct cache *ca)
+{
+ unsigned i;
+
+ spin_lock(&grp->lock);
+
+ for (i = 0; i < grp->nr_devices; i++)
+ if (rcu_access_pointer(grp->d[i].dev) == ca) {
+ grp->nr_devices--;
+ memmove(&grp->d[i],
+ &grp->d[i + 1],
+ (grp->nr_devices - i) * sizeof(grp->d[0]));
+ break;
+ }
+
+ spin_unlock(&grp->lock);
+}
+
+void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
+{
+ unsigned i;
+
+ spin_lock(&grp->lock);
+ for (i = 0; i < grp->nr_devices; i++)
+ if (rcu_access_pointer(grp->d[i].dev) == ca)
+ goto out;
+
+ BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET);
+
+ rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca);
+out:
+ spin_unlock(&grp->lock);
+}
+
+/* Ratelimiting/PD controllers */
+
+static void pd_controllers_update(struct work_struct *work)
+{
+ struct cache_set *c = container_of(to_delayed_work(work),
+ struct cache_set,
+ pd_controllers_update);
+ struct cache *ca;
+ unsigned iter;
+ int i;
+
+ /* All units are in bytes */
+ u64 tier_size[CACHE_TIERS];
+ u64 tier_free[CACHE_TIERS];
+ u64 tier_dirty[CACHE_TIERS];
+ u64 tier0_can_free = 0;
+
+ memset(tier_size, 0, sizeof(tier_size));
+ memset(tier_free, 0, sizeof(tier_free));
+ memset(tier_dirty, 0, sizeof(tier_dirty));
+
+ rcu_read_lock();
+ for (i = CACHE_TIERS - 1; i >= 0; --i)
+ group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
+ struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
+ unsigned bucket_bits = ca->bucket_bits + 9;
+
+ /*
+ * Bytes of internal fragmentation, which can be
+ * reclaimed by copy GC
+ */
+ s64 fragmented = ((stats.buckets_dirty +
+ stats.buckets_cached) <<
+ bucket_bits) -
+ ((stats.sectors_dirty +
+ stats.sectors_cached) << 9);
+
+ u64 dev_size = (ca->mi.nbuckets -
+ ca->mi.first_bucket) << bucket_bits;
+
+ u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
+
+ if (fragmented < 0)
+ fragmented = 0;
+
+ bch_pd_controller_update(&ca->moving_gc_pd,
+ free, fragmented, -1);
+
+ if (i == 0)
+ tier0_can_free += fragmented;
+
+ tier_size[i] += dev_size;
+ tier_free[i] += free;
+ tier_dirty[i] += stats.buckets_dirty << bucket_bits;
+ }
+ rcu_read_unlock();
+
+ if (tier_size[1]) {
+ u64 target = div_u64(tier_size[0] * c->tiering_percent, 100);
+
+ tier0_can_free = max_t(s64, 0, tier_dirty[0] - target);
+
+ bch_pd_controller_update(&c->tiering_pd,
+ target,
+ tier_dirty[0],
+ -1);
+ }
+
+ /*
+ * Throttle foreground writes if tier 0 is running out of free buckets,
+ * and either tiering or copygc can free up space (but don't take both
+ * into account).
+ *
+ * Target will be small if there isn't any work to do - we don't want to
+ * throttle foreground writes if we currently have all the free space
+ * we're ever going to have.
+ *
+ * Otherwise, if there's work to do, try to keep 20% of tier0 available
+ * for foreground writes.
+ */
+ bch_pd_controller_update(&c->foreground_write_pd,
+ min(tier0_can_free,
+ div_u64(tier_size[0] *
+ c->foreground_target_percent,
+ 100)),
+ tier_free[0],
+ -1);
+
+ schedule_delayed_work(&c->pd_controllers_update,
+ c->pd_controllers_update_seconds * HZ);
+}
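+
+/*
+ * Worked example of the arithmetic above, with made-up numbers: given a second
+ * tier, a 100GB tier 0 and c->tiering_percent = 40, the tiering target is
+ * 40GB; if tier 0 holds 55GB of dirty data, tier0_can_free = 15GB, and the
+ * foreground write controller's target becomes min(15GB,
+ * c->foreground_target_percent of 100GB).
+ */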
+
+/*
+ * Bucket priorities/gens:
+ *
+ * For each bucket, we store on disk its
+ * 8 bit gen
+ * 16 bit priority
+ *
+ * See alloc.c for an explanation of the gen. The priority is used to implement
+ * lru (and in the future other) cache replacement policies; for most purposes
+ * it's just an opaque integer.
+ *
+ * The gens and the priorities don't have a whole lot to do with each other, and
+ * it's actually the gens that must be written out at specific times - it's no
+ * big deal if the priorities don't get written, if we lose them we just reuse
+ * buckets in suboptimal order.
+ *
+ * On disk they're stored in a packed array, in as many buckets as are required
+ * to fit them all. The buckets we use to store them form a list; the journal
+ * header points to the first bucket, the first bucket points to the second
+ * bucket, et cetera.
+ *
+ * This code is used by the allocation code; periodically (whenever it runs out
+ * of buckets to allocate from) the allocation code will invalidate some
+ * buckets, but it can't use those buckets until their new gens are safely on
+ * disk.
+ */
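+
+/*
+ * Rough sketch of the arrangement described above (field order roughly as used
+ * by bch_prio_write() below):
+ *
+ *	journal header --> prio bucket 0 --> prio bucket 1 --> ...
+ *	each prio bucket: { csum, magic, next_bucket, data[prios_per_bucket] }
+ *	each data entry:  { read_prio, write_prio, gen }
+ */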
+
+static int prio_io(struct cache *ca, uint64_t bucket, int op)
+{
+ bio_init(ca->bio_prio);
+ bio_set_op_attrs(ca->bio_prio, op, REQ_SYNC|REQ_META);
+
+ ca->bio_prio->bi_max_vecs = bucket_pages(ca);
+ ca->bio_prio->bi_io_vec = ca->bio_prio->bi_inline_vecs;
+ ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size;
+ ca->bio_prio->bi_bdev = ca->disk_sb.bdev;
+ ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca);
+ bch_bio_map(ca->bio_prio, ca->disk_buckets);
+
+ return submit_bio_wait(ca->bio_prio);
+}
+
+static int bch_prio_write(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct journal *j = &c->journal;
+ struct journal_res res = { 0 };
+ bool need_new_journal_entry;
+ int i, ret;
+
+ trace_bcache_prio_write_start(ca);
+
+ atomic64_add(ca->mi.bucket_size * prio_buckets(ca),
+ &ca->meta_sectors_written);
+
+ for (i = prio_buckets(ca) - 1; i >= 0; --i) {
+ struct bucket *g;
+ struct prio_set *p = ca->disk_buckets;
+ struct bucket_disk *d = p->data;
+ struct bucket_disk *end = d + prios_per_bucket(ca);
+ size_t r;
+
+ for (r = i * prios_per_bucket(ca);
+ r < ca->mi.nbuckets && d < end;
+ r++, d++) {
+ g = ca->buckets + r;
+ d->read_prio = cpu_to_le16(g->read_prio);
+ d->write_prio = cpu_to_le16(g->write_prio);
+ d->gen = ca->buckets[r].mark.gen;
+ }
+
+ p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]);
+ p->magic = cpu_to_le64(pset_magic(&c->disk_sb));
+
+ SET_PSET_CSUM_TYPE(p, c->opts.metadata_checksum);
+ p->csum = cpu_to_le64(bch_checksum(PSET_CSUM_TYPE(p),
+ &p->magic,
+ bucket_bytes(ca) - 8));
+
+ spin_lock(&ca->prio_buckets_lock);
+ r = bch_bucket_alloc(ca, RESERVE_PRIO);
+ BUG_ON(!r);
+
+ /*
+ * this assignment goes here, before dropping prio_buckets_lock, to
+ * guard against the bucket getting gc'd from under us
+ */
+ ca->prio_buckets[i] = r;
+ bch_mark_metadata_bucket(ca, ca->buckets + r, false);
+ spin_unlock(&ca->prio_buckets_lock);
+
+ ret = prio_io(ca, r, REQ_OP_WRITE);
+ if (cache_fatal_io_err_on(ret, ca,
+ "prio write to bucket %zu", r) ||
+ bch_meta_write_fault("prio"))
+ return ret;
+ }
+
+ spin_lock(&j->lock);
+ j->prio_buckets[ca->sb.nr_this_dev] = cpu_to_le64(ca->prio_buckets[0]);
+ j->nr_prio_buckets = max_t(unsigned,
+ ca->sb.nr_this_dev + 1,
+ j->nr_prio_buckets);
+ spin_unlock(&j->lock);
+
+ do {
+ unsigned u64s = jset_u64s(0);
+
+ ret = bch_journal_res_get(j, &res, u64s, u64s);
+ if (ret)
+ return ret;
+
+ need_new_journal_entry = j->buf[res.idx].nr_prio_buckets <
+ ca->sb.nr_this_dev + 1;
+ bch_journal_res_put(j, &res);
+
+ ret = bch_journal_flush_seq(j, res.seq);
+ if (ret)
+ return ret;
+ } while (need_new_journal_entry);
+
+ /*
+ * Don't want the old priorities to get garbage collected until after we
+ * finish writing the new ones, and they're journalled
+ */
+
+ spin_lock(&ca->prio_buckets_lock);
+
+ for (i = 0; i < prio_buckets(ca); i++) {
+ if (ca->prio_last_buckets[i])
+ __bch_bucket_free(ca,
+ &ca->buckets[ca->prio_last_buckets[i]]);
+
+ ca->prio_last_buckets[i] = ca->prio_buckets[i];
+ }
+
+ spin_unlock(&ca->prio_buckets_lock);
+
+ trace_bcache_prio_write_end(ca);
+ return 0;
+}
+
+int bch_prio_read(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct prio_set *p = ca->disk_buckets;
+ struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
+ struct bucket_mark new;
+ unsigned bucket_nr = 0;
+ u64 bucket, expect, got;
+ size_t b;
+ int ret = 0;
+
+ spin_lock(&c->journal.lock);
+ bucket = le64_to_cpu(c->journal.prio_buckets[ca->sb.nr_this_dev]);
+ spin_unlock(&c->journal.lock);
+
+ /*
+ * If the device hasn't been used yet, there won't be a prio bucket ptr
+ */
+ if (!bucket)
+ return 0;
+
+ unfixable_fsck_err_on(bucket < ca->mi.first_bucket ||
+ bucket >= ca->mi.nbuckets, c,
+ "bad prio bucket %llu", bucket);
+
+ for (b = 0; b < ca->mi.nbuckets; b++, d++) {
+ if (d == end) {
+ ca->prio_last_buckets[bucket_nr] = bucket;
+ bucket_nr++;
+
+ ret = prio_io(ca, bucket, REQ_OP_READ);
+ if (cache_fatal_io_err_on(ret, ca,
+ "prior read from bucket %llu",
+ bucket) ||
+ bch_meta_read_fault("prio"))
+ return -EIO;
+
+ got = le64_to_cpu(p->magic);
+ expect = pset_magic(&c->disk_sb);
+ unfixable_fsck_err_on(got != expect, c,
+ "bad magic (got %llu expect %llu) while reading prios from bucket %llu",
+ got, expect, bucket);
+
+ got = le64_to_cpu(p->csum);
+ expect = bch_checksum(PSET_CSUM_TYPE(p),
+ &p->magic,
+ bucket_bytes(ca) - 8);
+ unfixable_fsck_err_on(got != expect, c,
+ "bad checksum (got %llu expect %llu) while reading prios from bucket %llu",
+ got, expect, bucket);
+
+ bucket = le64_to_cpu(p->next_bucket);
+ d = p->data;
+ }
+
+ ca->buckets[b].read_prio = le16_to_cpu(d->read_prio);
+ ca->buckets[b].write_prio = le16_to_cpu(d->write_prio);
+
+ bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen);
+ }
+fsck_err:
+ return 0;
+}
+
+#define BUCKET_GC_GEN_MAX 96U
+
+/**
+ * wait_buckets_available - wait on reclaimable buckets
+ *
+ * If there aren't enough available buckets to fill up free_inc, wait until
+ * there are.
+ */
+static int wait_buckets_available(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ int ret = 0;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop()) {
+ ret = -1;
+ break;
+ }
+
+ if (ca->inc_gen_needs_gc >= fifo_free(&ca->free_inc)) {
+ if (c->gc_thread) {
+ trace_bcache_gc_cannot_inc_gens(ca->set);
+ atomic_inc(&c->kick_gc);
+ wake_up_process(ca->set->gc_thread);
+ }
+
+ /*
+ * We are going to wait for GC to wake us up, even if
+ * bucket counters tell us enough buckets are available,
+ * because we are actually waiting for GC to rewrite
+ * nodes with stale pointers
+ */
+ } else if (buckets_available_cache(ca) >=
+ fifo_free(&ca->free_inc))
+ break;
+
+ up_read(&ca->set->gc_lock);
+ schedule();
+ try_to_freeze();
+ down_read(&ca->set->gc_lock);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ return ret;
+}
+
+static void verify_not_on_freelist(struct cache *ca, size_t bucket)
+{
+ if (expensive_debug_checks(ca->set)) {
+ size_t iter;
+ long i;
+ unsigned j;
+
+ for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
+ BUG_ON(ca->prio_buckets[iter] == bucket);
+
+ for (j = 0; j < RESERVE_NR; j++)
+ fifo_for_each_entry(i, &ca->free[j], iter)
+ BUG_ON(i == bucket);
+ fifo_for_each_entry(i, &ca->free_inc, iter)
+ BUG_ON(i == bucket);
+ }
+}
+
+/* Bucket heap / gen */
+
+void bch_recalc_min_prio(struct cache *ca, int rw)
+{
+ struct cache_set *c = ca->set;
+ struct prio_clock *clock = &c->prio_clock[rw];
+ struct bucket *g;
+ u16 max_delta = 1;
+ unsigned i;
+
+ /* Determine min prio for this particular cache */
+ for_each_bucket(g, ca)
+ max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
+
+ ca->min_prio[rw] = clock->hand - max_delta;
+
+ /*
+ * This may possibly increase the min prio for the whole cache, check
+ * that as well.
+ */
+ max_delta = 1;
+
+ for_each_cache(ca, c, i)
+ max_delta = max(max_delta,
+ (u16) (clock->hand - ca->min_prio[rw]));
+
+ clock->min_prio = clock->hand - max_delta;
+}
+
+static void bch_rescale_prios(struct cache_set *c, int rw)
+{
+ struct prio_clock *clock = &c->prio_clock[rw];
+ struct cache *ca;
+ struct bucket *g;
+ unsigned i;
+
+ trace_bcache_rescale_prios(c);
+
+ for_each_cache(ca, c, i) {
+ for_each_bucket(g, ca)
+ g->prio[rw] = clock->hand -
+ (clock->hand - g->prio[rw]) / 2;
+
+ bch_recalc_min_prio(ca, rw);
+ }
+}
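+
+/*
+ * Illustrative example of the rescale above (hypothetical values): with
+ * clock->hand = 1000 and a bucket prio of 200, the bucket is 800 units old;
+ * after rescaling its prio becomes 1000 - 800 / 2 = 600, i.e. every bucket's
+ * age is halved, making room for the hand to keep advancing in a u16.
+ */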
+
+static void bch_inc_clock_hand(struct io_timer *timer)
+{
+ struct prio_clock *clock = container_of(timer,
+ struct prio_clock, rescale);
+ struct cache_set *c = container_of(clock,
+ struct cache_set, prio_clock[clock->rw]);
+ u64 capacity;
+
+ mutex_lock(&c->bucket_lock);
+
+ clock->hand++;
+
+ /* if clock cannot be advanced more, rescale prio */
+ if (clock->hand == (u16) (clock->min_prio - 1))
+ bch_rescale_prios(c, clock->rw);
+
+ mutex_unlock(&c->bucket_lock);
+
+ capacity = READ_ONCE(c->capacity);
+
+ if (!capacity)
+ return;
+
+ /*
+ * we only advance the clock hand once per 0.1% of the cache_set's
+ * capacity being read or written to - that's what the expire increment
+ * below implements
+ *
+ * XXX: we shouldn't really be going off of the capacity of devices in
+ * RW mode (that will be 0 when we're RO, yet we can still service
+ * reads)
+ */
+ timer->expire += capacity >> 10;
+
+ bch_io_timer_add(&c->io_clock[clock->rw], timer);
+}
+
+static void bch_prio_timer_init(struct cache_set *c, int rw)
+{
+ struct prio_clock *clock = &c->prio_clock[rw];
+ struct io_timer *timer = &clock->rescale;
+
+ clock->rw = rw;
+ timer->fn = bch_inc_clock_hand;
+ timer->expire = c->capacity >> 10;
+}
+
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
+
+static inline bool can_inc_bucket_gen(struct cache *ca, struct bucket *g)
+{
+ return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX;
+}
+
+static bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *g)
+{
+ if (!is_available_bucket(READ_ONCE(g->mark)))
+ return false;
+
+ if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX - 1)
+ ca->inc_gen_needs_gc++;
+
+ return can_inc_bucket_gen(ca, g);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *g)
+{
+ spin_lock(&ca->freelist_lock);
+
+ bch_invalidate_bucket(ca, g);
+
+ g->read_prio = ca->set->prio_clock[READ].hand;
+ g->write_prio = ca->set->prio_clock[WRITE].hand;
+
+ verify_not_on_freelist(ca, g - ca->buckets);
+ BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
+
+ spin_unlock(&ca->freelist_lock);
+}
+
+/*
+ * Determines what order we're going to reuse buckets, smallest
+ * bucket_sort_key() first.
+ *
+ * - We take into account the read prio of the bucket, which gives us an
+ * indication of how hot the data is -- we scale the prio so that the prio
+ * farthest from the clock is worth 1/8th of the closest.
+ *
+ * - The number of sectors of cached data in the bucket, which gives us an
+ * indication of the cost in cache misses this eviction will cause.
+ *
+ * - The difference between the bucket's current gen and oldest gen of any
+ * pointer into it, which gives us an indication of the cost of an eventual
+ * btree GC to rewrite nodes with stale pointers.
+ */
+
+#define bucket_sort_key(g) \
+({ \
+ unsigned long prio = g->read_prio - ca->min_prio[READ]; \
+ prio = (prio * 7) / (ca->set->prio_clock[READ].hand - \
+ ca->min_prio[READ]); \
+ \
+ (((prio + 1) * bucket_sectors_used(g)) << 8) | bucket_gc_gen(ca, g);\
+})
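+
+/*
+ * Worked example with hypothetical numbers: if the read clock hand is 56 ahead
+ * of ca->min_prio[READ] and the bucket's read_prio is 28 above min_prio, the
+ * scaled prio is 28 * 7 / 56 = 3; with 100 cached sectors and a gc gen of 5,
+ * bucket_sort_key() = ((3 + 1) * 100) << 8 | 5 = 102405. Smaller keys - colder,
+ * emptier, cheaper-to-GC buckets - are reused first.
+ */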
+
+static void invalidate_buckets_lru(struct cache *ca)
+{
+ struct bucket_heap_entry e;
+ struct bucket *g;
+ unsigned i;
+
+ mutex_lock(&ca->heap_lock);
+
+ ca->heap.used = 0;
+
+ mutex_lock(&ca->set->bucket_lock);
+ bch_recalc_min_prio(ca, READ);
+ bch_recalc_min_prio(ca, WRITE);
+
+ /*
+ * Find buckets with lowest read priority, by building a maxheap sorted
+ * by read priority and repeatedly replacing the maximum element until
+ * all buckets have been visited.
+ */
+ for_each_bucket(g, ca) {
+ if (!bch_can_invalidate_bucket(ca, g))
+ continue;
+
+ bucket_heap_push(ca, g, bucket_sort_key(g));
+ }
+
+ /* Sort buckets by physical location on disk for better locality */
+ for (i = 0; i < ca->heap.used; i++) {
+ struct bucket_heap_entry *e = &ca->heap.data[i];
+
+ e->val = e->g - ca->buckets;
+ }
+
+ heap_resort(&ca->heap, bucket_max_cmp);
+
+ /*
+ * If we run out of buckets to invalidate, bch_allocator_thread() will
+ * kick stuff and retry us
+ */
+ while (!fifo_full(&ca->free_inc) &&
+ heap_pop(&ca->heap, e, bucket_max_cmp)) {
+ BUG_ON(!bch_can_invalidate_bucket(ca, e.g));
+ bch_invalidate_one_bucket(ca, e.g);
+ }
+
+ mutex_unlock(&ca->set->bucket_lock);
+ mutex_unlock(&ca->heap_lock);
+}
+
+static void invalidate_buckets_fifo(struct cache *ca)
+{
+ struct bucket *g;
+ size_t checked = 0;
+
+ while (!fifo_full(&ca->free_inc)) {
+ if (ca->fifo_last_bucket < ca->mi.first_bucket ||
+ ca->fifo_last_bucket >= ca->mi.nbuckets)
+ ca->fifo_last_bucket = ca->mi.first_bucket;
+
+ g = ca->buckets + ca->fifo_last_bucket++;
+
+ if (bch_can_invalidate_bucket(ca, g))
+ bch_invalidate_one_bucket(ca, g);
+
+ if (++checked >= ca->mi.nbuckets)
+ return;
+ }
+}
+
+static void invalidate_buckets_random(struct cache *ca)
+{
+ struct bucket *g;
+ size_t checked = 0;
+
+ while (!fifo_full(&ca->free_inc)) {
+ size_t n = bch_rand_range(ca->mi.nbuckets -
+ ca->mi.first_bucket) +
+ ca->mi.first_bucket;
+
+ g = ca->buckets + n;
+
+ if (bch_can_invalidate_bucket(ca, g))
+ bch_invalidate_one_bucket(ca, g);
+
+ if (++checked >= ca->mi.nbuckets / 2)
+ return;
+ }
+}
+
+static void invalidate_buckets(struct cache *ca)
+{
+ ca->inc_gen_needs_gc = 0;
+
+ switch (ca->mi.replacement) {
+ case CACHE_REPLACEMENT_LRU:
+ invalidate_buckets_lru(ca);
+ break;
+ case CACHE_REPLACEMENT_FIFO:
+ invalidate_buckets_fifo(ca);
+ break;
+ case CACHE_REPLACEMENT_RANDOM:
+ invalidate_buckets_random(ca);
+ break;
+ }
+}
+
+static bool __bch_allocator_push(struct cache *ca, long bucket)
+{
+ if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
+ goto success;
+
+ if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket))
+ goto success;
+
+ if (fifo_push(&ca->free[RESERVE_BTREE], bucket))
+ goto success;
+
+ if (fifo_push(&ca->free[RESERVE_NONE], bucket))
+ goto success;
+
+ return false;
+success:
+ closure_wake_up(&ca->set->freelist_wait);
+ return true;
+}
+
+static bool bch_allocator_push(struct cache *ca, long bucket)
+{
+ bool ret;
+
+ spin_lock(&ca->freelist_lock);
+ ret = __bch_allocator_push(ca, bucket);
+ if (ret)
+ fifo_pop(&ca->free_inc, bucket);
+ spin_unlock(&ca->freelist_lock);
+
+ return ret;
+}
+
+static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca)
+{
+ u16 last_seq_ondisk = c->journal.last_seq_ondisk;
+ struct bucket *g;
+
+ for_each_bucket(g, ca) {
+ struct bucket_mark m = READ_ONCE(g->mark);
+
+ if (is_available_bucket(m) &&
+ !m.cached_sectors &&
+ !m.had_metadata &&
+ (!m.wait_on_journal ||
+ ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) {
+ spin_lock(&ca->freelist_lock);
+
+ bch_mark_alloc_bucket(ca, g, true);
+ g->read_prio = ca->set->prio_clock[READ].hand;
+ g->write_prio = ca->set->prio_clock[WRITE].hand;
+
+ verify_not_on_freelist(ca, g - ca->buckets);
+ BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
+
+ spin_unlock(&ca->freelist_lock);
+
+ if (fifo_full(&ca->free_inc))
+ break;
+ }
+ }
+}
+
+/**
+ * bch_allocator_thread - move buckets from free_inc to reserves
+ *
+ * The free_inc FIFO is populated by invalidate_buckets(), and
+ * the reserves are depleted by bucket allocation. When we run out
+ * of free_inc, try to invalidate some buckets and write out
+ * prios and gens.
+ */
+static int bch_allocator_thread(void *arg)
+{
+ struct cache *ca = arg;
+ struct cache_set *c = ca->set;
+ int ret;
+
+ set_freezable();
+
+ while (1) {
+ /*
+ * First, we pull buckets off of the free_inc list, possibly
+ * issue discards to them, then we add the bucket to a
+ * free list:
+ */
+
+ while (!fifo_empty(&ca->free_inc)) {
+ long bucket = fifo_peek(&ca->free_inc);
+
+ /*
+ * Don't remove from free_inc until after it's added
+ * to freelist, so gc doesn't miss it while we've
+ * dropped bucket lock
+ */
+
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca, bucket),
+ ca->mi.bucket_size, GFP_NOIO, 0);
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (bch_allocator_push(ca, bucket))
+ break;
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ goto out;
+ }
+ schedule();
+ try_to_freeze();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ }
+
+ down_read(&c->gc_lock);
+
+ /*
+ * See if we have buckets we can reuse without invalidating them
+ * or forcing a journal commit:
+ */
+ bch_find_empty_buckets(c, ca);
+
+ if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) {
+ up_read(&c->gc_lock);
+ continue;
+ }
+
+ /* We've run out of free buckets! */
+
+ while (!fifo_full(&ca->free_inc)) {
+ if (wait_buckets_available(ca)) {
+ up_read(&c->gc_lock);
+ goto out;
+ }
+
+ /*
+ * Find some buckets that we can invalidate, either
+ * they're completely unused, or only contain clean data
+ * that's been written back to the backing device or
+ * another cache tier
+ */
+
+ invalidate_buckets(ca);
+ trace_bcache_alloc_batch(ca, fifo_used(&ca->free_inc),
+ ca->free_inc.size);
+ }
+
+ up_read(&c->gc_lock);
+
+ /*
+ * free_inc is full of newly-invalidated buckets, must write out
+ * prios and gens before they can be re-used
+ */
+ ret = bch_prio_write(ca);
+ if (ret) {
+ /*
+ * Emergency read only - allocator thread has to
+ * shutdown.
+ *
+ * N.B. we better be going into RO mode, else
+ * allocations would hang indefinitely - whatever
+ * generated the error will have sent us into RO mode.
+ *
+ * Clear out the free_inc freelist so things are
+ * consistent-ish:
+ */
+ spin_lock(&ca->freelist_lock);
+ while (!fifo_empty(&ca->free_inc)) {
+ long bucket;
+
+ fifo_pop(&ca->free_inc, bucket);
+ bch_mark_free_bucket(ca, ca->buckets + bucket);
+ }
+ spin_unlock(&ca->freelist_lock);
+ goto out;
+ }
+ }
+out:
+ /*
+ * Avoid a race with bucket_stats_update() trying to wake us up after
+ * we've exited:
+ */
+ synchronize_rcu();
+ return 0;
+}
+
+/* Allocation */
+
+/**
+ * bch_bucket_alloc - allocate a single bucket from a specific device
+ *
+ * Returns index of bucket on success, 0 on failure
+ */
+static size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
+{
+ struct bucket *g;
+ long r;
+
+ spin_lock(&ca->freelist_lock);
+ if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
+ fifo_pop(&ca->free[reserve], r))
+ goto out;
+
+ spin_unlock(&ca->freelist_lock);
+
+ trace_bcache_bucket_alloc_fail(ca, reserve);
+ return 0;
+out:
+ verify_not_on_freelist(ca, r);
+ spin_unlock(&ca->freelist_lock);
+
+ trace_bcache_bucket_alloc(ca, reserve);
+
+ bch_wake_allocator(ca);
+
+ g = ca->buckets + r;
+
+ g->read_prio = ca->set->prio_clock[READ].hand;
+ g->write_prio = ca->set->prio_clock[WRITE].hand;
+
+ return r;
+}
+
+static void __bch_bucket_free(struct cache *ca, struct bucket *g)
+{
+ bch_mark_free_bucket(ca, g);
+
+ g->read_prio = ca->set->prio_clock[READ].hand;
+ g->write_prio = ca->set->prio_clock[WRITE].hand;
+}
+
+enum bucket_alloc_ret {
+ ALLOC_SUCCESS,
+ NO_DEVICES, /* -EROFS */
+ FREELIST_EMPTY, /* Allocator thread not keeping up */
+};
+
+static void recalc_alloc_group_weights(struct cache_set *c,
+ struct cache_group *devs)
+{
+ struct cache *ca;
+ u64 available_buckets = 1; /* avoid a divide by zero... */
+ unsigned i;
+
+ for (i = 0; i < devs->nr_devices; i++) {
+ ca = devs->d[i].dev;
+
+ devs->d[i].weight = buckets_free_cache(ca);
+ available_buckets += devs->d[i].weight;
+ }
+
+ for (i = 0; i < devs->nr_devices; i++) {
+ const unsigned min_weight = U32_MAX >> 4;
+ const unsigned max_weight = U32_MAX;
+
+ devs->d[i].weight =
+ min_weight +
+ div64_u64(devs->d[i].weight *
+ devs->nr_devices *
+ (max_weight - min_weight),
+ available_buckets);
+ devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
+ }
+}
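+
+/*
+ * Net effect of the weighting (sketch, hypothetical numbers): with two devices
+ * holding 300 and 100 free buckets, the first gets roughly three times the
+ * headroom above min_weight, so the "get_random_int() > weight" check in
+ * bch_bucket_alloc_group() skips it less often and allocation is biased
+ * towards the device with more free space.
+ */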
+
+static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
+ struct open_bucket *ob,
+ enum alloc_reserve reserve,
+ unsigned nr_replicas,
+ struct cache_group *devs,
+ long *caches_used)
+{
+ enum bucket_alloc_ret ret;
+ unsigned fail_idx = -1, i;
+ unsigned available = 0;
+
+ BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs));
+
+ if (ob->nr_ptrs >= nr_replicas)
+ return ALLOC_SUCCESS;
+
+ rcu_read_lock();
+ spin_lock(&devs->lock);
+
+ for (i = 0; i < devs->nr_devices; i++)
+ available += !test_bit(devs->d[i].dev->sb.nr_this_dev,
+ caches_used);
+
+ recalc_alloc_group_weights(c, devs);
+
+ i = devs->cur_device;
+
+ while (ob->nr_ptrs < nr_replicas) {
+ struct cache *ca;
+ u64 bucket;
+
+ if (!available) {
+ ret = NO_DEVICES;
+ goto err;
+ }
+
+ i++;
+ i %= devs->nr_devices;
+
+ ret = FREELIST_EMPTY;
+ if (i == fail_idx)
+ goto err;
+
+ ca = devs->d[i].dev;
+
+ if (test_bit(ca->sb.nr_this_dev, caches_used))
+ continue;
+
+ if (fail_idx == -1 &&
+ get_random_int() > devs->d[i].weight)
+ continue;
+
+ bucket = bch_bucket_alloc(ca, reserve);
+ if (!bucket) {
+ if (fail_idx == -1)
+ fail_idx = i;
+ continue;
+ }
+
+ /*
+ * open_bucket_add_buckets expects new pointers at the head of
+ * the list:
+ */
+ memmove(&ob->ptrs[1],
+ &ob->ptrs[0],
+ ob->nr_ptrs * sizeof(ob->ptrs[0]));
+ memmove(&ob->ptr_offset[1],
+ &ob->ptr_offset[0],
+ ob->nr_ptrs * sizeof(ob->ptr_offset[0]));
+ ob->nr_ptrs++;
+ ob->ptrs[0] = (struct bch_extent_ptr) {
+ .gen = ca->buckets[bucket].mark.gen,
+ .offset = bucket_to_sector(ca, bucket),
+ .dev = ca->sb.nr_this_dev,
+ };
+ ob->ptr_offset[0] = 0;
+
+ __set_bit(ca->sb.nr_this_dev, caches_used);
+ available--;
+ devs->cur_device = i;
+ }
+
+ ret = ALLOC_SUCCESS;
+err:
+ EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
+ spin_unlock(&devs->lock);
+ rcu_read_unlock();
+ return ret;
+}
+
+static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
+ struct write_point *wp,
+ struct open_bucket *ob,
+ unsigned nr_replicas,
+ enum alloc_reserve reserve,
+ long *caches_used)
+{
+ /*
+ * this should implement policy - for a given type of allocation, decide
+ * which devices to allocate from:
+ *
+ * XXX: switch off wp->type and do something more intelligent here
+ */
+
+ /* foreground writes: prefer tier 0: */
+ if (wp->group == &c->cache_all)
+ bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
+ &c->cache_tiers[0], caches_used);
+
+ return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
+ wp->group, caches_used);
+}
+
+static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp,
+ struct open_bucket *ob, unsigned nr_replicas,
+ enum alloc_reserve reserve, long *caches_used,
+ struct closure *cl)
+{
+ bool waiting = false;
+
+ while (1) {
+ switch (__bch_bucket_alloc_set(c, wp, ob, nr_replicas,
+ reserve, caches_used)) {
+ case ALLOC_SUCCESS:
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+
+ return 0;
+
+ case NO_DEVICES:
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+ return -EROFS;
+
+ case FREELIST_EMPTY:
+ if (!cl || waiting)
+ trace_bcache_freelist_empty_fail(c,
+ reserve, cl);
+
+ if (!cl)
+ return -ENOSPC;
+
+ if (waiting)
+ return -EAGAIN;
+
+ /* Retry allocation after adding ourself to waitlist: */
+ closure_wait(&c->freelist_wait, cl);
+ waiting = true;
+ break;
+ default:
+ BUG();
+ }
+ }
+}
+
+/* Open buckets: */
+
+/*
+ * Open buckets represent one or more buckets (on multiple devices) that are
+ * currently being allocated from. They serve two purposes:
+ *
+ * - They track buckets that have been partially allocated, allowing for
+ * sub-bucket sized allocations - they're used by the sector allocator below
+ *
+ * - They provide a reference to the buckets they own that mark and sweep GC
+ * can find, until the new allocation has a pointer to it inserted into the
+ * btree
+ *
+ * When allocating some space with the sector allocator, the allocation comes
+ * with a reference to an open bucket - the caller is required to put that
+ * reference _after_ doing the index update that makes its allocation reachable.
+ */
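+
+/*
+ * Sketch of the expected calling sequence (see bch_alloc_sectors() below for
+ * the real thing; error handling omitted):
+ *
+ *	ob = bch_alloc_sectors_start(c, wp, nr_replicas, reserve, cl);
+ *	bch_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, sectors);
+ *	bch_alloc_sectors_done(c, wp, ob);
+ *	... submit the write, insert @e into the btree ...
+ *	bch_open_bucket_put(c, ob);
+ */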
+
+static void __bch_open_bucket_put(struct cache_set *c, struct open_bucket *ob)
+{
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+
+ lockdep_assert_held(&c->open_buckets_lock);
+
+ rcu_read_lock();
+ open_bucket_for_each_online_device(c, ob, ptr, ca)
+ bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false);
+ rcu_read_unlock();
+
+ ob->nr_ptrs = 0;
+
+ list_move(&ob->list, &c->open_buckets_free);
+ c->open_buckets_nr_free++;
+ closure_wake_up(&c->open_buckets_wait);
+}
+
+void bch_open_bucket_put(struct cache_set *c, struct open_bucket *b)
+{
+ if (atomic_dec_and_test(&b->pin)) {
+ spin_lock(&c->open_buckets_lock);
+ __bch_open_bucket_put(c, b);
+ spin_unlock(&c->open_buckets_lock);
+ }
+}
+
+static struct open_bucket *bch_open_bucket_get(struct cache_set *c,
+ unsigned nr_reserved,
+ struct closure *cl)
+{
+ struct open_bucket *ret;
+
+ spin_lock(&c->open_buckets_lock);
+
+ if (c->open_buckets_nr_free > nr_reserved) {
+ BUG_ON(list_empty(&c->open_buckets_free));
+ ret = list_first_entry(&c->open_buckets_free,
+ struct open_bucket, list);
+ list_move(&ret->list, &c->open_buckets_open);
+ BUG_ON(ret->nr_ptrs);
+
+ atomic_set(&ret->pin, 1); /* XXX */
+ ret->has_full_ptrs = false;
+
+ c->open_buckets_nr_free--;
+ trace_bcache_open_bucket_alloc(c, cl);
+ } else {
+ trace_bcache_open_bucket_alloc_fail(c, cl);
+
+ if (cl) {
+ closure_wait(&c->open_buckets_wait, cl);
+ ret = ERR_PTR(-EAGAIN);
+ } else
+ ret = ERR_PTR(-ENOSPC);
+ }
+
+ spin_unlock(&c->open_buckets_lock);
+
+ return ret;
+}
+
+static unsigned ob_ptr_sectors_free(struct open_bucket *ob,
+ struct cache_member_rcu *mi,
+ struct bch_extent_ptr *ptr)
+{
+ unsigned i = ptr - ob->ptrs;
+ unsigned bucket_size = mi->m[ptr->dev].bucket_size;
+ unsigned used = (ptr->offset & (bucket_size - 1)) +
+ ob->ptr_offset[i];
+
+ BUG_ON(used > bucket_size);
+
+ return bucket_size - used;
+}
+
+static unsigned open_bucket_sectors_free(struct cache_set *c,
+ struct open_bucket *ob,
+ unsigned nr_replicas)
+{
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ unsigned i, sectors_free = UINT_MAX;
+
+ BUG_ON(nr_replicas > ob->nr_ptrs);
+
+ for (i = 0; i < nr_replicas; i++)
+ sectors_free = min(sectors_free,
+ ob_ptr_sectors_free(ob, mi, &ob->ptrs[i]));
+
+ cache_member_info_put();
+
+ return sectors_free != UINT_MAX ? sectors_free : 0;
+}
+
+static void open_bucket_copy_unused_ptrs(struct cache_set *c,
+ struct open_bucket *new,
+ struct open_bucket *old)
+{
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ unsigned i;
+
+ for (i = 0; i < old->nr_ptrs; i++)
+ if (ob_ptr_sectors_free(old, mi, &old->ptrs[i])) {
+ struct bch_extent_ptr tmp = old->ptrs[i];
+
+ tmp.offset += old->ptr_offset[i];
+ new->ptrs[new->nr_ptrs] = tmp;
+ new->ptr_offset[new->nr_ptrs] = 0;
+ new->nr_ptrs++;
+ }
+ cache_member_info_put();
+}
+
+static void verify_not_stale(struct cache_set *c, const struct open_bucket *ob)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+
+ rcu_read_lock();
+ open_bucket_for_each_online_device(c, ob, ptr, ca)
+ BUG_ON(ptr_stale(ca, ptr));
+ rcu_read_unlock();
+#endif
+}
+
+/* Sector allocator */
+
+static struct open_bucket *lock_writepoint(struct cache_set *c,
+ struct write_point *wp)
+{
+ struct open_bucket *ob;
+
+ while ((ob = ACCESS_ONCE(wp->b))) {
+ mutex_lock(&ob->lock);
+ if (wp->b == ob)
+ break;
+
+ mutex_unlock(&ob->lock);
+ }
+
+ return ob;
+}
+
+static int open_bucket_add_buckets(struct cache_set *c,
+ struct write_point *wp,
+ struct open_bucket *ob,
+ unsigned nr_replicas,
+ enum alloc_reserve reserve,
+ struct closure *cl)
+{
+ long caches_used[BITS_TO_LONGS(MAX_CACHES_PER_SET)];
+ int i, dst;
+
+ /*
+ * We might be allocating pointers to add to an existing extent
+ * (tiering/copygc/migration) - if so, some of the pointers in our
+ * existing open bucket might duplicate devices we already have. This is
+ * moderately annoying.
+ */
+
+ /* Short circuit all the fun stuff if possible: */
+ if (ob->nr_ptrs >= nr_replicas)
+ return 0;
+
+ memset(caches_used, 0, sizeof(caches_used));
+
+ /*
+ * Shuffle pointers to devices we already have to the end:
+ * bch_bucket_alloc_set() will add new pointers to the start of @ob, and
+ * bch_alloc_sectors_append_ptrs() will add the first nr_replicas ptrs to @e:
+ */
+ for (i = dst = ob->nr_ptrs - 1; i >= 0; --i)
+ if (__test_and_set_bit(ob->ptrs[i].dev, caches_used)) {
+ if (i != dst) {
+ swap(ob->ptrs[i], ob->ptrs[dst]);
+ swap(ob->ptr_offset[i], ob->ptr_offset[dst]);
+ }
+ --dst;
+ nr_replicas++;
+ }
+
+ return bch_bucket_alloc_set(c, wp, ob, nr_replicas,
+ reserve, caches_used, cl);
+}
+
+/*
+ * Get us an open_bucket we can allocate from, return with it locked:
+ */
+struct open_bucket *bch_alloc_sectors_start(struct cache_set *c,
+ struct write_point *wp,
+ unsigned nr_replicas,
+ enum alloc_reserve reserve,
+ struct closure *cl)
+{
+ struct open_bucket *ob;
+ unsigned open_buckets_reserved = wp == &c->btree_write_point
+ ? 0 : BTREE_NODE_RESERVE;
+ int ret;
+
+ BUG_ON(!wp->group);
+ BUG_ON(!reserve);
+ BUG_ON(!nr_replicas);
+retry:
+ ob = lock_writepoint(c, wp);
+
+ /*
+ * If ob->sectors_free == 0, one or more of the buckets ob points to is
+ * full. We can't drop pointers from an open bucket - garbage collection
+ * still needs to find them; instead, we must allocate a new open bucket
+ * and copy any pointers to non-full buckets into the new open bucket.
+ */
+ if (!ob || ob->has_full_ptrs) {
+ struct open_bucket *new_ob;
+
+ new_ob = bch_open_bucket_get(c, open_buckets_reserved, cl);
+ if (IS_ERR(new_ob))
+ return new_ob;
+
+ mutex_lock(&new_ob->lock);
+
+ /*
+ * We point the write point at the open_bucket before doing the
+ * allocation to avoid a race with shutdown:
+ */
+ if (race_fault() ||
+ cmpxchg(&wp->b, ob, new_ob) != ob) {
+ /* We raced: */
+ mutex_unlock(&new_ob->lock);
+ bch_open_bucket_put(c, new_ob);
+
+ if (ob)
+ mutex_unlock(&ob->lock);
+ goto retry;
+ }
+
+ if (ob) {
+ open_bucket_copy_unused_ptrs(c, new_ob, ob);
+ mutex_unlock(&ob->lock);
+ bch_open_bucket_put(c, ob);
+ }
+
+ ob = new_ob;
+ }
+
+ ret = open_bucket_add_buckets(c, wp, ob, nr_replicas,
+ reserve, cl);
+ if (ret) {
+ mutex_unlock(&ob->lock);
+ return ERR_PTR(ret);
+ }
+
+ ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas);
+
+ BUG_ON(!ob->sectors_free);
+ verify_not_stale(c, ob);
+
+ return ob;
+}
+
+/*
+ * Append pointers to the space we just allocated to @e, and mark @sectors space
+ * as allocated out of @ob
+ */
+void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
+ unsigned nr_replicas, struct open_bucket *ob,
+ unsigned sectors)
+{
+ struct bch_extent_ptr tmp, *ptr;
+ struct cache *ca;
+ bool has_data = false;
+ unsigned i;
+
+ /*
+ * We're keeping any existing pointer k has, and appending new pointers:
+ * __bch_write() will only write to the pointers we add here:
+ */
+
+ /*
+ * XXX: don't add pointers to devices @e already has
+ */
+ BUG_ON(nr_replicas > ob->nr_ptrs);
+ BUG_ON(sectors > ob->sectors_free);
+
+ /* didn't use all the ptrs: */
+ if (nr_replicas < ob->nr_ptrs)
+ has_data = true;
+
+ for (i = 0; i < nr_replicas; i++) {
+ EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
+
+ tmp = ob->ptrs[i];
+ tmp.offset += ob->ptr_offset[i];
+ extent_ptr_append(e, tmp);
+
+ ob->ptr_offset[i] += sectors;
+ }
+
+ open_bucket_for_each_online_device(c, ob, ptr, ca)
+ this_cpu_add(*ca->sectors_written, sectors);
+}
+
+/*
+ * Finished allocating from @ob: if it still has free space, take a new
+ * reference for the caller and leave it on the write point; otherwise detach
+ * it from the write point (that reference passes to the caller). Unlocks @ob.
+ */
+void bch_alloc_sectors_done(struct cache_set *c, struct write_point *wp,
+ struct open_bucket *ob)
+{
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ bool has_data = false;
+ unsigned i;
+
+ for (i = 0; i < ob->nr_ptrs; i++) {
+ if (!ob_ptr_sectors_free(ob, mi, &ob->ptrs[i]))
+ ob->has_full_ptrs = true;
+ else
+ has_data = true;
+ }
+
+ cache_member_info_put();
+
+ if (likely(has_data))
+ atomic_inc(&ob->pin);
+ else
+ BUG_ON(xchg(&wp->b, NULL) != ob);
+
+ mutex_unlock(&ob->lock);
+}
+
+/*
+ * Allocates some space in the cache to write to, appending pointers to the
+ * newly allocated space to @e and updating e->k.size and its offset (to point
+ * to the end of the newly allocated space).
+ *
+ * May allocate fewer sectors than requested; e->k.size indicates how many
+ * sectors were actually allocated.
+ *
+ * Return codes:
+ * - -EAGAIN: closure was added to waitlist
+ * - -ENOSPC: out of space and no closure provided
+ *
+ * @c - cache set.
+ * @wp - write point to use for allocating sectors.
+ * @e - extent key to return the allocated space information.
+ * @cl - closure to wait for a bucket
+ */
+struct open_bucket *bch_alloc_sectors(struct cache_set *c,
+ struct write_point *wp,
+ struct bkey_i_extent *e,
+ unsigned nr_replicas,
+ enum alloc_reserve reserve,
+ struct closure *cl)
+{
+ struct open_bucket *ob;
+
+ ob = bch_alloc_sectors_start(c, wp, nr_replicas, reserve, cl);
+ if (IS_ERR_OR_NULL(ob))
+ return ob;
+
+ if (e->k.size > ob->sectors_free)
+ bch_key_resize(&e->k, ob->sectors_free);
+
+ bch_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size);
+
+ bch_alloc_sectors_done(c, wp, ob);
+
+ return ob;
+}
+
+/* Startup/shutdown (ro/rw): */
+
+static void bch_recalc_capacity(struct cache_set *c)
+{
+ struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers);
+ struct cache *ca;
+ u64 total_capacity, capacity = 0, reserved_sectors = 0;
+ unsigned long ra_pages = 0;
+ unsigned i, j;
+
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i) {
+ struct backing_dev_info *bdi =
+ blk_get_backing_dev_info(ca->disk_sb.bdev);
+
+ ra_pages += bdi->ra_pages;
+ }
+
+ c->bdi.ra_pages = ra_pages;
+
+ /*
+ * Capacity of the cache set is the capacity of all the devices in the
+ * slowest (highest) tier - we don't include lower tier devices.
+ */
+ for (tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers) - 1;
+ tier > c->cache_tiers && !tier->nr_devices;
+ --tier)
+ ;
+
+ group_for_each_cache_rcu(ca, tier, i) {
+ size_t reserve = 0;
+
+ /*
+ * We need to reserve buckets (from the number
+ * of currently available buckets) against
+ * foreground writes so that mainly copygc can
+ * make forward progress.
+ *
+ * We need enough to refill the various reserves
+ * from scratch - copygc will use its entire
+ * reserve all at once, then run again when
+ * its reserve is refilled (from the formerly
+ * available buckets).
+ *
+ * This reserve is just used when considering if
+ * allocations for foreground writes must wait -
+ * not -ENOSPC calculations.
+ */
+ for (j = 0; j < RESERVE_NONE; j++)
+ reserve += ca->free[j].size;
+
+ reserve += ca->free_inc.size;
+
+ reserve += ARRAY_SIZE(c->write_points);
+
+ if (ca->mi.tier)
+ reserve += 1; /* tiering write point */
+ reserve += 1; /* btree write point */
+
+ reserved_sectors += reserve << ca->bucket_bits;
+
+ capacity += (ca->mi.nbuckets -
+ ca->mi.first_bucket) <<
+ ca->bucket_bits;
+ }
+ rcu_read_unlock();
+
+ total_capacity = capacity;
+
+ capacity *= (100 - c->opts.gc_reserve_percent);
+ capacity = div64_u64(capacity, 100);
+
+ BUG_ON(capacity + reserved_sectors > total_capacity);
+
+ c->capacity = capacity;
+
+ if (c->capacity) {
+ bch_io_timer_add(&c->io_clock[READ],
+ &c->prio_clock[READ].rescale);
+ bch_io_timer_add(&c->io_clock[WRITE],
+ &c->prio_clock[WRITE].rescale);
+ } else {
+ bch_io_timer_del(&c->io_clock[READ],
+ &c->prio_clock[READ].rescale);
+ bch_io_timer_del(&c->io_clock[WRITE],
+ &c->prio_clock[WRITE].rescale);
+ }
+
+ /* Wake up in case someone was waiting for buckets */
+ closure_wake_up(&c->freelist_wait);
+}
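+
+/*
+ * E.g. (hypothetical numbers): with 1TB of capacity in the slowest tier and
+ * c->opts.gc_reserve_percent = 20, c->capacity comes out to 800GB; the
+ * remaining 20% is held back, on top of the per-device reserves counted above.
+ */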
+
+static void bch_stop_write_point(struct cache *ca,
+ struct write_point *wp)
+{
+ struct cache_set *c = ca->set;
+ struct open_bucket *ob;
+ struct bch_extent_ptr *ptr;
+
+ ob = lock_writepoint(c, wp);
+ if (!ob)
+ return;
+
+ for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
+ if (ptr->dev == ca->sb.nr_this_dev)
+ goto found;
+
+ mutex_unlock(&ob->lock);
+ return;
+found:
+ BUG_ON(xchg(&wp->b, NULL) != ob);
+ mutex_unlock(&ob->lock);
+
+ /* Drop writepoint's ref: */
+ bch_open_bucket_put(c, ob);
+}
+
+static bool bch_dev_has_open_write_point(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct bch_extent_ptr *ptr;
+ struct open_bucket *ob;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++)
+ if (atomic_read(&ob->pin)) {
+ mutex_lock(&ob->lock);
+ for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
+ if (ptr->dev == ca->sb.nr_this_dev) {
+ mutex_unlock(&ob->lock);
+ return true;
+ }
+ mutex_unlock(&ob->lock);
+ }
+
+ return false;
+}
+
+/* device goes ro: */
+void bch_cache_allocator_stop(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+ struct task_struct *p;
+ struct closure cl;
+ unsigned i;
+
+ closure_init_stack(&cl);
+
+ /* First, remove device from allocation groups: */
+
+ bch_cache_group_remove_cache(tier, ca);
+ bch_cache_group_remove_cache(&c->cache_all, ca);
+
+ bch_recalc_capacity(c);
+
+ /*
+ * Stopping the allocator thread comes after removing from allocation
+ * groups, else pending allocations will hang:
+ */
+
+ p = ca->alloc_thread;
+ ca->alloc_thread = NULL;
+ smp_wmb();
+
+ /*
+ * We need an rcu barrier between setting ca->alloc_thread = NULL and
+ * the thread shutting down to avoid a race with bucket_stats_update() -
+ * the allocator thread itself does a synchronize_rcu() on exit.
+ *
+ * XXX: it would be better to have the rcu barrier be asynchronous
+ * instead of blocking us here
+ */
+ if (p) {
+ kthread_stop(p);
+ put_task_struct(p);
+ }
+
+ /* Next, close write points that point to this device... */
+
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+ bch_stop_write_point(ca, &c->write_points[i]);
+
+ bch_stop_write_point(ca, &ca->copygc_write_point);
+ bch_stop_write_point(ca, &c->promote_write_point);
+ bch_stop_write_point(ca, &ca->tiering_write_point);
+ bch_stop_write_point(ca, &c->migration_write_point);
+ bch_stop_write_point(ca, &c->btree_write_point);
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+ while (c->btree_reserve_cache_nr) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ bch_open_bucket_put(c, a->ob);
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ /* Avoid deadlocks.. */
+
+ closure_wake_up(&c->freelist_wait);
+ wake_up(&c->journal.wait);
+
+ /* Now wait for any in flight writes: */
+
+ while (1) {
+ closure_wait(&c->open_buckets_wait, &cl);
+
+ if (!bch_dev_has_open_write_point(ca)) {
+ closure_wake_up(&c->open_buckets_wait);
+ break;
+ }
+
+ closure_sync(&cl);
+ }
+}
+
+/*
+ * Startup the allocator thread for transition to RW mode:
+ */
+int bch_cache_allocator_start(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+ struct task_struct *k;
+
+ /*
+ * allocator thread already started?
+ */
+ if (ca->alloc_thread)
+ return 0;
+
+ k = kthread_create(bch_allocator_thread, ca, "bcache_allocator");
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ get_task_struct(k);
+ ca->alloc_thread = k;
+
+ bch_cache_group_add_cache(tier, ca);
+ bch_cache_group_add_cache(&c->cache_all, ca);
+
+ bch_recalc_capacity(c);
+
+ /*
+ * Don't wake up allocator thread until after adding device to
+ * allocator groups - otherwise, alloc thread could get a spurious
+ * -EROFS due to prio_write() -> journal_meta() not finding any devices:
+ */
+ wake_up_process(k);
+ return 0;
+}
+
+void bch_open_buckets_init(struct cache_set *c)
+{
+ unsigned i;
+
+ INIT_LIST_HEAD(&c->open_buckets_open);
+ INIT_LIST_HEAD(&c->open_buckets_free);
+ spin_lock_init(&c->open_buckets_lock);
+ bch_prio_timer_init(c, READ);
+ bch_prio_timer_init(c, WRITE);
+
+ /* open bucket 0 is a sentinel NULL: */
+ mutex_init(&c->open_buckets[0].lock);
+ INIT_LIST_HEAD(&c->open_buckets[0].list);
+
+ for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) {
+ mutex_init(&c->open_buckets[i].lock);
+ c->open_buckets_nr_free++;
+ list_add(&c->open_buckets[i].list, &c->open_buckets_free);
+ }
+
+ spin_lock_init(&c->cache_all.lock);
+
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
+ c->write_points[i].throttle = true;
+ c->write_points[i].group = &c->cache_tiers[0];
+ }
+
+ for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
+ spin_lock_init(&c->cache_tiers[i].lock);
+
+ c->promote_write_point.group = &c->cache_tiers[0];
+
+ c->migration_write_point.group = &c->cache_all;
+
+ c->btree_write_point.group = &c->cache_all;
+
+ c->pd_controllers_update_seconds = 5;
+ INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
+
+ spin_lock_init(&c->foreground_write_pd_lock);
+ bch_pd_controller_init(&c->foreground_write_pd);
+ /*
+ * We do not want the write rate to have an effect on the computed
+ * rate: we do not call bch_ratelimit_delay() at all if the write rate
+ * exceeds 1GB/s, in which case the PD controller would think we are
+ * not "keeping up" and would not change the rate.
+ */
+ c->foreground_write_pd.backpressure = 0;
+ init_timer(&c->foreground_write_wakeup);
+
+ c->foreground_write_wakeup.data = (unsigned long) c;
+ c->foreground_write_wakeup.function = bch_wake_delayed_writes;
+}
diff --git a/libbcache/alloc.h b/libbcache/alloc.h
new file mode 100644
index 0000000..ac83e4f
--- /dev/null
+++ b/libbcache/alloc.h
@@ -0,0 +1,110 @@
+#ifndef _BCACHE_ALLOC_H
+#define _BCACHE_ALLOC_H
+
+#include "alloc_types.h"
+
+struct bkey;
+struct bucket;
+struct cache;
+struct cache_set;
+struct cache_group;
+
+static inline size_t prios_per_bucket(const struct cache *ca)
+{
+ return (bucket_bytes(ca) - sizeof(struct prio_set)) /
+ sizeof(struct bucket_disk);
+}
+
+static inline size_t prio_buckets(const struct cache *ca)
+{
+ return DIV_ROUND_UP((size_t) (ca)->mi.nbuckets, prios_per_bucket(ca));
+}
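+
+/*
+ * Illustration with made-up sizes: if bucket_bytes(ca) is 128k and each struct
+ * bucket_disk packs to 5 bytes (two 16 bit prios plus a gen), prios_per_bucket()
+ * works out to roughly 26000, so a device with a million buckets needs
+ * prio_buckets() of around 40 buckets just to hold its prios and gens.
+ */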
+
+void bch_cache_group_remove_cache(struct cache_group *, struct cache *);
+void bch_cache_group_add_cache(struct cache_group *, struct cache *);
+
+int bch_prio_read(struct cache *);
+
+void bch_recalc_min_prio(struct cache *, int);
+
+void bch_open_bucket_put(struct cache_set *, struct open_bucket *);
+
+struct open_bucket *bch_alloc_sectors_start(struct cache_set *,
+ struct write_point *,
+ unsigned, enum alloc_reserve,
+ struct closure *);
+
+void bch_alloc_sectors_append_ptrs(struct cache_set *, struct bkey_i_extent *,
+ unsigned, struct open_bucket *, unsigned);
+void bch_alloc_sectors_done(struct cache_set *, struct write_point *,
+ struct open_bucket *);
+
+struct open_bucket *bch_alloc_sectors(struct cache_set *, struct write_point *,
+ struct bkey_i_extent *, unsigned,
+ enum alloc_reserve, struct closure *);
+
+static inline void bch_wake_allocator(struct cache *ca)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ if ((p = ACCESS_ONCE(ca->alloc_thread)))
+ wake_up_process(p);
+ rcu_read_unlock();
+}
+
+static inline struct cache *cache_group_next_rcu(struct cache_group *devs,
+ unsigned *iter)
+{
+ struct cache *ret = NULL;
+
+ while (*iter < devs->nr_devices &&
+ !(ret = rcu_dereference(devs->d[*iter].dev)))
+ (*iter)++;
+
+ return ret;
+}
+
+#define group_for_each_cache_rcu(ca, devs, iter) \
+ for ((iter) = 0; \
+ ((ca) = cache_group_next_rcu((devs), &(iter))); \
+ (iter)++)
+
+static inline struct cache *cache_group_next(struct cache_group *devs,
+ unsigned *iter)
+{
+ struct cache *ret;
+
+ rcu_read_lock();
+ if ((ret = cache_group_next_rcu(devs, iter)))
+ percpu_ref_get(&ret->ref);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+#define group_for_each_cache(ca, devs, iter) \
+ for ((iter) = 0; \
+ (ca = cache_group_next(devs, &(iter))); \
+ percpu_ref_put(&ca->ref), (iter)++)
+
+#define __open_bucket_next_online_device(_c, _ob, _ptr, _ca) \
+({ \
+ (_ca) = NULL; \
+ \
+ while ((_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs && \
+ !((_ca) = PTR_CACHE(_c, _ptr))) \
+ (_ptr)++; \
+ (_ca); \
+})
+
+#define open_bucket_for_each_online_device(_c, _ob, _ptr, _ca) \
+ for ((_ptr) = (_ob)->ptrs; \
+ ((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\
+ (_ptr)++)
+
+void bch_cache_allocator_stop(struct cache *);
+int bch_cache_allocator_start(struct cache *);
+void bch_open_buckets_init(struct cache_set *);
+
+#endif /* _BCACHE_ALLOC_H */
diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h
new file mode 100644
index 0000000..337b6e4
--- /dev/null
+++ b/libbcache/alloc_types.h
@@ -0,0 +1,102 @@
+#ifndef _BCACHE_ALLOC_TYPES_H
+#define _BCACHE_ALLOC_TYPES_H
+
+#include <linux/mutex.h>
+
+#include "clock_types.h"
+
+/*
+ * There's two of these clocks, one for reads and one for writes:
+ *
+ * All fields protected by bucket_lock
+ */
+struct prio_clock {
+ /*
+ * "now" in (read/write) IO time - incremented whenever we do X amount
+ * of reads or writes.
+ *
+ * Goes with the bucket read/write prios: when we read or write to a
+ * bucket we reset the bucket's prio to the current hand; thus hand -
+ * prio = time since bucket was last read/written.
+ *
+ * The units are some amount (bytes/sectors) of data read/written, and
+ * the units can change on the fly if we need to rescale to fit
+ * everything in a u16 - your only guarantee is that the units are
+ * consistent.
+ */
+ u16 hand;
+ u16 min_prio;
+
+ int rw;
+
+ struct io_timer rescale;
+};
+
+/* There is one reserve for each type of btree, one for prios and gens
+ * and one for moving GC */
+enum alloc_reserve {
+ RESERVE_PRIO,
+ RESERVE_BTREE,
+ RESERVE_METADATA_LAST = RESERVE_BTREE,
+ RESERVE_MOVINGGC,
+
+ RESERVE_NONE,
+ RESERVE_NR,
+};
+
+static inline bool allocation_is_metadata(enum alloc_reserve id)
+{
+ return id <= RESERVE_METADATA_LAST;
+}
+
+struct cache_group {
+ spinlock_t lock;
+ unsigned nr_devices;
+ unsigned cur_device;
+ struct {
+ u64 weight;
+ struct cache *dev;
+ } d[MAX_CACHES_PER_SET];
+};
+
+/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
+#define OPEN_BUCKETS_COUNT 256
+
+#define WRITE_POINT_COUNT 16
+
+struct open_bucket {
+ struct list_head list;
+ struct mutex lock;
+ atomic_t pin;
+ bool has_full_ptrs;
+ /*
+ * recalculated every time we allocate from this open_bucket based on
+ * how many pointers we're actually going to use:
+ */
+ unsigned sectors_free;
+ unsigned nr_ptrs;
+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+ unsigned ptr_offset[BCH_REPLICAS_MAX];
+};
+
+struct write_point {
+ struct open_bucket *b;
+
+ /*
+ * Throttle writes to this write point if tier 0 is full?
+ */
+ bool throttle;
+
+ /*
+ * If not NULL, cache group for tiering, promotion and moving GC -
+ * always allocates a single replica
+ */
+ struct cache_group *group;
+
+ /*
+ * Otherwise do a normal replicated bucket allocation that could come
+ * from any device in tier 0 (foreground write)
+ */
+};
+
+#endif /* _BCACHE_ALLOC_TYPES_H */
diff --git a/libbcache/bcache.h b/libbcache/bcache.h
new file mode 100644
index 0000000..9a43a69
--- /dev/null
+++ b/libbcache/bcache.h
@@ -0,0 +1,831 @@
+#ifndef _BCACHE_H
+#define _BCACHE_H
+
+/*
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
+ *
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
+ *
+ * Support for multiple cache devices hasn't quite been finished off yet, but
+ * it's about 95% plumbed through. A cache set and its cache devices is sort of
+ * like a md raid array and its component devices. Most of the code doesn't care
+ * about individual cache devices, the main abstraction is the cache set.
+ *
+ * Multiple cache devices is intended to give us the ability to mirror dirty
+ * cached data and metadata, without mirroring clean cached data.
+ *
+ * Backing devices are different, in that they have a lifetime independent of a
+ * cache set. When you register a newly formatted backing device it'll come up
+ * in passthrough mode, and then you can attach and detach a backing device from
+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly
+ * invalidates any cached data for that backing device.
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There's also flash only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on a cache hit, and every so often we decrement the priority
+ * of each bucket. It could be used to implement something more sophisticated,
+ * if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
+ *
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part designed around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
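+ *
+ * In particular, the start of an extent is recovered by subtracting the size
+ * from the key's offset - this is exactly what bkey_start_offset() in bkey.h
+ * does:
+ *
+ *	start = k->p.offset - k->size;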
+ *
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
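+ *
+ * Loosely (pseudocode, not the real interface), a BTREE_REPLACE of @new over
+ * @old only takes effect if @old is still what the index contains:
+ *
+ *	if (the index entry @new overlaps still matches @old)
+ *		insert @new;
+ *	else
+ *		drop the insert;
+ *
+ * presumably so that e.g. a cache miss insertion or a data move can't clobber
+ * a foreground write that raced with it.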
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1MB of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
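+ *
+ * Schematically (a sketch, not this code's iterator interface), a lookup in a
+ * node with several sorted sets is one binary search per set plus a comparison
+ * of the candidates:
+ *
+ *	for each sorted set s in the node:
+ *		candidate[s] = first key in s >= the search position
+ *	return the smallest of the candidates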
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk, otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown. This means that without the
+ * journal, on random write workloads we constantly have to update all the leaf
+ * nodes in the btree, and those writes will be mostly empty (appending at most
+ * a few keys each) - highly inefficient in terms of amount of metadata writes,
+ * and it puts more strain on the various btree resorting/compacting code.
+ *
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
+ * all the keys in the open journal entries. That means that when we're updating
+ * a node in the btree, we can wait until a 4k block of keys fills up before
+ * writing them out.
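+ *
+ * Replay is correspondingly simple - roughly (a sketch, not the actual replay
+ * code):
+ *
+ *	for each open journal entry (one whose keys may not all be in the
+ *	btree yet), oldest first:
+ *		for each key in the entry:
+ *			redo the btree insert, as if the write just happened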
+ *
+ * For simplicity, we only journal updates to leaf nodes; updates to parent
+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
+ * the complexity to deal with journalling them (in particular, journal replay)
+ * - updates to non leaf nodes just happen synchronously (see btree_split()).
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include <linux/bug.h>
+#include <linux/bcache.h>
+#include <linux/bio.h>
+#include <linux/kobject.h>
+#include <linux/lglock.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/percpu-refcount.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/rhashtable.h>
+#include <linux/rwsem.h>
+#include <linux/seqlock.h>
+#include <linux/shrinker.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "bset.h"
+#include "fifo.h"
+#include "util.h"
+#include "closure.h"
+#include "opts.h"
+
+#include <linux/dynamic_fault.h>
+
+#define cache_set_init_fault(name) \
+ dynamic_fault("bcache:cache_set_init:" name)
+#define bch_meta_read_fault(name) \
+ dynamic_fault("bcache:meta:read:" name)
+#define bch_meta_write_fault(name) \
+ dynamic_fault("bcache:meta:write:" name)
+
+#define bch_fmt(_c, fmt) \
+ "bcache (%s): " fmt "\n", ((_c)->name)
+
+#define bch_info(c, fmt, ...) \
+ printk(KERN_INFO bch_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_notice(c, fmt, ...) \
+ printk(KERN_NOTICE bch_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn(c, fmt, ...) \
+ printk(KERN_WARNING bch_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err(c, fmt, ...) \
+ printk(KERN_ERR bch_fmt(c, fmt), ##__VA_ARGS__)
+
+#define bch_verbose(c, fmt, ...) \
+do { \
+ if ((c)->opts.verbose_recovery) \
+ bch_info(c, fmt, ##__VA_ARGS__); \
+} while (0)
+
+/* Parameters that are useful for debugging, but should always be compiled in: */
+#define BCH_DEBUG_PARAMS_ALWAYS() \
+ BCH_DEBUG_PARAM(key_merging_disabled, \
+ "Disables merging of extents") \
+ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
+ "Causes mark and sweep to compact and rewrite every " \
+ "btree node it traverses") \
+ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
+ "Disables rewriting of btree nodes during mark and sweep")\
+ BCH_DEBUG_PARAM(btree_gc_coalesce_disabled, \
+ "Disables coalescing of btree nodes") \
+ BCH_DEBUG_PARAM(btree_shrinker_disabled, \
+ "Disables the shrinker callback for the btree node cache")
+
+/* Parameters that should only be compiled in in debug mode: */
+#define BCH_DEBUG_PARAMS_DEBUG() \
+ BCH_DEBUG_PARAM(expensive_debug_checks, \
+ "Enables various runtime debugging checks that " \
+ "significantly affect performance") \
+ BCH_DEBUG_PARAM(debug_check_bkeys, \
+ "Run bkey_debugcheck (primarily checking GC/allocation "\
+ "information) when iterating over keys") \
+ BCH_DEBUG_PARAM(version_stress_test, \
+ "Assigns random version numbers to newly written " \
+ "extents, to test overlapping extent cases") \
+ BCH_DEBUG_PARAM(verify_btree_ondisk, \
+ "Reread btree nodes at various points to verify the " \
+ "mergesort in the read path against modifications " \
+ "done in memory") \
+
+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
+
+#ifdef CONFIG_BCACHE_DEBUG
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
+#else
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
+#endif
+
+/* name, frequency_units, duration_units */
+#define BCH_TIME_STATS() \
+ BCH_TIME_STAT(mca_alloc, sec, us) \
+ BCH_TIME_STAT(mca_scan, sec, ms) \
+ BCH_TIME_STAT(btree_gc, sec, ms) \
+ BCH_TIME_STAT(btree_coalesce, sec, ms) \
+ BCH_TIME_STAT(btree_split, sec, us) \
+ BCH_TIME_STAT(btree_sort, ms, us) \
+ BCH_TIME_STAT(btree_read, ms, us) \
+ BCH_TIME_STAT(journal_write, us, us) \
+ BCH_TIME_STAT(journal_delay, ms, us) \
+ BCH_TIME_STAT(journal_blocked, sec, ms) \
+ BCH_TIME_STAT(journal_flush_seq, us, us)
+
+#include "alloc_types.h"
+#include "blockdev_types.h"
+#include "buckets_types.h"
+#include "clock_types.h"
+#include "io_types.h"
+#include "journal_types.h"
+#include "keylist_types.h"
+#include "keybuf_types.h"
+#include "move_types.h"
+#include "stats_types.h"
+#include "super_types.h"
+
+/* 256k, in sectors */
+#define BTREE_NODE_SIZE_MAX 512
+
+/*
+ * Number of nodes we might have to allocate in a worst case btree split
+ * operation - we split all the way up to the root, then allocate a new root
+ * (e.g. for a btree of depth 2 that's (2 + 1) * 2 + 1 = 7 nodes).
+ */
+#define btree_reserve_required_nodes(depth) (((depth) + 1) * 2 + 1)
+
+/* Number of nodes btree coalesce will try to coalesce at once */
+#define GC_MERGE_NODES 4U
+
+/* Maximum number of nodes we might need to allocate atomically: */
+#define BTREE_RESERVE_MAX \
+ (btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
+
+/* Size of the freelist we allocate btree nodes from: */
+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 2)
+
+struct btree;
+struct cache;
+
+enum gc_phase {
+ GC_PHASE_PENDING_DELETE = BTREE_ID_NR + 1,
+ GC_PHASE_DONE
+};
+
+struct gc_pos {
+ enum gc_phase phase;
+ struct bpos pos;
+ unsigned level;
+};
+
+struct cache_member_cpu {
+ u64 nbuckets; /* device size */
+ u16 first_bucket; /* index of first bucket used */
+ u16 bucket_size; /* sectors */
+ u8 state;
+ u8 tier;
+ u8 replication_set;
+ u8 has_metadata;
+ u8 has_data;
+ u8 replacement;
+ u8 discard;
+ u8 valid;
+};
+
+struct cache_member_rcu {
+ struct rcu_head rcu;
+ unsigned nr_in_set;
+ struct cache_member_cpu m[];
+};
+
+/* cache->flags: */
+enum {
+ CACHE_DEV_REMOVING,
+ CACHE_DEV_FORCE_REMOVE,
+};
+
+struct cache {
+ struct percpu_ref ref;
+ struct rcu_head free_rcu;
+ struct work_struct free_work;
+ struct work_struct remove_work;
+ unsigned long flags;
+
+ struct cache_set *set;
+
+ struct cache_group self;
+
+ /*
+ * Cached version of this device's member info from superblock
+ * Committed by write_super()
+ */
+ struct {
+ u8 nr_this_dev;
+ } sb;
+ struct cache_member_cpu mi;
+
+ struct bcache_superblock disk_sb;
+
+ struct kobject kobj;
+
+ /* biosets used in cloned bios for replicas and moving_gc */
+ struct bio_set replica_set;
+
+ struct task_struct *alloc_thread;
+
+ struct prio_set *disk_buckets;
+
+ /*
+ * When allocating new buckets, prio_write() gets first dibs - since we
+	 * may not be able to allocate at all without writing priorities and gens.
+ * prio_last_buckets[] contains the last buckets we wrote priorities to
+ * (so gc can mark them as metadata).
+ */
+ u64 *prio_buckets;
+ u64 *prio_last_buckets;
+ spinlock_t prio_buckets_lock;
+ struct bio *bio_prio;
+
+ /*
+ * free: Buckets that are ready to be used
+ *
+ * free_inc: Incoming buckets - these are buckets that currently have
+ * cached data in them, and we can't reuse them until after we write
+ * their new gen to disk. After prio_write() finishes writing the new
+ * gens/prios, they'll be moved to the free list (and possibly discarded
+ * in the process)
+ */
+ DECLARE_FIFO(long, free)[RESERVE_NR];
+ DECLARE_FIFO(long, free_inc);
+ spinlock_t freelist_lock;
+
+ size_t fifo_last_bucket;
+
+ /* Allocation stuff: */
+
+ /* most out of date gen in the btree */
+ u8 *oldest_gens;
+ struct bucket *buckets;
+ unsigned short bucket_bits; /* ilog2(bucket_size) */
+
+ /* last calculated minimum prio */
+ u16 min_prio[2];
+
+ /*
+	 * Bucket bookkeeping. The first element is updated by GC, the
+ * second contains a saved copy of the stats from the beginning
+ * of GC.
+ */
+ struct bucket_stats_cache __percpu *bucket_stats_percpu;
+ struct bucket_stats_cache bucket_stats_cached;
+
+ atomic_long_t saturated_count;
+ size_t inc_gen_needs_gc;
+
+ struct mutex heap_lock;
+ DECLARE_HEAP(struct bucket_heap_entry, heap);
+
+ /* Moving GC: */
+ struct task_struct *moving_gc_read;
+
+ struct bch_pd_controller moving_gc_pd;
+
+ /* Tiering: */
+ struct write_point tiering_write_point;
+
+ struct write_point copygc_write_point;
+
+ struct journal_device journal;
+
+ struct work_struct io_error_work;
+
+ /* The rest of this all shows up in sysfs */
+#define IO_ERROR_SHIFT 20
+ atomic_t io_errors;
+ atomic_t io_count;
+
+ atomic64_t meta_sectors_written;
+ atomic64_t btree_sectors_written;
+ u64 __percpu *sectors_written;
+};
+
+/*
+ * Flag bits for what phase of startup/shutdown the cache set is at, how we're
+ * shutting down, etc.:
+ *
+ * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching
+ * all the backing devices first (their cached data gets invalidated, and they
+ * won't automatically reattach).
+ *
+ * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
+ * we'll continue to run normally for a while with CACHE_SET_STOPPING set (i.e.
+ * flushing dirty data).
+ *
+ * CACHE_SET_RUNNING means all cache devices have been registered and journal
+ * replay is complete.
+ */
+enum {
+ /* Startup: */
+ CACHE_SET_INITIAL_GC_DONE,
+ CACHE_SET_RUNNING,
+
+ /* Shutdown: */
+ CACHE_SET_UNREGISTERING,
+ CACHE_SET_STOPPING,
+ CACHE_SET_RO,
+ CACHE_SET_RO_COMPLETE,
+ CACHE_SET_EMERGENCY_RO,
+ CACHE_SET_WRITE_DISABLE_COMPLETE,
+ CACHE_SET_GC_STOPPING,
+ CACHE_SET_GC_FAILURE,
+ CACHE_SET_BDEV_MOUNTED,
+ CACHE_SET_ERROR,
+ CACHE_SET_FSCK_FIXED_ERRORS,
+};
+
+struct btree_debug {
+ unsigned id;
+ struct dentry *btree;
+ struct dentry *btree_format;
+ struct dentry *failed;
+};
+
+struct cache_set {
+ struct closure cl;
+
+ struct list_head list;
+ struct kobject kobj;
+ struct kobject internal;
+ struct kobject opts_dir;
+ struct kobject time_stats;
+ struct completion *stop_completion;
+ unsigned long flags;
+
+ int minor;
+ struct device *chardev;
+ struct super_block *vfs_sb;
+ char name[40];
+
+ /* Counts outstanding writes, for clean transition to read-only */
+ struct percpu_ref writes;
+ struct work_struct read_only_work;
+
+ struct cache __rcu *cache[MAX_CACHES_PER_SET];
+
+ struct mutex mi_lock;
+ struct cache_member_rcu __rcu *members;
+ struct cache_member *disk_mi; /* protected by register_lock */
+
+ struct cache_set_opts opts;
+
+ /*
+ * Cached copy in native endianness:
+ * Set by cache_sb_to_cache_set:
+ */
+ struct {
+ u16 block_size;
+ u16 btree_node_size;
+
+ u8 nr_in_set;
+ u8 clean;
+
+ u8 meta_replicas_have;
+ u8 data_replicas_have;
+
+ u8 str_hash_type;
+ } sb;
+
+ struct cache_sb disk_sb;
+ unsigned short block_bits; /* ilog2(block_size) */
+
+ struct closure sb_write;
+ struct semaphore sb_write_mutex;
+
+ struct backing_dev_info bdi;
+
+ /* BTREE CACHE */
+ struct bio_set btree_read_bio;
+
+ struct btree_root btree_roots[BTREE_ID_NR];
+ struct mutex btree_root_lock;
+
+ bool btree_cache_table_init_done;
+ struct rhashtable btree_cache_table;
+
+ /*
+ * We never free a struct btree, except on shutdown - we just put it on
+ * the btree_cache_freed list and reuse it later. This simplifies the
+ * code, and it doesn't cost us much memory as the memory usage is
+ * dominated by buffers that hold the actual btree node data and those
+ * can be freed - and the number of struct btrees allocated is
+ * effectively bounded.
+ *
+	 * btree_cache_freeable is effectively a small cache - we use it because
+ * high order page allocations can be rather expensive, and it's quite
+ * common to delete and allocate btree nodes in quick succession. It
+ * should never grow past ~2-3 nodes in practice.
+ */
+ struct mutex btree_cache_lock;
+ struct list_head btree_cache;
+ struct list_head btree_cache_freeable;
+ struct list_head btree_cache_freed;
+
+ /* Number of elements in btree_cache + btree_cache_freeable lists */
+ unsigned btree_cache_used;
+ unsigned btree_cache_reserve;
+ struct shrinker btree_cache_shrink;
+
+ /*
+ * If we need to allocate memory for a new btree node and that
+ * allocation fails, we can cannibalize another node in the btree cache
+ * to satisfy the allocation - lock to guarantee only one thread does
+ * this at a time:
+ */
+ struct closure_waitlist mca_wait;
+ struct task_struct *btree_cache_alloc_lock;
+
+ mempool_t btree_reserve_pool;
+
+ /*
+ * Cache of allocated btree nodes - if we allocate a btree node and
+	 * don't use it and later free it, that space can't be reused until going
+	 * _all_ the way through the allocator (which exposes us to a livelock
+	 * when allocating a btree reserve fails halfway through) - instead, we
+ * can stick them here:
+ */
+ struct btree_alloc {
+ struct open_bucket *ob;
+ BKEY_PADDED(k);
+ } btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+ unsigned btree_reserve_cache_nr;
+ struct mutex btree_reserve_cache_lock;
+
+ mempool_t btree_interior_update_pool;
+ struct list_head btree_interior_update_list;
+ struct mutex btree_interior_update_lock;
+
+ struct workqueue_struct *wq;
+ /* copygc needs its own workqueue for index updates.. */
+ struct workqueue_struct *copygc_wq;
+
+ /* ALLOCATION */
+ struct bch_pd_controller foreground_write_pd;
+ struct delayed_work pd_controllers_update;
+ unsigned pd_controllers_update_seconds;
+ spinlock_t foreground_write_pd_lock;
+ struct bch_write_op *write_wait_head;
+ struct bch_write_op *write_wait_tail;
+
+ struct timer_list foreground_write_wakeup;
+
+ /*
+ * These contain all r/w devices - i.e. devices we can currently
+ * allocate from:
+ */
+ struct cache_group cache_all;
+ struct cache_group cache_tiers[CACHE_TIERS];
+
+ u64 capacity; /* sectors */
+
+ /*
+ * When capacity _decreases_ (due to a disk being removed), we
+ * increment capacity_gen - this invalidates outstanding reservations
+ * and forces them to be revalidated
+ */
+ u32 capacity_gen;
+
+ atomic64_t sectors_available;
+
+ struct bucket_stats_cache_set __percpu *bucket_stats_percpu;
+ struct bucket_stats_cache_set bucket_stats_cached;
+ struct lglock bucket_stats_lock;
+
+ struct mutex bucket_lock;
+
+ struct closure_waitlist freelist_wait;
+
+ /*
+ * When we invalidate buckets, we use both the priority and the amount
+ * of good data to determine which buckets to reuse first - to weight
+ * those together consistently we keep track of the smallest nonzero
+ * priority of any bucket.
+ */
+ struct prio_clock prio_clock[2];
+
+ struct io_clock io_clock[2];
+
+ /* SECTOR ALLOCATOR */
+ struct list_head open_buckets_open;
+ struct list_head open_buckets_free;
+ unsigned open_buckets_nr_free;
+ struct closure_waitlist open_buckets_wait;
+ spinlock_t open_buckets_lock;
+ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
+
+ struct write_point btree_write_point;
+
+ struct write_point write_points[WRITE_POINT_COUNT];
+ struct write_point promote_write_point;
+
+ /*
+ * This write point is used for migrating data off a device
+ * and can point to any other device.
+ * We can't use the normal write points because those will
+ * gang up n replicas, and for migration we want only one new
+ * replica.
+ */
+ struct write_point migration_write_point;
+
+ /* GARBAGE COLLECTION */
+ struct task_struct *gc_thread;
+ atomic_t kick_gc;
+
+ /*
+ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
+ * has been marked by GC.
+ *
+ * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
+ *
+ * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
+ * currently running, and gc marks are currently valid
+ *
+ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
+ * can read without a lock.
+ */
+ seqcount_t gc_pos_lock;
+ struct gc_pos gc_pos;
+
+ /*
+ * The allocation code needs gc_mark in struct bucket to be correct, but
+ * it's not while a gc is in progress.
+ */
+ struct rw_semaphore gc_lock;
+
+ /* IO PATH */
+ struct bio_set bio_read;
+ struct bio_set bio_read_split;
+ struct bio_set bio_write;
+ struct mutex bio_bounce_pages_lock;
+ mempool_t bio_bounce_pages;
+
+ mempool_t lz4_workspace_pool;
+ void *zlib_workspace;
+ struct mutex zlib_workspace_lock;
+ mempool_t compression_bounce[2];
+ struct bio_decompress_worker __percpu
+ *bio_decompress_worker;
+
+ /* For punting bio submissions to workqueue, io.c */
+ struct bio_list bio_submit_list;
+ struct work_struct bio_submit_work;
+ spinlock_t bio_submit_lock;
+
+ struct bio_list read_retry_list;
+ struct work_struct read_retry_work;
+ spinlock_t read_retry_lock;
+
+ /* FILESYSTEM */
+ wait_queue_head_t writeback_wait;
+ atomic_t writeback_pages;
+ unsigned writeback_pages_max;
+ atomic_long_t nr_inodes;
+
+ /* TIERING */
+ struct task_struct *tiering_read;
+ struct bch_pd_controller tiering_pd;
+
+ /* NOTIFICATIONS */
+ struct mutex uevent_lock;
+ struct kobj_uevent_env uevent_env;
+
+ /* DEBUG JUNK */
+ struct dentry *debug;
+ struct btree_debug btree_debug[BTREE_ID_NR];
+#ifdef CONFIG_BCACHE_DEBUG
+ struct btree *verify_data;
+ struct btree_node *verify_ondisk;
+ struct mutex verify_lock;
+#endif
+
+ u64 unused_inode_hint;
+
+ /*
+ * A btree node on disk could have too many bsets for an iterator to fit
+ * on the stack - have to dynamically allocate them
+ */
+ mempool_t fill_iter;
+
+ mempool_t btree_bounce_pool;
+
+ struct journal journal;
+
+ unsigned bucket_journal_seq;
+
+ /* CACHING OTHER BLOCK DEVICES */
+ mempool_t search;
+ struct radix_tree_root devices;
+ struct list_head cached_devs;
+ u64 cached_dev_sectors;
+ struct closure caching;
+
+#define CONGESTED_MAX 1024
+ unsigned congested_last_us;
+ atomic_t congested;
+
+ /* The rest of this all shows up in sysfs */
+ unsigned congested_read_threshold_us;
+ unsigned congested_write_threshold_us;
+
+ struct cache_accounting accounting;
+ atomic_long_t cache_read_races;
+ atomic_long_t writeback_keys_done;
+ atomic_long_t writeback_keys_failed;
+
+ unsigned error_limit;
+ unsigned error_decay;
+
+ unsigned foreground_write_ratelimit_enabled:1;
+ unsigned copy_gc_enabled:1;
+ unsigned tiering_enabled:1;
+ unsigned tiering_percent;
+
+ /*
+ * foreground writes will be throttled when the number of free
+ * buckets is below this percentage
+ */
+ unsigned foreground_target_percent;
+
+#define BCH_DEBUG_PARAM(name, description) bool name;
+ BCH_DEBUG_PARAMS_ALL()
+#undef BCH_DEBUG_PARAM
+
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ struct time_stats name##_time;
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+};
+
+static inline unsigned bucket_pages(const struct cache *ca)
+{
+ return ca->mi.bucket_size / PAGE_SECTORS;
+}
+
+static inline unsigned bucket_bytes(const struct cache *ca)
+{
+ return ca->mi.bucket_size << 9;
+}
+
+static inline unsigned block_bytes(const struct cache_set *c)
+{
+ return c->sb.block_size << 9;
+}
+
+#endif /* _BCACHE_H */
diff --git a/libbcache/bkey.c b/libbcache/bkey.c
new file mode 100644
index 0000000..64d2c84
--- /dev/null
+++ b/libbcache/bkey.c
@@ -0,0 +1,1261 @@
+
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include <linux/kernel.h>
+
+#include "bkey.h"
+#include "bset.h"
+#include "util.h"
+
+const struct bkey_format bch_bkey_format_current = BKEY_FORMAT_CURRENT;
+
+struct bkey __bkey_unpack_key(const struct bkey_format *,
+ const struct bkey_packed *);
+
+void bch_to_binary(char *out, const u64 *p, unsigned nr_bits)
+{
+ unsigned bit = high_bit_offset, done = 0;
+
+ while (1) {
+ while (bit < 64) {
+ if (done && !(done % 8))
+ *out++ = ' ';
+ *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0';
+ bit++;
+ done++;
+ if (done == nr_bits) {
+ *out++ = '\0';
+ return;
+ }
+ }
+
+ p = next_word(p);
+ bit = 0;
+ }
+}
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+static void bch_bkey_pack_verify(const struct bkey_packed *packed,
+ const struct bkey *unpacked,
+ const struct bkey_format *format)
+{
+ struct bkey tmp;
+
+ BUG_ON(bkeyp_val_u64s(format, packed) !=
+ bkey_val_u64s(unpacked));
+
+ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
+
+ tmp = __bkey_unpack_key(format, packed);
+
+ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
+ char buf1[160], buf2[160];
+ char buf3[160], buf4[160];
+
+ bch_bkey_to_text(buf1, sizeof(buf1), unpacked);
+ bch_bkey_to_text(buf2, sizeof(buf2), &tmp);
+ bch_to_binary(buf3, (void *) unpacked, 80);
+ bch_to_binary(buf4, high_word(format, packed), 80);
+
+ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n",
+ format->key_u64s,
+ format->bits_per_field[0],
+ format->bits_per_field[1],
+ format->bits_per_field[2],
+ format->bits_per_field[3],
+ format->bits_per_field[4],
+ buf1, buf2, buf3, buf4);
+ }
+}
+
+#else
+static inline void bch_bkey_pack_verify(const struct bkey_packed *packed,
+ const struct bkey *unpacked,
+ const struct bkey_format *format) {}
+#endif
+
+int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
+{
+ char *out = buf, *end = buf + size;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ p("u64s %u type %u %llu:%llu snap %u len %u ver %u",
+ k->u64s, k->type, k->p.inode, k->p.offset,
+ k->p.snapshot, k->size, k->version);
+
+ BUG_ON(bkey_packed(k));
+
+ switch (k->type) {
+ case KEY_TYPE_DELETED:
+ p(" deleted");
+ break;
+ case KEY_TYPE_DISCARD:
+ p(" discard");
+ break;
+ case KEY_TYPE_ERROR:
+ p(" error");
+ break;
+ case KEY_TYPE_COOKIE:
+ p(" cookie");
+ break;
+ }
+#undef p
+
+ return out - buf;
+}
+
+struct pack_state {
+ const struct bkey_format *format;
+ unsigned bits; /* bits remaining in current word */
+ u64 w; /* current word */
+ u64 *p; /* pointer to next word */
+};
+
+__always_inline
+static struct pack_state pack_state_init(const struct bkey_format *format,
+ struct bkey_packed *k)
+{
+ u64 *p = high_word(format, k);
+
+ return (struct pack_state) {
+ .format = format,
+ .bits = 64 - high_bit_offset,
+ .w = 0,
+ .p = p,
+ };
+}
+
+__always_inline
+static void pack_state_finish(struct pack_state *state,
+ struct bkey_packed *k)
+{
+ EBUG_ON(state->p < k->_data);
+ EBUG_ON(state->p >= k->_data + state->format->key_u64s);
+
+ *state->p = state->w;
+}
+
+struct unpack_state {
+ const struct bkey_format *format;
+ unsigned bits; /* bits remaining in current word */
+ u64 w; /* current word */
+ const u64 *p; /* pointer to next word */
+};
+
+__always_inline
+static struct unpack_state unpack_state_init(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ const u64 *p = high_word(format, k);
+
+ return (struct unpack_state) {
+ .format = format,
+ .bits = 64 - high_bit_offset,
+ .w = *p << high_bit_offset,
+ .p = p,
+ };
+}
+
+__always_inline
+static u64 get_inc_field(struct unpack_state *state, unsigned field)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
+
+ if (bits >= state->bits) {
+ v = state->w >> (64 - bits);
+ bits -= state->bits;
+
+ state->p = next_word(state->p);
+ state->w = *state->p;
+ state->bits = 64;
+ }
+
+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
+ v |= (state->w >> 1) >> (63 - bits);
+ state->w <<= bits;
+ state->bits -= bits;
+
+ return v + offset;
+}
+
+__always_inline
+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 offset = le64_to_cpu(state->format->field_offset[field]);
+
+ if (v < offset)
+ return false;
+
+ v -= offset;
+
+ if (fls64(v) > bits)
+ return false;
+
+ if (bits > state->bits) {
+ bits -= state->bits;
+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
+ state->w |= (v >> 1) >> (bits - 1);
+
+ *state->p = state->w;
+ state->p = next_word(state->p);
+ state->w = 0;
+ state->bits = 64;
+ }
+
+ state->bits -= bits;
+ state->w |= v << state->bits;
+
+ return true;
+}
+
+/*
+ * Note: does NOT set out->format (we don't know what it should be here!)
+ *
+ * Also: doesn't work on extents - it doesn't preserve the invariant that
+ * if k is packed, bkey_start_pos(k) will successfully pack
+ */
+static bool bch_bkey_transform_key(const struct bkey_format *out_f,
+ struct bkey_packed *out,
+ const struct bkey_format *in_f,
+ const struct bkey_packed *in)
+{
+ struct pack_state out_s = pack_state_init(out_f, out);
+ struct unpack_state in_s = unpack_state_init(in_f, in);
+ unsigned i;
+
+ out->_data[0] = 0;
+
+ for (i = 0; i < BKEY_NR_FIELDS; i++)
+ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
+ return false;
+
+ /* Can't happen because the val would be too big to unpack: */
+ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
+
+ pack_state_finish(&out_s, out);
+ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s;
+ out->needs_whiteout = in->needs_whiteout;
+ out->type = in->type;
+
+ return true;
+}
+
+bool bch_bkey_transform(const struct bkey_format *out_f,
+ struct bkey_packed *out,
+ const struct bkey_format *in_f,
+ const struct bkey_packed *in)
+{
+ if (!bch_bkey_transform_key(out_f, out, in_f, in))
+ return false;
+
+ memcpy_u64s((u64 *) out + out_f->key_u64s,
+ (u64 *) in + in_f->key_u64s,
+ (in->u64s - in_f->key_u64s));
+ return true;
+}
+
+struct bkey __bkey_unpack_key(const struct bkey_format *format,
+ const struct bkey_packed *in)
+{
+ struct unpack_state state = unpack_state_init(format, in);
+ struct bkey out;
+
+ EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(in->u64s < format->key_u64s);
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
+
+ out.u64s = BKEY_U64s + in->u64s - format->key_u64s;
+ out.format = KEY_FORMAT_CURRENT;
+ out.needs_whiteout = in->needs_whiteout;
+ out.type = in->type;
+ out.pad[0] = 0;
+ out.p.inode = get_inc_field(&state, BKEY_FIELD_INODE);
+ out.p.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
+ out.p.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
+ out.size = get_inc_field(&state, BKEY_FIELD_SIZE);
+ out.version = get_inc_field(&state, BKEY_FIELD_VERSION);
+
+ return out;
+}
+
+#ifndef HAVE_BCACHE_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *format,
+ const struct bkey_packed *in)
+{
+ struct unpack_state state = unpack_state_init(format, in);
+ struct bpos out;
+
+ EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(in->u64s < format->key_u64s);
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+
+ out.inode = get_inc_field(&state, BKEY_FIELD_INODE);
+ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
+ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
+
+ return out;
+}
+#endif
+
+/**
+ * bkey_pack_key -- pack just the key, not the value
+ */
+bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
+ const struct bkey_format *format)
+{
+ struct pack_state state = pack_state_init(format, out);
+
+ EBUG_ON((void *) in == (void *) out);
+ EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(in->format != KEY_FORMAT_CURRENT);
+
+ out->_data[0] = 0;
+
+ if (!set_inc_field(&state, BKEY_FIELD_INODE, in->p.inode) ||
+ !set_inc_field(&state, BKEY_FIELD_OFFSET, in->p.offset) ||
+ !set_inc_field(&state, BKEY_FIELD_SNAPSHOT, in->p.snapshot) ||
+ !set_inc_field(&state, BKEY_FIELD_SIZE, in->size) ||
+ !set_inc_field(&state, BKEY_FIELD_VERSION, in->version))
+ return false;
+
+ /*
+ * Extents - we have to guarantee that if an extent is packed, a trimmed
+ * version will also pack:
+ */
+ if (bkey_start_offset(in) < format->field_offset[BKEY_FIELD_OFFSET])
+ return false;
+
+ pack_state_finish(&state, out);
+ out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ out->needs_whiteout = in->needs_whiteout;
+ out->type = in->type;
+
+ bch_bkey_pack_verify(out, in, format);
+ return true;
+}
+
+/*
+ * Alternate implementations using bch_bkey_transform_key() - unfortunately, too
+ * slow
+ */
+#if 0
+struct bkey __bkey_unpack_key(const struct bkey_format *format,
+ const struct bkey_packed *in)
+{
+ struct bkey out;
+ bool s;
+
+ EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(in->u64s < format->key_u64s);
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+
+ s = bch_bkey_transform_key(&bch_bkey_format_current, (void *) &out,
+ format, in);
+ EBUG_ON(!s);
+
+ out.format = KEY_FORMAT_CURRENT;
+
+ return out;
+}
+
+bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
+ const struct bkey_format *format)
+{
+ EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(in->format != KEY_FORMAT_CURRENT);
+
+ if (!bch_bkey_transform_key(format, out,
+ &bch_bkey_format_current, (void *) in))
+ return false;
+
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+
+ bch_bkey_pack_verify(out, in, format);
+ return true;
+}
+#endif
+
+/**
+ * bkey_unpack -- unpack the key and the value
+ */
+void bkey_unpack(const struct btree *b, struct bkey_i *dst,
+ const struct bkey_packed *src)
+{
+ dst->k = bkey_unpack_key(b, src);
+
+ memcpy_u64s(&dst->v,
+ bkeyp_val(&b->format, src),
+ bkeyp_val_u64s(&b->format, src));
+}
+
+/**
+ * bkey_pack -- pack the key and the value
+ */
+bool bkey_pack(struct bkey_packed *out, const struct bkey_i *in,
+ const struct bkey_format *format)
+{
+ struct bkey_packed tmp;
+
+ if (!bkey_pack_key(&tmp, &in->k, format))
+ return false;
+
+ memmove_u64s((u64 *) out + format->key_u64s,
+ &in->v,
+ bkey_val_u64s(&in->k));
+ memcpy_u64s(out, &tmp, format->key_u64s);
+
+ return true;
+}
+
+__always_inline
+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 offset = le64_to_cpu(state->format->field_offset[field]);
+ bool ret = true;
+
+ EBUG_ON(v < offset);
+ v -= offset;
+
+ if (fls64(v) > bits) {
+ v = ~(~0ULL << bits);
+ ret = false;
+ }
+
+ if (bits > state->bits) {
+ bits -= state->bits;
+ state->w |= (v >> 1) >> (bits - 1);
+
+ *state->p = state->w;
+ state->p = next_word(state->p);
+ state->w = 0;
+ state->bits = 64;
+ }
+
+ state->bits -= bits;
+ state->w |= v << state->bits;
+
+ return ret;
+}
+
+#ifdef CONFIG_BCACHE_DEBUG
+static bool bkey_packed_successor(struct bkey_packed *out,
+ const struct btree *b,
+ struct bkey_packed k)
+{
+ const struct bkey_format *f = &b->format;
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned first_bit, offset;
+ u64 *p;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+ if (!nr_key_bits)
+ return false;
+
+ *out = k;
+
+ first_bit = high_bit_offset + nr_key_bits - 1;
+ p = nth_word(high_word(f, out), first_bit >> 6);
+ offset = 63 - (first_bit & 63);
+
+ while (nr_key_bits) {
+ unsigned bits = min(64 - offset, nr_key_bits);
+ u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+ if ((*p & mask) != mask) {
+ *p += 1ULL << offset;
+ EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0);
+ return true;
+ }
+
+ *p &= ~mask;
+ p = prev_word(p);
+ nr_key_bits -= bits;
+ offset = 0;
+ }
+
+ return false;
+}
+#endif
+
+/*
+ * Returns a packed key that compares <= in
+ *
+ * This is used in bset_search_tree(), where we need a packed pos in order to be
+ * able to compare against the keys in the auxiliary search tree - and it's
+ * legal to use a packed pos that isn't equivalent to the original pos,
+ * _provided_ it compares <= to the original pos.
+ */
+enum bkey_pack_pos_ret bkey_pack_pos_lossy(struct bkey_packed *out,
+ struct bpos in,
+ const struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ struct pack_state state = pack_state_init(f, out);
+#ifdef CONFIG_BCACHE_DEBUG
+ struct bpos orig = in;
+#endif
+ bool exact = true;
+
+ out->_data[0] = 0;
+
+ if (unlikely(in.snapshot <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
+ if (!in.offset-- &&
+ !in.inode--)
+ return BKEY_PACK_POS_FAIL;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(in.offset <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
+ if (!in.inode--)
+ return BKEY_PACK_POS_FAIL;
+ in.offset = KEY_OFFSET_MAX;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(in.inode <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
+ return BKEY_PACK_POS_FAIL;
+
+ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) {
+ in.offset = KEY_OFFSET_MAX;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) {
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))
+ exact = false;
+
+ pack_state_finish(&state, out);
+ out->u64s = f->key_u64s;
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ out->type = KEY_TYPE_DELETED;
+
+#ifdef CONFIG_BCACHE_DEBUG
+ if (exact) {
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig));
+ } else {
+ struct bkey_packed successor;
+
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
+ BUG_ON(bkey_packed_successor(&successor, b, *out) &&
+ bkey_cmp_left_packed(b, &successor, &orig) < 0);
+ }
+#endif
+
+ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
+}
+
+void bch_bkey_format_init(struct bkey_format_state *s)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
+ s->field_min[i] = U64_MAX;
+
+ for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
+ s->field_max[i] = 0;
+
+ /* Make sure we can store a size of 0: */
+ s->field_min[BKEY_FIELD_SIZE] = 0;
+}
+
+static void __bkey_format_add(struct bkey_format_state *s,
+ unsigned field, u64 v)
+{
+ s->field_min[field] = min(s->field_min[field], v);
+ s->field_max[field] = max(s->field_max[field], v);
+}
+
+/*
+ * Changes @format so that @k can be successfully packed with @format
+ */
+void bch_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
+{
+ __bkey_format_add(s, BKEY_FIELD_INODE, k->p.inode);
+ __bkey_format_add(s, BKEY_FIELD_OFFSET, k->p.offset);
+ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k));
+ __bkey_format_add(s, BKEY_FIELD_SNAPSHOT, k->p.snapshot);
+ __bkey_format_add(s, BKEY_FIELD_SIZE, k->size);
+ __bkey_format_add(s, BKEY_FIELD_VERSION, k->version);
+}
+
+void bch_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
+{
+ unsigned field = 0;
+
+ __bkey_format_add(s, field++, p.inode);
+ __bkey_format_add(s, field++, p.offset);
+ __bkey_format_add(s, field++, p.snapshot);
+}
+
+/*
+ * We don't want it to be possible for the packed format to represent fields
+ * bigger than a u64... that will cause confusion and issues (like with
+ * bkey_packed_successor())
+ */
+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
+ unsigned bits, u64 offset)
+{
+ offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1));
+
+ f->bits_per_field[i] = bits;
+ f->field_offset[i] = cpu_to_le64(offset);
+}
+
+struct bkey_format bch_bkey_format_done(struct bkey_format_state *s)
+{
+ unsigned i, bits = KEY_PACKED_BITS_START;
+ struct bkey_format ret = {
+ .nr_fields = BKEY_NR_FIELDS,
+ };
+
+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
+ s->field_min[i] = min(s->field_min[i], s->field_max[i]);
+
+ set_format_field(&ret, i,
+ fls64(s->field_max[i] - s->field_min[i]),
+ s->field_min[i]);
+
+ bits += ret.bits_per_field[i];
+ }
+
+ ret.key_u64s = DIV_ROUND_UP(bits, 64);
+
+ /* if we have enough spare bits, round fields up to nearest byte */
+ bits = ret.key_u64s * 64 - bits;
+
+ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
+ unsigned r = round_up(ret.bits_per_field[i], 8) -
+ ret.bits_per_field[i];
+
+ if (r <= bits) {
+ set_format_field(&ret, i,
+ ret.bits_per_field[i] + r,
+ le64_to_cpu(ret.field_offset[i]));
+ bits -= r;
+ }
+ }
+
+ EBUG_ON(bch_bkey_format_validate(&ret));
+ return ret;
+}
+
+const char *bch_bkey_format_validate(struct bkey_format *f)
+{
+ unsigned i, bits = KEY_PACKED_BITS_START;
+
+ if (f->nr_fields != BKEY_NR_FIELDS)
+ return "invalid format: incorrect number of fields";
+
+ for (i = 0; i < f->nr_fields; i++) {
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (f->bits_per_field[i] > 64)
+ return "invalid format: field too large";
+
+ if (field_offset &&
+ (f->bits_per_field[i] == 64 ||
+ (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
+ field_offset)))
+ return "invalid format: offset + bits overflow";
+
+ bits += f->bits_per_field[i];
+ }
+
+ if (f->key_u64s != DIV_ROUND_UP(bits, 64))
+ return "invalid format: incorrect key_u64s";
+
+ return NULL;
+}
+
+/*
+ * Most significant differing bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ */
+__pure
+unsigned bkey_greatest_differing_bit(const struct btree *b,
+ const struct bkey_packed *l_k,
+ const struct bkey_packed *r_k)
+{
+ const u64 *l = high_word(&b->format, l_k);
+ const u64 *r = high_word(&b->format, r_k);
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned word_bits = 64 - high_bit_offset;
+ u64 l_v, r_v;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+ /* for big endian, skip past header */
+ l_v = *l & (~0ULL >> high_bit_offset);
+ r_v = *r & (~0ULL >> high_bit_offset);
+
+ while (nr_key_bits) {
+ if (nr_key_bits < word_bits) {
+ l_v >>= word_bits - nr_key_bits;
+ r_v >>= word_bits - nr_key_bits;
+ nr_key_bits = 0;
+ } else {
+ nr_key_bits -= word_bits;
+ }
+
+ if (l_v != r_v)
+ return fls64(l_v ^ r_v) - 1 + nr_key_bits;
+
+ l = next_word(l);
+ r = next_word(r);
+
+ l_v = *l;
+ r_v = *r;
+ word_bits = 64;
+ }
+
+ return 0;
+}
+
+/*
+ * First set bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ */
+__pure
+unsigned bkey_ffs(const struct btree *b,
+ const struct bkey_packed *k)
+{
+ const u64 *p = high_word(&b->format, k);
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned ret = 0, offset;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+ offset = nr_key_bits;
+ while (offset > 64) {
+ p = next_word(p);
+ offset -= 64;
+ }
+
+ offset = 64 - offset;
+
+ while (nr_key_bits) {
+ unsigned bits = nr_key_bits + offset < 64
+ ? nr_key_bits
+ : 64 - offset;
+
+ u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+ if (*p & mask)
+ return ret + __ffs64(*p & mask) - offset;
+
+ p = prev_word(p);
+ nr_key_bits -= bits;
+ ret += bits;
+ offset = 0;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ long d0, d1, d2, d3;
+ int cmp;
+
+	/* we shouldn't need asm for this, but gcc generates poor code for it: */
+
+ asm(".intel_syntax noprefix;"
+ "xor eax, eax;"
+ "xor edx, edx;"
+ "1:;"
+ "mov r8, [rdi];"
+ "mov r9, [rsi];"
+ "sub ecx, 64;"
+ "jl 2f;"
+
+ "cmp r8, r9;"
+ "jnz 3f;"
+
+ "lea rdi, [rdi - 8];"
+ "lea rsi, [rsi - 8];"
+ "jmp 1b;"
+
+ "2:;"
+ "not ecx;"
+ "shr r8, 1;"
+ "shr r9, 1;"
+ "shr r8, cl;"
+ "shr r9, cl;"
+ "cmp r8, r9;"
+
+ "3:\n"
+ "seta al;"
+ "setb dl;"
+ "sub eax, edx;"
+ ".att_syntax prefix;"
+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
+ : "0" (l), "1" (r), "3" (nr_key_bits)
+ : "r8", "r9", "cc", "memory");
+
+ return cmp;
+}
+
+#define I(_x) (*(out)++ = (_x))
+#define I1(i0) I(i0)
+#define I2(i0, i1) (I1(i0), I(i1))
+#define I3(i0, i1, i2) (I2(i0, i1), I(i2))
+#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3))
+#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4))
+
+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
+ enum bch_bkey_fields field,
+ unsigned dst_offset, unsigned dst_size,
+ bool *eax_zeroed)
+{
+ unsigned byte = format->key_u64s * sizeof(u64);
+ unsigned bits = format->bits_per_field[field];
+ u64 offset = format->field_offset[field];
+ unsigned i, bit_offset = 0;
+ unsigned shl, shr;
+
+ if (!bits && !offset) {
+ if (!*eax_zeroed) {
+ /* xor eax, eax */
+ I2(0x31, 0xc0);
+ }
+
+ *eax_zeroed = true;
+ goto set_field;
+ }
+
+ if (!bits) {
+ /* just return offset: */
+
+ switch (dst_size) {
+ case 8:
+ if (offset > S32_MAX) {
+ /* mov [rdi + dst_offset], offset */
+ I3(0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+
+ I3(0xc7, 0x47, dst_offset + 4);
+ memcpy(out, (void *) &offset + 4, 4);
+ out += 4;
+ } else {
+ /* mov [rdi + dst_offset], offset */
+ /* sign extended */
+ I4(0x48, 0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+ }
+ break;
+ case 4:
+ /* mov [rdi + dst_offset], offset */
+ I3(0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+ break;
+ default:
+ BUG();
+ }
+
+ return out;
+ }
+
+ for (i = 0; i <= field; i++)
+ bit_offset += format->bits_per_field[i];
+
+ byte -= DIV_ROUND_UP(bit_offset, 8);
+ bit_offset = round_up(bit_offset, 8) - bit_offset;
+
+ *eax_zeroed = false;
+
+ if (bit_offset == 0 && bits == 8) {
+ /* movzx eax, BYTE PTR [rsi + imm8] */
+ I4(0x0f, 0xb6, 0x46, byte);
+ } else if (bit_offset == 0 && bits == 16) {
+ /* movzx eax, WORD PTR [rsi + imm8] */
+ I4(0x0f, 0xb7, 0x46, byte);
+ } else if (bit_offset + bits <= 32) {
+ /* mov eax, [rsi + imm8] */
+ I3(0x8b, 0x46, byte);
+
+ if (bit_offset) {
+ /* shr eax, imm8 */
+ I3(0xc1, 0xe8, bit_offset);
+ }
+
+ if (bit_offset + bits < 32) {
+ unsigned mask = ~0U >> (32 - bits);
+
+ /* and eax, imm32 */
+ I1(0x25);
+ memcpy(out, &mask, 4);
+ out += 4;
+ }
+ } else if (bit_offset + bits <= 64) {
+ /* mov rax, [rsi + imm8] */
+ I4(0x48, 0x8b, 0x46, byte);
+
+ shl = 64 - bit_offset - bits;
+ shr = bit_offset + shl;
+
+ if (shl) {
+ /* shl rax, imm8 */
+ I4(0x48, 0xc1, 0xe0, shl);
+ }
+
+ if (shr) {
+ /* shr rax, imm8 */
+ I4(0x48, 0xc1, 0xe8, shr);
+ }
+ } else {
+ /* mov rax, [rsi + byte] */
+ I4(0x48, 0x8b, 0x46, byte);
+
+ /* mov edx, [rsi + byte + 8] */
+ I3(0x8b, 0x56, byte + 8);
+
+ /* bits from next word: */
+ shr = bit_offset + bits - 64;
+ BUG_ON(shr > bit_offset);
+
+ /* shr rax, bit_offset */
+ I4(0x48, 0xc1, 0xe8, shr);
+
+ /* shl rdx, imm8 */
+ I4(0x48, 0xc1, 0xe2, 64 - shr);
+
+ /* or rax, rdx */
+ I3(0x48, 0x09, 0xd0);
+
+ shr = bit_offset - shr;
+
+ if (shr) {
+ /* shr rax, imm8 */
+ I4(0x48, 0xc1, 0xe8, shr);
+ }
+ }
+
+ /* rax += offset: */
+ if (offset > S32_MAX) {
+ /* mov rdx, imm64 */
+ I2(0x48, 0xba);
+ memcpy(out, &offset, 8);
+ out += 8;
+ /* add %rdx, %rax */
+ I3(0x48, 0x01, 0xd0);
+ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
+ /* add rax, imm32 */
+ I2(0x48, 0x05);
+ memcpy(out, &offset, 4);
+ out += 4;
+ } else if (offset) {
+ /* add eax, imm32 */
+ I1(0x05);
+ memcpy(out, &offset, 4);
+ out += 4;
+ }
+set_field:
+ switch (dst_size) {
+ case 8:
+ /* mov [rdi + dst_offset], rax */
+ I4(0x48, 0x89, 0x47, dst_offset);
+ break;
+ case 4:
+ /* mov [rdi + dst_offset], eax */
+ I3(0x89, 0x47, dst_offset);
+ break;
+ default:
+ BUG();
+ }
+
+ return out;
+}
+
+int bch_compile_bkey_format(const struct bkey_format *format, void *_out)
+{
+ bool eax_zeroed = false;
+ u8 *out = _out;
+
+ /*
+ * rdi: dst - unpacked key
+ * rsi: src - packed key
+ */
+
+ /* k->u64s, k->format, k->type */
+
+ /* mov eax, [rsi] */
+ I2(0x8b, 0x06);
+
+ /* add eax, BKEY_U64s - format->key_u64s */
+ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
+
+ /* and eax, imm32: mask out k->pad: */
+ I5(0x25, 0xff, 0xff, 0xff, 0);
+
+ /* mov [rdi], eax */
+ I2(0x89, 0x07);
+
+ out = compile_bkey_field(format, out, BKEY_FIELD_INODE,
+ offsetof(struct bkey, p.inode), 8,
+ &eax_zeroed);
+
+ out = compile_bkey_field(format, out, BKEY_FIELD_OFFSET,
+ offsetof(struct bkey, p.offset), 8,
+ &eax_zeroed);
+
+ out = compile_bkey_field(format, out, BKEY_FIELD_SNAPSHOT,
+ offsetof(struct bkey, p.snapshot), 4,
+ &eax_zeroed);
+
+ out = compile_bkey_field(format, out, BKEY_FIELD_SIZE,
+ offsetof(struct bkey, size), 4,
+ &eax_zeroed);
+
+ out = compile_bkey_field(format, out, BKEY_FIELD_VERSION,
+ offsetof(struct bkey, version), 4,
+ &eax_zeroed);
+
+ /* retq */
+ I1(0xc3);
+
+ return (void *) out - _out;
+}
+
+#else
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ u64 l_v, r_v;
+
+ if (!nr_key_bits)
+ return 0;
+
+ /* for big endian, skip past header */
+ nr_key_bits += high_bit_offset;
+ l_v = *l & (~0ULL >> high_bit_offset);
+ r_v = *r & (~0ULL >> high_bit_offset);
+
+ while (1) {
+ if (nr_key_bits < 64) {
+ l_v >>= 64 - nr_key_bits;
+ r_v >>= 64 - nr_key_bits;
+ nr_key_bits = 0;
+ } else {
+ nr_key_bits -= 64;
+ }
+
+ if (l_v != r_v)
+ return l_v < r_v ? -1 : 1;
+
+ if (!nr_key_bits)
+ return 0;
+
+ l = next_word(l);
+ r = next_word(r);
+
+ l_v = *l;
+ r_v = *r;
+ }
+}
+#endif
+
+/*
+ * Would like to use this if we can make __bkey_cmp_bits() fast enough; it'd be
+ * a decent reduction in code size
+ */
+#if 0
+static int bkey_cmp_verify(const struct bkey *l, const struct bkey *r)
+{
+ if (l->p.inode != r->p.inode)
+ return l->p.inode < r->p.inode ? -1 : 1;
+
+ if (l->p.offset != r->p.offset)
+ return l->p.offset < r->p.offset ? -1 : 1;
+
+ if (l->p.snapshot != r->p.snapshot)
+ return l->p.snapshot < r->p.snapshot ? -1 : 1;
+
+ return 0;
+}
+
+int bkey_cmp(const struct bkey *l, const struct bkey *r)
+{
+ int ret;
+
+ EBUG_ON(bkey_packed(l) || bkey_packed(r));
+
+ ret = __bkey_cmp_bits((sizeof(l->inode) +
+ sizeof(l->offset) +
+ sizeof(l->snapshot)) * BITS_PER_BYTE,
+ __high_word(BKEY_U64s, l),
+ __high_word(BKEY_U64s, r));
+
+ BUG_ON(ret != bkey_cmp_verify(l, r));
+
+ return ret;
+}
+#endif
+
+__pure
+int __bkey_cmp_packed_format_checked(const struct bkey_packed *l,
+ const struct bkey_packed *r,
+ const struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ int ret;
+
+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+ ret = __bkey_cmp_bits(high_word(f, l),
+ high_word(f, r),
+ b->nr_key_bits);
+
+ EBUG_ON(ret != bkey_cmp(bkey_unpack_key_format_checked(b, l).p,
+ bkey_unpack_key_format_checked(b, r).p));
+ return ret;
+}
+
+__pure __flatten
+int __bkey_cmp_left_packed_format_checked(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bpos *r)
+{
+ return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r);
+}
+
+__pure __flatten
+int __bkey_cmp_packed(const struct bkey_packed *l,
+ const struct bkey_packed *r,
+ const struct btree *b)
+{
+ int packed = bkey_lr_packed(l, r);
+
+ if (likely(packed == BKEY_PACKED_BOTH))
+ return __bkey_cmp_packed_format_checked(l, r, b);
+
+ switch (packed) {
+ case BKEY_PACKED_NONE:
+ return bkey_cmp(((struct bkey *) l)->p,
+ ((struct bkey *) r)->p);
+ case BKEY_PACKED_LEFT:
+ return __bkey_cmp_left_packed_format_checked(b,
+ (struct bkey_packed *) l,
+ &((struct bkey *) r)->p);
+ case BKEY_PACKED_RIGHT:
+ return -__bkey_cmp_left_packed_format_checked(b,
+ (struct bkey_packed *) r,
+ &((struct bkey *) l)->p);
+ default:
+ unreachable();
+ }
+}
+
+__pure __flatten
+int bkey_cmp_left_packed(const struct btree *b,
+ const struct bkey_packed *l, const struct bpos *r)
+{
+ const struct bkey *l_unpacked;
+
+ return unlikely(l_unpacked = packed_to_bkey_c(l))
+ ? bkey_cmp(l_unpacked->p, *r)
+ : __bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+void bch_bpos_swab(struct bpos *p)
+{
+ u8 *l = (u8 *) p;
+ u8 *h = ((u8 *) &p[1]) - 1;
+
+ while (l < h) {
+ swap(*l, *h);
+ l++;
+ --h;
+ }
+}
+
+void bch_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
+{
+ const struct bkey_format *f = bkey_packed(k) ? _f : &bch_bkey_format_current;
+ u8 *l = k->key_start;
+ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
+
+ while (l < h) {
+ swap(*l, *h);
+ l++;
+ --h;
+ }
+}
+
+#ifdef CONFIG_BCACHE_DEBUG
+void bkey_pack_test(void)
+{
+ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
+ struct bkey_packed p;
+
+ struct bkey_format test_format = {
+ .key_u64s = 2,
+ .nr_fields = 5,
+ .bits_per_field = {
+ 13,
+ 64,
+ },
+ };
+
+ struct unpack_state in_s =
+ unpack_state_init(&bch_bkey_format_current, (void *) &t);
+ struct pack_state out_s = pack_state_init(&test_format, &p);
+ unsigned i;
+
+ for (i = 0; i < out_s.format->nr_fields; i++) {
+ u64 a, v = get_inc_field(&in_s, i);
+
+ switch (i) {
+ case 0:
+ a = t.p.inode;
+ break;
+ case 1:
+ a = t.p.offset;
+ break;
+ case 2:
+ a = t.p.snapshot;
+ break;
+ case 3:
+ a = t.size;
+ break;
+ case 4:
+ a = t.version;
+ break;
+ default:
+ BUG();
+ }
+
+ if (a != v)
+ panic("got %llu actual %llu i %u\n", v, a, i);
+
+ if (!set_inc_field(&out_s, i, v))
+ panic("failed at %u\n", i);
+ }
+
+ BUG_ON(!bkey_pack_key(&p, &t, &test_format));
+}
+#endif
diff --git a/libbcache/bkey.h b/libbcache/bkey.h
new file mode 100644
index 0000000..3e29cdd
--- /dev/null
+++ b/libbcache/bkey.h
@@ -0,0 +1,596 @@
+#ifndef _BCACHE_BKEY_H
+#define _BCACHE_BKEY_H
+
+#include <linux/bug.h>
+#include <linux/bcache.h>
+
+#include "util.h"
+
+void bch_to_binary(char *, const u64 *, unsigned);
+int bch_bkey_to_text(char *, size_t, const struct bkey *);
+
+#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
+
+/* bkey with split value, const */
+struct bkey_s_c {
+ const struct bkey *k;
+ const struct bch_val *v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+ union {
+ struct {
+ struct bkey *k;
+ struct bch_val *v;
+ };
+ struct bkey_s_c s_c;
+ };
+};
+
+#define bkey_next(_k) \
+({ \
+ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
+ !type_is(_k, struct bkey_i *) && \
+ !type_is(_k, struct bkey_packed *)); \
+ \
+ ((typeof(_k)) __bkey_idx(((struct bkey *) (_k)), \
+ ((struct bkey *) (_k))->u64s)); \
+})
+
+static inline unsigned bkey_val_u64s(const struct bkey *k)
+{
+ return k->u64s - BKEY_U64s;
+}
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+ return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+ k->u64s = BKEY_U64s + val_u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
+/*
+ * Mark a key as deleted without changing the size of the value (i.e. modifying
+ * keys in the btree in place)
+ */
+static inline void __set_bkey_deleted(struct bkey *k)
+{
+ k->type = KEY_TYPE_DELETED;
+}
+
+static inline void set_bkey_deleted(struct bkey *k)
+{
+ __set_bkey_deleted(k);
+ set_bkey_val_u64s(k, 0);
+}
+
+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED)
+
+#define bkey_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD)
+
+#define bkey_packed_typecheck(_k) \
+({ \
+ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
+ !type_is(_k, struct bkey_packed *)); \
+ type_is(_k, struct bkey_packed *); \
+})
+
+enum bkey_lr_packed {
+ BKEY_PACKED_BOTH,
+ BKEY_PACKED_RIGHT,
+ BKEY_PACKED_LEFT,
+ BKEY_PACKED_NONE,
+};
+
+#define bkey_lr_packed_typecheck(_l, _r) \
+ (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
+
+#define bkey_lr_packed(_l, _r) \
+ ((_l)->format + ((_r)->format << 1))
+
+#define bkey_copy(_dst, _src) \
+do { \
+ BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \
+ !type_is(_dst, struct bkey_packed *)); \
+ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \
+ !type_is(_src, struct bkey_packed *)); \
+ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \
+ (u64 *) (_dst) < (u64 *) (_src) + \
+ ((struct bkey *) (_src))->u64s); \
+ \
+ __memmove_u64s_down((_dst), (_src), \
+ ((struct bkey *) (_src))->u64s); \
+} while (0)
+
+struct btree;
+
+struct bkey_format_state {
+ u64 field_min[BKEY_NR_FIELDS];
+ u64 field_max[BKEY_NR_FIELDS];
+};
+
+void bch_bkey_format_init(struct bkey_format_state *);
+void bch_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
+void bch_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
+struct bkey_format bch_bkey_format_done(struct bkey_format_state *);
+const char *bch_bkey_format_validate(struct bkey_format *);
+
+__pure
+unsigned bkey_greatest_differing_bit(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
+__pure
+unsigned bkey_ffs(const struct btree *, const struct bkey_packed *);
+
+__pure
+int __bkey_cmp_packed_format_checked(const struct bkey_packed *,
+ const struct bkey_packed *,
+ const struct btree *);
+
+__pure
+int __bkey_cmp_left_packed_format_checked(const struct btree *,
+ const struct bkey_packed *,
+ const struct bpos *);
+
+__pure
+int __bkey_cmp_packed(const struct bkey_packed *,
+ const struct bkey_packed *,
+ const struct btree *);
+
+__pure
+int bkey_cmp_left_packed(const struct btree *,
+ const struct bkey_packed *,
+ const struct bpos *);
+
+/*
+ * we prefer to pass bpos by ref, but it's often enough terribly convenient to
+ * pass it by val... as much as I hate c++, const ref would be nice here:
+ */
+__pure __flatten
+static inline int bkey_cmp_left_packed_byval(const struct btree *b,
+ const struct bkey_packed *l,
+ struct bpos r)
+{
+ return bkey_cmp_left_packed(b, l, &r);
+}
+
+/*
+ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
+ * skip dispatching on k->format:
+ */
+#define bkey_cmp_packed(_b, _l, _r) \
+({ \
+ int _cmp; \
+ \
+ switch (bkey_lr_packed_typecheck(_l, _r)) { \
+ case BKEY_PACKED_NONE: \
+ _cmp = bkey_cmp(((struct bkey *) (_l))->p, \
+ ((struct bkey *) (_r))->p); \
+ break; \
+ case BKEY_PACKED_LEFT: \
+ _cmp = bkey_cmp_left_packed((_b), \
+ (struct bkey_packed *) (_l), \
+ &((struct bkey *) (_r))->p); \
+ break; \
+ case BKEY_PACKED_RIGHT: \
+ _cmp = -bkey_cmp_left_packed((_b), \
+ (struct bkey_packed *) (_r), \
+ &((struct bkey *) (_l))->p); \
+ break; \
+ case BKEY_PACKED_BOTH: \
+ _cmp = __bkey_cmp_packed((void *) (_l), \
+ (void *) (_r), (_b)); \
+ break; \
+ } \
+ _cmp; \
+})
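+
+/*
+ * Usage sketch (illustrative): comparing a packed key from a node against an
+ * unpacked search key, e.g.
+ *
+ *	struct bkey_packed *l;
+ *	struct bkey *r;
+ *	int cmp = bkey_cmp_packed(b, l, r);
+ *
+ * resolves to the BKEY_PACKED_LEFT case at compile time, so no runtime check
+ * of ->format is needed.
+ */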
+
+#if 1
+static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
+{
+ if (l.inode != r.inode)
+ return l.inode < r.inode ? -1 : 1;
+ if (l.offset != r.offset)
+ return l.offset < r.offset ? -1 : 1;
+ if (l.snapshot != r.snapshot)
+ return l.snapshot < r.snapshot ? -1 : 1;
+ return 0;
+}
+#else
+int bkey_cmp(struct bpos l, struct bpos r);
+#endif
+
+static inline struct bpos bpos_min(struct bpos l, struct bpos r)
+{
+ return bkey_cmp(l, r) < 0 ? l : r;
+}
+
+void bch_bpos_swab(struct bpos *);
+void bch_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
+
+#ifdef CONFIG_BCACHE_DEBUG
+/* statement expressions confusing unlikely()? */
+#define bkey_packed(_k) \
+ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
+ (_k)->format != KEY_FORMAT_CURRENT; })
+#else
+#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
+#endif
+
+/*
+ * It's safe to treat an unpacked bkey as a packed one, but not the reverse
+ */
+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
+{
+ return (struct bkey_packed *) k;
+}
+
+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
+{
+ return (const struct bkey_packed *) k;
+}
+
+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
+{
+ return bkey_packed(k) ? NULL : (struct bkey_i *) k;
+}
+
+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
+{
+ return bkey_packed(k) ? NULL : (const struct bkey *) k;
+}
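+
+/*
+ * Usage sketch (illustrative): packed_to_bkey() returns NULL for keys that are
+ * actually packed, so callers needing the unpacked representation fall back to
+ * unpacking, e.g.
+ *
+ *	struct bkey_i *u = packed_to_bkey(k);
+ *	if (!u)
+ *		... unpack into a temporary with bkey_unpack() ...
+ */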
+
+static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
+{
+ return format->bits_per_field[BKEY_FIELD_INODE] +
+ format->bits_per_field[BKEY_FIELD_OFFSET] +
+ format->bits_per_field[BKEY_FIELD_SNAPSHOT];
+}
+
+static inline struct bpos bkey_successor(struct bpos p)
+{
+ struct bpos ret = p;
+
+ if (!++ret.offset)
+ BUG_ON(!++ret.inode);
+
+ return ret;
+}
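+
+/*
+ * Example (illustrative): bkey_successor((inode 1, offset U64_MAX)) is
+ * (inode 2, offset 0); overflowing the inode field as well would trip the
+ * BUG_ON above.
+ */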
+
+static inline u64 bkey_start_offset(const struct bkey *k)
+{
+ return k->p.offset - k->size;
+}
+
+static inline struct bpos bkey_start_pos(const struct bkey *k)
+{
+ return (struct bpos) {
+ .inode = k->p.inode,
+ .offset = bkey_start_offset(k),
+ .snapshot = k->p.snapshot,
+ };
+}
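+
+/*
+ * Illustrative example: extent keys are indexed by their end position, so a
+ * key with p.offset == 100 and size == 20 covers [80, 100) and
+ * bkey_start_offset() returns 80.
+ */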
+
+/* Packed helpers */
+
+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+
+ EBUG_ON(k->u64s < ret);
+ return ret;
+}
+
+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return bkeyp_key_u64s(format, k) * sizeof(u64);
+}
+
+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return k->u64s - bkeyp_key_u64s(format, k);
+}
+
+static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return bkeyp_val_u64s(format, k) * sizeof(u64);
+}
+
+static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
+ struct bkey_packed *k, unsigned val_u64s)
+{
+ k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
+}
+
+#define bkeyp_val(_format, _k) \
+ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
+
+extern const struct bkey_format bch_bkey_format_current;
+
+bool bch_bkey_transform(const struct bkey_format *,
+ struct bkey_packed *,
+ const struct bkey_format *,
+ const struct bkey_packed *);
+
+struct bkey __bkey_unpack_key(const struct bkey_format *,
+ const struct bkey_packed *);
+
+#ifndef HAVE_BCACHE_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *,
+ const struct bkey_packed *);
+#endif
+
+bool bkey_pack_key(struct bkey_packed *, const struct bkey *,
+ const struct bkey_format *);
+
+enum bkey_pack_pos_ret {
+ BKEY_PACK_POS_EXACT,
+ BKEY_PACK_POS_SMALLER,
+ BKEY_PACK_POS_FAIL,
+};
+
+enum bkey_pack_pos_ret bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
+ const struct btree *);
+
+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
+ const struct btree *b)
+{
+ return bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
+}
+
+void bkey_unpack(const struct btree *, struct bkey_i *,
+ const struct bkey_packed *);
+bool bkey_pack(struct bkey_packed *, const struct bkey_i *,
+ const struct bkey_format *);
+
+static inline u64 bkey_field_max(const struct bkey_format *f,
+ enum bch_bkey_fields nr)
+{
+ return f->bits_per_field[nr] < 64
+ ? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
+ : U64_MAX;
+}
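+
+/*
+ * Example (illustrative): with field_offset[nr] == 0 and
+ * bits_per_field[nr] == 20, bkey_field_max() returns 2^20 - 1; a full 64-bit
+ * field returns U64_MAX.
+ */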
+
+#ifdef CONFIG_X86_64
+#define HAVE_BCACHE_COMPILED_UNPACK 1
+
+int bch_compile_bkey_format(const struct bkey_format *, void *);
+
+#else
+
+static inline int bch_compile_bkey_format(const struct bkey_format *format,
+ void *out) { return 0; }
+
+#endif
+
+static inline void bkey_reassemble(struct bkey_i *dst,
+ struct bkey_s_c src)
+{
+ BUG_ON(bkey_packed(src.k));
+ dst->k = *src.k;
+ memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
+}
+
+#define bkey_s_null ((struct bkey_s) { .k = NULL })
+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+ return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+ return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+ return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from a bkey_i_extent
+ * to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \
+struct bkey_s_c_##name { \
+ union { \
+ struct { \
+ const struct bkey *k; \
+ const struct bch_##name *v; \
+ }; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+struct bkey_s_##name { \
+ union { \
+ struct { \
+ struct bkey *k; \
+ struct bch_##name *v; \
+ }; \
+ struct bkey_s_c_##name c; \
+ struct bkey_s s; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
+{ \
+ _assert(k->k.type, nr); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline const struct bkey_i_##name * \
+bkey_i_to_##name##_c(const struct bkey_i *k) \
+{ \
+ _assert(k->k.type, nr); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
+{ \
+ _assert(k.k->type, nr); \
+ return (struct bkey_s_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{ \
+ _assert(k.k->type, nr); \
+ return (struct bkey_s_c_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{ \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+name##_i_to_s_c(const struct bkey_i_##name *k) \
+{ \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
+{ \
+ _assert(k->k.type, nr); \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+bkey_i_to_s_c_##name(const struct bkey_i *k) \
+{ \
+ _assert(k->k.type, nr); \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bch_##name * \
+bkey_p_##name##_val(const struct bkey_format *f, \
+ struct bkey_packed *k) \
+{ \
+ return container_of(bkeyp_val(f, k), struct bch_##name, v); \
+} \
+ \
+static inline const struct bch_##name * \
+bkey_p_c_##name##_val(const struct bkey_format *f, \
+ const struct bkey_packed *k) \
+{ \
+ return container_of(bkeyp_val(f, k), struct bch_##name, v); \
+} \
+ \
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{ \
+ struct bkey_i_##name *k = \
+ container_of(&_k->k, struct bkey_i_##name, k); \
+ \
+ bkey_init(&k->k); \
+ memset(&k->v, 0, sizeof(k->v)); \
+ k->k.type = nr; \
+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \
+ \
+ return k; \
+}
+
+#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr)
+
+#define BKEY_VAL_ACCESSORS(name, _nr) \
+ static inline void __bch_##name##_assert(u8 type, u8 nr) \
+ { \
+ EBUG_ON(type != _nr); \
+ } \
+ \
+ __BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert)
+
+BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE);
+
+static inline void __bch_extent_assert(u8 type, u8 nr)
+{
+ EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED);
+}
+
+__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch_extent_assert);
+
+BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS);
+BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV);
+
+BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT);
+
+BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
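+
+/*
+ * Illustrative usage of the generated accessors (a sketch, not additional
+ * interface): for the extent type the expansion above provides
+ * bkey_i_to_extent(), bkey_s_c_to_extent(), bkey_extent_init(), etc., so a
+ * typical read-side conversion looks like
+ *
+ *	struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ *	const struct bch_extent *v = e.v;
+ */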
+
+/* byte order helpers */
+
+#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
+#error edit for your odd byteorder.
+#endif
+
+#ifdef __LITTLE_ENDIAN
+
+#define high_bit_offset 0
+#define __high_word(u64s, k) ((k)->_data + (u64s) - 1)
+#define nth_word(p, n) ((p) - (n))
+
+#else
+
+#define high_bit_offset KEY_PACKED_BITS_START
+#define __high_word(u64s, k) ((k)->_data)
+#define nth_word(p, n) ((p) + (n))
+
+#endif
+
+#define high_word(format, k) __high_word((format)->key_u64s, k)
+#define next_word(p) nth_word(p, 1)
+#define prev_word(p) nth_word(p, -1)
+
+#ifdef CONFIG_BCACHE_DEBUG
+void bkey_pack_test(void);
+#else
+static inline void bkey_pack_test(void) {}
+#endif
+
+#endif /* _BCACHE_BKEY_H */
diff --git a/libbcache/bkey_methods.c b/libbcache/bkey_methods.c
new file mode 100644
index 0000000..3bcd0e0
--- /dev/null
+++ b/libbcache/bkey_methods.c
@@ -0,0 +1,117 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "dirent.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "xattr.h"
+
+const struct bkey_ops *bch_bkey_ops[] = {
+ [BKEY_TYPE_EXTENTS] = &bch_bkey_extent_ops,
+ [BKEY_TYPE_INODES] = &bch_bkey_inode_ops,
+ [BKEY_TYPE_DIRENTS] = &bch_bkey_dirent_ops,
+ [BKEY_TYPE_XATTRS] = &bch_bkey_xattr_ops,
+ [BKEY_TYPE_BTREE] = &bch_bkey_btree_ops,
+};
+
+/* Returns string indicating reason for being invalid, or NULL if valid: */
+const char *bkey_invalid(struct cache_set *c, enum bkey_type type,
+ struct bkey_s_c k)
+{
+ const struct bkey_ops *ops = bch_bkey_ops[type];
+
+ if (k.k->u64s < BKEY_U64s)
+ return "u64s too small";
+
+ if (k.k->size &&
+ (bkey_deleted(k.k) || !ops->is_extents))
+ return "nonzero size field";
+
+ switch (k.k->type) {
+ case KEY_TYPE_DELETED:
+ case KEY_TYPE_DISCARD:
+ return NULL;
+
+ case KEY_TYPE_ERROR:
+ return bkey_val_bytes(k.k) != 0
+ ? "value size should be zero"
+ : NULL;
+
+ case KEY_TYPE_COOKIE:
+ return bkey_val_bytes(k.k) != sizeof(struct bch_cookie)
+ ? "incorrect value size"
+ : NULL;
+
+ default:
+ if (k.k->type < KEY_TYPE_GENERIC_NR)
+ return "invalid type";
+
+ return ops->key_invalid(c, k);
+ }
+}
+
+const char *btree_bkey_invalid(struct cache_set *c, struct btree *b,
+ struct bkey_s_c k)
+{
+ if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
+ return "key before start of btree node";
+
+ if (bkey_cmp(k.k->p, b->data->max_key) > 0)
+ return "key past end of btree node";
+
+ if (k.k->p.snapshot)
+ return "nonzero snapshot";
+
+ return bkey_invalid(c, btree_node_type(b), k);
+}
+
+void bkey_debugcheck(struct cache_set *c, struct btree *b, struct bkey_s_c k)
+{
+ enum bkey_type type = btree_node_type(b);
+ const struct bkey_ops *ops = bch_bkey_ops[type];
+ const char *invalid;
+
+ BUG_ON(!k.k->u64s);
+
+ invalid = btree_bkey_invalid(c, b, k);
+ if (invalid) {
+ char buf[160];
+
+ bch_bkey_val_to_text(c, type, buf, sizeof(buf), k);
+ cache_set_bug(c, "invalid bkey %s: %s", buf, invalid);
+ return;
+ }
+
+ if (k.k->type >= KEY_TYPE_GENERIC_NR &&
+ ops->key_debugcheck)
+ ops->key_debugcheck(c, b, k);
+}
+
+void bch_bkey_val_to_text(struct cache_set *c, enum bkey_type type,
+ char *buf, size_t size, struct bkey_s_c k)
+{
+ const struct bkey_ops *ops = bch_bkey_ops[type];
+ char *out = buf, *end = buf + size;
+
+ out += bch_bkey_to_text(out, end - out, k.k);
+
+ if (k.k->type >= KEY_TYPE_GENERIC_NR &&
+ ops->val_to_text) {
+ out += scnprintf(out, end - out, " -> ");
+ ops->val_to_text(c, out, end - out, k);
+ }
+}
+
+void bch_bkey_swab(enum bkey_type type,
+ const struct bkey_format *f,
+ struct bkey_packed *k)
+{
+ const struct bkey_ops *ops = bch_bkey_ops[type];
+
+ bch_bkey_swab_key(f, k);
+
+ if (ops->swab)
+ ops->swab(f, k);
+}
diff --git a/libbcache/bkey_methods.h b/libbcache/bkey_methods.h
new file mode 100644
index 0000000..0e305eb
--- /dev/null
+++ b/libbcache/bkey_methods.h
@@ -0,0 +1,80 @@
+#ifndef _BCACHE_BKEY_METHODS_H
+#define _BCACHE_BKEY_METHODS_H
+
+#include "bkey.h"
+
+#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val,
+
+enum bkey_type {
+ DEFINE_BCH_BTREE_IDS()
+ BKEY_TYPE_BTREE,
+};
+
+/* Type of a key in btree @id at level @level: */
+static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
+{
+ return level ? BKEY_TYPE_BTREE : id;
+}
+
+static inline bool btree_type_has_ptrs(enum bkey_type type)
+{
+ switch (type) {
+ case BKEY_TYPE_BTREE:
+ case BKEY_TYPE_EXTENTS:
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct cache_set;
+struct btree;
+struct bkey;
+
+enum merge_result {
+ BCH_MERGE_NOMERGE,
+
+ /*
+ * The keys were mergeable, but would have overflowed size - so instead
+ * l was changed to the maximum size, and both keys were modified:
+ */
+ BCH_MERGE_PARTIAL,
+ BCH_MERGE_MERGE,
+};
+
+typedef bool (*key_filter_fn)(struct cache_set *, struct btree *,
+ struct bkey_s);
+typedef enum merge_result (*key_merge_fn)(struct cache_set *,
+ struct btree *,
+ struct bkey_i *, struct bkey_i *);
+
+struct bkey_ops {
+ /* Returns reason for being invalid if invalid, else NULL: */
+ const char * (*key_invalid)(const struct cache_set *,
+ struct bkey_s_c);
+ void (*key_debugcheck)(struct cache_set *, struct btree *,
+ struct bkey_s_c);
+ void (*val_to_text)(struct cache_set *, char *,
+ size_t, struct bkey_s_c);
+ void (*swab)(const struct bkey_format *, struct bkey_packed *);
+ key_filter_fn key_normalize;
+ key_merge_fn key_merge;
+ bool is_extents;
+};
+
+const char *bkey_invalid(struct cache_set *, enum bkey_type, struct bkey_s_c);
+const char *btree_bkey_invalid(struct cache_set *, struct btree *,
+ struct bkey_s_c);
+
+void bkey_debugcheck(struct cache_set *, struct btree *, struct bkey_s_c);
+void bch_bkey_val_to_text(struct cache_set *, enum bkey_type,
+ char *, size_t, struct bkey_s_c);
+
+void bch_bkey_swab(enum bkey_type, const struct bkey_format *,
+ struct bkey_packed *);
+
+extern const struct bkey_ops *bch_bkey_ops[];
+
+#undef DEF_BTREE_ID
+
+#endif /* _BCACHE_BKEY_METHODS_H */
diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c
new file mode 100644
index 0000000..cd231f5
--- /dev/null
+++ b/libbcache/blockdev.c
@@ -0,0 +1,824 @@
+
+#include "bcache.h"
+#include "blockdev.h"
+#include "btree_iter.h"
+#include "checksum.h"
+#include "error.h"
+#include "inode.h"
+#include "request.h"
+#include "super.h"
+#include "writeback.h"
+
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+static int bch_blockdev_major;
+static DEFINE_IDA(bch_blockdev_minor);
+static LIST_HEAD(uncached_devices);
+struct kmem_cache *bch_search_cache;
+
+static void write_bdev_super_endio(struct bio *bio)
+{
+ struct cached_dev *dc = bio->bi_private;
+ /* XXX: error checking */
+
+ closure_put(&dc->sb_write);
+}
+
+static void bch_write_bdev_super_unlock(struct closure *cl)
+{
+ struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
+
+ up(&dc->sb_write_mutex);
+}
+
+void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
+{
+ struct backingdev_sb *sb = dc->disk_sb.sb;
+ struct closure *cl = &dc->sb_write;
+ struct bio *bio = dc->disk_sb.bio;
+
+ down(&dc->sb_write_mutex);
+ closure_init(cl, parent);
+
+ bio_reset(bio);
+ bio->bi_end_io = write_bdev_super_endio;
+ bio->bi_private = dc;
+
+ closure_get(cl);
+
+ sb->csum = cpu_to_le64(__csum_set(sb, 0, BCH_CSUM_CRC64));
+ __write_super(dc->disk.c, (void *) &dc->disk_sb);
+
+ closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
+}
+
+bool bch_is_open_backing_dev(struct block_device *bdev)
+{
+ struct cache_set *c, *tc;
+ struct cached_dev *dc, *t;
+
+ list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+ list_for_each_entry_safe(dc, t, &c->cached_devs, list)
+ if (dc->disk_sb.bdev == bdev)
+ return true;
+ list_for_each_entry_safe(dc, t, &uncached_devices, list)
+ if (dc->disk_sb.bdev == bdev)
+ return true;
+ return false;
+}
+
+static int open_dev(struct block_device *b, fmode_t mode)
+{
+ struct bcache_device *d = b->bd_disk->private_data;
+
+ if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
+ return -ENXIO;
+
+ closure_get(&d->cl);
+ return 0;
+}
+
+static void release_dev(struct gendisk *b, fmode_t mode)
+{
+ struct bcache_device *d = b->private_data;
+
+ closure_put(&d->cl);
+}
+
+static int ioctl_dev(struct block_device *b, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct bcache_device *d = b->bd_disk->private_data;
+
+ return d->ioctl(d, mode, cmd, arg);
+}
+
+static const struct block_device_operations bcache_ops = {
+ .open = open_dev,
+ .release = release_dev,
+ .ioctl = ioctl_dev,
+ .owner = THIS_MODULE,
+};
+
+void bch_blockdev_stop(struct bcache_device *d)
+{
+ if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
+ closure_queue(&d->cl);
+}
+
+static void bcache_device_unlink(struct bcache_device *d)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
+ sysfs_remove_link(&d->c->kobj, d->name);
+ sysfs_remove_link(&d->kobj, "cache");
+ }
+}
+
+static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
+ const char *name)
+{
+ snprintf(d->name, BCACHEDEVNAME_SIZE,
+ "%s%llu", name, bcache_dev_inum(d));
+
+ WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
+ sysfs_create_link(&c->kobj, &d->kobj, d->name),
+ "Couldn't create device <-> cache set symlinks");
+
+ clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
+}
+
+static void bcache_device_detach(struct bcache_device *d)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
+ mutex_lock(&d->inode_lock);
+ bch_inode_rm(d->c, bcache_dev_inum(d));
+ mutex_unlock(&d->inode_lock);
+ }
+
+ bcache_device_unlink(d);
+
+ radix_tree_delete(&d->c->devices, bcache_dev_inum(d));
+
+ closure_put(&d->c->caching);
+ d->c = NULL;
+}
+
+static int bcache_device_attach(struct bcache_device *d, struct cache_set *c)
+{
+ int ret;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ ret = radix_tree_insert(&c->devices, bcache_dev_inum(d), d);
+ if (ret) {
+ pr_err("radix_tree_insert() error for inum %llu",
+ bcache_dev_inum(d));
+ return ret;
+ }
+
+ d->c = c;
+ closure_get(&c->caching);
+
+ return ret;
+}
+
+static void bcache_device_free(struct bcache_device *d)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ pr_info("%s stopped", d->disk->disk_name);
+
+ if (d->c)
+ bcache_device_detach(d);
+ if (d->disk && d->disk->flags & GENHD_FL_UP)
+ del_gendisk(d->disk);
+ if (d->disk && d->disk->queue)
+ blk_cleanup_queue(d->disk->queue);
+ if (d->disk) {
+ ida_simple_remove(&bch_blockdev_minor, d->disk->first_minor);
+ put_disk(d->disk);
+ }
+
+ bioset_exit(&d->bio_split);
+
+ closure_debug_destroy(&d->cl);
+}
+
+static int bcache_device_init(struct bcache_device *d, unsigned block_size,
+ sector_t sectors)
+{
+ struct request_queue *q;
+ int minor;
+
+ mutex_init(&d->inode_lock);
+
+ minor = ida_simple_get(&bch_blockdev_minor, 0, MINORMASK + 1, GFP_KERNEL);
+ if (minor < 0) {
+ pr_err("cannot allocate minor");
+ return minor;
+ }
+
+ if (!(d->disk = alloc_disk(1)) ||
+ bioset_init(&d->bio_split, 4, offsetof(struct bch_read_bio, bio))) {
+ pr_err("cannot allocate disk");
+ ida_simple_remove(&bch_blockdev_minor, minor);
+ return -ENOMEM;
+ }
+
+ set_capacity(d->disk, sectors);
+ snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
+
+ d->disk->major = bch_blockdev_major;
+ d->disk->first_minor = minor;
+ d->disk->fops = &bcache_ops;
+ d->disk->private_data = d;
+
+ q = blk_alloc_queue(GFP_KERNEL);
+ if (!q) {
+ pr_err("cannot allocate queue");
+ return -ENOMEM;
+ }
+
+ blk_queue_make_request(q, NULL);
+ d->disk->queue = q;
+ q->queuedata = d;
+ q->backing_dev_info.congested_data = d;
+ q->limits.max_hw_sectors = UINT_MAX;
+ q->limits.max_sectors = UINT_MAX;
+ q->limits.max_segment_size = UINT_MAX;
+ q->limits.max_segments = BIO_MAX_PAGES;
+ blk_queue_max_discard_sectors(q, UINT_MAX);
+ q->limits.discard_granularity = 512;
+ q->limits.io_min = block_size;
+ q->limits.logical_block_size = block_size;
+ q->limits.physical_block_size = block_size;
+ set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
+ clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
+ set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
+
+ blk_queue_write_cache(q, true, true);
+
+ return 0;
+}
+
+/* Cached device */
+
+static void calc_cached_dev_sectors(struct cache_set *c)
+{
+ u64 sectors = 0;
+ struct cached_dev *dc;
+
+ list_for_each_entry(dc, &c->cached_devs, list)
+ sectors += bdev_sectors(dc->disk_sb.bdev);
+
+ c->cached_dev_sectors = sectors;
+}
+
+void bch_cached_dev_run(struct cached_dev *dc)
+{
+ struct bcache_device *d = &dc->disk;
+ char buf[SB_LABEL_SIZE + 1];
+ char *env[] = {
+ "DRIVER=bcache",
+ kasprintf(GFP_KERNEL, "CACHED_UUID=%pU",
+ dc->disk_sb.sb->disk_uuid.b),
+ NULL,
+ NULL,
+ };
+
+ memcpy(buf, dc->disk_sb.sb->label, SB_LABEL_SIZE);
+ buf[SB_LABEL_SIZE] = '\0';
+ env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
+
+ if (atomic_xchg(&dc->running, 1)) {
+ kfree(env[1]);
+ kfree(env[2]);
+ return;
+ }
+
+ if (!d->c &&
+ BDEV_STATE(dc->disk_sb.sb) != BDEV_STATE_NONE) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_STALE);
+ bch_write_bdev_super(dc, &cl);
+ closure_sync(&cl);
+ }
+
+ add_disk(d->disk);
+ bd_link_disk_holder(dc->disk_sb.bdev, dc->disk.disk);
+	/*
+	 * Won't show up in the uevent file - use udevadm monitor -e instead;
+	 * only class/kset properties are persistent.
+	 */
+ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
+ kfree(env[1]);
+ kfree(env[2]);
+
+ if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
+ sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
+ pr_debug("error creating sysfs link");
+}
+
+static void cached_dev_detach_finish(struct work_struct *w)
+{
+ struct cached_dev *dc = container_of(w, struct cached_dev, detach);
+ char buf[BDEVNAME_SIZE];
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
+ BUG_ON(atomic_read(&dc->count));
+
+ mutex_lock(&bch_register_lock);
+
+ memset(&dc->disk_sb.sb->set_uuid, 0, 16);
+ SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_NONE);
+
+ bch_write_bdev_super(dc, &cl);
+ closure_sync(&cl);
+
+ bcache_device_detach(&dc->disk);
+ list_move(&dc->list, &uncached_devices);
+
+ clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
+ clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
+
+ mutex_unlock(&bch_register_lock);
+
+ pr_info("Caching disabled for %s", bdevname(dc->disk_sb.bdev, buf));
+
+ /* Drop ref we took in cached_dev_detach() */
+ closure_put(&dc->disk.cl);
+}
+
+void bch_cached_dev_detach(struct cached_dev *dc)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
+ return;
+
+ if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+ return;
+
+ /*
+ * Block the device from being closed and freed until we're finished
+ * detaching
+ */
+ closure_get(&dc->disk.cl);
+
+ dc->writeback_pd.rate.rate = UINT_MAX;
+ bch_writeback_queue(dc);
+ cached_dev_put(dc);
+}
+
+int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+{
+ __le64 rtime = cpu_to_le64(ktime_get_seconds());
+ char buf[BDEVNAME_SIZE];
+ bool found;
+ int ret;
+
+ bdevname(dc->disk_sb.bdev, buf);
+
+ if (memcmp(&dc->disk_sb.sb->set_uuid,
+ &c->disk_sb.set_uuid,
+ sizeof(c->disk_sb.set_uuid)))
+ return -ENOENT;
+
+ if (dc->disk.c) {
+ pr_err("Can't attach %s: already attached", buf);
+ return -EINVAL;
+ }
+
+ if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+ return 0;
+
+ if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
+ pr_err("Can't attach %s: shutting down", buf);
+ return -EINVAL;
+ }
+
+ if (le16_to_cpu(dc->disk_sb.sb->block_size) < c->sb.block_size) {
+ /* Will die */
+ pr_err("Couldn't attach %s: block size less than set's block size",
+ buf);
+ return -EINVAL;
+ }
+
+ found = !bch_cached_dev_inode_find_by_uuid(c,
+ &dc->disk_sb.sb->disk_uuid,
+ &dc->disk.inode);
+
+ if (!found && BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) {
+ pr_err("Couldn't find uuid for %s in set", buf);
+ return -ENOENT;
+ }
+
+ if (found &&
+ (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE ||
+ BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE)) {
+ found = false;
+ bch_inode_rm(c, bcache_dev_inum(&dc->disk));
+ }
+
+ /* Deadlocks since we're called via sysfs...
+ sysfs_remove_file(&dc->kobj, &sysfs_attach);
+ */
+
+ if (!found) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ bkey_inode_blockdev_init(&dc->disk.inode.k_i);
+ dc->disk.inode.k.type = BCH_INODE_BLOCKDEV;
+ SET_CACHED_DEV(&dc->disk.inode.v, true);
+ dc->disk.inode.v.i_uuid = dc->disk_sb.sb->disk_uuid;
+ memcpy(dc->disk.inode.v.i_label,
+ dc->disk_sb.sb->label, SB_LABEL_SIZE);
+ dc->disk.inode.v.i_ctime = rtime;
+ dc->disk.inode.v.i_mtime = rtime;
+
+ ret = bch_inode_create(c, &dc->disk.inode.k_i,
+ 0, BLOCKDEV_INODE_MAX,
+ &c->unused_inode_hint);
+ if (ret) {
+ pr_err("Error %d, not caching %s", ret, buf);
+ return ret;
+ }
+
+ pr_info("attached inode %llu", bcache_dev_inum(&dc->disk));
+
+ dc->disk_sb.sb->set_uuid = c->disk_sb.set_uuid;
+ SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);
+
+ bch_write_bdev_super(dc, &cl);
+ closure_sync(&cl);
+ } else {
+ dc->disk.inode.v.i_mtime = rtime;
+ bch_inode_update(c, &dc->disk.inode.k_i, NULL);
+ }
+
+ /* Count dirty sectors before attaching */
+ if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY)
+ bch_sectors_dirty_init(dc, c);
+
+ ret = bcache_device_attach(&dc->disk, c);
+ if (ret)
+ return ret;
+
+ list_move(&dc->list, &c->cached_devs);
+ calc_cached_dev_sectors(c);
+
+ /*
+ * dc->c must be set before dc->count != 0 - paired with the mb in
+ * cached_dev_get()
+ */
+ smp_wmb();
+ atomic_set(&dc->count, 1);
+
+ if (bch_cached_dev_writeback_start(dc))
+ return -ENOMEM;
+
+ if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) {
+ atomic_set(&dc->has_dirty, 1);
+ atomic_inc(&dc->count);
+ }
+
+ bch_cached_dev_run(dc);
+ bcache_device_link(&dc->disk, c, "bdev");
+
+ pr_info("Caching %s as %s on set %pU",
+ bdevname(dc->disk_sb.bdev, buf), dc->disk.disk->disk_name,
+ dc->disk.c->disk_sb.set_uuid.b);
+ return 0;
+}
+
+void bch_attach_backing_devs(struct cache_set *c)
+{
+ struct cached_dev *dc, *t;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ list_for_each_entry_safe(dc, t, &uncached_devices, list)
+ bch_cached_dev_attach(dc, c);
+}
+
+void bch_cached_dev_release(struct kobject *kobj)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+ kfree(dc);
+ module_put(THIS_MODULE);
+}
+
+static void cached_dev_free(struct closure *cl)
+{
+ struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+
+ bch_cached_dev_writeback_stop(dc);
+ bch_cached_dev_writeback_free(dc);
+
+ mutex_lock(&bch_register_lock);
+
+ if (atomic_read(&dc->running))
+ bd_unlink_disk_holder(dc->disk_sb.bdev, dc->disk.disk);
+ bcache_device_free(&dc->disk);
+ list_del(&dc->list);
+
+ mutex_unlock(&bch_register_lock);
+
+ free_super((void *) &dc->disk_sb);
+
+ kobject_put(&dc->disk.kobj);
+}
+
+static void cached_dev_flush(struct closure *cl)
+{
+ struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+ struct bcache_device *d = &dc->disk;
+
+ mutex_lock(&bch_register_lock);
+ bcache_device_unlink(d);
+ mutex_unlock(&bch_register_lock);
+
+ bch_cache_accounting_destroy(&dc->accounting);
+ kobject_del(&d->kobj);
+
+ continue_at(cl, cached_dev_free, system_wq);
+}
+
+static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
+{
+ int ret;
+ struct io *io;
+ struct request_queue *q = bdev_get_queue(dc->disk_sb.bdev);
+
+ dc->sequential_cutoff = 4 << 20;
+
+ for (io = dc->io; io < dc->io + RECENT_IO; io++) {
+ list_add(&io->lru, &dc->io_lru);
+ hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
+ }
+
+ dc->disk.stripe_size = q->limits.io_opt >> 9;
+
+ if (dc->disk.stripe_size)
+ dc->partial_stripes_expensive =
+ q->limits.raid_partial_stripes_expensive;
+
+ ret = bcache_device_init(&dc->disk, block_size,
+ dc->disk_sb.bdev->bd_part->nr_sects -
+ le64_to_cpu(dc->disk_sb.sb->data_offset));
+ if (ret)
+ return ret;
+
+ dc->disk.disk->queue->backing_dev_info.ra_pages =
+ max(dc->disk.disk->queue->backing_dev_info.ra_pages,
+ q->backing_dev_info.ra_pages);
+
+ bch_cached_dev_request_init(dc);
+ ret = bch_cached_dev_writeback_init(dc);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+/* Cached device - bcache superblock */
+
+static const char *bdev_validate_super(struct backingdev_sb *sb)
+{
+ switch (le64_to_cpu(sb->version)) {
+ case BCACHE_SB_VERSION_BDEV:
+ sb->data_offset = cpu_to_le64(BDEV_DATA_START_DEFAULT);
+ break;
+ case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
+ if (le64_to_cpu(sb->data_offset) < BDEV_DATA_START_DEFAULT)
+ return "Bad data offset";
+
+ break;
+ default:
+		return "Unsupported superblock version";
+ }
+
+ sb->last_mount = cpu_to_le32(get_seconds());
+
+ return NULL;
+}
+
+const char *bch_backing_dev_register(struct bcache_superblock *sb)
+{
+ char name[BDEVNAME_SIZE];
+ const char *err;
+ struct cache_set *c;
+ struct cached_dev *dc;
+
+ dc = kzalloc(sizeof(*dc), GFP_KERNEL);
+ if (!dc)
+ return "cannot allocate memory";
+
+ __module_get(THIS_MODULE);
+ INIT_LIST_HEAD(&dc->list);
+ closure_init(&dc->disk.cl, NULL);
+ set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
+ kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
+ INIT_WORK(&dc->detach, cached_dev_detach_finish);
+ sema_init(&dc->sb_write_mutex, 1);
+ INIT_LIST_HEAD(&dc->io_lru);
+ spin_lock_init(&dc->io_lock);
+ bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
+
+ memcpy(&dc->disk_sb, sb, sizeof(*sb));
+ dc->disk_sb.bdev->bd_holder = dc;
+ memset(sb, 0, sizeof(*sb));
+
+ err = bdev_validate_super(dc->disk_sb.sb);
+ if (err)
+ goto err;
+
+	err = "error initializing cached dev";
+	if (cached_dev_init(dc, le16_to_cpu(dc->disk_sb.sb->block_size) << 9))
+ goto err;
+
+ err = "error creating kobject";
+ if (kobject_add(&dc->disk.kobj,
+ &part_to_dev(dc->disk_sb.bdev->bd_part)->kobj,
+ "bcache"))
+ goto err;
+
+ err = "error accounting kobject";
+ if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
+ goto err;
+
+ pr_info("registered backing device %s",
+ bdevname(dc->disk_sb.bdev, name));
+
+ list_add(&dc->list, &uncached_devices);
+ list_for_each_entry(c, &bch_cache_sets, list)
+ bch_cached_dev_attach(dc, c);
+
+ if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE ||
+ BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE)
+ bch_cached_dev_run(dc);
+
+ return NULL;
+err:
+ bch_blockdev_stop(&dc->disk);
+ return err;
+}
+
+/* Flash only volumes */
+
+void bch_blockdev_volume_release(struct kobject *kobj)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+ kfree(d);
+}
+
+static void blockdev_volume_free(struct closure *cl)
+{
+ struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+
+ mutex_lock(&bch_register_lock);
+ bcache_device_free(d);
+ mutex_unlock(&bch_register_lock);
+ kobject_put(&d->kobj);
+}
+
+static void blockdev_volume_flush(struct closure *cl)
+{
+ struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+
+ mutex_lock(&bch_register_lock);
+ bcache_device_unlink(d);
+ mutex_unlock(&bch_register_lock);
+ kobject_del(&d->kobj);
+ continue_at(cl, blockdev_volume_free, system_wq);
+}
+
+static int blockdev_volume_run(struct cache_set *c,
+ struct bkey_s_c_inode_blockdev inode)
+{
+ struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
+ GFP_KERNEL);
+ int ret = -ENOMEM;
+
+ if (!d)
+ return ret;
+
+ bkey_reassemble(&d->inode.k_i, inode.s_c);
+
+ closure_init(&d->cl, NULL);
+ set_closure_fn(&d->cl, blockdev_volume_flush, system_wq);
+
+ kobject_init(&d->kobj, &bch_blockdev_volume_ktype);
+
+ ret = bcache_device_init(d, block_bytes(c),
+ le64_to_cpu(inode.v->i_size) >> 9);
+ if (ret)
+ goto err;
+
+ ret = bcache_device_attach(d, c);
+ if (ret)
+ goto err;
+
+ bch_blockdev_volume_request_init(d);
+ add_disk(d->disk);
+
+ if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
+ goto err;
+
+ bcache_device_link(d, c, "volume");
+
+ return 0;
+err:
+ kobject_put(&d->kobj);
+ return ret;
+}
+
+int bch_blockdev_volumes_start(struct cache_set *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_inode_blockdev inode;
+ int ret = 0;
+
+ if (test_bit(CACHE_SET_STOPPING, &c->flags))
+ return -EINVAL;
+
+ for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
+ if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
+ break;
+
+ if (k.k->type != BCH_INODE_BLOCKDEV)
+ continue;
+
+ inode = bkey_s_c_to_inode_blockdev(k);
+
+ ret = blockdev_volume_run(c, inode);
+ if (ret)
+ break;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+int bch_blockdev_volume_create(struct cache_set *c, u64 size)
+{
+ __le64 rtime = cpu_to_le64(ktime_get_seconds());
+ struct bkey_i_inode_blockdev inode;
+ int ret;
+
+ bkey_inode_blockdev_init(&inode.k_i);
+ get_random_bytes(&inode.v.i_uuid, sizeof(inode.v.i_uuid));
+ inode.v.i_ctime = rtime;
+ inode.v.i_mtime = rtime;
+ inode.v.i_size = cpu_to_le64(size);
+
+ ret = bch_inode_create(c, &inode.k_i, 0, BLOCKDEV_INODE_MAX,
+ &c->unused_inode_hint);
+ if (ret) {
+ pr_err("Can't create volume: %d", ret);
+ return ret;
+ }
+
+ return blockdev_volume_run(c, inode_blockdev_i_to_s_c(&inode));
+}
+
+void bch_blockdevs_stop(struct cache_set *c)
+{
+ struct cached_dev *dc;
+ struct bcache_device *d;
+ struct radix_tree_iter iter;
+ void **slot;
+
+ mutex_lock(&bch_register_lock);
+ rcu_read_lock();
+
+ radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
+ d = radix_tree_deref_slot(slot);
+
+ if (CACHED_DEV(&d->inode.v) &&
+ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
+ dc = container_of(d, struct cached_dev, disk);
+ bch_cached_dev_detach(dc);
+ } else {
+ bch_blockdev_stop(d);
+ }
+ }
+
+ rcu_read_unlock();
+ mutex_unlock(&bch_register_lock);
+}
+
+void bch_blockdev_exit(void)
+{
+ kmem_cache_destroy(bch_search_cache);
+
+ if (bch_blockdev_major >= 0)
+ unregister_blkdev(bch_blockdev_major, "bcache");
+}
+
+int __init bch_blockdev_init(void)
+{
+ bch_blockdev_major = register_blkdev(0, "bcache");
+ if (bch_blockdev_major < 0)
+ return bch_blockdev_major;
+
+ bch_search_cache = KMEM_CACHE(search, 0);
+ if (!bch_search_cache)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/libbcache/blockdev.h b/libbcache/blockdev.h
new file mode 100644
index 0000000..0fc0ed1
--- /dev/null
+++ b/libbcache/blockdev.h
@@ -0,0 +1,99 @@
+#ifndef _BCACHE_BLOCKDEV_H
+#define _BCACHE_BLOCKDEV_H
+
+#include "blockdev_types.h"
+#include "io_types.h"
+
+void bch_write_bdev_super(struct cached_dev *, struct closure *);
+
+void bch_cached_dev_release(struct kobject *);
+void bch_blockdev_volume_release(struct kobject *);
+
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+void bch_attach_backing_devs(struct cache_set *);
+
+void bch_cached_dev_detach(struct cached_dev *);
+void bch_cached_dev_run(struct cached_dev *);
+void bch_blockdev_stop(struct bcache_device *);
+
+bool bch_is_open_backing_dev(struct block_device *);
+const char *bch_backing_dev_register(struct bcache_superblock *);
+
+int bch_blockdev_volume_create(struct cache_set *, u64);
+int bch_blockdev_volumes_start(struct cache_set *);
+
+void bch_blockdevs_stop(struct cache_set *);
+
+void bch_blockdev_exit(void);
+int bch_blockdev_init(void);
+
+static inline void cached_dev_put(struct cached_dev *dc)
+{
+ if (atomic_dec_and_test(&dc->count))
+ schedule_work(&dc->detach);
+}
+
+static inline bool cached_dev_get(struct cached_dev *dc)
+{
+ if (!atomic_inc_not_zero(&dc->count))
+ return false;
+
+ /* Paired with the mb in cached_dev_attach */
+ smp_mb__after_atomic();
+ return true;
+}
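+
+/*
+ * Usage sketch (illustrative, based on the comments above): take a ref before
+ * doing work on behalf of an attached device, drop it when done:
+ *
+ *	if (cached_dev_get(dc)) {
+ *		... issue the request to the cache set ...
+ *		cached_dev_put(dc);
+ *	}
+ */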
+
+static inline u64 bcache_dev_inum(struct bcache_device *d)
+{
+ return d->inode.k.p.inode;
+}
+
+static inline struct bcache_device *bch_dev_find(struct cache_set *c, u64 inode)
+{
+ return radix_tree_lookup(&c->devices, inode);
+}
+
+struct search {
+ /* Stack frame for bio_complete */
+ struct closure cl;
+
+ union {
+ struct bch_read_bio rbio;
+ struct bch_write_bio wbio;
+ };
+ /* Not modified */
+ struct bio *orig_bio;
+ struct bcache_device *d;
+
+ unsigned inode;
+ unsigned write:1;
+
+ /* Flags only used for reads */
+ unsigned recoverable:1;
+ unsigned read_dirty_data:1;
+ unsigned cache_miss:1;
+
+ /*
+ * For reads: bypass read from cache and insertion into cache
+ * For writes: discard key range from cache, sending the write to
+ * the backing device (if there is a backing device)
+ */
+ unsigned bypass:1;
+
+ unsigned long start_time;
+
+ /*
+ * Mostly only used for writes. For reads, we still make use of
+ * some trivial fields:
+ * - c
+ * - error
+ */
+ struct bch_write_op iop;
+};
+
+extern struct kmem_cache *bch_search_cache;
+
+extern struct kobj_type bch_cached_dev_ktype;
+extern struct kobj_type bch_blockdev_volume_ktype;
+
+#endif /* _BCACHE_BLOCKDEV_H */
diff --git a/libbcache/blockdev_types.h b/libbcache/blockdev_types.h
new file mode 100644
index 0000000..3254917
--- /dev/null
+++ b/libbcache/blockdev_types.h
@@ -0,0 +1,123 @@
+#ifndef _BCACHE_BLOCKDEV_TYPES_H
+#define _BCACHE_BLOCKDEV_TYPES_H
+
+#include "keybuf_types.h"
+#include "stats_types.h"
+#include "super_types.h"
+#include "util.h"
+
+struct bcache_device {
+ struct closure cl;
+
+ struct kobject kobj;
+
+ struct cache_set *c;
+
+ struct rb_node node;
+ struct bkey_i_inode_blockdev inode;
+ struct mutex inode_lock;
+
+#define BCACHEDEVNAME_SIZE 12
+ char name[BCACHEDEVNAME_SIZE];
+
+ struct gendisk *disk;
+
+ unsigned long flags;
+#define BCACHE_DEV_CLOSING 0
+#define BCACHE_DEV_DETACHING 1
+#define BCACHE_DEV_UNLINK_DONE 2
+
+ unsigned nr_stripes;
+ unsigned stripe_size;
+ atomic_t *stripe_sectors_dirty;
+ unsigned long *full_dirty_stripes;
+
+ struct bio_set bio_split;
+
+ unsigned data_csum:1;
+
+ int (*ioctl)(struct bcache_device *, fmode_t, unsigned, unsigned long);
+};
+
+struct io {
+ /* Used to track sequential IO so it can be skipped */
+ struct hlist_node hash;
+ struct list_head lru;
+
+ unsigned long last_io;
+ unsigned sequential;
+ sector_t last;
+};
+
+struct cached_dev {
+ struct list_head list;
+ struct bcache_device disk;
+
+ //struct backingdev_sb sb;
+
+ struct {
+ struct backingdev_sb *sb;
+ struct block_device *bdev;
+ struct bio *bio;
+ unsigned page_order;
+ } disk_sb;
+ struct closure sb_write;
+ struct semaphore sb_write_mutex;
+
+ /* Refcount on the cache set. Always nonzero when we're caching. */
+ atomic_t count;
+ struct work_struct detach;
+
+ /*
+ * Device might not be running if it's dirty and the cache set hasn't
+	 * shown up yet.
+ */
+ atomic_t running;
+
+ /*
+ * Writes take a shared lock from start to finish; scanning for dirty
+ * data to refill the rb tree requires an exclusive lock.
+ */
+ struct rw_semaphore writeback_lock;
+
+ /*
+ * Nonzero, and writeback has a refcount (d->count), iff there is dirty
+	 * data in the cache. Protected by writeback_lock; must hold a shared
+	 * lock to set and an exclusive lock to clear.
+ */
+ atomic_t has_dirty;
+
+ /* for dynamic rate control of writeback */
+ struct bch_pd_controller writeback_pd;
+ struct delayed_work writeback_pd_update;
+ unsigned writeback_pd_update_seconds;
+
+ struct task_struct *writeback_thread;
+ struct keybuf writeback_keys;
+ mempool_t writeback_io_pool;
+ mempool_t writeback_page_pool;
+
+ /* For tracking sequential IO */
+#define RECENT_IO_BITS 7
+#define RECENT_IO (1 << RECENT_IO_BITS)
+ struct io io[RECENT_IO];
+ struct hlist_head io_hash[RECENT_IO + 1];
+ struct list_head io_lru;
+ spinlock_t io_lock;
+
+ struct cache_accounting accounting;
+
+ /* The rest of this all shows up in sysfs */
+ unsigned sequential_cutoff;
+ unsigned readahead;
+
+ unsigned verify:1;
+ unsigned bypass_torture_test:1;
+
+ unsigned partial_stripes_expensive:1;
+ unsigned writeback_metadata:1;
+ unsigned writeback_running:1;
+ unsigned char writeback_percent;
+};
+
+#endif /* _BCACHE_BLOCKDEV_TYPES_H */
diff --git a/libbcache/bset.c b/libbcache/bset.c
new file mode 100644
index 0000000..3488095
--- /dev/null
+++ b/libbcache/bset.c
@@ -0,0 +1,1846 @@
+/*
+ * Code for working with individual keys, and sorted sets of keys within a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include "eytzinger.h"
+#include "util.h"
+#include "bset.h"
+
+#include <asm/unaligned.h>
+#include <linux/dynamic_fault.h>
+#include <linux/console.h>
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+/* hack.. */
+#include "alloc_types.h"
+#include <trace/events/bcache.h>
+
+struct bset_tree *bch_bkey_to_bset(struct btree *b, struct bkey_packed *k)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t)
+ if (k >= btree_bkey_first(b, t) &&
+ k < btree_bkey_last(b, t))
+ return t;
+
+ BUG();
+}
+
+/*
+ * There are never duplicate live keys in the btree - but including keys that
+ * have been flagged as deleted (and will be cleaned up later) we _will_ see
+ * duplicates.
+ *
+ * Thus the sort order is: usual key comparison first, but for keys that compare
+ * equal the deleted key(s) come first, and the (at most one) live version comes
+ * last.
+ *
+ * The main reason for this is insertion: to handle overwrites, we first iterate
+ * over keys that compare equal to our insert key, and then insert immediately
+ * prior to the first key greater than the key we're inserting - our insert
+ * position will be after all keys that compare equal to our insert key, which
+ * by the time we actually do the insert will all be deleted.
+ */
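+
+/*
+ * Concretely (illustrative): if a position P has two deleted keys and one live
+ * key, they sort as
+ *
+ *	... < (P, deleted) < (P, deleted) < (P, live) < ...
+ *
+ * and a new insert at P lands just before the first key that compares greater
+ * than P, i.e. after all of them.
+ */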
+
+void bch_dump_bset(struct btree *b, struct bset *i, unsigned set)
+{
+ struct bkey_packed *_k, *_n;
+ struct bkey k, n;
+ char buf[120];
+
+ if (!i->u64s)
+ return;
+
+ for (_k = i->start, k = bkey_unpack_key(b, _k);
+ _k < bset_bkey_last(i);
+ _k = _n, k = n) {
+ _n = bkey_next(_k);
+
+ bch_bkey_to_text(buf, sizeof(buf), &k);
+ printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
+ _k->_data - i->_data, i->u64s, buf);
+
+ if (_n == bset_bkey_last(i))
+ continue;
+
+ n = bkey_unpack_key(b, _n);
+
+ if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
+ printk(KERN_ERR "Key skipped backwards\n");
+ continue;
+ }
+
+ /*
+ * Weird check for duplicate non extent keys: extents are
+ * deleted iff they have 0 size, so if it has zero size and it's
+ * not deleted these aren't extents:
+ */
+ if (((!k.size && !bkey_deleted(&k)) ||
+ (!n.size && !bkey_deleted(&n))) &&
+ !bkey_deleted(&k) &&
+ !bkey_cmp(n.p, k.p))
+ printk(KERN_ERR "Duplicate keys\n");
+ }
+}
+
+void bch_dump_btree_node(struct btree *b)
+{
+ struct bset_tree *t;
+
+ console_lock();
+ for_each_bset(b, t)
+ bch_dump_bset(b, bset(b, t), t - b->set);
+ console_unlock();
+}
+
+void bch_dump_btree_node_iter(struct btree *b,
+ struct btree_node_iter *iter)
+{
+ struct btree_node_iter_set *set;
+
+ printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
+
+ btree_node_iter_for_each(iter, set) {
+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
+ struct bset_tree *t = bch_bkey_to_bset(b, k);
+ struct bkey uk = bkey_unpack_key(b, k);
+ char buf[100];
+
+ bch_bkey_to_text(buf, sizeof(buf), &uk);
+ printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
+ k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
+ }
+}
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+static bool keys_out_of_order(struct btree *b,
+ const struct bkey_packed *prev,
+ const struct bkey_packed *next,
+ bool is_extents)
+{
+ struct bkey nextu = bkey_unpack_key(b, next);
+
+ return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 ||
+ ((is_extents
+ ? !bkey_deleted(next)
+ : !bkey_deleted(prev)) &&
+ !bkey_cmp_packed(b, prev, next));
+}
+
+void __bch_verify_btree_nr_keys(struct btree *b)
+{
+ struct bset_tree *t;
+ struct bkey_packed *k;
+ struct btree_nr_keys nr = { 0 };
+
+ for_each_bset(b, t)
+ for (k = btree_bkey_first(b, t);
+ k != btree_bkey_last(b, t);
+ k = bkey_next(k))
+ if (!bkey_whiteout(k))
+ btree_keys_account_key_add(&nr, t - b->set, k);
+
+ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
+}
+
+static void bch_btree_node_iter_next_check(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bkey_packed *k)
+{
+ const struct bkey_packed *n = bch_btree_node_iter_peek_all(iter, b);
+
+ bkey_unpack_key(b, k);
+
+ if (n &&
+ keys_out_of_order(b, k, n, iter->is_extents)) {
+ struct bkey ku = bkey_unpack_key(b, k);
+ struct bkey nu = bkey_unpack_key(b, n);
+ char buf1[80], buf2[80];
+
+ bch_dump_btree_node(b);
+ bch_bkey_to_text(buf1, sizeof(buf1), &ku);
+ bch_bkey_to_text(buf2, sizeof(buf2), &nu);
+ panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2);
+ }
+}
+
+void bch_btree_node_iter_verify(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct btree_node_iter_set *set;
+ struct bset_tree *t;
+ struct bkey_packed *k, *first;
+
+ BUG_ON(iter->used > MAX_BSETS);
+
+ if (!iter->used)
+ return;
+
+ btree_node_iter_for_each(iter, set) {
+ k = __btree_node_offset_to_key(b, set->k);
+ t = bch_bkey_to_bset(b, k);
+
+ BUG_ON(__btree_node_offset_to_key(b, set->end) !=
+ btree_bkey_last(b, t));
+
+ BUG_ON(set + 1 < iter->data + iter->used &&
+ btree_node_iter_cmp(iter, b, set[0], set[1]) > 0);
+ }
+
+ first = __btree_node_offset_to_key(b, iter->data[0].k);
+
+ for_each_bset(b, t)
+ if (bch_btree_node_iter_bset_pos(iter, b, t) ==
+ btree_bkey_last(b, t) &&
+ (k = bkey_prev_all(b, t, btree_bkey_last(b, t))))
+ BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
+ k, first) > 0);
+}
+
+void bch_verify_key_order(struct btree *b,
+ struct btree_node_iter *iter,
+ struct bkey_packed *where)
+{
+ struct bset_tree *t = bch_bkey_to_bset(b, where);
+ struct bkey_packed *k, *prev;
+ struct bkey uk, uw = bkey_unpack_key(b, where);
+
+ k = bkey_prev_all(b, t, where);
+ if (k &&
+ keys_out_of_order(b, k, where, iter->is_extents)) {
+ char buf1[100], buf2[100];
+
+ bch_dump_btree_node(b);
+ uk = bkey_unpack_key(b, k);
+ bch_bkey_to_text(buf1, sizeof(buf1), &uk);
+ bch_bkey_to_text(buf2, sizeof(buf2), &uw);
+ panic("out of order with prev:\n%s\n%s\n",
+ buf1, buf2);
+ }
+
+ k = bkey_next(where);
+ BUG_ON(k != btree_bkey_last(b, t) &&
+ keys_out_of_order(b, where, k, iter->is_extents));
+
+ for_each_bset(b, t) {
+ if (where >= btree_bkey_first(b, t) ||
+ where < btree_bkey_last(b, t))
+ continue;
+
+ k = bch_btree_node_iter_bset_pos(iter, b, t);
+
+ if (k == btree_bkey_last(b, t))
+ k = bkey_prev_all(b, t, k);
+
+ while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 &&
+ (prev = bkey_prev_all(b, t, k)))
+ k = prev;
+
+ for (;
+ k != btree_bkey_last(b, t);
+ k = bkey_next(k)) {
+ uk = bkey_unpack_key(b, k);
+
+ if (iter->is_extents) {
+ BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 ||
+ bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0));
+ } else {
+ BUG_ON(!bkey_cmp(uw.p, uk.p) &&
+ !bkey_deleted(&uk));
+ }
+
+ if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0)
+ break;
+ }
+ }
+}
+
+#else
+
+static void bch_btree_node_iter_next_check(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bkey_packed *k) {}
+
+#endif
+
+/* Auxiliary search trees */
+
+#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0)
+#define BFLOAT_FAILED_PREV (U8_MAX - 1)
+#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2)
+#define BFLOAT_FAILED (U8_MAX - 2)
+
+#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
+
+struct bkey_float {
+ u8 exponent;
+ u8 key_offset;
+ union {
+ u32 mantissa32;
+ struct {
+ u16 mantissa16;
+ u16 _pad;
+ };
+ };
+} __packed;
+
+#define BFLOAT_32BIT_NR 32U
+
+static unsigned bkey_float_byte_offset(unsigned idx)
+{
+ int d = (idx - BFLOAT_32BIT_NR) << 1;
+
+ d &= ~(d >> 31);
+
+ return idx * 6 - d;
+}
+
+struct ro_aux_tree {
+ struct bkey_float _d[0];
+};
+
+struct rw_aux_tree {
+ u16 offset;
+ struct bpos k;
+};
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bkey_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE 128
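+
+/*
+ * Illustrative arithmetic: with BSET_CACHELINE == 128, each bkey_float covers
+ * up to 128 bytes == 16 u64s of struct bset, which bounds how far the final
+ * linear search has to scan.
+ */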
+
+/* Space required for the btree node keys */
+static inline size_t btree_keys_bytes(struct btree *b)
+{
+ return PAGE_SIZE << b->page_order;
+}
+
+static inline size_t btree_keys_cachelines(struct btree *b)
+{
+ return btree_keys_bytes(b) / BSET_CACHELINE;
+}
+
+static inline size_t btree_aux_data_bytes(struct btree *b)
+{
+ return btree_keys_cachelines(b) * 8;
+}
+
+static inline size_t btree_aux_data_u64s(struct btree *b)
+{
+ return btree_aux_data_bytes(b) / sizeof(u64);
+}
+
+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
+{
+ BUG_ON(t->aux_data_offset == U16_MAX);
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ return t->aux_data_offset;
+ case BSET_RO_AUX_TREE:
+ return t->aux_data_offset +
+ DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
+ sizeof(u8) * t->size, 8);
+ case BSET_RW_AUX_TREE:
+ return t->aux_data_offset +
+ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
+ default:
+ BUG();
+ }
+}
+
+static unsigned bset_aux_tree_buf_start(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return t == b->set
+ ? DIV_ROUND_UP(b->unpack_fn_len, 8)
+ : bset_aux_tree_buf_end(t - 1);
+}
+
+static void *__aux_tree_base(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return b->aux_data + t->aux_data_offset * 8;
+}
+
+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+ return __aux_tree_base(b, t);
+}
+
+static u8 *ro_aux_tree_prev(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
+}
+
+static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
+ unsigned idx)
+{
+ return (void *) b + bkey_float_byte_offset(idx);
+}
+
+static struct bkey_float *bkey_float(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned idx)
+{
+ return bkey_float_get(ro_aux_tree_base(b, t), idx);
+}
+
+static void bset_aux_tree_verify(struct btree *b)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ if (t->aux_data_offset == U16_MAX)
+ continue;
+
+ BUG_ON(t != b->set &&
+ t[-1].aux_data_offset == U16_MAX);
+
+ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
+ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
+ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
+ }
+#endif
+}
+
+/* Memory allocation */
+
+void bch_btree_keys_free(struct btree *b)
+{
+ vfree(b->aux_data);
+ b->aux_data = NULL;
+}
+
+int bch_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
+{
+ b->page_order = page_order;
+ b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
+ PAGE_KERNEL_EXEC);
+ if (!b->aux_data)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void bch_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
+{
+ unsigned i;
+
+ b->nsets = 0;
+ memset(&b->nr, 0, sizeof(b->nr));
+#ifdef CONFIG_BCACHE_DEBUG
+ b->expensive_debug_checks = expensive_debug_checks;
+#endif
+ for (i = 0; i < MAX_BSETS; i++)
+ b->set[i].data_offset = U16_MAX;
+
+ bch_bset_set_no_aux_tree(b, b->set);
+}
+
+/* Binary tree stuff for auxiliary search trees */
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * The ro aux tree is a binary search tree of struct bkey_float laid out in an
+ * array; each node corresponds to a key in one cacheline of the bset
+ * (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; eytzinger_to_inorder() gives us the cacheline, and
+ * then bkey_float->key_offset gives us the offset within that cacheline, in
+ * units of 8 bytes.
+ *
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in the bkey_float mantissa. ro_aux_tree_prev()[j]
+ * stores the size of the previous key so we can walk backwards to it from
+ * tree_to_bkey(b, t, j).
+ */
+
+static inline void *bset_cacheline(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline)
+{
+ return (void *) round_down((unsigned long) btree_bkey_first(b, t),
+ L1_CACHE_BYTES) +
+ cacheline * BSET_CACHELINE;
+}
+
+static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ unsigned offset)
+{
+ return bset_cacheline(b, t, cacheline) + offset * 8;
+}
+
+static unsigned bkey_to_cacheline(const struct btree *b,
+ const struct bset_tree *t,
+ const struct bkey_packed *k)
+{
+ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
+}
+
+static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ const struct bkey_packed *k)
+{
+ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
+}
+
+static unsigned bkey_to_cacheline_offset(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ const struct bkey_packed *k)
+{
+ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
+
+ EBUG_ON(m > U8_MAX);
+ return m;
+}
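+
+/*
+ * Illustrative round trip (sketch): for a key k in bset_tree t,
+ *
+ *	unsigned c = bkey_to_cacheline(b, t, k);
+ *	unsigned o = bkey_to_cacheline_offset(b, t, c, k);
+ *
+ * gives cacheline_to_bkey(b, t, c, o) == k, since offsets are stored in units
+ * of 8 bytes within a BSET_CACHELINE sized window.
+ */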
+
+static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned j)
+{
+ return cacheline_to_bkey(b, t,
+ __eytzinger_to_inorder(j, t->size, t->extra),
+ bkey_float(b, t, j)->key_offset);
+}
+
+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned j)
+{
+ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
+
+ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
+}
+
+static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+
+ return __aux_tree_base(b, t);
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, we just keep a simple lookup table (the rw aux
+ * tree) of key offsets and unpacked positions.
+ */
+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
+ struct bset_tree *t,
+ unsigned j)
+{
+ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
+}
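The same idea in miniature, as a self-contained toy (the names, array sizes and "one entry per three keys" granularity here are invented for illustration, not taken from libbcache): record the position of one key per fixed-size chunk of a sorted array, binary search those fixed-size entries, then scan linearly within the chunk.

#include <stdio.h>

/* key values in sorted order; in bcache these are variable-length bkeys */
static const unsigned keys[] = { 5, 9, 14, 20, 31, 40 };
#define NR_KEYS (sizeof(keys) / sizeof(keys[0]))

/* one lookup-table entry per three keys, standing in for "per cacheline" */
static const unsigned table[] = { 0, 3 };
#define TABLE_SIZE (sizeof(table) / sizeof(table[0]))

static unsigned search(unsigned want)
{
	unsigned l = 0, r = TABLE_SIZE, i;

	/* binary search over the fixed-size table entries... */
	while (l + 1 != r) {
		unsigned m = (l + r) / 2;

		if (keys[table[m]] < want)
			l = m;
		else
			r = m;
	}

	/* ...then linear search from that "cacheline" onwards */
	for (i = table[l]; i < NR_KEYS; i++)
		if (keys[i] >= want)
			return i;
	return NR_KEYS;
}

int main(void)
{
	unsigned i = search(15);

	printf("first key >= 15 is keys[%u] = %u\n", i, keys[i]);
	return 0;
}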
+
+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
+ unsigned j, struct bkey_packed *k)
+{
+ BUG_ON(k >= btree_bkey_last(b, t));
+
+ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
+ .offset = __btree_node_key_to_offset(b, k),
+ .k = bkey_unpack_pos(b, k),
+ };
+}
+
+static void bch_bset_verify_rw_aux_tree(struct btree *b,
+ struct bset_tree *t)
+{
+ struct bkey_packed *k = btree_bkey_first(b, t);
+ unsigned j = 0;
+
+ if (!btree_keys_expensive_checks(b))
+ return;
+
+ BUG_ON(bset_has_ro_aux_tree(t));
+
+ if (!bset_has_rw_aux_tree(t))
+ return;
+
+ BUG_ON(t->size < 1);
+ BUG_ON(rw_aux_to_bkey(b, t, j) != k);
+
+ goto start;
+ while (1) {
+ if (rw_aux_to_bkey(b, t, j) == k) {
+ BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
+ bkey_unpack_pos(b, k)));
+start:
+ if (++j == t->size)
+ break;
+
+ BUG_ON(rw_aux_tree(b, t)[j].offset <=
+ rw_aux_tree(b, t)[j - 1].offset);
+ }
+
+ k = bkey_next(k);
+ BUG_ON(k >= btree_bkey_last(b, t));
+ }
+}
+
+/* returns idx of first entry >= offset: */
+static unsigned rw_aux_tree_bsearch(struct btree *b,
+ struct bset_tree *t,
+ unsigned offset)
+{
+ unsigned l = 0, r = t->size;
+
+ BUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+
+ while (l < r) {
+ unsigned m = (l + r) >> 1;
+
+ if (rw_aux_tree(b, t)[m].offset < offset)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ BUG_ON(l < t->size &&
+ rw_aux_tree(b, t)[l].offset < offset);
+ BUG_ON(l &&
+ rw_aux_tree(b, t)[l - 1].offset >= offset);
+
+ BUG_ON(l > r);
+ BUG_ON(l > t->size);
+
+ return l;
+}
+
+static inline unsigned bfloat_mantissa(const struct bkey_float *f,
+ unsigned idx)
+{
+ return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
+}
+
+static inline void bfloat_mantissa_set(struct bkey_float *f,
+ unsigned idx, unsigned mantissa)
+{
+ if (idx < BFLOAT_32BIT_NR)
+ f->mantissa32 = mantissa;
+ else
+ f->mantissa16 = mantissa;
+}
+
+static inline unsigned bkey_mantissa(const struct bkey_packed *k,
+ const struct bkey_float *f,
+ unsigned idx)
+{
+ u64 v;
+
+ EBUG_ON(!bkey_packed(k));
+
+ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
+
+ /*
+ * In little endian, we're shifting off low bits (and then the bits we
+ * want are at the low end), in big endian we're shifting off high bits
+ * (and then the bits we want are at the high end, so we shift them
+ * back down):
+ */
+#ifdef __LITTLE_ENDIAN
+ v >>= f->exponent & 7;
+#else
+ v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
+#endif
+ return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
+}
+
+static void make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
+{
+ struct bkey_float *f = bkey_float(b, t, j);
+ struct bkey_packed *m = tree_to_bkey(b, t, j);
+ struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
+ struct bkey_packed *l, *r;
+ unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
+ unsigned mantissa;
+ int shift, exponent;
+
+ EBUG_ON(bkey_next(p) != m);
+
+ if (is_power_of_2(j)) {
+ l = min_key;
+
+ if (!l->u64s) {
+ if (!bkey_pack_pos(l, b->data->min_key, b)) {
+ struct bkey_i tmp;
+
+ bkey_init(&tmp.k);
+ tmp.k.p = b->data->min_key;
+ bkey_copy(l, &tmp);
+ }
+ }
+ } else {
+ l = tree_to_prev_bkey(b, t, j >> ffs(j));
+
+ EBUG_ON(m < l);
+ }
+
+ if (is_power_of_2(j + 1)) {
+ r = max_key;
+
+ if (!r->u64s) {
+ if (!bkey_pack_pos(r, t->max_key, b)) {
+ struct bkey_i tmp;
+
+ bkey_init(&tmp.k);
+ tmp.k.p = t->max_key;
+ bkey_copy(r, &tmp);
+ }
+ }
+ } else {
+ r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
+
+ EBUG_ON(m > r);
+ }
+
+ /*
+ * for failed bfloats, the lookup code falls back to comparing against
+ * the original key.
+ */
+
+ if (!bkey_packed(l) || !bkey_packed(r) ||
+ !bkey_packed(p) || !bkey_packed(m)) {
+ f->exponent = BFLOAT_FAILED_UNPACKED;
+ return;
+ }
+
+ /*
+ * The greatest differing bit of l and r is the first bit we must
+ * include in the bfloat mantissa we're creating in order to do
+ * comparisons - that bit always becomes the high bit of
+ * bfloat->mantissa, and thus the exponent we're calculating here is
+ * the position of what will become the low bit in bfloat->mantissa:
+ *
+ * Note that this may be negative - we may be running off the low end
+ * of the key: we handle this later:
+ */
+ exponent = (int) bkey_greatest_differing_bit(b, l, r) - (bits - 1);
+
+ /*
+ * Then we calculate the actual shift value, from the start of the key
+ * (k->_data), to get the key bits starting at exponent:
+ */
+#ifdef __LITTLE_ENDIAN
+ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
+
+ EBUG_ON(shift + bits > b->format.key_u64s * 64);
+#else
+ shift = high_bit_offset +
+ b->nr_key_bits -
+ exponent -
+ bits;
+
+ EBUG_ON(shift < KEY_PACKED_BITS_START);
+#endif
+ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
+
+ f->exponent = shift;
+ mantissa = bkey_mantissa(m, f, j);
+
+ /*
+ * If we've got garbage bits, set them to all 1s - it's legal for the
+ * bfloat to compare larger than the original key, but not smaller:
+ */
+ if (exponent < 0)
+ mantissa |= ~(~0U << -exponent);
+
+ bfloat_mantissa_set(f, j, mantissa);
+
+ /*
+ * The bfloat must be able to tell its key apart from the previous key -
+ * if its key and the previous key don't differ in the required bits,
+ * flag as failed - unless the keys are actually equal, in which case
+ * we aren't required to return a specific one:
+ */
+ if (exponent > 0 &&
+ bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
+ bkey_cmp_packed(b, p, m)) {
+ f->exponent = BFLOAT_FAILED_PREV;
+ return;
+ }
+
+ /*
+ * f->mantissa must compare >= the original key - for transitivity with
+ * the comparison in bset_search_tree. If we're dropping set bits,
+ * increment it:
+ */
+ if (exponent > (int) bkey_ffs(b, m)) {
+ if (j < BFLOAT_32BIT_NR
+ ? f->mantissa32 == U32_MAX
+ : f->mantissa16 == U16_MAX)
+ f->exponent = BFLOAT_FAILED_OVERFLOW;
+
+ if (j < BFLOAT_32BIT_NR)
+ f->mantissa32++;
+ else
+ f->mantissa16++;
+ }
+}
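A rough standalone sketch of the bfloat idea above (toy code, not the libbcache implementation; MANTISSA_BITS and the sample key values are invented): the highest bit at which the subtree's bounds differ becomes the top bit of a small fixed-width mantissa, and comparing mantissas stands in for comparing full keys during the tree descent.

#include <stdint.h>
#include <stdio.h>

#define MANTISSA_BITS	16

/* bit position of the lowest key bit the mantissa keeps; may be negative */
static int bfloat_exponent(uint64_t l, uint64_t r)
{
	/* assumes l != r; the highest differing bit of the bounds... */
	int high = 63 - __builtin_clzll(l ^ r);

	/* ...becomes the top bit of the mantissa */
	return high - (MANTISSA_BITS - 1);
}

static uint16_t bfloat_mantissa(uint64_t key, int exponent)
{
	/*
	 * A negative exponent means we ran off the low end of the key; like
	 * the real code, fill the garbage bits with 1s so the mantissa never
	 * compares smaller than the original key:
	 */
	if (exponent < 0)
		return (key << -exponent) | ((1ULL << -exponent) - 1);
	return key >> exponent;
}

int main(void)
{
	uint64_t l      = 0x1234500000000000ULL;	/* subtree lower bound */
	uint64_t r      = 0x1234700000000000ULL;	/* subtree upper bound */
	uint64_t node   = 0x1234680000000000ULL;	/* key this tree node points at */
	uint64_t search = 0x1234600000000000ULL;	/* key being looked up */
	int e = bfloat_exponent(l, r);

	/* comparing 16-bit mantissas decides which way to descend */
	printf("exponent %d: descend %s\n", e,
	       bfloat_mantissa(node, e) < bfloat_mantissa(search, e)
	       ? "right" : "left");
	return 0;
}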
+
+/* bytes remaining - only valid for last bset: */
+static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+ bset_aux_tree_verify(b);
+
+ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
+}
+
+static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+ unsigned bytes = __bset_tree_capacity(b, t);
+
+ if (bytes < 7 * BFLOAT_32BIT_NR)
+ return bytes / 7;
+
+ bytes -= 7 * BFLOAT_32BIT_NR;
+
+ return BFLOAT_32BIT_NR + bytes / 5;
+}
+
+static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
+}
+
+static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bkey_packed *k;
+
+ t->size = 1;
+ t->extra = BSET_RW_AUX_TREE_VAL;
+ rw_aux_tree(b, t)[0].offset =
+ __btree_node_key_to_offset(b, btree_bkey_first(b, t));
+
+ for (k = btree_bkey_first(b, t);
+ k != btree_bkey_last(b, t);
+ k = bkey_next(k)) {
+ if (t->size == bset_rw_tree_capacity(b, t))
+ break;
+
+ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
+ L1_CACHE_BYTES)
+ rw_aux_tree_set(b, t, t->size++, k);
+ }
+}
+
+static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
+ struct bkey_packed min_key, max_key;
+ unsigned j, cacheline = 1;
+
+ /* signal to make_bfloat() that they're uninitialized: */
+ min_key.u64s = max_key.u64s = 0;
+
+ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
+ bset_ro_tree_capacity(b, t));
+retry:
+ if (t->size < 2) {
+ t->size = 0;
+ t->extra = BSET_NO_AUX_TREE_VAL;
+ return;
+ }
+
+ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+
+ /* First we figure out where the first key in each cacheline is */
+ eytzinger_for_each(j, t->size) {
+ while (bkey_to_cacheline(b, t, k) < cacheline)
+ prev = k, k = bkey_next(k);
+
+ if (k >= btree_bkey_last(b, t)) {
+ t->size--;
+ goto retry;
+ }
+
+ ro_aux_tree_prev(b, t)[j] = prev->u64s;
+ bkey_float(b, t, j)->key_offset =
+ bkey_to_cacheline_offset(b, t, cacheline++, k);
+
+ BUG_ON(tree_to_prev_bkey(b, t, j) != prev);
+ BUG_ON(tree_to_bkey(b, t, j) != k);
+ }
+
+ while (bkey_next(k) != btree_bkey_last(b, t))
+ k = bkey_next(k);
+
+ t->max_key = bkey_unpack_pos(b, k);
+
+ /* Then we build the tree */
+ eytzinger_for_each(j, t->size)
+ make_bfloat(b, t, j, &min_key, &max_key);
+}
+
+static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bset_tree *i;
+
+ for (i = b->set; i != t; i++)
+ BUG_ON(bset_has_rw_aux_tree(i));
+
+ bch_bset_set_no_aux_tree(b, t);
+
+ /* round up to next cacheline: */
+ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
+ SMP_CACHE_BYTES / sizeof(u64));
+
+ bset_aux_tree_verify(b);
+}
+
+void bch_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
+ bool writeable)
+{
+ if (writeable
+ ? bset_has_rw_aux_tree(t)
+ : bset_has_ro_aux_tree(t))
+ return;
+
+ bset_alloc_tree(b, t);
+
+ if (!__bset_tree_capacity(b, t))
+ return;
+
+ if (writeable)
+ __build_rw_aux_tree(b, t);
+ else
+ __build_ro_aux_tree(b, t);
+
+ bset_aux_tree_verify(b);
+}
+
+void bch_bset_init_first(struct btree *b, struct bset *i)
+{
+ struct bset_tree *t;
+
+ BUG_ON(b->nsets);
+
+ memset(i, 0, sizeof(*i));
+ get_random_bytes(&i->seq, sizeof(i->seq));
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ t = &b->set[b->nsets++];
+ set_btree_bset(b, t, i);
+}
+
+void bch_bset_init_next(struct btree *b, struct bset *i)
+{
+ struct bset_tree *t;
+
+ BUG_ON(b->nsets >= MAX_BSETS);
+
+ memset(i, 0, sizeof(*i));
+ i->seq = btree_bset_first(b)->seq;
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ t = &b->set[b->nsets++];
+ set_btree_bset(b, t, i);
+}
+
+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct bkey_packed *p;
+ unsigned offset;
+ int j;
+
+ EBUG_ON(k < btree_bkey_first(b, t) ||
+ k > btree_bkey_last(b, t));
+
+ if (k == btree_bkey_first(b, t))
+ return NULL;
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ p = btree_bkey_first(b, t);
+ break;
+ case BSET_RO_AUX_TREE:
+ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
+
+ do {
+ p = j ? tree_to_bkey(b, t,
+ __inorder_to_eytzinger(j--,
+ t->size, t->extra))
+ : btree_bkey_first(b, t);
+ } while (p >= k);
+ break;
+ case BSET_RW_AUX_TREE:
+ offset = __btree_node_key_to_offset(b, k);
+ j = rw_aux_tree_bsearch(b, t, offset);
+ p = j ? rw_aux_to_bkey(b, t, j - 1)
+ : btree_bkey_first(b, t);
+ break;
+ }
+
+ return p;
+}
+
+struct bkey_packed *bkey_prev_all(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct bkey_packed *p;
+
+ p = __bkey_prev(b, t, k);
+ if (!p)
+ return NULL;
+
+ while (bkey_next(p) != k)
+ p = bkey_next(p);
+
+ return p;
+}
+
+struct bkey_packed *bkey_prev(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ while (1) {
+ struct bkey_packed *p, *i, *ret = NULL;
+
+ p = __bkey_prev(b, t, k);
+ if (!p)
+ return NULL;
+
+ for (i = p; i != k; i = bkey_next(i))
+ if (!bkey_deleted(i))
+ ret = i;
+
+ if (ret)
+ return ret;
+
+ k = p;
+ }
+}
+
+/* Insert */
+
+static void rw_aux_tree_fix_invalidated_key(struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ unsigned offset = __btree_node_key_to_offset(b, k);
+ unsigned j = rw_aux_tree_bsearch(b, t, offset);
+
+ if (j < t->size &&
+ rw_aux_tree(b, t)[j].offset == offset)
+ rw_aux_tree_set(b, t, j, k);
+
+ bch_bset_verify_rw_aux_tree(b, t);
+}
+
+static void ro_aux_tree_fix_invalidated_key(struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct bkey_packed min_key, max_key;
+ unsigned inorder, j;
+
+ BUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+ /* signal to make_bfloat() that they're uninitialized: */
+ min_key.u64s = max_key.u64s = 0;
+
+ if (bkey_next(k) == btree_bkey_last(b, t)) {
+ t->max_key = bkey_unpack_pos(b, k);
+
+ for (j = 1; j < t->size; j = j * 2 + 1)
+ make_bfloat(b, t, j, &min_key, &max_key);
+ }
+
+ inorder = bkey_to_cacheline(b, t, k);
+
+ if (inorder &&
+ inorder < t->size) {
+ j = __inorder_to_eytzinger(inorder, t->size, t->extra);
+
+ if (k == tree_to_bkey(b, t, j)) {
+ /* Fix the node this key corresponds to */
+ make_bfloat(b, t, j, &min_key, &max_key);
+
+ /* Children for which this key is the right boundary */
+ for (j = eytzinger_left_child(j);
+ j < t->size;
+ j = eytzinger_right_child(j))
+ make_bfloat(b, t, j, &min_key, &max_key);
+ }
+ }
+
+ if (inorder + 1 < t->size) {
+ j = __inorder_to_eytzinger(inorder + 1, t->size, t->extra);
+
+ if (k == tree_to_prev_bkey(b, t, j)) {
+ make_bfloat(b, t, j, &min_key, &max_key);
+
+ /* Children for which this key is the left boundary */
+ for (j = eytzinger_right_child(j);
+ j < t->size;
+ j = eytzinger_left_child(j))
+ make_bfloat(b, t, j, &min_key, &max_key);
+ }
+ }
+}
+
+/**
+ * bch_bset_fix_invalidated_key() - given an existing key @k that has been
+ * modified, fix any auxiliary search tree by remaking all the nodes in the
+ * auxiliary search tree that @k corresponds to
+ */
+void bch_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ break;
+ case BSET_RO_AUX_TREE:
+ ro_aux_tree_fix_invalidated_key(b, t, k);
+ break;
+ case BSET_RW_AUX_TREE:
+ rw_aux_tree_fix_invalidated_key(b, t, k);
+ break;
+ }
+}
+
+static void bch_bset_fix_lookup_table(struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *_where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ int shift = new_u64s - clobber_u64s;
+ unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+
+ BUG_ON(bset_has_ro_aux_tree(t));
+
+ if (!bset_has_rw_aux_tree(t))
+ return;
+
+ l = rw_aux_tree_bsearch(b, t, where);
+
+ /* l is the first entry >= @where */
+
+ BUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
+ BUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);
+
+ if (!l) /* never delete first entry */
+ l++;
+ else if (l < t->size &&
+ where < t->end_offset &&
+ rw_aux_tree(b, t)[l].offset == where)
+ rw_aux_tree_set(b, t, l++, _where);
+
+ /* l now > where */
+
+ for (j = l;
+ j < t->size &&
+ rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
+ j++)
+ ;
+
+ if (j < t->size &&
+ rw_aux_tree(b, t)[j].offset + shift ==
+ rw_aux_tree(b, t)[l - 1].offset)
+ j++;
+
+ memmove(&rw_aux_tree(b, t)[l],
+ &rw_aux_tree(b, t)[j],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[j]);
+ t->size -= j - l;
+
+ for (j = l; j < t->size; j++)
+ rw_aux_tree(b, t)[j].offset += shift;
+
+ BUG_ON(l < t->size &&
+ rw_aux_tree(b, t)[l].offset ==
+ rw_aux_tree(b, t)[l - 1].offset);
+
+ if (t->size < bset_rw_tree_capacity(b, t) &&
+ (l < t->size
+ ? rw_aux_tree(b, t)[l].offset
+ : t->end_offset) -
+ rw_aux_tree(b, t)[l - 1].offset >
+ L1_CACHE_BYTES / sizeof(u64)) {
+ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
+ struct bkey_packed *end = l < t->size
+ ? rw_aux_to_bkey(b, t, l)
+ : btree_bkey_last(b, t);
+ struct bkey_packed *k = start;
+
+ while (1) {
+ k = bkey_next(k);
+ if (k == end)
+ break;
+
+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+ memmove(&rw_aux_tree(b, t)[l + 1],
+ &rw_aux_tree(b, t)[l],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[l]);
+ t->size++;
+ rw_aux_tree_set(b, t, l, k);
+ break;
+ }
+ }
+ }
+
+ bch_bset_verify_rw_aux_tree(b, t);
+ bset_aux_tree_verify(b);
+}
+
+void bch_bset_insert(struct btree *b,
+ struct btree_node_iter *iter,
+ struct bkey_packed *where,
+ struct bkey_i *insert,
+ unsigned clobber_u64s)
+{
+ struct bkey_format *f = &b->format;
+ struct bset_tree *t = bset_tree_last(b);
+ struct bkey_packed packed, *src = bkey_to_packed(insert);
+
+ bch_bset_verify_rw_aux_tree(b, t);
+
+ if (bkey_pack_key(&packed, &insert->k, f))
+ src = &packed;
+
+ if (!bkey_whiteout(&insert->k))
+ btree_keys_account_key_add(&b->nr, t - b->set, src);
+
+ if (src->u64s != clobber_u64s) {
+ u64 *src_p = where->_data + clobber_u64s;
+ u64 *dst_p = where->_data + src->u64s;
+
+ BUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
+ (int) clobber_u64s - src->u64s);
+
+ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
+ set_btree_bset_end(b, t);
+ }
+
+ memcpy_u64s(where, src,
+ bkeyp_key_u64s(f, src));
+ memcpy_u64s(bkeyp_val(f, where), &insert->v,
+ bkeyp_val_u64s(f, src));
+
+ bch_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
+
+ bch_verify_key_order(b, iter, where);
+ bch_verify_btree_nr_keys(b);
+}
+
+void bch_bset_delete(struct btree *b,
+ struct bkey_packed *where,
+ unsigned clobber_u64s)
+{
+ struct bset_tree *t = bset_tree_last(b);
+ u64 *src_p = where->_data + clobber_u64s;
+ u64 *dst_p = where->_data;
+
+ bch_bset_verify_rw_aux_tree(b, t);
+
+ BUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
+
+ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
+ set_btree_bset_end(b, t);
+
+ bch_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
+}
+
+/* Lookup */
+
+__flatten
+static struct bkey_packed *bset_search_write_set(const struct btree *b,
+ struct bset_tree *t,
+ struct bpos search,
+ const struct bkey_packed *packed_search)
+{
+ unsigned l = 0, r = t->size;
+
+ while (l + 1 != r) {
+ unsigned m = (l + r) >> 1;
+
+ if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0)
+ l = m;
+ else
+ r = m;
+ }
+
+ return rw_aux_to_bkey(b, t, l);
+}
+
+noinline
+static int bset_search_tree_slowpath(const struct btree *b,
+ struct bset_tree *t, struct bpos *search,
+ const struct bkey_packed *packed_search,
+ unsigned n)
+{
+ return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
+ packed_search, search) < 0;
+}
+
+__flatten
+static struct bkey_packed *bset_search_tree(const struct btree *b,
+ struct bset_tree *t,
+ struct bpos search,
+ const struct bkey_packed *packed_search)
+{
+ struct ro_aux_tree *base = ro_aux_tree_base(b, t);
+ struct bkey_float *f = bkey_float_get(base, 1);
+ void *p;
+ unsigned inorder, n = 1;
+
+ while (1) {
+ if (likely(n << 4 < t->size)) {
+ p = bkey_float_get(base, n << 4);
+ prefetch(p);
+ } else if (n << 3 < t->size) {
+ inorder = __eytzinger_to_inorder(n, t->size, t->extra);
+ p = bset_cacheline(b, t, inorder);
+#ifdef CONFIG_X86_64
+ asm(".intel_syntax noprefix;"
+ "prefetcht0 [%0 - 127 + 64 * 0];"
+ "prefetcht0 [%0 - 127 + 64 * 1];"
+ "prefetcht0 [%0 - 127 + 64 * 2];"
+ "prefetcht0 [%0 - 127 + 64 * 3];"
+ ".att_syntax prefix;"
+ :
+ : "r" (p + 127));
+#else
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ prefetch(p + L1_CACHE_BYTES * 3);
+#endif
+ } else if (n >= t->size)
+ break;
+
+ f = bkey_float_get(base, n);
+
+ if (packed_search &&
+ likely(f->exponent < BFLOAT_FAILED))
+ n = n * 2 + (bfloat_mantissa(f, n) <
+ bkey_mantissa(packed_search, f, n));
+ else
+ n = n * 2 + bset_search_tree_slowpath(b, t,
+ &search, packed_search, n);
+ }
+
+ inorder = __eytzinger_to_inorder(n >> 1, t->size, t->extra);
+
+ /*
+ * n would have been the node we recursed to - the low bit tells us if
+ * we recursed left or recursed right.
+ */
+ if (n & 1) {
+ return cacheline_to_bkey(b, t, inorder, f->key_offset);
+ } else {
+ if (--inorder) {
+ n = eytzinger_prev(n >> 1, t->size);
+ f = bkey_float_get(base, n);
+ return cacheline_to_bkey(b, t, inorder, f->key_offset);
+ } else
+ return btree_bkey_first(b, t);
+ }
+}
+
+/*
+ * Returns the first key greater than or equal to @search
+ */
+__always_inline __flatten
+static struct bkey_packed *bch_bset_search(struct btree *b,
+ struct bset_tree *t,
+ struct bpos search,
+ struct bkey_packed *packed_search,
+ const struct bkey_packed *lossy_packed_search,
+ bool strictly_greater)
+{
+ struct bkey_packed *m;
+
+ /*
+ * First we search for a cacheline, then we do a linear search within
+ * that cacheline.
+ *
+ * To search for the cacheline, there are three different possibilities:
+ * * The set is too small to have a search tree, so we just do a linear
+ * search over the whole set.
+ * * The set is the one we're currently inserting into; keeping a full
+ * auxiliary search tree up to date would be too expensive, so we
+ * use a much simpler lookup table to do a binary search -
+ * bset_search_write_set().
+ * * Or we use the auxiliary search tree we constructed earlier -
+ * bset_search_tree()
+ */
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ m = btree_bkey_first(b, t);
+ break;
+ case BSET_RW_AUX_TREE:
+ m = bset_search_write_set(b, t, search, lossy_packed_search);
+ break;
+ case BSET_RO_AUX_TREE:
+ /*
+ * Each node in the auxiliary search tree covers a certain range
+ * of bits, and keys above and below the set it covers might
+ * differ outside those bits - so we have to special case the
+ * start and end - handle that here:
+ */
+
+ if (bkey_cmp(search, t->max_key) > 0)
+ return btree_bkey_last(b, t);
+
+ m = bset_search_tree(b, t, search, lossy_packed_search);
+ break;
+ }
+
+ if (lossy_packed_search)
+ while (m != btree_bkey_last(b, t) &&
+ !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search,
+ m, strictly_greater))
+ m = bkey_next(m);
+
+ if (!packed_search)
+ while (m != btree_bkey_last(b, t) &&
+ !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater))
+ m = bkey_next(m);
+
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ struct bkey_packed *prev = bkey_prev_all(b, t, m);
+
+ BUG_ON(prev &&
+ btree_iter_pos_cmp_p_or_unp(b, search, packed_search,
+ prev, strictly_greater));
+ }
+
+ return m;
+}
+
+/* Btree node iterator */
+
+void bch_btree_node_iter_push(struct btree_node_iter *iter,
+ struct btree *b,
+ const struct bkey_packed *k,
+ const struct bkey_packed *end)
+{
+ if (k != end) {
+ struct btree_node_iter_set *pos, n =
+ ((struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k),
+ __btree_node_key_to_offset(b, end)
+ });
+
+ btree_node_iter_for_each(iter, pos)
+ if (btree_node_iter_cmp(iter, b, n, *pos) <= 0)
+ break;
+
+ memmove(pos + 1, pos,
+ (void *) (iter->data + iter->used) - (void *) pos);
+ iter->used++;
+ *pos = n;
+ }
+}
+
+noinline __flatten __attribute__((cold))
+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
+ struct btree *b, struct bpos search,
+ bool strictly_greater, bool is_extents)
+{
+ struct bset_tree *t;
+
+ trace_bkey_pack_pos_fail(search);
+
+ for_each_bset(b, t)
+ __bch_btree_node_iter_push(iter, b,
+ bch_bset_search(b, t, search, NULL, NULL,
+ strictly_greater),
+ btree_bkey_last(b, t));
+
+ bch_btree_node_iter_sort(iter, b);
+}
+
+/**
+ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * given position
+ *
+ * Main entry point to the lookup code for individual btree nodes:
+ *
+ * NOTE:
+ *
+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
+ * keys. This doesn't matter for most code, but it does matter for lookups.
+ *
+ * Some adjacent keys with a string of equal keys:
+ * i j k k k k l m
+ *
+ * If you search for k, the lookup code isn't guaranteed to return you any
+ * specific k. The lookup code is conceptually doing a binary search and
+ * iterating backwards is very expensive so if the pivot happens to land at the
+ * last k that's what you'll get.
+ *
+ * This works out ok, but it's something to be aware of:
+ *
+ * - For non extents, we guarantee that the live key comes last - see
+ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
+ * see will only be deleted keys you don't care about.
+ *
+ * - For extents, deleted keys sort last (see the comment at the top of this
+ * file). But when you're searching for extents, you actually want the first
+ * key strictly greater than your search key - an extent that compares equal
+ * to the search key is going to have 0 sectors after the search key.
+ *
+ * But this does mean that we can't just search for
+ * bkey_successor(start_of_range) to get the first extent that overlaps with
+ * the range we want - if we're unlucky and there's an extent that ends
+ * exactly where we searched, then there could be a deleted key at the same
+ * position and we'd get that when we search instead of the preceding extent
+ * we needed.
+ *
+ * So we've got to search for start_of_range, then after the lookup iterate
+ * past any extents that compare equal to the position we searched for.
+ */
+void bch_btree_node_iter_init(struct btree_node_iter *iter,
+ struct btree *b, struct bpos search,
+ bool strictly_greater, bool is_extents)
+{
+ struct bset_tree *t;
+ struct bkey_packed p, *packed_search = NULL;
+
+ EBUG_ON(bkey_cmp(search, b->data->min_key) < 0);
+ bset_aux_tree_verify(b);
+
+ __bch_btree_node_iter_init(iter, is_extents);
+
+ //if (bkey_cmp(search, b->curr_max_key) > 0)
+ // return;
+
+ switch (bkey_pack_pos_lossy(&p, search, b)) {
+ case BKEY_PACK_POS_EXACT:
+ packed_search = &p;
+ break;
+ case BKEY_PACK_POS_SMALLER:
+ packed_search = NULL;
+ break;
+ case BKEY_PACK_POS_FAIL:
+ btree_node_iter_init_pack_failed(iter, b, search,
+ strictly_greater, is_extents);
+ return;
+ }
+
+ for_each_bset(b, t)
+ __bch_btree_node_iter_push(iter, b,
+ bch_bset_search(b, t, search,
+ packed_search, &p,
+ strictly_greater),
+ btree_bkey_last(b, t));
+
+ bch_btree_node_iter_sort(iter, b);
+}
+
+void bch_btree_node_iter_init_from_start(struct btree_node_iter *iter,
+ struct btree *b,
+ bool is_extents)
+{
+ struct bset_tree *t;
+
+ __bch_btree_node_iter_init(iter, is_extents);
+
+ for_each_bset(b, t)
+ __bch_btree_node_iter_push(iter, b,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ bch_btree_node_iter_sort(iter, b);
+}
+
+struct bkey_packed *bch_btree_node_iter_bset_pos(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bset_tree *t)
+{
+ struct btree_node_iter_set *set;
+
+ BUG_ON(iter->used > MAX_BSETS);
+
+ btree_node_iter_for_each(iter, set)
+ if (set->end == t->end_offset)
+ return __btree_node_offset_to_key(b, set->k);
+
+ return btree_bkey_last(b, t);
+}
+
+static inline void btree_node_iter_sift(struct btree_node_iter *iter,
+ struct btree *b,
+ unsigned start)
+{
+ unsigned i;
+
+ EBUG_ON(iter->used > MAX_BSETS);
+
+ for (i = start;
+ i + 1 < iter->used &&
+ btree_node_iter_cmp(iter, b, iter->data[i], iter->data[i + 1]) > 0;
+ i++)
+ swap(iter->data[i], iter->data[i + 1]);
+}
+
+static inline void btree_node_iter_sort_two(struct btree_node_iter *iter,
+ struct btree *b,
+ unsigned first)
+{
+ if (btree_node_iter_cmp(iter, b,
+ iter->data[first],
+ iter->data[first + 1]) > 0)
+ swap(iter->data[first], iter->data[first + 1]);
+}
+
+void bch_btree_node_iter_sort(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ EBUG_ON(iter->used > 3);
+
+ /* unrolled bubble sort: */
+
+ if (iter->used > 2) {
+ btree_node_iter_sort_two(iter, b, 0);
+ btree_node_iter_sort_two(iter, b, 1);
+ }
+
+ if (iter->used > 1)
+ btree_node_iter_sort_two(iter, b, 0);
+}
+EXPORT_SYMBOL(bch_btree_node_iter_sort);
+
+/**
+ * bch_btree_node_iter_advance - advance @iter by one key
+ *
+ * Doesn't do debug checks - for cases (e.g. insert_fixup_extent()) where a bset
+ * might momentarily have out of order extents.
+ */
+void bch_btree_node_iter_advance(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bkey_packed *k = bch_btree_node_iter_peek_all(iter, b);
+
+ iter->data->k += __bch_btree_node_iter_peek_all(iter, b)->u64s;
+
+ BUG_ON(iter->data->k > iter->data->end);
+
+ if (iter->data->k == iter->data->end) {
+ BUG_ON(iter->used == 0);
+ iter->data[0] = iter->data[--iter->used];
+ }
+
+ btree_node_iter_sift(iter, b, 0);
+
+ bch_btree_node_iter_next_check(iter, b, k);
+}
+
+/*
+ * Expensive:
+ */
+struct bkey_packed *bch_btree_node_iter_prev_all(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bkey_packed *k, *prev = NULL;
+ struct btree_node_iter_set *set;
+ struct bset_tree *t;
+ struct bset_tree *prev_t;
+ unsigned end;
+
+ bch_btree_node_iter_verify(iter, b);
+
+ for_each_bset(b, t) {
+ k = bkey_prev_all(b, t,
+ bch_btree_node_iter_bset_pos(iter, b, t));
+ if (k &&
+ (!prev || __btree_node_iter_cmp(iter->is_extents, b,
+ k, prev) > 0)) {
+ prev = k;
+ prev_t = t;
+ }
+ }
+
+ if (!prev)
+ return NULL;
+
+ /*
+ * We're manually memmoving instead of just calling sort() to ensure the
+ * prev we picked ends up in slot 0 - sort won't necessarily put it
+ * there because of duplicate deleted keys:
+ */
+ end = __btree_node_key_to_offset(b, btree_bkey_last(b, prev_t));
+ btree_node_iter_for_each(iter, set)
+ if (set->end == end) {
+ memmove(&iter->data[1],
+ &iter->data[0],
+ (void *) set - (void *) &iter->data[0]);
+ goto out;
+ }
+
+ memmove(&iter->data[1],
+ &iter->data[0],
+ (void *) &iter->data[iter->used] - (void *) &iter->data[0]);
+ iter->used++;
+out:
+ iter->data[0].k = __btree_node_key_to_offset(b, prev);
+ iter->data[0].end = end;
+ return prev;
+}
+
+struct bkey_packed *bch_btree_node_iter_prev(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bkey_packed *k;
+
+ do {
+ k = bch_btree_node_iter_prev_all(iter, b);
+ } while (k && bkey_deleted(k));
+
+ return k;
+}
+
+struct bkey_s_c bch_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bkey *u)
+{
+ struct bkey_packed *k = bch_btree_node_iter_peek(iter, b);
+
+ return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
+}
+EXPORT_SYMBOL(bch_btree_node_iter_peek_unpack);
+
+/* Mergesort */
+
+void bch_btree_keys_stats(struct btree *b, struct bset_stats *stats)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ enum bset_aux_tree_type type = bset_aux_tree_type(t);
+ size_t j;
+
+ stats->sets[type].nr++;
+ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
+ sizeof(u64);
+
+ if (bset_has_ro_aux_tree(t)) {
+ stats->floats += t->size - 1;
+
+ for (j = 1; j < t->size; j++)
+ switch (bkey_float(b, t, j)->exponent) {
+ case BFLOAT_FAILED_UNPACKED:
+ stats->failed_unpacked++;
+ break;
+ case BFLOAT_FAILED_PREV:
+ stats->failed_prev++;
+ break;
+ case BFLOAT_FAILED_OVERFLOW:
+ stats->failed_overflow++;
+ break;
+ }
+ }
+ }
+}
+
+int bch_bkey_print_bfloat(struct btree *b, struct bkey_packed *k,
+ char *buf, size_t size)
+{
+ struct bset_tree *t = bch_bkey_to_bset(b, k);
+ struct bkey_packed *l, *r, *p;
+ struct bkey uk, up;
+ char buf1[200], buf2[200];
+ unsigned j;
+
+ if (!size)
+ return 0;
+
+ if (!bset_has_ro_aux_tree(t))
+ goto out;
+
+ j = __inorder_to_eytzinger(bkey_to_cacheline(b, t, k), t->size, t->extra);
+ if (j &&
+ j < t->size &&
+ k == tree_to_bkey(b, t, j))
+ switch (bkey_float(b, t, j)->exponent) {
+ case BFLOAT_FAILED_UNPACKED:
+ uk = bkey_unpack_key(b, k);
+ return scnprintf(buf, size,
+ " failed unpacked at depth %u\n"
+ "\t%llu:%llu\n",
+ ilog2(j),
+ uk.p.inode, uk.p.offset);
+ case BFLOAT_FAILED_PREV:
+ p = tree_to_prev_bkey(b, t, j);
+ l = is_power_of_2(j)
+ ? btree_bkey_first(b, t)
+ : tree_to_prev_bkey(b, t, j >> ffs(j));
+ r = is_power_of_2(j + 1)
+ ? bkey_prev_all(b, t, btree_bkey_last(b, t))
+ : tree_to_bkey(b, t, j >> (ffz(j) + 1));
+
+ up = bkey_unpack_key(b, p);
+ uk = bkey_unpack_key(b, k);
+ bch_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
+ bch_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
+
+ return scnprintf(buf, size,
+ " failed prev at depth %u\n"
+ "\tkey starts at bit %u but first differing bit at %u\n"
+ "\t%llu:%llu\n"
+ "\t%llu:%llu\n"
+ "\t%s\n"
+ "\t%s\n",
+ ilog2(j),
+ bkey_greatest_differing_bit(b, l, r),
+ bkey_greatest_differing_bit(b, p, k),
+ uk.p.inode, uk.p.offset,
+ up.p.inode, up.p.offset,
+ buf1, buf2);
+ case BFLOAT_FAILED_OVERFLOW:
+ uk = bkey_unpack_key(b, k);
+ return scnprintf(buf, size,
+ " failed overflow at depth %u\n"
+ "\t%llu:%llu\n",
+ ilog2(j),
+ uk.p.inode, uk.p.offset);
+ }
+out:
+ *buf = '\0';
+ return 0;
+}
diff --git a/libbcache/bset.h b/libbcache/bset.h
new file mode 100644
index 0000000..f03e6b8
--- /dev/null
+++ b/libbcache/bset.h
@@ -0,0 +1,628 @@
+#ifndef _BCACHE_BSET_H
+#define _BCACHE_BSET_H
+
+#include <linux/bcache.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "bkey.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "util.h" /* for time_stats */
+
+/*
+ * BKEYS:
+ *
+ * A bkey contains a key, a size field, a variable number of pointers, and some
+ * ancillary flag bits.
+ *
+ * We use two different functions for validating bkeys, bkey_invalid() and
+ * bkey_deleted().
+ *
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
+ * that it also filters out keys of size 0 - these are keys that have been
+ * completely overwritten. It'd be safe to delete these in memory while leaving
+ * them on disk, just unnecessary work - so we filter them out when resorting
+ * instead.
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individual bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyways, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables have in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
+ * returns what its index would be in an inorder traversal, so we only have to
+ * store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that, we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So if we know the key we're looking for is between a and b, and a and b
+ * don't differ higher than bit 50, we don't need to check anything higher
+ * than bit 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking. Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n. The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differed in the bits we needed them to. If they don't we
+ * flag that node, and when doing lookups we fall back to comparing against the
+ * real key. As long as this doesn't happen too often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
+ */
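A minimal standalone sketch of the array-as-binary-search-tree layout described above (toy code, not taken from libbcache; the element values are invented): node j's children sit at 2j and 2j + 1, so descending the tree walks a nearly contiguous array and whole levels of descendants can be prefetched ahead of the comparisons that need them.

#include <stdio.h>

static const unsigned tree[] = {
	0,			/* index 0 unused; the root is index 1 */
	40,			/* level 0 */
	20, 60,			/* level 1 */
	10, 30, 50, 70,		/* level 2 */
};
#define TREE_SIZE (sizeof(tree) / sizeof(tree[0]))

/*
 * Returns the eytzinger index of the last node where we went left, i.e. the
 * smallest element >= search (0 if there is none).  Note that node n's
 * descendants four levels down start at index n << 4, all adjacent in memory,
 * which is why prefetching them up front pays off.
 */
static unsigned eytzinger_search(unsigned search)
{
	unsigned n = 1, ret = 0;

	while (n < TREE_SIZE) {
		if (search <= tree[n]) {
			ret = n;
			n = n * 2;		/* left child */
		} else {
			n = n * 2 + 1;		/* right child */
		}
	}
	return ret;
}

int main(void)
{
	unsigned j = eytzinger_search(45);

	printf("first element >= 45: %u (eytzinger index %u)\n",
	       j ? tree[j] : 0, j);
	return 0;
}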
+
+struct btree_node_iter;
+struct btree_node_iter_set;
+
+enum bset_aux_tree_type {
+ BSET_NO_AUX_TREE,
+ BSET_RO_AUX_TREE,
+ BSET_RW_AUX_TREE,
+};
+
+#define BSET_TREE_NR_TYPES 3
+
+#define BSET_NO_AUX_TREE_VAL (U16_MAX)
+#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
+
+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
+{
+ switch (t->extra) {
+ case BSET_NO_AUX_TREE_VAL:
+ EBUG_ON(t->size);
+ return BSET_NO_AUX_TREE;
+ case BSET_RW_AUX_TREE_VAL:
+ EBUG_ON(!t->size);
+ return BSET_RW_AUX_TREE;
+ default:
+ EBUG_ON(!t->size);
+ return BSET_RO_AUX_TREE;
+ }
+}
+
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ struct bkey dst;
+
+#ifdef HAVE_BCACHE_COMPILED_UNPACK
+ {
+ compiled_unpack_fn unpack_fn = b->aux_data;
+ unpack_fn(&dst, src);
+
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ struct bkey dst2 = __bkey_unpack_key(&b->format, src);
+
+ BUG_ON(memcmp(&dst, &dst2, sizeof(dst)));
+ }
+ }
+#else
+ dst = __bkey_unpack_key(&b->format, src);
+#endif
+ return dst;
+}
+
+/**
+ * bkey_unpack_key -- unpack just the key, not the value
+ */
+static inline struct bkey bkey_unpack_key(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_key_format_checked(b, src)
+ : *packed_to_bkey_c(src);
+}
+
+static inline struct bpos
+bkey_unpack_pos_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHE_COMPILED_UNPACK
+ return bkey_unpack_key_format_checked(b, src).p;
+#else
+ return __bkey_unpack_pos(&b->format, src);
+#endif
+}
+
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_pos_format_checked(b, src)
+ : packed_to_bkey_c(src)->p;
+}
+
+/* Disassembled bkeys */
+
+static inline struct bkey_s_c bkey_disassemble(struct btree *b,
+ const struct bkey_packed *k,
+ struct bkey *u)
+{
+ *u = bkey_unpack_key(b, k);
+
+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
+}
+
+/* non const version: */
+static inline struct bkey_s __bkey_disassemble(struct btree *b,
+ struct bkey_packed *k,
+ struct bkey *u)
+{
+ *u = bkey_unpack_key(b, k);
+
+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
+}
+
+#define for_each_bset(_b, _t) \
+ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+
+extern bool bch_expensive_debug_checks;
+
+static inline bool btree_keys_expensive_checks(struct btree *b)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ return bch_expensive_debug_checks || *b->expensive_debug_checks;
+#else
+ return false;
+#endif
+}
+
+static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
+{
+ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
+}
+
+static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
+{
+ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
+}
+
+static inline void bch_bset_set_no_aux_tree(struct btree *b,
+ struct bset_tree *t)
+{
+ BUG_ON(t < b->set);
+
+ for (; t < b->set + ARRAY_SIZE(b->set); t++) {
+ t->size = 0;
+ t->extra = BSET_NO_AUX_TREE_VAL;
+ t->aux_data_offset = U16_MAX;
+ }
+}
+
+static inline void btree_node_set_format(struct btree *b,
+ struct bkey_format f)
+{
+ int len;
+
+ b->format = f;
+ b->nr_key_bits = bkey_format_key_bits(&f);
+
+ len = bch_compile_bkey_format(&b->format, b->aux_data);
+ BUG_ON(len < 0 || len > U8_MAX);
+
+ b->unpack_fn_len = len;
+
+ bch_bset_set_no_aux_tree(b, b->set);
+}
+
+#define __set_bytes(_i, _u64s) (sizeof(*(_i)) + (_u64s) * sizeof(u64))
+#define set_bytes(_i) __set_bytes(_i, (_i)->u64s)
+
+#define __set_blocks(_i, _u64s, _block_bytes) \
+ DIV_ROUND_UP((size_t) __set_bytes((_i), (_u64s)), (_block_bytes))
+
+#define set_blocks(_i, _block_bytes) \
+ __set_blocks((_i), (_i)->u64s, (_block_bytes))
+
+static inline struct bset *bset_next_set(struct btree *b,
+ unsigned block_bytes)
+{
+ struct bset *i = btree_bset_last(b);
+
+ EBUG_ON(!is_power_of_2(block_bytes));
+
+ return ((void *) i) + round_up(set_bytes(i), block_bytes);
+}
+
+void bch_btree_keys_free(struct btree *);
+int bch_btree_keys_alloc(struct btree *, unsigned, gfp_t);
+void bch_btree_keys_init(struct btree *, bool *);
+
+void bch_bset_init_first(struct btree *, struct bset *);
+void bch_bset_init_next(struct btree *, struct bset *);
+void bch_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
+void bch_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
+ struct bkey_packed *);
+
+void bch_bset_insert(struct btree *, struct btree_node_iter *,
+ struct bkey_packed *, struct bkey_i *, unsigned);
+void bch_bset_delete(struct btree *, struct bkey_packed *, unsigned);
+
+/* Bkey utility code */
+
+/* packed or unpacked */
+static inline int bkey_cmp_p_or_unp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r_packed,
+ struct bpos *r)
+{
+ EBUG_ON(r_packed && !bkey_packed(r_packed));
+
+ if (unlikely(!bkey_packed(l)))
+ return bkey_cmp(packed_to_bkey_c(l)->p, *r);
+
+ if (likely(r_packed))
+ return __bkey_cmp_packed_format_checked(l, r_packed, b);
+
+ return __bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+/* Returns true if @k is after iterator position @pos */
+static inline bool btree_iter_pos_cmp(struct bpos pos, const struct bkey *k,
+ bool strictly_greater)
+{
+ int cmp = bkey_cmp(k->p, pos);
+
+ return cmp > 0 ||
+ (cmp == 0 && !strictly_greater && !bkey_deleted(k));
+}
+
+static inline bool btree_iter_pos_cmp_packed(const struct btree *b,
+ struct bpos *pos,
+ const struct bkey_packed *k,
+ bool strictly_greater)
+{
+ int cmp = bkey_cmp_left_packed(b, k, pos);
+
+ return cmp > 0 ||
+ (cmp == 0 && !strictly_greater && !bkey_deleted(k));
+}
+
+static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b,
+ struct bpos pos,
+ const struct bkey_packed *pos_packed,
+ const struct bkey_packed *k,
+ bool strictly_greater)
+{
+ int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos);
+
+ return cmp > 0 ||
+ (cmp == 0 && !strictly_greater && !bkey_deleted(k));
+}
+
+static inline struct bkey_packed *bset_bkey_idx(struct bset *i, unsigned idx)
+{
+ return bkey_idx(i, idx);
+}
+
+struct bset_tree *bch_bkey_to_bset(struct btree *, struct bkey_packed *);
+struct bkey_packed *bkey_prev_all(struct btree *, struct bset_tree *,
+ struct bkey_packed *);
+struct bkey_packed *bkey_prev(struct btree *, struct bset_tree *,
+ struct bkey_packed *);
+
+enum bch_extent_overlap {
+ BCH_EXTENT_OVERLAP_ALL = 0,
+ BCH_EXTENT_OVERLAP_BACK = 1,
+ BCH_EXTENT_OVERLAP_FRONT = 2,
+ BCH_EXTENT_OVERLAP_MIDDLE = 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch_extent_overlap(const struct bkey *k,
+ const struct bkey *m)
+{
+ int cmp1 = bkey_cmp(k->p, m->p) < 0;
+ int cmp2 = bkey_cmp(bkey_start_pos(k),
+ bkey_start_pos(m)) > 0;
+
+ return (cmp1 << 1) + cmp2;
+}
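The 2-bit encoding above, replayed with plain [start, end) integer ranges in a standalone toy (the helper name and the sample ranges are invented for illustration):

#include <stdio.h>

/* same values as the enum above */
enum toy_overlap { OVL_ALL = 0, OVL_BACK = 1, OVL_FRONT = 2, OVL_MIDDLE = 3 };

static enum toy_overlap toy_overlap(unsigned k_start, unsigned k_end,
				    unsigned m_start, unsigned m_end)
{
	int cmp1 = k_end < m_end;	/* k ends before m does */
	int cmp2 = k_start > m_start;	/* k starts after m does */

	return (cmp1 << 1) + cmp2;
}

int main(void)
{
	/* m = [10, 20) in every case */
	printf("%d\n", toy_overlap( 8, 25, 10, 20));	/* 0: k covers all of m */
	printf("%d\n", toy_overlap(15, 25, 10, 20));	/* 1: k overlaps the back of m */
	printf("%d\n", toy_overlap( 8, 15, 10, 20));	/* 2: k overlaps the front of m */
	printf("%d\n", toy_overlap(12, 18, 10, 20));	/* 3: k overlaps the middle of m */
	return 0;
}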
+
+/* Btree key iteration */
+
+struct btree_node_iter {
+ u8 is_extents;
+ u16 used;
+
+ struct btree_node_iter_set {
+ u16 k, end;
+ } data[MAX_BSETS];
+};
+
+static inline void __bch_btree_node_iter_init(struct btree_node_iter *iter,
+ bool is_extents)
+{
+ iter->used = 0;
+ iter->is_extents = is_extents;
+}
+
+void bch_btree_node_iter_push(struct btree_node_iter *, struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
+void bch_btree_node_iter_init(struct btree_node_iter *, struct btree *,
+ struct bpos, bool, bool);
+void bch_btree_node_iter_init_from_start(struct btree_node_iter *,
+ struct btree *, bool);
+struct bkey_packed *bch_btree_node_iter_bset_pos(struct btree_node_iter *,
+ struct btree *,
+ struct bset_tree *);
+
+void bch_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
+void bch_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
+
+#define btree_node_iter_for_each(_iter, _set) \
+ for (_set = (_iter)->data; \
+ _set < (_iter)->data + (_iter)->used; \
+ _set++)
+
+static inline bool bch_btree_node_iter_end(struct btree_node_iter *iter)
+{
+ return !iter->used;
+}
+
+static inline int __btree_node_iter_cmp(bool is_extents,
+ struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ /*
+ * For non extents, when keys compare equal the deleted keys have to
+ * come first - so that bch_btree_node_iter_next_check() can detect
+ * duplicate nondeleted keys (and possibly other reasons?)
+ *
+ * For extents, bkey_deleted() is used as a proxy for k->size == 0, so
+ * deleted keys have to sort last.
+ */
+ return bkey_cmp_packed(b, l, r) ?: is_extents
+ ? (int) bkey_deleted(l) - (int) bkey_deleted(r)
+ : (int) bkey_deleted(r) - (int) bkey_deleted(l);
+}
+
+static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
+ struct btree *b,
+ struct btree_node_iter_set l,
+ struct btree_node_iter_set r)
+{
+ return __btree_node_iter_cmp(iter->is_extents, b,
+ __btree_node_offset_to_key(b, l.k),
+ __btree_node_offset_to_key(b, r.k));
+}
+
+static inline void __bch_btree_node_iter_push(struct btree_node_iter *iter,
+ struct btree *b,
+ const struct bkey_packed *k,
+ const struct bkey_packed *end)
+{
+ if (k != end)
+ iter->data[iter->used++] = (struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k),
+ __btree_node_key_to_offset(b, end)
+ };
+}
+
+static inline struct bkey_packed *
+__bch_btree_node_iter_peek_all(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ return __btree_node_offset_to_key(b, iter->data->k);
+}
+
+static inline struct bkey_packed *
+bch_btree_node_iter_peek_all(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ return bch_btree_node_iter_end(iter)
+ ? NULL
+ : __bch_btree_node_iter_peek_all(iter, b);
+}
+
+static inline struct bkey_packed *
+bch_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
+{
+ struct bkey_packed *ret;
+
+ while ((ret = bch_btree_node_iter_peek_all(iter, b)) &&
+ bkey_deleted(ret))
+ bch_btree_node_iter_advance(iter, b);
+
+ return ret;
+}
+
+static inline struct bkey_packed *
+bch_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
+{
+ struct bkey_packed *ret = bch_btree_node_iter_peek_all(iter, b);
+
+ if (ret)
+ bch_btree_node_iter_advance(iter, b);
+
+ return ret;
+}
+
+struct bkey_packed *bch_btree_node_iter_prev_all(struct btree_node_iter *,
+ struct btree *);
+struct bkey_packed *bch_btree_node_iter_prev(struct btree_node_iter *,
+ struct btree *);
+
+/*
+ * Iterates over all _live_ keys - skipping deleted (and potentially
+ * overlapping) keys
+ */
+#define for_each_btree_node_key(b, k, iter, _is_extents) \
+ for (bch_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
+ ((k) = bch_btree_node_iter_peek(iter, b)); \
+ bch_btree_node_iter_advance(iter, b))
+
+struct bkey_s_c bch_btree_node_iter_peek_unpack(struct btree_node_iter *,
+ struct btree *,
+ struct bkey *);
+
+#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\
+ for (bch_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
+ (k = bch_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
+ bch_btree_node_iter_advance(iter, b))
+
+/* Accounting: */
+
+static inline void btree_keys_account_key(struct btree_nr_keys *n,
+ unsigned bset,
+ struct bkey_packed *k,
+ int sign)
+{
+ n->live_u64s += k->u64s * sign;
+ n->bset_u64s[bset] += k->u64s * sign;
+
+ if (bkey_packed(k))
+ n->packed_keys += sign;
+ else
+ n->unpacked_keys += sign;
+}
+
+#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
+ btree_keys_account_key(_nr, _bset_idx, _k, 1)
+#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
+ btree_keys_account_key(_nr, _bset_idx, _k, -1)
+
+struct bset_stats {
+ struct {
+ size_t nr, bytes;
+ } sets[BSET_TREE_NR_TYPES];
+
+ size_t floats;
+ size_t failed_unpacked;
+ size_t failed_prev;
+ size_t failed_overflow;
+};
+
+void bch_btree_keys_stats(struct btree *, struct bset_stats *);
+int bch_bkey_print_bfloat(struct btree *, struct bkey_packed *,
+ char *, size_t);
+
+/* Debug stuff */
+
+void bch_dump_bset(struct btree *, struct bset *, unsigned);
+void bch_dump_btree_node(struct btree *);
+void bch_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+void __bch_verify_btree_nr_keys(struct btree *);
+void bch_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
+void bch_verify_key_order(struct btree *, struct btree_node_iter *,
+ struct bkey_packed *);
+
+#else
+
+static inline void __bch_verify_btree_nr_keys(struct btree *b) {}
+static inline void bch_btree_node_iter_verify(struct btree_node_iter *iter,
+ struct btree *b) {}
+static inline void bch_verify_key_order(struct btree *b,
+ struct btree_node_iter *iter,
+ struct bkey_packed *where) {}
+#endif
+
+static inline void bch_verify_btree_nr_keys(struct btree *b)
+{
+ if (btree_keys_expensive_checks(b))
+ __bch_verify_btree_nr_keys(b);
+}
+
+#endif
diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c
new file mode 100644
index 0000000..0994190
--- /dev/null
+++ b/libbcache/btree_cache.c
@@ -0,0 +1,701 @@
+
+#include "bcache.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+#include "extents.h"
+
+#include <trace/events/bcache.h>
+
+#define DEF_BTREE_ID(kwd, val, name) name,
+
+const char *bch_btree_id_names[BTREE_ID_NR] = {
+ DEFINE_BCH_BTREE_IDS()
+};
+
+#undef DEF_BTREE_ID
+
+void bch_recalc_btree_reserve(struct cache_set *c)
+{
+ unsigned i, reserve = 16;
+
+ if (!c->btree_roots[0].b)
+ reserve += 8;
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ if (c->btree_roots[i].b)
+ reserve += min_t(unsigned, 1,
+ c->btree_roots[i].b->level) * 8;
+
+ c->btree_cache_reserve = reserve;
+}
+
+#define mca_can_free(c) \
+ max_t(int, 0, c->btree_cache_used - c->btree_cache_reserve)
+
+static void __mca_data_free(struct cache_set *c, struct btree *b)
+{
+ EBUG_ON(btree_node_write_in_flight(b));
+
+ free_pages((unsigned long) b->data, btree_page_order(c));
+ b->data = NULL;
+ bch_btree_keys_free(b);
+}
+
+static void mca_data_free(struct cache_set *c, struct btree *b)
+{
+ __mca_data_free(c, b);
+ c->btree_cache_used--;
+ list_move(&b->list, &c->btree_cache_freed);
+}
+
+#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0])
+
+static const struct rhashtable_params bch_btree_cache_params = {
+ .head_offset = offsetof(struct btree, hash),
+ .key_offset = offsetof(struct btree, key.v),
+ .key_len = sizeof(struct bch_extent_ptr),
+};
+
+static void mca_data_alloc(struct cache_set *c, struct btree *b, gfp_t gfp)
+{
+ unsigned order = ilog2(btree_pages(c));
+
+ b->data = (void *) __get_free_pages(gfp, order);
+ if (!b->data)
+ goto err;
+
+ if (bch_btree_keys_alloc(b, order, gfp))
+ goto err;
+
+ c->btree_cache_used++;
+ list_move(&b->list, &c->btree_cache_freeable);
+ return;
+err:
+ free_pages((unsigned long) b->data, order);
+ b->data = NULL;
+ list_move(&b->list, &c->btree_cache_freed);
+}
+
+static struct btree *mca_bucket_alloc(struct cache_set *c, gfp_t gfp)
+{
+ struct btree *b = kzalloc(sizeof(struct btree), gfp);
+ if (!b)
+ return NULL;
+
+ six_lock_init(&b->lock);
+ INIT_LIST_HEAD(&b->list);
+ INIT_LIST_HEAD(&b->write_blocked);
+
+ mca_data_alloc(c, b, gfp);
+ return b->data ? b : NULL;
+}
+
+/* Btree in memory cache - hash table */
+
+void mca_hash_remove(struct cache_set *c, struct btree *b)
+{
+ BUG_ON(btree_node_dirty(b));
+
+ b->nsets = 0;
+
+ rhashtable_remove_fast(&c->btree_cache_table, &b->hash,
+ bch_btree_cache_params);
+
+ /* Cause future lookups for this node to fail: */
+ bkey_i_to_extent(&b->key)->v._data[0] = 0;
+}
+
+int mca_hash_insert(struct cache_set *c, struct btree *b,
+ unsigned level, enum btree_id id)
+{
+ int ret;
+ b->level = level;
+ b->btree_id = id;
+
+ ret = rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash,
+ bch_btree_cache_params);
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->btree_cache_lock);
+ list_add(&b->list, &c->btree_cache);
+ mutex_unlock(&c->btree_cache_lock);
+
+ return 0;
+}
+
+__flatten
+static inline struct btree *mca_find(struct cache_set *c,
+ const struct bkey_i *k)
+{
+ return rhashtable_lookup_fast(&c->btree_cache_table, &PTR_HASH(k),
+ bch_btree_cache_params);
+}
+
+/*
+ * this version is for btree nodes that have already been freed (we're not
+ * reaping a real btree node)
+ */
+static int mca_reap_notrace(struct cache_set *c, struct btree *b, bool flush)
+{
+ lockdep_assert_held(&c->btree_cache_lock);
+
+ if (!six_trylock_intent(&b->lock))
+ return -ENOMEM;
+
+ if (!six_trylock_write(&b->lock))
+ goto out_unlock_intent;
+
+ if (btree_node_write_error(b))
+ goto out_unlock;
+
+ if (!list_empty(&b->write_blocked))
+ goto out_unlock;
+
+ if (!flush &&
+ (btree_node_dirty(b) ||
+ btree_node_write_in_flight(b)))
+ goto out_unlock;
+
+ /*
+ * Using the underscore version because we don't want to compact bsets
+ * after the write, since this node is about to be evicted - unless
+ * btree verify mode is enabled, since it runs out of the post write
+ * cleanup:
+ */
+ if (btree_node_dirty(b)) {
+ if (verify_btree_ondisk(c))
+ bch_btree_node_write(c, b, NULL, SIX_LOCK_intent, -1);
+ else
+ __bch_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
+ }
+
+ /* wait for any in flight btree write */
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+
+ return 0;
+out_unlock:
+ six_unlock_write(&b->lock);
+out_unlock_intent:
+ six_unlock_intent(&b->lock);
+ return -ENOMEM;
+}
+
+static int mca_reap(struct cache_set *c, struct btree *b, bool flush)
+{
+ int ret = mca_reap_notrace(c, b, flush);
+
+ trace_bcache_mca_reap(c, b, ret);
+ return ret;
+}
+
+static unsigned long bch_mca_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct cache_set *c = container_of(shrink, struct cache_set,
+ btree_cache_shrink);
+ struct btree *b, *t;
+ unsigned long nr = sc->nr_to_scan;
+ unsigned long can_free;
+ unsigned long touched = 0;
+ unsigned long freed = 0;
+ unsigned i;
+
+ u64 start_time = local_clock();
+
+ if (btree_shrinker_disabled(c))
+ return SHRINK_STOP;
+
+ if (c->btree_cache_alloc_lock)
+ return SHRINK_STOP;
+
+ /* Return -1 if we can't do anything right now */
+ if (sc->gfp_mask & __GFP_IO)
+ mutex_lock(&c->btree_cache_lock);
+ else if (!mutex_trylock(&c->btree_cache_lock))
+ return -1;
+
+ /*
+ * It's _really_ critical that we don't free too many btree nodes - we
+ * have to always leave ourselves a reserve. The reserve is how we
+ * guarantee that allocating memory for a new btree node can always
+ * succeed, so that inserting keys into the btree can always succeed and
+ * IO can always make forward progress:
+ */
+ nr /= btree_pages(c);
+ can_free = mca_can_free(c);
+ nr = min_t(unsigned long, nr, can_free);
+
+ i = 0;
+ list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+ touched++;
+
+ if (freed >= nr)
+ break;
+
+ if (++i > 3 &&
+ !mca_reap_notrace(c, b, false)) {
+ mca_data_free(c, b);
+ six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
+ freed++;
+ }
+ }
+restart:
+ list_for_each_entry_safe(b, t, &c->btree_cache, list) {
+ touched++;
+
+ if (freed >= nr) {
+ /* Save position */
+ if (&t->list != &c->btree_cache)
+ list_move_tail(&c->btree_cache, &t->list);
+ break;
+ }
+
+ if (!btree_node_accessed(b) &&
+ !mca_reap(c, b, false)) {
+ /* can't call mca_hash_remove under btree_cache_lock */
+ freed++;
+ if (&t->list != &c->btree_cache)
+ list_move_tail(&c->btree_cache, &t->list);
+
+ mca_data_free(c, b);
+ mutex_unlock(&c->btree_cache_lock);
+
+ mca_hash_remove(c, b);
+ six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
+
+ if (freed >= nr)
+ goto out;
+
+ if (sc->gfp_mask & __GFP_IO)
+ mutex_lock(&c->btree_cache_lock);
+ else if (!mutex_trylock(&c->btree_cache_lock))
+ goto out;
+ goto restart;
+ } else
+ clear_btree_node_accessed(b);
+ }
+
+ mutex_unlock(&c->btree_cache_lock);
+out:
+ bch_time_stats_update(&c->mca_scan_time, start_time);
+
+ trace_bcache_mca_scan(c,
+ touched * btree_pages(c),
+ freed * btree_pages(c),
+ can_free * btree_pages(c),
+ sc->nr_to_scan);
+
+ return (unsigned long) freed * btree_pages(c);
+}
+
+static unsigned long bch_mca_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct cache_set *c = container_of(shrink, struct cache_set,
+ btree_cache_shrink);
+
+ if (btree_shrinker_disabled(c))
+ return 0;
+
+ if (c->btree_cache_alloc_lock)
+ return 0;
+
+ return mca_can_free(c) * btree_pages(c);
+}
+
+void bch_btree_cache_free(struct cache_set *c)
+{
+ struct btree *b;
+ unsigned i;
+
+ if (c->btree_cache_shrink.list.next)
+ unregister_shrinker(&c->btree_cache_shrink);
+
+ mutex_lock(&c->btree_cache_lock);
+
+#ifdef CONFIG_BCACHE_DEBUG
+ if (c->verify_data)
+ list_move(&c->verify_data->list, &c->btree_cache);
+
+ free_pages((unsigned long) c->verify_ondisk, ilog2(btree_pages(c)));
+#endif
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ if (c->btree_roots[i].b)
+ list_add(&c->btree_roots[i].b->list, &c->btree_cache);
+
+ list_splice(&c->btree_cache_freeable,
+ &c->btree_cache);
+
+ while (!list_empty(&c->btree_cache)) {
+ b = list_first_entry(&c->btree_cache, struct btree, list);
+
+ if (btree_node_dirty(b))
+ bch_btree_complete_write(c, b, btree_current_write(b));
+ clear_btree_node_dirty(b);
+
+ mca_data_free(c, b);
+ }
+
+ while (!list_empty(&c->btree_cache_freed)) {
+ b = list_first_entry(&c->btree_cache_freed,
+ struct btree, list);
+ list_del(&b->list);
+ kfree(b);
+ }
+
+ mutex_unlock(&c->btree_cache_lock);
+
+ if (c->btree_cache_table_init_done)
+ rhashtable_destroy(&c->btree_cache_table);
+}
+
+int bch_btree_cache_alloc(struct cache_set *c)
+{
+ unsigned i;
+ int ret;
+
+ ret = rhashtable_init(&c->btree_cache_table, &bch_btree_cache_params);
+ if (ret)
+ return ret;
+
+ c->btree_cache_table_init_done = true;
+
+ bch_recalc_btree_reserve(c);
+
+ for (i = 0; i < c->btree_cache_reserve; i++)
+ if (!mca_bucket_alloc(c, GFP_KERNEL))
+ return -ENOMEM;
+
+ list_splice_init(&c->btree_cache,
+ &c->btree_cache_freeable);
+
+#ifdef CONFIG_BCACHE_DEBUG
+ mutex_init(&c->verify_lock);
+
+ c->verify_ondisk = (void *)
+ __get_free_pages(GFP_KERNEL, ilog2(btree_pages(c)));
+ if (!c->verify_ondisk)
+ return -ENOMEM;
+
+ c->verify_data = mca_bucket_alloc(c, GFP_KERNEL);
+ if (!c->verify_data)
+ return -ENOMEM;
+
+ list_del_init(&c->verify_data->list);
+#endif
+
+ c->btree_cache_shrink.count_objects = bch_mca_count;
+ c->btree_cache_shrink.scan_objects = bch_mca_scan;
+ c->btree_cache_shrink.seeks = 4;
+ c->btree_cache_shrink.batch = btree_pages(c) * 2;
+ register_shrinker(&c->btree_cache_shrink);
+
+ return 0;
+}
+
+/*
+ * We can only have one thread cannibalizing other cached btree nodes at a time,
+ * or we'll deadlock. We use an open coded mutex to ensure that, which
+ * mca_cannibalize_lock() takes. This means every time we unlock the root of
+ * the btree, we need to release this lock if we have it held.
+ */
+void mca_cannibalize_unlock(struct cache_set *c)
+{
+ if (c->btree_cache_alloc_lock == current) {
+ trace_bcache_mca_cannibalize_unlock(c);
+ c->btree_cache_alloc_lock = NULL;
+ closure_wake_up(&c->mca_wait);
+ }
+}
+
+int mca_cannibalize_lock(struct cache_set *c, struct closure *cl)
+{
+ struct task_struct *old;
+
+ old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+ if (old == NULL || old == current)
+ goto success;
+
+ if (!cl) {
+ trace_bcache_mca_cannibalize_lock_fail(c);
+ return -ENOMEM;
+ }
+
+ closure_wait(&c->mca_wait, cl);
+
+ /* Try again, after adding ourselves to waitlist */
+ old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+ if (old == NULL || old == current) {
+ /* We raced */
+ closure_wake_up(&c->mca_wait);
+ goto success;
+ }
+
+ trace_bcache_mca_cannibalize_lock_fail(c);
+ return -EAGAIN;
+
+success:
+ trace_bcache_mca_cannibalize_lock(c);
+ return 0;
+}
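+
+/*
+ * Sketch of the expected calling convention (hypothetical caller, assuming a
+ * closure @cl to wait on): if mca_alloc() fails with -ENOMEM, take the
+ * cannibalize lock and retry - mca_alloc() will then steal a node via
+ * mca_cannibalize() - and drop the lock once the allocation is no longer
+ * needed:
+ *
+ *	b = mca_alloc(c);
+ *	if (b == ERR_PTR(-ENOMEM) &&
+ *	    !mca_cannibalize_lock(c, cl))
+ *		b = mca_alloc(c);
+ *	...
+ *	mca_cannibalize_unlock(c);
+ */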
+
+static struct btree *mca_cannibalize(struct cache_set *c)
+{
+ struct btree *b;
+
+ list_for_each_entry_reverse(b, &c->btree_cache, list)
+ if (!mca_reap(c, b, false))
+ return b;
+
+ while (1) {
+ list_for_each_entry_reverse(b, &c->btree_cache, list)
+ if (!mca_reap(c, b, true))
+ return b;
+
+ /*
+ * Rare case: all nodes were intent-locked.
+ * Just busy-wait.
+ */
+ WARN_ONCE(1, "btree cache cannibalize failed\n");
+ cond_resched();
+ }
+}
+
+struct btree *mca_alloc(struct cache_set *c)
+{
+ struct btree *b;
+ u64 start_time = local_clock();
+
+ mutex_lock(&c->btree_cache_lock);
+
+ /*
+ * btree_free() doesn't free memory; it sticks the node on the end of
+ * the list. Check if there are any freed nodes there:
+ */
+ list_for_each_entry(b, &c->btree_cache_freeable, list)
+ if (!mca_reap_notrace(c, b, false))
+ goto out_unlock;
+
+ /*
+ * We never free struct btree itself, just the memory that holds the on
+ * disk node. Check the freed list before allocating a new one:
+ */
+ list_for_each_entry(b, &c->btree_cache_freed, list)
+ if (!mca_reap_notrace(c, b, false)) {
+ mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
+ if (b->data)
+ goto out_unlock;
+
+ six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
+ goto err;
+ }
+
+ b = mca_bucket_alloc(c, __GFP_NOWARN|GFP_NOIO);
+ if (!b)
+ goto err;
+
+ BUG_ON(!six_trylock_intent(&b->lock));
+ BUG_ON(!six_trylock_write(&b->lock));
+out_unlock:
+ BUG_ON(bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key));
+ BUG_ON(btree_node_write_in_flight(b));
+
+ list_del_init(&b->list);
+ mutex_unlock(&c->btree_cache_lock);
+out:
+ b->flags = 0;
+ b->written = 0;
+ b->nsets = 0;
+ b->sib_u64s[0] = 0;
+ b->sib_u64s[1] = 0;
+ b->whiteout_u64s = 0;
+ b->uncompacted_whiteout_u64s = 0;
+ bch_btree_keys_init(b, &c->expensive_debug_checks);
+
+ bch_time_stats_update(&c->mca_alloc_time, start_time);
+
+ return b;
+err:
+ /* Try to cannibalize another cached btree node: */
+ if (c->btree_cache_alloc_lock == current) {
+ b = mca_cannibalize(c);
+ list_del_init(&b->list);
+ mutex_unlock(&c->btree_cache_lock);
+
+ mca_hash_remove(c, b);
+
+ trace_bcache_mca_cannibalize(c);
+ goto out;
+ }
+
+ mutex_unlock(&c->btree_cache_lock);
+ return ERR_PTR(-ENOMEM);
+}
+
+/* Slowpath, don't want it inlined into btree_iter_traverse() */
+static noinline struct btree *bch_btree_node_fill(struct btree_iter *iter,
+ const struct bkey_i *k,
+ unsigned level,
+ enum six_lock_type lock_type)
+{
+ struct cache_set *c = iter->c;
+ struct btree *b;
+
+ b = mca_alloc(c);
+ if (IS_ERR(b))
+ return b;
+
+ bkey_copy(&b->key, k);
+ if (mca_hash_insert(c, b, level, iter->btree_id)) {
+ /* raced with another fill: */
+
+ /* mark as unhashed... */
+ bkey_i_to_extent(&b->key)->v._data[0] = 0;
+
+ mutex_lock(&c->btree_cache_lock);
+ list_add(&b->list, &c->btree_cache_freeable);
+ mutex_unlock(&c->btree_cache_lock);
+
+ six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
+ return NULL;
+ }
+
+ /*
+ * If the btree node wasn't cached, we can't drop our lock on
+ * the parent until after it's added to the cache - because
+ * otherwise we could race with a btree_split() freeing the node
+ * we're trying to lock.
+ *
+ * But the deadlock described below doesn't exist in this case,
+ * so it's safe to not drop the parent lock until here:
+ */
+ if (btree_node_read_locked(iter, level + 1))
+ btree_node_unlock(iter, level + 1);
+
+ bch_btree_node_read(c, b);
+ six_unlock_write(&b->lock);
+
+ if (lock_type == SIX_LOCK_read)
+ six_lock_downgrade(&b->lock);
+
+ return b;
+}
+
+/**
+ * bch_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * If IO is necessary and running under generic_make_request, returns -EAGAIN.
+ *
+ * The btree node will have either a read or an intent lock held, as specified
+ * by @lock_type.
+ */
+struct btree *bch_btree_node_get(struct btree_iter *iter,
+ const struct bkey_i *k, unsigned level,
+ enum six_lock_type lock_type)
+{
+ struct btree *b;
+ struct bset_tree *t;
+
+ BUG_ON(level >= BTREE_MAX_DEPTH);
+retry:
+ rcu_read_lock();
+ b = mca_find(iter->c, k);
+ rcu_read_unlock();
+
+ if (unlikely(!b)) {
+ /*
+ * We must have the parent locked to call bch_btree_node_fill(),
+ * else we could read in a btree node from disk that's been
+ * freed:
+ */
+ b = bch_btree_node_fill(iter, k, level, lock_type);
+
+ /* We raced and found the btree node in the cache */
+ if (!b)
+ goto retry;
+
+ if (IS_ERR(b))
+ return b;
+ } else {
+ /*
+ * There's a potential deadlock with splits and insertions into
+ * interior nodes we have to avoid:
+ *
+ * The other thread might be holding an intent lock on the node
+ * we want, and they want to update its parent node so they're
+ * going to upgrade their intent lock on the parent node to a
+ * write lock.
+ *
+ * But if we're holding a read lock on the parent, and we're
+ * trying to get the intent lock they're holding, we deadlock.
+ *
+ * So to avoid this we drop the read locks on parent nodes when
+ * we're starting to take intent locks - and handle the race.
+ *
+ * The race is that they might be about to free the node we
+ * want, and dropping our read lock on the parent node lets them
+ * update the parent marking the node we want as freed, and then
+ * free it:
+ *
+ * To guard against this, btree nodes are evicted from the cache
+ * when they're freed - and PTR_HASH() is zeroed out, which we
+ * check for after we lock the node.
+ *
+ * Then, btree_node_relock() on the parent will fail - because
+ * the parent was modified, when the pointer to the node we want
+ * was removed - and we'll bail out:
+ */
+ if (btree_node_read_locked(iter, level + 1))
+ btree_node_unlock(iter, level + 1);
+
+ if (!btree_node_lock(b, k->k.p, level, iter, lock_type))
+ return ERR_PTR(-EINTR);
+
+ if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
+ b->level != level ||
+ race_fault())) {
+ six_unlock_type(&b->lock, lock_type);
+ if (btree_node_relock(iter, level + 1))
+ goto retry;
+
+ return ERR_PTR(-EINTR);
+ }
+ }
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_type(&b->lock, lock_type);
+ return ERR_PTR(-EIO);
+ }
+
+ EBUG_ON(!b->written);
+ EBUG_ON(b->btree_id != iter->btree_id ||
+ BSET_BTREE_LEVEL(&b->data->keys) != level ||
+ bkey_cmp(b->data->max_key, k->k.p));
+
+ return b;
+}
diff --git a/libbcache/btree_cache.h b/libbcache/btree_cache.h
new file mode 100644
index 0000000..e745abb
--- /dev/null
+++ b/libbcache/btree_cache.h
@@ -0,0 +1,61 @@
+#ifndef _BCACHE_BTREE_CACHE_H
+#define _BCACHE_BTREE_CACHE_H
+
+#include "bcache.h"
+#include "btree_types.h"
+
+struct btree_iter;
+
+extern const char *bch_btree_id_names[BTREE_ID_NR];
+
+void bch_recalc_btree_reserve(struct cache_set *);
+
+void mca_hash_remove(struct cache_set *, struct btree *);
+int mca_hash_insert(struct cache_set *, struct btree *,
+ unsigned, enum btree_id);
+
+void mca_cannibalize_unlock(struct cache_set *);
+int mca_cannibalize_lock(struct cache_set *, struct closure *);
+
+struct btree *mca_alloc(struct cache_set *);
+
+struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *,
+ unsigned, enum six_lock_type);
+
+void bch_btree_cache_free(struct cache_set *);
+int bch_btree_cache_alloc(struct cache_set *);
+
+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
+ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \
+ &(_c)->btree_cache_table), \
+ _iter = 0; _iter < (_tbl)->size; _iter++) \
+ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
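+
+/*
+ * Example use (illustrative sketch; the local variable names are assumptions):
+ *
+ *	struct bucket_table *tbl;
+ *	struct rhash_head *pos;
+ *	struct btree *b;
+ *	unsigned i, nr_cached = 0;
+ *
+ *	rcu_read_lock();
+ *	for_each_cached_btree(b, c, tbl, i, pos)
+ *		nr_cached++;
+ *	rcu_read_unlock();
+ */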
+
+static inline size_t btree_bytes(struct cache_set *c)
+{
+ return c->sb.btree_node_size << 9;
+}
+
+static inline size_t btree_max_u64s(struct cache_set *c)
+{
+ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+}
+
+static inline size_t btree_pages(struct cache_set *c)
+{
+ return c->sb.btree_node_size >> (PAGE_SHIFT - 9);
+}
+
+static inline size_t btree_page_order(struct cache_set *c)
+{
+ return ilog2(btree_pages(c));
+}
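+
+/*
+ * Worked example (assuming 4k pages, i.e. PAGE_SHIFT == 12): with 256k btree
+ * nodes, sb.btree_node_size is 512 (it's in 512 byte sectors), so
+ * btree_bytes() == 256k, btree_pages() == 64 and btree_page_order() == 6.
+ */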
+
+static inline unsigned btree_blocks(struct cache_set *c)
+{
+ return c->sb.btree_node_size >> c->block_bits;
+}
+
+#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b)
+
+#endif /* _BCACHE_BTREE_CACHE_H */
diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c
new file mode 100644
index 0000000..8417187
--- /dev/null
+++ b/libbcache/btree_gc.c
@@ -0,0 +1,898 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright (C) 2014 Datera Inc.
+ */
+
+#include "bcache.h"
+#include "alloc.h"
+#include "bkey_methods.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_io.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "writeback.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/rcupdate.h>
+#include <trace/events/bcache.h>
+
+struct range_checks {
+ struct range_level {
+ struct bpos min;
+ struct bpos max;
+ } l[BTREE_MAX_DEPTH];
+ unsigned depth;
+};
+
+static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
+{
+ unsigned i;
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
+ r->l[i].min = r->l[i].max = POS_MIN;
+ r->depth = depth;
+}
+
+static void btree_node_range_checks(struct cache_set *c, struct btree *b,
+ struct range_checks *r)
+{
+ struct range_level *l = &r->l[b->level];
+
+ struct bpos expected_min = bkey_cmp(l->min, l->max)
+ ? btree_type_successor(b->btree_id, l->max)
+ : l->max;
+
+ cache_set_inconsistent_on(bkey_cmp(b->data->min_key,
+ expected_min), c,
+ "btree node has incorrect min key: %llu:%llu != %llu:%llu",
+ b->data->min_key.inode,
+ b->data->min_key.offset,
+ expected_min.inode,
+ expected_min.offset);
+
+ l->max = b->data->max_key;
+
+ if (b->level > r->depth) {
+ l = &r->l[b->level - 1];
+
+ cache_set_inconsistent_on(bkey_cmp(b->data->min_key,
+ l->min), c,
+ "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
+ b->data->min_key.inode,
+ b->data->min_key.offset,
+ l->min.inode,
+ l->min.offset);
+
+ cache_set_inconsistent_on(bkey_cmp(b->data->max_key,
+ l->max), c,
+ "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
+ b->data->max_key.inode,
+ b->data->max_key.offset,
+ l->max.inode,
+ l->max.offset);
+
+ if (bkey_cmp(b->data->max_key, POS_MAX))
+ l->min = l->max =
+ btree_type_successor(b->btree_id,
+ b->data->max_key);
+ }
+}
+
+u8 bch_btree_key_recalc_oldest_gen(struct cache_set *c, struct bkey_s_c k)
+{
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+ u8 max_stale = 0;
+
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+
+ rcu_read_lock();
+
+ extent_for_each_online_device(c, e, ptr, ca) {
+ size_t b = PTR_BUCKET_NR(ca, ptr);
+
+ if (__gen_after(ca->oldest_gens[b], ptr->gen))
+ ca->oldest_gens[b] = ptr->gen;
+
+ max_stale = max(max_stale, ptr_stale(ca, ptr));
+ }
+
+ rcu_read_unlock();
+ }
+
+ return max_stale;
+}
+
+/*
+ * For runtime mark and sweep:
+ */
+u8 __bch_btree_mark_key(struct cache_set *c, enum bkey_type type,
+ struct bkey_s_c k)
+{
+ switch (type) {
+ case BKEY_TYPE_BTREE:
+ bch_gc_mark_key(c, k, c->sb.btree_node_size, true);
+ return 0;
+ case BKEY_TYPE_EXTENTS:
+ bch_gc_mark_key(c, k, k.k->size, false);
+ return bch_btree_key_recalc_oldest_gen(c, k);
+ default:
+ BUG();
+ }
+}
+
+static u8 btree_mark_key(struct cache_set *c, struct btree *b,
+ struct bkey_s_c k)
+{
+ return __bch_btree_mark_key(c, btree_node_type(b), k);
+}
+
+static bool btree_gc_mark_node(struct cache_set *c, struct btree *b)
+{
+ if (btree_node_has_ptrs(b)) {
+ struct btree_node_iter iter;
+ struct bkey unpacked;
+ struct bkey_s_c k;
+ u8 stale = 0;
+
+ for_each_btree_node_key_unpack(b, k, &iter,
+ btree_node_is_extents(b),
+ &unpacked) {
+ bkey_debugcheck(c, b, k);
+ stale = max(stale, btree_mark_key(c, b, k));
+ }
+
+ if (btree_gc_rewrite_disabled(c))
+ return false;
+
+ if (stale > 10)
+ return true;
+ }
+
+ if (btree_gc_always_rewrite(c))
+ return true;
+
+ return false;
+}
+
+static inline void __gc_pos_set(struct cache_set *c, struct gc_pos new_pos)
+{
+ write_seqcount_begin(&c->gc_pos_lock);
+ c->gc_pos = new_pos;
+ write_seqcount_end(&c->gc_pos_lock);
+}
+
+static inline void gc_pos_set(struct cache_set *c, struct gc_pos new_pos)
+{
+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
+ __gc_pos_set(c, new_pos);
+}
+
+static int bch_gc_btree(struct cache_set *c, enum btree_id btree_id)
+{
+ struct btree_iter iter;
+ struct btree *b;
+ bool should_rewrite;
+ struct range_checks r;
+ unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
+ int ret;
+
+ /*
+ * if expensive_debug_checks is on, run range_checks on all leaf nodes:
+ */
+ if (expensive_debug_checks(c))
+ depth = 0;
+
+ btree_node_range_checks_init(&r, depth);
+
+ for_each_btree_node(&iter, c, btree_id, POS_MIN, depth, b) {
+ btree_node_range_checks(c, b, &r);
+
+ bch_verify_btree_nr_keys(b);
+
+ should_rewrite = btree_gc_mark_node(c, b);
+
+ gc_pos_set(c, gc_pos_btree_node(b));
+
+ if (should_rewrite)
+ bch_btree_node_rewrite(&iter, b, NULL);
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->btree_root_lock);
+
+ b = c->btree_roots[btree_id].b;
+ __bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key));
+ gc_pos_set(c, gc_pos_btree_root(b->btree_id));
+
+ mutex_unlock(&c->btree_root_lock);
+ return 0;
+}
+
+static void bch_mark_allocator_buckets(struct cache_set *c)
+{
+ struct cache *ca;
+ struct open_bucket *ob;
+ size_t i, j, iter;
+ unsigned ci;
+
+ for_each_cache(ca, c, ci) {
+ spin_lock(&ca->freelist_lock);
+
+ fifo_for_each_entry(i, &ca->free_inc, iter)
+ bch_mark_alloc_bucket(ca, &ca->buckets[i], true);
+
+ for (j = 0; j < RESERVE_NR; j++)
+ fifo_for_each_entry(i, &ca->free[j], iter)
+ bch_mark_alloc_bucket(ca, &ca->buckets[i], true);
+
+ spin_unlock(&ca->freelist_lock);
+ }
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ const struct bch_extent_ptr *ptr;
+
+ mutex_lock(&ob->lock);
+ rcu_read_lock();
+ open_bucket_for_each_online_device(c, ob, ptr, ca)
+ bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), true);
+ rcu_read_unlock();
+ mutex_unlock(&ob->lock);
+ }
+}
+
+/*
+ * Mark non btree metadata - prios, journal
+ */
+static void bch_mark_metadata(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ for_each_cache(ca, c, i) {
+ unsigned j;
+ u64 *i;
+
+ for (j = 0; j < bch_nr_journal_buckets(ca->disk_sb.sb); j++)
+ bch_mark_metadata_bucket(ca,
+ &ca->buckets[journal_bucket(ca->disk_sb.sb, j)],
+ true);
+
+ spin_lock(&ca->prio_buckets_lock);
+
+ for (i = ca->prio_buckets;
+ i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
+ bch_mark_metadata_bucket(ca, &ca->buckets[*i], true);
+
+ spin_unlock(&ca->prio_buckets_lock);
+ }
+}
+
+/* Also see bch_pending_btree_node_free_insert_done() */
+static void bch_mark_pending_btree_node_frees(struct cache_set *c)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+ struct btree_interior_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
+
+ for_each_pending_btree_node_free(c, as, d)
+ if (d->index_update_done)
+ __bch_gc_mark_key(c, bkey_i_to_s_c(&d->key),
+ c->sb.btree_node_size, true,
+ &stats);
+ /*
+ * Don't apply stats - pending deletes aren't tracked in
+ * cache_set_stats:
+ */
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+/**
+ * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
+ */
+void bch_gc(struct cache_set *c)
+{
+ struct cache *ca;
+ struct bucket *g;
+ struct bucket_mark new;
+ u64 start_time = local_clock();
+ unsigned i;
+ int cpu;
+
+ /*
+ * Walk _all_ references to buckets, and recompute them:
+ *
+ * Order matters here:
+ * - Concurrent GC relies on the fact that we have a total ordering for
+ * everything that GC walks - see gc_will_visit_node(),
+ * gc_will_visit_root()
+ *
+ * - also, references move around in the course of index updates and
+ * various other crap: everything needs to agree on the ordering
+ * references are allowed to move around in - e.g., we're allowed to
+ * start with a reference owned by an open_bucket (the allocator) and
+ * move it to the btree, but not the reverse.
+ *
+ * This is necessary to ensure that gc doesn't miss references that
+ * move around - if references move backwards in the ordering GC
+ * uses, GC could skip past them
+ */
+
+ if (test_bit(CACHE_SET_GC_FAILURE, &c->flags))
+ return;
+
+ trace_bcache_gc_start(c);
+
+ /*
+ * Do this before taking gc_lock - bch_disk_reservation_get() blocks on
+ * gc_lock if sectors_available goes to 0:
+ */
+ bch_recalc_sectors_available(c);
+
+ down_write(&c->gc_lock);
+
+ lg_global_lock(&c->bucket_stats_lock);
+
+ /*
+ * Indicates to buckets code that gc is now in progress - done under
+ * bucket_stats_lock to avoid racing with bch_mark_key():
+ */
+ __gc_pos_set(c, GC_POS_MIN);
+
+ /* Save a copy of the existing bucket stats while we recompute them: */
+ for_each_cache(ca, c, i) {
+ ca->bucket_stats_cached = __bch_bucket_stats_read_cache(ca);
+ for_each_possible_cpu(cpu) {
+ struct bucket_stats_cache *p =
+ per_cpu_ptr(ca->bucket_stats_percpu, cpu);
+ memset(p, 0, sizeof(*p));
+ }
+ }
+
+ c->bucket_stats_cached = __bch_bucket_stats_read_cache_set(c);
+ for_each_possible_cpu(cpu) {
+ struct bucket_stats_cache_set *p =
+ per_cpu_ptr(c->bucket_stats_percpu, cpu);
+
+ memset(p->s, 0, sizeof(p->s));
+ p->persistent_reserved = 0;
+ }
+
+ lg_global_unlock(&c->bucket_stats_lock);
+
+ /* Clear bucket marks: */
+ for_each_cache(ca, c, i)
+ for_each_bucket(g, ca) {
+ bucket_cmpxchg(g, new, ({
+ new.owned_by_allocator = 0;
+ new.is_metadata = 0;
+ new.cached_sectors = 0;
+ new.dirty_sectors = 0;
+ }));
+ ca->oldest_gens[g - ca->buckets] = new.gen;
+ }
+
+ /* Walk allocator's references: */
+ bch_mark_allocator_buckets(c);
+
+ /* Walk btree: */
+ while (c->gc_pos.phase < (int) BTREE_ID_NR) {
+ int ret = c->btree_roots[c->gc_pos.phase].b
+ ? bch_gc_btree(c, (int) c->gc_pos.phase)
+ : 0;
+
+ if (ret) {
+ bch_err(c, "btree gc failed: %d", ret);
+ set_bit(CACHE_SET_GC_FAILURE, &c->flags);
+ up_write(&c->gc_lock);
+ return;
+ }
+
+ gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
+ }
+
+ bch_mark_metadata(c);
+ bch_mark_pending_btree_node_frees(c);
+ bch_writeback_recalc_oldest_gens(c);
+
+ for_each_cache(ca, c, i)
+ atomic_long_set(&ca->saturated_count, 0);
+
+ /* Indicates that gc is no longer in progress: */
+ gc_pos_set(c, gc_phase(GC_PHASE_DONE));
+
+ up_write(&c->gc_lock);
+ trace_bcache_gc_end(c);
+ bch_time_stats_update(&c->btree_gc_time, start_time);
+
+ /*
+ * Wake up allocator in case it was waiting for buckets
+ * because of not being able to inc gens
+ */
+ for_each_cache(ca, c, i)
+ bch_wake_allocator(ca);
+}
+
+/* Btree coalescing */
+
+static void recalc_packed_keys(struct btree *b)
+{
+ struct bkey_packed *k;
+
+ memset(&b->nr, 0, sizeof(b->nr));
+
+ BUG_ON(b->nsets != 1);
+
+ for (k = btree_bkey_first(b, b->set);
+ k != btree_bkey_last(b, b->set);
+ k = bkey_next(k))
+ btree_keys_account_key_add(&b->nr, 0, k);
+}
+
+static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
+ struct btree_iter *iter)
+{
+ struct btree *parent = iter->nodes[old_nodes[0]->level + 1];
+ struct cache_set *c = iter->c;
+ unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
+ unsigned blocks = btree_blocks(c) * 2 / 3;
+ struct btree *new_nodes[GC_MERGE_NODES];
+ struct btree_interior_update *as;
+ struct btree_reserve *res;
+ struct keylist keylist;
+ struct bkey_format_state format_state;
+ struct bkey_format new_format;
+
+ memset(new_nodes, 0, sizeof(new_nodes));
+ bch_keylist_init(&keylist, NULL, 0);
+
+ /* Count keys that are not deleted */
+ for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
+ u64s += old_nodes[i]->nr.live_u64s;
+
+ nr_old_nodes = nr_new_nodes = i;
+
+ /* Check if all keys in @old_nodes could fit in one fewer node */
+ if (nr_old_nodes <= 1 ||
+ __set_blocks(old_nodes[0]->data,
+ DIV_ROUND_UP(u64s, nr_old_nodes - 1),
+ block_bytes(c)) > blocks)
+ return;
+
+ res = bch_btree_reserve_get(c, parent, nr_old_nodes,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE,
+ NULL);
+ if (IS_ERR(res)) {
+ trace_bcache_btree_gc_coalesce_fail(c,
+ BTREE_GC_COALESCE_FAIL_RESERVE_GET);
+ return;
+ }
+
+ if (bch_keylist_realloc(&keylist, NULL, 0,
+ (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
+ trace_bcache_btree_gc_coalesce_fail(c,
+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
+ goto out;
+ }
+
+ /* Find a format that all keys in @old_nodes can pack into */
+ bch_bkey_format_init(&format_state);
+
+ for (i = 0; i < nr_old_nodes; i++)
+ __bch_btree_calc_format(&format_state, old_nodes[i]);
+
+ new_format = bch_bkey_format_done(&format_state);
+
+ /* Check if repacking would make any nodes too big to fit */
+ for (i = 0; i < nr_old_nodes; i++)
+ if (!bch_btree_node_format_fits(c, old_nodes[i], &new_format)) {
+ trace_bcache_btree_gc_coalesce_fail(c,
+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
+ goto out;
+ }
+
+ trace_bcache_btree_gc_coalesce(c, parent, nr_old_nodes);
+
+ as = bch_btree_interior_update_alloc(c);
+
+ for (i = 0; i < nr_old_nodes; i++)
+ bch_btree_interior_update_will_free_node(c, as, old_nodes[i]);
+
+ /* Repack everything with @new_format and sort down to one bset */
+ for (i = 0; i < nr_old_nodes; i++)
+ new_nodes[i] = __btree_node_alloc_replacement(c, old_nodes[i],
+ new_format, res);
+
+ /*
+ * Conceptually we concatenate the nodes together and slice them
+ * up at different boundaries.
+ */
+ for (i = nr_new_nodes - 1; i > 0; --i) {
+ struct btree *n1 = new_nodes[i];
+ struct btree *n2 = new_nodes[i - 1];
+
+ struct bset *s1 = btree_bset_first(n1);
+ struct bset *s2 = btree_bset_first(n2);
+ struct bkey_packed *k, *last = NULL;
+
+ /* Calculate how many keys from @n2 we could fit inside @n1 */
+ u64s = 0;
+
+ for (k = s2->start;
+ k < bset_bkey_last(s2) &&
+ __set_blocks(n1->data, le16_to_cpu(s1->u64s) + u64s + k->u64s,
+ block_bytes(c)) <= blocks;
+ k = bkey_next(k)) {
+ last = k;
+ u64s += k->u64s;
+ }
+
+ if (u64s == le16_to_cpu(s2->u64s)) {
+ /* n2 fits entirely in n1 */
+ n1->key.k.p = n1->data->max_key = n2->data->max_key;
+
+ memcpy_u64s(bset_bkey_last(s1),
+ s2->start,
+ le16_to_cpu(s2->u64s));
+ le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
+
+ set_btree_bset_end(n1, n1->set);
+
+ six_unlock_write(&n2->lock);
+ bch_btree_node_free_never_inserted(c, n2);
+ six_unlock_intent(&n2->lock);
+
+ memmove(new_nodes + i - 1,
+ new_nodes + i,
+ sizeof(new_nodes[0]) * (nr_new_nodes - i));
+ new_nodes[--nr_new_nodes] = NULL;
+ } else if (u64s) {
+ /* move part of n2 into n1 */
+ n1->key.k.p = n1->data->max_key =
+ bkey_unpack_pos(n1, last);
+
+ n2->data->min_key =
+ btree_type_successor(iter->btree_id,
+ n1->data->max_key);
+
+ memcpy_u64s(bset_bkey_last(s1),
+ s2->start, u64s);
+ le16_add_cpu(&s1->u64s, u64s);
+
+ memmove(s2->start,
+ bset_bkey_idx(s2, u64s),
+ (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
+ s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
+
+ set_btree_bset_end(n1, n1->set);
+ set_btree_bset_end(n2, n2->set);
+ }
+ }
+
+ for (i = 0; i < nr_new_nodes; i++) {
+ struct btree *n = new_nodes[i];
+
+ recalc_packed_keys(n);
+ btree_node_reset_sib_u64s(n);
+
+ bch_btree_build_aux_trees(n);
+ six_unlock_write(&n->lock);
+
+ bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+ }
+
+ /*
+ * The keys for the old nodes get deleted. We don't want to insert keys
+ * that compare equal to the keys for the new nodes we'll also be
+ * inserting - we can't because keys on a keylist must be strictly
+ * greater than the previous keys, and we also don't need to since the
+ * key for the new node will serve the same purpose (overwriting the key
+ * for the old node).
+ */
+ for (i = 0; i < nr_old_nodes; i++) {
+ struct bkey_i delete;
+ unsigned j;
+
+ for (j = 0; j < nr_new_nodes; j++)
+ if (!bkey_cmp(old_nodes[i]->key.k.p,
+ new_nodes[j]->key.k.p))
+ goto next;
+
+ bkey_init(&delete.k);
+ delete.k.p = old_nodes[i]->key.k.p;
+ bch_keylist_add_in_order(&keylist, &delete);
+next:
+ /* labels must be followed by a statement - this is a no-op: */
+ i = i;
+ }
+
+ /*
+ * Keys for the new nodes get inserted: bch_btree_insert_keys() only
+ * does the lookup once and thus expects the keys to be in sorted order
+ * so we have to make sure the new keys are correctly ordered with
+ * respect to the deleted keys added in the previous loop
+ */
+ for (i = 0; i < nr_new_nodes; i++)
+ bch_keylist_add_in_order(&keylist, &new_nodes[i]->key);
+
+ /* Insert the newly coalesced nodes */
+ bch_btree_insert_node(parent, iter, &keylist, res, as);
+
+ BUG_ON(!bch_keylist_empty(&keylist));
+
+ BUG_ON(iter->nodes[old_nodes[0]->level] != old_nodes[0]);
+
+ BUG_ON(!bch_btree_iter_node_replace(iter, new_nodes[0]));
+
+ for (i = 0; i < nr_new_nodes; i++)
+ btree_open_bucket_put(c, new_nodes[i]);
+
+ /* Free the old nodes and update our sliding window */
+ for (i = 0; i < nr_old_nodes; i++) {
+ bch_btree_node_free_inmem(iter, old_nodes[i]);
+ six_unlock_intent(&old_nodes[i]->lock);
+
+ /*
+ * the index update might have triggered a split, in which case
+ * the nodes we coalesced - the new nodes we just created -
+ * might not be sibling nodes anymore - don't add them to the
+ * sliding window (except the first):
+ */
+ if (!i) {
+ old_nodes[i] = new_nodes[i];
+ } else {
+ old_nodes[i] = NULL;
+ if (new_nodes[i])
+ six_unlock_intent(&new_nodes[i]->lock);
+ }
+ }
+out:
+ bch_keylist_free(&keylist, NULL);
+ bch_btree_reserve_put(c, res);
+}
+
+static int bch_coalesce_btree(struct cache_set *c, enum btree_id btree_id)
+{
+ struct btree_iter iter;
+ struct btree *b;
+ unsigned i;
+
+ /* Sliding window of adjacent btree nodes */
+ struct btree *merge[GC_MERGE_NODES];
+ u32 lock_seq[GC_MERGE_NODES];
+
+ /*
+ * XXX: We don't have a good way of positively matching on sibling nodes
+ * that have the same parent - this code works by handling the cases
+ * where they might not have the same parent, and is thus fragile. Ugh.
+ *
+ * Perhaps redo this to use multiple linked iterators?
+ */
+ memset(merge, 0, sizeof(merge));
+
+ __for_each_btree_node(&iter, c, btree_id, POS_MIN, 0, b, U8_MAX) {
+ memmove(merge + 1, merge,
+ sizeof(merge) - sizeof(merge[0]));
+ memmove(lock_seq + 1, lock_seq,
+ sizeof(lock_seq) - sizeof(lock_seq[0]));
+
+ merge[0] = b;
+
+ for (i = 1; i < GC_MERGE_NODES; i++) {
+ if (!merge[i] ||
+ !six_relock_intent(&merge[i]->lock, lock_seq[i]))
+ break;
+
+ if (merge[i]->level != merge[0]->level) {
+ six_unlock_intent(&merge[i]->lock);
+ break;
+ }
+ }
+ memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
+
+ bch_coalesce_nodes(merge, &iter);
+
+ for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
+ lock_seq[i] = merge[i]->lock.state.seq;
+ six_unlock_intent(&merge[i]->lock);
+ }
+
+ lock_seq[0] = merge[0]->lock.state.seq;
+
+ if (test_bit(CACHE_SET_GC_STOPPING, &c->flags)) {
+ bch_btree_iter_unlock(&iter);
+ return -ESHUTDOWN;
+ }
+
+ bch_btree_iter_cond_resched(&iter);
+
+ /*
+ * If the parent node wasn't relocked, it might have been split
+ * and the nodes in our sliding window might not have the same
+ * parent anymore - blow away the sliding window:
+ */
+ if (iter.nodes[iter.level + 1] &&
+ !btree_node_intent_locked(&iter, iter.level + 1))
+ memset(merge + 1, 0,
+ (GC_MERGE_NODES - 1) * sizeof(merge[0]));
+ }
+ return bch_btree_iter_unlock(&iter);
+}
+
+/**
+ * bch_coalesce - coalesce adjacent nodes with low occupancy
+ */
+void bch_coalesce(struct cache_set *c)
+{
+ u64 start_time;
+ enum btree_id id;
+
+ if (btree_gc_coalesce_disabled(c))
+ return;
+
+ if (test_bit(CACHE_SET_GC_FAILURE, &c->flags))
+ return;
+
+ down_read(&c->gc_lock);
+ trace_bcache_gc_coalesce_start(c);
+ start_time = local_clock();
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ int ret = c->btree_roots[id].b
+ ? bch_coalesce_btree(c, id)
+ : 0;
+
+ if (ret) {
+ if (ret != -ESHUTDOWN)
+ bch_err(c, "btree coalescing failed: %d", ret);
+ set_bit(CACHE_SET_GC_FAILURE, &c->flags);
+ return;
+ }
+ }
+
+ bch_time_stats_update(&c->btree_coalesce_time, start_time);
+ trace_bcache_gc_coalesce_end(c);
+ up_read(&c->gc_lock);
+}
+
+static int bch_gc_thread(void *arg)
+{
+ struct cache_set *c = arg;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ unsigned long last = atomic_long_read(&clock->now);
+ unsigned last_kick = atomic_read(&c->kick_gc);
+
+ set_freezable();
+
+ while (1) {
+ unsigned long next = last + c->capacity / 16;
+
+ while (atomic_long_read(&clock->now) < next) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ if (atomic_read(&c->kick_gc) != last_kick) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ bch_io_clock_schedule_timeout(clock, next);
+ try_to_freeze();
+ }
+
+ last = atomic_long_read(&clock->now);
+ last_kick = atomic_read(&c->kick_gc);
+
+ bch_gc(c);
+ bch_coalesce(c);
+
+ debug_check_no_locks_held();
+ }
+
+ return 0;
+}
+
+void bch_gc_thread_stop(struct cache_set *c)
+{
+ set_bit(CACHE_SET_GC_STOPPING, &c->flags);
+
+ if (!IS_ERR_OR_NULL(c->gc_thread))
+ kthread_stop(c->gc_thread);
+}
+
+int bch_gc_thread_start(struct cache_set *c)
+{
+ clear_bit(CACHE_SET_GC_STOPPING, &c->flags);
+
+ c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
+ if (IS_ERR(c->gc_thread))
+ return PTR_ERR(c->gc_thread);
+
+ wake_up_process(c->gc_thread);
+ return 0;
+}
+
+/* Initial GC computes bucket marks during startup */
+
+static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id)
+{
+ struct btree_iter iter;
+ struct btree *b;
+ struct range_checks r;
+
+ btree_node_range_checks_init(&r, 0);
+
+ if (!c->btree_roots[id].b)
+ return;
+
+ /*
+ * We have to hit every btree node before starting journal replay, in
+ * order for the journal seq blacklist machinery to work:
+ */
+ for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
+ btree_node_range_checks(c, b, &r);
+
+ if (btree_node_has_ptrs(b)) {
+ struct btree_node_iter node_iter;
+ struct bkey unpacked;
+ struct bkey_s_c k;
+
+ for_each_btree_node_key_unpack(b, k, &node_iter,
+ btree_node_is_extents(b),
+ &unpacked)
+ btree_mark_key(c, b, k);
+ }
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+
+ bch_btree_iter_unlock(&iter);
+
+ __bch_btree_mark_key(c, BKEY_TYPE_BTREE,
+ bkey_i_to_s_c(&c->btree_roots[id].b->key));
+}
+
+int bch_initial_gc(struct cache_set *c, struct list_head *journal)
+{
+ enum btree_id id;
+
+ if (journal) {
+ for (id = 0; id < BTREE_ID_NR; id++)
+ bch_initial_gc_btree(c, id);
+
+ bch_journal_mark(c, journal);
+ }
+
+ bch_mark_metadata(c);
+
+ gc_pos_set(c, gc_phase(GC_PHASE_DONE));
+ set_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags);
+
+ return 0;
+}
diff --git a/libbcache/btree_gc.h b/libbcache/btree_gc.h
new file mode 100644
index 0000000..91d31c0
--- /dev/null
+++ b/libbcache/btree_gc.h
@@ -0,0 +1,103 @@
+#ifndef _BCACHE_GC_H
+#define _BCACHE_GC_H
+
+#include "btree_types.h"
+
+enum bkey_type;
+
+void bch_coalesce(struct cache_set *);
+void bch_gc(struct cache_set *);
+void bch_gc_thread_stop(struct cache_set *);
+int bch_gc_thread_start(struct cache_set *);
+int bch_initial_gc(struct cache_set *, struct list_head *);
+u8 bch_btree_key_recalc_oldest_gen(struct cache_set *, struct bkey_s_c);
+u8 __bch_btree_mark_key(struct cache_set *, enum bkey_type,
+ struct bkey_s_c);
+
+/*
+ * For concurrent mark and sweep (with other index updates), we define a total
+ * ordering of _all_ references GC walks:
+ *
+ * Note that some references will have the same GC position as others - e.g.
+ * everything within the same btree node; in those cases we're relying on
+ * whatever locking exists for where those references live, i.e. the write lock
+ * on a btree node.
+ *
+ * That locking is also required to ensure GC doesn't pass the updater in
+ * between the updater adding/removing the reference and updating the GC marks;
+ * without that, we would at best double count sometimes.
+ *
+ * That part is important - whenever calling bch_mark_pointers(), a lock _must_
+ * be held that prevents GC from passing the position the updater is at.
+ *
+ * (What about the start of gc, when we're clearing all the marks? GC clears the
+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
+ * position inside its cmpxchg loop, so crap magically works).
+ */
+
+/* Position of (the start of) a gc phase: */
+static inline struct gc_pos gc_phase(enum gc_phase phase)
+{
+ return (struct gc_pos) {
+ .phase = phase,
+ .pos = POS_MIN,
+ .level = 0,
+ };
+}
+
+#define GC_POS_MIN gc_phase(0)
+
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
+{
+ if (l.phase != r.phase)
+ return l.phase < r.phase ? -1 : 1;
+ if (bkey_cmp(l.pos, r.pos))
+ return bkey_cmp(l.pos, r.pos);
+ if (l.level != r.level)
+ return l.level < r.level ? -1 : 1;
+ return 0;
+}
+
+/*
+ * GC position of the pointers within a btree node: note, _not_ for &b->key
+ * itself, that lives in the parent node:
+ */
+static inline struct gc_pos gc_pos_btree_node(struct btree *b)
+{
+ return (struct gc_pos) {
+ .phase = b->btree_id,
+ .pos = b->key.k.p,
+ .level = b->level,
+ };
+}
+
+/*
+ * GC position of the pointer to a btree root: we don't use
+ * gc_pos_pointer_to_btree_node() here to avoid a potential race with
+ * btree_split() increasing the tree depth - the new root will have level > the
+ * old root and thus have a greater gc position than the old root, but that
+ * would be incorrect since once gc has marked the root it's not coming back.
+ */
+static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
+{
+ return (struct gc_pos) {
+ .phase = (int) id,
+ .pos = POS_MAX,
+ .level = U8_MAX,
+ };
+}
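+
+/*
+ * gc positions compare lexicographically on (phase, pos, level) - see
+ * gc_pos_cmp(). For example, GC_POS_MIN compares <= every other position, and
+ * within a given btree every gc_pos_btree_node() compares < the
+ * gc_pos_btree_root() of that btree (POS_MAX/U8_MAX act as sentinels), which
+ * in turn compares < all later phases.
+ */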
+
+static inline bool gc_will_visit(struct cache_set *c, struct gc_pos pos)
+{
+ unsigned seq;
+ bool ret;
+
+ do {
+ seq = read_seqcount_begin(&c->gc_pos_lock);
+ ret = gc_pos_cmp(c->gc_pos, pos) < 0;
+ } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+
+ return ret;
+}
+
+#endif
diff --git a/libbcache/btree_io.c b/libbcache/btree_io.c
new file mode 100644
index 0000000..ff976b5
--- /dev/null
+++ b/libbcache/btree_io.c
@@ -0,0 +1,1674 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+
+#include <trace/events/bcache.h>
+
+static void verify_no_dups(struct btree *b,
+ struct bkey_packed *start,
+ struct bkey_packed *end)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ struct bkey_packed *k;
+
+ for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
+ struct bkey l = bkey_unpack_key(b, k);
+ struct bkey r = bkey_unpack_key(b, bkey_next(k));
+
+ BUG_ON(btree_node_is_extents(b)
+ ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
+ : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
+ //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
+ }
+#endif
+}
+
+static void clear_needs_whiteout(struct bset *i)
+{
+ struct bkey_packed *k;
+
+ for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+ k->needs_whiteout = false;
+}
+
+static void set_needs_whiteout(struct bset *i)
+{
+ struct bkey_packed *k;
+
+ for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+ k->needs_whiteout = true;
+}
+
+static void btree_bounce_free(struct cache_set *c, unsigned order,
+ bool used_mempool, void *p)
+{
+ if (used_mempool)
+ mempool_free(virt_to_page(p), &c->btree_bounce_pool);
+ else
+ free_pages((unsigned long) p, order);
+}
+
+static void *btree_bounce_alloc(struct cache_set *c, unsigned order,
+ bool *used_mempool)
+{
+ void *p;
+
+ BUG_ON(1 << order > btree_pages(c));
+
+ *used_mempool = false;
+ p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
+ if (p)
+ return p;
+
+ *used_mempool = true;
+ return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO));
+}
+
+typedef int (*sort_cmp_fn)(struct btree *,
+ struct bkey_packed *,
+ struct bkey_packed *);
+
+struct sort_iter {
+ struct btree *b;
+ unsigned used;
+
+ struct sort_iter_set {
+ struct bkey_packed *k, *end;
+ } data[MAX_BSETS + 1];
+};
+
+static void sort_iter_init(struct sort_iter *iter, struct btree *b)
+{
+ memset(iter, 0, sizeof(*iter));
+ iter->b = b;
+}
+
+static inline void __sort_iter_sift(struct sort_iter *iter,
+ unsigned from,
+ sort_cmp_fn cmp)
+{
+ unsigned i;
+
+ for (i = from;
+ i + 1 < iter->used &&
+ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
+ i++)
+ swap(iter->data[i], iter->data[i + 1]);
+}
+
+static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ __sort_iter_sift(iter, 0, cmp);
+}
+
+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ unsigned i = iter->used;
+
+ while (i--)
+ __sort_iter_sift(iter, i, cmp);
+}
+
+static void sort_iter_add(struct sort_iter *iter,
+ struct bkey_packed *k,
+ struct bkey_packed *end)
+{
+ BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
+
+ if (k != end)
+ iter->data[iter->used++] = (struct sort_iter_set) { k, end };
+}
+
+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
+{
+ return iter->used ? iter->data->k : NULL;
+}
+
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ iter->data->k = bkey_next(iter->data->k);
+
+ BUG_ON(iter->data->k > iter->data->end);
+
+ if (iter->data->k == iter->data->end)
+ memmove(&iter->data[0],
+ &iter->data[1],
+ sizeof(iter->data[0]) * --iter->used);
+ else
+ sort_iter_sift(iter, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
+ sort_cmp_fn cmp)
+{
+ struct bkey_packed *ret = sort_iter_peek(iter);
+
+ if (ret)
+ sort_iter_advance(iter, cmp);
+
+ return ret;
+}
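+
+/*
+ * Typical usage pattern (see btree_node_sort() below for a real user):
+ *
+ *	sort_iter_init(&iter, b);
+ *	for_each_bset(b, t)
+ *		sort_iter_add(&iter, btree_bkey_first(b, t),
+ *			      btree_bkey_last(b, t));
+ *	sort_iter_sort(&iter, cmp);
+ *	while ((k = sort_iter_next(&iter, cmp)))
+ *		...
+ */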
+
+static inline int sort_key_whiteouts_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ return bkey_cmp_packed(b, l, r);
+}
+
+static unsigned sort_key_whiteouts(struct bkey_packed *dst,
+ struct sort_iter *iter)
+{
+ struct bkey_packed *in, *out = dst;
+
+ sort_iter_sort(iter, sort_key_whiteouts_cmp);
+
+ while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
+ bkey_copy(out, in);
+ out = bkey_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
+
+static inline int sort_extent_whiteouts_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ struct bkey ul = bkey_unpack_key(b, l);
+ struct bkey ur = bkey_unpack_key(b, r);
+
+ return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
+}
+
+static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
+ struct sort_iter *iter)
+{
+ const struct bkey_format *f = &iter->b->format;
+ struct bkey_packed *in, *out = dst;
+ struct bkey_i l, r;
+ bool prev = false, l_packed;
+ u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE);
+ u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET);
+ u64 new_size;
+
+ max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
+
+ sort_iter_sort(iter, sort_extent_whiteouts_cmp);
+
+ while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
+ EBUG_ON(bkeyp_val_u64s(f, in));
+ EBUG_ON(in->type != KEY_TYPE_DISCARD);
+
+ r.k = bkey_unpack_key(iter->b, in);
+
+ if (prev &&
+ bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
+ if (bkey_cmp(l.k.p, r.k.p) >= 0)
+ continue;
+
+ new_size = l_packed
+ ? min(max_packed_size, max_packed_offset -
+ bkey_start_offset(&l.k))
+ : KEY_SIZE_MAX;
+
+ new_size = min(new_size, r.k.p.offset -
+ bkey_start_offset(&l.k));
+
+ BUG_ON(new_size < l.k.size);
+
+ bch_key_resize(&l.k, new_size);
+
+ if (bkey_cmp(l.k.p, r.k.p) >= 0)
+ continue;
+
+ bch_cut_front(l.k.p, &r);
+ }
+
+ if (prev) {
+ if (!bkey_pack(out, &l, f)) {
+ BUG_ON(l_packed);
+ bkey_copy(out, &l);
+ }
+ out = bkey_next(out);
+ }
+
+ l = r;
+ prev = true;
+ l_packed = bkey_packed(in);
+ }
+
+ if (prev) {
+ if (!bkey_pack(out, &l, f)) {
+ BUG_ON(l_packed);
+ bkey_copy(out, &l);
+ }
+ out = bkey_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
+
+static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
+ bool compacting,
+ enum compact_mode mode)
+{
+ unsigned live_u64s = b->nr.bset_u64s[t - b->set];
+ unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
+
+ if (live_u64s == bset_u64s)
+ return 0;
+
+ if (mode == COMPACT_LAZY) {
+ if (live_u64s * 4 < bset_u64s * 3 ||
+ (compacting && bset_unwritten(b, bset(b, t))))
+ return bset_u64s - live_u64s;
+ } else {
+ if (bset_written(b, bset(b, t)))
+ return bset_u64s - live_u64s;
+ }
+
+ return 0;
+}
+
+bool __bch_compact_whiteouts(struct cache_set *c, struct btree *b,
+ enum compact_mode mode)
+{
+ const struct bkey_format *f = &b->format;
+ struct bset_tree *t;
+ struct bkey_packed *whiteouts = NULL;
+ struct bkey_packed *u_start, *u_pos;
+ struct sort_iter sort_iter;
+ unsigned order, whiteout_u64s = 0, u64s;
+ bool used_mempool, compacting = false;
+
+ for_each_bset(b, t)
+ whiteout_u64s += should_compact_bset(b, t,
+ whiteout_u64s != 0, mode);
+
+ if (!whiteout_u64s)
+ return false;
+
+ sort_iter_init(&sort_iter, b);
+
+ whiteout_u64s += b->whiteout_u64s;
+ order = get_order(whiteout_u64s * sizeof(u64));
+
+ whiteouts = btree_bounce_alloc(c, order, &used_mempool);
+ u_start = u_pos = whiteouts;
+
+ memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
+ b->whiteout_u64s);
+ u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
+
+ sort_iter_add(&sort_iter, u_start, u_pos);
+
+ for_each_bset(b, t) {
+ struct bset *i = bset(b, t);
+ struct bkey_packed *k, *n, *out, *start, *end;
+ struct btree_node_entry *src = NULL, *dst = NULL;
+
+ if (t != b->set && bset_unwritten(b, i)) {
+ src = container_of(i, struct btree_node_entry, keys);
+ dst = max(write_block(b),
+ (void *) btree_bkey_last(b, t - 1));
+ }
+
+ if (!should_compact_bset(b, t, compacting, mode)) {
+ if (src != dst) {
+ memmove(dst, src, sizeof(*src) +
+ le16_to_cpu(src->keys.u64s) *
+ sizeof(u64));
+ i = &dst->keys;
+ set_btree_bset(b, t, i);
+ }
+ continue;
+ }
+
+ compacting = true;
+ u_start = u_pos;
+ start = i->start;
+ end = bset_bkey_last(i);
+
+ if (src != dst) {
+ memmove(dst, src, sizeof(*src));
+ i = &dst->keys;
+ set_btree_bset(b, t, i);
+ }
+
+ out = i->start;
+
+ for (k = start; k != end; k = n) {
+ n = bkey_next(k);
+
+ if (bkey_deleted(k) && btree_node_is_extents(b))
+ continue;
+
+ if (bkey_whiteout(k) && !k->needs_whiteout)
+ continue;
+
+ if (bkey_whiteout(k)) {
+ unreserve_whiteout(b, t, k);
+ memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
+ set_bkeyp_val_u64s(f, u_pos, 0);
+ u_pos = bkey_next(u_pos);
+ } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
+ bkey_copy(out, k);
+ out = bkey_next(out);
+ }
+ }
+
+ sort_iter_add(&sort_iter, u_start, u_pos);
+
+ if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
+ i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ set_btree_bset_end(b, t);
+ bch_bset_set_no_aux_tree(b, t);
+ }
+ }
+
+ b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
+
+ BUG_ON((void *) unwritten_whiteouts_start(c, b) <
+ (void *) btree_bkey_last(b, bset_tree_last(b)));
+
+ u64s = btree_node_is_extents(b)
+ ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
+ &sort_iter)
+ : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
+ &sort_iter);
+
+ BUG_ON(u64s > b->whiteout_u64s);
+ BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
+ BUG_ON(u_pos != whiteouts && !u64s);
+
+ if (u64s != b->whiteout_u64s) {
+ void *src = unwritten_whiteouts_start(c, b);
+
+ b->whiteout_u64s = u64s;
+ memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
+ }
+
+ verify_no_dups(b,
+ unwritten_whiteouts_start(c, b),
+ unwritten_whiteouts_end(c, b));
+
+ btree_bounce_free(c, order, used_mempool, whiteouts);
+
+ if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
+ bch_btree_build_aux_trees(b);
+
+ bch_btree_keys_u64s_remaining(c, b);
+ bch_verify_btree_nr_keys(b);
+
+ return true;
+}
+
+static bool bch_drop_whiteouts(struct btree *b)
+{
+ struct bset_tree *t;
+ bool ret = false;
+
+ for_each_bset(b, t) {
+ struct bset *i = bset(b, t);
+ struct bkey_packed *k, *n, *out, *start, *end;
+
+ if (!should_compact_bset(b, t, true, true))
+ continue;
+
+ start = btree_bkey_first(b, t);
+ end = btree_bkey_last(b, t);
+
+ if (bset_unwritten(b, i) &&
+ t != b->set) {
+ struct bset *dst =
+ max_t(struct bset *, write_block(b),
+ (void *) btree_bkey_last(b, t - 1));
+
+ memmove(dst, i, sizeof(struct bset));
+ i = dst;
+ set_btree_bset(b, t, i);
+ }
+
+ out = i->start;
+
+ for (k = start; k != end; k = n) {
+ n = bkey_next(k);
+
+ if (!bkey_whiteout(k)) {
+ bkey_copy(out, k);
+ out = bkey_next(out);
+ }
+ }
+
+ i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ bch_bset_set_no_aux_tree(b, t);
+ ret = true;
+ }
+
+ bch_verify_btree_nr_keys(b);
+
+ return ret;
+}
+
+static inline int sort_keys_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ return bkey_cmp_packed(b, l, r) ?:
+ (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
+ (int) l->needs_whiteout - (int) r->needs_whiteout;
+}
+
+static unsigned sort_keys(struct bkey_packed *dst,
+ struct sort_iter *iter,
+ bool filter_whiteouts)
+{
+ const struct bkey_format *f = &iter->b->format;
+ struct bkey_packed *in, *next, *out = dst;
+
+ sort_iter_sort(iter, sort_keys_cmp);
+
+ while ((in = sort_iter_next(iter, sort_keys_cmp))) {
+ if (bkey_whiteout(in) &&
+ (filter_whiteouts || !in->needs_whiteout))
+ continue;
+
+ if (bkey_whiteout(in) &&
+ (next = sort_iter_peek(iter)) &&
+ !bkey_cmp_packed(iter->b, in, next)) {
+ BUG_ON(in->needs_whiteout &&
+ next->needs_whiteout);
+ /*
+ * XXX racy, called with read lock from write path
+ *
+ * leads to spurious BUG_ON() in bkey_unpack_key() in
+ * debug mode
+ */
+ next->needs_whiteout |= in->needs_whiteout;
+ continue;
+ }
+
+ if (bkey_whiteout(in)) {
+ memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
+ set_bkeyp_val_u64s(f, out, 0);
+ } else {
+ bkey_copy(out, in);
+ }
+ out = bkey_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
+
+static inline int sort_extents_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ return bkey_cmp_packed(b, l, r) ?:
+ (int) bkey_deleted(l) - (int) bkey_deleted(r);
+}
+
+static unsigned sort_extents(struct bkey_packed *dst,
+ struct sort_iter *iter,
+ bool filter_whiteouts)
+{
+ struct bkey_packed *in, *out = dst;
+
+ sort_iter_sort(iter, sort_extents_cmp);
+
+ while ((in = sort_iter_next(iter, sort_extents_cmp))) {
+ if (bkey_deleted(in))
+ continue;
+
+ if (bkey_whiteout(in) &&
+ (filter_whiteouts || !in->needs_whiteout))
+ continue;
+
+ bkey_copy(out, in);
+ out = bkey_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
+
+static void btree_node_sort(struct cache_set *c, struct btree *b,
+ struct btree_iter *iter,
+ unsigned start_idx,
+ unsigned end_idx,
+ bool filter_whiteouts)
+{
+ struct btree_node *out;
+ struct sort_iter sort_iter;
+ struct bset_tree *t;
+ struct bset *start_bset = bset(b, &b->set[start_idx]);
+ bool used_mempool = false;
+ u64 start_time;
+ unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
+ bool sorting_entire_node = start_idx == 0 &&
+ end_idx == b->nsets;
+
+ sort_iter_init(&sort_iter, b);
+
+ for (t = b->set + start_idx;
+ t < b->set + end_idx;
+ t++) {
+ u64s += le16_to_cpu(bset(b, t)->u64s);
+ sort_iter_add(&sort_iter,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ }
+
+ order = sorting_entire_node
+ ? btree_page_order(c)
+ : get_order(__set_bytes(b->data, u64s));
+
+ out = btree_bounce_alloc(c, order, &used_mempool);
+
+ start_time = local_clock();
+
+ if (btree_node_is_extents(b))
+ filter_whiteouts = bset_written(b, start_bset);
+
+ u64s = btree_node_is_extents(b)
+ ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
+ : sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
+
+ out->keys.u64s = cpu_to_le16(u64s);
+
+ BUG_ON((void *) bset_bkey_last(&out->keys) >
+ (void *) out + (PAGE_SIZE << order));
+
+ if (sorting_entire_node)
+ bch_time_stats_update(&c->btree_sort_time, start_time);
+
+ /* Make sure we preserve bset journal_seq: */
+ for (t = b->set + start_idx + 1;
+ t < b->set + end_idx;
+ t++)
+ start_bset->journal_seq =
+ max(start_bset->journal_seq,
+ bset(b, t)->journal_seq);
+
+ if (sorting_entire_node) {
+ unsigned u64s = le16_to_cpu(out->keys.u64s);
+
+ BUG_ON(order != btree_page_order(c));
+
+ /*
+ * Our temporary buffer is the same size as the btree node's
+ * buffer, we can just swap buffers instead of doing a big
+ * memcpy()
+ */
+ *out = *b->data;
+ out->keys.u64s = cpu_to_le16(u64s);
+ swap(out, b->data);
+ set_btree_bset(b, b->set, &b->data->keys);
+ } else {
+ start_bset->u64s = out->keys.u64s;
+ memcpy_u64s(start_bset->start,
+ out->keys.start,
+ le16_to_cpu(out->keys.u64s));
+ }
+
+ for (i = start_idx + 1; i < end_idx; i++)
+ b->nr.bset_u64s[start_idx] +=
+ b->nr.bset_u64s[i];
+
+ b->nsets -= shift;
+
+ for (i = start_idx + 1; i < b->nsets; i++) {
+ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift];
+ b->set[i] = b->set[i + shift];
+ }
+
+ for (i = b->nsets; i < MAX_BSETS; i++)
+ b->nr.bset_u64s[i] = 0;
+
+ set_btree_bset_end(b, &b->set[start_idx]);
+ bch_bset_set_no_aux_tree(b, &b->set[start_idx]);
+
+ btree_bounce_free(c, order, used_mempool, out);
+
+ bch_verify_btree_nr_keys(b);
+}
+
+/* Sort + repack in a new format: */
+static struct btree_nr_keys sort_repack(struct bset *dst,
+ struct btree *src,
+ struct btree_node_iter *src_iter,
+ struct bkey_format *out_f,
+ bool filter_whiteouts)
+{
+ struct bkey_format *in_f = &src->format;
+ struct bkey_packed *in, *out = bset_bkey_last(dst);
+ struct btree_nr_keys nr;
+
+ memset(&nr, 0, sizeof(nr));
+
+ while ((in = bch_btree_node_iter_next_all(src_iter, src))) {
+ if (filter_whiteouts && bkey_whiteout(in))
+ continue;
+
+ if (bch_bkey_transform(out_f, out, bkey_packed(in)
+ ? in_f : &bch_bkey_format_current, in))
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ else
+ bkey_unpack(src, (void *) out, in);
+
+ btree_keys_account_key_add(&nr, 0, out);
+ out = bkey_next(out);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
+
+/* Sort, repack, and merge: */
+static struct btree_nr_keys sort_repack_merge(struct cache_set *c,
+ struct bset *dst,
+ struct btree *src,
+ struct btree_node_iter *iter,
+ struct bkey_format *out_f,
+ bool filter_whiteouts,
+ key_filter_fn filter,
+ key_merge_fn merge)
+{
+ struct bkey_packed *k, *prev = NULL, *out;
+ struct btree_nr_keys nr;
+ BKEY_PADDED(k) tmp;
+
+ memset(&nr, 0, sizeof(nr));
+
+ while ((k = bch_btree_node_iter_next_all(iter, src))) {
+ if (filter_whiteouts && bkey_whiteout(k))
+ continue;
+
+ /*
+ * The filter might modify pointers, so we have to unpack the
+ * key and values to &tmp.k:
+ */
+ bkey_unpack(src, &tmp.k, k);
+
+ if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
+ continue;
+
+ /* prev is always unpacked, for key merging: */
+
+ if (prev &&
+ merge &&
+ merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
+ continue;
+
+ /*
+ * the current key becomes the new prev: advance prev, then
+ * copy the current key - but first pack prev (in place):
+ */
+ if (prev) {
+ bkey_pack(prev, (void *) prev, out_f);
+
+ btree_keys_account_key_add(&nr, 0, prev);
+ prev = bkey_next(prev);
+ } else {
+ prev = bset_bkey_last(dst);
+ }
+
+ bkey_copy(prev, &tmp.k);
+ }
+
+ if (prev) {
+ bkey_pack(prev, (void *) prev, out_f);
+ btree_keys_account_key_add(&nr, 0, prev);
+ out = bkey_next(prev);
+ } else {
+ out = bset_bkey_last(dst);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
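
sort_repack_merge() keeps the most recently emitted key unpacked in prev so that, when the btree ops' key_merge can absorb the next key into it, no extra output slot is consumed. Below is a standalone sketch of that "merge into prev while copying" pattern over simple [start, +len) ranges; struct range and coalesce() are invented names, not part of the patch.

/* Standalone sketch: coalesce adjacent ranges while copying a sorted input,
 * in the spirit of sort_repack_merge()'s "merge into prev" loop. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct range { uint64_t start, len; };

static size_t coalesce(struct range *dst, const struct range *src, size_t nr)
{
	struct range *prev = NULL;
	size_t i;

	for (i = 0; i < nr; i++) {
		/* try to merge into the previously emitted range */
		if (prev && prev->start + prev->len == src[i].start) {
			prev->len += src[i].len;
			continue;
		}

		prev = prev ? prev + 1 : dst;
		*prev = src[i];
	}

	return prev ? (size_t) (prev - dst) + 1 : 0;
}

int main(void)
{
	const struct range in[] = { { 0, 8 }, { 8, 4 }, { 16, 4 }, { 20, 4 } };
	struct range out[4];
	size_t i, n = coalesce(out, in, 4);

	for (i = 0; i < n; i++)
		printf("[%llu,+%llu) ", (unsigned long long) out[i].start,
		       (unsigned long long) out[i].len);
	printf("\n");
	return 0;
}
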
+
+void bch_btree_sort_into(struct cache_set *c,
+ struct btree *dst,
+ struct btree *src)
+{
+ struct btree_nr_keys nr;
+ struct btree_node_iter src_iter;
+ u64 start_time = local_clock();
+
+ BUG_ON(dst->nsets != 1);
+
+ bch_bset_set_no_aux_tree(dst, dst->set);
+
+ bch_btree_node_iter_init_from_start(&src_iter, src,
+ btree_node_is_extents(src));
+
+ if (btree_node_ops(src)->key_normalize ||
+ btree_node_ops(src)->key_merge)
+ nr = sort_repack_merge(c, btree_bset_first(dst),
+ src, &src_iter,
+ &dst->format,
+ true,
+ btree_node_ops(src)->key_normalize,
+ btree_node_ops(src)->key_merge);
+ else
+ nr = sort_repack(btree_bset_first(dst),
+ src, &src_iter,
+ &dst->format,
+ true);
+
+ bch_time_stats_update(&c->btree_sort_time, start_time);
+
+ set_btree_bset_end(dst, dst->set);
+
+ dst->nr.live_u64s += nr.live_u64s;
+ dst->nr.bset_u64s[0] += nr.bset_u64s[0];
+ dst->nr.packed_keys += nr.packed_keys;
+ dst->nr.unpacked_keys += nr.unpacked_keys;
+
+ bch_verify_btree_nr_keys(dst);
+}
+
+#define SORT_CRIT (4096 / sizeof(u64))
+
+/*
+ * We're about to add another bset to the btree node, so if there's currently
+ * too many bsets - sort some of them together:
+ */
+static bool btree_node_compact(struct cache_set *c, struct btree *b,
+ struct btree_iter *iter)
+{
+ unsigned unwritten_idx;
+ bool ret = false;
+
+ for (unwritten_idx = 0;
+ unwritten_idx < b->nsets;
+ unwritten_idx++)
+ if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
+ break;
+
+ if (b->nsets - unwritten_idx > 1) {
+ btree_node_sort(c, b, iter, unwritten_idx,
+ b->nsets, false);
+ ret = true;
+ }
+
+ if (unwritten_idx > 1) {
+ btree_node_sort(c, b, iter, 0, unwritten_idx, false);
+ ret = true;
+ }
+
+ return ret;
+}
+
+void bch_btree_build_aux_trees(struct btree *b)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t)
+ bch_bset_build_aux_tree(b, t,
+ bset_unwritten(b, bset(b, t)) &&
+ t == bset_tree_last(b));
+}
+
+/*
+ * @bch_btree_init_next - initialize a new (unwritten) bset that can then be
+ * inserted into
+ *
+ * Safe to call if there already is an unwritten bset - will only add a new bset
+ * if @b doesn't already have one.
+ *
+ * If compacting required a sort (i.e. invalidated iterators), @iter is
+ * reinitialized via bch_btree_iter_reinit_node().
+ */
+void bch_btree_init_next(struct cache_set *c, struct btree *b,
+ struct btree_iter *iter)
+{
+ struct btree_node_entry *bne;
+ bool did_sort;
+
+ EBUG_ON(!(b->lock.state.seq & 1));
+ EBUG_ON(iter && iter->nodes[b->level] != b);
+
+ did_sort = btree_node_compact(c, b, iter);
+
+ bne = want_new_bset(c, b);
+ if (bne)
+ bch_bset_init_next(b, &bne->keys);
+
+ bch_btree_build_aux_trees(b);
+
+ if (iter && did_sort)
+ bch_btree_iter_reinit_node(iter, b);
+}
+
+/*
+ * We seed the checksum with the entire first pointer (dev, gen and offset),
+ * since for btree nodes we have to store the checksum with the data instead of
+ * the pointer - this helps guard against reading a valid btree node that is not
+ * the node we actually wanted:
+ */
+#define btree_csum_set(_b, _i) \
+({ \
+ void *_data = (void *) (_i) + 8; \
+ void *_end = bset_bkey_last(&(_i)->keys); \
+ \
+ bch_checksum_update(BSET_CSUM_TYPE(&(_i)->keys), \
+ bkey_i_to_extent_c(&(_b)->key)->v._data[0], \
+ _data, \
+ _end - _data) ^ 0xffffffffffffffffULL; \
+})
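
As the comment above explains, the checksum is seeded with the node's first pointer so that a bset which is internally valid but read from the wrong location fails verification. A standalone sketch of the same idea, using FNV-1a seeded with a 64-bit "expected location" value - fnv1a_seeded() is an invented helper, not the checksum the patch actually uses.

/* Standalone sketch: seeding a checksum with the expected location, so the
 * same payload read from the wrong place fails verification. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint64_t fnv1a_seeded(uint64_t seed, const void *data, size_t len)
{
	const unsigned char *p = data;
	uint64_t h = 0xcbf29ce484222325ULL ^ seed;
	size_t i;

	for (i = 0; i < len; i++) {
		h ^= p[i];
		h *= 0x100000001b3ULL;
	}
	return h;
}

int main(void)
{
	const char payload[] = "btree node contents";
	uint64_t loc_a = 0x1000, loc_b = 0x2000;

	uint64_t csum_a = fnv1a_seeded(loc_a, payload, sizeof(payload));
	uint64_t csum_b = fnv1a_seeded(loc_b, payload, sizeof(payload));

	/* identical payloads, different expected locations -> different csums */
	printf("match at loc_a: %d\n",
	       csum_a == fnv1a_seeded(loc_a, payload, sizeof(payload)));
	printf("stale copy at loc_b passes as loc_a: %d\n", csum_a == csum_b);
	return 0;
}
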
+
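+/* Note: also relies on @ca and @i being in scope at the call site: */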
+#define btree_node_error(b, c, ptr, fmt, ...) \
+ cache_set_inconsistent(c, \
+ "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\
+ (b)->btree_id, (b)->level, btree_node_root(c, b) \
+ ? btree_node_root(c, b)->level : -1, \
+ PTR_BUCKET_NR(ca, ptr), (b)->written, \
+ (i)->u64s, ##__VA_ARGS__)
+
+static const char *validate_bset(struct cache_set *c, struct btree *b,
+ struct cache *ca,
+ const struct bch_extent_ptr *ptr,
+ struct bset *i, unsigned sectors,
+ unsigned *whiteout_u64s)
+{
+ struct bkey_packed *k, *prev = NULL;
+ bool seen_non_whiteout = false;
+
+ if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
+ return "unsupported bset version";
+
+ if (b->written + sectors > c->sb.btree_node_size)
+ return "bset past end of btree node";
+
+ if (i != &b->data->keys && !i->u64s)
+ btree_node_error(b, c, ptr, "empty set");
+
+ if (!BSET_SEPARATE_WHITEOUTS(i)) {
+ seen_non_whiteout = true;
+		*whiteout_u64s = 0;
+ }
+
+ for (k = i->start;
+ k != bset_bkey_last(i);) {
+ struct bkey_s_c u;
+ struct bkey tmp;
+ const char *invalid;
+
+ if (!k->u64s) {
+ btree_node_error(b, c, ptr,
+ "KEY_U64s 0: %zu bytes of metadata lost",
+ (void *) bset_bkey_last(i) - (void *) k);
+
+ i->u64s = cpu_to_le16((u64 *) k - i->_data);
+ break;
+ }
+
+ if (bkey_next(k) > bset_bkey_last(i)) {
+ btree_node_error(b, c, ptr,
+ "key extends past end of bset");
+
+ i->u64s = cpu_to_le16((u64 *) k - i->_data);
+ break;
+ }
+
+ if (k->format > KEY_FORMAT_CURRENT) {
+ btree_node_error(b, c, ptr,
+ "invalid bkey format %u", k->format);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_next(k),
+ (u64 *) bset_bkey_last(i) - (u64 *) k);
+ continue;
+ }
+
+ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
+ bch_bkey_swab(btree_node_type(b), &b->format, k);
+
+ u = bkey_disassemble(b, k, &tmp);
+
+ invalid = btree_bkey_invalid(c, b, u);
+ if (invalid) {
+ char buf[160];
+
+ bch_bkey_val_to_text(c, btree_node_type(b),
+ buf, sizeof(buf), u);
+ btree_node_error(b, c, ptr,
+ "invalid bkey %s: %s", buf, invalid);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_next(k),
+ (u64 *) bset_bkey_last(i) - (u64 *) k);
+ continue;
+ }
+
+ /*
+ * with the separate whiteouts thing (used for extents), the
+ * second set of keys actually can have whiteouts too, so we
+ * can't solely go off bkey_whiteout()...
+ */
+
+ if (!seen_non_whiteout &&
+ (!bkey_whiteout(k) ||
+ (prev && bkey_cmp_left_packed_byval(b, prev,
+ bkey_start_pos(u.k)) > 0))) {
+ *whiteout_u64s = k->_data - i->_data;
+ seen_non_whiteout = true;
+ }
+
+ prev = k;
+ k = bkey_next(k);
+ }
+
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ b->written += sectors;
+ return NULL;
+}
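
When validate_bset() hits a key it can't use, it drops just that key by shifting the remainder of the bset down (memmove_u64s_down()) and shrinking i->u64s, then keeps scanning. A standalone sketch of that in-place removal over a u64 array; drop_entry() is an invented name.

/* Standalone sketch: drop one element from a packed u64 array in place,
 * the way validate_bset() drops an invalid key. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static size_t drop_entry(uint64_t *vals, size_t nr, size_t idx)
{
	memmove(&vals[idx], &vals[idx + 1],
		(nr - idx - 1) * sizeof(vals[0]));
	return nr - 1;
}

int main(void)
{
	uint64_t vals[] = { 10, 20, 666, 30, 40 };
	size_t i, nr = 5;

	nr = drop_entry(vals, nr, 2);	/* drop the "invalid" entry */

	for (i = 0; i < nr; i++)
		printf("%llu ", (unsigned long long) vals[i]);
	printf("\n");
	return 0;
}
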
+
+void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
+ struct cache *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ struct btree_node_entry *bne;
+ struct bset *i = &b->data->keys;
+ struct btree_node_iter *iter;
+ struct btree_node *sorted;
+ bool used_mempool;
+ unsigned u64s;
+ const char *err;
+ int ret;
+
+ iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
+ __bch_btree_node_iter_init(iter, btree_node_is_extents(b));
+
+ err = "dynamic fault";
+ if (bch_meta_read_fault("btree"))
+ goto err;
+
+ while (b->written < c->sb.btree_node_size) {
+ unsigned sectors, whiteout_u64s = 0;
+
+ if (!b->written) {
+ i = &b->data->keys;
+
+ err = "unknown checksum type";
+ if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR)
+ goto err;
+
+ /* XXX: retry checksum errors */
+
+ err = "bad checksum";
+ if (le64_to_cpu(b->data->csum) !=
+ btree_csum_set(b, b->data))
+ goto err;
+
+ sectors = __set_blocks(b->data,
+ le16_to_cpu(b->data->keys.u64s),
+ block_bytes(c)) << c->block_bits;
+
+ err = "bad magic";
+ if (le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb))
+ goto err;
+
+ err = "bad btree header";
+ if (!b->data->keys.seq)
+ goto err;
+
+ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
+ bch_bpos_swab(&b->data->min_key);
+ bch_bpos_swab(&b->data->max_key);
+ }
+
+ err = "incorrect max key";
+ if (bkey_cmp(b->data->max_key, b->key.k.p))
+ goto err;
+
+ err = "incorrect level";
+ if (BSET_BTREE_LEVEL(i) != b->level)
+ goto err;
+
+ err = bch_bkey_format_validate(&b->data->format);
+ if (err)
+ goto err;
+
+ set_btree_bset(b, b->set, &b->data->keys);
+
+ btree_node_set_format(b, b->data->format);
+ } else {
+ bne = write_block(b);
+ i = &bne->keys;
+
+ if (i->seq != b->data->keys.seq)
+ break;
+
+ err = "unknown checksum type";
+ if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR)
+ goto err;
+
+ err = "bad checksum";
+ if (le64_to_cpu(bne->csum) !=
+ btree_csum_set(b, bne))
+ goto err;
+
+ sectors = __set_blocks(bne,
+ le16_to_cpu(bne->keys.u64s),
+ block_bytes(c)) << c->block_bits;
+ }
+
+ err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
+ if (err)
+ goto err;
+
+ err = "insufficient memory";
+ ret = bch_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
+ if (ret < 0)
+ goto err;
+
+ if (ret)
+ continue;
+
+ __bch_btree_node_iter_push(iter, b,
+ i->start,
+ bkey_idx(i, whiteout_u64s));
+
+ __bch_btree_node_iter_push(iter, b,
+ bkey_idx(i, whiteout_u64s),
+ bset_bkey_last(i));
+ }
+
+ err = "corrupted btree";
+ for (bne = write_block(b);
+ bset_byte_offset(b, bne) < btree_bytes(c);
+ bne = (void *) bne + block_bytes(c))
+ if (bne->keys.seq == b->data->keys.seq)
+ goto err;
+
+ sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool);
+ sorted->keys.u64s = 0;
+
+ b->nr = btree_node_is_extents(b)
+ ? bch_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
+ : bch_key_sort_fix_overlapping(&sorted->keys, b, iter);
+
+ u64s = le16_to_cpu(sorted->keys.u64s);
+ *sorted = *b->data;
+ sorted->keys.u64s = cpu_to_le16(u64s);
+ swap(sorted, b->data);
+ set_btree_bset(b, b->set, &b->data->keys);
+ b->nsets = 1;
+
+ BUG_ON(b->nr.live_u64s != u64s);
+
+ btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted);
+
+ bch_bset_build_aux_tree(b, b->set, false);
+
+ set_needs_whiteout(btree_bset_first(b));
+
+ btree_node_reset_sib_u64s(b);
+out:
+ mempool_free(iter, &c->fill_iter);
+ return;
+err:
+ set_btree_node_read_error(b);
+ btree_node_error(b, c, ptr, "%s", err);
+ goto out;
+}
+
+static void btree_node_read_endio(struct bio *bio)
+{
+ closure_put(bio->bi_private);
+}
+
+void bch_btree_node_read(struct cache_set *c, struct btree *b)
+{
+ uint64_t start_time = local_clock();
+ struct closure cl;
+ struct bio *bio;
+ struct extent_pick_ptr pick;
+
+ trace_bcache_btree_read(c, b);
+
+ closure_init_stack(&cl);
+
+ pick = bch_btree_pick_ptr(c, b);
+ if (cache_set_fatal_err_on(!pick.ca, c,
+ "no cache device for btree node")) {
+ set_btree_node_read_error(b);
+ return;
+ }
+
+ bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
+ bio->bi_bdev = pick.ca->disk_sb.bdev;
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_end_io = btree_node_read_endio;
+ bio->bi_private = &cl;
+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
+
+ bch_bio_map(bio, b->data);
+
+ closure_get(&cl);
+ bch_generic_make_request(bio, c);
+ closure_sync(&cl);
+
+ if (cache_fatal_io_err_on(bio->bi_error,
+ pick.ca, "IO error reading bucket %zu",
+ PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
+ bch_meta_read_fault("btree")) {
+ set_btree_node_read_error(b);
+ goto out;
+ }
+
+ bch_btree_node_read_done(c, b, pick.ca, &pick.ptr);
+ bch_time_stats_update(&c->btree_read_time, start_time);
+out:
+ bio_put(bio);
+ percpu_ref_put(&pick.ca->ref);
+}
+
+int bch_btree_root_read(struct cache_set *c, enum btree_id id,
+ const struct bkey_i *k, unsigned level)
+{
+ struct closure cl;
+ struct btree *b;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = mca_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret);
+
+ b = mca_alloc(c);
+ mca_cannibalize_unlock(c);
+
+ BUG_ON(IS_ERR(b));
+
+ bkey_copy(&b->key, k);
+ BUG_ON(mca_hash_insert(c, b, level, id));
+
+ bch_btree_node_read(c, b);
+ six_unlock_write(&b->lock);
+
+ if (btree_node_read_error(b)) {
+ six_unlock_intent(&b->lock);
+ return -EIO;
+ }
+
+ bch_btree_set_root_initial(c, b, NULL);
+ six_unlock_intent(&b->lock);
+
+ return 0;
+}
+
+void bch_btree_complete_write(struct cache_set *c, struct btree *b,
+ struct btree_write *w)
+{
+ bch_journal_pin_drop(&c->journal, &w->journal);
+ closure_wake_up(&w->wait);
+}
+
+static void btree_node_write_done(struct cache_set *c, struct btree *b)
+{
+ struct btree_write *w = btree_prev_write(b);
+
+ /*
+ * Before calling bch_btree_complete_write() - if the write errored, we
+ * have to halt new journal writes before they see this btree node
+ * write as completed:
+ */
+ if (btree_node_write_error(b))
+ bch_journal_halt(&c->journal);
+
+ bch_btree_complete_write(c, b, w);
+ btree_node_io_unlock(b);
+}
+
+static void btree_node_write_endio(struct bio *bio)
+{
+ struct btree *b = bio->bi_private;
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct cache_set *c = wbio->c;
+ struct bio *orig = wbio->split ? wbio->orig : NULL;
+ struct closure *cl = !wbio->split ? wbio->cl : NULL;
+ struct cache *ca = wbio->ca;
+
+ if (cache_fatal_io_err_on(bio->bi_error, ca, "btree write") ||
+ bch_meta_write_fault("btree"))
+ set_btree_node_write_error(b);
+
+ if (wbio->bounce)
+ btree_bounce_free(c,
+ wbio->order,
+ wbio->used_mempool,
+ page_address(bio->bi_io_vec[0].bv_page));
+
+ if (wbio->put_bio)
+ bio_put(bio);
+
+ if (orig) {
+ bio_endio(orig);
+ } else {
+ btree_node_write_done(c, b);
+ if (cl)
+ closure_put(cl);
+ }
+
+ if (ca)
+ percpu_ref_put(&ca->ref);
+}
+
+void __bch_btree_node_write(struct cache_set *c, struct btree *b,
+ struct closure *parent,
+ enum six_lock_type lock_type_held,
+ int idx_to_write)
+{
+ struct bio *bio;
+ struct bch_write_bio *wbio;
+ struct bset_tree *t;
+ struct bset *i;
+ struct btree_node *bn = NULL;
+ struct btree_node_entry *bne = NULL;
+ BKEY_PADDED(key) k;
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct cache *ca;
+ struct sort_iter sort_iter;
+ unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
+ u64 seq = 0;
+ bool used_mempool;
+ unsigned long old, new;
+ void *data;
+
+ /*
+ * We may only have a read lock on the btree node - the dirty bit is our
+ * "lock" against racing with other threads that may be trying to start
+ * a write, we do a write iff we clear the dirty bit. Since setting the
+ * dirty bit requires a write lock, we can't race with other threads
+ * redirtying it:
+ */
+ do {
+ old = new = READ_ONCE(b->flags);
+
+ if (!(old & (1 << BTREE_NODE_dirty)))
+ return;
+
+ if (idx_to_write >= 0 &&
+ idx_to_write != !!(old & (1 << BTREE_NODE_write_idx)))
+ return;
+
+ if (old & (1 << BTREE_NODE_write_in_flight)) {
+ wait_on_bit_io(&b->flags,
+ BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+ continue;
+ }
+
+ new &= ~(1 << BTREE_NODE_dirty);
+ new |= (1 << BTREE_NODE_write_in_flight);
+ new |= (1 << BTREE_NODE_just_written);
+ new ^= (1 << BTREE_NODE_write_idx);
+ } while (cmpxchg_acquire(&b->flags, old, new) != old);
+
+ BUG_ON(!list_empty(&b->write_blocked));
+
+ BUG_ON(b->written >= c->sb.btree_node_size);
+ BUG_ON(bset_written(b, btree_bset_last(b)));
+ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb));
+ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
+
+ if (lock_type_held == SIX_LOCK_intent) {
+ six_lock_write(&b->lock);
+ __bch_compact_whiteouts(c, b, COMPACT_WRITTEN);
+ six_unlock_write(&b->lock);
+ } else {
+ __bch_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
+ }
+
+ BUG_ON(b->uncompacted_whiteout_u64s);
+
+ sort_iter_init(&sort_iter, b);
+
+ bytes = !b->written
+ ? sizeof(struct btree_node)
+ : sizeof(struct btree_node_entry);
+
+ bytes += b->whiteout_u64s * sizeof(u64);
+
+ for_each_bset(b, t) {
+ i = bset(b, t);
+
+ if (bset_written(b, i))
+ continue;
+
+ bytes += le16_to_cpu(i->u64s) * sizeof(u64);
+ sort_iter_add(&sort_iter,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ seq = max(seq, le64_to_cpu(i->journal_seq));
+ }
+
+ order = get_order(bytes);
+ data = btree_bounce_alloc(c, order, &used_mempool);
+
+ if (!b->written) {
+ bn = data;
+ *bn = *b->data;
+ i = &bn->keys;
+ } else {
+ bne = data;
+ bne->keys = b->data->keys;
+ i = &bne->keys;
+ }
+
+ i->journal_seq = cpu_to_le64(seq);
+ i->u64s = 0;
+
+ if (!btree_node_is_extents(b)) {
+ sort_iter_add(&sort_iter,
+ unwritten_whiteouts_start(c, b),
+ unwritten_whiteouts_end(c, b));
+ SET_BSET_SEPARATE_WHITEOUTS(i, false);
+ } else {
+ memcpy_u64s(i->start,
+ unwritten_whiteouts_start(c, b),
+ b->whiteout_u64s);
+ i->u64s = cpu_to_le16(b->whiteout_u64s);
+ SET_BSET_SEPARATE_WHITEOUTS(i, true);
+ }
+
+ b->whiteout_u64s = 0;
+
+ u64s = btree_node_is_extents(b)
+ ? sort_extents(bset_bkey_last(i), &sort_iter, false)
+ : sort_keys(i->start, &sort_iter, false);
+ le16_add_cpu(&i->u64s, u64s);
+
+ clear_needs_whiteout(i);
+
+ if (b->written && !i->u64s) {
+ /* Nothing to write: */
+ btree_bounce_free(c, order, used_mempool, data);
+ btree_node_write_done(c, b);
+ return;
+ }
+
+ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
+ BUG_ON(i->seq != b->data->keys.seq);
+
+ i->version = cpu_to_le16(BCACHE_BSET_VERSION);
+ SET_BSET_CSUM_TYPE(i, c->opts.metadata_checksum);
+
+ if (bn)
+ bn->csum = cpu_to_le64(btree_csum_set(b, bn));
+ else
+ bne->csum = cpu_to_le64(btree_csum_set(b, bne));
+
+ bytes_to_write = (void *) bset_bkey_last(i) - data;
+ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+ memset(data + bytes_to_write, 0,
+ (sectors_to_write << 9) - bytes_to_write);
+
+ BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
+
+ trace_bcache_btree_write(b, bytes_to_write, sectors_to_write);
+
+ /*
+ * We handle btree write errors by immediately halting the journal -
+ * after we've done that, we can't issue any subsequent btree writes
+ * because they might have pointers to new nodes that failed to write.
+ *
+ * Furthermore, there's no point in doing any more btree writes because
+ * with the journal stopped, we're never going to update the journal to
+ * reflect that those writes were done and the data flushed from the
+ * journal:
+ *
+ * Make sure to update b->written so bch_btree_init_next() doesn't
+ * break:
+ */
+ if (bch_journal_error(&c->journal)) {
+ set_btree_node_write_error(b);
+ b->written += sectors_to_write;
+
+ btree_bounce_free(c, order, used_mempool, data);
+ btree_node_write_done(c, b);
+ return;
+ }
+
+ bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
+
+ wbio = to_wbio(bio);
+ wbio->cl = parent;
+ wbio->bounce = true;
+ wbio->put_bio = true;
+ wbio->order = order;
+ wbio->used_mempool = used_mempool;
+ bio->bi_iter.bi_size = sectors_to_write << 9;
+ bio->bi_end_io = btree_node_write_endio;
+ bio->bi_private = b;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
+
+ if (parent)
+ closure_get(parent);
+
+ bch_bio_map(bio, data);
+
+ /*
+ * If we're appending to a leaf node, we don't technically need FUA -
+ * this write just needs to be persisted before the next journal write,
+ * which will be marked FLUSH|FUA.
+ *
+ * Similarly if we're writing a new btree root - the pointer is going to
+ * be in the next journal entry.
+ *
+ * But if we're writing a new btree node (that isn't a root) or
+ * appending to a non leaf btree node, we need either FUA or a flush
+ * when we write the parent with the new pointer. FUA is cheaper than a
+ * flush, and writes appending to leaf nodes aren't blocking anything so
+ * just make all btree node writes FUA to keep things sane.
+ */
+
+ bkey_copy(&k.key, &b->key);
+ e = bkey_i_to_s_extent(&k.key);
+
+ extent_for_each_ptr(e, ptr)
+ ptr->offset += b->written;
+
+ rcu_read_lock();
+ extent_for_each_online_device(c, e, ptr, ca)
+ atomic64_add(sectors_to_write, &ca->btree_sectors_written);
+ rcu_read_unlock();
+
+ b->written += sectors_to_write;
+
+ bch_submit_wbio_replicas(wbio, c, &k.key, true);
+}
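
The opening cmpxchg loop of __bch_btree_node_write() claims the write by clearing the dirty bit and setting write_in_flight in a single atomic transition, so only one thread issues the I/O even when callers hold just a read lock. A standalone C11 sketch of that claim pattern follows; unlike the real code it simply gives up instead of waiting when a write is already in flight, and the flag names are illustrative only.

/* Standalone sketch: claim a write by clearing DIRTY and setting
 * WRITE_IN_FLIGHT in one compare-and-exchange. Compile with: cc -std=c11 */
#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

#define NODE_dirty		(1u << 0)
#define NODE_write_in_flight	(1u << 1)

static bool claim_write(_Atomic unsigned *flags)
{
	unsigned old = atomic_load(flags), new;

	do {
		if (!(old & NODE_dirty))
			return false;	/* nothing to write */
		if (old & NODE_write_in_flight)
			return false;	/* someone else is already writing */

		new = (old & ~NODE_dirty) | NODE_write_in_flight;
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return true;
}

int main(void)
{
	_Atomic unsigned flags = NODE_dirty;

	printf("first claim:  %d\n", claim_write(&flags));	/* 1 */
	printf("second claim: %d\n", claim_write(&flags));	/* 0 */
	return 0;
}
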
+
+/*
+ * Work that must be done with write lock held:
+ */
+bool bch_btree_post_write_cleanup(struct cache_set *c, struct btree *b)
+{
+ bool invalidated_iter = false;
+ struct btree_node_entry *bne;
+ struct bset_tree *t;
+
+ if (!btree_node_just_written(b))
+ return false;
+
+ BUG_ON(b->whiteout_u64s);
+ BUG_ON(b->uncompacted_whiteout_u64s);
+
+ clear_btree_node_just_written(b);
+
+ /*
+ * Note: immediately after write, bset_unwritten()/bset_written() don't
+ * work - the amount of data we had to write after compaction might have
+ * been smaller than the offset of the last bset.
+ *
+ * However, we know that all bsets have been written here, as long as
+ * we're still holding the write lock:
+ */
+
+ /*
+ * XXX: decide if we really want to unconditionally sort down to a
+ * single bset:
+ */
+ if (b->nsets > 1) {
+ btree_node_sort(c, b, NULL, 0, b->nsets, true);
+ invalidated_iter = true;
+ } else {
+ invalidated_iter = bch_drop_whiteouts(b);
+ }
+
+ for_each_bset(b, t)
+ set_needs_whiteout(bset(b, t));
+
+ bch_btree_verify(c, b);
+
+ /*
+ * If later we don't unconditionally sort down to a single bset, we have
+ * to ensure this is still true:
+ */
+ BUG_ON((void *) bset_bkey_last(btree_bset_last(b)) > write_block(b));
+
+ bne = want_new_bset(c, b);
+ if (bne)
+ bch_bset_init_next(b, &bne->keys);
+
+ bch_btree_build_aux_trees(b);
+
+ return invalidated_iter;
+}
+
+/*
+ * Use this one if the node is intent locked:
+ */
+void bch_btree_node_write(struct cache_set *c, struct btree *b,
+ struct closure *parent,
+ enum six_lock_type lock_type_held,
+ int idx_to_write)
+{
+ BUG_ON(lock_type_held == SIX_LOCK_write);
+
+ if (lock_type_held == SIX_LOCK_intent ||
+ six_trylock_convert(&b->lock, SIX_LOCK_read,
+ SIX_LOCK_intent)) {
+ __bch_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write);
+
+ six_lock_write(&b->lock);
+ bch_btree_post_write_cleanup(c, b);
+ six_unlock_write(&b->lock);
+
+ if (lock_type_held == SIX_LOCK_read)
+ six_lock_downgrade(&b->lock);
+ } else {
+ __bch_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write);
+ }
+}
+
+static void bch_btree_node_write_dirty(struct cache_set *c, struct btree *b,
+ struct closure *parent)
+{
+ six_lock_read(&b->lock);
+ BUG_ON(b->level);
+
+ bch_btree_node_write(c, b, parent, SIX_LOCK_read, -1);
+ six_unlock_read(&b->lock);
+}
+
+/*
+ * Write all dirty btree nodes to disk, including roots
+ */
+void bch_btree_flush(struct cache_set *c)
+{
+ struct closure cl;
+ struct btree *b;
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ bool dropped_lock;
+ unsigned i;
+
+ closure_init_stack(&cl);
+
+ rcu_read_lock();
+
+ do {
+ dropped_lock = false;
+ i = 0;
+restart:
+ tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
+ &c->btree_cache_table);
+
+ for (; i < tbl->size; i++)
+ rht_for_each_entry_rcu(b, pos, tbl, i, hash)
+ /*
+ * XXX - locking for b->level, when called from
+ * bch_journal_move()
+ */
+ if (!b->level && btree_node_dirty(b)) {
+ rcu_read_unlock();
+ bch_btree_node_write_dirty(c, b, &cl);
+ dropped_lock = true;
+ rcu_read_lock();
+ goto restart;
+ }
+ } while (dropped_lock);
+
+ rcu_read_unlock();
+
+ closure_sync(&cl);
+}
+
+/**
+ * bch_btree_node_flush_journal_entries - flush any journal entries that contain keys
+ * from this node
+ *
+ * The bset's journal sequence number is used for preserving ordering of index
+ * updates across unclean shutdowns - it's used to ignore bsets newer than the
+ * most recent journal entry.
+ *
+ * But when rewriting btree nodes we compact all the bsets in a btree node - and
+ * if we compacted a bset that should have been ignored together with bsets we
+ * do need, that would be bad. So to avoid that, prior to making the new node
+ * visible, ensure that the journal has been flushed so that all the bsets we
+ * compacted are guaranteed to be visible.
+ */
+void bch_btree_node_flush_journal_entries(struct cache_set *c,
+ struct btree *b,
+ struct closure *cl)
+{
+ int i = b->nsets;
+
+ /*
+ * Journal sequence numbers in the different bsets will always be in
+ * ascending order, we only need to flush the highest - except that the
+ * most recent bset might not have a journal sequence number yet, so we
+ * need to loop:
+ */
+ while (i--) {
+ u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq);
+
+ if (seq) {
+ bch_journal_flush_seq_async(&c->journal, seq, cl);
+ break;
+ }
+ }
+}
diff --git a/libbcache/btree_io.h b/libbcache/btree_io.h
new file mode 100644
index 0000000..866cc6c
--- /dev/null
+++ b/libbcache/btree_io.h
@@ -0,0 +1,73 @@
+#ifndef _BCACHE_BTREE_IO_H
+#define _BCACHE_BTREE_IO_H
+
+struct cache_set;
+struct btree_write;
+struct btree;
+struct btree_iter;
+
+static inline void btree_node_io_unlock(struct btree *b)
+{
+ EBUG_ON(!btree_node_write_in_flight(b));
+ clear_btree_node_write_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+static inline void btree_node_io_lock(struct btree *b)
+{
+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+enum compact_mode {
+ COMPACT_LAZY,
+ COMPACT_WRITTEN,
+ COMPACT_WRITTEN_NO_WRITE_LOCK,
+};
+
+bool __bch_compact_whiteouts(struct cache_set *, struct btree *, enum compact_mode);
+
+static inline bool bch_maybe_compact_whiteouts(struct cache_set *c, struct btree *b)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ unsigned live_u64s = b->nr.bset_u64s[t - b->set];
+ unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
+
+ if (live_u64s * 4 < bset_u64s * 3)
+ goto compact;
+ }
+
+ return false;
+compact:
+ return __bch_compact_whiteouts(c, b, COMPACT_LAZY);
+}
+
+void bch_btree_sort_into(struct cache_set *, struct btree *, struct btree *);
+
+void bch_btree_build_aux_trees(struct btree *);
+void bch_btree_init_next(struct cache_set *, struct btree *,
+ struct btree_iter *);
+
+void bch_btree_node_read_done(struct cache_set *, struct btree *,
+ struct cache *, const struct bch_extent_ptr *);
+void bch_btree_node_read(struct cache_set *, struct btree *);
+int bch_btree_root_read(struct cache_set *, enum btree_id,
+ const struct bkey_i *, unsigned);
+
+void bch_btree_complete_write(struct cache_set *, struct btree *,
+ struct btree_write *);
+
+void __bch_btree_node_write(struct cache_set *, struct btree *,
+ struct closure *, enum six_lock_type, int);
+bool bch_btree_post_write_cleanup(struct cache_set *, struct btree *);
+
+void bch_btree_node_write(struct cache_set *, struct btree *,
+ struct closure *, enum six_lock_type, int);
+
+void bch_btree_flush(struct cache_set *);
+void bch_btree_node_flush_journal_entries(struct cache_set *, struct btree *,
+ struct closure *);
+
+#endif /* _BCACHE_BTREE_IO_H */
diff --git a/libbcache/btree_iter.c b/libbcache/btree_iter.c
new file mode 100644
index 0000000..a9859e3
--- /dev/null
+++ b/libbcache/btree_iter.c
@@ -0,0 +1,1150 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+#include "extents.h"
+
+#include <trace/events/bcache.h>
+
+#define BTREE_ITER_NOT_END ((struct btree *) 1)
+
+static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
+{
+ return iter->nodes[l] && iter->nodes[l] != BTREE_ITER_NOT_END;
+}
+
+/* Btree node locking: */
+
+/*
+ * Updates the saved lock sequence number, so that btree_node_relock() will
+ * succeed:
+ */
+void btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+
+ EBUG_ON(iter->nodes[b->level] != b);
+ EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq);
+
+ for_each_linked_btree_node(iter, b, linked)
+ linked->lock_seq[b->level] += 2;
+
+ iter->lock_seq[b->level] += 2;
+
+ six_unlock_write(&b->lock);
+}
+
+void btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+ unsigned readers = 0;
+
+ EBUG_ON(iter->nodes[b->level] != b);
+ EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
+
+ if (six_trylock_write(&b->lock))
+ return;
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->nodes[b->level] == b &&
+ btree_node_read_locked(linked, b->level))
+ readers++;
+
+ if (likely(!readers)) {
+ six_lock_write(&b->lock);
+ } else {
+ /*
+ * Must drop our read locks before calling six_lock_write() -
+ * six_unlock() won't do wakeups until the reader count
+ * goes to 0, and it's safe because we have the node intent
+ * locked:
+ */
+ atomic64_sub(__SIX_VAL(read_lock, readers),
+ &b->lock.state.counter);
+ six_lock_write(&b->lock);
+ atomic64_add(__SIX_VAL(read_lock, readers),
+ &b->lock.state.counter);
+ }
+}
+
+/* versions that allow iter to be null: */
+void __btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+{
+ if (likely(iter))
+ btree_node_unlock_write(b, iter);
+ else
+ six_unlock_write(&b->lock);
+}
+
+void __btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+{
+ if (likely(iter))
+ btree_node_lock_write(b, iter);
+ else
+ six_lock_write(&b->lock);
+}
+
+bool btree_node_relock(struct btree_iter *iter, unsigned level)
+{
+ struct btree_iter *linked;
+ struct btree *b = iter->nodes[level];
+ enum btree_node_locked_type want = btree_lock_want(iter, level);
+ enum btree_node_locked_type have = btree_node_locked_type(iter, level);
+
+ if (want == have)
+ return true;
+
+ if (!is_btree_node(iter, level))
+ return false;
+
+ if (race_fault())
+ return false;
+
+ if (have != BTREE_NODE_UNLOCKED
+ ? six_trylock_convert(&b->lock, have, want)
+ : six_relock_type(&b->lock, want, iter->lock_seq[level]))
+ goto success;
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->nodes[level] == b &&
+ btree_node_locked_type(linked, level) == want &&
+ iter->lock_seq[level] == b->lock.state.seq) {
+ btree_node_unlock(iter, level);
+ six_lock_increment(&b->lock, want);
+ goto success;
+ }
+
+ return false;
+success:
+ mark_btree_node_unlocked(iter, level);
+ mark_btree_node_locked(iter, level, want);
+ return true;
+}
+
+/* Slowpath: */
+bool __bch_btree_node_lock(struct btree *b, struct bpos pos,
+ unsigned level,
+ struct btree_iter *iter,
+ enum six_lock_type type)
+{
+ struct btree_iter *linked;
+
+ /* Can't have children locked before ancestors: */
+ EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked));
+
+ /*
+ * Can't hold any read locks while we block taking an intent lock - see
+ * below for reasoning, and we should have already dropped any read
+ * locks in the current iterator
+ */
+ EBUG_ON(type == SIX_LOCK_intent &&
+ iter->nodes_locked != iter->nodes_intent_locked);
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->nodes[level] == b &&
+ btree_node_locked_type(linked, level) == type) {
+ six_lock_increment(&b->lock, type);
+ return true;
+ }
+
+ /*
+	 * Must lock btree nodes in key order - this case happens when locking
+ * the prev sibling in btree node merging:
+ */
+ if (iter->nodes_locked &&
+ __ffs(iter->nodes_locked) == level &&
+ __btree_iter_cmp(iter->btree_id, pos, iter))
+ return false;
+
+ for_each_linked_btree_iter(iter, linked) {
+ if (!linked->nodes_locked)
+ continue;
+
+ /*
+ * Can't block taking an intent lock if we have _any_ nodes read
+ * locked:
+ *
+ * - Our read lock blocks another thread with an intent lock on
+ * the same node from getting a write lock, and thus from
+ * dropping its intent lock
+ *
+ * - And the other thread may have multiple nodes intent locked:
+ * both the node we want to intent lock, and the node we
+ * already have read locked - deadlock:
+ */
+ if (type == SIX_LOCK_intent &&
+ linked->nodes_locked != linked->nodes_intent_locked) {
+ linked->locks_want = max(linked->locks_want,
+ iter->locks_want);
+ return false;
+ }
+
+ /* We have to lock btree nodes in key order: */
+ if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
+ return false;
+
+ /*
+ * Interior nodes must be locked before their descendants: if
+ * another iterator has possible descendants locked of the node
+ * we're about to lock, it must have the ancestors locked too:
+ */
+ if (linked->btree_id == iter->btree_id &&
+ level > __fls(linked->nodes_locked)) {
+ linked->locks_want = max(linked->locks_want,
+ iter->locks_want);
+ return false;
+ }
+ }
+
+ six_lock_type(&b->lock, type);
+ return true;
+}
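
__bch_btree_node_lock() avoids deadlock by imposing a global order: nodes are locked in key order, and intent locks are never awaited while read locks are held. The classic form of that rule is "always acquire locks in one agreed order"; a standalone pthread sketch follows (lock_pair() and the mutex names are invented for illustration).

/* Standalone sketch: avoid deadlock by always taking locks in one agreed
 * order, analogous to locking btree nodes in key order. Link with -lpthread. */
#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	/* order by address: every caller agrees on the same global order */
	if ((uintptr_t) a > (uintptr_t) b) {
		pthread_mutex_t *t = a;
		a = b;
		b = t;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	pthread_mutex_unlock(b);
}

static void *worker(void *arg)
{
	(void) arg;
	/* passes the locks in the opposite order; lock_pair() reorders them */
	lock_pair(&m2, &m1);
	unlock_pair(&m2, &m1);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	lock_pair(&m1, &m2);
	unlock_pair(&m1, &m2);
	pthread_join(t, NULL);
	printf("no deadlock\n");
	return 0;
}
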
+
+/* Btree iterator locking: */
+
+
+static void btree_iter_drop_extra_locks(struct btree_iter *iter)
+{
+ unsigned l;
+
+ while (iter->nodes_locked &&
+ (l = __fls(iter->nodes_locked)) > iter->locks_want) {
+ if (!btree_node_locked(iter, l))
+ panic("l %u nodes_locked %u\n", l, iter->nodes_locked);
+
+ if (l > iter->level) {
+ btree_node_unlock(iter, l);
+ } else if (btree_node_intent_locked(iter, l)) {
+ six_lock_downgrade(&iter->nodes[l]->lock);
+ iter->nodes_intent_locked ^= 1 << l;
+ }
+ }
+}
+
+bool __bch_btree_iter_set_locks_want(struct btree_iter *iter,
+ unsigned new_locks_want)
+{
+ struct btree_iter *linked;
+ unsigned l;
+
+ /* Drop locks we don't want anymore: */
+ if (new_locks_want < iter->locks_want)
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->locks_want > new_locks_want) {
+ linked->locks_want = max_t(unsigned, 1,
+ new_locks_want);
+ btree_iter_drop_extra_locks(linked);
+ }
+
+ iter->locks_want = new_locks_want;
+ btree_iter_drop_extra_locks(iter);
+
+ for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
+ if (!btree_node_relock(iter, l))
+ goto fail;
+
+ return true;
+fail:
+ /*
+ * Just an optimization: ancestor nodes must be locked before child
+ * nodes, so set locks_want on iterators that might lock ancestors
+ * before us to avoid getting -EINTR later:
+ */
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->btree_id == iter->btree_id &&
+ btree_iter_cmp(linked, iter) <= 0)
+ linked->locks_want = max_t(unsigned, linked->locks_want,
+ new_locks_want);
+ return false;
+}
+
+static int __bch_btree_iter_unlock(struct btree_iter *iter)
+{
+ BUG_ON(iter->error == -EINTR);
+
+ while (iter->nodes_locked)
+ btree_node_unlock(iter, __ffs(iter->nodes_locked));
+
+ return iter->error;
+}
+
+int bch_btree_iter_unlock(struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+
+ for_each_linked_btree_iter(iter, linked)
+ __bch_btree_iter_unlock(linked);
+ return __bch_btree_iter_unlock(iter);
+}
+
+/* Btree iterator: */
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+static void __bch_btree_iter_verify(struct btree_iter *iter,
+ struct btree *b)
+{
+ struct btree_node_iter *node_iter = &iter->node_iters[b->level];
+ struct btree_node_iter tmp = *node_iter;
+ struct bkey_packed *k;
+
+ bch_btree_node_iter_verify(node_iter, b);
+
+ /*
+ * For interior nodes, the iterator will have skipped past
+ * deleted keys:
+ */
+ k = b->level
+ ? bch_btree_node_iter_prev(&tmp, b)
+ : bch_btree_node_iter_prev_all(&tmp, b);
+ if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k,
+ iter->is_extents)) {
+ char buf[100];
+ struct bkey uk = bkey_unpack_key(b, k);
+
+ bch_bkey_to_text(buf, sizeof(buf), &uk);
+ panic("prev key should be before after pos:\n%s\n%llu:%llu\n",
+ buf, iter->pos.inode, iter->pos.offset);
+ }
+
+ k = bch_btree_node_iter_peek_all(node_iter, b);
+ if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k,
+ iter->is_extents)) {
+ char buf[100];
+ struct bkey uk = bkey_unpack_key(b, k);
+
+ bch_bkey_to_text(buf, sizeof(buf), &uk);
+ panic("next key should be before iter pos:\n%llu:%llu\n%s\n",
+ iter->pos.inode, iter->pos.offset, buf);
+ }
+}
+
+void bch_btree_iter_verify(struct btree_iter *iter, struct btree *b)
+{
+ struct btree_iter *linked;
+
+ if (iter->nodes[b->level] == b)
+ __bch_btree_iter_verify(iter, b);
+
+ for_each_linked_btree_node(iter, b, linked)
+ __bch_btree_iter_verify(iter, b);
+}
+
+#endif
+
+static void __bch_btree_node_iter_fix(struct btree_iter *iter,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bset_tree *t,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ const struct bkey_packed *end = btree_bkey_last(b, t);
+ struct btree_node_iter_set *set;
+ unsigned offset = __btree_node_key_to_offset(b, where);
+ int shift = new_u64s - clobber_u64s;
+ unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift;
+
+ btree_node_iter_for_each(node_iter, set)
+ if (set->end == old_end)
+ goto found;
+
+	/* didn't find the bset in the iterator - might have to re-add it: */
+ if (new_u64s &&
+ btree_iter_pos_cmp_packed(b, &iter->pos, where,
+ iter->is_extents))
+ bch_btree_node_iter_push(node_iter, b, where, end);
+ return;
+found:
+ set->end = (int) set->end + shift;
+
+ /* Iterator hasn't gotten to the key that changed yet: */
+ if (set->k < offset)
+ return;
+
+ if (new_u64s &&
+ btree_iter_pos_cmp_packed(b, &iter->pos, where,
+ iter->is_extents)) {
+ set->k = offset;
+ bch_btree_node_iter_sort(node_iter, b);
+ } else if (set->k < offset + clobber_u64s) {
+ set->k = offset + new_u64s;
+ if (set->k == set->end)
+ *set = node_iter->data[--node_iter->used];
+ bch_btree_node_iter_sort(node_iter, b);
+ } else {
+ set->k = (int) set->k + shift;
+ }
+
+ /*
+ * Interior nodes are special because iterators for interior nodes don't
+ * obey the usual invariants regarding the iterator position:
+ *
+ * We may have whiteouts that compare greater than the iterator
+ * position, and logically should be in the iterator, but that we
+ * skipped past to find the first live key greater than the iterator
+ * position. This becomes an issue when we insert a new key that is
+ * greater than the current iterator position, but smaller than the
+ * whiteouts we've already skipped past - this happens in the course of
+ * a btree split.
+ *
+	 * We have to rewind the iterator back to before those whiteouts here,
+	 * else bch_btree_node_iter_prev() is not going to work and who knows what
+ * else would happen. And we have to do it manually, because here we've
+ * already done the insert and the iterator is currently inconsistent:
+ *
+ * We've got multiple competing invariants, here - we have to be careful
+ * about rewinding iterators for interior nodes, because they should
+ * always point to the key for the child node the btree iterator points
+ * to.
+ */
+ if (b->level && new_u64s && !bkey_deleted(where) &&
+ btree_iter_pos_cmp_packed(b, &iter->pos, where,
+ iter->is_extents)) {
+ struct bset_tree *t;
+ struct bkey_packed *k;
+
+ for_each_bset(b, t) {
+ if (bch_bkey_to_bset(b, where) == t)
+ continue;
+
+ k = bkey_prev_all(b, t,
+ bch_btree_node_iter_bset_pos(node_iter, b, t));
+ if (k &&
+ __btree_node_iter_cmp(node_iter, b,
+ k, where) > 0) {
+ struct btree_node_iter_set *set;
+ unsigned offset =
+ __btree_node_key_to_offset(b, bkey_next(k));
+
+ btree_node_iter_for_each(node_iter, set)
+ if (set->k == offset) {
+ set->k = __btree_node_key_to_offset(b, k);
+ bch_btree_node_iter_sort(node_iter, b);
+ goto next_bset;
+ }
+
+ bch_btree_node_iter_push(node_iter, b, k,
+ btree_bkey_last(b, t));
+ }
+next_bset:
+			t = t;	/* dummy statement: a label must be followed by a statement */
+ }
+ }
+}
+
+void bch_btree_node_iter_fix(struct btree_iter *iter,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bset_tree *t,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ struct btree_iter *linked;
+
+ if (node_iter != &iter->node_iters[b->level])
+ __bch_btree_node_iter_fix(iter, b, node_iter, t,
+ where, clobber_u64s, new_u64s);
+
+ if (iter->nodes[b->level] == b)
+ __bch_btree_node_iter_fix(iter, b,
+ &iter->node_iters[b->level], t,
+ where, clobber_u64s, new_u64s);
+
+ for_each_linked_btree_node(iter, b, linked)
+ __bch_btree_node_iter_fix(linked, b,
+ &linked->node_iters[b->level], t,
+ where, clobber_u64s, new_u64s);
+
+ /* interior node iterators are... special... */
+ if (!b->level)
+ bch_btree_iter_verify(iter, b);
+}
+
+/* peek_all() doesn't skip deleted keys */
+static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter)
+{
+ struct btree *b = iter->nodes[iter->level];
+ struct bkey_packed *k =
+ bch_btree_node_iter_peek_all(&iter->node_iters[iter->level], b);
+ struct bkey_s_c ret;
+
+ EBUG_ON(!btree_node_locked(iter, iter->level));
+
+ if (!k)
+ return bkey_s_c_null;
+
+ ret = bkey_disassemble(b, k, &iter->k);
+
+ if (debug_check_bkeys(iter->c))
+ bkey_debugcheck(iter->c, b, ret);
+
+ return ret;
+}
+
+static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter)
+{
+ struct btree *b = iter->nodes[iter->level];
+ struct bkey_packed *k =
+ bch_btree_node_iter_peek(&iter->node_iters[iter->level], b);
+ struct bkey_s_c ret;
+
+ EBUG_ON(!btree_node_locked(iter, iter->level));
+
+ if (!k)
+ return bkey_s_c_null;
+
+ ret = bkey_disassemble(b, k, &iter->k);
+
+ if (debug_check_bkeys(iter->c))
+ bkey_debugcheck(iter->c, b, ret);
+
+ return ret;
+}
+
+static inline void __btree_iter_advance(struct btree_iter *iter)
+{
+ bch_btree_node_iter_advance(&iter->node_iters[iter->level],
+ iter->nodes[iter->level]);
+}
+
+/*
+ * Verify that iterator for parent node points to child node:
+ */
+static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
+{
+ bool parent_locked;
+ struct bkey_packed *k;
+
+ if (!IS_ENABLED(CONFIG_BCACHE_DEBUG) ||
+ !iter->nodes[b->level + 1])
+ return;
+
+ parent_locked = btree_node_locked(iter, b->level + 1);
+
+ if (!btree_node_relock(iter, b->level + 1))
+ return;
+
+ k = bch_btree_node_iter_peek_all(&iter->node_iters[b->level + 1],
+ iter->nodes[b->level + 1]);
+ if (!k ||
+ bkey_deleted(k) ||
+ bkey_cmp_left_packed(iter->nodes[b->level + 1],
+ k, &b->key.k.p)) {
+ char buf[100];
+ struct bkey uk = bkey_unpack_key(b, k);
+
+ bch_bkey_to_text(buf, sizeof(buf), &uk);
+ panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
+ buf, b->key.k.p.inode, b->key.k.p.offset);
+ }
+
+ if (!parent_locked)
+ btree_node_unlock(iter, b->level + 1);
+}
+
+static inline void __btree_iter_init(struct btree_iter *iter,
+ struct btree *b)
+{
+ bch_btree_node_iter_init(&iter->node_iters[b->level], b,
+ iter->pos, iter->is_extents,
+ btree_node_is_extents(b));
+
+ /* Skip to first non whiteout: */
+ if (b->level)
+ bch_btree_node_iter_peek(&iter->node_iters[b->level], b);
+}
+
+static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
+ struct btree *b)
+{
+ return iter->btree_id == b->btree_id &&
+ bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
+ btree_iter_pos_cmp(iter->pos, &b->key.k, iter->is_extents);
+}
+
+static inline void btree_iter_node_set(struct btree_iter *iter,
+ struct btree *b)
+{
+ btree_iter_verify_new_node(iter, b);
+
+ EBUG_ON(!btree_iter_pos_in_node(iter, b));
+ EBUG_ON(b->lock.state.seq & 1);
+
+ iter->lock_seq[b->level] = b->lock.state.seq;
+ iter->nodes[b->level] = b;
+ __btree_iter_init(iter, b);
+}
+
+/*
+ * A btree node is being replaced - update the iterator to point to the new
+ * node:
+ */
+bool bch_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
+{
+ struct btree_iter *linked;
+
+ for_each_linked_btree_iter(iter, linked)
+ if (btree_iter_pos_in_node(linked, b)) {
+ /*
+ * bch_btree_iter_node_drop() has already been called -
+ * the old node we're replacing has already been
+ * unlocked and the pointer invalidated
+ */
+ BUG_ON(btree_node_locked(linked, b->level));
+
+ /*
+ * If @linked wants this node read locked, we don't want
+ * to actually take the read lock now because it's not
+ * legal to hold read locks on other nodes while we take
+ * write locks, so the journal can make forward
+ * progress...
+ *
+ * Instead, btree_iter_node_set() sets things up so
+ * btree_node_relock() will succeed:
+ */
+
+ if (btree_want_intent(linked, b->level)) {
+ six_lock_increment(&b->lock, SIX_LOCK_intent);
+ mark_btree_node_intent_locked(linked, b->level);
+ }
+
+ btree_iter_node_set(linked, b);
+ }
+
+ if (!btree_iter_pos_in_node(iter, b)) {
+ six_unlock_intent(&b->lock);
+ return false;
+ }
+
+ mark_btree_node_intent_locked(iter, b->level);
+ btree_iter_node_set(iter, b);
+ return true;
+}
+
+void bch_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b)
+{
+ struct btree_iter *linked;
+ unsigned level = b->level;
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->nodes[level] == b) {
+ btree_node_unlock(linked, level);
+ linked->nodes[level] = BTREE_ITER_NOT_END;
+ }
+}
+
+void bch_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
+{
+ unsigned level = b->level;
+
+ if (iter->nodes[level] == b) {
+ BUG_ON(b->lock.state.intent_lock != 1);
+ btree_node_unlock(iter, level);
+ iter->nodes[level] = BTREE_ITER_NOT_END;
+ }
+}
+
+/*
+ * A btree node has been modified in such a way as to invalidate iterators - fix
+ * them:
+ */
+void bch_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
+{
+ struct btree_iter *linked;
+
+ for_each_linked_btree_node(iter, b, linked)
+ __btree_iter_init(linked, b);
+ __btree_iter_init(iter, b);
+}
+
+static inline int btree_iter_lock_root(struct btree_iter *iter,
+ unsigned depth_want)
+{
+ struct cache_set *c = iter->c;
+ struct btree *b;
+ enum six_lock_type lock_type;
+ unsigned i;
+
+ EBUG_ON(iter->nodes_locked);
+
+ while (1) {
+ b = READ_ONCE(c->btree_roots[iter->btree_id].b);
+ iter->level = READ_ONCE(b->level);
+
+ if (unlikely(iter->level < depth_want)) {
+ /*
+ * the root is at a lower depth than the depth we want:
+ * got to the end of the btree, or we're walking nodes
+ * greater than some depth and there are no nodes >=
+ * that depth
+ */
+ iter->level = depth_want;
+ iter->nodes[iter->level] = NULL;
+ return 0;
+ }
+
+ lock_type = btree_lock_want(iter, iter->level);
+ if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
+ iter, lock_type)))
+ return -EINTR;
+
+ if (likely(b == c->btree_roots[iter->btree_id].b &&
+ b->level == iter->level &&
+ !race_fault())) {
+ for (i = 0; i < iter->level; i++)
+ iter->nodes[i] = BTREE_ITER_NOT_END;
+ iter->nodes[iter->level] = b;
+
+ mark_btree_node_locked(iter, iter->level, lock_type);
+ btree_iter_node_set(iter, b);
+ return 0;
+
+ }
+
+ six_unlock_type(&b->lock, lock_type);
+ }
+}
+
+static inline int btree_iter_down(struct btree_iter *iter)
+{
+ struct btree *b;
+ struct bkey_s_c k = __btree_iter_peek(iter);
+ unsigned level = iter->level - 1;
+ enum six_lock_type lock_type = btree_lock_want(iter, level);
+ BKEY_PADDED(k) tmp;
+
+ bkey_reassemble(&tmp.k, k);
+
+ b = bch_btree_node_get(iter, &tmp.k, level, lock_type);
+ if (unlikely(IS_ERR(b)))
+ return PTR_ERR(b);
+
+ iter->level = level;
+ mark_btree_node_locked(iter, level, lock_type);
+ btree_iter_node_set(iter, b);
+ return 0;
+}
+
+static void btree_iter_up(struct btree_iter *iter)
+{
+ btree_node_unlock(iter, iter->level++);
+}
+
+int __must_check __bch_btree_iter_traverse(struct btree_iter *);
+
+static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
+{
+ struct cache_set *c = iter->c;
+ struct btree_iter *linked, *sorted_iters, **i;
+retry_all:
+ bch_btree_iter_unlock(iter);
+
+ if (ret != -ENOMEM && ret != -EINTR)
+ goto io_error;
+
+ if (ret == -ENOMEM) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = mca_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret);
+ }
+
+ /*
+ * Linked iters are normally a circular singly linked list - break cycle
+ * while we sort them:
+ */
+ linked = iter->next;
+ iter->next = NULL;
+ sorted_iters = NULL;
+
+ while (linked) {
+ iter = linked;
+ linked = linked->next;
+
+ i = &sorted_iters;
+ while (*i && btree_iter_cmp(iter, *i) > 0)
+ i = &(*i)->next;
+
+ iter->next = *i;
+ *i = iter;
+ }
+
+ /* Make list circular again: */
+ iter = sorted_iters;
+ while (iter->next)
+ iter = iter->next;
+ iter->next = sorted_iters;
+
+ /* Now, redo traversals in correct order: */
+
+ iter = sorted_iters;
+ do {
+retry:
+ ret = __bch_btree_iter_traverse(iter);
+ if (unlikely(ret)) {
+ if (ret == -EINTR)
+ goto retry;
+ goto retry_all;
+ }
+
+ iter = iter->next;
+ } while (iter != sorted_iters);
+
+ ret = btree_iter_linked(iter) ? -EINTR : 0;
+out:
+ mca_cannibalize_unlock(c);
+ return ret;
+io_error:
+ BUG_ON(ret != -EIO);
+
+ iter->error = ret;
+ iter->nodes[iter->level] = NULL;
+ goto out;
+}
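
btree_iter_traverse_error() breaks the circular list of linked iterators, insertion-sorts them by position, and then redoes the traversals in that order so locks are reacquired without deadlocking. A standalone sketch of insertion-sorting a singly linked list; struct node and sort_list() are invented names.

/* Standalone sketch: insertion sort of a singly linked list, as used to
 * order linked iterators before re-traversing them. */
#include <stdio.h>

struct node { int key; struct node *next; };

static struct node *sort_list(struct node *list)
{
	struct node *sorted = NULL;

	while (list) {
		struct node *n = list, **i;

		list = list->next;

		/* find the insertion point in the sorted list */
		for (i = &sorted; *i && (*i)->key < n->key; i = &(*i)->next)
			;

		n->next = *i;
		*i = n;
	}

	return sorted;
}

int main(void)
{
	struct node c = { 1, NULL }, b = { 3, &c }, a = { 2, &b };
	struct node *n;

	for (n = sort_list(&a); n; n = n->next)
		printf("%d ", n->key);
	printf("\n");
	return 0;
}
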
+
+/*
+ * This is the main state machine for walking down the btree - walks down to a
+ * specified depth
+ *
+ * Returns 0 on success, -EIO on error (error reading in a btree node).
+ *
+ * On error, caller (peek_node()/peek_key()) must return NULL; the error is
+ * stashed in the iterator and returned from bch_btree_iter_unlock().
+ */
+int __must_check __bch_btree_iter_traverse(struct btree_iter *iter)
+{
+ unsigned depth_want = iter->level;
+
+ /* make sure we have all the intent locks we need - ugh */
+ if (unlikely(iter->nodes[iter->level] &&
+ iter->level + 1 < iter->locks_want)) {
+ unsigned i;
+
+ for (i = iter->level + 1;
+ i < iter->locks_want && iter->nodes[i];
+ i++)
+ if (!btree_node_relock(iter, i)) {
+ while (iter->nodes[iter->level] &&
+ iter->level + 1 < iter->locks_want)
+ btree_iter_up(iter);
+ break;
+ }
+ }
+
+ /*
+ * If the current node isn't locked, go up until we have a locked node
+ * or run out of nodes:
+ */
+ while (iter->nodes[iter->level] &&
+ !(is_btree_node(iter, iter->level) &&
+ btree_node_relock(iter, iter->level) &&
+ btree_iter_pos_cmp(iter->pos,
+ &iter->nodes[iter->level]->key.k,
+ iter->is_extents)))
+ btree_iter_up(iter);
+
+ /*
+ * If we've got a btree node locked (i.e. we aren't about to relock the
+ * root) - advance its node iterator if necessary:
+ */
+ if (iter->nodes[iter->level]) {
+ struct bkey_s_c k;
+
+ while ((k = __btree_iter_peek_all(iter)).k &&
+ !btree_iter_pos_cmp(iter->pos, k.k, iter->is_extents))
+ __btree_iter_advance(iter);
+ }
+
+ /*
+ * Note: iter->nodes[iter->level] may be temporarily NULL here - that
+ * would indicate to other code that we got to the end of the btree,
+ * here it indicates that relocking the root failed - it's critical that
+ * btree_iter_lock_root() comes next and that it can't fail
+ */
+ while (iter->level > depth_want) {
+ int ret = iter->nodes[iter->level]
+ ? btree_iter_down(iter)
+ : btree_iter_lock_root(iter, depth_want);
+ if (unlikely(ret)) {
+ iter->level = depth_want;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int __must_check bch_btree_iter_traverse(struct btree_iter *iter)
+{
+ int ret;
+
+ if (unlikely(!iter->nodes[iter->level]))
+ return 0;
+
+ iter->at_end_of_leaf = false;
+
+ ret = __bch_btree_iter_traverse(iter);
+ if (unlikely(ret))
+ ret = btree_iter_traverse_error(iter, ret);
+
+ return ret;
+}
+
+/* Iterate across nodes (leaf and interior nodes) */
+
+struct btree *bch_btree_iter_peek_node(struct btree_iter *iter)
+{
+ struct btree *b;
+ int ret;
+
+ EBUG_ON(iter->is_extents);
+
+ ret = bch_btree_iter_traverse(iter);
+ if (ret)
+ return NULL;
+
+ b = iter->nodes[iter->level];
+
+ if (b) {
+ EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
+ iter->pos = b->key.k.p;
+ }
+
+ return b;
+}
+
+struct btree *bch_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
+{
+ struct btree *b;
+ int ret;
+
+ EBUG_ON(iter->is_extents);
+
+ btree_iter_up(iter);
+
+ if (!iter->nodes[iter->level])
+ return NULL;
+
+ /* parent node usually won't be locked: redo traversal if necessary */
+ ret = bch_btree_iter_traverse(iter);
+ if (ret)
+ return NULL;
+
+ b = iter->nodes[iter->level];
+ if (!b)
+ return b;
+
+ if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
+ /* Haven't gotten to the end of the parent node: */
+
+ /* ick: */
+ iter->pos = iter->btree_id == BTREE_ID_INODES
+ ? btree_type_successor(iter->btree_id, iter->pos)
+ : bkey_successor(iter->pos);
+ iter->level = depth;
+
+ ret = bch_btree_iter_traverse(iter);
+ if (ret)
+ return NULL;
+
+ b = iter->nodes[iter->level];
+ }
+
+ iter->pos = b->key.k.p;
+
+ return b;
+}
+
+/* Iterate across keys (in leaf nodes only) */
+
+void bch_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
+{
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct bkey_packed *k;
+
+ EBUG_ON(iter->level != 0);
+ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
+ EBUG_ON(!btree_node_locked(iter, 0));
+ EBUG_ON(bkey_cmp(new_pos, b->key.k.p) > 0);
+
+ while ((k = bch_btree_node_iter_peek_all(node_iter, b)) &&
+ !btree_iter_pos_cmp_packed(b, &new_pos, k,
+ iter->is_extents))
+ bch_btree_node_iter_advance(node_iter, b);
+
+ if (!k &&
+ !btree_iter_pos_cmp(new_pos, &b->key.k, iter->is_extents))
+ iter->at_end_of_leaf = true;
+
+ iter->pos = new_pos;
+}
+
+void bch_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */
+ iter->pos = new_pos;
+}
+
+void bch_btree_iter_advance_pos(struct btree_iter *iter)
+{
+ /*
+ * We use iter->k instead of iter->pos for extents: iter->pos will be
+ * equal to the start of the extent we returned, but we need to advance
+ * to the end of the extent we returned.
+ */
+ bch_btree_iter_set_pos(iter,
+ btree_type_successor(iter->btree_id, iter->k.p));
+}
+
+/* XXX: expensive */
+void bch_btree_iter_rewind(struct btree_iter *iter, struct bpos pos)
+{
+ /* incapable of rewinding across nodes: */
+ BUG_ON(bkey_cmp(pos, iter->nodes[iter->level]->data->min_key) < 0);
+
+ iter->pos = pos;
+ __btree_iter_init(iter, iter->nodes[iter->level]);
+}
+
+struct bkey_s_c bch_btree_iter_peek(struct btree_iter *iter)
+{
+ struct bkey_s_c k;
+ int ret;
+
+ while (1) {
+ ret = bch_btree_iter_traverse(iter);
+ if (unlikely(ret)) {
+ iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
+ return bkey_s_c_err(ret);
+ }
+
+ k = __btree_iter_peek(iter);
+ if (likely(k.k)) {
+ /*
+ * iter->pos should always be equal to the key we just
+ * returned - except extents can straddle iter->pos:
+ */
+ if (!iter->is_extents ||
+ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+ bch_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+ return k;
+ }
+
+ iter->pos = iter->nodes[0]->key.k.p;
+
+ if (!bkey_cmp(iter->pos, POS_MAX)) {
+ iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
+ bch_btree_iter_unlock(iter);
+ return bkey_s_c_null;
+ }
+
+ iter->pos = btree_type_successor(iter->btree_id, iter->pos);
+ }
+}
+
+struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *iter)
+{
+ struct bkey_s_c k;
+ struct bkey n;
+ int ret;
+
+ while (1) {
+ ret = bch_btree_iter_traverse(iter);
+ if (unlikely(ret)) {
+ iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
+ return bkey_s_c_err(ret);
+ }
+
+ k = __btree_iter_peek_all(iter);
+recheck:
+ if (!k.k || bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) {
+ /* hole */
+ bkey_init(&n);
+ n.p = iter->pos;
+
+ if (iter->is_extents) {
+ if (n.p.offset == KEY_OFFSET_MAX) {
+ iter->pos = bkey_successor(iter->pos);
+ goto recheck;
+ }
+
+ if (!k.k)
+ k.k = &iter->nodes[0]->key.k;
+
+ bch_key_resize(&n,
+ min_t(u64, KEY_SIZE_MAX,
+ (k.k->p.inode == n.p.inode
+ ? bkey_start_offset(k.k)
+ : KEY_OFFSET_MAX) -
+ n.p.offset));
+
+ EBUG_ON(!n.size);
+ }
+
+ iter->k = n;
+ return (struct bkey_s_c) { &iter->k, NULL };
+ } else if (!bkey_deleted(k.k)) {
+ return k;
+ } else {
+ __btree_iter_advance(iter);
+ }
+ }
+}
+
+void __bch_btree_iter_init(struct btree_iter *iter, struct cache_set *c,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned locks_want, unsigned depth)
+{
+ iter->level = depth;
+ /* bch_bkey_ops isn't used much, this would be a cache miss */
+ /* iter->is_extents = bch_bkey_ops[btree_id]->is_extents; */
+ iter->is_extents = btree_id == BTREE_ID_EXTENTS;
+ iter->nodes_locked = 0;
+ iter->nodes_intent_locked = 0;
+ iter->locks_want = min(locks_want, BTREE_MAX_DEPTH);
+ iter->btree_id = btree_id;
+ iter->at_end_of_leaf = 0;
+ iter->error = 0;
+ iter->c = c;
+ iter->pos = pos;
+ memset(iter->nodes, 0, sizeof(iter->nodes));
+ iter->nodes[iter->level] = BTREE_ITER_NOT_END;
+ iter->next = iter;
+
+ prefetch(c->btree_roots[btree_id].b);
+}
+
+void bch_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
+{
+ BUG_ON(btree_iter_linked(new));
+
+ new->next = iter->next;
+ iter->next = new;
+
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ unsigned nr_iters = 1;
+
+ for_each_linked_btree_iter(iter, new)
+ nr_iters++;
+
+ BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE);
+ }
+}
+
+void bch_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
+{
+ bch_btree_iter_unlock(dst);
+ memcpy(dst, src, offsetof(struct btree_iter, next));
+ dst->nodes_locked = dst->nodes_intent_locked = 0;
+}
diff --git a/libbcache/btree_iter.h b/libbcache/btree_iter.h
new file mode 100644
index 0000000..9835334
--- /dev/null
+++ b/libbcache/btree_iter.h
@@ -0,0 +1,282 @@
+#ifndef _BCACHE_BTREE_ITER_H
+#define _BCACHE_BTREE_ITER_H
+
+#include "btree_types.h"
+
+struct btree_iter {
+ /* Current btree depth */
+ u8 level;
+
+ /*
+ * Used in bch_btree_iter_traverse(), to indicate whether we're
+ * searching for @pos or the first key strictly greater than @pos
+ */
+ u8 is_extents;
+
+ /* Bitmasks for read/intent locks held per level */
+ u8 nodes_locked;
+ u8 nodes_intent_locked;
+
+ /* Btree level below which we start taking intent locks */
+ u8 locks_want;
+
+ enum btree_id btree_id:8;
+
+ /*
+ * indicates we need to call bch_btree_iter_traverse() to revalidate
+ * iterator:
+ */
+ u8 at_end_of_leaf;
+
+ s8 error;
+
+ struct cache_set *c;
+
+ /* Current position of the iterator */
+ struct bpos pos;
+
+ u32 lock_seq[BTREE_MAX_DEPTH];
+
+ /*
+ * NOTE: Never set iter->nodes to NULL except in btree_iter_lock_root().
+ *
+ * This is because iter->nodes[iter->level] == NULL is how
+ * btree_iter_next_node() knows that it's finished with a depth first
+ * traversal. Just unlocking a node (with btree_node_unlock()) is fine,
+ * and if you really don't want that node used again (e.g. btree_split()
+ * freed it) decrementing lock_seq will cause btree_node_relock() to
+ * always fail (but since freeing a btree node takes a write lock on the
+ * node, which increments the node's lock seq, that's not actually
+ * necessary in that example).
+ *
+ * One extra slot for a sentinel NULL:
+ */
+ struct btree *nodes[BTREE_MAX_DEPTH + 1];
+ struct btree_node_iter node_iters[BTREE_MAX_DEPTH];
+
+ /*
+ * Current unpacked key - so that bch_btree_iter_next()/
+ * bch_btree_iter_next_with_holes() can correctly advance pos.
+ */
+ struct bkey k;
+
+ /*
+ * Circular linked list of linked iterators: linked iterators share
+ * locks (e.g. two linked iterators may have the same node intent
+ * locked, or read and write locked, at the same time), and insertions
+ * through one iterator won't invalidate the other linked iterators.
+ */
+
+ /* Must come last: */
+ struct btree_iter *next;
+};
+
+static inline bool btree_iter_linked(const struct btree_iter *iter)
+{
+ return iter->next != iter;
+}
+
+/**
+ * for_each_linked_btree_iter - iterate over all iterators linked with @_iter
+ */
+#define for_each_linked_btree_iter(_iter, _linked) \
+ for ((_linked) = (_iter)->next; \
+ (_linked) != (_iter); \
+ (_linked) = (_linked)->next)
+
+static inline struct btree_iter *
+__next_linked_btree_node(struct btree_iter *iter, struct btree *b,
+ struct btree_iter *linked)
+{
+ do {
+ linked = linked->next;
+
+ if (linked == iter)
+ return NULL;
+
+ /*
+ * We don't compare the low bits of the lock sequence numbers
+ * because @iter might have taken a write lock on @b, and we
+ * don't want to skip the linked iterator if the sequence
+ * numbers were equal before taking that write lock. The lock
+ * sequence number is incremented by taking and releasing write
+ * locks and is even when unlocked:
+ */
+ } while (linked->nodes[b->level] != b ||
+ linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1);
+
+ return linked;
+}
+
+/**
+ * for_each_linked_btree_node - iterate over all iterators linked with @_iter
+ * that also point to @_b
+ *
+ * @_b is assumed to be locked by @_iter
+ *
+ * Filters out iterators that don't have a valid btree_node iterator for @_b -
+ * i.e. iterators for which btree_node_relock() would not succeed.
+ */
+#define for_each_linked_btree_node(_iter, _b, _linked) \
+ for ((_linked) = (_iter); \
+ ((_linked) = __next_linked_btree_node(_iter, _b, _linked));)
+
+#ifdef CONFIG_BCACHE_DEBUG
+void bch_btree_iter_verify(struct btree_iter *, struct btree *);
+#else
+static inline void bch_btree_iter_verify(struct btree_iter *iter,
+ struct btree *b) {}
+#endif
+
+void bch_btree_node_iter_fix(struct btree_iter *, struct btree *,
+ struct btree_node_iter *, struct bset_tree *,
+ struct bkey_packed *, unsigned, unsigned);
+
+int bch_btree_iter_unlock(struct btree_iter *);
+bool __bch_btree_iter_set_locks_want(struct btree_iter *, unsigned);
+
+static inline bool bch_btree_iter_set_locks_want(struct btree_iter *iter,
+ unsigned new_locks_want)
+{
+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+ if (iter->locks_want == new_locks_want &&
+ iter->nodes_intent_locked == (1 << new_locks_want) - 1)
+ return true;
+
+ return __bch_btree_iter_set_locks_want(iter, new_locks_want);
+}
+
+bool bch_btree_iter_node_replace(struct btree_iter *, struct btree *);
+void bch_btree_iter_node_drop_linked(struct btree_iter *, struct btree *);
+void bch_btree_iter_node_drop(struct btree_iter *, struct btree *);
+
+void bch_btree_iter_reinit_node(struct btree_iter *, struct btree *);
+
+int __must_check bch_btree_iter_traverse(struct btree_iter *);
+
+struct btree *bch_btree_iter_peek_node(struct btree_iter *);
+struct btree *bch_btree_iter_next_node(struct btree_iter *, unsigned);
+
+struct bkey_s_c bch_btree_iter_peek(struct btree_iter *);
+struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *);
+void bch_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
+void bch_btree_iter_set_pos(struct btree_iter *, struct bpos);
+void bch_btree_iter_advance_pos(struct btree_iter *);
+void bch_btree_iter_rewind(struct btree_iter *, struct bpos);
+
+void __bch_btree_iter_init(struct btree_iter *, struct cache_set *,
+ enum btree_id, struct bpos, unsigned, unsigned);
+
+static inline void bch_btree_iter_init(struct btree_iter *iter,
+ struct cache_set *c,
+ enum btree_id btree_id,
+ struct bpos pos)
+{
+ __bch_btree_iter_init(iter, c, btree_id, pos, 0, 0);
+}
+
+static inline void bch_btree_iter_init_intent(struct btree_iter *iter,
+ struct cache_set *c,
+ enum btree_id btree_id,
+ struct bpos pos)
+{
+ __bch_btree_iter_init(iter, c, btree_id, pos, 1, 0);
+}
+
+void bch_btree_iter_link(struct btree_iter *, struct btree_iter *);
+void bch_btree_iter_copy(struct btree_iter *, struct btree_iter *);
+
+static inline struct bpos btree_type_successor(enum btree_id id,
+ struct bpos pos)
+{
+ if (id == BTREE_ID_INODES) {
+ pos.inode++;
+ pos.offset = 0;
+ } else if (id != BTREE_ID_EXTENTS) {
+ pos = bkey_successor(pos);
+ }
+
+ return pos;
+}
+
+static inline int __btree_iter_cmp(enum btree_id id,
+ struct bpos pos,
+ const struct btree_iter *r)
+{
+ if (id != r->btree_id)
+ return id < r->btree_id ? -1 : 1;
+ return bkey_cmp(pos, r->pos);
+}
+
+static inline int btree_iter_cmp(const struct btree_iter *l,
+ const struct btree_iter *r)
+{
+ return __btree_iter_cmp(l->btree_id, l->pos, r);
+}
+
+#define __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, \
+ _b, _locks_want) \
+ for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \
+ _start, _locks_want, _depth), \
+ (_iter)->is_extents = false, \
+ _b = bch_btree_iter_peek_node(_iter); \
+ (_b); \
+ (_b) = bch_btree_iter_next_node(_iter, _depth))
+
+#define for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b) \
+ __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b, 0)
+
+#define __for_each_btree_key(_iter, _c, _btree_id, _start, \
+ _k, _locks_want) \
+ for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \
+ _start, _locks_want, 0); \
+ !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek(_iter)).k); \
+ bch_btree_iter_advance_pos(_iter))
+
+#define for_each_btree_key(_iter, _c, _btree_id, _start, _k) \
+ __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 0)
+
+#define for_each_btree_key_intent(_iter, _c, _btree_id, _start, _k) \
+ __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 1)
+
+#define __for_each_btree_key_with_holes(_iter, _c, _btree_id, \
+ _start, _k, _locks_want) \
+ for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \
+ _start, _locks_want, 0); \
+ !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek_with_holes(_iter)).k);\
+ bch_btree_iter_advance_pos(_iter))
+
+#define for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k) \
+ __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 0)
+
+#define for_each_btree_key_with_holes_intent(_iter, _c, _btree_id, \
+ _start, _k) \
+ __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 1)
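+
+/*
+ * Example usage (sketch): walk every key in a btree, then drop the locks:
+ *
+ *      struct btree_iter iter;
+ *      struct bkey_s_c k;
+ *
+ *      for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) {
+ *              ...
+ *      }
+ *      bch_btree_iter_unlock(&iter);
+ */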
+
+static inline int btree_iter_err(struct bkey_s_c k)
+{
+ return IS_ERR(k.k) ? PTR_ERR(k.k) : 0;
+}
+
+/*
+ * Unlocks before scheduling
+ * Note: does not revalidate iterator
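+ * (callers must re-traverse, e.g. via the next bch_btree_iter_peek(), before
+ * using it again)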
+ */
+static inline void bch_btree_iter_cond_resched(struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+
+ if (need_resched()) {
+ for_each_linked_btree_iter(iter, linked)
+ bch_btree_iter_unlock(linked);
+ bch_btree_iter_unlock(iter);
+ schedule();
+ } else if (race_fault()) {
+ for_each_linked_btree_iter(iter, linked)
+ bch_btree_iter_unlock(linked);
+ bch_btree_iter_unlock(iter);
+ }
+}
+
+#endif /* _BCACHE_BTREE_ITER_H */
diff --git a/libbcache/btree_locking.h b/libbcache/btree_locking.h
new file mode 100644
index 0000000..76f85c0
--- /dev/null
+++ b/libbcache/btree_locking.h
@@ -0,0 +1,119 @@
+#ifndef _BCACHE_BTREE_LOCKING_H
+#define _BCACHE_BTREE_LOCKING_H
+
+/*
+ * Only for internal btree use:
+ *
+ * The btree iterator tracks what locks it wants to take, and what locks it
+ * currently has - here we have wrappers for locking/unlocking btree nodes and
+ * updating the iterator state
+ */
+
+#include "btree_iter.h"
+#include "six.h"
+
+/* matches six lock types */
+enum btree_node_locked_type {
+ BTREE_NODE_UNLOCKED = -1,
+ BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
+ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
+};
+
+static inline int btree_node_locked_type(struct btree_iter *iter,
+ unsigned level)
+{
+ /*
+ * We're relying on the fact that if nodes_intent_locked is set
+ * nodes_locked must be set as well, so that we can compute without
+ * branches:
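+ *
+ * (e.g. unlocked: both bits clear, -1 + 0 + 0 == BTREE_NODE_UNLOCKED;
+ * read locked: only the nodes_locked bit is set, -1 + 1 + 0 ==
+ * BTREE_NODE_READ_LOCKED; intent locked: both bits are set,
+ * -1 + 1 + 1 == BTREE_NODE_INTENT_LOCKED)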
+ */
+ return BTREE_NODE_UNLOCKED +
+ ((iter->nodes_locked >> level) & 1) +
+ ((iter->nodes_intent_locked >> level) & 1);
+}
+
+static inline bool btree_node_intent_locked(struct btree_iter *iter,
+ unsigned level)
+{
+ return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
+}
+
+static inline bool btree_node_read_locked(struct btree_iter *iter,
+ unsigned level)
+{
+ return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
+}
+
+static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
+{
+ return iter->nodes_locked & (1 << level);
+}
+
+static inline void mark_btree_node_unlocked(struct btree_iter *iter,
+ unsigned level)
+{
+ iter->nodes_locked &= ~(1 << level);
+ iter->nodes_intent_locked &= ~(1 << level);
+}
+
+static inline void mark_btree_node_locked(struct btree_iter *iter,
+ unsigned level,
+ enum six_lock_type type)
+{
+ /* relying on this to avoid a branch */
+ BUILD_BUG_ON(SIX_LOCK_read != 0);
+ BUILD_BUG_ON(SIX_LOCK_intent != 1);
+
+ iter->nodes_locked |= 1 << level;
+ iter->nodes_intent_locked |= type << level;
+}
+
+static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
+ unsigned level)
+{
+ mark_btree_node_locked(iter, level, SIX_LOCK_intent);
+}
+
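+/*
+ * Lock type we want at a given level: levels below locks_want get intent
+ * locks, everything else read locks (e.g. with locks_want == 1 only the
+ * leaf, level 0, is intent locked):
+ */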
+static inline enum six_lock_type
+btree_lock_want(struct btree_iter *iter, int level)
+{
+ return level < iter->locks_want
+ ? SIX_LOCK_intent
+ : SIX_LOCK_read;
+}
+
+static inline bool btree_want_intent(struct btree_iter *iter, int level)
+{
+ return btree_lock_want(iter, level) == SIX_LOCK_intent;
+}
+
+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
+{
+ int lock_type = btree_node_locked_type(iter, level);
+
+ if (lock_type != BTREE_NODE_UNLOCKED)
+ six_unlock_type(&iter->nodes[level]->lock, lock_type);
+ mark_btree_node_unlocked(iter, level);
+}
+
+bool __bch_btree_node_lock(struct btree *, struct bpos, unsigned,
+ struct btree_iter *, enum six_lock_type);
+
+static inline bool btree_node_lock(struct btree *b, struct bpos pos,
+ unsigned level,
+ struct btree_iter *iter,
+ enum six_lock_type type)
+{
+ return likely(six_trylock_type(&b->lock, type)) ||
+ __bch_btree_node_lock(b, pos, level, iter, type);
+}
+
+bool btree_node_relock(struct btree_iter *, unsigned);
+
+void btree_node_unlock_write(struct btree *, struct btree_iter *);
+void btree_node_lock_write(struct btree *, struct btree_iter *);
+
+void __btree_node_unlock_write(struct btree *, struct btree_iter *);
+void __btree_node_lock_write(struct btree *, struct btree_iter *);
+
+#endif /* _BCACHE_BTREE_LOCKING_H */
diff --git a/libbcache/btree_types.h b/libbcache/btree_types.h
new file mode 100644
index 0000000..3632a04
--- /dev/null
+++ b/libbcache/btree_types.h
@@ -0,0 +1,322 @@
+#ifndef _BCACHE_BTREE_TYPES_H
+#define _BCACHE_BTREE_TYPES_H
+
+#include <linux/bcache.h>
+#include <linux/list.h>
+#include <linux/rhashtable.h>
+#include <linux/semaphore.h>
+#include <linux/workqueue.h>
+
+#include "bkey_methods.h"
+#include "journal_types.h"
+#include "six.h"
+
+struct cache_set;
+struct open_bucket;
+struct btree_interior_update;
+
+#define MAX_BSETS 3U
+
+struct btree_nr_keys {
+
+ /*
+ * Amount of live metadata (i.e. size of node after a compaction) in
+ * units of u64s
+ */
+ u16 live_u64s;
+ u16 bset_u64s[MAX_BSETS];
+
+ /* live keys only: */
+ u16 packed_keys;
+ u16 unpacked_keys;
+};
+
+struct bset_tree {
+ /*
+ * We construct a binary tree in an array as if the array
+ * started at 1, so that things line up on the same cachelines
+ * better: see comments in bset.c at cacheline_to_bkey() for
+ * details
+ */
+
+ /* size of the binary tree and prev array */
+ u16 size;
+
+ /* function of size - precalculated for to_inorder() */
+ u16 extra;
+
+ u16 data_offset;
+ u16 aux_data_offset;
+ u16 end_offset;
+
+ struct bpos max_key;
+};
+
+struct btree_write {
+ struct journal_entry_pin journal;
+ struct closure_waitlist wait;
+};
+
+struct btree {
+ /* Hottest entries first */
+ struct rhash_head hash;
+
+ /* Key/pointer for this btree node */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
+ struct six_lock lock;
+
+ unsigned long flags;
+ u16 written;
+ u8 level;
+ u8 btree_id;
+ u8 nsets;
+ u8 nr_key_bits;
+
+ struct bkey_format format;
+
+ struct btree_node *data;
+ void *aux_data;
+
+ /*
+ * Sets of sorted keys - the real btree node - plus a binary search tree
+ *
+ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+ * to the memory we have allocated for this btree node. Additionally,
+ * set[0]->data points to the entire btree node as it exists on disk.
+ */
+ struct bset_tree set[MAX_BSETS];
+
+ struct btree_nr_keys nr;
+ u16 sib_u64s[2];
+ u16 whiteout_u64s;
+ u16 uncompacted_whiteout_u64s;
+ u8 page_order;
+ u8 unpack_fn_len;
+
+ /*
+ * XXX: add a delete sequence number, so when btree_node_relock() fails
+ * because the lock sequence number has changed - i.e. the contents were
+ * modified - we can still relock the node if it's still the one we
+ * want, without redoing the traversal
+ */
+
+ /*
+ * For asynchronous splits/interior node updates:
+ * When we do a split, we allocate new child nodes and update the parent
+ * node to point to them: we update the parent in memory immediately,
+ * but then we must wait until the children have been written out before
+ * the update to the parent can be written - this is a list of the
+ * btree_interior_updates that are blocking this node from being
+ * written:
+ */
+ struct list_head write_blocked;
+
+ struct open_bucket *ob;
+
+ /* lru list */
+ struct list_head list;
+
+ struct btree_write writes[2];
+
+#ifdef CONFIG_BCACHE_DEBUG
+ bool *expensive_debug_checks;
+#endif
+};
+
+#define BTREE_FLAG(flag) \
+static inline bool btree_node_ ## flag(struct btree *b) \
+{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
+ \
+static inline void set_btree_node_ ## flag(struct btree *b) \
+{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
+ \
+static inline void clear_btree_node_ ## flag(struct btree *b) \
+{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
+
+enum btree_flags {
+ BTREE_NODE_read_error,
+ BTREE_NODE_write_error,
+ BTREE_NODE_dirty,
+ BTREE_NODE_write_idx,
+ BTREE_NODE_accessed,
+ BTREE_NODE_write_in_flight,
+ BTREE_NODE_just_written,
+};
+
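+/*
+ * e.g. BTREE_FLAG(dirty) generates btree_node_dirty(), set_btree_node_dirty()
+ * and clear_btree_node_dirty(), all operating on the BTREE_NODE_dirty bit in
+ * b->flags:
+ */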
+BTREE_FLAG(read_error);
+BTREE_FLAG(write_error);
+BTREE_FLAG(dirty);
+BTREE_FLAG(write_idx);
+BTREE_FLAG(accessed);
+BTREE_FLAG(write_in_flight);
+BTREE_FLAG(just_written);
+
+static inline struct btree_write *btree_current_write(struct btree *b)
+{
+ return b->writes + btree_node_write_idx(b);
+}
+
+static inline struct btree_write *btree_prev_write(struct btree *b)
+{
+ return b->writes + (btree_node_write_idx(b) ^ 1);
+}
+
+static inline struct bset_tree *bset_tree_last(struct btree *b)
+{
+ EBUG_ON(!b->nsets);
+ return b->set + b->nsets - 1;
+}
+
+static inline struct bset *bset(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return (void *) b->data + t->data_offset * sizeof(u64);
+}
+
+static inline struct bset *btree_bset_first(struct btree *b)
+{
+ return bset(b, b->set);
+}
+
+static inline struct bset *btree_bset_last(struct btree *b)
+{
+ return bset(b, bset_tree_last(b));
+}
+
+static inline u16
+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
+{
+ size_t ret = (u64 *) k - (u64 *) b->data - 1;
+
+ EBUG_ON(ret > U16_MAX);
+ return ret;
+}
+
+static inline struct bkey_packed *
+__btree_node_offset_to_key(const struct btree *b, u16 k)
+{
+ return (void *) ((u64 *) b->data + k + 1);
+}
+
+#define __bkey_idx(_set, _offset) \
+ ((_set)->_data + (_offset))
+
+#define bkey_idx(_set, _offset) \
+ ((typeof(&(_set)->start[0])) __bkey_idx((_set), (_offset)))
+
+#define __bset_bkey_last(_set) \
+ __bkey_idx((_set), (_set)->u64s)
+
+#define bset_bkey_last(_set) \
+ bkey_idx((_set), le16_to_cpu((_set)->u64s))
+
+#define btree_bkey_first(_b, _t) (bset(_b, _t)->start)
+
+#define btree_bkey_last(_b, _t) \
+({ \
+ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
+ bset_bkey_last(bset(_b, _t))); \
+ \
+ __btree_node_offset_to_key(_b, (_t)->end_offset); \
+})
+
+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+{
+ t->end_offset =
+ __btree_node_key_to_offset(b, bset_bkey_last(bset(b, t)));
+ btree_bkey_last(b, t);
+}
+
+static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
+ const struct bset *i)
+{
+ t->data_offset = (u64 *) i - (u64 *) b->data;
+
+ EBUG_ON(bset(b, t) != i);
+
+ set_btree_bset_end(b, t);
+}
+
+static inline unsigned bset_byte_offset(struct btree *b, void *i)
+{
+ return i - (void *) b->data;
+}
+
+/* Type of keys @b contains: */
+static inline enum bkey_type btree_node_type(struct btree *b)
+{
+ return b->level ? BKEY_TYPE_BTREE : b->btree_id;
+}
+
+static inline const struct bkey_ops *btree_node_ops(struct btree *b)
+{
+ return bch_bkey_ops[btree_node_type(b)];
+}
+
+static inline bool btree_node_has_ptrs(struct btree *b)
+{
+ return btree_type_has_ptrs(btree_node_type(b));
+}
+
+static inline bool btree_node_is_extents(struct btree *b)
+{
+ return btree_node_type(b) == BKEY_TYPE_EXTENTS;
+}
+
+struct btree_root {
+ struct btree *b;
+
+ struct btree_interior_update *as;
+
+ /* On disk root - see async splits: */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+ u8 level;
+ u8 alive;
+};
+
+/*
+ * Optional hook that will be called just prior to a btree node update, when
+ * we're holding the write lock and we know what key is about to be overwritten:
+ */
+
+struct btree_iter;
+struct bucket_stats_cache_set;
+struct btree_node_iter;
+
+enum extent_insert_hook_ret {
+ BTREE_HOOK_DO_INSERT,
+ BTREE_HOOK_NO_INSERT,
+ BTREE_HOOK_RESTART_TRANS,
+};
+
+struct extent_insert_hook {
+ enum extent_insert_hook_ret
+ (*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
+ struct bkey_s_c, const struct bkey_i *);
+};
+
+enum btree_insert_ret {
+ BTREE_INSERT_OK,
+ /* extent spanned multiple leaf nodes: have to traverse to next node: */
+ BTREE_INSERT_NEED_TRAVERSE,
+ /* write lock held for too long */
+ BTREE_INSERT_NEED_RESCHED,
+ /* leaf node needs to be split */
+ BTREE_INSERT_BTREE_NODE_FULL,
+ BTREE_INSERT_JOURNAL_RES_FULL,
+ BTREE_INSERT_ENOSPC,
+ BTREE_INSERT_NEED_GC_LOCK,
+};
+
+enum btree_gc_coalesce_fail_reason {
+ BTREE_GC_COALESCE_FAIL_RESERVE_GET,
+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
+};
+
+typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
+ struct btree *,
+ struct btree_node_iter *);
+
+#endif /* _BCACHE_BTREE_TYPES_H */
diff --git a/libbcache/btree_update.c b/libbcache/btree_update.c
new file mode 100644
index 0000000..95406a4
--- /dev/null
+++ b/libbcache/btree_update.c
@@ -0,0 +1,2343 @@
+
+#include "bcache.h"
+#include "alloc.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "extents.h"
+#include "journal.h"
+#include "keylist.h"
+#include "super.h"
+
+#include <linux/random.h>
+#include <linux/sort.h>
+#include <trace/events/bcache.h>
+
+static void btree_interior_update_updated_root(struct cache_set *,
+ struct btree_interior_update *,
+ enum btree_id);
+
+/* Calculate ideal packed bkey format for new btree nodes: */
+
+void __bch_btree_calc_format(struct bkey_format_state *s, struct btree *b)
+{
+ struct bkey_packed *k;
+ struct bset_tree *t;
+ struct bkey uk;
+
+ bch_bkey_format_add_pos(s, b->data->min_key);
+
+ for_each_bset(b, t)
+ for (k = btree_bkey_first(b, t);
+ k != btree_bkey_last(b, t);
+ k = bkey_next(k))
+ if (!bkey_whiteout(k)) {
+ uk = bkey_unpack_key(b, k);
+ bch_bkey_format_add_key(s, &uk);
+ }
+}
+
+static struct bkey_format bch_btree_calc_format(struct btree *b)
+{
+ struct bkey_format_state s;
+
+ bch_bkey_format_init(&s);
+ __bch_btree_calc_format(&s, b);
+
+ return bch_bkey_format_done(&s);
+}
+
+static size_t btree_node_u64s_with_format(struct btree *b,
+ struct bkey_format *new_f)
+{
+ struct bkey_format *old_f = &b->format;
+
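+ /*
+ * Each key that's currently packed changes size by the change in key_u64s;
+ * each key that's currently stored unpacked (at BKEY_U64s) would be
+ * repacked at new_f->key_u64s:
+ */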
+ /* stupid integer promotion rules */
+ ssize_t delta =
+ (((int) new_f->key_u64s - old_f->key_u64s) *
+ (int) b->nr.packed_keys) +
+ (((int) new_f->key_u64s - BKEY_U64s) *
+ (int) b->nr.unpacked_keys);
+
+ BUG_ON(delta + b->nr.live_u64s < 0);
+
+ return b->nr.live_u64s + delta;
+}
+
+/**
+ * bch_btree_node_format_fits - check if we could rewrite node with a new format
+ *
+ * This assumes all keys can pack with the new format -- it just checks if
+ * the re-packed keys would fit inside the node itself.
+ */
+bool bch_btree_node_format_fits(struct cache_set *c, struct btree *b,
+ struct bkey_format *new_f)
+{
+ size_t u64s = btree_node_u64s_with_format(b, new_f);
+
+ return __set_bytes(b->data, u64s) < btree_bytes(c);
+}
+
+/* Btree node freeing/allocation: */
+
+/*
+ * We're doing the index update that makes @b unreachable, update stuff to
+ * reflect that:
+ *
+ * Must be called _before_ btree_interior_update_updated_root() or
+ * btree_interior_update_updated_btree():
+ */
+static void bch_btree_node_free_index(struct cache_set *c, struct btree *b,
+ enum btree_id id, struct bkey_s_c k,
+ struct bucket_stats_cache_set *stats)
+{
+ struct btree_interior_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ for_each_pending_btree_node_free(c, as, d)
+ if (!bkey_cmp(k.k->p, d->key.k.p) &&
+ bkey_val_bytes(k.k) == bkey_val_bytes(&d->key.k) &&
+ !memcmp(k.v, &d->key.v, bkey_val_bytes(k.k)))
+ goto found;
+
+ BUG();
+found:
+ d->index_update_done = true;
+
+ /*
+ * Btree nodes are accounted as freed in cache_set_stats when they're
+ * freed from the index:
+ */
+ stats->s[S_COMPRESSED][S_META] -= c->sb.btree_node_size;
+ stats->s[S_UNCOMPRESSED][S_META] -= c->sb.btree_node_size;
+
+ /*
+ * We're dropping @k from the btree, but it's still live until the
+ * index update is persistent so we need to keep a reference around for
+ * mark and sweep to find - that's primarily what the
+ * btree_node_pending_free list is for.
+ *
+ * So here (when we set index_update_done = true), we're moving an
+ * existing reference to a different part of the larger "gc keyspace" -
+ * and the new position comes after the old position, since GC marks
+ * the pending free list after it walks the btree.
+ *
+ * If we move the reference while mark and sweep is _between_ the old
+ * and the new position, mark and sweep will see the reference twice
+ * and it'll get double accounted - so check for that here and subtract
+ * to cancel out one of mark and sweep's markings if necessary:
+ */
+
+ /*
+ * bch_mark_key() compares the current gc pos to the pos we're
+ * moving this reference from, hence one comparison here:
+ */
+ if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
+ struct bucket_stats_cache_set tmp = { 0 };
+
+ bch_mark_key(c, bkey_i_to_s_c(&d->key),
+ -c->sb.btree_node_size, true, b
+ ? gc_pos_btree_node(b)
+ : gc_pos_btree_root(id),
+ &tmp, 0);
+ /*
+ * Don't apply tmp - pending deletes aren't tracked in
+ * cache_set_stats:
+ */
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void __btree_node_free(struct cache_set *c, struct btree *b,
+ struct btree_iter *iter)
+{
+ trace_bcache_btree_node_free(c, b);
+
+ BUG_ON(b == btree_node_root(c, b));
+ BUG_ON(b->ob);
+ BUG_ON(!list_empty(&b->write_blocked));
+
+ six_lock_write(&b->lock);
+
+ if (btree_node_dirty(b))
+ bch_btree_complete_write(c, b, btree_current_write(b));
+ clear_btree_node_dirty(b);
+
+ mca_hash_remove(c, b);
+
+ mutex_lock(&c->btree_cache_lock);
+ list_move(&b->list, &c->btree_cache_freeable);
+ mutex_unlock(&c->btree_cache_lock);
+
+ /*
+ * By using six_unlock_write() directly instead of
+ * btree_node_unlock_write(), we don't update the iterator's sequence
+ * numbers and cause future btree_node_relock() calls to fail:
+ */
+ six_unlock_write(&b->lock);
+}
+
+void bch_btree_node_free_never_inserted(struct cache_set *c, struct btree *b)
+{
+ struct open_bucket *ob = b->ob;
+
+ b->ob = NULL;
+
+ __btree_node_free(c, b, NULL);
+
+ bch_open_bucket_put(c, ob);
+}
+
+void bch_btree_node_free_inmem(struct btree_iter *iter, struct btree *b)
+{
+ bch_btree_iter_node_drop_linked(iter, b);
+
+ __btree_node_free(iter->c, b, iter);
+
+ bch_btree_iter_node_drop(iter, b);
+}
+
+static void bch_btree_node_free_ondisk(struct cache_set *c,
+ struct pending_btree_node_free *pending)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+
+ BUG_ON(!pending->index_update_done);
+
+ bch_mark_key(c, bkey_i_to_s_c(&pending->key),
+ -c->sb.btree_node_size, true,
+ gc_phase(GC_PHASE_PENDING_DELETE),
+ &stats, 0);
+ /*
+ * Don't apply stats - pending deletes aren't tracked in
+ * cache_set_stats:
+ */
+}
+
+void btree_open_bucket_put(struct cache_set *c, struct btree *b)
+{
+ bch_open_bucket_put(c, b->ob);
+ b->ob = NULL;
+}
+
+static struct btree *__bch_btree_node_alloc(struct cache_set *c,
+ bool use_reserve,
+ struct disk_reservation *res,
+ struct closure *cl)
+{
+ BKEY_PADDED(k) tmp;
+ struct open_bucket *ob;
+ struct btree *b;
+ unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE;
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+ if (c->btree_reserve_cache_nr > reserve) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ ob = a->ob;
+ bkey_copy(&tmp.k, &a->k);
+ mutex_unlock(&c->btree_reserve_cache_lock);
+ goto mem_alloc;
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+retry:
+ /* alloc_sectors is weird, I suppose */
+ bkey_extent_init(&tmp.k);
+ tmp.k.k.size = c->sb.btree_node_size;
+
+ ob = bch_alloc_sectors(c, &c->btree_write_point,
+ bkey_i_to_extent(&tmp.k),
+ res->nr_replicas,
+ use_reserve ? RESERVE_BTREE : RESERVE_NONE,
+ cl);
+ if (IS_ERR(ob))
+ return ERR_CAST(ob);
+
+ if (tmp.k.k.size < c->sb.btree_node_size) {
+ bch_open_bucket_put(c, ob);
+ goto retry;
+ }
+mem_alloc:
+ b = mca_alloc(c);
+
+ /* we hold cannibalize_lock: */
+ BUG_ON(IS_ERR(b));
+ BUG_ON(b->ob);
+
+ bkey_copy(&b->key, &tmp.k);
+ b->key.k.size = 0;
+ b->ob = ob;
+
+ return b;
+}
+
+static struct btree *bch_btree_node_alloc(struct cache_set *c,
+ unsigned level, enum btree_id id,
+ struct btree_reserve *reserve)
+{
+ struct btree *b;
+
+ BUG_ON(!reserve->nr);
+
+ b = reserve->b[--reserve->nr];
+
+ BUG_ON(mca_hash_insert(c, b, level, id));
+
+ set_btree_node_accessed(b);
+ set_btree_node_dirty(b);
+
+ bch_bset_init_first(b, &b->data->keys);
+ memset(&b->nr, 0, sizeof(b->nr));
+ b->data->magic = cpu_to_le64(bset_magic(&c->disk_sb));
+ SET_BSET_BTREE_LEVEL(&b->data->keys, level);
+
+ bch_btree_build_aux_trees(b);
+
+ bch_check_mark_super(c, &b->key, true);
+
+ trace_bcache_btree_node_alloc(c, b);
+ return b;
+}
+
+struct btree *__btree_node_alloc_replacement(struct cache_set *c,
+ struct btree *b,
+ struct bkey_format format,
+ struct btree_reserve *reserve)
+{
+ struct btree *n;
+
+ n = bch_btree_node_alloc(c, b->level, b->btree_id, reserve);
+
+ n->data->min_key = b->data->min_key;
+ n->data->max_key = b->data->max_key;
+ n->data->format = format;
+
+ btree_node_set_format(n, format);
+
+ bch_btree_sort_into(c, n, b);
+
+ btree_node_reset_sib_u64s(n);
+
+ n->key.k.p = b->key.k.p;
+ trace_bcache_btree_node_alloc_replacement(c, b, n);
+
+ return n;
+}
+
+struct btree *btree_node_alloc_replacement(struct cache_set *c,
+ struct btree *b,
+ struct btree_reserve *reserve)
+{
+ struct bkey_format new_f = bch_btree_calc_format(b);
+
+ /*
+ * The keys might expand with the new format - if they wouldn't fit in
+ * the btree node anymore, use the old format for now:
+ */
+ if (!bch_btree_node_format_fits(c, b, &new_f))
+ new_f = b->format;
+
+ return __btree_node_alloc_replacement(c, b, new_f, reserve);
+}
+
+static void bch_btree_set_root_inmem(struct cache_set *c, struct btree *b,
+ struct btree_reserve *btree_reserve)
+{
+ struct btree *old = btree_node_root(c, b);
+
+ /* Root nodes cannot be reaped */
+ mutex_lock(&c->btree_cache_lock);
+ list_del_init(&b->list);
+ mutex_unlock(&c->btree_cache_lock);
+
+ mutex_lock(&c->btree_root_lock);
+ btree_node_root(c, b) = b;
+ mutex_unlock(&c->btree_root_lock);
+
+ if (btree_reserve) {
+ /*
+ * New allocation (we're not being called because we're in
+ * bch_btree_root_read()) - do marking while holding
+ * btree_root_lock:
+ */
+ struct bucket_stats_cache_set stats = { 0 };
+
+ bch_mark_key(c, bkey_i_to_s_c(&b->key),
+ c->sb.btree_node_size, true,
+ gc_pos_btree_root(b->btree_id),
+ &stats, 0);
+
+ if (old)
+ bch_btree_node_free_index(c, NULL, old->btree_id,
+ bkey_i_to_s_c(&old->key),
+ &stats);
+ bch_cache_set_stats_apply(c, &stats, &btree_reserve->disk_res,
+ gc_pos_btree_root(b->btree_id));
+ }
+
+ bch_recalc_btree_reserve(c);
+}
+
+static void bch_btree_set_root_ondisk(struct cache_set *c, struct btree *b)
+{
+ struct btree_root *r = &c->btree_roots[b->btree_id];
+
+ mutex_lock(&c->btree_root_lock);
+
+ BUG_ON(b != r->b);
+ bkey_copy(&r->key, &b->key);
+ r->level = b->level;
+ r->alive = true;
+
+ mutex_unlock(&c->btree_root_lock);
+}
+
+/*
+ * Only for cache set bringup, when first reading the btree roots or allocating
+ * btree roots when initializing a new cache set:
+ */
+void bch_btree_set_root_initial(struct cache_set *c, struct btree *b,
+ struct btree_reserve *btree_reserve)
+{
+ BUG_ON(btree_node_root(c, b));
+
+ bch_btree_set_root_inmem(c, b, btree_reserve);
+ bch_btree_set_root_ondisk(c, b);
+}
+
+/**
+ * bch_btree_set_root - update the root in memory and on disk
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks. However, you must hold an intent lock on the
+ * old root.
+ *
+ * Note: This allocates a journal entry but doesn't add any keys to
+ * it. All the btree roots are part of every journal write, so there
+ * is nothing new to be done. This just guarantees that there is a
+ * journal write.
+ */
+static void bch_btree_set_root(struct btree_iter *iter, struct btree *b,
+ struct btree_interior_update *as,
+ struct btree_reserve *btree_reserve)
+{
+ struct cache_set *c = iter->c;
+ struct btree *old;
+
+ trace_bcache_btree_set_root(c, b);
+ BUG_ON(!b->written);
+
+ old = btree_node_root(c, b);
+
+ /*
+ * Ensure no one is using the old root while we switch to the
+ * new root:
+ */
+ btree_node_lock_write(old, iter);
+
+ bch_btree_set_root_inmem(c, b, btree_reserve);
+
+ btree_interior_update_updated_root(c, as, iter->btree_id);
+
+ /*
+ * Unlock old root after new root is visible:
+ *
+ * The new root isn't persistent, but that's ok: we still have
+ * an intent lock on the new root, and any updates that would
+ * depend on the new root would have to update the new root.
+ */
+ btree_node_unlock_write(old, iter);
+}
+
+static struct btree *__btree_root_alloc(struct cache_set *c, unsigned level,
+ enum btree_id id,
+ struct btree_reserve *reserve)
+{
+ struct btree *b = bch_btree_node_alloc(c, level, id, reserve);
+
+ b->data->min_key = POS_MIN;
+ b->data->max_key = POS_MAX;
+ b->data->format = bch_btree_calc_format(b);
+ b->key.k.p = POS_MAX;
+
+ btree_node_set_format(b, b->data->format);
+ bch_btree_build_aux_trees(b);
+
+ six_unlock_write(&b->lock);
+
+ return b;
+}
+
+void bch_btree_reserve_put(struct cache_set *c, struct btree_reserve *reserve)
+{
+ bch_disk_reservation_put(c, &reserve->disk_res);
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+
+ while (reserve->nr) {
+ struct btree *b = reserve->b[--reserve->nr];
+
+ six_unlock_write(&b->lock);
+
+ if (c->btree_reserve_cache_nr <
+ ARRAY_SIZE(c->btree_reserve_cache)) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+
+ a->ob = b->ob;
+ b->ob = NULL;
+ bkey_copy(&a->k, &b->key);
+ } else {
+ bch_open_bucket_put(c, b->ob);
+ b->ob = NULL;
+ }
+
+ __btree_node_free(c, b, NULL);
+
+ six_unlock_intent(&b->lock);
+ }
+
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ mempool_free(reserve, &c->btree_reserve_pool);
+}
+
+static struct btree_reserve *__bch_btree_reserve_get(struct cache_set *c,
+ unsigned nr_nodes,
+ unsigned flags,
+ struct closure *cl)
+{
+ struct btree_reserve *reserve;
+ struct btree *b;
+ struct disk_reservation disk_res = { 0, 0 };
+ unsigned sectors = nr_nodes * c->sb.btree_node_size;
+ int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD|
+ BCH_DISK_RESERVATION_METADATA;
+
+ if (flags & BTREE_INSERT_NOFAIL)
+ disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
+
+ /*
+ * This check isn't necessary for correctness - it's just to potentially
+ * prevent us from doing a lot of work that'll end up being wasted:
+ */
+ ret = bch_journal_error(&c->journal);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (bch_disk_reservation_get(c, &disk_res, sectors, disk_res_flags))
+ return ERR_PTR(-ENOSPC);
+
+ BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+
+ /*
+ * Protects reaping from the btree node cache and using the btree node
+ * open bucket reserve:
+ */
+ ret = mca_cannibalize_lock(c, cl);
+ if (ret) {
+ bch_disk_reservation_put(c, &disk_res);
+ return ERR_PTR(ret);
+ }
+
+ reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
+
+ reserve->disk_res = disk_res;
+ reserve->nr = 0;
+
+ while (reserve->nr < nr_nodes) {
+ b = __bch_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE,
+ &disk_res, cl);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto err_free;
+ }
+
+ reserve->b[reserve->nr++] = b;
+ }
+
+ mca_cannibalize_unlock(c);
+ return reserve;
+err_free:
+ bch_btree_reserve_put(c, reserve);
+ mca_cannibalize_unlock(c);
+ trace_bcache_btree_reserve_get_fail(c, nr_nodes, cl);
+ return ERR_PTR(ret);
+}
+
+struct btree_reserve *bch_btree_reserve_get(struct cache_set *c,
+ struct btree *b,
+ unsigned extra_nodes,
+ unsigned flags,
+ struct closure *cl)
+{
+ unsigned depth = btree_node_root(c, b)->level - b->level;
+ unsigned nr_nodes = btree_reserve_required_nodes(depth) + extra_nodes;
+
+ return __bch_btree_reserve_get(c, nr_nodes, flags, cl);
+}
+
+int bch_btree_root_alloc(struct cache_set *c, enum btree_id id,
+ struct closure *writes)
+{
+ struct closure cl;
+ struct btree_reserve *reserve;
+ struct btree *b;
+
+ closure_init_stack(&cl);
+
+ while (1) {
+ /* XXX haven't calculated capacity yet :/ */
+ reserve = __bch_btree_reserve_get(c, 1, 0, &cl);
+ if (!IS_ERR(reserve))
+ break;
+
+ if (PTR_ERR(reserve) == -ENOSPC)
+ return PTR_ERR(reserve);
+
+ closure_sync(&cl);
+ }
+
+ b = __btree_root_alloc(c, 0, id, reserve);
+
+ bch_btree_node_write(c, b, writes, SIX_LOCK_intent, -1);
+
+ bch_btree_set_root_initial(c, b, reserve);
+ btree_open_bucket_put(c, b);
+ six_unlock_intent(&b->lock);
+
+ bch_btree_reserve_put(c, reserve);
+
+ return 0;
+}
+
+static void bch_insert_fixup_btree_ptr(struct btree_iter *iter,
+ struct btree *b,
+ struct bkey_i *insert,
+ struct btree_node_iter *node_iter,
+ struct disk_reservation *disk_res)
+{
+ struct cache_set *c = iter->c;
+ struct bucket_stats_cache_set stats = { 0 };
+ struct bkey_packed *k;
+ struct bkey tmp;
+
+ if (bkey_extent_is_data(&insert->k))
+ bch_mark_key(c, bkey_i_to_s_c(insert),
+ c->sb.btree_node_size, true,
+ gc_pos_btree_node(b), &stats, 0);
+
+ while ((k = bch_btree_node_iter_peek_all(node_iter, b)) &&
+ !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
+ bch_btree_node_iter_advance(node_iter, b);
+
+ /*
+ * If we're overwriting, look up pending delete and mark so that gc
+ * marks it on the pending delete list:
+ */
+ if (k && !bkey_cmp_packed(b, k, &insert->k))
+ bch_btree_node_free_index(c, b, iter->btree_id,
+ bkey_disassemble(b, k, &tmp),
+ &stats);
+
+ bch_cache_set_stats_apply(c, &stats, disk_res, gc_pos_btree_node(b));
+
+ bch_btree_bset_insert_key(iter, b, node_iter, insert);
+ set_btree_node_dirty(b);
+}
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+/* Handle overwrites and do insert, for non extents: */
+bool bch_btree_bset_insert_key(struct btree_iter *iter,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_i *insert)
+{
+ const struct bkey_format *f = &b->format;
+ struct bkey_packed *k;
+ struct bset_tree *t;
+ unsigned clobber_u64s;
+
+ EBUG_ON(btree_node_just_written(b));
+ EBUG_ON(bset_written(b, btree_bset_last(b)));
+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
+ EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
+ bkey_cmp(insert->k.p, b->data->max_key) > 0);
+ BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->c, b));
+
+ k = bch_btree_node_iter_peek_all(node_iter, b);
+ if (k && !bkey_cmp_packed(b, k, &insert->k)) {
+ BUG_ON(bkey_whiteout(k));
+
+ t = bch_bkey_to_bset(b, k);
+
+ if (bset_unwritten(b, bset(b, t)) &&
+ bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
+ BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k));
+
+ k->type = insert->k.type;
+ memcpy_u64s(bkeyp_val(f, k), &insert->v,
+ bkey_val_u64s(&insert->k));
+ return true;
+ }
+
+ insert->k.needs_whiteout = k->needs_whiteout;
+
+ btree_keys_account_key_drop(&b->nr, t - b->set, k);
+
+ if (t == bset_tree_last(b)) {
+ clobber_u64s = k->u64s;
+
+ /*
+ * If we're deleting, and the key we're deleting doesn't
+ * need a whiteout (it wasn't overwriting a key that had
+ * been written to disk) - just delete it:
+ */
+ if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
+ bch_bset_delete(b, k, clobber_u64s);
+ bch_btree_node_iter_fix(iter, b, node_iter, t,
+ k, clobber_u64s, 0);
+ return true;
+ }
+
+ goto overwrite;
+ }
+
+ k->type = KEY_TYPE_DELETED;
+ bch_btree_node_iter_fix(iter, b, node_iter, t, k,
+ k->u64s, k->u64s);
+
+ if (bkey_whiteout(&insert->k)) {
+ reserve_whiteout(b, t, k);
+ return true;
+ } else {
+ k->needs_whiteout = false;
+ }
+ } else {
+ /*
+ * Deleting, but the key to delete wasn't found - nothing to do:
+ */
+ if (bkey_whiteout(&insert->k))
+ return false;
+
+ insert->k.needs_whiteout = false;
+ }
+
+ t = bset_tree_last(b);
+ k = bch_btree_node_iter_bset_pos(node_iter, b, t);
+ clobber_u64s = 0;
+overwrite:
+ bch_bset_insert(b, node_iter, k, insert, clobber_u64s);
+ if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
+ bch_btree_node_iter_fix(iter, b, node_iter, t, k,
+ clobber_u64s, k->u64s);
+ return true;
+}
+
+static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+ unsigned i)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct btree_write *w = container_of(pin, struct btree_write, journal);
+ struct btree *b = container_of(w, struct btree, writes[i]);
+
+ six_lock_read(&b->lock);
+ /*
+ * Reusing a btree node can race with the journal reclaim code calling
+ * the journal pin flush fn, and there's no good fix for this: we don't
+ * really want journal_pin_drop() to block until the flush fn is no
+ * longer running, because journal_pin_drop() is called from the btree
+ * node write endio function, and we can't wait on the flush fn to
+ * finish running in mca_reap() - where we make reused btree nodes ready
+ * to use again - because there, we're holding the lock this function
+ * needs - deadlock.
+ *
+ * So, the b->level check is a hack so we don't try to write nodes we
+ * shouldn't:
+ */
+ if (!b->level)
+ bch_btree_node_write(c, b, NULL, SIX_LOCK_read, i);
+ six_unlock_read(&b->lock);
+}
+
+static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin)
+{
+ return __btree_node_flush(j, pin, 0);
+}
+
+static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin)
+{
+ return __btree_node_flush(j, pin, 1);
+}
+
+void bch_btree_journal_key(struct btree_insert *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert)
+{
+ struct cache_set *c = trans->c;
+ struct journal *j = &c->journal;
+ struct btree *b = iter->nodes[0];
+ struct btree_write *w = btree_current_write(b);
+
+ EBUG_ON(iter->level || b->level);
+ EBUG_ON(!trans->journal_res.ref &&
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags));
+
+ if (!journal_pin_active(&w->journal))
+ bch_journal_pin_add(j, &w->journal,
+ btree_node_write_idx(b) == 0
+ ? btree_node_flush0
+ : btree_node_flush1);
+
+ if (trans->journal_res.ref) {
+ u64 seq = trans->journal_res.seq;
+ bool needs_whiteout = insert->k.needs_whiteout;
+
+ /*
+ * We have a bug where we're seeing an extent with an invalid crc
+ * entry in the journal, trying to track it down:
+ */
+ BUG_ON(bkey_invalid(c, b->btree_id, bkey_i_to_s_c(insert)));
+
+ /* ick */
+ insert->k.needs_whiteout = false;
+ bch_journal_add_keys(j, &trans->journal_res,
+ b->btree_id, insert);
+ insert->k.needs_whiteout = needs_whiteout;
+
+ if (trans->journal_seq)
+ *trans->journal_seq = seq;
+ btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
+ }
+
+ if (!btree_node_dirty(b))
+ set_btree_node_dirty(b);
+}
+
+static enum btree_insert_ret
+bch_insert_fixup_key(struct btree_insert *trans,
+ struct btree_insert_entry *insert)
+{
+ struct btree_iter *iter = insert->iter;
+
+ BUG_ON(iter->level);
+
+ if (bch_btree_bset_insert_key(iter,
+ iter->nodes[0],
+ &iter->node_iters[0],
+ insert->k))
+ bch_btree_journal_key(trans, iter, insert->k);
+
+ trans->did_work = true;
+ return BTREE_INSERT_OK;
+}
+
+static void verify_keys_sorted(struct keylist *l)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ struct bkey_i *k;
+
+ for_each_keylist_key(l, k)
+ BUG_ON(bkey_next(k) != l->top &&
+ bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
+#endif
+}
+
+static void btree_node_lock_for_insert(struct btree *b, struct btree_iter *iter)
+{
+ struct cache_set *c = iter->c;
+
+ btree_node_lock_write(b, iter);
+
+ if (btree_node_just_written(b) &&
+ bch_btree_post_write_cleanup(c, b))
+ bch_btree_iter_reinit_node(iter, b);
+
+ /*
+ * If the last bset has been written, or if it's gotten too big - start
+ * a new bset to insert into:
+ */
+ if (want_new_bset(c, b))
+ bch_btree_init_next(c, b, iter);
+}
+
+/* Asynchronous interior node update machinery */
+
+struct btree_interior_update *
+bch_btree_interior_update_alloc(struct cache_set *c)
+{
+ struct btree_interior_update *as;
+
+ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
+ memset(as, 0, sizeof(*as));
+ closure_init(&as->cl, &c->cl);
+ as->c = c;
+ as->mode = BTREE_INTERIOR_NO_UPDATE;
+
+ bch_keylist_init(&as->parent_keys, as->inline_keys,
+ ARRAY_SIZE(as->inline_keys));
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add(&as->list, &c->btree_interior_update_list);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ return as;
+}
+
+static void btree_interior_update_free(struct closure *cl)
+{
+ struct btree_interior_update *as = container_of(cl, struct btree_interior_update, cl);
+
+ mempool_free(as, &as->c->btree_interior_update_pool);
+}
+
+static void btree_interior_update_nodes_reachable(struct closure *cl)
+{
+ struct btree_interior_update *as =
+ container_of(cl, struct btree_interior_update, cl);
+ struct cache_set *c = as->c;
+ unsigned i;
+
+ bch_journal_pin_drop(&c->journal, &as->journal);
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ for (i = 0; i < as->nr_pending; i++)
+ bch_btree_node_free_ondisk(c, &as->pending[i]);
+ as->nr_pending = 0;
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_del(&as->list);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ closure_wake_up(&as->wait);
+
+ closure_return_with_destructor(cl, btree_interior_update_free);
+}
+
+static void btree_interior_update_nodes_written(struct closure *cl)
+{
+ struct btree_interior_update *as =
+ container_of(cl, struct btree_interior_update, cl);
+ struct cache_set *c = as->c;
+ struct btree *b;
+
+ if (bch_journal_error(&c->journal)) {
+ /* XXX what? */
+ }
+
+ /* XXX: missing error handling, damnit */
+
+ /* check for journal error, bail out if we flushed */
+
+ /*
+ * We did an update to a parent node where the pointers we added pointed
+ * to child nodes that weren't written yet: now, the child nodes have
+ * been written so we can write out the update to the interior node.
+ */
+retry:
+ mutex_lock(&c->btree_interior_update_lock);
+ switch (as->mode) {
+ case BTREE_INTERIOR_NO_UPDATE:
+ BUG();
+ case BTREE_INTERIOR_UPDATING_NODE:
+ /* The usual case: */
+ b = READ_ONCE(as->b);
+
+ if (!six_trylock_read(&b->lock)) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ six_lock_read(&b->lock);
+ six_unlock_read(&b->lock);
+ goto retry;
+ }
+
+ BUG_ON(!btree_node_dirty(b));
+ closure_wait(&btree_current_write(b)->wait, cl);
+
+ list_del(&as->write_blocked_list);
+
+ if (list_empty(&b->write_blocked))
+ bch_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
+ six_unlock_read(&b->lock);
+ break;
+
+ case BTREE_INTERIOR_UPDATING_AS:
+ /*
+ * The btree node we originally updated has been freed and is
+ * being rewritten - so we don't need to write anything here, we just
+ * need to signal to that btree_interior_update that it's ok to make the
+ * new replacement node visible:
+ */
+ closure_put(&as->parent_as->cl);
+
+ /*
+ * and then we have to wait on that btree_interior_update to finish:
+ */
+ closure_wait(&as->parent_as->wait, cl);
+ break;
+
+ case BTREE_INTERIOR_UPDATING_ROOT:
+ /* b is the new btree root: */
+ b = READ_ONCE(as->b);
+
+ if (!six_trylock_read(&b->lock)) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ six_lock_read(&b->lock);
+ six_unlock_read(&b->lock);
+ goto retry;
+ }
+
+ BUG_ON(c->btree_roots[b->btree_id].as != as);
+ c->btree_roots[b->btree_id].as = NULL;
+
+ bch_btree_set_root_ondisk(c, b);
+
+ /*
+ * We don't have to wait on anything here (before
+ * btree_interior_update_nodes_reachable frees the old nodes
+ * ondisk) - we've ensured that the very next journal write will
+ * have the pointer to the new root, and before the allocator
+ * can reuse the old nodes it'll have to do a journal commit:
+ */
+ six_unlock_read(&b->lock);
+ }
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ continue_at(cl, btree_interior_update_nodes_reachable, system_wq);
+}
+
+/*
+ * We're updating @b with pointers to nodes that haven't finished writing yet:
+ * block @b from being written until @as completes
+ */
+static void btree_interior_update_updated_btree(struct cache_set *c,
+ struct btree_interior_update *as,
+ struct btree *b)
+{
+ mutex_lock(&c->btree_interior_update_lock);
+
+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(!btree_node_dirty(b));
+
+ as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ as->b = b;
+ list_add(&as->write_blocked_list, &b->write_blocked);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ bch_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
+
+ continue_at(&as->cl, btree_interior_update_nodes_written,
+ system_freezable_wq);
+}
+
+static void btree_interior_update_updated_root(struct cache_set *c,
+ struct btree_interior_update *as,
+ enum btree_id btree_id)
+{
+ struct btree_root *r = &c->btree_roots[btree_id];
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+
+ /*
+ * Old root might not be persistent yet - if so, redirect its
+ * btree_interior_update operation to point to us:
+ */
+ if (r->as) {
+ BUG_ON(r->as->mode != BTREE_INTERIOR_UPDATING_ROOT);
+
+ r->as->b = NULL;
+ r->as->mode = BTREE_INTERIOR_UPDATING_AS;
+ r->as->parent_as = as;
+ closure_get(&as->cl);
+ }
+
+ as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ as->b = r->b;
+ r->as = as;
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ bch_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
+
+ continue_at(&as->cl, btree_interior_update_nodes_written,
+ system_freezable_wq);
+}
+
+static void interior_update_flush(struct journal *j, struct journal_entry_pin *pin)
+{
+ struct btree_interior_update *as =
+ container_of(pin, struct btree_interior_update, journal);
+
+ bch_journal_flush_seq_async(j, as->journal_seq, NULL);
+}
+
+/*
+ * @b is being split/rewritten: it may have pointers to not-yet-written btree
+ * nodes and thus outstanding btree_interior_updates - redirect @b's
+ * btree_interior_updates to point to this btree_interior_update:
+ */
+void bch_btree_interior_update_will_free_node(struct cache_set *c,
+ struct btree_interior_update *as,
+ struct btree *b)
+{
+ struct btree_interior_update *p, *n;
+ struct pending_btree_node_free *d;
+ struct bset_tree *t;
+
+ /*
+ * Does this node have data that hasn't been written in the journal?
+ *
+ * If so, we have to wait for the corresponding journal entry to be
+ * written before making the new nodes reachable - we can't just carry
+ * over the bset->journal_seq tracking, since we'll be mixing those keys
+ * in with keys that aren't in the journal anymore:
+ */
+ for_each_bset(b, t)
+ as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
+
+ /*
+ * Does this node have unwritten data that has a pin on the journal?
+ *
+ * If so, transfer that pin to the btree_interior_update operation -
+ * note that if we're freeing multiple nodes, we only need to keep the
+ * oldest pin of any of the nodes we're freeing. We'll release the pin
+ * when the new nodes are persistent and reachable on disk:
+ */
+ bch_journal_pin_add_if_older(&c->journal,
+ &b->writes[0].journal,
+ &as->journal, interior_update_flush);
+ bch_journal_pin_add_if_older(&c->journal,
+ &b->writes[1].journal,
+ &as->journal, interior_update_flush);
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ /*
+ * Does this node have any btree_interior_update operations preventing
+ * it from being written?
+ *
+ * If so, redirect them to point to this btree_interior_update: we can
+ * write out our new nodes, but we won't make them visible until those
+ * operations complete
+ */
+ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
+ BUG_ON(p->mode != BTREE_INTERIOR_UPDATING_NODE);
+
+ p->mode = BTREE_INTERIOR_UPDATING_AS;
+ list_del(&p->write_blocked_list);
+ p->b = NULL;
+ p->parent_as = as;
+ closure_get(&as->cl);
+ }
+
+ /* Add this node to the list of nodes being freed: */
+ BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
+
+ d = &as->pending[as->nr_pending++];
+ d->index_update_done = false;
+ d->seq = b->data->keys.seq;
+ d->btree_id = b->btree_id;
+ d->level = b->level;
+ bkey_copy(&d->key, &b->key);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void btree_node_interior_verify(struct btree *b)
+{
+ struct btree_node_iter iter;
+ struct bkey_packed *k;
+
+ BUG_ON(!b->level);
+
+ bch_btree_node_iter_init(&iter, b, b->key.k.p, false, false);
+#if 1
+ BUG_ON(!(k = bch_btree_node_iter_peek(&iter, b)) ||
+ bkey_cmp_left_packed(b, k, &b->key.k.p));
+
+ BUG_ON((bch_btree_node_iter_advance(&iter, b),
+ !bch_btree_node_iter_end(&iter)));
+#else
+ const char *msg;
+
+ msg = "not found";
+ k = bch_btree_node_iter_peek(&iter, b);
+ if (!k)
+ goto err;
+
+ msg = "isn't what it should be";
+ if (bkey_cmp_left_packed(b, k, &b->key.k.p))
+ goto err;
+
+ bch_btree_node_iter_advance(&iter, b);
+
+ msg = "isn't last key";
+ if (!bch_btree_node_iter_end(&iter))
+ goto err;
+ return;
+err:
+ bch_dump_btree_node(b);
+ printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode,
+ b->key.k.p.offset, msg);
+ BUG();
+#endif
+}
+
+static enum btree_insert_ret
+bch_btree_insert_keys_interior(struct btree *b,
+ struct btree_iter *iter,
+ struct keylist *insert_keys,
+ struct btree_interior_update *as,
+ struct btree_reserve *res)
+{
+ struct cache_set *c = iter->c;
+ struct btree_iter *linked;
+ struct btree_node_iter node_iter;
+ struct bkey_i *insert = bch_keylist_front(insert_keys);
+ struct bkey_packed *k;
+
+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
+ BUG_ON(!b->level);
+ BUG_ON(!as || as->b);
+ verify_keys_sorted(insert_keys);
+
+ btree_node_lock_for_insert(b, iter);
+
+ if (bch_keylist_u64s(insert_keys) >
+ bch_btree_keys_u64s_remaining(c, b)) {
+ btree_node_unlock_write(b, iter);
+ return BTREE_INSERT_BTREE_NODE_FULL;
+ }
+
+ /* Don't screw up @iter's position: */
+ node_iter = iter->node_iters[b->level];
+
+ /*
+ * btree_split(), btree_gc_coalesce() will insert keys before
+ * the iterator's current position - they know the keys go in
+ * the node the iterator points to:
+ */
+ while ((k = bch_btree_node_iter_prev_all(&node_iter, b)) &&
+ (bkey_cmp_packed(b, k, &insert->k) >= 0))
+ ;
+
+ while (!bch_keylist_empty(insert_keys)) {
+ insert = bch_keylist_front(insert_keys);
+
+ bch_insert_fixup_btree_ptr(iter, b, insert,
+ &node_iter, &res->disk_res);
+ bch_keylist_pop_front(insert_keys);
+ }
+
+ btree_interior_update_updated_btree(c, as, b);
+
+ for_each_linked_btree_node(iter, b, linked)
+ bch_btree_node_iter_peek(&linked->node_iters[b->level],
+ b);
+ bch_btree_node_iter_peek(&iter->node_iters[b->level], b);
+
+ bch_btree_iter_verify(iter, b);
+
+ if (bch_maybe_compact_whiteouts(c, b))
+ bch_btree_iter_reinit_node(iter, b);
+
+ btree_node_unlock_write(b, iter);
+
+ btree_node_interior_verify(b);
+ return BTREE_INSERT_OK;
+}
+
+/*
+ * Move keys from n1 (original replacement node, now lower node) to n2 (higher
+ * node)
+ */
+static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n1,
+ struct btree_reserve *reserve)
+{
+ size_t nr_packed = 0, nr_unpacked = 0;
+ struct btree *n2;
+ struct bset *set1, *set2;
+ struct bkey_packed *k, *prev = NULL;
+
+ n2 = bch_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve);
+ n2->data->max_key = n1->data->max_key;
+ n2->data->format = n1->format;
+ n2->key.k.p = n1->key.k.p;
+
+ btree_node_set_format(n2, n2->data->format);
+
+ set1 = btree_bset_first(n1);
+ set2 = btree_bset_first(n2);
+
+ /*
+ * Has to be a linear search because we don't have an auxiliary
+ * search tree yet
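+ *
+ * (the loop below stops once roughly 3/5 of set1's u64s have been
+ * walked, so n1 keeps about 60% of the node and the rest moves to n2)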
+ */
+ k = set1->start;
+ while (1) {
+ if (bkey_next(k) == bset_bkey_last(set1))
+ break;
+ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
+ break;
+
+ if (bkey_packed(k))
+ nr_packed++;
+ else
+ nr_unpacked++;
+
+ prev = k;
+ k = bkey_next(k);
+ }
+
+ BUG_ON(!prev);
+
+ n1->key.k.p = bkey_unpack_pos(n1, prev);
+ n1->data->max_key = n1->key.k.p;
+ n2->data->min_key =
+ btree_type_successor(n1->btree_id, n1->key.k.p);
+
+ set2->u64s = cpu_to_le16((u64 *) bset_bkey_last(set1) - (u64 *) k);
+ set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
+
+ set_btree_bset_end(n1, n1->set);
+ set_btree_bset_end(n2, n2->set);
+
+ n2->nr.live_u64s = le16_to_cpu(set2->u64s);
+ n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s);
+ n2->nr.packed_keys = n1->nr.packed_keys - nr_packed;
+ n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked;
+
+ n1->nr.live_u64s = le16_to_cpu(set1->u64s);
+ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s);
+ n1->nr.packed_keys = nr_packed;
+ n1->nr.unpacked_keys = nr_unpacked;
+
+ BUG_ON(!set1->u64s);
+ BUG_ON(!set2->u64s);
+
+ memcpy_u64s(set2->start,
+ bset_bkey_last(set1),
+ le16_to_cpu(set2->u64s));
+
+ btree_node_reset_sib_u64s(n1);
+ btree_node_reset_sib_u64s(n2);
+
+ bch_verify_btree_nr_keys(n1);
+ bch_verify_btree_nr_keys(n2);
+
+ if (n1->level) {
+ btree_node_interior_verify(n1);
+ btree_node_interior_verify(n2);
+ }
+
+ return n2;
+}
+
+/*
+ * For updates to interior nodes, we've got to do the insert before we split
+ * because the stuff we're inserting has to be inserted atomically. Post split,
+ * the keys might have to go in different nodes and the split would no longer be
+ * atomic.
+ *
+ * Worse, if the insert is from btree node coalescing and we do the insert
+ * after the split (and pick the pivot), the pivot we pick might be between
+ * nodes that were coalesced, and thus in the middle of a child node post
+ * coalescing:
+ */
+static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b,
+ struct keylist *keys,
+ struct btree_reserve *res)
+{
+ struct btree_node_iter node_iter;
+ struct bkey_i *k = bch_keylist_front(keys);
+ struct bkey_packed *p;
+ struct bset *i;
+
+ BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
+
+ bch_btree_node_iter_init(&node_iter, b, k->k.p, false, false);
+
+ while (!bch_keylist_empty(keys)) {
+ k = bch_keylist_front(keys);
+
+ BUG_ON(bch_keylist_u64s(keys) >
+ bch_btree_keys_u64s_remaining(iter->c, b));
+ BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
+ BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
+
+ bch_insert_fixup_btree_ptr(iter, b, k, &node_iter, &res->disk_res);
+ bch_keylist_pop_front(keys);
+ }
+
+ /*
+ * We can't tolerate whiteouts here - with whiteouts there can be
+ * duplicate keys, and it would be rather bad if we picked a duplicate
+ * for the pivot:
+ */
+ i = btree_bset_first(b);
+ p = i->start;
+ while (p != bset_bkey_last(i))
+ if (bkey_deleted(p)) {
+ le16_add_cpu(&i->u64s, -p->u64s);
+ set_btree_bset_end(b, b->set);
+ memmove_u64s_down(p, bkey_next(p),
+ (u64 *) bset_bkey_last(i) -
+ (u64 *) p);
+ } else
+ p = bkey_next(p);
+
+ BUG_ON(b->nsets != 1 ||
+ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
+
+ btree_node_interior_verify(b);
+}
+
+static void btree_split(struct btree *b, struct btree_iter *iter,
+ struct keylist *insert_keys,
+ struct btree_reserve *reserve,
+ struct btree_interior_update *as)
+{
+ struct cache_set *c = iter->c;
+ struct btree *parent = iter->nodes[b->level + 1];
+ struct btree *n1, *n2 = NULL, *n3 = NULL;
+ u64 start_time = local_clock();
+
+ BUG_ON(!parent && (b != btree_node_root(c, b)));
+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
+
+ bch_btree_interior_update_will_free_node(c, as, b);
+
+ n1 = btree_node_alloc_replacement(c, b, reserve);
+ if (b->level)
+ btree_split_insert_keys(iter, n1, insert_keys, reserve);
+
+ if (__set_blocks(n1->data,
+ le16_to_cpu(n1->data->keys.u64s),
+ block_bytes(c)) > BTREE_SPLIT_THRESHOLD(c)) {
+ trace_bcache_btree_node_split(c, b, b->nr.live_u64s);
+
+ n2 = __btree_split_node(iter, n1, reserve);
+
+ bch_btree_build_aux_trees(n2);
+ bch_btree_build_aux_trees(n1);
+ six_unlock_write(&n2->lock);
+ six_unlock_write(&n1->lock);
+
+ bch_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent, -1);
+
+ /*
+		 * Note that on recursive splits, parent_keys == insert_keys, so we
+ * can't start adding new keys to parent_keys before emptying it
+ * out (which we did with btree_split_insert_keys() above)
+ */
+ bch_keylist_add(&as->parent_keys, &n1->key);
+ bch_keylist_add(&as->parent_keys, &n2->key);
+
+ if (!parent) {
+ /* Depth increases, make a new root */
+ n3 = __btree_root_alloc(c, b->level + 1,
+ iter->btree_id,
+ reserve);
+ n3->sib_u64s[0] = U16_MAX;
+ n3->sib_u64s[1] = U16_MAX;
+
+ btree_split_insert_keys(iter, n3, &as->parent_keys,
+ reserve);
+ bch_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent, -1);
+ }
+ } else {
+ trace_bcache_btree_node_compact(c, b, b->nr.live_u64s);
+
+ bch_btree_build_aux_trees(n1);
+ six_unlock_write(&n1->lock);
+
+ bch_keylist_add(&as->parent_keys, &n1->key);
+ }
+
+ bch_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent, -1);
+
+ /* New nodes all written, now make them visible: */
+
+ if (parent) {
+ /* Split a non root node */
+ bch_btree_insert_node(parent, iter, &as->parent_keys,
+ reserve, as);
+ } else if (n3) {
+ bch_btree_set_root(iter, n3, as, reserve);
+ } else {
+ /* Root filled up but didn't need to be split */
+ bch_btree_set_root(iter, n1, as, reserve);
+ }
+
+ btree_open_bucket_put(c, n1);
+ if (n2)
+ btree_open_bucket_put(c, n2);
+ if (n3)
+ btree_open_bucket_put(c, n3);
+
+ /*
+ * Note - at this point other linked iterators could still have @b read
+ * locked; we're depending on the bch_btree_iter_node_replace() calls
+ * below removing all references to @b so we don't return with other
+ * iterators pointing to a node they have locked that's been freed.
+ *
+	 * We have to free the node first because the bch_btree_iter_node_replace()
+ * calls will drop _our_ iterator's reference - and intent lock - to @b.
+ */
+ bch_btree_node_free_inmem(iter, b);
+
+ /* Successful split, update the iterator to point to the new nodes: */
+
+ if (n3)
+ bch_btree_iter_node_replace(iter, n3);
+ if (n2)
+ bch_btree_iter_node_replace(iter, n2);
+ bch_btree_iter_node_replace(iter, n1);
+
+ bch_time_stats_update(&c->btree_split_time, start_time);
+}
+
+/**
+ * bch_btree_insert_node - insert bkeys into a given btree node
+ *
+ * @b:			btree node to insert into (must be an interior node)
+ * @iter:		btree iterator
+ * @insert_keys:	list of keys to insert
+ * @reserve:		btree reserve for allocating new nodes if a split is needed
+ * @as:			btree_interior_update this insert is part of
+ *
+ * Inserts as many keys as it can into a given btree node, splitting it if full.
+ * Inserts into interior nodes have to be atomic: if the keys don't all fit,
+ * the node is split and the keys are inserted as part of the split.
+ */
+void bch_btree_insert_node(struct btree *b,
+ struct btree_iter *iter,
+ struct keylist *insert_keys,
+ struct btree_reserve *reserve,
+ struct btree_interior_update *as)
+{
+ BUG_ON(!b->level);
+ BUG_ON(!reserve || !as);
+
+ switch (bch_btree_insert_keys_interior(b, iter, insert_keys,
+ as, reserve)) {
+ case BTREE_INSERT_OK:
+ break;
+ case BTREE_INSERT_BTREE_NODE_FULL:
+ btree_split(b, iter, insert_keys, reserve, as);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags)
+{
+ struct cache_set *c = iter->c;
+ struct btree *b = iter->nodes[0];
+ struct btree_reserve *reserve;
+ struct btree_interior_update *as;
+ struct closure cl;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+ /* Hack, because gc and splitting nodes doesn't mix yet: */
+ if (!down_read_trylock(&c->gc_lock)) {
+ bch_btree_iter_unlock(iter);
+ down_read(&c->gc_lock);
+ }
+
+ /*
+ * XXX: figure out how far we might need to split,
+ * instead of locking/reserving all the way to the root:
+ */
+ if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ reserve = bch_btree_reserve_get(c, b, 0, flags, &cl);
+ if (IS_ERR(reserve)) {
+ ret = PTR_ERR(reserve);
+ if (ret == -EAGAIN) {
+ bch_btree_iter_unlock(iter);
+ up_read(&c->gc_lock);
+ closure_sync(&cl);
+ return -EINTR;
+ }
+ goto out;
+ }
+
+ as = bch_btree_interior_update_alloc(c);
+
+ btree_split(b, iter, NULL, reserve, as);
+ bch_btree_reserve_put(c, reserve);
+
+ bch_btree_iter_set_locks_want(iter, 1);
+out:
+ up_read(&c->gc_lock);
+ return ret;
+}
+
+enum btree_node_sibling {
+ btree_prev_sib,
+ btree_next_sib,
+};
+
+static struct btree *btree_node_get_sibling(struct btree_iter *iter,
+ struct btree *b,
+ enum btree_node_sibling sib)
+{
+ struct btree *parent;
+ struct btree_node_iter node_iter;
+ struct bkey_packed *k;
+ BKEY_PADDED(k) tmp;
+ struct btree *ret;
+ unsigned level = b->level;
+
+ parent = iter->nodes[level + 1];
+ if (!parent)
+ return NULL;
+
+ if (!btree_node_relock(iter, level + 1)) {
+ bch_btree_iter_set_locks_want(iter, level + 2);
+ return ERR_PTR(-EINTR);
+ }
+
+ node_iter = iter->node_iters[parent->level];
+
+ k = bch_btree_node_iter_peek_all(&node_iter, parent);
+ BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
+
+ do {
+ k = sib == btree_prev_sib
+ ? bch_btree_node_iter_prev_all(&node_iter, parent)
+ : (bch_btree_node_iter_advance(&node_iter, parent),
+ bch_btree_node_iter_peek_all(&node_iter, parent));
+ if (!k)
+ return NULL;
+ } while (bkey_deleted(k));
+
+ bkey_unpack(parent, &tmp.k, k);
+
+ ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent);
+
+ if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) {
+ btree_node_unlock(iter, level);
+ ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent);
+ }
+
+ if (!IS_ERR(ret) && !btree_node_relock(iter, level)) {
+ six_unlock_intent(&ret->lock);
+ ret = ERR_PTR(-EINTR);
+ }
+
+ return ret;
+}
+
+static int __foreground_maybe_merge(struct btree_iter *iter,
+ enum btree_node_sibling sib)
+{
+ struct cache_set *c = iter->c;
+ struct btree_reserve *reserve;
+ struct btree_interior_update *as;
+ struct bkey_format_state new_s;
+ struct bkey_format new_f;
+ struct bkey_i delete;
+ struct btree *b, *m, *n, *prev, *next, *parent;
+ struct closure cl;
+ size_t sib_u64s;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+retry:
+ if (!btree_node_relock(iter, iter->level))
+ return 0;
+
+ b = iter->nodes[iter->level];
+
+ parent = iter->nodes[b->level + 1];
+ if (!parent)
+ return 0;
+
+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
+ return 0;
+
+ /* XXX: can't be holding read locks */
+ m = btree_node_get_sibling(iter, b, sib);
+ if (IS_ERR(m)) {
+ ret = PTR_ERR(m);
+ goto out;
+ }
+
+ /* NULL means no sibling: */
+ if (!m) {
+ b->sib_u64s[sib] = U16_MAX;
+ return 0;
+ }
+
+ if (sib == btree_prev_sib) {
+ prev = m;
+ next = b;
+ } else {
+ prev = b;
+ next = m;
+ }
+
+ bch_bkey_format_init(&new_s);
+ __bch_btree_calc_format(&new_s, b);
+ __bch_btree_calc_format(&new_s, m);
+ new_f = bch_bkey_format_done(&new_s);
+
+ sib_u64s = btree_node_u64s_with_format(b, &new_f) +
+ btree_node_u64s_with_format(m, &new_f);
+
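+	/*
+	 * If the combined size is over the hysteresis point, only credit it
+	 * halfway back towards the hysteresis point - i.e. halve the excess
+	 * over BTREE_FOREGROUND_MERGE_HYSTERESIS():
+	 */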
+ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
+ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+ sib_u64s /= 2;
+ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+ }
+
+ sib_u64s = min(sib_u64s, btree_max_u64s(c));
+ b->sib_u64s[sib] = sib_u64s;
+
+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
+ six_unlock_intent(&m->lock);
+ return 0;
+ }
+
+ /* We're changing btree topology, doesn't mix with gc: */
+ if (!down_read_trylock(&c->gc_lock)) {
+ six_unlock_intent(&m->lock);
+ bch_btree_iter_unlock(iter);
+
+ down_read(&c->gc_lock);
+ up_read(&c->gc_lock);
+ ret = -EINTR;
+ goto out;
+ }
+
+ if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) {
+ ret = -EINTR;
+ goto out_unlock;
+ }
+
+ reserve = bch_btree_reserve_get(c, b, 0,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE,
+ &cl);
+ if (IS_ERR(reserve)) {
+ ret = PTR_ERR(reserve);
+ goto out_unlock;
+ }
+
+ as = bch_btree_interior_update_alloc(c);
+
+ bch_btree_interior_update_will_free_node(c, as, b);
+ bch_btree_interior_update_will_free_node(c, as, m);
+
+ n = bch_btree_node_alloc(c, b->level, b->btree_id, reserve);
+ n->data->min_key = prev->data->min_key;
+ n->data->max_key = next->data->max_key;
+ n->data->format = new_f;
+ n->key.k.p = next->key.k.p;
+
+ btree_node_set_format(n, new_f);
+
+ bch_btree_sort_into(c, n, prev);
+ bch_btree_sort_into(c, n, next);
+
+ bch_btree_build_aux_trees(n);
+ six_unlock_write(&n->lock);
+
+ bkey_init(&delete.k);
+ delete.k.p = prev->key.k.p;
+ bch_keylist_add(&as->parent_keys, &delete);
+ bch_keylist_add(&as->parent_keys, &n->key);
+
+ bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+
+ bch_btree_insert_node(parent, iter, &as->parent_keys, reserve, as);
+
+ btree_open_bucket_put(c, n);
+ bch_btree_node_free_inmem(iter, b);
+ bch_btree_node_free_inmem(iter, m);
+ bch_btree_iter_node_replace(iter, n);
+
+ bch_btree_iter_verify(iter, n);
+
+ bch_btree_reserve_put(c, reserve);
+out_unlock:
+ if (ret != -EINTR && ret != -EAGAIN)
+ bch_btree_iter_set_locks_want(iter, 1);
+ six_unlock_intent(&m->lock);
+ up_read(&c->gc_lock);
+out:
+ if (ret == -EAGAIN || ret == -EINTR) {
+ bch_btree_iter_unlock(iter);
+ ret = -EINTR;
+ }
+
+ closure_sync(&cl);
+
+ if (ret == -EINTR) {
+ ret = bch_btree_iter_traverse(iter);
+ if (!ret)
+ goto retry;
+ }
+
+ return ret;
+}
+
+static inline int foreground_maybe_merge(struct btree_iter *iter,
+ enum btree_node_sibling sib)
+{
+ struct cache_set *c = iter->c;
+ struct btree *b;
+
+ if (!btree_node_locked(iter, iter->level))
+ return 0;
+
+ b = iter->nodes[iter->level];
+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
+ return 0;
+
+ return __foreground_maybe_merge(iter, sib);
+}
+
+/**
+ * btree_insert_key - insert one key into a leaf node
+ */
+static enum btree_insert_ret
+btree_insert_key(struct btree_insert *trans,
+ struct btree_insert_entry *insert)
+{
+ struct cache_set *c = trans->c;
+ struct btree_iter *iter = insert->iter;
+ struct btree *b = iter->nodes[0];
+ enum btree_insert_ret ret;
+ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+ int old_live_u64s = b->nr.live_u64s;
+ int live_u64s_added, u64s_added;
+
+ ret = !btree_node_is_extents(b)
+ ? bch_insert_fixup_key(trans, insert)
+ : bch_insert_fixup_extent(trans, insert);
+
+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+
+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+ if (u64s_added > live_u64s_added &&
+ bch_maybe_compact_whiteouts(iter->c, b))
+ bch_btree_iter_reinit_node(iter, b);
+
+ trace_bcache_btree_insert_key(c, b, insert->k);
+ return ret;
+}
+
+static bool same_leaf_as_prev(struct btree_insert *trans,
+ struct btree_insert_entry *i)
+{
+ /*
+ * Because we sorted the transaction entries, if multiple iterators
+ * point to the same leaf node they'll always be adjacent now:
+ */
+ return i != trans->entries &&
+ i[0].iter->nodes[0] == i[-1].iter->nodes[0];
+}
+
+#define trans_for_each_entry(trans, i) \
+ for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
+
+static void multi_lock_write(struct btree_insert *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_entry(trans, i)
+ if (!same_leaf_as_prev(trans, i))
+ btree_node_lock_for_insert(i->iter->nodes[0], i->iter);
+}
+
+static void multi_unlock_write(struct btree_insert *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_entry(trans, i)
+ if (!same_leaf_as_prev(trans, i))
+ btree_node_unlock_write(i->iter->nodes[0], i->iter);
+}
+
+static int btree_trans_entry_cmp(const void *_l, const void *_r)
+{
+ const struct btree_insert_entry *l = _l;
+ const struct btree_insert_entry *r = _r;
+
+ return btree_iter_cmp(l->iter, r->iter);
+}
+
+/* Normal update interface: */
+
+/**
+ * __bch_btree_insert_at - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ * if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: cache set read only
+ * -EIO: journal or btree node IO error
+ */
+int __bch_btree_insert_at(struct btree_insert *trans)
+{
+ struct cache_set *c = trans->c;
+ struct btree_insert_entry *i;
+ struct btree_iter *split = NULL;
+ bool cycle_gc_lock = false;
+ unsigned u64s;
+ int ret;
+
+ trans_for_each_entry(trans, i) {
+ EBUG_ON(i->iter->level);
+ EBUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+ }
+
+ sort(trans->entries, trans->nr, sizeof(trans->entries[0]),
+ btree_trans_entry_cmp, NULL);
+
+ if (unlikely(!percpu_ref_tryget(&c->writes)))
+ return -EROFS;
+retry_locks:
+ ret = -EINTR;
+ trans_for_each_entry(trans, i)
+ if (!bch_btree_iter_set_locks_want(i->iter, 1))
+ goto err;
+retry:
+ trans->did_work = false;
+ u64s = 0;
+ trans_for_each_entry(trans, i)
+ if (!i->done)
+ u64s += jset_u64s(i->k->k.u64s);
+
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+
+ ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
+ ? bch_journal_res_get(&c->journal,
+ &trans->journal_res,
+ u64s, u64s)
+ : 0;
+ if (ret)
+ goto err;
+
+ multi_lock_write(trans);
+
+ u64s = 0;
+ trans_for_each_entry(trans, i) {
+ /* Multiple inserts might go to same leaf: */
+ if (!same_leaf_as_prev(trans, i))
+ u64s = 0;
+
+ /*
+ * bch_btree_node_insert_fits() must be called under write lock:
+ * with only an intent lock, another thread can still call
+ * bch_btree_node_write(), converting an unwritten bset to a
+ * written one
+ */
+ if (!i->done) {
+ u64s += i->k->k.u64s;
+ if (!bch_btree_node_insert_fits(c,
+ i->iter->nodes[0], u64s)) {
+ split = i->iter;
+ goto unlock;
+ }
+ }
+ }
+
+ ret = 0;
+ split = NULL;
+ cycle_gc_lock = false;
+
+ trans_for_each_entry(trans, i) {
+ if (i->done)
+ continue;
+
+ switch (btree_insert_key(trans, i)) {
+ case BTREE_INSERT_OK:
+ i->done = true;
+ break;
+ case BTREE_INSERT_JOURNAL_RES_FULL:
+ case BTREE_INSERT_NEED_TRAVERSE:
+ ret = -EINTR;
+ break;
+ case BTREE_INSERT_NEED_RESCHED:
+ ret = -EAGAIN;
+ break;
+ case BTREE_INSERT_BTREE_NODE_FULL:
+ split = i->iter;
+ break;
+ case BTREE_INSERT_ENOSPC:
+ ret = -ENOSPC;
+ break;
+ case BTREE_INSERT_NEED_GC_LOCK:
+ cycle_gc_lock = true;
+ ret = -EINTR;
+ break;
+ default:
+ BUG();
+ }
+
+ if (!trans->did_work && (ret || split))
+ break;
+ }
+unlock:
+ multi_unlock_write(trans);
+ bch_journal_res_put(&c->journal, &trans->journal_res);
+
+ if (split)
+ goto split;
+ if (ret)
+ goto err;
+
+ /*
+ * hack: iterators are inconsistent when they hit end of leaf, until
+ * traversed again
+ */
+ trans_for_each_entry(trans, i)
+ if (i->iter->at_end_of_leaf)
+ goto out;
+
+ trans_for_each_entry(trans, i)
+ if (!same_leaf_as_prev(trans, i)) {
+ foreground_maybe_merge(i->iter, btree_prev_sib);
+ foreground_maybe_merge(i->iter, btree_next_sib);
+ }
+out:
+ /* make sure we didn't lose an error: */
+ if (!ret && IS_ENABLED(CONFIG_BCACHE_DEBUG))
+ trans_for_each_entry(trans, i)
+ BUG_ON(!i->done);
+
+ percpu_ref_put(&c->writes);
+ return ret;
+split:
+ /*
+ * have to drop journal res before splitting, because splitting means
+ * allocating new btree nodes, and holding a journal reservation
+ * potentially blocks the allocator:
+ */
+ ret = bch_btree_split_leaf(split, trans->flags);
+ if (ret)
+ goto err;
+ /*
+ * if the split didn't have to drop locks the insert will still be
+ * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
+ * and is overwriting won't have changed)
+ */
+ goto retry_locks;
+err:
+ if (cycle_gc_lock) {
+ down_read(&c->gc_lock);
+ up_read(&c->gc_lock);
+ }
+
+ if (ret == -EINTR) {
+ trans_for_each_entry(trans, i) {
+ int ret2 = bch_btree_iter_traverse(i->iter);
+ if (ret2) {
+ ret = ret2;
+ goto out;
+ }
+ }
+
+ /*
+		 * BTREE_INSERT_ATOMIC means we have to return -EINTR if we
+ * dropped locks:
+ */
+ if (!(trans->flags & BTREE_INSERT_ATOMIC))
+ goto retry;
+ }
+
+ goto out;
+}
+
+int bch_btree_insert_list_at(struct btree_iter *iter,
+ struct keylist *keys,
+ struct disk_reservation *disk_res,
+ struct extent_insert_hook *hook,
+ u64 *journal_seq, unsigned flags)
+{
+ BUG_ON(flags & BTREE_INSERT_ATOMIC);
+ BUG_ON(bch_keylist_empty(keys));
+ verify_keys_sorted(keys);
+
+ while (!bch_keylist_empty(keys)) {
+ /* need to traverse between each insert */
+ int ret = bch_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ ret = bch_btree_insert_at(iter->c, disk_res, hook,
+ journal_seq, flags,
+ BTREE_INSERT_ENTRY(iter, bch_keylist_front(keys)));
+ if (ret)
+ return ret;
+
+ bch_keylist_pop_front(keys);
+ }
+
+ return 0;
+}
+
+/**
+ * bch_btree_insert_check_key - insert a dummy key into the btree
+ *
+ * We insert a random key on a cache miss, then compare exchange on it
+ * once the cache promotion or backing device read completes. This
+ * ensures that if this key is written to after the read, the read will
+ * lose and not overwrite the key with stale data.
+ *
+ * Return values:
+ * -EAGAIN: @iter->cl was put on a waitlist waiting for btree node allocation
+ * -EINTR: btree node was changed while upgrading to write lock
+ */
+int bch_btree_insert_check_key(struct btree_iter *iter,
+ struct bkey_i *check_key)
+{
+ struct bpos saved_pos = iter->pos;
+ struct bkey_i_cookie *cookie;
+ BKEY_PADDED(key) tmp;
+ int ret;
+
+ BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&check_key->k)));
+
+ check_key->k.type = KEY_TYPE_COOKIE;
+ set_bkey_val_bytes(&check_key->k, sizeof(struct bch_cookie));
+
+ cookie = bkey_i_to_cookie(check_key);
+ get_random_bytes(&cookie->v, sizeof(cookie->v));
+
+ bkey_copy(&tmp.key, check_key);
+
+ ret = bch_btree_insert_at(iter->c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(iter, &tmp.key));
+
+ bch_btree_iter_rewind(iter, saved_pos);
+
+ return ret;
+}
+
+/**
+ * bch_btree_insert - insert a key into the given btree
+ * @c:			pointer to struct cache_set
+ * @id:			btree to insert into
+ * @k:			key to insert
+ * @disk_res:		disk reservation
+ * @hook:		insert callback
+ * @journal_seq:	optional journal sequence number pointer
+ * @flags:		BTREE_INSERT_* flags
+ */
+int bch_btree_insert(struct cache_set *c, enum btree_id id,
+ struct bkey_i *k,
+ struct disk_reservation *disk_res,
+ struct extent_insert_hook *hook,
+ u64 *journal_seq, int flags)
+{
+ struct btree_iter iter;
+ int ret, ret2;
+
+ bch_btree_iter_init_intent(&iter, c, id, bkey_start_pos(&k->k));
+
+ ret = bch_btree_iter_traverse(&iter);
+ if (unlikely(ret))
+ goto out;
+
+ ret = bch_btree_insert_at(c, disk_res, hook, journal_seq, flags,
+ BTREE_INSERT_ENTRY(&iter, k));
+out:
+	ret2 = bch_btree_iter_unlock(&iter);
+
+ return ret ?: ret2;
+}
+
+/**
+ * bch_btree_update - like bch_btree_insert(), but requires that the key being
+ * overwritten already exists (returns -ENOENT if it doesn't)
+ */
+int bch_btree_update(struct cache_set *c, enum btree_id id,
+ struct bkey_i *k, u64 *journal_seq)
+{
+ struct btree_iter iter;
+ struct bkey_s_c u;
+ int ret;
+
+ EBUG_ON(id == BTREE_ID_EXTENTS);
+
+ bch_btree_iter_init_intent(&iter, c, id, k->k.p);
+
+ u = bch_btree_iter_peek_with_holes(&iter);
+ ret = btree_iter_err(u);
+ if (ret)
+ return ret;
+
+ if (bkey_deleted(u.k)) {
+ bch_btree_iter_unlock(&iter);
+ return -ENOENT;
+ }
+
+ ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, 0,
+ BTREE_INSERT_ENTRY(&iter, k));
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
+
+/*
+ * bch_btree_delete_range - delete everything within a given range
+ *
+ * Range is a half open interval - [start, end)
+ */
+int bch_btree_delete_range(struct cache_set *c, enum btree_id id,
+ struct bpos start,
+ struct bpos end,
+ u64 version,
+ struct disk_reservation *disk_res,
+ struct extent_insert_hook *hook,
+ u64 *journal_seq)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch_btree_iter_init_intent(&iter, c, id, start);
+
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(ret = btree_iter_err(k))) {
+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ /* really shouldn't be using a bare, unpadded bkey_i */
+ struct bkey_i delete;
+
+ if (bkey_cmp(iter.pos, end) >= 0)
+ break;
+
+ bkey_init(&delete.k);
+
+ /*
+ * For extents, iter.pos won't necessarily be the same as
+ * bkey_start_pos(k.k) (for non extents they always will be the
+ * same). It's important that we delete starting from iter.pos
+ * because the range we want to delete could start in the middle
+ * of k.
+ *
+ * (bch_btree_iter_peek() does guarantee that iter.pos >=
+ * bkey_start_pos(k.k)).
+ */
+ delete.k.p = iter.pos;
+ delete.k.version = version;
+
+ if (iter.is_extents) {
+ /*
+ * The extents btree is special - KEY_TYPE_DISCARD is
+ * used for deletions, not KEY_TYPE_DELETED. This is an
+ * internal implementation detail that probably
+ * shouldn't be exposed (internally, KEY_TYPE_DELETED is
+ * used as a proxy for k->size == 0):
+ */
+ delete.k.type = KEY_TYPE_DISCARD;
+
+ /* create the biggest key we can */
+ bch_key_resize(&delete.k, max_sectors);
+ bch_cut_back(end, &delete.k);
+ }
+
+ ret = bch_btree_insert_at(c, disk_res, hook, journal_seq,
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&iter, &delete));
+ if (ret)
+ break;
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
+
+/**
+ * bch_btree_node_rewrite - Rewrite/move a btree node
+ *
+ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
+ * bch_btree_reserve_get() has to wait)
+ */
+int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
+ struct closure *cl)
+{
+ struct cache_set *c = iter->c;
+ struct btree *n, *parent = iter->nodes[b->level + 1];
+ struct btree_reserve *reserve;
+ struct btree_interior_update *as;
+ unsigned flags = BTREE_INSERT_NOFAIL;
+
+ /*
+	 * if the caller is going to wait when allocating the reserve fails, this is
+ * a rewrite that must succeed:
+ */
+ if (cl)
+ flags |= BTREE_INSERT_USE_RESERVE;
+
+ if (!bch_btree_iter_set_locks_want(iter, U8_MAX))
+ return -EINTR;
+
+ reserve = bch_btree_reserve_get(c, b, 0, flags, cl);
+ if (IS_ERR(reserve)) {
+ trace_bcache_btree_gc_rewrite_node_fail(c, b);
+ return PTR_ERR(reserve);
+ }
+
+ as = bch_btree_interior_update_alloc(c);
+
+ bch_btree_interior_update_will_free_node(c, as, b);
+
+ n = btree_node_alloc_replacement(c, b, reserve);
+
+ bch_btree_build_aux_trees(n);
+ six_unlock_write(&n->lock);
+
+ trace_bcache_btree_gc_rewrite_node(c, b);
+
+ bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+
+ if (parent) {
+ bch_btree_insert_node(parent, iter,
+ &keylist_single(&n->key),
+ reserve, as);
+ } else {
+ bch_btree_set_root(iter, n, as, reserve);
+ }
+
+ btree_open_bucket_put(c, n);
+
+ bch_btree_node_free_inmem(iter, b);
+
+ BUG_ON(!bch_btree_iter_node_replace(iter, n));
+
+ bch_btree_reserve_put(c, reserve);
+ return 0;
+}
diff --git a/libbcache/btree_update.h b/libbcache/btree_update.h
new file mode 100644
index 0000000..0154441
--- /dev/null
+++ b/libbcache/btree_update.h
@@ -0,0 +1,421 @@
+#ifndef _BCACHE_BTREE_INSERT_H
+#define _BCACHE_BTREE_INSERT_H
+
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "journal.h"
+
+struct cache_set;
+struct bkey_format_state;
+struct bkey_format;
+struct btree;
+
+#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4)
+
+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
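+/*
+ * i.e. merge hysteresis = 5 * BTREE_FOREGROUND_MERGE_THRESHOLD(c), or 5/3 of
+ * btree_max_u64s(c)
+ */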
+
+static inline void btree_node_reset_sib_u64s(struct btree *b)
+{
+ b->sib_u64s[0] = b->nr.live_u64s;
+ b->sib_u64s[1] = b->nr.live_u64s;
+}
+
+struct btree_reserve {
+ struct disk_reservation disk_res;
+ unsigned nr;
+ struct btree *b[BTREE_RESERVE_MAX];
+};
+
+void __bch_btree_calc_format(struct bkey_format_state *, struct btree *);
+bool bch_btree_node_format_fits(struct cache_set *c, struct btree *,
+ struct bkey_format *);
+
+/* Btree node freeing/allocation: */
+
+/*
+ * Tracks a btree node that has been (or is about to be) freed in memory, but
+ * has _not_ yet been freed on disk (because the write that makes the new
+ * node(s) visible and frees the old hasn't completed yet)
+ */
+struct pending_btree_node_free {
+ bool index_update_done;
+
+ __le64 seq;
+ enum btree_id btree_id;
+ unsigned level;
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+};
+
+/*
+ * Tracks an in progress split/rewrite of a btree node and the update to the
+ * parent node:
+ *
+ * When we split/rewrite a node, we do all the updates in memory without
+ * waiting for any writes to complete - we allocate the new node(s) and update
+ * the parent node, possibly recursively up to the root.
+ *
+ * The end result is that we have one or more new nodes being written -
+ * possibly several, if there were multiple splits - and then a write (updating
+ * an interior node) which will make all these new nodes visible.
+ *
+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old
+ * nodes can't be freed (their space on disk can't be reclaimed) until the
+ * update to the interior node that makes the new node visible completes -
+ * until then, the old nodes are still reachable on disk.
+ */
+struct btree_interior_update {
+ struct closure cl;
+ struct cache_set *c;
+
+ struct list_head list;
+
+ /* What kind of update are we doing? */
+ enum {
+ BTREE_INTERIOR_NO_UPDATE,
+ BTREE_INTERIOR_UPDATING_NODE,
+ BTREE_INTERIOR_UPDATING_ROOT,
+ BTREE_INTERIOR_UPDATING_AS,
+ } mode;
+
+ /*
+ * BTREE_INTERIOR_UPDATING_NODE:
+ * The update that made the new nodes visible was a regular update to an
+ * existing interior node - @b. We can't write out the update to @b
+ * until the new nodes we created are finished writing, so we block @b
+	 * from writing by putting this btree_interior_update on the
+ * @b->write_blocked list with @write_blocked_list:
+ */
+ struct btree *b;
+ struct list_head write_blocked_list;
+
+ /*
+	 * BTREE_INTERIOR_UPDATING_AS: the btree node we updated was freed, so
+	 * we're now blocking another btree_interior_update
+ * @parent_as - btree_interior_update that's waiting on our nodes to finish
+ * writing, before it can make new nodes visible on disk
+ * @wait - list of child btree_interior_updates that are waiting on this
+ * btree_interior_update to make all the new nodes visible before they can free
+ * their old btree nodes
+ */
+ struct btree_interior_update *parent_as;
+ struct closure_waitlist wait;
+
+ /*
+ * We may be freeing nodes that were dirty, and thus had journal entries
+ * pinned: we need to transfer the oldest of those pins to the
+ * btree_interior_update operation, and release it when the new node(s)
+ * are all persistent and reachable:
+ */
+ struct journal_entry_pin journal;
+
+ u64 journal_seq;
+
+ /*
+ * Nodes being freed:
+ * Protected by c->btree_node_pending_free_lock
+ */
+ struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
+ unsigned nr_pending;
+
+ /* Only here to reduce stack usage on recursive splits: */
+ struct keylist parent_keys;
+ /*
+ * Enough room for btree_split's keys without realloc - btree node
+	 * pointers never have crc/compression info, so we only need to account
+ * for the pointers for three keys
+ */
+ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
+};
+
+#define for_each_pending_btree_node_free(c, as, p) \
+ list_for_each_entry(as, &c->btree_interior_update_list, list) \
+ for (p = as->pending; p < as->pending + as->nr_pending; p++)
+
+void bch_btree_node_free_inmem(struct btree_iter *, struct btree *);
+void bch_btree_node_free_never_inserted(struct cache_set *, struct btree *);
+
+void btree_open_bucket_put(struct cache_set *c, struct btree *);
+
+struct btree *__btree_node_alloc_replacement(struct cache_set *,
+ struct btree *,
+ struct bkey_format,
+ struct btree_reserve *);
+struct btree *btree_node_alloc_replacement(struct cache_set *, struct btree *,
+ struct btree_reserve *);
+
+struct btree_interior_update *
+bch_btree_interior_update_alloc(struct cache_set *);
+
+void bch_btree_interior_update_will_free_node(struct cache_set *,
+ struct btree_interior_update *,
+ struct btree *);
+
+void bch_btree_set_root_initial(struct cache_set *, struct btree *,
+ struct btree_reserve *);
+
+void bch_btree_reserve_put(struct cache_set *, struct btree_reserve *);
+struct btree_reserve *bch_btree_reserve_get(struct cache_set *,
+ struct btree *, unsigned,
+ unsigned, struct closure *);
+
+int bch_btree_root_alloc(struct cache_set *, enum btree_id, struct closure *);
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+bool bch_btree_bset_insert_key(struct btree_iter *, struct btree *,
+ struct btree_node_iter *, struct bkey_i *);
+void bch_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
+ struct bkey_i *);
+
+static inline void *btree_data_end(struct cache_set *c, struct btree *b)
+{
+ return (void *) b->data + btree_bytes(c);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_start(struct cache_set *c,
+ struct btree *b)
+{
+ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_end(struct cache_set *c,
+ struct btree *b)
+{
+ return btree_data_end(c, b);
+}
+
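+/*
+ * b->written is in 512 byte sectors: write_block() points just past the data
+ * that has already been written out to disk:
+ */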
+static inline void *write_block(struct btree *b)
+{
+ return (void *) b->data + (b->written << 9);
+}
+
+static inline bool bset_written(struct btree *b, struct bset *i)
+{
+ return (void *) i < write_block(b);
+}
+
+static inline bool bset_unwritten(struct btree *b, struct bset *i)
+{
+ return (void *) i > write_block(b);
+}
+
+static inline unsigned bset_end_sector(struct cache_set *c, struct btree *b,
+ struct bset *i)
+{
+ return round_up(bset_byte_offset(b, bset_bkey_last(i)),
+ block_bytes(c)) >> 9;
+}
+
+static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c,
+ struct btree *b)
+{
+ struct bset *i = btree_bset_last(b);
+ unsigned used = bset_byte_offset(b, bset_bkey_last(i)) / sizeof(u64) +
+ b->whiteout_u64s +
+ b->uncompacted_whiteout_u64s;
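+	/* btree_node_size is in 512 byte sectors, i.e. 64 u64s per sector: */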
+ unsigned total = c->sb.btree_node_size << 6;
+
+ EBUG_ON(used > total);
+
+ if (bset_written(b, i))
+ return 0;
+
+ return total - used;
+}
+
+static inline unsigned btree_write_set_buffer(struct btree *b)
+{
+ /*
+ * Could buffer up larger amounts of keys for btrees with larger keys,
+ * pending benchmarking:
+ */
+ return 4 << 10;
+}
+
+static inline struct btree_node_entry *want_new_bset(struct cache_set *c,
+ struct btree *b)
+{
+ struct bset *i = btree_bset_last(b);
+ unsigned offset = max_t(unsigned, b->written << 9,
+ bset_byte_offset(b, bset_bkey_last(i)));
+ ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
+ (offset + sizeof(struct btree_node_entry) +
+ b->whiteout_u64s * sizeof(u64) +
+ b->uncompacted_whiteout_u64s * sizeof(u64));
+
+ EBUG_ON(offset > btree_bytes(c));
+
+ if ((unlikely(bset_written(b, i)) && n > 0) ||
+ (unlikely(__set_bytes(i, le16_to_cpu(i->u64s)) >
+ btree_write_set_buffer(b)) && n > btree_write_set_buffer(b)))
+ return (void *) b->data + offset;
+
+ return NULL;
+}
+
+/*
+ * write lock must be held on @b (else the dirty bset that we were going to
+ * insert into could be written out from under us)
+ */
+static inline bool bch_btree_node_insert_fits(struct cache_set *c,
+ struct btree *b, unsigned u64s)
+{
+ if (btree_node_is_extents(b)) {
+		/*
+		 * The insert key might split an existing key
+		 * (bch_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case):
+		 */
+ u64s += BKEY_EXTENT_U64s_MAX;
+ }
+
+ return u64s <= bch_btree_keys_u64s_remaining(c, b);
+}
+
+static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ if (bset_written(b, bset(b, t))) {
+ EBUG_ON(b->uncompacted_whiteout_u64s <
+ bkeyp_key_u64s(&b->format, k));
+ b->uncompacted_whiteout_u64s -=
+ bkeyp_key_u64s(&b->format, k);
+ }
+}
+
+static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ if (bset_written(b, bset(b, t))) {
+ BUG_ON(!k->needs_whiteout);
+ b->uncompacted_whiteout_u64s +=
+ bkeyp_key_u64s(&b->format, k);
+ }
+}
+
+void bch_btree_insert_node(struct btree *, struct btree_iter *,
+ struct keylist *, struct btree_reserve *,
+ struct btree_interior_update *as);
+
+/* Normal update interface: */
+
+struct btree_insert {
+ struct cache_set *c;
+ struct disk_reservation *disk_res;
+ struct journal_res journal_res;
+ u64 *journal_seq;
+ struct extent_insert_hook *hook;
+ unsigned flags;
+ bool did_work;
+
+ unsigned short nr;
+ struct btree_insert_entry {
+ struct btree_iter *iter;
+ struct bkey_i *k;
+ /*
+ * true if entire key was inserted - can only be false for
+ * extents
+ */
+ bool done;
+ } *entries;
+};
+
+int __bch_btree_insert_at(struct btree_insert *);
+
+
+#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...) N
+#define COUNT_ARGS(...) _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
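+/*
+ * e.g. COUNT_ARGS(a, b, c) expands to
+ * _TENTH_ARG(a, b, c, 9, 8, 7, 6, 5, 4, 3, 2, 1), whose tenth argument - and
+ * thus the result - is 3.
+ */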
+
+#define BTREE_INSERT_ENTRY(_iter, _k) \
+ ((struct btree_insert_entry) { \
+ .iter = (_iter), \
+ .k = (_k), \
+ .done = false, \
+ })
+
+/**
+ * bch_btree_insert_at - insert one or more keys at iterator positions
+ * @_c:			pointer to struct cache_set
+ * @_disk_res:		disk reservation
+ * @_hook:		extent insert callback
+ * @_journal_seq:	optional journal sequence number pointer
+ * @_flags:		BTREE_INSERT_* flags
+ *
+ * The variadic arguments are BTREE_INSERT_ENTRY()s: (iterator, key) pairs to
+ * insert.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ * if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: cache set read only
+ * -EIO: journal or btree node IO error
+ */
+#define bch_btree_insert_at(_c, _disk_res, _hook, \
+ _journal_seq, _flags, ...) \
+ __bch_btree_insert_at(&(struct btree_insert) { \
+ .c = (_c), \
+ .disk_res = (_disk_res), \
+ .journal_seq = (_journal_seq), \
+ .hook = (_hook), \
+ .flags = (_flags), \
+ .nr = COUNT_ARGS(__VA_ARGS__), \
+ .entries = (struct btree_insert_entry[]) { \
+ __VA_ARGS__ \
+ }})
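+/*
+ * Typical use (illustrative sketch only, mirroring bch_btree_insert() in
+ * btree_update.c): insert a single key at an already-traversed iterator
+ * position:
+ *
+ *	ret = bch_btree_insert_at(c, disk_res, hook, journal_seq, flags,
+ *				  BTREE_INSERT_ENTRY(&iter, k));
+ */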
+
+/*
+ * Don't drop/retake locks: instead return -EINTR if need to upgrade to intent
+ * locks, -EAGAIN if need to wait on btree reserve
+ */
+#define BTREE_INSERT_ATOMIC (1 << 0)
+
+/* Don't check for -ENOSPC: */
+#define BTREE_INSERT_NOFAIL (1 << 1)
+
+/* for copygc, or when merging btree nodes */
+#define BTREE_INSERT_USE_RESERVE (1 << 2)
+
+/*
+ * Insert is for journal replay: don't get journal reservations, or mark extents
+ * (bch_mark_key)
+ */
+#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3)
+
+int bch_btree_insert_list_at(struct btree_iter *, struct keylist *,
+ struct disk_reservation *,
+ struct extent_insert_hook *, u64 *, unsigned);
+
+static inline bool journal_res_insert_fits(struct btree_insert *trans,
+ struct btree_insert_entry *insert)
+{
+ unsigned u64s = 0;
+ struct btree_insert_entry *i;
+
+ /*
+ * If we didn't get a journal reservation, we're in journal replay and
+ * we're not journalling updates:
+ */
+ if (!trans->journal_res.ref)
+ return true;
+
+ for (i = insert; i < trans->entries + trans->nr; i++)
+ u64s += jset_u64s(i->k->k.u64s);
+
+ return u64s <= trans->journal_res.u64s;
+}
+
+int bch_btree_insert_check_key(struct btree_iter *, struct bkey_i *);
+int bch_btree_insert(struct cache_set *, enum btree_id, struct bkey_i *,
+ struct disk_reservation *,
+ struct extent_insert_hook *, u64 *, int flags);
+int bch_btree_update(struct cache_set *, enum btree_id,
+ struct bkey_i *, u64 *);
+
+int bch_btree_delete_range(struct cache_set *, enum btree_id,
+ struct bpos, struct bpos, u64,
+ struct disk_reservation *,
+ struct extent_insert_hook *, u64 *);
+
+int bch_btree_node_rewrite(struct btree_iter *, struct btree *, struct closure *);
+
+#endif /* _BCACHE_BTREE_INSERT_H */
+
diff --git a/libbcache/buckets.c b/libbcache/buckets.c
new file mode 100644
index 0000000..3398b25
--- /dev/null
+++ b/libbcache/buckets.c
@@ -0,0 +1,755 @@
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ *
+ * Bucket states:
+ * - free bucket: mark == 0
+ * The bucket contains no data and will not be read
+ *
+ * - allocator bucket: owned_by_allocator == 1
+ * The bucket is on a free list, or it is an open bucket
+ *
+ * - cached bucket: owned_by_allocator == 0 &&
+ * dirty_sectors == 0 &&
+ * cached_sectors > 0
+ * The bucket contains data but may be safely discarded as there are
+ * enough replicas of the data on other cache devices, or it has been
+ * written back to the backing device
+ *
+ * - dirty bucket: owned_by_allocator == 0 &&
+ * dirty_sectors > 0
+ *   The bucket contains data that we must not discard (either the only copy,
+ * or one of the 'main copies' for data requiring multiple replicas)
+ *
+ * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
+ * This is a btree node, journal or gen/prio bucket
+ *
+ * Lifecycle:
+ *
+ * bucket invalidated => bucket on freelist => open bucket =>
+ * [dirty bucket =>] cached bucket => bucket invalidated => ...
+ *
+ * Note that cache promotion can skip the dirty bucket step, as data
+ * is copied from a deeper tier to a shallower tier, onto a cached
+ * bucket.
+ * Note also that a cached bucket can spontaneously become dirty --
+ * see below.
+ *
+ * Only a traversal of the key space can determine whether a bucket is
+ * truly dirty or cached.
+ *
+ * Transitions:
+ *
+ * - free => allocator: bucket was invalidated
+ * - cached => allocator: bucket was invalidated
+ *
+ * - allocator => dirty: open bucket was filled up
+ * - allocator => cached: open bucket was filled up
+ * - allocator => metadata: metadata was allocated
+ *
+ * - dirty => cached: dirty sectors were copied to a deeper tier
+ * - dirty => free: dirty sectors were overwritten or moved (copy gc)
+ * - cached => free: cached sectors were overwritten
+ *
+ * - metadata => free: metadata was freed
+ *
+ * Oddities:
+ * - cached => dirty: a device was removed so formerly replicated data
+ * is no longer sufficiently replicated
+ * - free => cached: cannot happen
+ * - free => dirty: cannot happen
+ * - free => metadata: cannot happen
+ */
+
+#include "bcache.h"
+#include "alloc.h"
+#include "btree_gc.h"
+#include "buckets.h"
+
+#include <linux/preempt.h>
+#include <trace/events/bcache.h>
+
+#ifdef DEBUG_BUCKETS
+
+#define lg_local_lock lg_global_lock
+#define lg_local_unlock lg_global_unlock
+
+static void bch_cache_set_stats_verify(struct cache_set *c)
+{
+ struct bucket_stats_cache_set stats =
+ __bch_bucket_stats_read_cache_set(c);
+
+ if ((s64) stats.sectors_dirty < 0)
+ panic("sectors_dirty underflow: %lli\n", stats.sectors_dirty);
+
+ if ((s64) stats.sectors_cached < 0)
+ panic("sectors_cached underflow: %lli\n", stats.sectors_cached);
+
+ if ((s64) stats.sectors_meta < 0)
+ panic("sectors_meta underflow: %lli\n", stats.sectors_meta);
+
+ if ((s64) stats.sectors_persistent_reserved < 0)
+ panic("sectors_persistent_reserved underflow: %lli\n", stats.sectors_persistent_reserved);
+
+ if ((s64) stats.sectors_online_reserved < 0)
+ panic("sectors_online_reserved underflow: %lli\n", stats.sectors_online_reserved);
+}
+
+#else
+
+static void bch_cache_set_stats_verify(struct cache_set *c) {}
+
+#endif
+
+void bch_bucket_seq_cleanup(struct cache_set *c)
+{
+ u16 last_seq_ondisk = c->journal.last_seq_ondisk;
+ struct cache *ca;
+ struct bucket *g;
+ struct bucket_mark m;
+ unsigned i;
+
+ for_each_cache(ca, c, i)
+ for_each_bucket(g, ca) {
+ bucket_cmpxchg(g, m, ({
+ if (!m.wait_on_journal ||
+ ((s16) last_seq_ondisk -
+ (s16) m.journal_seq < 0))
+ break;
+
+ m.wait_on_journal = 0;
+ }));
+ }
+}
+
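+/*
+ * Sums two stats structs field by field, treating them as flat arrays of u64s
+ * (which assumes the stats structs consist entirely of u64 counters):
+ */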
+#define bucket_stats_add(_acc, _stats) \
+do { \
+ typeof(_acc) _a = (_acc), _s = (_stats); \
+ unsigned i; \
+ \
+ for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \
+ ((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \
+} while (0)
+
+#define bucket_stats_read_raw(_stats) \
+({ \
+ typeof(*this_cpu_ptr(_stats)) _acc = { 0 }; \
+ int cpu; \
+ \
+ for_each_possible_cpu(cpu) \
+ bucket_stats_add(&_acc, per_cpu_ptr((_stats), cpu)); \
+ \
+ _acc; \
+})
+
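+/*
+ * If gc has finished (GC_PHASE_DONE), sum up the percpu counters; while gc is
+ * in progress, return the cached copy instead. The seqcount loop retries if
+ * gc's position changed while we were reading:
+ */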
+#define bucket_stats_read_cached(_c, _cached, _uncached) \
+({ \
+ typeof(_cached) _ret; \
+ unsigned _seq; \
+ \
+ do { \
+ _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \
+ _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \
+ ? bucket_stats_read_raw(_uncached) \
+ : (_cached); \
+ } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \
+ \
+ _ret; \
+})
+
+struct bucket_stats_cache __bch_bucket_stats_read_cache(struct cache *ca)
+{
+ return bucket_stats_read_raw(ca->bucket_stats_percpu);
+}
+
+struct bucket_stats_cache bch_bucket_stats_read_cache(struct cache *ca)
+{
+ return bucket_stats_read_cached(ca->set,
+ ca->bucket_stats_cached,
+ ca->bucket_stats_percpu);
+}
+
+struct bucket_stats_cache_set
+__bch_bucket_stats_read_cache_set(struct cache_set *c)
+{
+ return bucket_stats_read_raw(c->bucket_stats_percpu);
+}
+
+struct bucket_stats_cache_set
+bch_bucket_stats_read_cache_set(struct cache_set *c)
+{
+ return bucket_stats_read_cached(c,
+ c->bucket_stats_cached,
+ c->bucket_stats_percpu);
+}
+
+static inline int is_meta_bucket(struct bucket_mark m)
+{
+ return !m.owned_by_allocator && m.is_metadata;
+}
+
+static inline int is_dirty_bucket(struct bucket_mark m)
+{
+ return !m.owned_by_allocator && !m.is_metadata && !!m.dirty_sectors;
+}
+
+static inline int is_cached_bucket(struct bucket_mark m)
+{
+ return !m.owned_by_allocator && !m.dirty_sectors && !!m.cached_sectors;
+}
+
+void bch_cache_set_stats_apply(struct cache_set *c,
+ struct bucket_stats_cache_set *stats,
+ struct disk_reservation *disk_res,
+ struct gc_pos gc_pos)
+{
+ s64 added =
+ stats->s[S_COMPRESSED][S_META] +
+ stats->s[S_COMPRESSED][S_DIRTY] +
+ stats->persistent_reserved +
+ stats->online_reserved;
+
+ /*
+ * Not allowed to reduce sectors_available except by getting a
+ * reservation:
+ */
+ BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0));
+
+ if (added > 0) {
+ disk_res->sectors -= added;
+ stats->online_reserved -= added;
+ }
+
+ lg_local_lock(&c->bucket_stats_lock);
+ /* online_reserved not subject to gc: */
+ this_cpu_ptr(c->bucket_stats_percpu)->online_reserved +=
+ stats->online_reserved;
+ stats->online_reserved = 0;
+
+ if (!gc_will_visit(c, gc_pos))
+ bucket_stats_add(this_cpu_ptr(c->bucket_stats_percpu), stats);
+
+ bch_cache_set_stats_verify(c);
+ lg_local_unlock(&c->bucket_stats_lock);
+
+ memset(stats, 0, sizeof(*stats));
+}
+
+static void bucket_stats_update(struct cache *ca,
+ struct bucket_mark old, struct bucket_mark new,
+ bool may_make_unavailable,
+ struct bucket_stats_cache_set *cache_set_stats)
+{
+ struct cache_set *c = ca->set;
+ struct bucket_stats_cache *cache_stats;
+
+ BUG_ON(!may_make_unavailable &&
+ is_available_bucket(old) &&
+ !is_available_bucket(new) &&
+ c->gc_pos.phase == GC_PHASE_DONE);
+
+ if (cache_set_stats) {
+ cache_set_stats->s[S_COMPRESSED][S_CACHED] +=
+ (int) new.cached_sectors - (int) old.cached_sectors;
+
+ cache_set_stats->s[S_COMPRESSED]
+ [old.is_metadata ? S_META : S_DIRTY] -=
+ old.dirty_sectors;
+
+ cache_set_stats->s[S_COMPRESSED]
+ [new.is_metadata ? S_META : S_DIRTY] +=
+ new.dirty_sectors;
+ }
+
+ preempt_disable();
+ cache_stats = this_cpu_ptr(ca->bucket_stats_percpu);
+
+ cache_stats->sectors_cached +=
+ (int) new.cached_sectors - (int) old.cached_sectors;
+
+ if (old.is_metadata)
+ cache_stats->sectors_meta -= old.dirty_sectors;
+ else
+ cache_stats->sectors_dirty -= old.dirty_sectors;
+
+ if (new.is_metadata)
+ cache_stats->sectors_meta += new.dirty_sectors;
+ else
+ cache_stats->sectors_dirty += new.dirty_sectors;
+
+ cache_stats->buckets_alloc +=
+ (int) new.owned_by_allocator - (int) old.owned_by_allocator;
+
+ cache_stats->buckets_meta += is_meta_bucket(new) - is_meta_bucket(old);
+ cache_stats->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
+ cache_stats->buckets_dirty += is_dirty_bucket(new) - is_dirty_bucket(old);
+ preempt_enable();
+
+ if (!is_available_bucket(old) && is_available_bucket(new))
+ bch_wake_allocator(ca);
+}
+
+void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+ struct bucket_mark old, new;
+
+ old = bucket_cmpxchg(g, new, ({
+ new.owned_by_allocator = 1;
+ new.is_metadata = 0;
+ new.cached_sectors = 0;
+ new.dirty_sectors = 0;
+ new.copygc = 0;
+ new.gen++;
+ }));
+
+ BUG_ON(old.dirty_sectors);
+
+ bucket_stats_update(ca, old, new, true, &stats);
+
+ /*
+ * Ick:
+ *
+ * Only stats.sectors_cached should be nonzero: this is important
+ * because in this path we modify cache_set_stats based on how the
+ * bucket_mark was modified, and the sector counts in bucket_mark are
+ * subject to (saturating) overflow - and if they did overflow, the
+ * cache set stats will now be off. We can tolerate this for
+ * sectors_cached, but not anything else:
+ */
+ stats.s[S_COMPRESSED][S_CACHED] = 0;
+ stats.s[S_UNCOMPRESSED][S_CACHED] = 0;
+ BUG_ON(!bch_is_zero(&stats, sizeof(stats)));
+
+ if (!old.owned_by_allocator && old.cached_sectors)
+ trace_bcache_invalidate(ca, g - ca->buckets,
+ old.cached_sectors);
+}
+
+void bch_mark_free_bucket(struct cache *ca, struct bucket *g)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+ struct bucket_mark old, new;
+
+ old = bucket_cmpxchg(g, new, ({
+ new.owned_by_allocator = 0;
+ new.is_metadata = 0;
+ new.cached_sectors = 0;
+ new.dirty_sectors = 0;
+ }));
+
+ bucket_stats_update(ca, old, new, false, &stats);
+}
+
+void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g,
+ bool owned_by_allocator)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+ struct bucket_mark old, new;
+
+ old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator);
+
+ bucket_stats_update(ca, old, new, true, &stats);
+}
+
+void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g,
+ bool may_make_unavailable)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+ struct bucket_mark old, new;
+
+ old = bucket_cmpxchg(g, new, ({
+ new.is_metadata = 1;
+ new.had_metadata = 1;
+ }));
+
+ BUG_ON(old.cached_sectors);
+ BUG_ON(old.dirty_sectors);
+
+ bucket_stats_update(ca, old, new, may_make_unavailable, &stats);
+}
+
+#define saturated_add(ca, dst, src, max) \
+do { \
+ BUG_ON((int) (dst) + (src) < 0); \
+ if ((dst) == (max)) \
+ ; \
+ else if ((dst) + (src) <= (max)) \
+ dst += (src); \
+ else { \
+ dst = (max); \
+ trace_bcache_sectors_saturated(ca); \
+ } \
+} while (0)
+
+#if 0
+/* Reverting this until the copygc + compression issue is fixed: */
+
+static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
+{
+ return crc_compression_type(crc)
+ ? sectors * crc_compressed_size(crc) / crc_uncompressed_size(crc)
+ : sectors;
+}
+
+static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
+{
+ return crc_compression_type(crc)
+ ? min_t(unsigned, crc_compressed_size(crc), sectors)
+ : sectors;
+}
+#else
+static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
+{
+ return sectors;
+}
+
+static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
+{
+ return sectors;
+}
+#endif
+
+/*
+ * Checking against gc's position has to be done here, inside the cmpxchg()
+ * loop, to avoid racing with the start of gc clearing all the marks - GC does
+ * that with the gc pos seqlock held.
+ */
+static void bch_mark_pointer(struct cache_set *c,
+ struct bkey_s_c_extent e,
+ struct cache *ca,
+ const union bch_extent_crc *crc,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum s_alloc type,
+ bool may_make_unavailable,
+ struct bucket_stats_cache_set *stats,
+ bool gc_will_visit, u64 journal_seq)
+{
+ struct bucket_mark old, new;
+ unsigned saturated;
+ struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
+ u64 v = READ_ONCE(g->_mark.counter);
+ unsigned old_sectors, new_sectors;
+ int disk_sectors, compressed_sectors;
+
+ if (sectors > 0) {
+ old_sectors = 0;
+ new_sectors = sectors;
+ } else {
+ old_sectors = e.k->size;
+ new_sectors = e.k->size + sectors;
+ }
+
+ disk_sectors = -__disk_sectors(crc, old_sectors)
+ + __disk_sectors(crc, new_sectors);
+ compressed_sectors = -__compressed_sectors(crc, old_sectors)
+ + __compressed_sectors(crc, new_sectors);
+
+ if (gc_will_visit) {
+ if (journal_seq)
+ bucket_cmpxchg(g, new, new.journal_seq = journal_seq);
+
+ goto out;
+ }
+
+ do {
+ new.counter = old.counter = v;
+ saturated = 0;
+
+ /*
+ * Check this after reading bucket mark to guard against
+ * the allocator invalidating a bucket after we've already
+ * checked the gen
+ */
+ if (gen_after(old.gen, ptr->gen)) {
+ EBUG_ON(type != S_CACHED &&
+ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
+ return;
+ }
+
+ EBUG_ON(type != S_CACHED &&
+ !may_make_unavailable &&
+ is_available_bucket(old) &&
+ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
+
+ if (type != S_CACHED &&
+ new.dirty_sectors == GC_MAX_SECTORS_USED &&
+ disk_sectors < 0)
+ saturated = -disk_sectors;
+
+ if (type == S_CACHED)
+ saturated_add(ca, new.cached_sectors, disk_sectors,
+ GC_MAX_SECTORS_USED);
+ else
+ saturated_add(ca, new.dirty_sectors, disk_sectors,
+ GC_MAX_SECTORS_USED);
+
+ if (!new.dirty_sectors &&
+ !new.cached_sectors) {
+ new.is_metadata = false;
+
+ if (journal_seq) {
+ new.wait_on_journal = true;
+ new.journal_seq = journal_seq;
+ }
+ } else {
+ new.is_metadata = (type == S_META);
+ }
+
+ new.had_metadata |= new.is_metadata;
+ } while ((v = cmpxchg(&g->_mark.counter,
+ old.counter,
+ new.counter)) != old.counter);
+
+ bucket_stats_update(ca, old, new, may_make_unavailable, NULL);
+
+ if (saturated &&
+ atomic_long_add_return(saturated,
+ &ca->saturated_count) >=
+ ca->free_inc.size << ca->bucket_bits) {
+ if (c->gc_thread) {
+ trace_bcache_gc_sectors_saturated(c);
+ wake_up_process(c->gc_thread);
+ }
+ }
+out:
+ stats->s[S_COMPRESSED][type] += compressed_sectors;
+ stats->s[S_UNCOMPRESSED][type] += sectors;
+}
+
+static void bch_mark_extent(struct cache_set *c, struct bkey_s_c_extent e,
+ s64 sectors, bool metadata,
+ bool may_make_unavailable,
+ struct bucket_stats_cache_set *stats,
+ bool gc_will_visit, u64 journal_seq)
+{
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+ struct cache *ca;
+ enum s_alloc type = metadata ? S_META : S_DIRTY;
+
+ BUG_ON(metadata && bkey_extent_is_cached(e.k));
+ BUG_ON(!sectors);
+
+ rcu_read_lock();
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+ bool dirty = bch_extent_ptr_is_dirty(c, e, ptr);
+
+ trace_bcache_mark_bucket(ca, e.k, ptr, sectors, dirty);
+
+ bch_mark_pointer(c, e, ca, crc, ptr, sectors,
+ dirty ? type : S_CACHED,
+ may_make_unavailable,
+ stats, gc_will_visit, journal_seq);
+ }
+ rcu_read_unlock();
+}
+
+static void __bch_mark_key(struct cache_set *c, struct bkey_s_c k,
+ s64 sectors, bool metadata,
+ bool may_make_unavailable,
+ struct bucket_stats_cache_set *stats,
+ bool gc_will_visit, u64 journal_seq)
+{
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ bch_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
+ may_make_unavailable, stats,
+ gc_will_visit, journal_seq);
+ break;
+ case BCH_RESERVATION:
+ stats->persistent_reserved += sectors;
+ break;
+ }
+}
+
+void __bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
+ s64 sectors, bool metadata,
+ struct bucket_stats_cache_set *stats)
+{
+ __bch_mark_key(c, k, sectors, metadata, true, stats, false, 0);
+}
+
+void bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
+ s64 sectors, bool metadata)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+
+ __bch_gc_mark_key(c, k, sectors, metadata, &stats);
+
+ preempt_disable();
+ bucket_stats_add(this_cpu_ptr(c->bucket_stats_percpu), &stats);
+ preempt_enable();
+}
+
+void bch_mark_key(struct cache_set *c, struct bkey_s_c k,
+ s64 sectors, bool metadata, struct gc_pos gc_pos,
+ struct bucket_stats_cache_set *stats, u64 journal_seq)
+{
+ /*
+ * synchronization w.r.t. GC:
+ *
+ * Normally, bucket sector counts/marks are updated on the fly, as
+ * references are added/removed from the btree, the lists of buckets the
+ * allocator owns, other metadata buckets, etc.
+ *
+ * When GC is in progress and going to mark this reference, we do _not_
+ * mark this reference here, to avoid double counting - GC will count it
+ * when it gets to it.
+ *
+ * To know whether we should mark a given reference (GC either isn't
+ * running, or has already marked references at this position) we
+ * construct a total order for everything GC walks. Then, we can simply
+ * compare the position of the reference we're marking - @gc_pos - with
+ * GC's current position. If GC is going to mark this reference, GC's
+ * current position will be less than @gc_pos; if GC's current position
+ * is greater than @gc_pos GC has either already walked this position,
+ * or isn't running.
+ *
+ * To avoid racing with GC's position changing, we have to deal with
+ * - GC's position being set to GC_POS_MIN when GC starts:
+ * bucket_stats_lock guards against this
+ * - GC's position overtaking @gc_pos: we guard against this with
+ * whatever lock protects the data structure the reference lives in
+ * (e.g. the btree node lock, or the relevant allocator lock).
+ */
+ lg_local_lock(&c->bucket_stats_lock);
+ __bch_mark_key(c, k, sectors, metadata, false, stats,
+ gc_will_visit(c, gc_pos), journal_seq);
+
+ bch_cache_set_stats_verify(c);
+ lg_local_unlock(&c->bucket_stats_lock);
+}
+
+static u64 __recalc_sectors_available(struct cache_set *c)
+{
+ return c->capacity - cache_set_sectors_used(c);
+}
+
+/* Used by gc when it's starting: */
+void bch_recalc_sectors_available(struct cache_set *c)
+{
+ int cpu;
+
+ lg_global_lock(&c->bucket_stats_lock);
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(c->bucket_stats_percpu, cpu)->available_cache = 0;
+
+ atomic64_set(&c->sectors_available,
+ __recalc_sectors_available(c));
+
+ lg_global_unlock(&c->bucket_stats_lock);
+}
+
+void bch_disk_reservation_put(struct cache_set *c,
+ struct disk_reservation *res)
+{
+ if (res->sectors) {
+ lg_local_lock(&c->bucket_stats_lock);
+ this_cpu_sub(c->bucket_stats_percpu->online_reserved,
+ res->sectors);
+
+ bch_cache_set_stats_verify(c);
+ lg_local_unlock(&c->bucket_stats_lock);
+
+ res->sectors = 0;
+ }
+}
+
+#define SECTORS_CACHE 1024
+
+int bch_disk_reservation_add(struct cache_set *c,
+ struct disk_reservation *res,
+ unsigned sectors, int flags)
+{
+ struct bucket_stats_cache_set *stats;
+ u64 old, new, v;
+ s64 sectors_available;
+ int ret;
+
+ sectors *= res->nr_replicas;
+
+ lg_local_lock(&c->bucket_stats_lock);
+ stats = this_cpu_ptr(c->bucket_stats_percpu);
+
+ if (sectors >= stats->available_cache)
+ goto out;
+
+ v = atomic64_read(&c->sectors_available);
+ do {
+ old = v;
+ if (old < sectors) {
+ lg_local_unlock(&c->bucket_stats_lock);
+ goto recalculate;
+ }
+
+ new = max_t(s64, 0, old - sectors - SECTORS_CACHE);
+ } while ((v = atomic64_cmpxchg(&c->sectors_available,
+ old, new)) != old);
+
+ stats->available_cache += old - new;
+out:
+ stats->available_cache -= sectors;
+ stats->online_reserved += sectors;
+ res->sectors += sectors;
+
+ bch_cache_set_stats_verify(c);
+ lg_local_unlock(&c->bucket_stats_lock);
+ return 0;
+
+recalculate:
+ /*
+ * GC recalculates sectors_available when it starts, so that hopefully
+ * we don't normally end up blocking here:
+ */
+
+ /*
+	 * Annoyingly, we can be called from extent_insert_fixup() with btree
+	 * locks held:
+ */
+
+ if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
+ if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
+ down_read(&c->gc_lock);
+ else if (!down_read_trylock(&c->gc_lock))
+ return -EINTR;
+ }
+ lg_global_lock(&c->bucket_stats_lock);
+
+ sectors_available = __recalc_sectors_available(c);
+
+ if (sectors <= sectors_available ||
+ (flags & BCH_DISK_RESERVATION_NOFAIL)) {
+ atomic64_set(&c->sectors_available,
+ max_t(s64, 0, sectors_available - sectors));
+ stats->online_reserved += sectors;
+ res->sectors += sectors;
+ ret = 0;
+ } else {
+ atomic64_set(&c->sectors_available, sectors_available);
+ ret = -ENOSPC;
+ }
+
+ bch_cache_set_stats_verify(c);
+ lg_global_unlock(&c->bucket_stats_lock);
+ if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
+ up_read(&c->gc_lock);
+
+ return ret;
+}
+
+int bch_disk_reservation_get(struct cache_set *c,
+ struct disk_reservation *res,
+ unsigned sectors, int flags)
+{
+ res->sectors = 0;
+ res->gen = c->capacity_gen;
+ res->nr_replicas = (flags & BCH_DISK_RESERVATION_METADATA)
+ ? c->opts.metadata_replicas
+ : c->opts.data_replicas;
+
+ return bch_disk_reservation_add(c, res, sectors, flags);
+}
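+
+/*
+ * Illustrative usage sketch, not from this patch: a writer takes a
+ * reservation before queueing dirty data and drops it once the write has
+ * been accounted. The function name is hypothetical; only the
+ * bch_disk_reservation_get()/put() calls come from the code above.
+ */
+#if 0
+static int reserve_for_write_sketch(struct cache_set *c, unsigned sectors)
+{
+	struct disk_reservation res;
+	int ret;
+
+	ret = bch_disk_reservation_get(c, &res, sectors, 0);
+	if (ret)
+		return ret;	/* -ENOSPC: not enough free space */
+
+	/* ... issue the write; res.sectors was scaled by nr_replicas ... */
+
+	bch_disk_reservation_put(c, &res);
+	return 0;
+}
+#endif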
diff --git a/libbcache/buckets.h b/libbcache/buckets.h
new file mode 100644
index 0000000..35100eb
--- /dev/null
+++ b/libbcache/buckets.h
@@ -0,0 +1,272 @@
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ */
+
+#ifndef _BUCKETS_H
+#define _BUCKETS_H
+
+#include "buckets_types.h"
+#include "super.h"
+
+#define for_each_bucket(b, ca) \
+ for (b = (ca)->buckets + (ca)->mi.first_bucket; \
+ b < (ca)->buckets + (ca)->mi.nbuckets; b++)
+
+#define bucket_cmpxchg(g, new, expr) \
+({ \
+ u64 _v = READ_ONCE((g)->_mark.counter); \
+ struct bucket_mark _old; \
+ \
+ do { \
+ (new).counter = _old.counter = _v; \
+ expr; \
+ } while ((_v = cmpxchg(&(g)->_mark.counter, \
+ _old.counter, \
+ (new).counter)) != _old.counter);\
+ _old; \
+})
+
+/*
+ * bucket_gc_gen() returns the difference between the bucket's current gen and
+ * the oldest gen of any pointer into that bucket in the btree.
+ */
+
+static inline u8 bucket_gc_gen(struct cache *ca, struct bucket *g)
+{
+ unsigned long r = g - ca->buckets;
+ return g->mark.gen - ca->oldest_gens[r];
+}
+
+static inline struct cache *PTR_CACHE(const struct cache_set *c,
+ const struct bch_extent_ptr *ptr)
+{
+ EBUG_ON(ptr->dev > rcu_dereference(c->members)->nr_in_set);
+
+ return rcu_dereference(c->cache[ptr->dev]);
+}
+
+static inline size_t PTR_BUCKET_NR(const struct cache *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return sector_to_bucket(ca, ptr->offset);
+}
+
+/*
+ * Returns 0 if no pointers or device offline - only for tracepoints!
+ */
+static inline size_t PTR_BUCKET_NR_TRACE(const struct cache_set *c,
+ const struct bkey_i *k,
+ unsigned ptr)
+{
+ size_t bucket = 0;
+#if 0
+ if (bkey_extent_is_data(&k->k)) {
+ const struct bch_extent_ptr *ptr;
+ const struct cache *ca;
+
+ rcu_read_lock();
+ extent_for_each_online_device(c, bkey_i_to_s_c_extent(k), ptr, ca) {
+ bucket = PTR_BUCKET_NR(ca, ptr);
+ break;
+ }
+ rcu_read_unlock();
+ }
+#endif
+ return bucket;
+}
+
+static inline struct bucket *PTR_BUCKET(const struct cache *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return ca->buckets + PTR_BUCKET_NR(ca, ptr);
+}
+
+static inline u8 __gen_after(u8 a, u8 b)
+{
+ u8 r = a - b;
+
+ return r > 128U ? 0 : r;
+}
+
+static inline u8 gen_after(u8 a, u8 b)
+{
+ u8 r = a - b;
+
+ BUG_ON(r > 128U);
+
+ return r;
+}
+
+/**
+ * ptr_stale() - check if a pointer points into a bucket that has been
+ * invalidated.
+ *
+ * Warning: PTR_CACHE(c, ptr) must equal ca.
+ */
+static inline u8 ptr_stale(const struct cache *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen);
+}
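+
+/*
+ * Illustrative example, not from this patch: gens are 8 bits and compared
+ * modulo 256, so a bucket gen that has wrapped past a pointer gen still
+ * reads as "after" it.
+ */
+#if 0
+static void gen_wraparound_example(void)
+{
+	BUG_ON(__gen_after(1, 255) != 2);	/* wrapped around; stale by 2 */
+	BUG_ON(__gen_after(5, 10) != 0);	/* a not after b: clamped to 0 */
+}
+#endif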
+
+/* bucket heaps */
+
+static inline bool bucket_min_cmp(struct bucket_heap_entry l,
+ struct bucket_heap_entry r)
+{
+ return l.val < r.val;
+}
+
+static inline bool bucket_max_cmp(struct bucket_heap_entry l,
+ struct bucket_heap_entry r)
+{
+ return l.val > r.val;
+}
+
+static inline void bucket_heap_push(struct cache *ca, struct bucket *g,
+ unsigned long val)
+{
+ struct bucket_heap_entry new = { g, val };
+
+ if (!heap_full(&ca->heap))
+ heap_add(&ca->heap, new, bucket_min_cmp);
+ else if (bucket_min_cmp(new, heap_peek(&ca->heap))) {
+ ca->heap.data[0] = new;
+ heap_sift(&ca->heap, 0, bucket_min_cmp);
+ }
+}
+
+/* bucket gc marks */
+
+/*
+ * The dirty and cached sector counts saturate. If this occurs,
+ * reference counting alone will not free the bucket, and a btree
+ * GC must be performed.
+ */
+#define GC_MAX_SECTORS_USED ((1U << 15) - 1)
+
+static inline bool bucket_unused(struct bucket *g)
+{
+ return !g->mark.counter;
+}
+
+static inline unsigned bucket_sectors_used(struct bucket *g)
+{
+ return g->mark.dirty_sectors + g->mark.cached_sectors;
+}
+
+/* Per device stats: */
+
+struct bucket_stats_cache __bch_bucket_stats_read_cache(struct cache *);
+struct bucket_stats_cache bch_bucket_stats_read_cache(struct cache *);
+
+static inline u64 __buckets_available_cache(struct cache *ca,
+ struct bucket_stats_cache stats)
+{
+ return max_t(s64, 0,
+ ca->mi.nbuckets - ca->mi.first_bucket -
+ stats.buckets_dirty -
+ stats.buckets_alloc -
+ stats.buckets_meta);
+}
+
+/*
+ * Number of reclaimable buckets - only for use by the allocator thread:
+ */
+static inline u64 buckets_available_cache(struct cache *ca)
+{
+ return __buckets_available_cache(ca, bch_bucket_stats_read_cache(ca));
+}
+
+static inline u64 __buckets_free_cache(struct cache *ca,
+ struct bucket_stats_cache stats)
+{
+ return __buckets_available_cache(ca, stats) +
+ fifo_used(&ca->free[RESERVE_NONE]) +
+ fifo_used(&ca->free_inc);
+}
+
+static inline u64 buckets_free_cache(struct cache *ca)
+{
+ return __buckets_free_cache(ca, bch_bucket_stats_read_cache(ca));
+}
+
+/* Cache set stats: */
+
+struct bucket_stats_cache_set __bch_bucket_stats_read_cache_set(struct cache_set *);
+struct bucket_stats_cache_set bch_bucket_stats_read_cache_set(struct cache_set *);
+void bch_cache_set_stats_apply(struct cache_set *,
+ struct bucket_stats_cache_set *,
+ struct disk_reservation *,
+ struct gc_pos);
+
+static inline u64 __cache_set_sectors_used(struct cache_set *c)
+{
+ struct bucket_stats_cache_set stats = __bch_bucket_stats_read_cache_set(c);
+ u64 reserved = stats.persistent_reserved +
+ stats.online_reserved;
+
+ return stats.s[S_COMPRESSED][S_META] +
+ stats.s[S_COMPRESSED][S_DIRTY] +
+ reserved +
+ (reserved >> 7);
+}
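+
+/*
+ * Note: the (reserved >> 7) term above pads the reserved space by 1/128th
+ * (~0.8%), presumably as slack against per-cpu accounting drift.
+ */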
+
+static inline u64 cache_set_sectors_used(struct cache_set *c)
+{
+ return min(c->capacity, __cache_set_sectors_used(c));
+}
+
+/* XXX: kill? */
+static inline u64 sectors_available(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+ u64 ret = 0;
+
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i)
+ ret += buckets_available_cache(ca) << ca->bucket_bits;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool is_available_bucket(struct bucket_mark mark)
+{
+ return (!mark.owned_by_allocator &&
+ !mark.is_metadata &&
+ !mark.dirty_sectors);
+}
+
+void bch_bucket_seq_cleanup(struct cache_set *);
+
+void bch_invalidate_bucket(struct cache *, struct bucket *);
+void bch_mark_free_bucket(struct cache *, struct bucket *);
+void bch_mark_alloc_bucket(struct cache *, struct bucket *, bool);
+void bch_mark_metadata_bucket(struct cache *, struct bucket *, bool);
+
+void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool,
+ struct bucket_stats_cache_set *);
+void bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool);
+void bch_mark_key(struct cache_set *, struct bkey_s_c, s64, bool,
+ struct gc_pos, struct bucket_stats_cache_set *, u64);
+
+void bch_recalc_sectors_available(struct cache_set *);
+
+void bch_disk_reservation_put(struct cache_set *,
+ struct disk_reservation *);
+
+#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
+#define BCH_DISK_RESERVATION_METADATA (1 << 1)
+#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 2)
+#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 3)
+
+int bch_disk_reservation_add(struct cache_set *,
+ struct disk_reservation *,
+ unsigned, int);
+int bch_disk_reservation_get(struct cache_set *,
+ struct disk_reservation *,
+ unsigned, int);
+
+#endif /* _BUCKETS_H */
diff --git a/libbcache/buckets_types.h b/libbcache/buckets_types.h
new file mode 100644
index 0000000..6bbdcd2
--- /dev/null
+++ b/libbcache/buckets_types.h
@@ -0,0 +1,99 @@
+#ifndef _BUCKETS_TYPES_H
+#define _BUCKETS_TYPES_H
+
+struct bucket_mark {
+ union {
+ struct {
+ u64 counter;
+ };
+
+ struct {
+ u8 gen;
+
+ /* generation copygc is going to move this bucket into */
+ unsigned copygc:1;
+ unsigned wait_on_journal:1;
+
+ /*
+ * If this bucket ever had metadata in it, the allocator must
+ * increment its gen before we reuse it:
+ */
+ unsigned had_metadata:1;
+
+ unsigned owned_by_allocator:1;
+ unsigned is_metadata:1;
+
+ u16 cached_sectors;
+ u16 dirty_sectors;
+
+ /*
+ * low bits of journal sequence number when this bucket was most
+ * recently modified:
+ */
+ u16 journal_seq;
+ };
+ };
+};
+
+struct bucket {
+ union {
+ struct {
+ u16 read_prio;
+ u16 write_prio;
+ };
+ u16 prio[2];
+ };
+
+ union {
+ struct bucket_mark _mark;
+ const struct bucket_mark mark;
+ };
+};
+
+struct bucket_stats_cache {
+ u64 buckets_dirty;
+ u64 buckets_cached;
+ u64 buckets_meta;
+ u64 buckets_alloc;
+
+ u64 sectors_dirty;
+ u64 sectors_cached;
+ u64 sectors_meta;
+};
+
+enum s_alloc {
+ S_META,
+ S_DIRTY,
+ S_CACHED,
+ S_ALLOC_NR,
+};
+
+enum s_compressed {
+ S_COMPRESSED,
+ S_UNCOMPRESSED,
+ S_COMPRESSED_NR,
+};
+
+struct bucket_stats_cache_set {
+ /* all fields are in units of 512 byte sectors: */
+ u64 s[S_COMPRESSED_NR][S_ALLOC_NR];
+ u64 persistent_reserved;
+ u64 online_reserved;
+ u64 available_cache;
+};
+
+struct bucket_heap_entry {
+ struct bucket *g;
+ unsigned long val;
+};
+
+/*
+ * A reservation for space on disk:
+ */
+struct disk_reservation {
+ u64 sectors;
+ u32 gen;
+ unsigned nr_replicas;
+};
+
+#endif /* _BUCKETS_TYPES_H */
diff --git a/libbcache/chardev.c b/libbcache/chardev.c
new file mode 100644
index 0000000..0b020c8
--- /dev/null
+++ b/libbcache/chardev.c
@@ -0,0 +1,319 @@
+/*
+ * This file adds support for a character device /dev/bcache-ctl that is used to
+ * atomically register a list of devices, remove a device from a cache_set
+ * and add a device to a cache set.
+ *
+ * Copyright (c) 2014 Datera, Inc.
+ *
+ */
+
+#include "bcache.h"
+#include "super.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/major.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/ioctl.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/bcache-ioctl.h>
+
+static long bch_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
+{
+ struct bch_ioctl_assemble arg;
+ const char *err;
+ u64 *user_devs = NULL;
+ char **devs = NULL;
+ unsigned i;
+ int ret = -EFAULT;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+	user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
+	devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
+	if (!user_devs || !devs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+ if (copy_from_user(user_devs, user_arg->devs,
+ sizeof(u64) * arg.nr_devs))
+ goto err;
+
+ for (i = 0; i < arg.nr_devs; i++) {
+ devs[i] = strndup_user((const char __user *)(unsigned long)
+ user_devs[i],
+ PATH_MAX);
+ if (!devs[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+
+ err = bch_register_cache_set(devs, arg.nr_devs,
+ cache_set_opts_empty(),
+ NULL);
+ if (err) {
+ pr_err("Could not register cache set: %s", err);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = 0;
+err:
+	if (devs)
+		for (i = 0; i < arg.nr_devs; i++)
+			kfree(devs[i]);
+	kfree(devs);
+	kfree(user_devs);
+ return ret;
+}
+
+static long bch_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
+{
+ struct bch_ioctl_incremental arg;
+ const char *err;
+ char *path;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ if (!path)
+ return -ENOMEM;
+
+ err = bch_register_one(path);
+ kfree(path);
+
+ if (err) {
+ pr_err("Could not register bcache devices: %s", err);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static long bch_global_ioctl(unsigned cmd, void __user *arg)
+{
+ switch (cmd) {
+ case BCH_IOCTL_ASSEMBLE:
+ return bch_ioctl_assemble(arg);
+ case BCH_IOCTL_INCREMENTAL:
+ return bch_ioctl_incremental(arg);
+ default:
+ return -ENOTTY;
+ }
+}
+
+static long bch_ioctl_stop(struct cache_set *c)
+{
+ bch_cache_set_stop(c);
+ return 0;
+}
+
+static long bch_ioctl_disk_add(struct cache_set *c,
+ struct bch_ioctl_disk_add __user *user_arg)
+{
+ struct bch_ioctl_disk_add arg;
+ char *path;
+ int ret;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ if (!path)
+ return -ENOMEM;
+
+ ret = bch_cache_set_add_cache(c, path);
+ kfree(path);
+
+ return ret;
+}
+
+/* returns with ref on ca->ref */
+static struct cache *bch_device_lookup(struct cache_set *c,
+ const char __user *dev)
+{
+ struct block_device *bdev;
+ struct cache *ca;
+ char *path;
+ unsigned i;
+
+ path = strndup_user(dev, PATH_MAX);
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ bdev = lookup_bdev(strim(path));
+ kfree(path);
+ if (IS_ERR(bdev))
+ return ERR_CAST(bdev);
+
+ for_each_cache(ca, c, i)
+ if (ca->disk_sb.bdev == bdev)
+ goto found;
+
+ ca = NULL;
+found:
+ bdput(bdev);
+ return ca;
+}
+
+static long bch_ioctl_disk_remove(struct cache_set *c,
+ struct bch_ioctl_disk_remove __user *user_arg)
+{
+ struct bch_ioctl_disk_remove arg;
+ struct cache *ca;
+ int ret;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch_cache_remove(ca, arg.flags & BCH_FORCE_IF_DATA_MISSING)
+ ? 0 : -EBUSY;
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+static long bch_ioctl_disk_fail(struct cache_set *c,
+ struct bch_ioctl_disk_fail __user *user_arg)
+{
+ struct bch_ioctl_disk_fail arg;
+ struct cache *ca;
+ int ret;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ /* XXX: failed not actually implemented yet */
+ ret = bch_cache_remove(ca, true);
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+static struct cache_member *bch_uuid_lookup(struct cache_set *c, uuid_le uuid)
+{
+ struct cache_member *mi = c->disk_mi;
+ unsigned i;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ for (i = 0; i < c->disk_sb.nr_in_set; i++)
+ if (!memcmp(&mi[i].uuid, &uuid, sizeof(uuid)))
+ return &mi[i];
+
+ return NULL;
+}
+
+static long bch_ioctl_disk_remove_by_uuid(struct cache_set *c,
+ struct bch_ioctl_disk_remove_by_uuid __user *user_arg)
+{
+	struct bch_ioctl_disk_remove_by_uuid arg;
+ struct cache_member *m;
+ int ret = -ENOENT;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ mutex_lock(&bch_register_lock);
+ if ((m = bch_uuid_lookup(c, arg.dev))) {
+ /* XXX: */
+ SET_CACHE_STATE(m, CACHE_FAILED);
+ bcache_write_super(c);
+ ret = 0;
+ }
+ mutex_unlock(&bch_register_lock);
+
+ return ret;
+}
+
+static long bch_ioctl_disk_fail_by_uuid(struct cache_set *c,
+ struct bch_ioctl_disk_fail_by_uuid __user *user_arg)
+{
+ struct bch_ioctl_disk_fail_by_uuid arg;
+ struct cache_member *m;
+ int ret = -ENOENT;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ mutex_lock(&bch_register_lock);
+ if ((m = bch_uuid_lookup(c, arg.dev))) {
+ SET_CACHE_STATE(m, CACHE_FAILED);
+ bcache_write_super(c);
+ ret = 0;
+ }
+ mutex_unlock(&bch_register_lock);
+
+ return ret;
+}
+
+static long bch_ioctl_query_uuid(struct cache_set *c,
+ struct bch_ioctl_query_uuid __user *user_arg)
+{
+	return copy_to_user(&user_arg->uuid,
+			    &c->disk_sb.user_uuid,
+			    sizeof(c->disk_sb.user_uuid))
+		? -EFAULT : 0;
+}
+
+long bch_cache_set_ioctl(struct cache_set *c, unsigned cmd, void __user *arg)
+{
+ /* ioctls that don't require admin cap: */
+ switch (cmd) {
+ case BCH_IOCTL_QUERY_UUID:
+ return bch_ioctl_query_uuid(c, arg);
+ }
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* ioctls that do require admin cap: */
+ switch (cmd) {
+ case BCH_IOCTL_RUN:
+ return -ENOTTY;
+ case BCH_IOCTL_STOP:
+ return bch_ioctl_stop(c);
+
+ case BCH_IOCTL_DISK_ADD:
+ return bch_ioctl_disk_add(c, arg);
+ case BCH_IOCTL_DISK_REMOVE:
+ return bch_ioctl_disk_remove(c, arg);
+ case BCH_IOCTL_DISK_FAIL:
+ return bch_ioctl_disk_fail(c, arg);
+
+ case BCH_IOCTL_DISK_REMOVE_BY_UUID:
+ return bch_ioctl_disk_remove_by_uuid(c, arg);
+ case BCH_IOCTL_DISK_FAIL_BY_UUID:
+ return bch_ioctl_disk_fail_by_uuid(c, arg);
+
+ default:
+ return -ENOTTY;
+ }
+}
+
+static long bch_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
+{
+ struct cache_set *c = filp->private_data;
+ void __user *arg = (void __user *) v;
+
+ return c
+ ? bch_cache_set_ioctl(c, cmd, arg)
+ : bch_global_ioctl(cmd, arg);
+}
+
+const struct file_operations bch_chardev_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = bch_chardev_ioctl,
+ .open = nonseekable_open,
+};
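+
+/*
+ * Illustrative userspace sketch, not from this patch: querying a
+ * filesystem's UUID through its per-filesystem control device
+ * (/dev/bcache<minor>-ctl). Error handling is minimal.
+ */
+#if 0
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <linux/bcache-ioctl.h>
+
+static int query_uuid_example(const char *ctldev, uuid_le *uuid)
+{
+	struct bch_ioctl_query_uuid arg;
+	int fd = open(ctldev, O_RDONLY);
+
+	if (fd < 0)
+		return -1;
+
+	if (ioctl(fd, BCH_IOCTL_QUERY_UUID, &arg)) {
+		close(fd);
+		return -1;
+	}
+
+	*uuid = arg.uuid;
+	close(fd);
+	return 0;
+}
+#endif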
diff --git a/libbcache/chardev.h b/libbcache/chardev.h
new file mode 100644
index 0000000..657bf2b
--- /dev/null
+++ b/libbcache/chardev.h
@@ -0,0 +1,7 @@
+#ifndef _BCACHE_CHARDEV_H
+#define _BCACHE_CHARDEV_H
+
+long bch_cache_set_ioctl(struct cache_set *, unsigned, void __user *);
+extern const struct file_operations bch_chardev_fops;
+
+#endif /* _BCACHE_CHARDEV_H */
diff --git a/util.c b/libbcache/checksum.c
index 1bab7da..beae0b2 100644
--- a/util.c
+++ b/libbcache/checksum.c
@@ -1,284 +1,11 @@
-#include <alloca.h>
-#include <assert.h>
-#include <ctype.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <linux/fs.h>
-#include <math.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <uuid/uuid.h>
+#include "bcache.h"
+#include "checksum.h"
-#include "ccan/crc/crc.h"
-
-#include "util.h"
-
-/* Integer stuff: */
-
-unsigned ilog2(u64 n)
-{
- unsigned ret = 0;
-
- assert(n > 0);
-
- while (n > 1) {
- ret++;
- n >>= 1;
- }
-
- return ret;
-}
-
-u64 rounddown_pow_of_two(u64 n)
-{
- return 1ULL << ilog2(n);
-}
-
-u64 roundup_pow_of_two(u64 n)
-{
- return 1ULL << (ilog2(n - 1) + 1);
-}
-
-char *skip_spaces(const char *str)
-{
- while (isspace(*str))
- ++str;
- return (char *)str;
-}
-
-char *strim(char *s)
-{
- size_t size;
- char *end;
-
- s = skip_spaces(s);
- size = strlen(s);
- if (!size)
- return s;
-
- end = s + size - 1;
- while (end >= s && isspace(*end))
- end--;
- *(end + 1) = '\0';
-
- return s;
-}
-
-struct units_buf pr_units(u64 v, enum units units)
-{
- struct units_buf ret;
-
- switch (units) {
- case BYTES:
- snprintf(ret.b, sizeof(ret.b), "%llu", v << 9);
- break;
- case SECTORS:
- snprintf(ret.b, sizeof(ret.b), "%llu", v);
- break;
- case HUMAN_READABLE:
- v <<= 9;
-
- if (v >= 1024) {
- int exp = log(v) / log(1024);
- snprintf(ret.b, sizeof(ret.b), "%.1f%c",
- v / pow(1024, exp),
- "KMGTPE"[exp-1]);
- } else {
- snprintf(ret.b, sizeof(ret.b), "%llu", v);
- }
-
- break;
- }
-
- return ret;
-}
-
-/* Argument parsing stuff: */
-
-long strtoul_or_die(const char *p, size_t max, const char *msg)
-{
- errno = 0;
- long v = strtol(p, NULL, 10);
- if (errno || v < 0 || v >= max)
- die("Invalid %s %zi", msg, v);
-
- return v;
-}
-
-u64 hatoi(const char *s)
-{
- char *e;
- long long i = strtoll(s, &e, 10);
- switch (*e) {
- case 't':
- case 'T':
- i *= 1024;
- case 'g':
- case 'G':
- i *= 1024;
- case 'm':
- case 'M':
- i *= 1024;
- case 'k':
- case 'K':
- i *= 1024;
- }
- return i;
-}
-
-unsigned hatoi_validate(const char *s, const char *msg)
-{
- u64 v = hatoi(s);
-
- if (v & (v - 1))
- die("%s must be a power of two", msg);
-
- v /= 512;
-
- if (v > USHRT_MAX)
- die("%s too large\n", msg);
-
- if (!v)
- die("%s too small\n", msg);
-
- return v;
-}
-
-unsigned nr_args(char * const *args)
-{
- unsigned i;
-
- for (i = 0; args[i]; i++)
- ;
-
- return i;
-}
-
-/* File parsing (i.e. sysfs) */
-
-char *read_file_str(int dirfd, const char *path)
-{
- int fd = openat(dirfd, path, O_RDONLY);
-
- if (fd < 0)
- die("Unable to open %s\n", path);
-
- struct stat statbuf;
- if (fstat(fd, &statbuf) < 0)
- die("fstat error\n");
-
- char *buf = malloc(statbuf.st_size + 1);
-
- int len = read(fd, buf, statbuf.st_size);
- if (len < 0)
- die("read error while reading from file %s\n", path);
-
- buf[len] = '\0';
- if (len && buf[len - 1] == '\n')
- buf[len - 1] = '\0';
-
- close(fd);
-
- return buf;
-}
-
-u64 read_file_u64(int dirfd, const char *path)
-{
- char *buf = read_file_str(dirfd, path);
- u64 ret = strtoll(buf, NULL, 10);
-
- free(buf);
- return ret;
-}
-
-/* String list options: */
-
-ssize_t read_string_list(const char *buf, const char * const list[])
-{
- size_t i;
- char *s, *d = strdup(buf);
- if (!d)
- return -ENOMEM;
-
- s = strim(d);
-
- for (i = 0; list[i]; i++)
- if (!strcmp(list[i], s))
- break;
-
- free(d);
-
- if (!list[i])
- return -EINVAL;
-
- return i;
-}
-
-ssize_t read_string_list_or_die(const char *opt, const char * const list[],
- const char *msg)
-{
- ssize_t v = read_string_list(opt, list);
- if (v < 0)
- die("Bad %s %s", msg, opt);
-
- return v;
-}
-
-void print_string_list(const char * const list[], size_t selected)
-{
- size_t i;
-
- for (i = 0; list[i]; i++) {
- if (i)
- putchar(' ');
- printf(i == selected ? "[%s] ": "%s", list[i]);
- }
-}
-
-/* Returns size of file or block device, in units of 512 byte sectors: */
-u64 get_size(const char *path, int fd)
-{
- struct stat statbuf;
- if (fstat(fd, &statbuf))
- die("Error statting %s: %s", path, strerror(errno));
-
- if (!S_ISBLK(statbuf.st_mode))
- return statbuf.st_size >> 9;
-
- u64 ret;
- if (ioctl(fd, BLKGETSIZE64, &ret))
- die("Error getting block device size on %s: %s\n",
- path, strerror(errno));
-
- return ret >> 9;
-}
-
-/* Returns blocksize in units of 512 byte sectors: */
-unsigned get_blocksize(const char *path, int fd)
-{
- struct stat statbuf;
- if (fstat(fd, &statbuf))
- die("Error statting %s: %s", path, strerror(errno));
-
- if (!S_ISBLK(statbuf.st_mode))
- return statbuf.st_blksize >> 9;
-
- unsigned ret;
- if (ioctl(fd, BLKPBSZGET, &ret))
- die("Error getting blocksize on %s: %s\n",
- path, strerror(errno));
-
- return ret >> 9;
-}
-
-/* Checksums: */
+#include <linux/crc32c.h>
+#include <crypto/chacha20.h>
+#include <crypto/hash.h>
+#include <crypto/poly1305.h>
/*
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
@@ -387,10 +114,10 @@ static const u64 crc_table[256] = {
0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
- 0x9AFCE626CE85B507ULL
+ 0x9AFCE626CE85B507ULL,
};
-static u64 bch_crc64_update(u64 crc, const void *_data, size_t len)
+u64 bch_crc64_update(u64 crc, const void *_data, size_t len)
{
const unsigned char *data = _data;
@@ -402,7 +129,7 @@ static u64 bch_crc64_update(u64 crc, const void *_data, size_t len)
return crc;
}
-static u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
+u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
{
switch (type) {
case BCH_CSUM_NONE:
@@ -412,7 +139,7 @@ static u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t
case BCH_CSUM_CRC64:
return bch_crc64_update(crc, data, len);
default:
- die("Unknown checksum type %u", type);
+ BUG();
}
}
@@ -425,92 +152,23 @@ u64 bch_checksum(unsigned type, const void *data, size_t len)
return crc ^ 0xffffffffffffffffULL;
}
-/* Global control device: */
-int bcachectl_open(void)
-{
- int fd = open("/dev/bcache-ctl", O_RDWR);
- if (fd < 0)
- die("Can't open bcache device: %s", strerror(errno));
-
- return fd;
-}
-
-/* Filesystem handles (ioctl, sysfs dir): */
-
-#define SYSFS_BASE "/sys/fs/bcache/"
-
-struct bcache_handle bcache_fs_open(const char *path)
+u32 bch_checksum_bio(struct bio *bio, unsigned type)
{
- struct bcache_handle ret;
- uuid_t tmp;
-
- if (!uuid_parse(path, tmp)) {
- /* It's a UUID, look it up in sysfs: */
-
- char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(path) + 1);
- sprintf(sysfs, "%s%s", SYSFS_BASE, path);
-
- ret.sysfs = opendir(sysfs);
- if (!ret.sysfs)
- die("Unable to open %s\n", path);
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ u32 csum = U32_MAX;
- char *minor = read_file_str(dirfd(ret.sysfs), "minor");
- char *ctl = alloca(20 + strlen(minor));
-
- sprintf(ctl, "/dev/bcache%s-ctl", minor);
- free(minor);
-
- ret.fd = open(ctl, O_RDWR);
- if (ret.fd < 0)
- die("Error opening control device: %s\n",
- strerror(errno));
- } else {
- /* It's a path: */
-
- ret.fd = open(path, O_RDONLY);
- if (ret.fd < 0)
- die("Error opening %s: %s\n",
- path, strerror(errno));
-
- struct bch_ioctl_query_uuid uuid;
- if (ioctl(ret.fd, BCH_IOCTL_QUERY_UUID, &uuid))
- die("ioctl error (not a bcache fs?): %s\n",
- strerror(errno));
-
- char uuid_str[40];
- uuid_unparse(uuid.uuid.b, uuid_str);
+ if (type == BCH_CSUM_NONE)
+ return 0;
- char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(uuid_str) + 1);
- sprintf(sysfs, "%s%s", SYSFS_BASE, uuid_str);
+ bio_for_each_segment(bv, bio, iter) {
+ void *p = kmap_atomic(bv.bv_page);
- ret.sysfs = opendir(sysfs);
- if (!ret.sysfs)
- die("Unable to open sysfs dir %s: %s\n",
- sysfs, strerror(errno));
+ csum = bch_checksum_update(type, csum,
+ p + bv.bv_offset,
+ bv.bv_len);
+ kunmap_atomic(p);
}
- return ret;
-}
-
-bool ask_proceed(void)
-{
- const char *short_yes = "yY";
- char *buf = NULL;
- size_t buflen = 0;
- bool ret;
-
- fputs("Proceed anyway? (y,n) ", stdout);
-
- if (getline(&buf, &buflen, stdin) < 0)
- die("error reading from standard input");
-
- ret = strchr(short_yes, buf[0]);
- free(buf);
- return ret;
-}
-
-void memzero_explicit(void *buf, size_t len)
-{
- void *(* volatile memset_s)(void *s, int c, size_t n) = memset;
- memset_s(buf, 0, len);
+	return csum ^ U32_MAX;
}
diff --git a/libbcache/checksum.h b/libbcache/checksum.h
new file mode 100644
index 0000000..196b7e8
--- /dev/null
+++ b/libbcache/checksum.h
@@ -0,0 +1,24 @@
+#ifndef _BCACHE_CHECKSUM_H
+#define _BCACHE_CHECKSUM_H
+
+#include "btree_types.h"
+
+u64 bch_crc64_update(u64, const void *, size_t);
+
+u64 bch_checksum_update(unsigned, u64, const void *, size_t);
+u64 bch_checksum(unsigned, const void *, size_t);
+u32 bch_checksum_bio(struct bio *, unsigned);
+
+/*
+ * This is used for various on disk data structures - cache_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first 8 bytes of these structs
+ */
+#define __csum_set(i, u64s, type) \
+({ \
+ const void *start = ((const void *) (i)) + sizeof(u64); \
+ const void *end = __bkey_idx(i, u64s); \
+ \
+ bch_checksum(type, start, end - start); \
+})
+
+#endif /* _BCACHE_CHECKSUM_H */
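+
+/*
+ * Illustrative sketch, not from this patch: verifying one of the structures
+ * named above with __csum_set(). The jset field names used here (csum first,
+ * then u64s worth of payload) are assumptions about the on-disk layout.
+ */
+#if 0
+static bool jset_csum_matches_sketch(struct jset *j, unsigned csum_type)
+{
+	/* the leading u64 is the checksum itself; everything after is covered */
+	return le64_to_cpu(j->csum) ==
+		__csum_set(j, le32_to_cpu(j->u64s), csum_type);
+}
+#endif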
diff --git a/libbcache/clock.c b/libbcache/clock.c
new file mode 100644
index 0000000..8218769
--- /dev/null
+++ b/libbcache/clock.c
@@ -0,0 +1,161 @@
+#include "bcache.h"
+#include "clock.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+
+static inline bool io_timer_cmp(struct io_timer *l, struct io_timer *r)
+{
+ return time_after(l->expire, r->expire);
+}
+
+void bch_io_timer_add(struct io_clock *clock, struct io_timer *timer)
+{
+ size_t i;
+
+ spin_lock(&clock->timer_lock);
+ for (i = 0; i < clock->timers.used; i++)
+ if (clock->timers.data[i] == timer)
+ goto out;
+
+ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp));
+out:
+ spin_unlock(&clock->timer_lock);
+}
+
+void bch_io_timer_del(struct io_clock *clock, struct io_timer *timer)
+{
+ size_t i;
+
+ spin_lock(&clock->timer_lock);
+
+ for (i = 0; i < clock->timers.used; i++)
+ if (clock->timers.data[i] == timer) {
+ heap_del(&clock->timers, i, io_timer_cmp);
+ break;
+ }
+
+ spin_unlock(&clock->timer_lock);
+}
+
+struct io_clock_wait {
+ struct io_timer timer;
+ struct task_struct *task;
+ int expired;
+};
+
+static void io_clock_wait_fn(struct io_timer *timer)
+{
+ struct io_clock_wait *wait = container_of(timer,
+ struct io_clock_wait, timer);
+
+ wait->expired = 1;
+ wake_up_process(wait->task);
+}
+
+void bch_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
+{
+ struct io_clock_wait wait;
+
+ /* XXX: calculate sleep time rigorously */
+ wait.timer.expire = until;
+ wait.timer.fn = io_clock_wait_fn;
+ wait.task = current;
+ wait.expired = 0;
+ bch_io_timer_add(clock, &wait.timer);
+
+ schedule();
+
+ bch_io_timer_del(clock, &wait.timer);
+}
+
+/*
+ * _only_ to be used from a kthread
+ */
+void bch_kthread_io_clock_wait(struct io_clock *clock,
+ unsigned long until)
+{
+ struct io_clock_wait wait;
+
+ /* XXX: calculate sleep time rigorously */
+ wait.timer.expire = until;
+ wait.timer.fn = io_clock_wait_fn;
+ wait.task = current;
+ wait.expired = 0;
+ bch_io_timer_add(clock, &wait.timer);
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ break;
+
+ if (wait.expired)
+ break;
+
+ schedule();
+ try_to_freeze();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ bch_io_timer_del(clock, &wait.timer);
+}
+
+static struct io_timer *get_expired_timer(struct io_clock *clock,
+ unsigned long now)
+{
+ struct io_timer *ret = NULL;
+
+ spin_lock(&clock->timer_lock);
+
+ if (clock->timers.used &&
+ time_after_eq(now, clock->timers.data[0]->expire))
+ heap_pop(&clock->timers, ret, io_timer_cmp);
+
+ spin_unlock(&clock->timer_lock);
+
+ return ret;
+}
+
+void bch_increment_clock(struct cache_set *c, unsigned sectors, int rw)
+{
+ struct io_clock *clock = &c->io_clock[rw];
+ struct io_timer *timer;
+ unsigned long now;
+
+	/* Buffer up IO_CLOCK_PCPU_SECTORS (64k) worth of IO in the percpu counter */
+ preempt_disable();
+
+ if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
+ IO_CLOCK_PCPU_SECTORS)) {
+ preempt_enable();
+ return;
+ }
+
+ sectors = this_cpu_xchg(*clock->pcpu_buf, 0);
+ preempt_enable();
+ now = atomic_long_add_return(sectors, &clock->now);
+
+ while ((timer = get_expired_timer(clock, now)))
+ timer->fn(timer);
+}
+
+void bch_io_clock_exit(struct io_clock *clock)
+{
+ free_heap(&clock->timers);
+ free_percpu(clock->pcpu_buf);
+}
+
+int bch_io_clock_init(struct io_clock *clock)
+{
+ atomic_long_set(&clock->now, 0);
+ spin_lock_init(&clock->timer_lock);
+
+ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
+ if (!clock->pcpu_buf)
+ return -ENOMEM;
+
+ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
+ return -ENOMEM;
+
+ return 0;
+}
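+
+/*
+ * Illustrative sketch, not from this patch: arming an io_timer that fires
+ * once roughly another megabyte (2048 sectors) of reads has been accounted.
+ * The callback runs from whatever context calls bch_increment_clock().
+ */
+#if 0
+static void example_timer_fn(struct io_timer *t)
+{
+	pr_info("io clock timer fired\n");
+}
+
+static void arm_io_timer_sketch(struct cache_set *c)
+{
+	static struct io_timer t;
+
+	t.fn	 = example_timer_fn;
+	t.expire = atomic_long_read(&c->io_clock[READ].now) + 2048;
+
+	bch_io_timer_add(&c->io_clock[READ], &t);
+}
+#endif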
diff --git a/libbcache/clock.h b/libbcache/clock.h
new file mode 100644
index 0000000..f59f071
--- /dev/null
+++ b/libbcache/clock.h
@@ -0,0 +1,23 @@
+#ifndef _BCACHE_CLOCK_H
+#define _BCACHE_CLOCK_H
+
+void bch_io_timer_add(struct io_clock *, struct io_timer *);
+void bch_io_timer_del(struct io_clock *, struct io_timer *);
+void bch_kthread_io_clock_wait(struct io_clock *, unsigned long);
+void bch_increment_clock(struct cache_set *, unsigned, int);
+
+void bch_io_clock_schedule_timeout(struct io_clock *, unsigned long);
+
+#define bch_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_timeout(wq, condition, timeout); \
+ __ret; \
+})
+
+void bch_io_clock_exit(struct io_clock *);
+int bch_io_clock_init(struct io_clock *);
+
+#endif /* _BCACHE_CLOCK_H */
diff --git a/libbcache/clock_types.h b/libbcache/clock_types.h
new file mode 100644
index 0000000..4a02f46
--- /dev/null
+++ b/libbcache/clock_types.h
@@ -0,0 +1,34 @@
+#ifndef _BCACHE_CLOCK_TYPES_H
+#define _BCACHE_CLOCK_TYPES_H
+
+#include "util.h"
+
+#define NR_IO_TIMERS 8
+
+/*
+ * Clocks/timers in units of sectors of IO:
+ *
+ * Note - they use percpu batching, so they're only approximate.
+ */
+
+struct io_timer;
+typedef void (*io_timer_fn)(struct io_timer *);
+
+struct io_timer {
+ io_timer_fn fn;
+ unsigned long expire;
+};
+
+/* Amount to buffer up on a percpu counter */
+#define IO_CLOCK_PCPU_SECTORS 128
+
+struct io_clock {
+ atomic_long_t now;
+ u16 __percpu *pcpu_buf;
+
+ spinlock_t timer_lock;
+ DECLARE_HEAP(struct io_timer *, timers);
+};
+
+#endif /* _BCACHE_CLOCK_TYPES_H */
+
diff --git a/libbcache/closure.c b/libbcache/closure.c
new file mode 100644
index 0000000..f6f4dd9
--- /dev/null
+++ b/libbcache/closure.c
@@ -0,0 +1,210 @@
+/*
+ * Asynchronous refcounty things
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include "closure.h"
+
+static inline void closure_put_after_sub(struct closure *cl, int flags)
+{
+ int r = flags & CLOSURE_REMAINING_MASK;
+
+ BUG_ON(flags & CLOSURE_GUARD_MASK);
+ BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
+
+ if (!r) {
+ if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
+ atomic_set(&cl->remaining,
+ CLOSURE_REMAINING_INITIALIZER);
+ closure_queue(cl);
+ } else {
+ struct closure *parent = cl->parent;
+ closure_fn *destructor = cl->fn;
+
+ closure_debug_destroy(cl);
+
+ if (destructor)
+ destructor(cl);
+
+ if (parent)
+ closure_put(parent);
+ }
+ }
+}
+
+/* For clearing flags with the same atomic op as a put */
+void closure_sub(struct closure *cl, int v)
+{
+ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+}
+EXPORT_SYMBOL(closure_sub);
+
+/**
+ * closure_put - decrement a closure's refcount
+ */
+void closure_put(struct closure *cl)
+{
+ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+}
+EXPORT_SYMBOL(closure_put);
+
+/**
+ * __closure_wake_up - wake up all closures on a wait list, without memory barrier
+ */
+void __closure_wake_up(struct closure_waitlist *wait_list)
+{
+ struct llist_node *list, *next;
+ struct closure *cl;
+
+ /*
+ * Grab entire list, reverse order to preserve FIFO ordering, and wake
+ * everything up
+ */
+ for (list = llist_reverse_order(llist_del_all(&wait_list->list));
+ list;
+ list = next) {
+ next = llist_next(list);
+ cl = container_of(list, struct closure, list);
+
+ closure_set_waiting(cl, 0);
+ closure_sub(cl, CLOSURE_WAITING + 1);
+ }
+}
+EXPORT_SYMBOL(__closure_wake_up);
+
+/**
+ * closure_wait - add a closure to a waitlist
+ *
+ * @waitlist will own a ref on @cl, which will be released when
+ * closure_wake_up() is called on @waitlist.
+ *
+ */
+bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
+{
+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+ return false;
+
+ closure_set_waiting(cl, _RET_IP_);
+ atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
+ llist_add(&cl->list, &waitlist->list);
+
+ return true;
+}
+EXPORT_SYMBOL(closure_wait);
+
+struct closure_syncer {
+ struct task_struct *task;
+ int done;
+};
+
+static void closure_sync_fn(struct closure *cl)
+{
+ cl->s->done = 1;
+ wake_up_process(cl->s->task);
+}
+
+void __sched __closure_sync(struct closure *cl)
+{
+ struct closure_syncer s = { .task = current };
+
+ cl->s = &s;
+ continue_at_noreturn(cl, closure_sync_fn, NULL);
+
+ while (1) {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ smp_mb();
+ if (s.done)
+ break;
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL(__closure_sync);
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+
+static LIST_HEAD(closure_list);
+static DEFINE_SPINLOCK(closure_list_lock);
+
+void closure_debug_create(struct closure *cl)
+{
+ unsigned long flags;
+
+ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
+ cl->magic = CLOSURE_MAGIC_ALIVE;
+
+ spin_lock_irqsave(&closure_list_lock, flags);
+ list_add(&cl->all, &closure_list);
+ spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL(closure_debug_create);
+
+void closure_debug_destroy(struct closure *cl)
+{
+ unsigned long flags;
+
+ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
+ cl->magic = CLOSURE_MAGIC_DEAD;
+
+ spin_lock_irqsave(&closure_list_lock, flags);
+ list_del(&cl->all);
+ spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL(closure_debug_destroy);
+
+static struct dentry *debug;
+
+static int debug_seq_show(struct seq_file *f, void *data)
+{
+ struct closure *cl;
+
+ spin_lock_irq(&closure_list_lock);
+
+ list_for_each_entry(cl, &closure_list, all) {
+ int r = atomic_read(&cl->remaining);
+
+ seq_printf(f, "%p: %pF -> %pf p %p r %i ",
+ cl, (void *) cl->ip, cl->fn, cl->parent,
+ r & CLOSURE_REMAINING_MASK);
+
+ seq_printf(f, "%s%s\n",
+ test_bit(WORK_STRUCT_PENDING_BIT,
+ work_data_bits(&cl->work)) ? "Q" : "",
+ r & CLOSURE_RUNNING ? "R" : "");
+
+ if (r & CLOSURE_WAITING)
+ seq_printf(f, " W %pF\n",
+ (void *) cl->waiting_on);
+
+ seq_puts(f, "\n");
+ }
+
+ spin_unlock_irq(&closure_list_lock);
+ return 0;
+}
+
+static int debug_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, debug_seq_show, NULL);
+}
+
+static const struct file_operations debug_ops = {
+ .owner = THIS_MODULE,
+ .open = debug_seq_open,
+ .read = seq_read,
+ .release = single_release
+};
+
+void __init closure_debug_init(void)
+{
+ debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
+}
+
+#endif
diff --git a/libbcache/closure.h b/libbcache/closure.h
new file mode 100644
index 0000000..b55254b
--- /dev/null
+++ b/libbcache/closure.h
@@ -0,0 +1,387 @@
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ * continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, requires a 'return' immediately following the
+ * location where this macro is referenced, to return to the calling function.
+ * There's good reason for this.
+ *
+ * To safely use closures asynchronously, they must always have a refcount while
+ * they are running owned by the thread that is running them. Otherwise, suppose
+ * you submit some bios and wish to have a function run when they all complete:
+ *
+ * foo_endio(struct bio *bio)
+ * {
+ * closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_endio = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_endio = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If the closure's refcount started at 0, complete_some_read() could run before the
+ * second bio was submitted - which is almost always not what you want! More
+ * importantly, it wouldn't be possible to say whether the original thread or
+ * complete_some_read()'s thread owned the closure - and whatever state it was
+ * associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e., in a bio->bi_endio function). If you have a refcount
+ * on a closure because you called closure_init() or you were run out of a
+ * closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ */
+
+struct closure;
+struct closure_syncer;
+typedef void (closure_fn) (struct closure *);
+
+struct closure_waitlist {
+ struct llist_head list;
+};
+
+enum closure_state {
+ /*
+ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+ * the thread that owns the closure, and cleared by the thread that's
+ * waking up the closure.
+ *
+ * The rest are for debugging and don't affect behaviour:
+ *
+ * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+ * closure_init() and when closure_put() runs the next function), and
+ * must be cleared before remaining hits 0. Primarily to help guard
+ * against incorrect usage and accidentally transferring references.
+ * continue_at() and closure_return() clear it for you, if you're doing
+ * something unusual you can use closure_set_dead() which also helps
+ * annotate where references are being transferred.
+ */
+
+ CLOSURE_BITS_START = (1U << 27),
+ CLOSURE_DESTRUCTOR = (1U << 27),
+ CLOSURE_WAITING = (1U << 29),
+ CLOSURE_RUNNING = (1U << 31),
+};
+
+#define CLOSURE_GUARD_MASK \
+ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
+
+#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
+#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
+
+struct closure {
+ union {
+ struct {
+ struct workqueue_struct *wq;
+ struct closure_syncer *s;
+ struct llist_node list;
+ closure_fn *fn;
+ };
+ struct work_struct work;
+ };
+
+ struct closure *parent;
+
+ atomic_t remaining;
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#define CLOSURE_MAGIC_DEAD 0xc054dead
+#define CLOSURE_MAGIC_ALIVE 0xc054a11e
+
+ unsigned magic;
+ struct list_head all;
+ unsigned long ip;
+ unsigned long waiting_on;
+#endif
+};
+
+void closure_sub(struct closure *cl, int v);
+void closure_put(struct closure *cl);
+void __closure_wake_up(struct closure_waitlist *list);
+bool closure_wait(struct closure_waitlist *list, struct closure *cl);
+void __closure_sync(struct closure *cl);
+
+/**
+ * closure_sync - sleep until a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+static inline void closure_sync(struct closure *cl)
+{
+ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+ __closure_sync(cl);
+}
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+
+void closure_debug_init(void);
+void closure_debug_create(struct closure *cl);
+void closure_debug_destroy(struct closure *cl);
+
+#else
+
+static inline void closure_debug_init(void) {}
+static inline void closure_debug_create(struct closure *cl) {}
+static inline void closure_debug_destroy(struct closure *cl) {}
+
+#endif
+
+static inline void closure_set_ip(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ cl->ip = _THIS_IP_;
+#endif
+}
+
+static inline void closure_set_ret_ip(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ cl->ip = _RET_IP_;
+#endif
+}
+
+static inline void closure_set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ cl->waiting_on = f;
+#endif
+}
+
+static inline void closure_set_stopped(struct closure *cl)
+{
+ atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
+
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+ struct workqueue_struct *wq)
+{
+ closure_set_ip(cl);
+ cl->fn = fn;
+ cl->wq = wq;
+	/* order fn/wq stores before the atomic op in continue_at()/closure_put() */
+ smp_mb__before_atomic();
+}
+
+static inline void closure_queue(struct closure *cl)
+{
+ struct workqueue_struct *wq = cl->wq;
+
+ if (wq) {
+ INIT_WORK(&cl->work, cl->work.func);
+ queue_work(wq, &cl->work);
+ } else
+ cl->fn(cl);
+}
+
+/**
+ * closure_get - increment a closure's refcount
+ */
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ BUG_ON((atomic_inc_return(&cl->remaining) &
+ CLOSURE_REMAINING_MASK) <= 1);
+#else
+ atomic_inc(&cl->remaining);
+#endif
+}
+
+/**
+ * closure_init - Initialize a closure, setting the refcount to 1
+ * @cl: closure to initialize
+ * @parent: parent of the new closure. cl will take a refcount on it for its
+ * lifetime; may be NULL.
+ */
+static inline void closure_init(struct closure *cl, struct closure *parent)
+{
+ cl->fn = NULL;
+ cl->parent = parent;
+ if (parent)
+ closure_get(parent);
+
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+
+ closure_debug_create(cl);
+ closure_set_ip(cl);
+}
+
+static inline void closure_init_stack(struct closure *cl)
+{
+ memset(cl, 0, sizeof(struct closure));
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+}
+
+/**
+ * closure_wake_up - wake up all closures on a wait list.
+ */
+static inline void closure_wake_up(struct closure_waitlist *list)
+{
+ smp_mb();
+ __closure_wake_up(list);
+}
+
+#define continue_at_noreturn(_cl, _fn, _wq) \
+do { \
+ set_closure_fn(_cl, _fn, _wq); \
+ closure_sub(_cl, CLOSURE_RUNNING + 1); \
+} while (0)
+
+/**
+ * continue_at - jump to another function with barrier
+ *
+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have
+ * been dropped with closure_put()), it will resume execution at @fn running out
+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
+ *
+ * NOTE: This macro expands to a return in the calling function!
+ *
+ * This is because after calling continue_at() you no longer have a ref on @cl,
+ * and whatever @cl owns may be freed out from under you - a running closure fn
+ * has a ref on its own closure which continue_at() drops.
+ */
+#define continue_at(_cl, _fn, _wq) \
+do { \
+ continue_at_noreturn(_cl, _fn, _wq); \
+ return; \
+} while (0)
+
+/**
+ * closure_return - finish execution of a closure
+ *
+ * This is used to indicate that @cl is finished: when all outstanding refs on
+ * @cl have been dropped @cl's ref on its parent closure (as passed to
+ * closure_init()) will be dropped, if one was specified - thus this can be
+ * thought of as returning to the parent closure.
+ */
+#define closure_return(_cl) continue_at((_cl), NULL, NULL)
+
+/**
+ * continue_at_nobarrier - jump to another function without barrier
+ *
+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if
+ * @wq is NULL).
+ *
+ * NOTE: like continue_at(), this macro expands to a return in the caller!
+ *
+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
+ * thus it's not safe to touch anything protected by @cl after a
+ * continue_at_nobarrier().
+ */
+#define continue_at_nobarrier(_cl, _fn, _wq) \
+do { \
+	closure_set_ip(_cl);						\
+ if (_wq) { \
+ INIT_WORK(&(_cl)->work, (void *) _fn); \
+ queue_work((_wq), &(_cl)->work); \
+ } else { \
+ (_fn)(_cl); \
+ } \
+ return; \
+} while (0)
+
+#define closure_return_with_destructor_noreturn(_cl, _destructor) \
+do { \
+ set_closure_fn(_cl, _destructor, NULL); \
+ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
+} while (0)
+
+/**
+ * closure_return_with_destructor - finish execution of a closure, with destructor
+ *
+ * Works like closure_return(), except @destructor will be called when all
+ * outstanding refs on @cl have been dropped; @destructor may be used to safely
+ * free the memory occupied by @cl, and it is called with the ref on the parent
+ * closure still held - so @destructor could safely return an item to a
+ * freelist protected by @cl's parent.
+ */
+#define closure_return_with_destructor(_cl, _destructor) \
+do { \
+ closure_return_with_destructor_noreturn(_cl, _destructor); \
+ return; \
+} while (0)
+
+/**
+ * closure_call - execute @fn out of a new, uninitialized closure
+ *
+ * Typically used when running out of one closure, and we want to run @fn
+ * asynchronously out of a new closure - @parent will then wait for @cl to
+ * finish.
+ */
+static inline void closure_call(struct closure *cl, closure_fn fn,
+ struct workqueue_struct *wq,
+ struct closure *parent)
+{
+ closure_init(cl, parent);
+ continue_at_nobarrier(cl, fn, wq);
+}
+
+#endif /* _LINUX_CLOSURE_H */
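+
+/*
+ * Illustrative sketch, not from this patch: waiting synchronously on a
+ * stack closure for a batch of bios. my_endio is hypothetical - it would
+ * just do closure_put(bio->bi_private).
+ */
+#if 0
+static void wait_for_my_bios_sketch(struct bio **bios, unsigned nr)
+{
+	struct closure cl;
+	unsigned i;
+
+	closure_init_stack(&cl);
+
+	for (i = 0; i < nr; i++) {
+		closure_get(&cl);
+		bios[i]->bi_private = &cl;
+		bios[i]->bi_end_io = my_endio;
+		/* ... submit bios[i] ... */
+	}
+
+	closure_sync(&cl);	/* sleeps until every my_endio has run closure_put() */
+}
+#endif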
diff --git a/libbcache/compress.c b/libbcache/compress.c
new file mode 100644
index 0000000..f7bfd57
--- /dev/null
+++ b/libbcache/compress.c
@@ -0,0 +1,458 @@
+#include "bcache.h"
+#include "compress.h"
+#include "io.h"
+
+#include <linux/lz4.h>
+#include <linux/zlib.h>
+
+enum bounced {
+ BOUNCED_MAPPED,
+ BOUNCED_KMALLOCED,
+ BOUNCED_VMALLOCED,
+ BOUNCED_MEMPOOLED,
+};
+
+static void *__bounce_alloc(struct cache_set *c, unsigned size,
+ unsigned *bounced, int direction)
+{
+ void *data;
+
+ *bounced = BOUNCED_KMALLOCED;
+ data = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
+ if (data)
+ return data;
+
+ *bounced = BOUNCED_MEMPOOLED;
+ data = mempool_alloc(&c->compression_bounce[direction], GFP_NOWAIT);
+ if (data)
+ return page_address(data);
+
+ *bounced = BOUNCED_VMALLOCED;
+ data = vmalloc(size);
+ if (data)
+ return data;
+
+ *bounced = BOUNCED_MEMPOOLED;
+ data = mempool_alloc(&c->compression_bounce[direction], GFP_NOIO);
+ return page_address(data);
+}
+
+static void *__bio_map_or_bounce(struct cache_set *c,
+ struct bio *bio, struct bvec_iter start,
+ unsigned *bounced, int direction)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ unsigned nr_pages = 0;
+ struct page *stack_pages[16];
+ struct page **pages = NULL;
+ bool first = true;
+ unsigned prev_end = PAGE_SIZE;
+ void *data;
+
+ BUG_ON(bvec_iter_sectors(start) > BCH_COMPRESSED_EXTENT_MAX);
+
+ *bounced = BOUNCED_MAPPED;
+
+ __bio_for_each_segment(bv, bio, iter, start) {
+ if ((!first && bv.bv_offset) ||
+ prev_end != PAGE_SIZE)
+ goto bounce;
+
+ prev_end = bv.bv_offset + bv.bv_len;
+ nr_pages++;
+ }
+
+ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
+
+ pages = nr_pages > ARRAY_SIZE(stack_pages)
+ ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO)
+ : stack_pages;
+ if (!pages)
+ goto bounce;
+
+ nr_pages = 0;
+ __bio_for_each_segment(bv, bio, iter, start)
+ pages[nr_pages++] = bv.bv_page;
+
+ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (pages != stack_pages)
+ kfree(pages);
+
+ return data + bio_iter_offset(bio, start);
+bounce:
+ data = __bounce_alloc(c, start.bi_size, bounced, direction);
+
+ if (direction == READ)
+ memcpy_from_bio(data, bio, start);
+
+ return data;
+}
+
+static void *bio_map_or_bounce(struct cache_set *c, struct bio *bio,
+ unsigned *bounced, int direction)
+{
+ return __bio_map_or_bounce(c, bio, bio->bi_iter, bounced, direction);
+}
+
+static void bio_unmap_or_unbounce(struct cache_set *c, void *data,
+ unsigned bounced, int direction)
+{
+ if (!data)
+ return;
+
+ switch (bounced) {
+ case BOUNCED_MAPPED:
+ vunmap((void *) ((unsigned long) data & PAGE_MASK));
+ return;
+ case BOUNCED_KMALLOCED:
+ kfree(data);
+ return;
+ case BOUNCED_VMALLOCED:
+ vfree(data);
+ return;
+ case BOUNCED_MEMPOOLED:
+ mempool_free(virt_to_page(data), &c->compression_bounce[direction]);
+ return;
+ }
+}
+
+static int __bio_uncompress(struct cache_set *c, struct bio *src,
+ void *dst_data, struct bch_extent_crc64 crc)
+{
+ void *src_data = NULL;
+ unsigned src_bounced;
+ size_t src_len = src->bi_iter.bi_size;
+ size_t dst_len = crc.uncompressed_size << 9;
+ int ret;
+
+ src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
+
+ switch (crc.compression_type) {
+ case BCH_COMPRESSION_LZ4:
+ ret = lz4_decompress(src_data, &src_len,
+ dst_data, dst_len);
+ if (ret) {
+ ret = -EIO;
+ goto err;
+ }
+ break;
+ case BCH_COMPRESSION_GZIP: {
+ void *workspace;
+ z_stream strm;
+
+ workspace = kmalloc(zlib_inflate_workspacesize(),
+ GFP_NOIO|__GFP_NOWARN);
+ if (!workspace) {
+ mutex_lock(&c->zlib_workspace_lock);
+ workspace = c->zlib_workspace;
+ }
+
+ strm.workspace = workspace;
+ strm.next_in = src_data;
+ strm.avail_in = src_len;
+ strm.next_out = dst_data;
+ strm.avail_out = dst_len;
+ zlib_inflateInit2(&strm, -MAX_WBITS);
+
+ ret = zlib_inflate(&strm, Z_FINISH);
+
+ if (workspace == c->zlib_workspace)
+ mutex_unlock(&c->zlib_workspace_lock);
+ else
+ kfree(workspace);
+
+ if (ret != Z_STREAM_END) {
+ ret = -EIO;
+ goto err;
+ }
+ break;
+ }
+ default:
+ BUG();
+ }
+ ret = 0;
+err:
+ bio_unmap_or_unbounce(c, src_data, src_bounced, READ);
+ return ret;
+}
+
+int bch_bio_uncompress_inplace(struct cache_set *c, struct bio *bio,
+ unsigned live_data_sectors,
+ struct bch_extent_crc64 crc)
+{
+ void *dst_data = NULL;
+ size_t dst_len = crc.uncompressed_size << 9;
+ int ret = -ENOMEM;
+
+ BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs);
+
+ /* XXX mempoolify */
+ dst_data = kmalloc(dst_len, GFP_NOIO|__GFP_NOWARN);
+ if (!dst_data) {
+ dst_data = vmalloc(dst_len);
+ if (!dst_data)
+ goto err;
+ }
+
+ ret = __bio_uncompress(c, bio, dst_data, crc);
+ if (ret)
+ goto err;
+
+ while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) {
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
+
+ bv->bv_page = alloc_page(GFP_NOIO);
+ if (!bv->bv_page)
+ goto use_mempool;
+
+ bv->bv_len = PAGE_SIZE;
+ bv->bv_offset = 0;
+ bio->bi_vcnt++;
+ }
+
+ bio->bi_iter.bi_size = live_data_sectors << 9;
+copy_data:
+ memcpy_to_bio(bio, bio->bi_iter, dst_data + (crc.offset << 9));
+err:
+ kvfree(dst_data);
+ return ret;
+use_mempool:
+ /*
+ * We already allocated from mempool, we can't allocate from it again
+ * without freeing the pages we already allocated or else we could
+ * deadlock:
+ */
+
+ bch_bio_free_pages_pool(c, bio);
+ bch_bio_alloc_pages_pool(c, bio, live_data_sectors << 9);
+ goto copy_data;
+}
+
+int bch_bio_uncompress(struct cache_set *c, struct bio *src,
+ struct bio *dst, struct bvec_iter dst_iter,
+ struct bch_extent_crc64 crc)
+{
+ void *dst_data = NULL;
+ unsigned dst_bounced;
+ size_t dst_len = crc.uncompressed_size << 9;
+ int ret = -ENOMEM;
+
+ dst_data = dst_len == dst_iter.bi_size
+ ? __bio_map_or_bounce(c, dst, dst_iter, &dst_bounced, WRITE)
+ : __bounce_alloc(c, dst_len, &dst_bounced, WRITE);
+
+ ret = __bio_uncompress(c, src, dst_data, crc);
+ if (ret)
+ goto err;
+
+ if (dst_bounced)
+ memcpy_to_bio(dst, dst_iter, dst_data + (crc.offset << 9));
+err:
+ bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE);
+ return ret;
+}
+
+static int __bio_compress(struct cache_set *c,
+ struct bio *dst, size_t *dst_len,
+ struct bio *src, size_t *src_len,
+ unsigned compression_type)
+{
+ void *src_data = NULL, *dst_data = NULL;
+ unsigned src_bounced, dst_bounced, pad;
+ int ret = -1;
+
+ dst_data = bio_map_or_bounce(c, dst, &dst_bounced, WRITE);
+ src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
+
+ switch (compression_type) {
+ case BCH_COMPRESSION_LZ4: {
+ void *workspace;
+
+ *dst_len = dst->bi_iter.bi_size;
+ *src_len = src->bi_iter.bi_size;
+
+ workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
+retry_compress:
+ ret = lz4_compress(src_data, *src_len,
+ dst_data, dst_len,
+ workspace);
+ /*
+ * On error, the compressed data was bigger than dst_len, and
+ * -ret is the amount of data we were able to compress - round
+ * down to nearest block and try again:
+ */
+ if (ret && round_down(-ret, block_bytes(c)) > *dst_len) {
+ BUG_ON(ret > 0);
+
+ /* not supposed to happen */
+ if (WARN_ON(-ret >= *src_len))
+ goto err;
+
+ *src_len = round_down(-ret, block_bytes(c));
+ if (!*src_len)
+ goto err;
+
+ goto retry_compress;
+ }
+ mempool_free(workspace, &c->lz4_workspace_pool);
+
+ if (ret)
+ goto err;
+ break;
+ }
+ case BCH_COMPRESSION_GZIP: {
+ void *workspace;
+ z_stream strm;
+
+ workspace = kmalloc(zlib_deflate_workspacesize(MAX_WBITS,
+ DEF_MEM_LEVEL),
+ GFP_NOIO|__GFP_NOWARN);
+ if (!workspace) {
+ mutex_lock(&c->zlib_workspace_lock);
+ workspace = c->zlib_workspace;
+ }
+
+ strm.workspace = workspace;
+ strm.next_in = src_data;
+ strm.avail_in = min(src->bi_iter.bi_size,
+ dst->bi_iter.bi_size);
+ strm.next_out = dst_data;
+ strm.avail_out = dst->bi_iter.bi_size;
+ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+ Z_DEFAULT_STRATEGY);
+
+ ret = zlib_deflate(&strm, Z_FINISH);
+ if (ret != Z_STREAM_END) {
+ ret = -EIO;
+ goto zlib_err;
+ }
+
+ ret = zlib_deflateEnd(&strm);
+ if (ret != Z_OK) {
+ ret = -EIO;
+ goto zlib_err;
+ }
+
+ ret = 0;
+zlib_err:
+ if (workspace == c->zlib_workspace)
+ mutex_unlock(&c->zlib_workspace_lock);
+ else
+ kfree(workspace);
+
+ if (ret)
+ goto err;
+
+ *dst_len = strm.total_out;
+ *src_len = strm.total_in;
+ break;
+ }
+ default:
+ BUG();
+ }
+
+ BUG_ON(!*dst_len);
+
+ /* Didn't get smaller: */
+ if (round_up(*dst_len, block_bytes(c)) >= *src_len) {
+ ret = -1;
+ goto err;
+ }
+
+ pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
+
+ memset(dst_data + *dst_len, 0, pad);
+ *dst_len += pad;
+
+ if (dst_bounced)
+ memcpy_to_bio(dst, dst->bi_iter, dst_data);
+err:
+ bio_unmap_or_unbounce(c, src_data, src_bounced, READ);
+ bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE);
+ return ret;
+}
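+
+/*
+ * Worked example of the lz4 retry above (illustrative numbers): with 4096
+ * byte blocks, *src_len = 65536 and *dst_len = 16384, a failed lz4_compress()
+ * returning ret = -30000 means 30000 bytes were consumed before the output
+ * filled up; round_down(30000, 4096) = 28672 is still larger than *dst_len,
+ * so we retry with *src_len = 28672 and compress only that block-aligned
+ * prefix of the input.
+ */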
+
+void bch_bio_compress(struct cache_set *c,
+ struct bio *dst, size_t *dst_len,
+ struct bio *src, size_t *src_len,
+ unsigned *compression_type)
+{
+ unsigned orig_dst = dst->bi_iter.bi_size;
+ unsigned orig_src = src->bi_iter.bi_size;
+
+ /* Don't consume more than BCH_COMPRESSED_EXTENT_MAX from @src: */
+ src->bi_iter.bi_size =
+ min(src->bi_iter.bi_size, BCH_COMPRESSED_EXTENT_MAX << 9);
+
+ /* Don't generate a bigger output than input: */
+ dst->bi_iter.bi_size =
+ min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+
+ /* If it's only one block, don't bother trying to compress: */
+ if (*compression_type != BCH_COMPRESSION_NONE &&
+ bio_sectors(src) > c->sb.block_size &&
+ !__bio_compress(c, dst, dst_len, src, src_len, *compression_type))
+ goto out;
+
+ /* If compressing failed (didn't get smaller), just copy: */
+ *compression_type = BCH_COMPRESSION_NONE;
+ *dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+ bio_copy_data(dst, src);
+out:
+ dst->bi_iter.bi_size = orig_dst;
+ src->bi_iter.bi_size = orig_src;
+}
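+
+/*
+ * Illustrative caller sketch (hypothetical, not taken from a real call site):
+ *
+ *	unsigned compression_type = BCH_COMPRESSION_LZ4;
+ *	size_t src_len, dst_len;
+ *
+ *	bch_bio_compress(c, dst, &dst_len, src, &src_len, &compression_type);
+ *
+ * On return, src_len is the number of input bytes consumed and dst_len the
+ * number of output bytes (padded to the block size when compressed); if
+ * compression didn't shrink the data, compression_type has been reset to
+ * BCH_COMPRESSION_NONE and dst just holds a copy of src.
+ */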
+
+void bch_compress_free(struct cache_set *c)
+{
+ vfree(c->zlib_workspace);
+ mempool_exit(&c->lz4_workspace_pool);
+ mempool_exit(&c->compression_bounce[WRITE]);
+ mempool_exit(&c->compression_bounce[READ]);
+ free_percpu(c->bio_decompress_worker);
+}
+
+#define COMPRESSION_WORKSPACE_SIZE \
+ max_t(size_t, zlib_inflate_workspacesize(), \
+ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
+
+int bch_compress_init(struct cache_set *c)
+{
+ int ret, cpu;
+
+ c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker);
+ if (!c->bio_decompress_worker)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct bio_decompress_worker *d =
+ per_cpu_ptr(c->bio_decompress_worker, cpu);
+
+ d->c = c;
+ INIT_WORK(&d->work, bch_bio_decompress_work);
+ init_llist_head(&d->bio_list);
+ }
+
+ ret = mempool_init_page_pool(&c->compression_bounce[READ], 1,
+ get_order(BCH_COMPRESSED_EXTENT_MAX << 9));
+ if (ret)
+ return ret;
+
+ ret = mempool_init_page_pool(&c->compression_bounce[WRITE], 1,
+ get_order(BCH_COMPRESSED_EXTENT_MAX << 9));
+ if (ret)
+ return ret;
+
+ ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool, 1,
+ LZ4_MEM_COMPRESS);
+ if (ret)
+ return ret;
+
+ c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
+ if (!c->zlib_workspace)
+ return -ENOMEM;
+
+ return 0;
+}
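+
+/*
+ * Error handling sketch: bch_compress_init() does not unwind its own partial
+ * allocations, so a caller would pair it with bch_compress_free() on the
+ * failure path too (this assumes the cache_set was zero initialized, so that
+ * freeing never-initialized pools is a no-op):
+ *
+ *	ret = bch_compress_init(c);
+ *	if (ret) {
+ *		bch_compress_free(c);
+ *		return ret;
+ *	}
+ */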
diff --git a/libbcache/compress.h b/libbcache/compress.h
new file mode 100644
index 0000000..02578ef
--- /dev/null
+++ b/libbcache/compress.h
@@ -0,0 +1,14 @@
+#ifndef _BCACHE_COMPRESS_H
+#define _BCACHE_COMPRESS_H
+
+int bch_bio_uncompress_inplace(struct cache_set *, struct bio *,
+ unsigned, struct bch_extent_crc64);
+int bch_bio_uncompress(struct cache_set *, struct bio *, struct bio *,
+ struct bvec_iter, struct bch_extent_crc64);
+void bch_bio_compress(struct cache_set *, struct bio *, size_t *,
+ struct bio *, size_t *, unsigned *);
+
+void bch_compress_free(struct cache_set *);
+int bch_compress_init(struct cache_set *);
+
+#endif /* _BCACHE_COMPRESS_H */
diff --git a/libbcache/debug.c b/libbcache/debug.c
new file mode 100644
index 0000000..1be2e60
--- /dev/null
+++ b/libbcache/debug.c
@@ -0,0 +1,513 @@
+/*
+ * Assorted bcache debug code
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "fs-gc.h"
+#include "inode.h"
+#include "io.h"
+#include "super.h"
+
+#include <linux/console.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bch_debug;
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+static void btree_verify_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+
+ closure_put(cl);
+}
+
+void __bch_btree_verify(struct cache_set *c, struct btree *b)
+{
+ struct btree *v = c->verify_data;
+ struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
+ struct bset *sorted, *inmemory;
+ struct extent_pick_ptr pick;
+ struct bio *bio;
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ btree_node_io_lock(b);
+ mutex_lock(&c->verify_lock);
+
+ n_ondisk = c->verify_ondisk;
+ n_sorted = c->verify_data->data;
+ n_inmemory = b->data;
+
+ bkey_copy(&v->key, &b->key);
+ v->written = 0;
+ v->level = b->level;
+ v->btree_id = b->btree_id;
+ bch_btree_keys_init(v, &c->expensive_debug_checks);
+
+ pick = bch_btree_pick_ptr(c, b);
+ if (IS_ERR_OR_NULL(pick.ca))
+ return;
+
+ bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
+ bio->bi_bdev = pick.ca->disk_sb.bdev;
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bio->bi_iter.bi_size = btree_bytes(c);
+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
+ bio->bi_private = &cl;
+ bio->bi_end_io = btree_verify_endio;
+ bch_bio_map(bio, n_sorted);
+
+ closure_get(&cl);
+ bch_generic_make_request(bio, c);
+ closure_sync(&cl);
+
+ bio_put(bio);
+
+ memcpy(n_ondisk, n_sorted, btree_bytes(c));
+
+ bch_btree_node_read_done(c, v, pick.ca, &pick.ptr);
+ n_sorted = c->verify_data->data;
+
+ percpu_ref_put(&pick.ca->ref);
+
+ sorted = &n_sorted->keys;
+ inmemory = &n_inmemory->keys;
+
+ if (inmemory->u64s != sorted->u64s ||
+ memcmp(inmemory->start,
+ sorted->start,
+ (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
+ unsigned offset = 0, sectors;
+ struct bset *i;
+ unsigned j;
+
+ console_lock();
+
+ printk(KERN_ERR "*** in memory:\n");
+ bch_dump_bset(b, inmemory, 0);
+
+ printk(KERN_ERR "*** read back in:\n");
+ bch_dump_bset(v, sorted, 0);
+
+ while (offset < b->written) {
+			if (!offset) {
+ i = &n_ondisk->keys;
+ sectors = __set_blocks(n_ondisk,
+ le16_to_cpu(n_ondisk->keys.u64s),
+ block_bytes(c)) <<
+ c->block_bits;
+ } else {
+ struct btree_node_entry *bne =
+ (void *) n_ondisk + (offset << 9);
+ i = &bne->keys;
+
+ sectors = __set_blocks(bne,
+ le16_to_cpu(bne->keys.u64s),
+ block_bytes(c)) <<
+ c->block_bits;
+ }
+
+ printk(KERN_ERR "*** on disk block %u:\n", offset);
+ bch_dump_bset(b, i, offset);
+
+ offset += sectors;
+ }
+
+ printk(KERN_ERR "*** block %u/%u not written\n",
+ offset >> c->block_bits, btree_blocks(c));
+
+ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
+ if (inmemory->_data[j] != sorted->_data[j])
+ break;
+
+ printk(KERN_ERR "b->written %u\n", b->written);
+
+ console_unlock();
+ panic("verify failed at %u\n", j);
+ }
+
+ mutex_unlock(&c->verify_lock);
+ btree_node_io_unlock(b);
+}
+
+void bch_data_verify(struct cached_dev *dc, struct bio *bio)
+{
+ char name[BDEVNAME_SIZE];
+ struct bio *check;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ check = bio_clone(bio, GFP_NOIO);
+ if (!check)
+ return;
+ bio_set_op_attrs(check, REQ_OP_READ, READ_SYNC);
+
+ if (bio_alloc_pages(check, GFP_NOIO))
+ goto out_put;
+
+ submit_bio_wait(check);
+
+ bio_for_each_segment(bv, bio, iter) {
+ void *p1 = kmap_atomic(bv.bv_page);
+ void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page);
+
+ if (memcmp(p1 + bv.bv_offset,
+ p2 + bv.bv_offset,
+ bv.bv_len))
+ panic("verify failed at dev %s sector %llu\n",
+ bdevname(dc->disk_sb.bdev, name),
+ (uint64_t) bio->bi_iter.bi_sector);
+
+ kunmap_atomic(p1);
+ }
+
+ bio_free_pages(check);
+out_put:
+ bio_put(check);
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+
+/* XXX: cache set refcounting */
+
+struct dump_iter {
+ struct bpos from;
+ struct cache_set *c;
+ enum btree_id id;
+
+ char buf[PAGE_SIZE];
+ size_t bytes; /* what's currently in buf */
+
+ char __user *ubuf; /* destination user buffer */
+ size_t size; /* size of requested read */
+ ssize_t ret; /* bytes read so far */
+};
+
+static int flush_buf(struct dump_iter *i)
+{
+ if (i->bytes) {
+ size_t bytes = min(i->bytes, i->size);
+ int err = copy_to_user(i->ubuf, i->buf, bytes);
+
+ if (err)
+ return err;
+
+ i->ret += bytes;
+ i->ubuf += bytes;
+ i->size -= bytes;
+ i->bytes -= bytes;
+ memmove(i->buf, i->buf + bytes, i->bytes);
+ }
+
+ return 0;
+}
+
+static int bch_dump_open(struct inode *inode, struct file *file)
+{
+ struct btree_debug *bd = inode->i_private;
+ struct dump_iter *i;
+
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+ if (!i)
+ return -ENOMEM;
+
+ file->private_data = i;
+ i->from = POS_MIN;
+ i->c = container_of(bd, struct cache_set, btree_debug[bd->id]);
+ i->id = bd->id;
+
+ return 0;
+}
+
+static int bch_dump_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t bch_read_btree(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size)
+ return i->ret;
+
+ bch_btree_iter_init(&iter, i->c, i->id, i->from);
+
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(err = btree_iter_err(k))) {
+ bch_bkey_val_to_text(i->c, bkey_type(0, i->id),
+ i->buf, sizeof(i->buf), k);
+ i->bytes = strlen(i->buf);
+ BUG_ON(i->bytes >= PAGE_SIZE);
+ i->buf[i->bytes] = '\n';
+ i->bytes++;
+
+ bch_btree_iter_advance_pos(&iter);
+ i->from = iter.pos;
+
+ err = flush_buf(i);
+ if (err)
+ break;
+
+ if (!i->size)
+ break;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations btree_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch_dump_open,
+ .release = bch_dump_release,
+ .read = bch_read_btree,
+};
+
+static int print_btree_node(struct dump_iter *i, struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ struct bset_stats stats;
+
+ memset(&stats, 0, sizeof(stats));
+
+ bch_btree_keys_stats(b, &stats);
+
+ i->bytes = scnprintf(i->buf, sizeof(i->buf),
+ "l %u %llu:%llu - %llu:%llu:\n"
+ " format: u64s %u fields %u %u %u %u %u\n"
+ " unpack fn len: %u\n"
+ " bytes used %zu/%zu (%zu%% full)\n"
+ " sib u64s: %u, %u (merge threshold %zu)\n"
+ " nr packed keys %u\n"
+ " nr unpacked keys %u\n"
+ " floats %zu\n"
+ " failed unpacked %zu\n"
+ " failed prev %zu\n"
+ " failed overflow %zu\n",
+ b->level,
+ b->data->min_key.inode,
+ b->data->min_key.offset,
+ b->data->max_key.inode,
+ b->data->max_key.offset,
+ f->key_u64s,
+ f->bits_per_field[0],
+ f->bits_per_field[1],
+ f->bits_per_field[2],
+ f->bits_per_field[3],
+ f->bits_per_field[4],
+ b->unpack_fn_len,
+ b->nr.live_u64s * sizeof(u64),
+ btree_bytes(i->c) - sizeof(struct btree_node),
+ b->nr.live_u64s * 100 / btree_max_u64s(i->c),
+ b->sib_u64s[0],
+ b->sib_u64s[1],
+ BTREE_FOREGROUND_MERGE_THRESHOLD(i->c),
+ b->nr.packed_keys,
+ b->nr.unpacked_keys,
+ stats.floats,
+ stats.failed_unpacked,
+ stats.failed_prev,
+ stats.failed_overflow);
+
+ return flush_buf(i);
+}
+
+static ssize_t bch_read_btree_formats(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_iter iter;
+ struct btree *b;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size || !bkey_cmp(POS_MAX, i->from))
+ return i->ret;
+
+ for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) {
+ err = print_btree_node(i, b);
+ if (err)
+ break;
+
+ /*
+ * can't easily correctly restart a btree node traversal across
+ * all nodes, meh
+ */
+ i->from = bkey_cmp(POS_MAX, b->key.k.p)
+ ? bkey_successor(b->key.k.p)
+ : b->key.k.p;
+
+ if (!i->size)
+ break;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations btree_format_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch_dump_open,
+ .release = bch_dump_release,
+ .read = bch_read_btree_formats,
+};
+
+static ssize_t bch_read_bfloat_failed(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct btree *prev_node = NULL;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size)
+ return i->ret;
+
+ bch_btree_iter_init(&iter, i->c, i->id, i->from);
+
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(err = btree_iter_err(k))) {
+ struct btree *b = iter.nodes[0];
+ struct btree_node_iter *node_iter = &iter.node_iters[0];
+ struct bkey_packed *_k = bch_btree_node_iter_peek(node_iter, b);
+
+ if (iter.nodes[0] != prev_node) {
+ err = print_btree_node(i, iter.nodes[0]);
+ if (err)
+ break;
+ }
+ prev_node = iter.nodes[0];
+
+ i->bytes = bch_bkey_print_bfloat(b, _k, i->buf, sizeof(i->buf));
+
+ err = flush_buf(i);
+ if (err)
+ break;
+
+ bch_btree_iter_advance_pos(&iter);
+ i->from = iter.pos;
+
+ err = flush_buf(i);
+ if (err)
+ break;
+
+ if (!i->size)
+ break;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations bfloat_failed_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch_dump_open,
+ .release = bch_dump_release,
+ .read = bch_read_bfloat_failed,
+};
+
+void bch_debug_exit_cache_set(struct cache_set *c)
+{
+ if (!IS_ERR_OR_NULL(c->debug))
+ debugfs_remove_recursive(c->debug);
+}
+
+void bch_debug_init_cache_set(struct cache_set *c)
+{
+ struct btree_debug *bd;
+ char name[100];
+
+ if (IS_ERR_OR_NULL(bch_debug))
+ return;
+
+ snprintf(name, sizeof(name), "%pU", c->disk_sb.user_uuid.b);
+ c->debug = debugfs_create_dir(name, bch_debug);
+ if (IS_ERR_OR_NULL(c->debug))
+ return;
+
+ for (bd = c->btree_debug;
+ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
+ bd++) {
+ bd->id = bd - c->btree_debug;
+ bd->btree = debugfs_create_file(bch_btree_id_names[bd->id],
+ 0400, c->debug, bd,
+ &btree_debug_ops);
+
+ snprintf(name, sizeof(name), "%s-formats",
+ bch_btree_id_names[bd->id]);
+
+ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
+ &btree_format_debug_ops);
+
+ snprintf(name, sizeof(name), "%s-bfloat-failed",
+ bch_btree_id_names[bd->id]);
+
+ bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
+ &bfloat_failed_debug_ops);
+ }
+}
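+
+/*
+ * Resulting layout (sketch; assumes debugfs is mounted at the usual
+ * /sys/kernel/debug):
+ *
+ *	/sys/kernel/debug/bcache/<set uuid>/<btree>			keys, as text
+ *	/sys/kernel/debug/bcache/<set uuid>/<btree>-formats		per-node stats
+ *	/sys/kernel/debug/bcache/<set uuid>/<btree>-bfloat-failed	failed bfloat keys
+ *
+ * where <btree> ranges over bch_btree_id_names[]; all three files are
+ * read-only (0400) and stream their output through the dump_iter machinery
+ * above.
+ */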
+
+#endif
+
+void bch_debug_exit(void)
+{
+ if (!IS_ERR_OR_NULL(bch_debug))
+ debugfs_remove_recursive(bch_debug);
+}
+
+int __init bch_debug_init(void)
+{
+	bch_debug = debugfs_create_dir("bcache", NULL);
+	return 0;
+}
diff --git a/libbcache/debug.h b/libbcache/debug.h
new file mode 100644
index 0000000..a3635e6
--- /dev/null
+++ b/libbcache/debug.h
@@ -0,0 +1,65 @@
+#ifndef _BCACHE_DEBUG_H
+#define _BCACHE_DEBUG_H
+
+#include "bcache.h"
+
+struct bio;
+struct btree;
+struct cached_dev;
+struct cache_set;
+
+#define BCH_DEBUG_PARAM(name, description) extern bool bch_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#define BCH_DEBUG_PARAM(name, description) \
+ static inline bool name(struct cache_set *c) \
+ { return bch_##name || c->name; }
+BCH_DEBUG_PARAMS_ALWAYS()
+#undef BCH_DEBUG_PARAM
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+#define BCH_DEBUG_PARAM(name, description) \
+ static inline bool name(struct cache_set *c) \
+ { return bch_##name || c->name; }
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+
+void __bch_btree_verify(struct cache_set *, struct btree *);
+void bch_data_verify(struct cached_dev *, struct bio *);
+
+#define bypass_torture_test(d) ((d)->bypass_torture_test)
+
+#else /* DEBUG */
+
+#define BCH_DEBUG_PARAM(name, description) \
+ static inline bool name(struct cache_set *c) { return false; }
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+
+static inline void __bch_btree_verify(struct cache_set *c, struct btree *b) {}
+static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
+
+#define bypass_torture_test(d) 0
+
+#endif
+
+static inline void bch_btree_verify(struct cache_set *c, struct btree *b)
+{
+ if (verify_btree_ondisk(c))
+ __bch_btree_verify(c, b);
+}
+
+#ifdef CONFIG_DEBUG_FS
+void bch_debug_exit_cache_set(struct cache_set *);
+void bch_debug_init_cache_set(struct cache_set *);
+#else
+static inline void bch_debug_exit_cache_set(struct cache_set *c) {}
+static inline void bch_debug_init_cache_set(struct cache_set *c) {}
+#endif
+
+void bch_debug_exit(void);
+int bch_debug_init(void);
+
+#endif
diff --git a/libbcache/dirent.c b/libbcache/dirent.c
new file mode 100644
index 0000000..920ad2f
--- /dev/null
+++ b/libbcache/dirent.c
@@ -0,0 +1,449 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "dirent.h"
+#include "fs.h"
+#include "keylist.h"
+#include "str_hash.h"
+
+#include <linux/dcache.h>
+
+static unsigned dirent_name_bytes(struct bkey_s_c_dirent d)
+{
+ unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent);
+
+ while (len && !d.v->d_name[len - 1])
+ --len;
+
+ return len;
+}
+
+static u64 bch_dirent_hash(const struct bch_hash_info *info,
+ const struct qstr *name)
+{
+ switch (info->type) {
+ case BCH_STR_HASH_SHA1: {
+ SHASH_DESC_ON_STACK(desc, bch_sha1);
+ u8 digest[SHA1_DIGEST_SIZE];
+ u64 ret;
+ desc->tfm = bch_sha1;
+ desc->flags = 0;
+ crypto_shash_init(desc);
+
+ crypto_shash_update(desc, (void *) &info->seed, sizeof(info->seed));
+
+ crypto_shash_update(desc, (void *) name->name, name->len);
+ crypto_shash_final(desc, digest);
+ memcpy(&ret, &digest, sizeof(ret));
+ return max_t(u64, ret >> 1, 2);
+ }
+ default: {
+ struct bch_str_hash_ctx ctx;
+
+ bch_str_hash_init(&ctx, info->type);
+ bch_str_hash_update(&ctx, info->type, &info->seed, sizeof(info->seed));
+
+ bch_str_hash_update(&ctx, info->type, name->name, name->len);
+
+ /* [0,2) reserved for dots */
+ return max_t(u64, bch_str_hash_end(&ctx, info->type), 2);
+ }
+ }
+}
+
+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
+{
+ return bch_dirent_hash(info, key);
+}
+
+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ struct qstr name = QSTR_INIT(d.v->d_name, dirent_name_bytes(d));
+
+ return bch_dirent_hash(info, &name);
+}
+
+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+ int len = dirent_name_bytes(l);
+ const struct qstr *r = _r;
+
+ return len - r->len ?: memcmp(l.v->d_name, r->name, len);
+}
+
+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
+ int l_len = dirent_name_bytes(l);
+ int r_len = dirent_name_bytes(r);
+
+ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
+}
+
+static const struct bch_hash_desc dirent_hash_desc = {
+ .btree_id = BTREE_ID_DIRENTS,
+ .key_type = BCH_DIRENT,
+ .whiteout_type = BCH_DIRENT_WHITEOUT,
+ .hash_key = dirent_hash_key,
+ .hash_bkey = dirent_hash_bkey,
+ .cmp_key = dirent_cmp_key,
+ .cmp_bkey = dirent_cmp_bkey,
+};
+
+static const char *bch_dirent_invalid(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_DIRENT:
+ return bkey_val_bytes(k.k) < sizeof(struct bch_dirent)
+ ? "value too small"
+ : NULL;
+
+ case BCH_DIRENT_WHITEOUT:
+ return bkey_val_bytes(k.k) != 0
+ ? "value size should be zero"
+ : NULL;
+
+ default:
+ return "invalid type";
+ }
+}
+
+static void bch_dirent_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d;
+
+ switch (k.k->type) {
+ case BCH_DIRENT:
+ d = bkey_s_c_to_dirent(k);
+
+ if (size) {
+ unsigned n = min_t(unsigned, size,
+ dirent_name_bytes(d));
+ memcpy(buf, d.v->d_name, n);
+ buf[size - 1] = '\0';
+ buf += n;
+ size -= n;
+ }
+
+ scnprintf(buf, size, " -> %llu", d.v->d_inum);
+ break;
+ case BCH_DIRENT_WHITEOUT:
+ scnprintf(buf, size, "whiteout");
+ break;
+ }
+}
+
+const struct bkey_ops bch_bkey_dirent_ops = {
+ .key_invalid = bch_dirent_invalid,
+ .val_to_text = bch_dirent_to_text,
+};
+
+static struct bkey_i_dirent *dirent_create_key(u8 type,
+ const struct qstr *name, u64 dst)
+{
+ struct bkey_i_dirent *dirent;
+ unsigned u64s = BKEY_U64s +
+ DIV_ROUND_UP(sizeof(struct bch_dirent) + name->len,
+ sizeof(u64));
+
+ dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS);
+ if (!dirent)
+ return NULL;
+
+ bkey_dirent_init(&dirent->k_i);
+ dirent->k.u64s = u64s;
+ dirent->v.d_inum = cpu_to_le64(dst);
+ dirent->v.d_type = type;
+
+ memcpy(dirent->v.d_name, name->name, name->len);
+ memset(dirent->v.d_name + name->len, 0,
+ bkey_val_bytes(&dirent->k) -
+ (sizeof(struct bch_dirent) + name->len));
+
+ EBUG_ON(dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
+
+ return dirent;
+}
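+
+/*
+ * Worked example: for the 5 byte name "hello", sizeof(struct bch_dirent) + 5
+ * is rounded up to a whole number of u64s, the unused tail of d_name is zero
+ * filled, and dirent_name_bytes() later recovers the original length by
+ * trimming those trailing NULs (file names never contain NUL bytes, so no
+ * explicit length field is needed).
+ */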
+
+int bch_dirent_create(struct cache_set *c, struct inode *dir, u8 type,
+ const struct qstr *name, u64 dst_inum)
+{
+ struct bch_inode_info *ei = to_bch_ei(dir);
+ struct bkey_i_dirent *dirent;
+ int ret;
+
+ dirent = dirent_create_key(type, name, dst_inum);
+ if (!dirent)
+ return -ENOMEM;
+
+ ret = bch_hash_set(dirent_hash_desc, &ei->str_hash, c,
+ ei->vfs_inode.i_ino, &ei->journal_seq,
+ &dirent->k_i, BCH_HASH_SET_MUST_CREATE);
+ kfree(dirent);
+
+ return ret;
+}
+
+static void dirent_copy_target(struct bkey_i_dirent *dst,
+ struct bkey_s_c_dirent src)
+{
+ dst->v.d_inum = src.v->d_inum;
+ dst->v.d_type = src.v->d_type;
+}
+
+static struct bpos bch_dirent_pos(struct bch_inode_info *ei,
+ const struct qstr *name)
+{
+ return POS(ei->vfs_inode.i_ino, bch_dirent_hash(&ei->str_hash, name));
+}
+
+int bch_dirent_rename(struct cache_set *c,
+ struct inode *src_dir, const struct qstr *src_name,
+ struct inode *dst_dir, const struct qstr *dst_name,
+ u64 *journal_seq, enum bch_rename_mode mode)
+{
+ struct bch_inode_info *src_ei = to_bch_ei(src_dir);
+ struct bch_inode_info *dst_ei = to_bch_ei(dst_dir);
+ struct btree_iter src_iter, dst_iter, whiteout_iter;
+ struct bkey_s_c old_src, old_dst;
+ struct bkey delete;
+ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
+ struct bpos src_pos = bch_dirent_pos(src_ei, src_name);
+ struct bpos dst_pos = bch_dirent_pos(dst_ei, dst_name);
+ bool need_whiteout;
+ int ret = -ENOMEM;
+
+ bch_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos);
+ bch_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos);
+ bch_btree_iter_link(&src_iter, &dst_iter);
+
+ bch_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos);
+ bch_btree_iter_link(&src_iter, &whiteout_iter);
+
+ if (mode == BCH_RENAME_EXCHANGE) {
+ new_src = dirent_create_key(0, src_name, 0);
+ if (!new_src)
+ goto err;
+ } else {
+ new_src = (void *) &delete;
+ }
+
+ new_dst = dirent_create_key(0, dst_name, 0);
+ if (!new_dst)
+ goto err;
+retry:
+ /*
+ * Note that on -EINTR/dropped locks we're not restarting the lookup
+ * from the original hashed position (like we do when creating dirents,
+	 * in bch_hash_set) - we never move existing dirents to a different slot:
+ */
+ old_src = bch_hash_lookup_at(dirent_hash_desc,
+ &src_ei->str_hash,
+ &src_iter, src_name);
+ if ((ret = btree_iter_err(old_src)))
+ goto err;
+
+ ret = bch_hash_needs_whiteout(dirent_hash_desc,
+ &src_ei->str_hash,
+ &whiteout_iter, &src_iter);
+ if (ret < 0)
+ goto err;
+ need_whiteout = ret;
+
+ /*
+ * Note that in BCH_RENAME mode, we're _not_ checking if
+ * the target already exists - we're relying on the VFS
+ * to do that check for us for correctness:
+ */
+ old_dst = mode == BCH_RENAME
+ ? bch_hash_hole_at(dirent_hash_desc, &dst_iter)
+ : bch_hash_lookup_at(dirent_hash_desc,
+ &dst_ei->str_hash,
+ &dst_iter, dst_name);
+ if ((ret = btree_iter_err(old_dst)))
+ goto err;
+
+ switch (mode) {
+ case BCH_RENAME:
+ bkey_init(&new_src->k);
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+
+ if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
+ bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
+ /*
+ * If we couldn't insert new_dst at its hashed
+ * position (dst_pos) due to a hash collision,
+ * and we're going to be deleting in
+ * between the hashed position and first empty
+ * slot we found - just overwrite the pos we
+ * were going to delete:
+ *
+			 * Note: this is a correctness issue; in this
+ * situation bch_hash_needs_whiteout() could
+ * return false when the whiteout would have
+ * been needed if we inserted at the pos
+ * __dirent_find_hole() found
+ */
+ new_dst->k.p = src_iter.pos;
+ ret = bch_btree_insert_at(c, NULL, NULL,
+ journal_seq,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&src_iter,
+ &new_dst->k_i));
+ goto err;
+ }
+
+ if (need_whiteout)
+ new_src->k.type = BCH_DIRENT_WHITEOUT;
+ break;
+ case BCH_RENAME_OVERWRITE:
+ bkey_init(&new_src->k);
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+
+ if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
+ bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
+ /*
+ * Same case described above -
+ * bch_hash_needs_whiteout could spuriously
+ * return false, but we have to insert at
+ * dst_iter.pos because we're overwriting
+ * another dirent:
+ */
+ new_src->k.type = BCH_DIRENT_WHITEOUT;
+ } else if (need_whiteout)
+ new_src->k.type = BCH_DIRENT_WHITEOUT;
+ break;
+ case BCH_RENAME_EXCHANGE:
+ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+ break;
+ }
+
+ new_src->k.p = src_iter.pos;
+ new_dst->k.p = dst_iter.pos;
+ ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i),
+ BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i));
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ bch_btree_iter_unlock(&whiteout_iter);
+ bch_btree_iter_unlock(&dst_iter);
+ bch_btree_iter_unlock(&src_iter);
+
+ if (new_src != (void *) &delete)
+ kfree(new_src);
+ kfree(new_dst);
+ return ret;
+}
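+
+/*
+ * The three modes correspond to the VFS rename cases: BCH_RENAME when the
+ * target does not exist, BCH_RENAME_OVERWRITE when it does, and
+ * BCH_RENAME_EXCHANGE for RENAME_EXCHANGE - which is why only the EXCHANGE
+ * case allocates a real key for new_src; the other two only delete or
+ * whiteout the source entry.
+ */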
+
+int bch_dirent_delete(struct cache_set *c, struct inode *dir,
+ const struct qstr *name)
+{
+ struct bch_inode_info *ei = to_bch_ei(dir);
+
+ return bch_hash_delete(dirent_hash_desc, &ei->str_hash,
+ c, ei->vfs_inode.i_ino,
+ &ei->journal_seq, name);
+}
+
+u64 bch_dirent_lookup(struct cache_set *c, struct inode *dir,
+ const struct qstr *name)
+{
+ struct bch_inode_info *ei = to_bch_ei(dir);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 inum;
+
+ k = bch_hash_lookup(dirent_hash_desc, &ei->str_hash, c,
+ ei->vfs_inode.i_ino, &iter, name);
+ if (IS_ERR(k.k)) {
+ bch_btree_iter_unlock(&iter);
+ return 0;
+ }
+
+ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
+ bch_btree_iter_unlock(&iter);
+
+ return inum;
+}
+
+int bch_empty_dir(struct cache_set *c, u64 dir_inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), k) {
+ if (k.k->p.inode > dir_inum)
+ break;
+
+ if (k.k->type == BCH_DIRENT) {
+ ret = -ENOTEMPTY;
+ break;
+ }
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+int bch_readdir(struct cache_set *c, struct file *file,
+ struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent dirent;
+ unsigned len;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ pr_debug("listing for %lu from %llu", inode->i_ino, ctx->pos);
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
+ POS(inode->i_ino, ctx->pos), k) {
+ if (k.k->type != BCH_DIRENT)
+ continue;
+
+ dirent = bkey_s_c_to_dirent(k);
+
+ pr_debug("saw %llu:%llu (%s) -> %llu",
+ k.k->p.inode, k.k->p.offset,
+ dirent.v->d_name, dirent.v->d_inum);
+
+ if (bkey_cmp(k.k->p, POS(inode->i_ino, ctx->pos)) < 0)
+ continue;
+
+ if (k.k->p.inode > inode->i_ino)
+ break;
+
+ len = dirent_name_bytes(dirent);
+
+ pr_debug("emitting %s", dirent.v->d_name);
+
+ /*
+		 * XXX: dir_emit() can fault and block while we're holding
+ * locks
+ */
+ if (!dir_emit(ctx, dirent.v->d_name, len,
+ le64_to_cpu(dirent.v->d_inum),
+ dirent.v->d_type))
+ break;
+
+ ctx->pos = k.k->p.offset + 1;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return 0;
+}
diff --git a/libbcache/dirent.h b/libbcache/dirent.h
new file mode 100644
index 0000000..e18089b
--- /dev/null
+++ b/libbcache/dirent.h
@@ -0,0 +1,32 @@
+#ifndef _BCACHE_DIRENT_H
+#define _BCACHE_DIRENT_H
+
+extern const struct bkey_ops bch_bkey_dirent_ops;
+
+struct qstr;
+struct file;
+struct dir_context;
+struct cache_set;
+
+int bch_dirent_create(struct cache_set *c, struct inode *, u8,
+ const struct qstr *, u64);
+int bch_dirent_delete(struct cache_set *c, struct inode *, const struct qstr *);
+
+enum bch_rename_mode {
+ BCH_RENAME,
+ BCH_RENAME_OVERWRITE,
+ BCH_RENAME_EXCHANGE,
+};
+
+int bch_dirent_rename(struct cache_set *,
+ struct inode *, const struct qstr *,
+ struct inode *, const struct qstr *,
+ u64 *, enum bch_rename_mode);
+
+u64 bch_dirent_lookup(struct cache_set *c, struct inode *,
+ const struct qstr *);
+int bch_empty_dir(struct cache_set *, u64);
+int bch_readdir(struct cache_set *, struct file *, struct dir_context *);
+
+#endif /* _BCACHE_DIRENT_H */
+
diff --git a/libbcache/error.c b/libbcache/error.c
new file mode 100644
index 0000000..9ba33ef
--- /dev/null
+++ b/libbcache/error.c
@@ -0,0 +1,140 @@
+#include "bcache.h"
+#include "error.h"
+#include "io.h"
+#include "notify.h"
+#include "super.h"
+
+void bch_inconsistent_error(struct cache_set *c)
+{
+ set_bit(CACHE_SET_ERROR, &c->flags);
+
+ switch (c->opts.errors) {
+ case BCH_ON_ERROR_CONTINUE:
+ break;
+ case BCH_ON_ERROR_RO:
+ if (!test_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags)) {
+ /* XXX do something better here? */
+ bch_cache_set_stop(c);
+ return;
+ }
+
+ if (bch_cache_set_emergency_read_only(c))
+ bch_err(c, "emergency read only");
+ break;
+ case BCH_ON_ERROR_PANIC:
+ panic(bch_fmt(c, "panic after error"));
+ break;
+ }
+}
+
+void bch_fatal_error(struct cache_set *c)
+{
+ if (bch_cache_set_emergency_read_only(c))
+ bch_err(c, "emergency read only");
+}
+
+/* Nonfatal IO errors, IO error/latency accounting: */
+
+/* Just does IO error accounting: */
+void bch_account_io_completion(struct cache *ca)
+{
+ /*
+ * The halflife of an error is:
+ * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
+ */
+
+ if (ca->set->error_decay) {
+ unsigned count = atomic_inc_return(&ca->io_count);
+
+ while (count > ca->set->error_decay) {
+ unsigned errors;
+ unsigned old = count;
+ unsigned new = count - ca->set->error_decay;
+
+ /*
+ * First we subtract refresh from count; each time we
+			 * successfully do so, we rescale the errors once:
+ */
+
+ count = atomic_cmpxchg(&ca->io_count, old, new);
+
+ if (count == old) {
+ count = new;
+
+ errors = atomic_read(&ca->io_errors);
+ do {
+ old = errors;
+ new = ((uint64_t) errors * 127) / 128;
+ errors = atomic_cmpxchg(&ca->io_errors,
+ old, new);
+ } while (old != errors);
+ }
+ }
+ }
+}
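+
+/*
+ * Sanity check on the halflife claim above: each time another error_decay
+ * I/Os complete, the error count is scaled by 127/128, and
+ * ln(1/2) / ln(127/128) ~= 88.4, so after roughly 88 such rescales the count
+ * has decayed to half its value.
+ */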
+
+/* IO error accounting and latency accounting: */
+void bch_account_io_completion_time(struct cache *ca,
+ unsigned submit_time_us, int op)
+{
+ struct cache_set *c;
+ unsigned threshold;
+
+ if (!ca)
+ return;
+
+ c = ca->set;
+ threshold = op_is_write(op)
+ ? c->congested_write_threshold_us
+ : c->congested_read_threshold_us;
+
+ if (threshold && submit_time_us) {
+ unsigned t = local_clock_us();
+
+ int us = t - submit_time_us;
+ int congested = atomic_read(&c->congested);
+
+ if (us > (int) threshold) {
+ int ms = us / 1024;
+ c->congested_last_us = t;
+
+ ms = min(ms, CONGESTED_MAX + congested);
+ atomic_sub(ms, &c->congested);
+ } else if (congested < 0)
+ atomic_inc(&c->congested);
+ }
+
+ bch_account_io_completion(ca);
+}
+
+void bch_nonfatal_io_error_work(struct work_struct *work)
+{
+ struct cache *ca = container_of(work, struct cache, io_error_work);
+ struct cache_set *c = ca->set;
+ unsigned errors = atomic_read(&ca->io_errors);
+ char buf[BDEVNAME_SIZE];
+ bool dev;
+
+ if (errors < c->error_limit) {
+ bch_notify_cache_error(ca, false);
+ } else {
+ bch_notify_cache_error(ca, true);
+
+ mutex_lock(&bch_register_lock);
+ dev = bch_cache_may_remove(ca);
+ if (dev
+ ? bch_cache_read_only(ca)
+ : bch_cache_set_emergency_read_only(c))
+ bch_err(c,
+ "too many IO errors on %s, setting %s RO",
+ bdevname(ca->disk_sb.bdev, buf),
+ dev ? "device" : "filesystem");
+ mutex_unlock(&bch_register_lock);
+ }
+}
+
+void bch_nonfatal_io_error(struct cache *ca)
+{
+ atomic_add(1 << IO_ERROR_SHIFT, &ca->io_errors);
+ queue_work(system_long_wq, &ca->io_error_work);
+}
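+
+/*
+ * Note that the error count is kept in fixed point: each error adds
+ * 1 << IO_ERROR_SHIFT, and the 127/128 decay in bch_account_io_completion()
+ * operates on that scaled value so fractional counts survive the decay;
+ * c->error_limit is compared against the scaled value, so it is in the same
+ * units.
+ */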
diff --git a/libbcache/error.h b/libbcache/error.h
new file mode 100644
index 0000000..9eb9335
--- /dev/null
+++ b/libbcache/error.h
@@ -0,0 +1,238 @@
+#ifndef _BCACHE_ERROR_H
+#define _BCACHE_ERROR_H
+
+#include <linux/printk.h>
+
+struct cache;
+struct cache_set;
+
+/*
+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag
+ * superblock as such
+ */
+
+/* Error messages: */
+
+#define __bch_cache_error(ca, fmt, ...) \
+do { \
+ char _buf[BDEVNAME_SIZE]; \
+ bch_err((ca)->set, "%s: " fmt, \
+ bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \
+} while (0)
+
+/*
+ * Very fatal logic/inconsistency errors: these indicate that we've majorly
+ * screwed up at runtime, i.e. it's not likely that it was just caused by the
+ * data on disk being inconsistent. These BUG():
+ *
+ * XXX: audit and convert to inconsistent() checks
+ */
+
+#define cache_set_bug(c, ...) \
+do { \
+ bch_err(c, __VA_ARGS__); \
+ BUG(); \
+} while (0)
+
+#define cache_set_bug_on(cond, c, ...) \
+do { \
+ if (cond) \
+ cache_set_bug(c, __VA_ARGS__); \
+} while (0)
+
+/*
+ * Inconsistency errors: The on disk data is inconsistent. If these occur during
+ * initial recovery, they don't indicate a bug in the running code - we walk all
+ * the metadata before modifying anything. If they occur at runtime, they
+ * indicate either a bug in the running code or (less likely) data is being
+ * silently corrupted under us.
+ *
+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in
+ * BCH_ON_ERROR_CONTINUE mode
+ */
+
+void bch_inconsistent_error(struct cache_set *);
+
+#define cache_set_inconsistent(c, ...) \
+do { \
+ bch_err(c, __VA_ARGS__); \
+ bch_inconsistent_error(c); \
+} while (0)
+
+#define cache_set_inconsistent_on(cond, c, ...) \
+({ \
+ int _ret = !!(cond); \
+ \
+ if (_ret) \
+ cache_set_inconsistent(c, __VA_ARGS__); \
+ _ret; \
+})
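+
+/*
+ * The _on() variants evaluate to the condition, so a caller can log and bail
+ * out in one expression (illustrative use, names hypothetical):
+ *
+ *	if (cache_set_inconsistent_on(gen > max_gen, c,
+ *				      "bucket gen %u > max %u", gen, max_gen))
+ *		return -EIO;
+ */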
+
+/*
+ * Later we might want to mark only the particular device inconsistent, not the
+ * entire cache set:
+ */
+
+#define cache_inconsistent(ca, ...) \
+do { \
+ __bch_cache_error(ca, __VA_ARGS__); \
+ bch_inconsistent_error((ca)->set); \
+} while (0)
+
+#define cache_inconsistent_on(cond, ca, ...) \
+({ \
+ int _ret = !!(cond); \
+ \
+ if (_ret) \
+ cache_inconsistent(ca, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * Fsck errors: inconsistency errors we detect at mount time, which we should
+ * ideally be able to repair:
+ */
+
+enum {
+ BCH_FSCK_OK = 0,
+ BCH_FSCK_ERRORS_NOT_FIXED = 1,
+ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2,
+ BCH_FSCK_REPAIR_IMPOSSIBLE = 3,
+ BCH_FSCK_UNKNOWN_VERSION = 4,
+};
+
+#define unfixable_fsck_err(c, msg, ...) \
+do { \
+ bch_err(c, msg " (repair unimplemented)", ##__VA_ARGS__); \
+ ret = BCH_FSCK_REPAIR_UNIMPLEMENTED; \
+ goto fsck_err; \
+} while (0)
+
+#define unfixable_fsck_err_on(cond, c, ...) \
+do { \
+ if (cond) \
+ unfixable_fsck_err(c, __VA_ARGS__); \
+} while (0)
+
+#define fsck_err(c, msg, ...) \
+do { \
+ if (!(c)->opts.fix_errors) { \
+ bch_err(c, msg, ##__VA_ARGS__); \
+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ goto fsck_err; \
+ } \
+ set_bit(CACHE_SET_FSCK_FIXED_ERRORS, &(c)->flags); \
+ bch_err(c, msg ", fixing", ##__VA_ARGS__); \
+} while (0)
+
+#define fsck_err_on(cond, c, ...) \
+({ \
+ bool _ret = (cond); \
+ \
+ if (_ret) \
+ fsck_err(c, __VA_ARGS__); \
+ _ret; \
+})
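+
+/*
+ * These helpers assume the caller provides an "int ret" and an "fsck_err"
+ * label to jump to - sketch of a hypothetical caller:
+ *
+ *	int ret = 0;
+ *
+ *	unfixable_fsck_err_on(i_nlink != real_nlink, c,
+ *			      "inode %llu has wrong link count", inum);
+ *	...
+ * fsck_err:
+ *	return ret;
+ */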
+
+/*
+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW
+ * mode - pretty much just due to metadata IO errors:
+ */
+
+void bch_fatal_error(struct cache_set *);
+
+#define cache_set_fatal_error(c, ...) \
+do { \
+ bch_err(c, __VA_ARGS__); \
+ bch_fatal_error(c); \
+} while (0)
+
+#define cache_set_fatal_err_on(cond, c, ...) \
+({ \
+ int _ret = !!(cond); \
+ \
+ if (_ret) \
+ cache_set_fatal_error(c, __VA_ARGS__); \
+ _ret; \
+})
+
+#define cache_fatal_error(ca, ...) \
+do { \
+ __bch_cache_error(ca, __VA_ARGS__); \
+	bch_fatal_error((ca)->set);					\
+} while (0)
+
+#define cache_fatal_io_error(ca, fmt, ...) \
+do { \
+ char _buf[BDEVNAME_SIZE]; \
+ \
+ printk_ratelimited(KERN_ERR bch_fmt((ca)->set, \
+ "fatal IO error on %s for " fmt), \
+ bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \
+ bch_fatal_error((ca)->set); \
+} while (0)
+
+#define cache_fatal_io_err_on(cond, ca, ...) \
+({ \
+ int _ret = !!(cond); \
+ \
+ if (_ret) \
+ cache_fatal_io_error(ca, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * Nonfatal IO errors: either recoverable metadata IO (because we have
+ * replicas), or data IO - we need to log it and print out a message, but we
+ * don't (necessarily) want to shut down the fs:
+ */
+
+void bch_account_io_completion(struct cache *);
+void bch_account_io_completion_time(struct cache *, unsigned, int);
+
+void bch_nonfatal_io_error_work(struct work_struct *);
+
+/* Does the error handling without logging a message */
+void bch_nonfatal_io_error(struct cache *);
+
+#if 0
+#define cache_set_nonfatal_io_error(c, ...) \
+do { \
+ bch_err(c, __VA_ARGS__); \
+ bch_nonfatal_io_error(c); \
+} while (0)
+#endif
+
+/* Logs message and handles the error: */
+#define cache_nonfatal_io_error(ca, fmt, ...) \
+do { \
+ char _buf[BDEVNAME_SIZE]; \
+ \
+ printk_ratelimited(KERN_ERR bch_fmt((ca)->set, \
+ "IO error on %s for " fmt), \
+ bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \
+ bch_nonfatal_io_error(ca); \
+} while (0)
+
+#define cache_nonfatal_io_err_on(cond, ca, ...) \
+({ \
+ bool _ret = (cond); \
+ \
+ if (_ret) \
+ cache_nonfatal_io_error(ca, __VA_ARGS__); \
+ _ret; \
+})
+
+/* kill? */
+
+#define __bcache_io_error(c, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch_fmt(c, \
+ "IO error: " fmt), ##__VA_ARGS__)
+
+#define bcache_io_error(c, bio, fmt, ...) \
+do { \
+ __bcache_io_error(c, fmt, ##__VA_ARGS__); \
+ (bio)->bi_error = -EIO; \
+} while (0)
+
+#endif /* _BCACHE_ERROR_H */
diff --git a/libbcache/extents.c b/libbcache/extents.c
new file mode 100644
index 0000000..45fa220
--- /dev/null
+++ b/libbcache/extents.c
@@ -0,0 +1,2514 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Code for managing the extent btree and dynamically updating the writeback
+ * dirty sector count.
+ */
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "debug.h"
+#include "dirent.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "writeback.h"
+#include "xattr.h"
+
+#include <trace/events/bcache.h>
+
+static bool __bch_extent_normalize(struct cache_set *, struct bkey_s, bool);
+static enum merge_result bch_extent_merge(struct cache_set *, struct btree *,
+ struct bkey_i *, struct bkey_i *);
+
+static void sort_key_next(struct btree_node_iter *iter,
+ struct btree *b,
+ struct btree_node_iter_set *i)
+{
+ i->k += __btree_node_offset_to_key(b, i->k)->u64s;
+
+ if (i->k == i->end)
+ *i = iter->data[--iter->used];
+}
+
+/*
+ * Returns true if l > r - unless l == r, in which case returns true if l is
+ * older than r.
+ *
+ * Necessary for btree_sort_fixup() - if there are multiple keys that compare
+ * equal in different sets, we have to process them newest to oldest.
+ */
+#define key_sort_cmp(l, r) \
+({ \
+ int _c = bkey_cmp_packed(b, \
+ __btree_node_offset_to_key(b, (l).k), \
+ __btree_node_offset_to_key(b, (r).k)); \
+ \
+ _c ? _c > 0 : (l).k > (r).k; \
+})
+
+static inline bool should_drop_next_key(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
+ struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
+
+ if (bkey_whiteout(k))
+ return true;
+
+ if (iter->used < 2)
+ return false;
+
+ if (iter->used > 2 &&
+ key_sort_cmp(r[0], r[1]))
+ r++;
+
+ /*
+ * key_sort_cmp() ensures that when keys compare equal the older key
+ * comes first; so if l->k compares equal to r->k then l->k is older and
+ * should be dropped.
+ */
+ return !bkey_cmp_packed(b,
+ __btree_node_offset_to_key(b, l->k),
+ __btree_node_offset_to_key(b, r->k));
+}
+
+struct btree_nr_keys bch_key_sort_fix_overlapping(struct bset *dst,
+ struct btree *b,
+ struct btree_node_iter *iter)
+{
+ struct bkey_packed *out = dst->start;
+ struct btree_nr_keys nr;
+
+ memset(&nr, 0, sizeof(nr));
+
+ heap_resort(iter, key_sort_cmp);
+
+ while (!bch_btree_node_iter_end(iter)) {
+ if (!should_drop_next_key(iter, b)) {
+ struct bkey_packed *k =
+ __btree_node_offset_to_key(b, iter->data->k);
+
+ bkey_copy(out, k);
+ btree_keys_account_key_add(&nr, 0, out);
+ out = bkey_next(out);
+ }
+
+ sort_key_next(iter, b, iter->data);
+ heap_sift(iter, 0, key_sort_cmp);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
+
+/* Common among btree and extent ptrs */
+
+bool bch_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
+{
+ const struct bch_extent_ptr *ptr;
+
+ extent_for_each_ptr(e, ptr)
+ if (ptr->dev == dev)
+ return true;
+
+ return false;
+}
+
+unsigned bch_extent_nr_ptrs_from(struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *start)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned nr_ptrs = 0;
+
+ extent_for_each_ptr_from(e, ptr, start)
+ nr_ptrs++;
+
+ return nr_ptrs;
+}
+
+unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent e)
+{
+ return bch_extent_nr_ptrs_from(e, &e.v->start->ptr);
+}
+
+/* returns true if equal */
+static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
+{
+ return extent_crc_type(l) == extent_crc_type(r) &&
+ !memcmp(l, r, extent_entry_bytes(to_entry(l)));
+}
+
+/* Increment pointers after @crc by crc's offset until the next crc entry: */
+void bch_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc)
+{
+ union bch_extent_entry *entry;
+
+ extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) {
+ if (!extent_entry_is_ptr(entry))
+ return;
+
+ entry->ptr.offset += crc_offset(crc);
+ }
+}
+
+/*
+ * We're writing another replica for this extent, so while we've got the data in
+ * memory we'll be computing a new checksum for the currently live data.
+ *
+ * If there are other replicas we aren't moving, and they are checksummed but
+ * not compressed, we can modify them to point to only the data that is
+ * currently live (so that readers won't have to bounce) while we've got the
+ * checksum we need:
+ *
+ * XXX: to guard against data being corrupted while in memory, instead of
+ * recomputing the checksum here it would be better if the read path, rather
+ * than computing the checksum of the entire extent:
+ *
+ * | extent |
+ *
+ * computed the checksums of the live and dead data separately:
+ * | dead data || live data || dead data |
+ *
+ * and then verified that crc_dead1 + crc_live + crc_dead2 == orig_crc; we
+ * could then reuse crc_live here, having already verified it was correct.
+ */
+void bch_extent_narrow_crcs(struct bkey_s_extent e)
+{
+ union bch_extent_crc *crc;
+ bool have_wide = false, have_narrow = false;
+ u64 csum = 0;
+ unsigned csum_type = 0;
+
+ extent_for_each_crc(e, crc) {
+ if (crc_compression_type(crc))
+ continue;
+
+ if (crc_uncompressed_size(e.k, crc) != e.k->size) {
+ have_wide = true;
+ } else {
+ have_narrow = true;
+ csum = crc_csum(crc);
+ csum_type = crc_csum_type(crc);
+ }
+ }
+
+ if (!have_wide || !have_narrow)
+ return;
+
+ extent_for_each_crc(e, crc) {
+ if (crc_compression_type(crc))
+ continue;
+
+ if (crc_uncompressed_size(e.k, crc) != e.k->size) {
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ BUG();
+ case BCH_EXTENT_CRC32:
+ if (bch_crc_size[csum_type] > sizeof(crc->crc32.csum))
+ continue;
+
+ bch_extent_crc_narrow_pointers(e, crc);
+ crc->crc32.compressed_size = e.k->size;
+ crc->crc32.uncompressed_size = e.k->size;
+ crc->crc32.offset = 0;
+ crc->crc32.csum_type = csum_type;
+ crc->crc32.csum = csum;
+ break;
+ case BCH_EXTENT_CRC64:
+ if (bch_crc_size[csum_type] > sizeof(crc->crc64.csum))
+ continue;
+
+ bch_extent_crc_narrow_pointers(e, crc);
+ crc->crc64.compressed_size = e.k->size;
+ crc->crc64.uncompressed_size = e.k->size;
+ crc->crc64.offset = 0;
+ crc->crc64.csum_type = csum_type;
+ crc->crc64.csum = csum;
+ break;
+ }
+ }
+ }
+}
+
+void bch_extent_drop_redundant_crcs(struct bkey_s_extent e)
+{
+ union bch_extent_entry *entry = e.v->start;
+ union bch_extent_crc *crc, *prev = NULL;
+
+ while (entry != extent_entry_last(e)) {
+ union bch_extent_entry *next = extent_entry_next(entry);
+ size_t crc_u64s = extent_entry_u64s(entry);
+
+ if (!extent_entry_is_crc(entry))
+ goto next;
+
+ crc = entry_to_crc(entry);
+
+ if (next == extent_entry_last(e)) {
+ /* crc entry with no pointers after it: */
+ goto drop;
+ }
+
+ if (extent_entry_is_crc(next)) {
+ /* no pointers before next crc entry: */
+ goto drop;
+ }
+
+ if (prev && crc_cmp(crc, prev)) {
+ /* identical to previous crc entry: */
+ goto drop;
+ }
+
+ if (!prev &&
+ !crc_csum_type(crc) &&
+ !crc_compression_type(crc)) {
+ /* null crc entry: */
+ bch_extent_crc_narrow_pointers(e, crc);
+ goto drop;
+ }
+
+ prev = crc;
+next:
+ entry = next;
+ continue;
+drop:
+ memmove_u64s_down(crc, next,
+ (u64 *) extent_entry_last(e) - (u64 *) next);
+ e.k->u64s -= crc_u64s;
+ }
+
+ EBUG_ON(bkey_val_u64s(e.k) && !bch_extent_nr_ptrs(e.c));
+}
+
+static bool should_drop_ptr(const struct cache_set *c,
+ struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *ptr)
+{
+ struct cache *ca;
+
+ return (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr);
+}
+
+static void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent e)
+{
+ struct bch_extent_ptr *ptr = &e.v->start->ptr;
+ bool dropped = false;
+
+ /*
+ * We don't want to change which pointers are considered cached/dirty,
+ * so don't remove pointers that are considered dirty:
+ */
+ rcu_read_lock();
+ while ((ptr = extent_ptr_next(e, ptr)) &&
+ !bch_extent_ptr_is_dirty(c, e.c, ptr))
+ if (should_drop_ptr(c, e.c, ptr)) {
+ __bch_extent_drop_ptr(e, ptr);
+ dropped = true;
+ } else
+ ptr++;
+ rcu_read_unlock();
+
+ if (dropped)
+ bch_extent_drop_redundant_crcs(e);
+}
+
+static bool bch_ptr_normalize(struct cache_set *c, struct btree *bk,
+ struct bkey_s k)
+{
+ return __bch_extent_normalize(c, k, false);
+}
+
+static void bch_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+{
+ u64 *d = (u64 *) bkeyp_val(f, k);
+ unsigned i;
+
+ for (i = 0; i < bkeyp_val_u64s(f, k); i++)
+ d[i] = swab64(d[i]);
+}
+
+static const char *extent_ptr_invalid(struct bkey_s_c_extent e,
+ const struct cache_member_rcu *mi,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk)
+{
+ const struct bch_extent_ptr *ptr2;
+ const struct cache_member_cpu *m = mi->m + ptr->dev;
+
+ if (ptr->dev > mi->nr_in_set || !m->valid)
+ return "pointer to invalid device";
+
+ extent_for_each_ptr(e, ptr2)
+ if (ptr != ptr2 && ptr->dev == ptr2->dev)
+ return "multiple pointers to same device";
+
+ if (ptr->offset + size_ondisk > m->bucket_size * m->nbuckets)
+ return "offset past end of device";
+
+ if (ptr->offset < m->bucket_size * m->first_bucket)
+ return "offset before first bucket";
+
+ if ((ptr->offset & (m->bucket_size - 1)) + size_ondisk > m->bucket_size)
+ return "spans multiple buckets";
+
+ return NULL;
+}
+
+static size_t extent_print_ptrs(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c_extent e)
+{
+ char *out = buf, *end = buf + size;
+ const union bch_extent_entry *entry;
+ const union bch_extent_crc *crc;
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+ bool first = true;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ rcu_read_lock();
+ extent_for_each_entry(e, entry) {
+ if (!first)
+ p(" ");
+
+ switch (__extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ crc = entry_to_crc(entry);
+ p("crc: c_size %u size %u offset %u csum %u compress %u",
+ crc_compressed_size(e.k, crc),
+ crc_uncompressed_size(e.k, crc),
+ crc_offset(crc), crc_csum_type(crc),
+ crc_compression_type(crc));
+ break;
+ case BCH_EXTENT_ENTRY_ptr:
+ ptr = &entry->ptr;
+ p("ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr)
+ ? " stale" : "");
+ break;
+ default:
+ p("(invalid extent entry %.16llx)", *((u64 *) entry));
+ goto out;
+ }
+
+ first = false;
+ }
+out:
+ rcu_read_unlock();
+
+ if (bkey_extent_is_cached(e.k))
+ p(" cached");
+#undef p
+ return out - buf;
+}
+
+/* Btree ptrs */
+
+static const char *bch_btree_ptr_invalid(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ if (bkey_extent_is_cached(k.k))
+ return "cached";
+
+ if (k.k->size)
+ return "nonzero key size";
+
+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+ return "value too big";
+
+ switch (k.k->type) {
+ case BCH_EXTENT: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+ struct cache_member_rcu *mi;
+ const char *reason;
+
+ extent_for_each_entry(e, entry)
+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+ return "invalid extent entry type";
+
+ mi = cache_member_info_get(c);
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ reason = extent_ptr_invalid(e, mi, ptr,
+ c->sb.btree_node_size);
+
+ if (reason) {
+ cache_member_info_put();
+ return reason;
+ }
+ }
+
+ cache_member_info_put();
+
+ if (crc)
+ return "has crc field";
+
+ return NULL;
+ }
+
+ default:
+ return "invalid value type";
+ }
+}
+
+static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ unsigned seq;
+ const char *err;
+ char buf[160];
+ struct bucket *g;
+ struct cache *ca;
+ unsigned replicas = 0;
+ bool bad;
+
+ rcu_read_lock();
+
+ extent_for_each_online_device(c, e, ptr, ca) {
+ replicas++;
+
+ if ((ca = PTR_CACHE(c, ptr))) {
+ g = PTR_BUCKET(ca, ptr);
+
+ err = "stale";
+ if (ptr_stale(ca, ptr))
+ goto err;
+
+ do {
+ seq = read_seqcount_begin(&c->gc_pos_lock);
+ bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
+ !g->mark.is_metadata;
+ } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+
+ err = "inconsistent";
+ if (bad)
+ goto err;
+ }
+ }
+
+ rcu_read_unlock();
+
+ if (replicas < c->sb.meta_replicas_have) {
+ bch_bkey_val_to_text(c, btree_node_type(b),
+ buf, sizeof(buf), k);
+ cache_set_bug(c,
+ "btree key bad (too few replicas, %u < %u): %s",
+ replicas, c->sb.meta_replicas_have, buf);
+ return;
+ }
+
+ return;
+err:
+ bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
+ cache_set_bug(c, "%s btree pointer %s: bucket %zi prio %i "
+ "gen %i last_gc %i mark %08x",
+ err, buf, PTR_BUCKET_NR(ca, ptr),
+ g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
+ ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
+ (unsigned) g->mark.counter);
+ rcu_read_unlock();
+}
+
+static void bch_btree_ptr_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ char *out = buf, *end = buf + size;
+ const char *invalid;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ if (bkey_extent_is_data(k.k))
+ out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+
+ invalid = bch_btree_ptr_invalid(c, k);
+ if (invalid)
+ p(" invalid: %s", invalid);
+#undef p
+}
+
+struct extent_pick_ptr
+bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
+{
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
+ const union bch_extent_crc *crc;
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+
+ rcu_read_lock();
+
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+ struct btree *root = btree_node_root(c, b);
+
+ if (cache_set_inconsistent_on(crc, c,
+ "btree node pointer with crc at btree %u level %u/%u bucket %zu",
+ b->btree_id, b->level, root ? root->level : -1,
+ PTR_BUCKET_NR(ca, ptr)))
+ break;
+
+ if (cache_inconsistent_on(ptr_stale(ca, ptr), ca,
+ "stale btree node pointer at btree %u level %u/%u bucket %zu",
+ b->btree_id, b->level, root ? root->level : -1,
+ PTR_BUCKET_NR(ca, ptr)))
+ continue;
+
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ return (struct extent_pick_ptr) { .ptr = *ptr, .ca = ca };
+ }
+
+ rcu_read_unlock();
+
+ return (struct extent_pick_ptr) { .ca = NULL, };
+}
+
+const struct bkey_ops bch_bkey_btree_ops = {
+ .key_invalid = bch_btree_ptr_invalid,
+ .key_debugcheck = btree_ptr_debugcheck,
+ .val_to_text = bch_btree_ptr_to_text,
+ .swab = bch_ptr_swab,
+};
+
+/* Extents */
+
+static bool __bch_cut_front(struct bpos where, struct bkey_s k)
+{
+ u64 len = 0;
+
+ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
+ return false;
+
+ EBUG_ON(bkey_cmp(where, k.k->p) > 0);
+
+ len = k.k->p.offset - where.offset;
+
+ BUG_ON(len > k.k->size);
+
+ /*
+ * Don't readjust offset if the key size is now 0, because that could
+ * cause offset to point to the next bucket:
+ */
+ if (!len)
+ __set_bkey_deleted(k.k);
+ else if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_extent e = bkey_s_to_extent(k);
+ struct bch_extent_ptr *ptr;
+ union bch_extent_crc *crc, *prev_crc = NULL;
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ ptr->offset += e.k->size - len;
+ break;
+ case BCH_EXTENT_CRC32:
+ if (prev_crc != crc)
+ crc->crc32.offset += e.k->size - len;
+ break;
+ case BCH_EXTENT_CRC64:
+ if (prev_crc != crc)
+ crc->crc64.offset += e.k->size - len;
+ break;
+ }
+ prev_crc = crc;
+ }
+ }
+
+ k.k->size = len;
+
+ return true;
+}
+
+bool bch_cut_front(struct bpos where, struct bkey_i *k)
+{
+ return __bch_cut_front(where, bkey_i_to_s(k));
+}
+
+bool bch_cut_back(struct bpos where, struct bkey *k)
+{
+ u64 len = 0;
+
+ if (bkey_cmp(where, k->p) >= 0)
+ return false;
+
+ EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
+
+ len = where.offset - bkey_start_offset(k);
+
+ BUG_ON(len > k->size);
+
+ k->p = where;
+ k->size = len;
+
+ if (!len)
+ __set_bkey_deleted(k);
+
+ return true;
+}
+
+/**
+ * bch_key_resize - adjust size of @k
+ *
+ * bkey_start_offset(k) will be preserved, modifies where the extent ends
+ */
+void bch_key_resize(struct bkey *k,
+ unsigned new_size)
+{
+ k->p.offset -= k->size;
+ k->p.offset += new_size;
+ k->size = new_size;
+}
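+
+/*
+ * E.g. a key covering sectors [64, 72) (size 8) resized to new_size = 4
+ * becomes [64, 68): p.offset (the end position) moves, the start stays put.
+ */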
+
+/*
+ * In extent_sort_fix_overlapping(), insert_fixup_extent(),
+ * extent_merge_inline() - we're modifying keys in place that are packed. To do
+ * that we have to unpack the key, modify the unpacked key - then this
+ * copies/repacks the unpacked to the original as necessary.
+ */
+static bool __extent_save(struct btree *b, struct btree_node_iter *iter,
+ struct bkey_packed *dst, struct bkey *src)
+{
+ struct bkey_format *f = &b->format;
+ struct bkey_i *dst_unpacked;
+ bool ret;
+
+ if ((dst_unpacked = packed_to_bkey(dst))) {
+ dst_unpacked->k = *src;
+ ret = true;
+ } else {
+ ret = bkey_pack_key(dst, src, f);
+ }
+
+ if (ret && iter)
+ bch_verify_key_order(b, iter, dst);
+
+ return ret;
+}
+
+static void extent_save(struct btree *b, struct btree_node_iter *iter,
+ struct bkey_packed *dst, struct bkey *src)
+{
+ BUG_ON(!__extent_save(b, iter, dst, src));
+}
+
+/*
+ * Returns true if l > r - unless l == r, in which case returns true if l is
+ * older than r.
+ *
+ * Necessary for sort_fix_overlapping() - if there are multiple keys that
+ * compare equal in different sets, we have to process them newest to oldest.
+ */
+#define extent_sort_cmp(l, r) \
+({ \
+ struct bkey _ul = bkey_unpack_key(b, \
+ __btree_node_offset_to_key(b, (l).k)); \
+ struct bkey _ur = bkey_unpack_key(b, \
+ __btree_node_offset_to_key(b, (r).k)); \
+ \
+ int _c = bkey_cmp(bkey_start_pos(&_ul), bkey_start_pos(&_ur)); \
+ _c ? _c > 0 : (l).k < (r).k; \
+})
+
+static inline void extent_sort_sift(struct btree_node_iter *iter,
+ struct btree *b, size_t i)
+{
+ heap_sift(iter, i, extent_sort_cmp);
+}
+
+static inline void extent_sort_next(struct btree_node_iter *iter,
+ struct btree *b,
+ struct btree_node_iter_set *i)
+{
+ sort_key_next(iter, b, i);
+ heap_sift(iter, i - iter->data, extent_sort_cmp);
+}
+
+static void extent_sort_append(struct cache_set *c,
+ struct btree *b,
+ struct btree_nr_keys *nr,
+ struct bkey_packed *start,
+ struct bkey_packed **prev,
+ struct bkey_packed *k)
+{
+ struct bkey_format *f = &b->format;
+ BKEY_PADDED(k) tmp;
+
+ if (bkey_whiteout(k))
+ return;
+
+ bkey_unpack(b, &tmp.k, k);
+
+ if (*prev &&
+ bch_extent_merge(c, b, (void *) *prev, &tmp.k))
+ return;
+
+ if (*prev) {
+ bkey_pack(*prev, (void *) *prev, f);
+
+ btree_keys_account_key_add(nr, 0, *prev);
+ *prev = bkey_next(*prev);
+ } else {
+ *prev = start;
+ }
+
+ bkey_copy(*prev, &tmp.k);
+}
+
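+/*
+ * Merge the bsets that @iter points at into @dst, resolving overlaps as we
+ * go: when two extents overlap, the newer one (from the later bset, i.e. the
+ * one at the higher offset within the node) wins and the older one is trimmed
+ * or dropped. Adjacent mergeable extents are also coalesced, via
+ * bch_extent_merge() in extent_sort_append().
+ */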
+struct btree_nr_keys bch_extent_sort_fix_overlapping(struct cache_set *c,
+ struct bset *dst,
+ struct btree *b,
+ struct btree_node_iter *iter)
+{
+ struct bkey_format *f = &b->format;
+ struct btree_node_iter_set *_l = iter->data, *_r;
+ struct bkey_packed *prev = NULL, *out, *lk, *rk;
+ struct bkey l_unpacked, r_unpacked;
+ struct bkey_s l, r;
+ struct btree_nr_keys nr;
+
+ memset(&nr, 0, sizeof(nr));
+
+ heap_resort(iter, extent_sort_cmp);
+
+ while (!bch_btree_node_iter_end(iter)) {
+ lk = __btree_node_offset_to_key(b, _l->k);
+
+ if (iter->used == 1) {
+ extent_sort_append(c, b, &nr, dst->start, &prev, lk);
+ extent_sort_next(iter, b, _l);
+ continue;
+ }
+
+ _r = iter->data + 1;
+ if (iter->used > 2 &&
+ extent_sort_cmp(_r[0], _r[1]))
+ _r++;
+
+ rk = __btree_node_offset_to_key(b, _r->k);
+
+ l = __bkey_disassemble(b, lk, &l_unpacked);
+ r = __bkey_disassemble(b, rk, &r_unpacked);
+
+ /* If current key and next key don't overlap, just append */
+ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
+ extent_sort_append(c, b, &nr, dst->start, &prev, lk);
+ extent_sort_next(iter, b, _l);
+ continue;
+ }
+
+ /* Skip 0 size keys */
+ if (!r.k->size) {
+ extent_sort_next(iter, b, _r);
+ continue;
+ }
+
+ /*
+ * overlap: keep the newer key and trim the older key so they
+ * don't overlap. comparing pointers tells us which one is
+ * newer, since the bsets are appended one after the other.
+ */
+
+ /* can't happen because of comparison func */
+ BUG_ON(_l->k < _r->k &&
+ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
+
+ if (_l->k > _r->k) {
+ /* l wins, trim r */
+ if (bkey_cmp(l.k->p, r.k->p) >= 0) {
+ sort_key_next(iter, b, _r);
+ } else {
+ __bch_cut_front(l.k->p, r);
+ extent_save(b, NULL, rk, r.k);
+ }
+
+ extent_sort_sift(iter, b, _r - iter->data);
+ } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
+ BKEY_PADDED(k) tmp;
+
+ /*
+ * r wins, but it overlaps in the middle of l - split l:
+ */
+ bkey_reassemble(&tmp.k, l.s_c);
+ bch_cut_back(bkey_start_pos(r.k), &tmp.k.k);
+
+ __bch_cut_front(r.k->p, l);
+ extent_save(b, NULL, lk, l.k);
+
+ extent_sort_sift(iter, b, 0);
+
+ extent_sort_append(c, b, &nr, dst->start, &prev,
+ bkey_to_packed(&tmp.k));
+ } else {
+ bch_cut_back(bkey_start_pos(r.k), l.k);
+ extent_save(b, NULL, lk, l.k);
+ }
+ }
+
+ if (prev) {
+ bkey_pack(prev, (void *) prev, f);
+ btree_keys_account_key_add(&nr, 0, prev);
+ out = bkey_next(prev);
+ } else {
+ out = dst->start;
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
+
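+/*
+ * State threaded through the extent insert path: @committed tracks how far
+ * into the insert key we've actually journalled and inserted, and @stats
+ * accumulates bucket usage deltas that are applied in one go at the end.
+ */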
+struct extent_insert_state {
+ struct btree_insert *trans;
+ struct btree_insert_entry *insert;
+ struct bpos committed;
+ struct bucket_stats_cache_set stats;
+
+ /* for deleting: */
+ struct bkey_i whiteout;
+ bool do_journal;
+ bool deleting;
+};
+
+static void bch_add_sectors(struct extent_insert_state *s,
+ struct bkey_s_c k, u64 offset, s64 sectors)
+{
+ struct cache_set *c = s->trans->c;
+ struct btree *b = s->insert->iter->nodes[0];
+
+ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
+
+ if (!sectors)
+ return;
+
+ bch_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
+ &s->stats, s->trans->journal_res.seq);
+
+ if (bkey_extent_is_data(k.k) &&
+ !bkey_extent_is_cached(k.k))
+ bcache_dev_sectors_dirty_add(c, k.k->p.inode, offset, sectors);
+}
+
+static void bch_subtract_sectors(struct extent_insert_state *s,
+ struct bkey_s_c k, u64 offset, s64 sectors)
+{
+ bch_add_sectors(s, k, offset, -sectors);
+}
+
+/* These wrappers subtract exactly the sectors that we're removing from @k */
+static void bch_cut_subtract_back(struct extent_insert_state *s,
+ struct bpos where, struct bkey_s k)
+{
+ bch_subtract_sectors(s, k.s_c, where.offset,
+ k.k->p.offset - where.offset);
+ bch_cut_back(where, k.k);
+}
+
+static void bch_cut_subtract_front(struct extent_insert_state *s,
+ struct bpos where, struct bkey_s k)
+{
+ bch_subtract_sectors(s, k.s_c, bkey_start_offset(k.k),
+ where.offset - bkey_start_offset(k.k));
+ __bch_cut_front(where, k);
+}
+
+static void bch_drop_subtract(struct extent_insert_state *s, struct bkey_s k)
+{
+ if (k.k->size)
+ bch_subtract_sectors(s, k.s_c,
+ bkey_start_offset(k.k), k.k->size);
+ k.k->size = 0;
+ __set_bkey_deleted(k.k);
+}
+
+/*
+ * Note: If this returns true because only some pointers matched,
+ * we can lose some caching that had happened in the interim.
+ * Because cache promotion only promotes the part of the extent
+ * actually read, and not the whole extent, and due to the key
+ * splitting done in bch_insert_fixup_extent(), preserving such
+ * caching is difficult.
+ */
+static bool bch_extent_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
+{
+ struct bkey_s_c_extent le, re;
+ const struct bch_extent_ptr *lp, *rp;
+ s64 offset;
+
+ BUG_ON(!l.k->size || !r.k->size);
+
+ if (l.k->type != r.k->type ||
+ l.k->version != r.k->version)
+ return false;
+
+ switch (l.k->type) {
+ case KEY_TYPE_COOKIE:
+ return !memcmp(bkey_s_c_to_cookie(l).v,
+ bkey_s_c_to_cookie(r).v,
+ sizeof(struct bch_cookie));
+
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ le = bkey_s_c_to_extent(l);
+ re = bkey_s_c_to_extent(r);
+
+ /*
+ * bkey_cmpxchg() handles partial matches - when either l or r
+		 * has been trimmed - so we just need to handle l or r not
+ * starting at the same place when checking for a match here.
+ *
+ * If the starts of the keys are different, we just apply that
+ * offset to the device pointer offsets when checking those -
+ * matching how bch_cut_front() adjusts device pointer offsets
+ * when adjusting the start of a key:
+ */
+ offset = bkey_start_offset(l.k) - bkey_start_offset(r.k);
+
+ /*
+ * XXX: perhaps we only raced with copygc or tiering replacing
+ * one of the pointers: it should suffice to find _any_ matching
+ * pointer
+ */
+
+ if (bkey_val_u64s(le.k) != bkey_val_u64s(re.k))
+ return false;
+
+ extent_for_each_ptr(le, lp) {
+ const union bch_extent_entry *entry =
+ bkey_idx(re.v, (u64 *) lp - le.v->_data);
+
+ if (!extent_entry_is_ptr(entry))
+ return false;
+
+ rp = &entry->ptr;
+
+ if (lp->offset != rp->offset + offset ||
+ lp->dev != rp->dev ||
+ lp->gen != rp->gen)
+ return false;
+ }
+
+ return true;
+ default:
+ return false;
+ }
+
+}
+
+/*
+ * Insert hook for cmpxchg-style updates: compares the existing key @k against
+ * the key we expect to find there (replace->key) and returns
+ * BTREE_HOOK_DO_INSERT on a match or BTREE_HOOK_NO_INSERT on a mismatch,
+ * bumping the success/failure counters in the containing bch_replace_info
+ * accordingly.
+ */
+enum extent_insert_hook_ret bch_extent_cmpxchg(struct extent_insert_hook *hook,
+ struct bpos committed_pos,
+ struct bpos next_pos,
+ struct bkey_s_c k,
+ const struct bkey_i *new)
+{
+ struct bch_replace_info *replace = container_of(hook,
+ struct bch_replace_info, hook);
+ struct bkey_i *old = &replace->key;
+
+ EBUG_ON(bkey_cmp(committed_pos, bkey_start_pos(&new->k)) < 0);
+
+ /* must have something to compare against */
+ EBUG_ON(!bkey_val_u64s(&old->k));
+
+ /* new must be a subset of old */
+ EBUG_ON(bkey_cmp(new->k.p, old->k.p) > 0 ||
+ bkey_cmp(bkey_start_pos(&new->k), bkey_start_pos(&old->k)) < 0);
+
+ if (k.k && bch_extent_cmpxchg_cmp(k, bkey_i_to_s_c(old))) {
+ replace->successes++;
+ return BTREE_HOOK_DO_INSERT;
+ } else {
+ replace->failures++;
+ return BTREE_HOOK_NO_INSERT;
+ }
+}
+
+static bool bch_extent_merge_inline(struct cache_set *,
+ struct btree_iter *,
+ struct bkey_packed *,
+ struct bkey_packed *,
+ bool);
+
+#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC)
+
+static enum btree_insert_ret
+extent_insert_should_stop(struct extent_insert_state *s)
+{
+ struct btree *b = s->insert->iter->nodes[0];
+
+ /*
+ * Check if we have sufficient space in both the btree node and the
+ * journal reservation:
+ *
+ * Each insert checks for room in the journal entry, but we check for
+ * room in the btree node up-front. In the worst case, bkey_cmpxchg()
+	 * will insert two keys, and one iteration of this loop will insert one
+ * key, so we need room for three keys.
+ */
+ if (!bch_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s))
+ return BTREE_INSERT_BTREE_NODE_FULL;
+ else if (!journal_res_insert_fits(s->trans, s->insert))
+ return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */
+ else
+ return BTREE_INSERT_OK;
+}
+
+static void extent_bset_insert(struct cache_set *c, struct btree_iter *iter,
+ struct bkey_i *insert)
+{
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct bset_tree *t = bset_tree_last(b);
+ struct bkey_packed *where =
+ bch_btree_node_iter_bset_pos(node_iter, b, t);
+ struct bkey_packed *prev = bkey_prev(b, t, where);
+ struct bkey_packed *next_live_key = where;
+ unsigned clobber_u64s;
+
+ if (prev)
+ where = bkey_next(prev);
+
+ while (next_live_key != btree_bkey_last(b, t) &&
+ bkey_deleted(next_live_key))
+ next_live_key = bkey_next(next_live_key);
+
+ /*
+ * Everything between where and next_live_key is now deleted keys, and
+ * is overwritten:
+ */
+ clobber_u64s = (u64 *) next_live_key - (u64 *) where;
+
+ if (prev &&
+ bch_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
+ goto drop_deleted_keys;
+
+ if (next_live_key != btree_bkey_last(b, t) &&
+ bch_extent_merge_inline(c, iter, bkey_to_packed(insert),
+ next_live_key, false))
+ goto drop_deleted_keys;
+
+ bch_bset_insert(b, node_iter, where, insert, clobber_u64s);
+ bch_btree_node_iter_fix(iter, b, node_iter, t, where,
+ clobber_u64s, where->u64s);
+ return;
+drop_deleted_keys:
+ bch_bset_delete(b, where, clobber_u64s);
+ bch_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0);
+}
+
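+/*
+ * Journal and insert the part of the insert key that lies before s->committed
+ * (when deleting, only a whiteout is journalled and nothing is added to the
+ * bset), then trim that part off the front; anything past s->committed is
+ * left for later iterations.
+ */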
+static void extent_insert_committed(struct extent_insert_state *s)
+{
+ struct cache_set *c = s->trans->c;
+ struct btree_iter *iter = s->insert->iter;
+ struct bkey_i *insert = !s->deleting
+ ? s->insert->k
+ : &s->whiteout;
+ BKEY_PADDED(k) split;
+
+ EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
+ EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
+
+ if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
+ return;
+
+ if (s->deleting && !s->do_journal) {
+ bch_cut_front(s->committed, insert);
+ goto done;
+ }
+
+ EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+
+ bkey_copy(&split.k, insert);
+
+ if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
+ bkey_cmp(s->committed, insert->k.p) &&
+ bkey_extent_is_compressed(c, bkey_i_to_s_c(insert))) {
+ /* XXX: possibly need to increase our reservation? */
+ bch_cut_subtract_back(s, s->committed,
+ bkey_i_to_s(&split.k));
+ bch_cut_front(s->committed, insert);
+ bch_add_sectors(s, bkey_i_to_s_c(insert),
+ bkey_start_offset(&insert->k),
+ insert->k.size);
+ } else {
+ bch_cut_back(s->committed, &split.k.k);
+ bch_cut_front(s->committed, insert);
+ }
+
+ if (debug_check_bkeys(c))
+ bkey_debugcheck(c, iter->nodes[iter->level],
+ bkey_i_to_s_c(&split.k));
+
+ bch_btree_journal_key(s->trans, iter, &split.k);
+
+ if (!s->deleting)
+ extent_bset_insert(c, iter, &split.k);
+done:
+ bch_btree_iter_set_pos_same_leaf(iter, s->committed);
+
+ insert->k.needs_whiteout = false;
+ s->do_journal = false;
+ s->trans->did_work = true;
+}
+
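+/*
+ * Advance s->committed to @next_pos, first giving the insert hook (if any) a
+ * chance to veto the insert over that range; if the hook (or the version
+ * check) says no, commit what we have so far and cut the skipped range off
+ * the front of the insert key.
+ */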
+static enum extent_insert_hook_ret
+__extent_insert_advance_pos(struct extent_insert_state *s,
+ struct bpos next_pos,
+ struct bkey_s_c k)
+{
+ struct extent_insert_hook *hook = s->trans->hook;
+ enum extent_insert_hook_ret ret;
+
+ if (k.k && k.k->size &&
+ s->insert->k->k.version &&
+ k.k->version > s->insert->k->k.version)
+ ret = BTREE_HOOK_NO_INSERT;
+ else if (hook)
+ ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
+ else
+ ret = BTREE_HOOK_DO_INSERT;
+
+ EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
+
+ switch (ret) {
+ case BTREE_HOOK_DO_INSERT:
+ break;
+ case BTREE_HOOK_NO_INSERT:
+ extent_insert_committed(s);
+ bch_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k));
+
+ bch_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos);
+ break;
+ case BTREE_HOOK_RESTART_TRANS:
+ return ret;
+ }
+
+ s->committed = next_pos;
+ return ret;
+}
+
+/*
+ * Update iter->pos, marking how much of @insert we've processed, and call hook
+ * fn:
+ */
+static enum extent_insert_hook_ret
+extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
+{
+ struct btree *b = s->insert->iter->nodes[0];
+ struct bpos next_pos = bpos_min(s->insert->k->k.p,
+ k.k ? k.k->p : b->key.k.p);
+
+ /* hole? */
+ if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
+ bool have_uncommitted = bkey_cmp(s->committed,
+ bkey_start_pos(&s->insert->k->k)) > 0;
+
+ switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k),
+ bkey_s_c_null)) {
+ case BTREE_HOOK_DO_INSERT:
+ break;
+ case BTREE_HOOK_NO_INSERT:
+ /*
+ * we had to split @insert and insert the committed
+ * part - need to bail out and recheck journal
+ * reservation/btree node before we advance pos past @k:
+ */
+ if (have_uncommitted)
+ return BTREE_HOOK_NO_INSERT;
+ break;
+ case BTREE_HOOK_RESTART_TRANS:
+ return BTREE_HOOK_RESTART_TRANS;
+ }
+ }
+
+ /* avoid redundant calls to hook fn: */
+ if (!bkey_cmp(s->committed, next_pos))
+ return BTREE_HOOK_DO_INSERT;
+
+ return __extent_insert_advance_pos(s, next_pos, k);
+}
+
+static enum btree_insert_ret
+extent_insert_check_split_compressed(struct extent_insert_state *s,
+ struct bkey_s_c k,
+ enum bch_extent_overlap overlap)
+{
+ struct cache_set *c = s->trans->c;
+ unsigned sectors;
+
+ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
+ (sectors = bkey_extent_is_compressed(c, k))) {
+ int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
+
+ if (s->trans->flags & BTREE_INSERT_NOFAIL)
+ flags |= BCH_DISK_RESERVATION_NOFAIL;
+
+ switch (bch_disk_reservation_add(c,
+ s->trans->disk_res,
+ sectors, flags)) {
+ case 0:
+ break;
+ case -ENOSPC:
+ return BTREE_INSERT_ENOSPC;
+ case -EINTR:
+ return BTREE_INSERT_NEED_GC_LOCK;
+ default:
+ BUG();
+ }
+ }
+
+ return BTREE_INSERT_OK;
+}
+
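+/*
+ * Trim (or delete) the existing key @k so that it no longer overlaps with
+ * @insert; @overlap says which case we're in - overlap at the front or back
+ * of @k, @insert covering all of @k, or @insert landing in the middle of @k
+ * (which splits @k in two).
+ */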
+static enum btree_insert_ret
+extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
+ struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k,
+ enum bch_extent_overlap overlap)
+{
+ struct cache_set *c = s->trans->c;
+ struct btree_iter *iter = s->insert->iter;
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+
+ switch (overlap) {
+ case BCH_EXTENT_OVERLAP_FRONT:
+ /* insert overlaps with start of k: */
+ bch_cut_subtract_front(s, insert->k.p, k);
+ BUG_ON(bkey_deleted(k.k));
+ extent_save(b, node_iter, _k, k.k);
+ break;
+
+ case BCH_EXTENT_OVERLAP_BACK:
+ /* insert overlaps with end of k: */
+ bch_cut_subtract_back(s, bkey_start_pos(&insert->k), k);
+ BUG_ON(bkey_deleted(k.k));
+ extent_save(b, node_iter, _k, k.k);
+
+ /*
+ * As the auxiliary tree is indexed by the end of the
+ * key and we've just changed the end, update the
+ * auxiliary tree.
+ */
+ bch_bset_fix_invalidated_key(b, t, _k);
+ bch_btree_node_iter_fix(iter, b, node_iter, t,
+ _k, _k->u64s, _k->u64s);
+ break;
+
+ case BCH_EXTENT_OVERLAP_ALL: {
+ struct bpos orig_pos = k.k->p;
+
+ /* The insert key completely covers k, invalidate k */
+ if (!bkey_whiteout(k.k))
+ btree_keys_account_key_drop(&b->nr,
+ t - b->set, _k);
+
+ bch_drop_subtract(s, k);
+ k.k->p = bkey_start_pos(&insert->k);
+ if (!__extent_save(b, node_iter, _k, k.k)) {
+ /*
+ * Couldn't repack: we aren't necessarily able
+ * to repack if the new key is outside the range
+ * of the old extent, so we have to split
+ * @insert:
+ */
+ k.k->p = orig_pos;
+ extent_save(b, node_iter, _k, k.k);
+
+ if (extent_insert_advance_pos(s, k.s_c) ==
+ BTREE_HOOK_RESTART_TRANS)
+ return BTREE_INSERT_NEED_TRAVERSE;
+
+ extent_insert_committed(s);
+ /*
+			 * We split and inserted up to k.k->p - that
+ * has to coincide with iter->pos, so that we
+ * don't have anything more we have to insert
+ * until we recheck our journal reservation:
+ */
+ EBUG_ON(bkey_cmp(s->committed, k.k->p));
+ } else {
+ bch_bset_fix_invalidated_key(b, t, _k);
+ bch_btree_node_iter_fix(iter, b, node_iter, t,
+ _k, _k->u64s, _k->u64s);
+ }
+
+ break;
+ }
+ case BCH_EXTENT_OVERLAP_MIDDLE: {
+ BKEY_PADDED(k) split;
+ /*
+ * The insert key falls 'in the middle' of k
+ * The insert key splits k in 3:
+ * - start only in k, preserve
+ * - middle common section, invalidate in k
+ * - end only in k, preserve
+ *
+ * We update the old key to preserve the start,
+ * insert will be the new common section,
+ * we manually insert the end that we are preserving.
+ *
+ * modify k _before_ doing the insert (which will move
+ * what k points to)
+ */
+ bkey_reassemble(&split.k, k.s_c);
+ split.k.k.needs_whiteout |= bset_written(b, bset(b, t));
+
+ bch_cut_back(bkey_start_pos(&insert->k), &split.k.k);
+ BUG_ON(bkey_deleted(&split.k.k));
+
+ bch_cut_subtract_front(s, insert->k.p, k);
+ BUG_ON(bkey_deleted(k.k));
+ extent_save(b, node_iter, _k, k.k);
+
+ bch_add_sectors(s, bkey_i_to_s_c(&split.k),
+ bkey_start_offset(&split.k.k),
+ split.k.k.size);
+ extent_bset_insert(c, iter, &split.k);
+ break;
+ }
+ }
+
+ return BTREE_INSERT_OK;
+}
+
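+/*
+ * Deletion path: @insert is a whiteout; walk the existing extents it overlaps
+ * with, dropping or trimming them and inserting discard keys where a whiteout
+ * needs to persist (i.e. when the overwritten key was already written out or
+ * itself needed a whiteout).
+ */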
+static enum btree_insert_ret
+bch_delete_fixup_extent(struct extent_insert_state *s)
+{
+ struct cache_set *c = s->trans->c;
+ struct btree_iter *iter = s->insert->iter;
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct bkey_packed *_k;
+ struct bkey unpacked;
+ struct bkey_i *insert = s->insert->k;
+ enum btree_insert_ret ret = BTREE_INSERT_OK;
+
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+
+ s->whiteout = *insert;
+ s->do_journal = false;
+
+ while (bkey_cmp(s->committed, insert->k.p) < 0 &&
+ (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
+ (_k = bch_btree_node_iter_peek_all(node_iter, b))) {
+ struct bset_tree *t = bch_bkey_to_bset(b, _k);
+ struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
+ enum bch_extent_overlap overlap;
+
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+ EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
+
+ if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+ break;
+
+ if (bkey_whiteout(k.k)) {
+ s->committed = bpos_min(insert->k.p, k.k->p);
+ goto next;
+ }
+
+ overlap = bch_extent_overlap(&insert->k, k.k);
+
+ ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
+ if (ret != BTREE_INSERT_OK)
+ goto stop;
+
+ switch (extent_insert_advance_pos(s, k.s_c)) {
+ case BTREE_HOOK_DO_INSERT:
+ break;
+ case BTREE_HOOK_NO_INSERT:
+ continue;
+ case BTREE_HOOK_RESTART_TRANS:
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+ goto stop;
+ }
+
+ s->do_journal = true;
+
+ if (overlap == BCH_EXTENT_OVERLAP_ALL) {
+ btree_keys_account_key_drop(&b->nr,
+ t - b->set, _k);
+ bch_subtract_sectors(s, k.s_c,
+ bkey_start_offset(k.k), k.k->size);
+ _k->type = KEY_TYPE_DISCARD;
+ reserve_whiteout(b, t, _k);
+ } else if (k.k->needs_whiteout ||
+ bset_written(b, bset(b, t))) {
+ struct bkey_i discard = *insert;
+
+ switch (overlap) {
+ case BCH_EXTENT_OVERLAP_FRONT:
+ bch_cut_front(bkey_start_pos(k.k), &discard);
+ break;
+ case BCH_EXTENT_OVERLAP_BACK:
+ bch_cut_back(k.k->p, &discard.k);
+ break;
+ default:
+ break;
+ }
+
+ discard.k.needs_whiteout = true;
+
+ ret = extent_squash(s, insert, t, _k, k, overlap);
+ BUG_ON(ret != BTREE_INSERT_OK);
+
+ extent_bset_insert(c, iter, &discard);
+ } else {
+ ret = extent_squash(s, insert, t, _k, k, overlap);
+ BUG_ON(ret != BTREE_INSERT_OK);
+ }
+next:
+ bch_cut_front(s->committed, insert);
+ bch_btree_iter_set_pos_same_leaf(iter, s->committed);
+ }
+
+ if (bkey_cmp(s->committed, insert->k.p) < 0 &&
+ ret == BTREE_INSERT_OK &&
+ extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+stop:
+ extent_insert_committed(s);
+
+ bch_cache_set_stats_apply(c, &s->stats, s->trans->disk_res,
+ gc_pos_btree_node(b));
+
+ EBUG_ON(bkey_cmp(iter->pos, s->committed));
+ EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
+
+ bch_cut_front(iter->pos, insert);
+
+ if (insert->k.size && iter->at_end_of_leaf)
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+
+ EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
+
+ return ret;
+}
+
+/**
+ * bch_insert_fixup_extent - insert a new extent and deal with overlaps
+ *
+ * this may result in not actually doing the insert, or inserting some subset
+ * of the insert key. For cmpxchg operations this is where that logic lives.
+ *
+ * All subsets of @insert that need to be inserted are inserted using
+ * bch_btree_insert_and_journal(). If the btree node or the journal
+ * reservation fills up, this function stops early, setting @iter->pos to the
+ * end of the prefix of @insert that actually got inserted.
+ *
+ * BSET INVARIANTS: this function is responsible for maintaining all the
+ * invariants for bsets of extents in memory. things get really hairy with 0
+ * size extents
+ *
+ * within one bset:
+ *
+ * bkey_start_pos(bkey_next(k)) >= k
+ * or bkey_start_offset(bkey_next(k)) >= k->offset
+ *
+ * i.e. strict ordering, no overlapping extents.
+ *
+ * multiple bsets (i.e. full btree node):
+ *
+ * ∀ k, j
+ * k.size != 0 ∧ j.size != 0 →
+ * ¬ (k > bkey_start_pos(j) ∧ k < j)
+ *
+ * i.e. no two overlapping keys _of nonzero size_
+ *
+ * We can't realistically maintain this invariant for zero size keys because of
+ * the key merging done in bch_btree_insert_key() - for two mergeable keys k, j
+ * there may be another 0 size key between them in another bset, and it will
+ * thus overlap with the merged key.
+ *
+ * In addition, iter->pos indicates how much of @insert has been processed.
+ * If iter->pos is not the same as the end of @insert, then key insertion
+ * needs to continue/be retried.
+ */
+enum btree_insert_ret
+bch_insert_fixup_extent(struct btree_insert *trans,
+ struct btree_insert_entry *insert)
+{
+ struct cache_set *c = trans->c;
+ struct btree_iter *iter = insert->iter;
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct bkey_packed *_k;
+ struct bkey unpacked;
+ enum btree_insert_ret ret = BTREE_INSERT_OK;
+
+ struct extent_insert_state s = {
+ .trans = trans,
+ .insert = insert,
+ .committed = insert->iter->pos,
+ .deleting = bkey_whiteout(&insert->k->k),
+ };
+
+ EBUG_ON(iter->level);
+ EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
+
+ if (s.deleting)
+ return bch_delete_fixup_extent(&s);
+
+ /*
+ * As we process overlapping extents, we advance @iter->pos both to
+ * signal to our caller (btree_insert_key()) how much of @insert->k has
+ * been inserted, and also to keep @iter->pos consistent with
+ * @insert->k and the node iterator that we're advancing:
+ */
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
+
+ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ bch_add_sectors(&s, bkey_i_to_s_c(insert->k),
+ bkey_start_offset(&insert->k->k),
+ insert->k->k.size);
+
+ while (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
+ (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK &&
+ (_k = bch_btree_node_iter_peek_all(node_iter, b))) {
+ struct bset_tree *t = bch_bkey_to_bset(b, _k);
+ struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
+ enum bch_extent_overlap overlap;
+
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
+ EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
+
+ if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0)
+ break;
+
+ overlap = bch_extent_overlap(&insert->k->k, k.k);
+
+ ret = extent_insert_check_split_compressed(&s, k.s_c, overlap);
+ if (ret != BTREE_INSERT_OK)
+ goto stop;
+
+ if (!k.k->size)
+ goto squash;
+
+ /*
+ * Only call advance pos & call hook for nonzero size extents:
+ * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer
+ * overlaps with @k:
+ */
+ switch (extent_insert_advance_pos(&s, k.s_c)) {
+ case BTREE_HOOK_DO_INSERT:
+ break;
+ case BTREE_HOOK_NO_INSERT:
+ continue;
+ case BTREE_HOOK_RESTART_TRANS:
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+ goto stop;
+ }
+
+ if (k.k->size &&
+ (k.k->needs_whiteout || bset_written(b, bset(b, t))))
+ insert->k->k.needs_whiteout = true;
+
+ if (overlap == BCH_EXTENT_OVERLAP_ALL &&
+ bkey_whiteout(k.k) &&
+ k.k->needs_whiteout) {
+ unreserve_whiteout(b, t, _k);
+ _k->needs_whiteout = false;
+ }
+squash:
+ ret = extent_squash(&s, insert->k, t, _k, k, overlap);
+ if (ret != BTREE_INSERT_OK)
+ goto stop;
+ }
+
+ if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
+ ret == BTREE_INSERT_OK &&
+ extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+stop:
+ extent_insert_committed(&s);
+ /*
+ * Subtract any remaining sectors from @insert, if we bailed out early
+ * and didn't fully insert @insert:
+ */
+ if (insert->k->k.size &&
+ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ bch_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
+ bkey_start_offset(&insert->k->k),
+ insert->k->k.size);
+
+ bch_cache_set_stats_apply(c, &s.stats, trans->disk_res,
+ gc_pos_btree_node(b));
+
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
+ EBUG_ON(bkey_cmp(iter->pos, s.committed));
+ EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
+
+ if (insert->k->k.size && iter->at_end_of_leaf)
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+
+ EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
+
+ return ret;
+}
+
+static const char *bch_extent_invalid(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
+ return "value too big";
+
+ if (!k.k->size)
+ return "zero key size";
+
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
+ const union bch_extent_crc *crc;
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ unsigned size_ondisk = e.k->size;
+ const char *reason;
+
+ extent_for_each_entry(e, entry) {
+ reason = "invalid extent entry type";
+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+ goto invalid;
+
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ crc = entry_to_crc(entry);
+
+ reason = "checksum offset + key size > uncompressed size";
+ if (crc_offset(crc) + e.k->size >
+ crc_uncompressed_size(e.k, crc))
+ goto invalid;
+
+ size_ondisk = crc_compressed_size(e.k, crc);
+
+ reason = "invalid checksum type";
+ if (crc_csum_type(crc) >= BCH_CSUM_NR)
+ goto invalid;
+
+ reason = "invalid compression type";
+ if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
+ goto invalid;
+ break;
+ case BCH_EXTENT_ENTRY_ptr:
+ reason = extent_ptr_invalid(e, mi,
+ &entry->ptr, size_ondisk);
+ if (reason)
+ goto invalid;
+ break;
+ }
+ }
+
+ cache_member_info_put();
+ return NULL;
+invalid:
+ cache_member_info_put();
+ return reason;
+ }
+
+ case BCH_RESERVATION:
+ return NULL;
+
+ default:
+ return "invalid value type";
+ }
+}
+
+static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
+ struct bkey_s_c_extent e)
+{
+ const struct bch_extent_ptr *ptr;
+ struct cache_member_rcu *mi;
+ struct cache *ca;
+ struct bucket *g;
+ unsigned seq, stale;
+ char buf[160];
+ bool bad;
+ unsigned ptrs_per_tier[CACHE_TIERS];
+ unsigned tier, replicas = 0;
+
+ /*
+ * XXX: we should be doing most/all of these checks at startup time,
+ * where we check bkey_invalid() in btree_node_read_done()
+ *
+ * But note that we can't check for stale pointers or incorrect gc marks
+ * until after journal replay is done (it might be an extent that's
+ * going to get overwritten during replay)
+ */
+
+ memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
+
+ mi = cache_member_info_get(c);
+
+ extent_for_each_ptr(e, ptr) {
+ bool dirty = bch_extent_ptr_is_dirty(c, e, ptr);
+
+ replicas++;
+
+ if (ptr->dev >= mi->nr_in_set)
+ goto bad_device;
+
+ /*
+ * If journal replay hasn't finished, we might be seeing keys
+ * that will be overwritten by the time journal replay is done:
+ */
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ continue;
+
+ if (!mi->m[ptr->dev].valid)
+ goto bad_device;
+
+ tier = mi->m[ptr->dev].tier;
+ ptrs_per_tier[tier]++;
+
+ stale = 0;
+
+ if ((ca = PTR_CACHE(c, ptr))) {
+ g = PTR_BUCKET(ca, ptr);
+
+ do {
+ struct bucket_mark mark;
+
+ seq = read_seqcount_begin(&c->gc_pos_lock);
+ mark = READ_ONCE(g->mark);
+
+ /* between mark and bucket gen */
+ smp_rmb();
+
+ stale = ptr_stale(ca, ptr);
+
+ cache_set_bug_on(stale && dirty, c,
+ "stale dirty pointer");
+
+ cache_set_bug_on(stale > 96, c,
+ "key too stale: %i",
+ stale);
+
+ if (stale)
+ break;
+
+ bad = (mark.is_metadata ||
+ (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
+ !mark.owned_by_allocator &&
+ !(dirty
+ ? mark.dirty_sectors
+ : mark.cached_sectors)));
+ } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+
+ if (bad)
+ goto bad_ptr;
+ }
+ }
+ cache_member_info_put();
+
+ if (replicas > BCH_REPLICAS_MAX) {
+ bch_bkey_val_to_text(c, btree_node_type(b), buf,
+ sizeof(buf), e.s_c);
+ cache_set_bug(c,
+ "extent key bad (too many replicas: %u): %s",
+ replicas, buf);
+ return;
+ }
+
+ if (!bkey_extent_is_cached(e.k) &&
+ replicas < c->sb.data_replicas_have) {
+ bch_bkey_val_to_text(c, btree_node_type(b), buf,
+ sizeof(buf), e.s_c);
+ cache_set_bug(c,
+ "extent key bad (too few replicas, %u < %u): %s",
+ replicas, c->sb.data_replicas_have, buf);
+ return;
+ }
+
+ return;
+
+bad_device:
+ bch_bkey_val_to_text(c, btree_node_type(b), buf,
+ sizeof(buf), e.s_c);
+ cache_set_bug(c, "extent pointer to dev %u missing device: %s",
+ ptr->dev, buf);
+ cache_member_info_put();
+ return;
+
+bad_ptr:
+ bch_bkey_val_to_text(c, btree_node_type(b), buf,
+ sizeof(buf), e.s_c);
+ cache_set_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i "
+ "gen %i last_gc %i mark 0x%08x",
+ buf, PTR_BUCKET_NR(ca, ptr),
+ g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
+ ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
+ (unsigned) g->mark.counter);
+ cache_member_info_put();
+ return;
+}
+
+static void bch_extent_debugcheck(struct cache_set *c, struct btree *b,
+ struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+		bch_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
+		break;
+ case BCH_RESERVATION:
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void bch_extent_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ char *out = buf, *end = buf + size;
+ const char *invalid;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ if (bkey_extent_is_data(k.k))
+ out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+
+ invalid = bch_extent_invalid(c, k);
+ if (invalid)
+ p(" invalid: %s", invalid);
+#undef p
+}
+
+static unsigned PTR_TIER(struct cache_member_rcu *mi,
+ const struct bch_extent_ptr *ptr)
+{
+ return ptr->dev < mi->nr_in_set
+ ? mi->m[ptr->dev].tier
+ : UINT_MAX;
+}
+
+void bch_extent_entry_append(struct bkey_i_extent *e,
+ union bch_extent_entry *entry)
+{
+ BUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
+ BKEY_EXTENT_VAL_U64s_MAX);
+
+ memcpy_u64s(extent_entry_last(extent_i_to_s(e)),
+ entry,
+ extent_entry_u64s(entry));
+ e->k.u64s += extent_entry_u64s(entry);
+}
+
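+/* size, in bytes, of each checksum type: */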
+const unsigned bch_crc_size[] = {
+ [BCH_CSUM_NONE] = 0,
+ [BCH_CSUM_CRC32C] = 4,
+ [BCH_CSUM_CRC64] = 8,
+};
+
+static void bch_extent_crc_init(union bch_extent_crc *crc,
+ unsigned compressed_size,
+ unsigned uncompressed_size,
+ unsigned compression_type,
+ u64 csum, unsigned csum_type)
+{
+ if (bch_crc_size[csum_type] <= 4 &&
+ uncompressed_size <= CRC32_EXTENT_SIZE_MAX) {
+ crc->crc32 = (struct bch_extent_crc32) {
+ .type = 1 << BCH_EXTENT_ENTRY_crc32,
+ .compressed_size = compressed_size,
+ .uncompressed_size = uncompressed_size,
+ .offset = 0,
+ .compression_type = compression_type,
+ .csum_type = csum_type,
+ .csum = csum,
+ };
+ } else {
+ BUG_ON(uncompressed_size > CRC64_EXTENT_SIZE_MAX);
+
+ crc->crc64 = (struct bch_extent_crc64) {
+ .type = 1 << BCH_EXTENT_ENTRY_crc64,
+ .compressed_size = compressed_size,
+ .uncompressed_size = uncompressed_size,
+ .offset = 0,
+ .compression_type = compression_type,
+ .csum_type = csum_type,
+ .csum = csum,
+ };
+ }
+}
+
+void bch_extent_crc_append(struct bkey_i_extent *e,
+ unsigned compressed_size,
+ unsigned uncompressed_size,
+ unsigned compression_type,
+ u64 csum, unsigned csum_type)
+{
+ union bch_extent_crc *crc;
+ union bch_extent_crc new;
+
+ BUG_ON(compressed_size > uncompressed_size);
+ BUG_ON(uncompressed_size != e->k.size);
+ BUG_ON(!compressed_size || !uncompressed_size);
+
+ /*
+ * Look up the last crc entry, so we can check if we need to add
+ * another:
+ */
+ extent_for_each_crc(extent_i_to_s(e), crc)
+ ;
+
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ if (!csum_type && !compression_type)
+ return;
+ break;
+ case BCH_EXTENT_CRC32:
+ case BCH_EXTENT_CRC64:
+ if (crc_compressed_size(&e->k, crc) == compressed_size &&
+ crc_uncompressed_size(&e->k, crc) == uncompressed_size &&
+ crc_offset(crc) == 0 &&
+ crc_compression_type(crc) == compression_type &&
+ crc_csum_type(crc) == csum_type &&
+ crc_csum(crc) == csum)
+ return;
+ break;
+ }
+
+ bch_extent_crc_init(&new,
+ compressed_size,
+ uncompressed_size,
+ compression_type,
+ csum, csum_type);
+ bch_extent_entry_append(e, to_entry(&new));
+}
+
+static void __extent_sort_ptrs(struct cache_member_rcu *mi,
+ struct bkey_s_extent src)
+{
+ struct bch_extent_ptr *src_ptr, *dst_ptr;
+ union bch_extent_crc *src_crc, *dst_crc;
+ union bch_extent_crc _src;
+ BKEY_PADDED(k) tmp;
+ struct bkey_s_extent dst;
+ size_t u64s, crc_u64s;
+ u64 *p;
+
+ /*
+ * Insertion sort:
+ *
+ * Note: this sort needs to be stable, because pointer order determines
+	 * pointer dirtiness.
+ */
+
+ tmp.k.k = *src.k;
+ dst = bkey_i_to_s_extent(&tmp.k);
+ set_bkey_val_u64s(dst.k, 0);
+
+ extent_for_each_ptr_crc(src, src_ptr, src_crc) {
+ extent_for_each_ptr_crc(dst, dst_ptr, dst_crc)
+ if (PTR_TIER(mi, src_ptr) < PTR_TIER(mi, dst_ptr))
+ goto found;
+
+ dst_ptr = &extent_entry_last(dst)->ptr;
+ dst_crc = NULL;
+found:
+ /* found insert position: */
+
+ /*
+		 * We're making sure everything has a crc at this point; if
+		 * dst_ptr points to a pointer, it had better have a crc:
+ */
+ BUG_ON(dst_ptr != &extent_entry_last(dst)->ptr && !dst_crc);
+ BUG_ON(dst_crc &&
+ (extent_entry_next(to_entry(dst_crc)) !=
+ to_entry(dst_ptr)));
+
+ if (!src_crc) {
+ bch_extent_crc_init(&_src, src.k->size,
+ src.k->size, 0, 0, 0);
+ src_crc = &_src;
+ }
+
+ p = dst_ptr != &extent_entry_last(dst)->ptr
+ ? (void *) dst_crc
+ : (void *) dst_ptr;
+
+ crc_u64s = extent_entry_u64s(to_entry(src_crc));
+ u64s = crc_u64s + sizeof(*dst_ptr) / sizeof(u64);
+
+ memmove_u64s_up(p + u64s, p,
+ (u64 *) extent_entry_last(dst) - (u64 *) p);
+ set_bkey_val_u64s(dst.k, bkey_val_u64s(dst.k) + u64s);
+
+ memcpy_u64s(p, src_crc, crc_u64s);
+ memcpy_u64s(p + crc_u64s, src_ptr,
+ sizeof(*src_ptr) / sizeof(u64));
+ }
+
+ /* Sort done - now drop redundant crc entries: */
+ bch_extent_drop_redundant_crcs(dst);
+
+ memcpy_u64s(src.v, dst.v, bkey_val_u64s(dst.k));
+ set_bkey_val_u64s(src.k, bkey_val_u64s(dst.k));
+}
+
+static void extent_sort_ptrs(struct cache_set *c, struct bkey_s_extent e)
+{
+ struct cache_member_rcu *mi;
+ struct bch_extent_ptr *ptr, *prev = NULL;
+ union bch_extent_crc *crc;
+
+ /*
+ * First check if any pointers are out of order before doing the actual
+ * sort:
+ */
+ mi = cache_member_info_get(c);
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ if (prev &&
+ PTR_TIER(mi, ptr) < PTR_TIER(mi, prev)) {
+ __extent_sort_ptrs(mi, e);
+ break;
+ }
+ prev = ptr;
+ }
+
+ cache_member_info_put();
+}
+
+/*
+ * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
+ *
+ * Returns true if @k should be dropped entirely
+ *
+ * For existing keys, only called when btree nodes are being rewritten, not when
+ * they're merely being compacted/resorted in memory.
+ */
+static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k,
+ bool sort)
+{
+ struct bkey_s_extent e;
+
+ switch (k.k->type) {
+ case KEY_TYPE_ERROR:
+ return false;
+
+ case KEY_TYPE_DELETED:
+ case KEY_TYPE_COOKIE:
+ return true;
+
+ case KEY_TYPE_DISCARD:
+ return !k.k->version;
+
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ e = bkey_s_to_extent(k);
+
+ bch_extent_drop_stale(c, e);
+
+ if (sort)
+ extent_sort_ptrs(c, e);
+
+ if (!bkey_val_u64s(e.k)) {
+ if (bkey_extent_is_cached(e.k)) {
+ k.k->type = KEY_TYPE_DISCARD;
+ if (!k.k->version)
+ return true;
+ } else {
+ k.k->type = KEY_TYPE_ERROR;
+ }
+ }
+
+ return false;
+ case BCH_RESERVATION:
+ return false;
+ default:
+ BUG();
+ }
+}
+
+bool bch_extent_normalize(struct cache_set *c, struct bkey_s k)
+{
+ return __bch_extent_normalize(c, k, true);
+}
+
+/*
+ * This picks a non-stale pointer, preferably from a device other than
+ * avoid. Avoid can be NULL, meaning pick any. If there are no non-stale
+ * pointers to other devices, it will still pick a pointer from avoid.
+ * Note that it prefers lower-numbered pointers to higher-numbered pointers
+ * as the pointers are sorted by tier, hence preferring pointers to tier 0
+ * rather than pointers to tier 1.
+ */
+void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
+ struct cache *avoid,
+ struct extent_pick_ptr *ret)
+{
+ struct bkey_s_c_extent e;
+ const union bch_extent_crc *crc;
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+
+ switch (k.k->type) {
+ case KEY_TYPE_DELETED:
+ case KEY_TYPE_DISCARD:
+ case KEY_TYPE_COOKIE:
+ ret->ca = NULL;
+ return;
+
+ case KEY_TYPE_ERROR:
+ ret->ca = ERR_PTR(-EIO);
+ return;
+
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ e = bkey_s_c_to_extent(k);
+ rcu_read_lock();
+ ret->ca = NULL;
+
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca)
+ if (!ptr_stale(ca, ptr)) {
+ *ret = (struct extent_pick_ptr) {
+ .crc = crc_to_64(e.k, crc),
+ .ptr = *ptr,
+ .ca = ca,
+ };
+
+ if (ca != avoid)
+ break;
+ }
+
+ if (ret->ca)
+ percpu_ref_get(&ret->ca->ref);
+ else if (!bkey_extent_is_cached(e.k))
+ ret->ca = ERR_PTR(-EIO);
+
+ rcu_read_unlock();
+ return;
+
+ case BCH_RESERVATION:
+ ret->ca = NULL;
+ return;
+
+ default:
+ BUG();
+ }
+}
+
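+/*
+ * Try to merge @r into @l. Illustrative example (hypothetical keys, not from
+ * this patch): two uncompressed data extents (no crc entries) covering
+ * [0, 4) and [4, 8), whose pointers point at consecutive offsets within the
+ * same bucket of the same device (same gen), merge into a single extent
+ * covering [0, 8). Non-data types (deleted/discard/error/reservation) merge
+ * on the header checks alone, and a merge that would overflow KEY_SIZE_MAX
+ * only merges partially.
+ */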
+static enum merge_result bch_extent_merge(struct cache_set *c,
+ struct btree *bk,
+ struct bkey_i *l, struct bkey_i *r)
+{
+ struct bkey_s_extent el, er;
+ union bch_extent_entry *en_l, *en_r;
+
+ if (key_merging_disabled(c))
+ return BCH_MERGE_NOMERGE;
+
+ /*
+ * Generic header checks
+ * Assumes left and right are in order
+ * Left and right must be exactly aligned
+ */
+
+ if (l->k.u64s != r->k.u64s ||
+ l->k.type != r->k.type ||
+ l->k.version != r->k.version ||
+ bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
+ return BCH_MERGE_NOMERGE;
+
+ switch (l->k.type) {
+ case KEY_TYPE_DELETED:
+ case KEY_TYPE_DISCARD:
+ case KEY_TYPE_ERROR:
+ case BCH_RESERVATION:
+ /* These types are mergeable, and no val to check */
+ break;
+
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ el = bkey_i_to_s_extent(l);
+ er = bkey_i_to_s_extent(r);
+
+ extent_for_each_entry(el, en_l) {
+ struct bch_extent_ptr *lp, *rp;
+ struct cache_member_cpu *m;
+
+ en_r = bkey_idx(er.v, (u64 *) en_l - el.v->_data);
+
+ if ((extent_entry_type(en_l) !=
+ extent_entry_type(en_r)) ||
+ extent_entry_is_crc(en_l))
+ return BCH_MERGE_NOMERGE;
+
+ lp = &en_l->ptr;
+ rp = &en_r->ptr;
+
+ if (lp->offset + el.k->size != rp->offset ||
+ lp->dev != rp->dev ||
+ lp->gen != rp->gen)
+ return BCH_MERGE_NOMERGE;
+
+ /* We don't allow extents to straddle buckets: */
+
+ m = cache_member_info_get(c)->m + lp->dev;
+ if ((lp->offset & ~((u64) m->bucket_size - 1)) !=
+ (rp->offset & ~((u64) m->bucket_size - 1))) {
+ cache_member_info_put();
+ return BCH_MERGE_NOMERGE;
+
+ }
+ cache_member_info_put();
+ }
+
+ break;
+ default:
+ return BCH_MERGE_NOMERGE;
+ }
+
+ l->k.needs_whiteout |= r->k.needs_whiteout;
+
+	/*
+	 * Keys with no pointers aren't restricted to one bucket and could
+	 * overflow KEY_SIZE:
+	 */
+ if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) {
+ bch_key_resize(&l->k, KEY_SIZE_MAX);
+ bch_cut_front(l->k.p, r);
+ return BCH_MERGE_PARTIAL;
+ }
+
+ bch_key_resize(&l->k, l->k.size + r->k.size);
+
+ return BCH_MERGE_MERGE;
+}
+
+static void extent_i_save(struct btree *b, struct bkey_packed *dst,
+ struct bkey_i *src)
+{
+ struct bkey_format *f = &b->format;
+ struct bkey_i *dst_unpacked;
+
+ BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k));
+
+ /*
+ * We don't want the bch_verify_key_order() call in extent_save(),
+ * because we may be out of order with deleted keys that are about to be
+ * removed by extent_bset_insert()
+ */
+
+ if ((dst_unpacked = packed_to_bkey(dst)))
+ bkey_copy(dst_unpacked, src);
+ else
+ BUG_ON(!bkey_pack(dst, src, f));
+}
+
+static bool extent_merge_one_overlapping(struct btree_iter *iter,
+ struct bpos new_pos,
+ struct bset_tree *t,
+ struct bkey_packed *k, struct bkey uk,
+ bool check, bool could_pack)
+{
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+
+ BUG_ON(!bkey_deleted(k));
+
+ if (check) {
+ return !bkey_packed(k) || could_pack;
+ } else {
+ uk.p = new_pos;
+ extent_save(b, node_iter, k, &uk);
+ bch_bset_fix_invalidated_key(b, t, k);
+ bch_btree_node_iter_fix(iter, b, node_iter, t,
+ k, k->u64s, k->u64s);
+ return true;
+ }
+}
+
+static bool extent_merge_do_overlapping(struct btree_iter *iter,
+ struct bkey *m, bool back_merge)
+{
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct bset_tree *t;
+ struct bkey_packed *k;
+ struct bkey uk;
+ struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m);
+ bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b);
+ bool check = true;
+
+ /*
+ * @m is the new merged extent:
+ *
+ * The merge took place in the last bset; we know there can't be any 0
+ * size extents overlapping with m there because if so they would have
+ * been between the two extents we merged.
+ *
+ * But in the other bsets, we have to check for and fix such extents:
+ */
+do_fixup:
+ for_each_bset(b, t) {
+ if (t == bset_tree_last(b))
+ break;
+
+ /*
+		 * If we don't find this bset in the iterator, we've already got
+		 * to the end of it, so start searching from the end.
+ */
+ k = bch_btree_node_iter_bset_pos(node_iter, b, t);
+
+ if (k == btree_bkey_last(b, t))
+ k = bkey_prev_all(b, t, k);
+ if (!k)
+ continue;
+
+ if (back_merge) {
+ /*
+ * Back merge: 0 size extents will be before the key
+ * that was just inserted (and thus the iterator
+ * position) - walk backwards to find them
+ */
+ for (;
+ k &&
+ (uk = bkey_unpack_key(b, k),
+ bkey_cmp(uk.p, bkey_start_pos(m)) > 0);
+ k = bkey_prev_all(b, t, k)) {
+ if (bkey_cmp(uk.p, m->p) >= 0)
+ continue;
+
+ if (!extent_merge_one_overlapping(iter, new_pos,
+ t, k, uk, check, could_pack))
+ return false;
+ }
+ } else {
+ /* Front merge - walk forwards */
+ for (;
+ k != btree_bkey_last(b, t) &&
+ (uk = bkey_unpack_key(b, k),
+ bkey_cmp(uk.p, m->p) < 0);
+ k = bkey_next(k)) {
+ if (bkey_cmp(uk.p,
+ bkey_start_pos(m)) <= 0)
+ continue;
+
+ if (!extent_merge_one_overlapping(iter, new_pos,
+ t, k, uk, check, could_pack))
+ return false;
+ }
+ }
+ }
+
+ if (check) {
+ check = false;
+ goto do_fixup;
+ }
+
+ return true;
+}
+
+/*
+ * When merging an extent that we're inserting into a btree node, the new merged
+ * extent could overlap with an existing 0 size extent - if we don't fix that,
+ * it'll break the btree node iterator, so this code finds those 0 size extents
+ * and shifts them out of the way.
+ *
+ * Also unpacks and repacks.
+ */
+static bool bch_extent_merge_inline(struct cache_set *c,
+ struct btree_iter *iter,
+ struct bkey_packed *l,
+ struct bkey_packed *r,
+ bool back_merge)
+{
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ const struct bkey_format *f = &b->format;
+ struct bset_tree *t = bset_tree_last(b);
+ struct bkey_packed *m;
+ BKEY_PADDED(k) li;
+ BKEY_PADDED(k) ri;
+ struct bkey_i *mi;
+ struct bkey tmp;
+
+ /*
+ * We need to save copies of both l and r, because we might get a
+	 * partial merge (which modifies both) and then fail to repack
+ */
+ bkey_unpack(b, &li.k, l);
+ bkey_unpack(b, &ri.k, r);
+
+ m = back_merge ? l : r;
+ mi = back_merge ? &li.k : &ri.k;
+
+ /* l & r should be in last bset: */
+ EBUG_ON(bch_bkey_to_bset(b, m) != t);
+
+ switch (bch_extent_merge(c, b, &li.k, &ri.k)) {
+ case BCH_MERGE_NOMERGE:
+ return false;
+ case BCH_MERGE_PARTIAL:
+ if (bkey_packed(m) && !bkey_pack_key((void *) &tmp, &mi->k, f))
+ return false;
+
+ if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+ return false;
+
+ extent_i_save(b, m, mi);
+ bch_bset_fix_invalidated_key(b, t, m);
+
+ /*
+ * Update iterator to reflect what we just inserted - otherwise,
+ * the iter_fix() call is going to put us _before_ the key we
+ * just partially merged with:
+ */
+ if (back_merge)
+ bch_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
+
+ bch_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
+ t, m, m->u64s, m->u64s);
+
+ if (!back_merge)
+ bkey_copy(packed_to_bkey(l), &li.k);
+ else
+ bkey_copy(packed_to_bkey(r), &ri.k);
+ return false;
+ case BCH_MERGE_MERGE:
+ if (bkey_packed(m) && !bkey_pack_key((void *) &tmp, &li.k.k, f))
+ return false;
+
+ if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+ return false;
+
+ extent_i_save(b, m, &li.k);
+ bch_bset_fix_invalidated_key(b, t, m);
+
+ bch_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
+ t, m, m->u64s, m->u64s);
+ return true;
+ default:
+ BUG();
+ }
+}
+
+const struct bkey_ops bch_bkey_extent_ops = {
+ .key_invalid = bch_extent_invalid,
+ .key_debugcheck = bch_extent_debugcheck,
+ .val_to_text = bch_extent_to_text,
+ .swab = bch_ptr_swab,
+ .key_normalize = bch_ptr_normalize,
+ .key_merge = bch_extent_merge,
+ .is_extents = true,
+};
diff --git a/libbcache/extents.h b/libbcache/extents.h
new file mode 100644
index 0000000..2dc6446
--- /dev/null
+++ b/libbcache/extents.h
@@ -0,0 +1,494 @@
+#ifndef _BCACHE_EXTENTS_H
+#define _BCACHE_EXTENTS_H
+
+#include "bkey.h"
+
+#include <linux/bcache.h>
+
+struct bch_replace_info;
+union bch_extent_crc;
+struct btree_iter;
+struct btree_insert;
+struct btree_insert_entry;
+
+struct btree_nr_keys bch_key_sort_fix_overlapping(struct bset *,
+ struct btree *,
+ struct btree_node_iter *);
+struct btree_nr_keys bch_extent_sort_fix_overlapping(struct cache_set *c,
+ struct bset *,
+ struct btree *,
+ struct btree_node_iter *);
+
+extern const struct bkey_ops bch_bkey_btree_ops;
+extern const struct bkey_ops bch_bkey_extent_ops;
+
+struct cache_set;
+struct journal_res;
+
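+/*
+ * Result of picking a pointer to read from: the chosen pointer, its checksum
+ * information normalized to the 64 bit form, and the cache device it lives
+ * on. ca is NULL if there was nothing to read, or an ERR_PTR() if no usable
+ * pointer could be found for a non-cached extent.
+ */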
+struct extent_pick_ptr {
+ struct bch_extent_crc64 crc;
+ struct bch_extent_ptr ptr;
+ struct cache *ca;
+};
+
+struct extent_pick_ptr
+bch_btree_pick_ptr(struct cache_set *, const struct btree *);
+
+void bch_extent_pick_ptr_avoiding(struct cache_set *, struct bkey_s_c,
+ struct cache *, struct extent_pick_ptr *);
+
+static inline void
+bch_extent_pick_ptr(struct cache_set *c, struct bkey_s_c k,
+ struct extent_pick_ptr *ret)
+{
+ bch_extent_pick_ptr_avoiding(c, k, NULL, ret);
+}
+
+enum extent_insert_hook_ret
+bch_extent_cmpxchg(struct extent_insert_hook *, struct bpos, struct bpos,
+ struct bkey_s_c, const struct bkey_i *);
+
+enum btree_insert_ret
+bch_insert_fixup_extent(struct btree_insert *,
+ struct btree_insert_entry *);
+
+bool bch_extent_normalize(struct cache_set *, struct bkey_s);
+
+unsigned bch_extent_nr_ptrs_from(struct bkey_s_c_extent,
+ const struct bch_extent_ptr *);
+unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent);
+
+static inline bool bkey_extent_is_data(const struct bkey *k)
+{
+ switch (k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_allocation(const struct bkey *k)
+{
+ switch (k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ case BCH_RESERVATION:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_cached(const struct bkey *k)
+{
+ return k->type == BCH_EXTENT_CACHED;
+}
+
+static inline void bkey_extent_set_cached(struct bkey *k, bool cached)
+{
+ EBUG_ON(k->type != BCH_EXTENT &&
+ k->type != BCH_EXTENT_CACHED);
+
+ k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT;
+}
+
+static inline unsigned
+__extent_entry_type(const union bch_extent_entry *e)
+{
+ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
+}
+
+static inline enum bch_extent_entry_type
+extent_entry_type(const union bch_extent_entry *e)
+{
+ int ret = __ffs(e->type);
+
+ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
+
+ return ret;
+}
+
+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
+{
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ return sizeof(struct bch_extent_crc32);
+ case BCH_EXTENT_ENTRY_crc64:
+ return sizeof(struct bch_extent_crc64);
+ case BCH_EXTENT_ENTRY_ptr:
+ return sizeof(struct bch_extent_ptr);
+ default:
+ BUG();
+ }
+}
+
+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
+{
+ return extent_entry_bytes(entry) / sizeof(u64);
+}
+
+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
+{
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
+{
+ return !extent_entry_is_ptr(e);
+}
+
+union bch_extent_crc {
+ u8 type;
+ struct bch_extent_crc32 crc32;
+ struct bch_extent_crc64 crc64;
+};
+
+/* downcast, preserves const */
+#define to_entry(_entry) \
+({ \
+ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
+ !type_is(_entry, struct bch_extent_ptr *)); \
+ \
+ __builtin_choose_expr( \
+ (type_is_exact(_entry, const union bch_extent_crc *) || \
+ type_is_exact(_entry, const struct bch_extent_ptr *)), \
+ (const union bch_extent_entry *) (_entry), \
+ (union bch_extent_entry *) (_entry)); \
+})
+
+#define __entry_to_crc(_entry) \
+ __builtin_choose_expr( \
+ type_is_exact(_entry, const union bch_extent_entry *), \
+ (const union bch_extent_crc *) (_entry), \
+ (union bch_extent_crc *) (_entry))
+
+#define entry_to_crc(_entry) \
+({ \
+ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \
+ \
+ __entry_to_crc(_entry); \
+})
+
+#define entry_to_ptr(_entry) \
+({ \
+ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
+ \
+ __builtin_choose_expr( \
+ type_is_exact(_entry, const union bch_extent_entry *), \
+ (const struct bch_extent_ptr *) (_entry), \
+ (struct bch_extent_ptr *) (_entry)); \
+})
+
+enum bch_extent_crc_type {
+ BCH_EXTENT_CRC_NONE,
+ BCH_EXTENT_CRC32,
+ BCH_EXTENT_CRC64,
+};
+
+static inline enum bch_extent_crc_type
+extent_crc_type(const union bch_extent_crc *crc)
+{
+ if (!crc)
+ return BCH_EXTENT_CRC_NONE;
+
+ switch (extent_entry_type(to_entry(crc))) {
+ case BCH_EXTENT_ENTRY_crc32:
+ return BCH_EXTENT_CRC32;
+ case BCH_EXTENT_ENTRY_crc64:
+ return BCH_EXTENT_CRC64;
+ default:
+ BUG();
+ }
+}
+
+#define extent_entry_next(_entry) \
+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+
+#define extent_entry_last(_e) \
+ bkey_idx((_e).v, bkey_val_u64s((_e).k))
+
+/* Iterate over all entries: */
+
+#define extent_for_each_entry_from(_e, _entry, _start) \
+ for ((_entry) = _start; \
+ (_entry) < extent_entry_last(_e); \
+ (_entry) = extent_entry_next(_entry))
+
+#define extent_for_each_entry(_e, _entry) \
+ extent_for_each_entry_from(_e, _entry, (_e).v->start)
+
+/* Iterate over crcs only: */
+
+#define extent_crc_next(_e, _p) \
+({ \
+ typeof(&(_e).v->start[0]) _entry = _p; \
+ \
+ while ((_entry) < extent_entry_last(_e) && \
+ !extent_entry_is_crc(_entry)) \
+ (_entry) = extent_entry_next(_entry); \
+ \
+ entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \
+})
+
+#define extent_for_each_crc(_e, _crc) \
+ for ((_crc) = extent_crc_next(_e, (_e).v->start); \
+ (_crc); \
+ (_crc) = extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
+
+/* Iterate over pointers, with crcs: */
+
+#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \
+({ \
+ __label__ out; \
+ typeof(&(_e).v->start[0]) _entry; \
+ \
+ extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \
+ if (extent_entry_is_crc(_entry)) { \
+ (_crc) = entry_to_crc(_entry); \
+ } else { \
+ _ptr = entry_to_ptr(_entry); \
+ if (_filter) \
+ goto out; \
+ } \
+ \
+ _ptr = NULL; \
+out: \
+ _ptr; \
+})
+
+#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \
+ for ((_crc) = NULL, \
+ (_ptr) = &(_e).v->start->ptr; \
+ ((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\
+ (_ptr)++)
+
+#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
+ extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true)
+
+#define extent_for_each_online_device_crc(_c, _e, _crc, _ptr, _ca) \
+ extent_for_each_ptr_crc_filter(_e, _ptr, _crc, \
+ ((_ca) = PTR_CACHE(_c, _ptr)))
+
+/* Iterate over pointers only, and from a given position: */
+
+#define extent_ptr_next_filter(_e, _ptr, _filter) \
+({ \
+ typeof(__entry_to_crc(&(_e).v->start[0])) _crc; \
+ \
+ extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \
+})
+
+#define extent_ptr_next(_e, _ptr) \
+ extent_ptr_next_filter(_e, _ptr, true)
+
+#define extent_for_each_ptr_from_filter(_e, _ptr, _start, _filter) \
+ for ((_ptr) = (_start); \
+ ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \
+ (_ptr)++)
+
+#define extent_for_each_ptr_from(_e, _ptr, _start) \
+ extent_for_each_ptr_from_filter(_e, _ptr, _start, true)
+
+#define extent_for_each_ptr(_e, _ptr) \
+ extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, true)
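+
+/*
+ * Illustrative usage (hypothetical caller, not from this patch):
+ *
+ *	struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ *	const struct bch_extent_ptr *ptr;
+ *
+ *	extent_for_each_ptr(e, ptr)
+ *		pr_debug("dev %u offset %llu", ptr->dev, (u64) ptr->offset);
+ */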
+
+#define extent_for_each_online_device(_c, _e, _ptr, _ca) \
+ extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, \
+ ((_ca) = PTR_CACHE(_c, _ptr)))
+
+#define extent_ptr_prev(_e, _ptr) \
+({ \
+ typeof(&(_e).v->start->ptr) _p; \
+ typeof(&(_e).v->start->ptr) _prev = NULL; \
+ \
+ extent_for_each_ptr(_e, _p) { \
+ if (_p == (_ptr)) \
+ break; \
+ _prev = _p; \
+ } \
+ \
+ _prev; \
+})
+
+/*
+ * Use this when you'll be dropping pointers as you iterate. Quadratic,
+ * unfortunately:
+ */
+#define extent_for_each_ptr_backwards(_e, _ptr) \
+ for ((_ptr) = extent_ptr_prev(_e, NULL); \
+ (_ptr); \
+ (_ptr) = extent_ptr_prev(_e, _ptr))
+
+void bch_extent_entry_append(struct bkey_i_extent *, union bch_extent_entry *);
+void bch_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned,
+ unsigned, u64, unsigned);
+
+static inline void extent_ptr_append(struct bkey_i_extent *e,
+ struct bch_extent_ptr ptr)
+{
+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+ bch_extent_entry_append(e, to_entry(&ptr));
+}
+
+/* XXX: inefficient */
+static inline bool bch_extent_ptr_is_dirty(const struct cache_set *c,
+ struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *ptr)
+{
+ if (bkey_extent_is_cached(e.k))
+ return false;
+
+ /* Dirty pointers come last */
+ return bch_extent_nr_ptrs_from(e, ptr) <= c->opts.data_replicas;
+}
+
+extern const unsigned bch_crc_size[];
+
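+/*
+ * Normalize any of the crc representations (none/crc32/crc64) to the widest
+ * (crc64) form, so callers don't have to switch on the variant themselves:
+ */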
+static inline struct bch_extent_crc64 crc_to_64(const struct bkey *k,
+ const union bch_extent_crc *crc)
+{
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return (struct bch_extent_crc64) {
+ .compressed_size = k->size,
+ .uncompressed_size = k->size,
+ };
+ case BCH_EXTENT_CRC32:
+ return (struct bch_extent_crc64) {
+ .compressed_size = crc->crc32.compressed_size,
+ .uncompressed_size = crc->crc32.uncompressed_size,
+ .offset = crc->crc32.offset,
+ .csum_type = crc->crc32.csum_type,
+ .compression_type = crc->crc32.compression_type,
+ .csum = crc->crc32.csum,
+ };
+ case BCH_EXTENT_CRC64:
+ return crc->crc64;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned crc_compressed_size(const struct bkey *k,
+ const union bch_extent_crc *crc)
+{
+ return crc_to_64(k, crc).compressed_size;
+}
+
+static inline unsigned crc_uncompressed_size(const struct bkey *k,
+ const union bch_extent_crc *crc)
+{
+ return crc_to_64(k, crc).uncompressed_size;
+}
+
+static inline unsigned crc_offset(const union bch_extent_crc *crc)
+{
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return 0;
+ case BCH_EXTENT_CRC32:
+ return crc->crc32.offset;
+ case BCH_EXTENT_CRC64:
+ return crc->crc64.offset;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned crc_csum_type(const union bch_extent_crc *crc)
+{
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return 0;
+ case BCH_EXTENT_CRC32:
+ return crc->crc32.csum_type;
+ case BCH_EXTENT_CRC64:
+ return crc->crc64.csum_type;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned crc_compression_type(const union bch_extent_crc *crc)
+{
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return 0;
+ case BCH_EXTENT_CRC32:
+ return crc->crc32.compression_type;
+ case BCH_EXTENT_CRC64:
+ return crc->crc64.compression_type;
+ default:
+ BUG();
+ }
+}
+
+static inline u64 crc_csum(const union bch_extent_crc *crc)
+{
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return 0;
+ case BCH_EXTENT_CRC32:
+ return crc->crc32.csum;
+ case BCH_EXTENT_CRC64:
+ return crc->crc64.csum;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned bkey_extent_is_compressed(struct cache_set *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_extent e;
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+ unsigned ret = 0;
+
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ e = bkey_s_c_to_extent(k);
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (bch_extent_ptr_is_dirty(c, e, ptr) &&
+ crc_compression_type(crc) != BCH_COMPRESSION_NONE &&
+ crc_compressed_size(e.k, crc) < k.k->size)
+ ret = max_t(unsigned, ret,
+ crc_compressed_size(e.k, crc));
+ }
+
+ return ret;
+}
+
+void bch_extent_narrow_crcs(struct bkey_s_extent);
+void bch_extent_drop_redundant_crcs(struct bkey_s_extent);
+
+/* Doesn't cleanup redundant crcs */
+static inline void __bch_extent_drop_ptr(struct bkey_s_extent e,
+ struct bch_extent_ptr *ptr)
+{
+ EBUG_ON(ptr < &e.v->start->ptr ||
+ ptr >= &extent_entry_last(e)->ptr);
+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+ memmove_u64s_down(ptr, ptr + 1,
+ (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
+ e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+}
+
+static inline void bch_extent_drop_ptr(struct bkey_s_extent e,
+ struct bch_extent_ptr *ptr)
+{
+ __bch_extent_drop_ptr(e, ptr);
+ bch_extent_drop_redundant_crcs(e);
+}
+
+bool bch_extent_has_device(struct bkey_s_c_extent, unsigned);
+
+bool bch_cut_front(struct bpos, struct bkey_i *);
+bool bch_cut_back(struct bpos, struct bkey *);
+void bch_key_resize(struct bkey *, unsigned);
+
+#endif /* _BCACHE_EXTENTS_H */
diff --git a/libbcache/eytzinger.h b/libbcache/eytzinger.h
new file mode 100644
index 0000000..13d54e5
--- /dev/null
+++ b/libbcache/eytzinger.h
@@ -0,0 +1,196 @@
+#ifndef _EYTZINGER_H
+#define _EYTZINGER_H
+
+#include <linux/bitops.h>
+#include <linux/log2.h>
+
+#include "util.h"
+
+/*
+ * Traversal for trees in eytzinger layout - a full binary tree laid out in an
+ * array.
+ *
+ * We use one based indexing, not zero based: with one based indexing, each
+ * level of the tree starts at a power of two - leading to better alignment -
+ * and it's what you want for implementing next/prev and to/from inorder.
+ *
+ * To/from inorder also uses 1 based indexing.
+ *
+ * Size parameter is treated as if we were using 0 based indexing, however:
+ * valid nodes, and inorder indices, are in the range [1..size)
+ */
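+
+/*
+ * Illustration: with size 8 (valid nodes 1..7), the tree
+ *
+ *                 1
+ *              2     3
+ *             4 5   6 7
+ *
+ * is stored at array indices 1..7 - node j's children are at 2j and 2j + 1,
+ * and each level starts at a power of two.
+ */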
+
+static inline unsigned eytzinger_child(unsigned j, unsigned child)
+{
+ EBUG_ON(child > 1);
+
+ return (j << 1) + child;
+}
+
+static inline unsigned eytzinger_left_child(unsigned j)
+{
+ return eytzinger_child(j, 0);
+}
+
+static inline unsigned eytzinger_right_child(unsigned j)
+{
+ return eytzinger_child(j, 1);
+}
+
+static inline unsigned eytzinger_first(unsigned size)
+{
+ return rounddown_pow_of_two(size - 1);
+}
+
+static inline unsigned eytzinger_last(unsigned size)
+{
+ return rounddown_pow_of_two(size) - 1;
+}
+
+/*
+ * eytzinger_next() and eytzinger_prev() have the nice properties that
+ *
+ * eytzinger_next(0) == eytzinger_first()
+ * eytzinger_prev(0) == eytzinger_last()
+ *
+ * eytzinger_prev(eytzinger_first()) == 0
+ * eytzinger_next(eytzinger_last()) == 0
+ */
+
+static inline unsigned eytzinger_next(unsigned j, unsigned size)
+{
+ EBUG_ON(j >= size);
+
+ if (eytzinger_right_child(j) < size) {
+ j = eytzinger_right_child(j);
+
+ j <<= __fls(size) - __fls(j);
+ j >>= j >= size;
+ } else {
+ j >>= ffz(j) + 1;
+ }
+
+ return j;
+}
+
+static inline unsigned eytzinger_prev(unsigned j, unsigned size)
+{
+ EBUG_ON(j >= size);
+
+ if (eytzinger_left_child(j) < size) {
+ j = eytzinger_left_child(j);
+
+ j <<= __fls(size) - __fls(j);
+ j -= 1;
+ j >>= j >= size;
+ } else {
+ j >>= __ffs(j) + 1;
+ }
+
+ return j;
+}
+
+static inline unsigned eytzinger_extra(unsigned size)
+{
+ return (size - rounddown_pow_of_two(size - 1)) << 1;
+}
+
+static inline unsigned __eytzinger_to_inorder(unsigned j, unsigned size,
+ unsigned extra)
+{
+ unsigned b = __fls(j);
+ unsigned shift = __fls(size - 1) - b;
+ int s;
+
+ EBUG_ON(!j || j >= size);
+
+ j ^= 1U << b;
+ j <<= 1;
+ j |= 1;
+ j <<= shift;
+
+ /*
+ * sign bit trick:
+ *
+ * if (j > extra)
+ * j -= (j - extra) >> 1;
+ */
+ s = extra - j;
+ j += (s >> 1) & (s >> 31);
+
+ return j;
+}
+
+static inline unsigned __inorder_to_eytzinger(unsigned j, unsigned size,
+ unsigned extra)
+{
+ unsigned shift;
+ int s;
+
+ EBUG_ON(!j || j >= size);
+
+ /*
+ * sign bit trick:
+ *
+ * if (j > extra)
+ * j += j - extra;
+ */
+ s = extra - j;
+ j -= s & (s >> 31);
+
+ shift = __ffs(j);
+
+ j >>= shift + 1;
+ j |= 1U << (__fls(size - 1) - shift);
+
+ return j;
+}
+
+static inline unsigned eytzinger_to_inorder(unsigned j, unsigned size)
+{
+ return __eytzinger_to_inorder(j, size, eytzinger_extra(size));
+}
+
+static inline unsigned inorder_to_eytzinger(unsigned j, unsigned size)
+{
+ return __inorder_to_eytzinger(j, size, eytzinger_extra(size));
+}
+
+#define eytzinger_for_each(_i, _size) \
+ for ((_i) = eytzinger_first((_size)); \
+ (_i) != 0; \
+ (_i) = eytzinger_next((_i), (_size)))
+
+#if 0
+void eytzinger_test(void)
+{
+ unsigned i, j, size;
+
+ for (size = 2;
+ size < 65536000;
+ size++) {
+ if (!(size % 4096))
+ printk(KERN_INFO "tree size %u\n", size);
+
+ assert(eytzinger_prev(0, size) == eytzinger_last(size));
+ assert(eytzinger_next(0, size) == eytzinger_first(size));
+
+ assert(eytzinger_prev(eytzinger_first(size), size) == 0);
+ assert(eytzinger_next(eytzinger_last(size), size) == 0);
+
+ eytzinger_for_each(j, size) {
+			i = eytzinger_to_inorder(j, size);
+			assert(inorder_to_eytzinger(i, size) == j);
+
+ if (j != eytzinger_last(size)) {
+ unsigned next = eytzinger_next(j, size);
+
+ assert(eytzinger_prev(next, size) == j);
+ }
+ }
+ }
+
+}
+#endif
+
+#endif /* _EYTZINGER_H */
diff --git a/libbcache/fifo.h b/libbcache/fifo.h
new file mode 100644
index 0000000..2908ca2
--- /dev/null
+++ b/libbcache/fifo.h
@@ -0,0 +1,123 @@
+#ifndef _BCACHE_FIFO_H
+#define _BCACHE_FIFO_H
+
+#define DECLARE_FIFO(type, name) \
+ struct { \
+ size_t front, back, size, mask; \
+ type *data; \
+ } name
+
+#define init_fifo(fifo, _size, _gfp) \
+({ \
+ bool _ret = true; \
+ gfp_t gfp_flags = (_gfp); \
+ \
+ if (gfp_flags & GFP_KERNEL) \
+ gfp_flags |= __GFP_NOWARN; \
+ \
+ (fifo)->size = (_size); \
+ (fifo)->front = (fifo)->back = 0; \
+ (fifo)->data = NULL; \
+ \
+ if ((fifo)->size) { \
+ size_t _allocated_size, _bytes; \
+ \
+ _allocated_size = roundup_pow_of_two((fifo)->size); \
+ _bytes = _allocated_size * sizeof(*(fifo)->data); \
+ \
+ (fifo)->mask = _allocated_size - 1; \
+ \
+ if (_bytes < KMALLOC_MAX_SIZE) \
+ (fifo)->data = kmalloc(_bytes, gfp_flags); \
+ if ((!(fifo)->data) && (gfp_flags & GFP_KERNEL)) \
+ (fifo)->data = vmalloc(_bytes); \
+ if ((!(fifo)->data)) \
+ _ret = false; \
+ } \
+ _ret; \
+})
+
+#define free_fifo(fifo) \
+do { \
+ kvfree((fifo)->data); \
+ (fifo)->data = NULL; \
+} while (0)
+
+#define fifo_swap(l, r) \
+do { \
+ swap((l)->front, (r)->front); \
+ swap((l)->back, (r)->back); \
+ swap((l)->size, (r)->size); \
+ swap((l)->mask, (r)->mask); \
+ swap((l)->data, (r)->data); \
+} while (0)
+
+#define fifo_move(dest, src) \
+do { \
+ typeof(*((dest)->data)) _t; \
+ while (!fifo_full(dest) && \
+ fifo_pop(src, _t)) \
+ fifo_push(dest, _t); \
+} while (0)
+
+#define fifo_used(fifo) (((fifo)->back - (fifo)->front))
+#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
+
+#define fifo_empty(fifo) ((fifo)->front == (fifo)->back)
+#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size)
+
+#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask])
+#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+
+#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
+
+#define fifo_push_back(fifo, i) \
+({ \
+ bool _r = !fifo_full((fifo)); \
+ if (_r) \
+ (fifo)->data[(fifo)->back++ & (fifo)->mask] = (i); \
+ _r; \
+})
+
+#define fifo_pop_front(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) \
+ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \
+ _r; \
+})
+
+#define fifo_push_front(fifo, i) \
+({ \
+ bool _r = !fifo_full((fifo)); \
+ if (_r) \
+ (fifo)->data[--(fifo)->front & (fifo)->mask] = (i); \
+ _r; \
+})
+
+#define fifo_pop_back(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) \
+		(i) = (fifo)->data[--(fifo)->back & (fifo)->mask];	\
+ _r; \
+})
+
+#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
+#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
+#define fifo_peek(fifo) fifo_peek_front(fifo)
+
+#define fifo_for_each_entry(_entry, _fifo, _iter) \
+ for (_iter = (_fifo)->front; \
+ ((_iter != (_fifo)->back) && \
+ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
+ _iter++)
+
+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
+ for (_iter = (_fifo)->front; \
+ ((_iter != (_fifo)->back) && \
+ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
+ _iter++)
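+
+/*
+ * Usage sketch (illustrative only - the fifo name is made up): a fifo of
+ * u64s. The backing allocation is rounded up to a power of two, and
+ * init_fifo() returns false on allocation failure:
+ *
+ *	DECLARE_FIFO(u64, seqs);
+ *	u64 seq;
+ *
+ *	if (!init_fifo(&seqs, 8, GFP_KERNEL))
+ *		return -ENOMEM;
+ *
+ *	fifo_push(&seqs, 1);
+ *	fifo_push(&seqs, 2);
+ *
+ *	while (fifo_pop(&seqs, seq))
+ *		pr_debug("%llu", seq);
+ *
+ *	free_fifo(&seqs);
+ */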
+
+#endif /* _BCACHE_FIFO_H */
+
diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c
new file mode 100644
index 0000000..bd2a867
--- /dev/null
+++ b/libbcache/fs-gc.c
@@ -0,0 +1,475 @@
+
+#include "bcache.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-gc.h"
+#include "inode.h"
+#include "keylist.h"
+#include "super.h"
+
+#include <linux/generic-radix-tree.h>
+
+struct nlink {
+ u32 count;
+ u32 dir_count;
+};
+
+DECLARE_GENRADIX_TYPE(nlinks, struct nlink);
+
+static void inc_link(struct cache_set *c, struct nlinks *links,
+ u64 range_start, u64 *range_end,
+ u64 inum, bool dir)
+{
+ struct nlink *link;
+
+ if (inum < range_start || inum >= *range_end)
+ return;
+
+ link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
+ if (!link) {
+ bch_verbose(c, "allocation failed during fs gc - will need another pass");
+ *range_end = inum;
+ return;
+ }
+
+ if (dir)
+ link->dir_count++;
+ else
+ link->count++;
+}
+
+/*
+ * XXX: should do a DFS (via filesystem hierarchy), and make sure all dirents
+ * are reachable
+ */
+
+noinline_for_stack
+static int bch_gc_walk_dirents(struct cache_set *c, struct nlinks *links,
+ u64 range_start, u64 *range_end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ u64 d_inum;
+ int ret;
+
+ inc_link(c, links, range_start, range_end, BCACHE_ROOT_INO, false);
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, k) {
+ switch (k.k->type) {
+ case BCH_DIRENT:
+ d = bkey_s_c_to_dirent(k);
+ d_inum = le64_to_cpu(d.v->d_inum);
+
+ if (d.v->d_type == DT_DIR)
+ inc_link(c, links, range_start, range_end,
+ d.k->p.inode, true);
+
+ inc_link(c, links, range_start, range_end,
+ d_inum, false);
+
+ break;
+ }
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ bch_err(c, "error in fs gc: btree error %i while walking dirents", ret);
+
+ return ret;
+}
+
+s64 bch_count_inode_sectors(struct cache_set *c, u64 inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 sectors = 0;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), k) {
+ if (k.k->p.inode != inum)
+ break;
+
+ if (bkey_extent_is_allocation(k.k))
+ sectors += k.k->size;
+ }
+
+ return bch_btree_iter_unlock(&iter) ?: sectors;
+}
+
+static int bch_gc_do_inode(struct cache_set *c, struct btree_iter *iter,
+ struct bkey_s_c_inode inode, struct nlink link)
+{
+ u16 i_mode = le16_to_cpu(inode.v->i_mode);
+ u32 i_flags = le32_to_cpu(inode.v->i_flags);
+ u32 i_nlink = le32_to_cpu(inode.v->i_nlink);
+ u64 i_size = le64_to_cpu(inode.v->i_size);
+ s64 i_sectors = 0;
+ int ret = 0;
+ u32 real_i_nlink;
+
+ fsck_err_on(i_nlink < link.count, c,
+		"inode %llu i_nlink too small (%u < %u, type %i)",
+ inode.k->p.inode, i_nlink,
+ link.count, mode_to_type(i_mode));
+
+ if (S_ISDIR(i_mode)) {
+ unfixable_fsck_err_on(link.count > 1, c,
+ "directory %llu with multiple hardlinks: %u",
+ inode.k->p.inode, link.count);
+
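+		/*
+		 * link.count should be 1 here (the dirent in the parent
+		 * directory); doubling it accounts for ".", and dir_count
+		 * (one per subdirectory dirent) accounts for the ".."s:
+		 */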
+ real_i_nlink = link.count * 2 + link.dir_count;
+ } else {
+ unfixable_fsck_err_on(link.dir_count, c,
+ "found dirents for non directory %llu",
+ inode.k->p.inode);
+
+ real_i_nlink = link.count + link.dir_count;
+ }
+
+ if (!link.count) {
+ fsck_err_on(c->sb.clean, c,
+ "filesystem marked clean, "
+ "but found orphaned inode %llu",
+ inode.k->p.inode);
+
+ unfixable_fsck_err_on(S_ISDIR(i_mode) &&
+ bch_empty_dir(c, inode.k->p.inode), c,
+ "non empty directory with link count 0, "
+ "inode nlink %u, dir links found %u",
+ i_nlink, link.dir_count);
+
+ bch_verbose(c, "deleting inode %llu", inode.k->p.inode);
+
+ ret = bch_inode_rm(c, inode.k->p.inode);
+ if (ret)
+ bch_err(c, "error in fs gc: error %i "
+ "while deleting inode", ret);
+ return ret;
+ }
+
+ if (i_flags & BCH_INODE_I_SIZE_DIRTY) {
+ fsck_err_on(c->sb.clean, c,
+ "filesystem marked clean, "
+ "but inode %llu has i_size dirty",
+ inode.k->p.inode);
+
+ bch_verbose(c, "truncating inode %llu", inode.k->p.inode);
+
+ /*
+ * XXX: need to truncate partial blocks too here - or ideally
+ * just switch units to bytes and that issue goes away
+ */
+
+ ret = bch_inode_truncate(c, inode.k->p.inode,
+ round_up(i_size, PAGE_SIZE) >> 9,
+ NULL, NULL);
+ if (ret) {
+ bch_err(c, "error in fs gc: error %i "
+ "truncating inode", ret);
+ return ret;
+ }
+
+ /*
+ * We truncated without our normal sector accounting hook, just
+ * make sure we recalculate it:
+ */
+ i_flags |= BCH_INODE_I_SECTORS_DIRTY;
+ }
+
+ if (i_flags & BCH_INODE_I_SECTORS_DIRTY) {
+ fsck_err_on(c->sb.clean, c,
+ "filesystem marked clean, "
+ "but inode %llu has i_sectors dirty",
+ inode.k->p.inode);
+
+ bch_verbose(c, "recounting sectors for inode %llu",
+ inode.k->p.inode);
+
+ i_sectors = bch_count_inode_sectors(c, inode.k->p.inode);
+ if (i_sectors < 0) {
+ bch_err(c, "error in fs gc: error %i "
+ "recounting inode sectors",
+ (int) i_sectors);
+ return i_sectors;
+ }
+ }
+
+ if (i_nlink != real_i_nlink) {
+ fsck_err_on(c->sb.clean, c,
+ "filesystem marked clean, "
+ "but inode %llu has wrong i_nlink "
+ "(type %u i_nlink %u, should be %u)",
+ inode.k->p.inode, mode_to_type(i_mode),
+ i_nlink, real_i_nlink);
+
+ bch_verbose(c, "setting inode %llu nlinks from %u to %u",
+ inode.k->p.inode, i_nlink, real_i_nlink);
+ }
+
+	if (i_nlink != real_i_nlink ||
+ i_flags & BCH_INODE_I_SECTORS_DIRTY ||
+ i_flags & BCH_INODE_I_SIZE_DIRTY) {
+ struct bkey_i_inode update;
+
+ bkey_reassemble(&update.k_i, inode.s_c);
+ update.v.i_nlink = cpu_to_le32(real_i_nlink);
+ update.v.i_flags = cpu_to_le32(i_flags &
+ ~(BCH_INODE_I_SIZE_DIRTY|
+ BCH_INODE_I_SECTORS_DIRTY));
+
+ if (i_flags & BCH_INODE_I_SECTORS_DIRTY)
+ update.v.i_sectors = cpu_to_le64(i_sectors);
+
+ ret = bch_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(iter, &update.k_i));
+ if (ret && ret != -EINTR)
+ bch_err(c, "error in fs gc: error %i "
+ "updating inode", ret);
+ }
+fsck_err:
+ return ret;
+}
+
+noinline_for_stack
+static int bch_gc_walk_inodes(struct cache_set *c, struct nlinks *links,
+ u64 range_start, u64 range_end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct nlink *link, zero_links = { 0, 0 };
+ struct genradix_iter nlinks_iter;
+ int ret = 0, ret2 = 0;
+ u64 nlinks_pos;
+
+ bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0));
+ genradix_iter_init(&nlinks_iter);
+
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !btree_iter_err(k)) {
+peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
+
+ if (!link && (!k.k || iter.pos.inode >= range_end))
+ break;
+
+ nlinks_pos = range_start + nlinks_iter.pos;
+ if (iter.pos.inode > nlinks_pos) {
+ unfixable_fsck_err_on(link && link->count, c,
+ "missing inode %llu (nlink %u)",
+ nlinks_pos, link->count);
+ genradix_iter_advance(&nlinks_iter, links);
+ goto peek_nlinks;
+ }
+
+ if (iter.pos.inode < nlinks_pos || !link)
+ link = &zero_links;
+
+ if (k.k && k.k->type == BCH_INODE_FS) {
+ /*
+ * Avoid potential deadlocks with iter for
+ * truncate/rm/etc.:
+ */
+ bch_btree_iter_unlock(&iter);
+
+ ret = bch_gc_do_inode(c, &iter,
+ bkey_s_c_to_inode(k),
+ *link);
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
+
+ if (link->count)
+ atomic_long_inc(&c->nr_inodes);
+ } else {
+ unfixable_fsck_err_on(link->count, c,
+ "missing inode %llu (nlink %u)",
+ nlinks_pos, link->count);
+ }
+
+ if (nlinks_pos == iter.pos.inode)
+ genradix_iter_advance(&nlinks_iter, links);
+
+ bch_btree_iter_advance_pos(&iter);
+ bch_btree_iter_cond_resched(&iter);
+ }
+fsck_err:
+ ret2 = bch_btree_iter_unlock(&iter);
+ if (ret2)
+ bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2);
+
+ return ret ?: ret2;
+}
+
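+/*
+ * Link counts are accumulated in an in memory radix tree; if an allocation
+ * fails partway through, inc_link() clamps range_end and we make another pass
+ * over the dirents and inodes for the remaining inode range:
+ */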
+int bch_gc_inode_nlinks(struct cache_set *c)
+{
+ struct nlinks links;
+ u64 this_iter_range_start, next_iter_range_start = 0;
+ int ret = 0;
+
+ genradix_init(&links);
+
+ do {
+ this_iter_range_start = next_iter_range_start;
+ next_iter_range_start = U64_MAX;
+
+ ret = bch_gc_walk_dirents(c, &links,
+ this_iter_range_start,
+ &next_iter_range_start);
+ if (ret)
+ break;
+
+ ret = bch_gc_walk_inodes(c, &links,
+ this_iter_range_start,
+ next_iter_range_start);
+ if (ret)
+ break;
+
+ genradix_free(&links);
+ } while (next_iter_range_start != U64_MAX);
+
+ genradix_free(&links);
+
+ return ret;
+}
+
+static void next_inode(struct cache_set *c, u64 inum, u64 *cur_inum,
+ struct bkey_i_inode *inode,
+ bool *first_this_inode, bool *have_inode,
+ u64 *i_size, u16 *i_mode)
+{
+ *first_this_inode = inum != *cur_inum;
+ *cur_inum = inum;
+
+ if (*first_this_inode) {
+ *have_inode = !bch_inode_find_by_inum(c, inum, inode);
+
+ if (*have_inode) {
+ *i_mode = le16_to_cpu(inode->v.i_mode);
+ *i_size = le64_to_cpu(inode->v.i_size);
+ }
+ }
+}
+
+/*
+ * Checks for inconsistencies that shouldn't happen, unless we have a bug.
+ * Doesn't fix them yet, mainly because they haven't yet been observed:
+ */
+int bch_fsck(struct cache_set *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_inode inode;
+ bool first_this_inode, have_inode;
+ u64 cur_inum, i_sectors;
+ u64 i_size = 0;
+ u16 i_mode = 0;
+ int ret = 0;
+
+ cur_inum = -1;
+ have_inode = false;
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(BCACHE_ROOT_INO, 0), k) {
+ if (k.k->type == KEY_TYPE_DISCARD)
+ continue;
+
+ next_inode(c, k.k->p.inode, &cur_inum, &inode,
+ &first_this_inode, &have_inode,
+ &i_size, &i_mode);
+
+ unfixable_fsck_err_on(!have_inode, c,
+ "extent type %u for missing inode %llu",
+ k.k->type, k.k->p.inode);
+
+ unfixable_fsck_err_on(first_this_inode && have_inode &&
+ le64_to_cpu(inode.v.i_sectors) !=
+ (i_sectors = bch_count_inode_sectors(c, cur_inum)),
+ c, "i_sectors wrong: got %llu, should be %llu",
+ le64_to_cpu(inode.v.i_sectors), i_sectors);
+
+ unfixable_fsck_err_on(have_inode &&
+ !S_ISREG(i_mode) && !S_ISLNK(i_mode), c,
+ "extent type %u for non regular file, inode %llu mode %o",
+ k.k->type, k.k->p.inode, i_mode);
+
+ unfixable_fsck_err_on(k.k->type != BCH_RESERVATION &&
+ k.k->p.offset > round_up(i_size, PAGE_SIZE) >> 9, c,
+ "extent type %u offset %llu past end of inode %llu, i_size %llu",
+ k.k->type, k.k->p.offset, k.k->p.inode, i_size);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ cur_inum = -1;
+ have_inode = false;
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
+ POS(BCACHE_ROOT_INO, 0), k) {
+ struct bkey_s_c_dirent d;
+ struct bkey_i_inode target;
+ bool have_target;
+ u64 d_inum;
+
+ next_inode(c, k.k->p.inode, &cur_inum, &inode,
+ &first_this_inode, &have_inode,
+ &i_size, &i_mode);
+
+ unfixable_fsck_err_on(!have_inode, c,
+ "dirent in nonexisting directory %llu",
+ k.k->p.inode);
+
+ unfixable_fsck_err_on(!S_ISDIR(i_mode), c,
+ "dirent in non directory inode %llu, type %u",
+ k.k->p.inode, mode_to_type(i_mode));
+
+ if (k.k->type != BCH_DIRENT)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ d_inum = le64_to_cpu(d.v->d_inum);
+
+ unfixable_fsck_err_on(d_inum == d.k->p.inode, c,
+ "dirent points to own directory");
+
+ have_target = !bch_inode_find_by_inum(c, d_inum, &target);
+
+ unfixable_fsck_err_on(!have_target, c,
+ "dirent points to missing inode %llu, type %u filename %s",
+ d_inum, d.v->d_type, d.v->d_name);
+
+ unfixable_fsck_err_on(have_target &&
+ d.v->d_type !=
+ mode_to_type(le16_to_cpu(target.v.i_mode)), c,
+ "incorrect d_type: got %u should be %u, filename %s",
+ d.v->d_type,
+ mode_to_type(le16_to_cpu(target.v.i_mode)),
+ d.v->d_name);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ cur_inum = -1;
+ have_inode = false;
+ for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
+ POS(BCACHE_ROOT_INO, 0), k) {
+ next_inode(c, k.k->p.inode, &cur_inum, &inode,
+ &first_this_inode, &have_inode,
+ &i_size, &i_mode);
+
+ unfixable_fsck_err_on(!have_inode, c,
+ "xattr for missing inode %llu",
+ k.k->p.inode);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ return 0;
+fsck_err:
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
diff --git a/libbcache/fs-gc.h b/libbcache/fs-gc.h
new file mode 100644
index 0000000..c44086c
--- /dev/null
+++ b/libbcache/fs-gc.h
@@ -0,0 +1,8 @@
+#ifndef _BCACHE_FS_GC_H
+#define _BCACHE_FS_GC_H
+
+s64 bch_count_inode_sectors(struct cache_set *, u64);
+int bch_gc_inode_nlinks(struct cache_set *);
+int bch_fsck(struct cache_set *);
+
+#endif /* _BCACHE_FS_GC_H */
diff --git a/libbcache/fs-io.c b/libbcache/fs-io.c
new file mode 100644
index 0000000..942baeb
--- /dev/null
+++ b/libbcache/fs-io.c
@@ -0,0 +1,2457 @@
+
+#include "bcache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-gc.h"
+#include "fs-io.h"
+#include "inode.h"
+#include "journal.h"
+#include "io.h"
+#include "keylist.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/migrate.h>
+#include <linux/mmu_context.h>
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+#include <trace/events/writeback.h>
+
+struct bio_set *bch_writepage_bioset;
+struct bio_set *bch_dio_read_bioset;
+struct bio_set *bch_dio_write_bioset;
+
+/* pagecache_block must be held */
+static int write_invalidate_inode_pages_range(struct address_space *mapping,
+ loff_t start, loff_t end)
+{
+ int ret;
+
+ /*
+ * XXX: the way this is currently implemented, we can spin if a process
+ * is continually redirtying a specific page
+ */
+ do {
+ if (!mapping->nrpages &&
+ !mapping->nrexceptional)
+ return 0;
+
+ ret = filemap_write_and_wait_range(mapping, start, end);
+ if (ret)
+ break;
+
+ if (!mapping->nrpages)
+ return 0;
+
+ ret = invalidate_inode_pages2_range(mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ } while (ret == -EBUSY);
+
+ return ret;
+}
+
+/* i_size updates: */
+
+static int inode_set_size(struct bch_inode_info *ei, struct bch_inode *bi,
+ void *p)
+{
+ loff_t *new_i_size = p;
+ unsigned i_flags = le32_to_cpu(bi->i_flags);
+
+ lockdep_assert_held(&ei->update_lock);
+
+ bi->i_size = cpu_to_le64(*new_i_size);
+
+ if (atomic_long_read(&ei->i_size_dirty_count))
+ i_flags |= BCH_INODE_I_SIZE_DIRTY;
+ else
+ i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+
+ bi->i_flags = cpu_to_le32(i_flags);
+
+ return 0;
+}
+
+static int __must_check bch_write_inode_size(struct cache_set *c,
+ struct bch_inode_info *ei,
+ loff_t new_size)
+{
+ return __bch_write_inode(c, ei, inode_set_size, &new_size);
+}
+
+static inline void i_size_dirty_put(struct bch_inode_info *ei)
+{
+ atomic_long_dec_bug(&ei->i_size_dirty_count);
+}
+
+static inline void i_size_dirty_get(struct bch_inode_info *ei)
+{
+ lockdep_assert_held(&ei->vfs_inode.i_rwsem);
+
+ atomic_long_inc(&ei->i_size_dirty_count);
+}
+
+/* i_sectors accounting: */
+
+static enum extent_insert_hook_ret
+i_sectors_hook_fn(struct extent_insert_hook *hook,
+ struct bpos committed_pos,
+ struct bpos next_pos,
+ struct bkey_s_c k,
+ const struct bkey_i *insert)
+{
+ struct i_sectors_hook *h = container_of(hook,
+ struct i_sectors_hook, hook);
+ s64 sectors = next_pos.offset - committed_pos.offset;
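+	/*
+	 * +1 if we're inserting an allocation where there wasn't one, -1 if
+	 * we're overwriting an allocation with a non allocation, 0 otherwise:
+	 */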
+ int sign = bkey_extent_is_allocation(&insert->k) -
+ (k.k && bkey_extent_is_allocation(k.k));
+
+ EBUG_ON(!(h->ei->i_flags & BCH_INODE_I_SECTORS_DIRTY));
+ EBUG_ON(!atomic_long_read(&h->ei->i_sectors_dirty_count));
+
+ h->sectors += sectors * sign;
+
+ return BTREE_HOOK_DO_INSERT;
+}
+
+static int inode_set_i_sectors_dirty(struct bch_inode_info *ei,
+ struct bch_inode *bi, void *p)
+{
+ BUG_ON(le32_to_cpu(bi->i_flags) & BCH_INODE_I_SECTORS_DIRTY);
+
+ bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags)|
+ BCH_INODE_I_SECTORS_DIRTY);
+ return 0;
+}
+
+static int inode_clear_i_sectors_dirty(struct bch_inode_info *ei,
+ struct bch_inode *bi, void *p)
+{
+ BUG_ON(!(le32_to_cpu(bi->i_flags) & BCH_INODE_I_SECTORS_DIRTY));
+
+ bi->i_sectors = cpu_to_le64(atomic64_read(&ei->i_sectors));
+ bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags) &
+ ~BCH_INODE_I_SECTORS_DIRTY);
+ return 0;
+}
+
+static void i_sectors_dirty_put(struct bch_inode_info *ei,
+ struct i_sectors_hook *h)
+{
+ struct inode *inode = &ei->vfs_inode;
+
+ if (h->sectors) {
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += h->sectors;
+ spin_unlock(&inode->i_lock);
+
+ atomic64_add(h->sectors, &ei->i_sectors);
+ EBUG_ON(atomic64_read(&ei->i_sectors) < 0);
+ }
+
+ EBUG_ON(atomic_long_read(&ei->i_sectors_dirty_count) <= 0);
+
+ mutex_lock(&ei->update_lock);
+
+ if (atomic_long_dec_and_test(&ei->i_sectors_dirty_count)) {
+ struct cache_set *c = ei->vfs_inode.i_sb->s_fs_info;
+ int ret = __bch_write_inode(c, ei, inode_clear_i_sectors_dirty, NULL);
+
+ ret = ret;
+ }
+
+ mutex_unlock(&ei->update_lock);
+}
+
+static int __must_check i_sectors_dirty_get(struct bch_inode_info *ei,
+ struct i_sectors_hook *h)
+{
+ int ret = 0;
+
+ h->hook.fn = i_sectors_hook_fn;
+ h->sectors = 0;
+#ifdef CONFIG_BCACHE_DEBUG
+ h->ei = ei;
+#endif
+
+ if (atomic_long_inc_not_zero(&ei->i_sectors_dirty_count))
+ return 0;
+
+ mutex_lock(&ei->update_lock);
+
+ if (!(ei->i_flags & BCH_INODE_I_SECTORS_DIRTY)) {
+ struct cache_set *c = ei->vfs_inode.i_sb->s_fs_info;
+
+ ret = __bch_write_inode(c, ei, inode_set_i_sectors_dirty, NULL);
+ }
+
+ if (!ret)
+ atomic_long_inc(&ei->i_sectors_dirty_count);
+
+ mutex_unlock(&ei->update_lock);
+
+ return ret;
+}
+
+struct bchfs_extent_trans_hook {
+ struct bchfs_write_op *op;
+ struct extent_insert_hook hook;
+ struct bkey_i_inode new_inode;
+ bool need_inode_update;
+};
+
+static enum extent_insert_hook_ret
+bchfs_extent_update_hook(struct extent_insert_hook *hook,
+ struct bpos committed_pos,
+ struct bpos next_pos,
+ struct bkey_s_c k,
+ const struct bkey_i *insert)
+{
+ struct bchfs_extent_trans_hook *h = container_of(hook,
+ struct bchfs_extent_trans_hook, hook);
+ struct bch_inode_info *ei = h->op->ei;
+ struct inode *inode = &ei->vfs_inode;
+ int sign = bkey_extent_is_allocation(&insert->k) -
+ (k.k && bkey_extent_is_allocation(k.k));
+ s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign;
+ u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
+
+ BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
+
+ /* XXX: ei->i_size locking */
+ if (offset > ei->i_size) {
+ BUG_ON(ei->i_flags & BCH_INODE_I_SIZE_DIRTY);
+
+ if (!h->need_inode_update) {
+ h->need_inode_update = true;
+ return BTREE_HOOK_RESTART_TRANS;
+ }
+
+ h->new_inode.v.i_size = cpu_to_le64(offset);
+ ei->i_size = offset;
+
+ if (h->op->is_dio)
+ i_size_write(inode, offset);
+ }
+
+ if (sectors) {
+ if (!h->need_inode_update) {
+ h->need_inode_update = true;
+ return BTREE_HOOK_RESTART_TRANS;
+ }
+
+ le64_add_cpu(&h->new_inode.v.i_sectors, sectors);
+ atomic64_add(sectors, &ei->i_sectors);
+
+ h->op->sectors_added += sectors;
+
+ if (h->op->is_dio) {
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += sectors;
+ spin_unlock(&inode->i_lock);
+ }
+ }
+
+ return BTREE_HOOK_DO_INSERT;
+}
+
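+/*
+ * After writing the data, update the extents btree - and the inode, if the
+ * extent hook discovers mid insert that i_size or i_sectors also needs
+ * updating: in that case it returns BTREE_HOOK_RESTART_TRANS and we retry the
+ * insert as a two entry transaction that updates the inode along with the
+ * extent.
+ */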
+static int bchfs_write_index_update(struct bch_write_op *wop)
+{
+ struct bchfs_write_op *op = container_of(wop,
+ struct bchfs_write_op, op);
+ struct keylist *keys = &op->op.insert_keys;
+ struct btree_iter extent_iter, inode_iter;
+ struct bchfs_extent_trans_hook hook;
+ struct bkey_i *k = bch_keylist_front(keys);
+ int ret;
+
+ BUG_ON(k->k.p.inode != op->ei->vfs_inode.i_ino);
+
+ bch_btree_iter_init_intent(&extent_iter, wop->c, BTREE_ID_EXTENTS,
+ bkey_start_pos(&bch_keylist_front(keys)->k));
+ bch_btree_iter_init_intent(&inode_iter, wop->c, BTREE_ID_INODES,
+ POS(extent_iter.pos.inode, 0));
+
+ hook.op = op;
+ hook.hook.fn = bchfs_extent_update_hook;
+ hook.need_inode_update = false;
+
+ do {
+ ret = bch_btree_iter_traverse(&extent_iter);
+ if (ret)
+ goto err;
+
+ /* XXX: ei->i_size locking */
+ k = bch_keylist_front(keys);
+ if (min(k->k.p.offset << 9, op->new_i_size) > op->ei->i_size)
+ hook.need_inode_update = true;
+
+ if (hook.need_inode_update) {
+ struct bkey_s_c inode;
+
+ if (!btree_iter_linked(&inode_iter))
+ bch_btree_iter_link(&extent_iter, &inode_iter);
+
+ inode = bch_btree_iter_peek_with_holes(&inode_iter);
+ if ((ret = btree_iter_err(inode)))
+ goto err;
+
+ if (WARN_ONCE(inode.k->type != BCH_INODE_FS,
+ "inode %llu not found when updating",
+ extent_iter.pos.inode)) {
+ ret = -ENOENT;
+ break;
+ }
+
+ bkey_reassemble(&hook.new_inode.k_i, inode);
+
+ ret = bch_btree_insert_at(wop->c, &wop->res,
+ &hook.hook, op_journal_seq(wop),
+ BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&extent_iter, k),
+ BTREE_INSERT_ENTRY(&inode_iter, &hook.new_inode.k_i));
+ } else {
+ ret = bch_btree_insert_at(wop->c, &wop->res,
+ &hook.hook, op_journal_seq(wop),
+ BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&extent_iter, k));
+ }
+err:
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
+
+ bch_keylist_pop_front(keys);
+ } while (!bch_keylist_empty(keys));
+
+ bch_btree_iter_unlock(&extent_iter);
+ bch_btree_iter_unlock(&inode_iter);
+
+ return ret;
+}
+
+/* page state: */
+
+/* stored in page->private: */
+
+/*
+ * bch_page_state has to (unfortunately) be manipulated with cmpxchg - we could
+ * almost protect it with the page lock, except that bch_writepage_io_done has
+ * to update the sector counts (and from interrupt/bottom half context).
+ */
+struct bch_page_state {
+union { struct {
+ /*
+ * BCH_PAGE_ALLOCATED: page is _fully_ written on disk, and not
+ * compressed - which means to write this page we don't have to reserve
+ * space (the new write will never take up more space on disk than what
+ * it's overwriting)
+ *
+ * BCH_PAGE_UNALLOCATED: page is not fully written on disk, or is
+ * compressed - before writing we have to reserve space with
+ * bch_reserve_sectors()
+ *
+ * BCH_PAGE_RESERVED: page has space reserved on disk (reservation will
+ * be consumed when the page is written).
+ */
+ enum {
+ BCH_PAGE_UNALLOCATED = 0,
+ BCH_PAGE_ALLOCATED,
+ } alloc_state:2;
+
+ /* Owns PAGE_SECTORS sized reservation: */
+ unsigned reserved:1;
+
+ /*
+ * Number of sectors on disk - for i_blocks
+ * Uncompressed size, not compressed size:
+ */
+ u8 sectors;
+ u8 dirty_sectors;
+};
+ /* for cmpxchg: */
+ unsigned long v;
+};
+};
+
+#define page_state_cmpxchg(_ptr, _new, _expr) \
+({ \
+ unsigned long _v = READ_ONCE((_ptr)->v); \
+ struct bch_page_state _old; \
+ \
+ do { \
+ _old.v = _new.v = _v; \
+ _expr; \
+ \
+ EBUG_ON(_new.sectors + _new.dirty_sectors > PAGE_SECTORS);\
+ } while (_old.v != _new.v && \
+ (_v = cmpxchg(&(_ptr)->v, _old.v, _new.v)) != _old.v); \
+ \
+ _old; \
+})
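+
+/*
+ * page_state_cmpxchg() runs @_expr with @_new initialized to the current
+ * state and retries until the cmpxchg succeeds (or until @_expr leaves the
+ * state unchanged); it evaluates to the old state. @_expr may return or goto
+ * out of the enclosing function to bail out without modifying the state, as
+ * bch_put_page_reservation() does below.
+ */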
+
+static inline struct bch_page_state *page_state(struct page *page)
+{
+ struct bch_page_state *s = (void *) &page->private;
+
+ BUILD_BUG_ON(sizeof(*s) > sizeof(page->private));
+
+ if (!PagePrivate(page))
+ SetPagePrivate(page);
+
+ return s;
+}
+
+static void bch_put_page_reservation(struct cache_set *c, struct page *page)
+{
+ struct disk_reservation res = { .sectors = PAGE_SECTORS };
+ struct bch_page_state s;
+
+ s = page_state_cmpxchg(page_state(page), s, {
+ if (!s.reserved)
+ return;
+ s.reserved = 0;
+ });
+
+ bch_disk_reservation_put(c, &res);
+}
+
+static int bch_get_page_reservation(struct cache_set *c, struct page *page,
+ bool check_enospc)
+{
+ struct bch_page_state *s = page_state(page), new;
+ struct disk_reservation res;
+ int ret = 0;
+
+ BUG_ON(s->alloc_state == BCH_PAGE_ALLOCATED &&
+ s->sectors != PAGE_SECTORS);
+
+ if (s->reserved ||
+ s->alloc_state == BCH_PAGE_ALLOCATED)
+ return 0;
+
+ ret = bch_disk_reservation_get(c, &res, PAGE_SECTORS, !check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ return ret;
+
+ page_state_cmpxchg(s, new, {
+ if (new.reserved) {
+ bch_disk_reservation_put(c, &res);
+ return 0;
+ }
+ new.reserved = 1;
+ });
+
+ return 0;
+}
+
+static void bch_clear_page_bits(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct disk_reservation res = { .sectors = PAGE_SECTORS };
+ struct bch_page_state s;
+
+ if (!PagePrivate(page))
+ return;
+
+ s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
+ ClearPagePrivate(page);
+
+ if (s.dirty_sectors) {
+ spin_lock(&inode->i_lock);
+ inode->i_blocks -= s.dirty_sectors;
+ spin_unlock(&inode->i_lock);
+ }
+
+ if (s.reserved)
+ bch_disk_reservation_put(c, &res);
+}
+
+int bch_set_page_dirty(struct page *page)
+{
+ struct bch_page_state old, new;
+
+ old = page_state_cmpxchg(page_state(page), new,
+ new.dirty_sectors = PAGE_SECTORS - new.sectors;
+ );
+
+ if (old.dirty_sectors != new.dirty_sectors) {
+ struct inode *inode = page->mapping->host;
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += new.dirty_sectors - old.dirty_sectors;
+ spin_unlock(&inode->i_lock);
+ }
+
+ return __set_page_dirty_nobuffers(page);
+}
+
+/* readpages/writepages: */
+
+static bool bio_can_add_page_contig(struct bio *bio, struct page *page)
+{
+ sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9);
+
+ return bio->bi_vcnt < bio->bi_max_vecs &&
+ bio_end_sector(bio) == offset;
+}
+
+static int bio_add_page_contig(struct bio *bio, struct page *page)
+{
+ sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9);
+
+ BUG_ON(!bio->bi_max_vecs);
+
+ if (!bio->bi_vcnt)
+ bio->bi_iter.bi_sector = offset;
+ else if (!bio_can_add_page_contig(bio, page))
+ return -1;
+
+ bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
+ .bv_page = page,
+ .bv_len = PAGE_SIZE,
+ .bv_offset = 0,
+ };
+
+ bio->bi_iter.bi_size += PAGE_SIZE;
+
+ return 0;
+}
+
+static void bch_readpages_end_io(struct bio *bio)
+{
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+
+ if (!bio->bi_error) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ }
+
+ bio_put(bio);
+}
+
+static inline struct page *__readpage_next_page(struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ struct page *page;
+ int ret;
+
+ while (*nr_pages) {
+ page = list_entry(pages->prev, struct page, lru);
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+
+ ret = add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS);
+
+ /* if add_to_page_cache_lru() succeeded, page is locked: */
+ put_page(page);
+
+ if (!ret)
+ return page;
+
+ (*nr_pages)--;
+ }
+
+ return NULL;
+}
+
+#define for_each_readpage_page(_mapping, _pages, _nr_pages, _page) \
+ for (; \
+ ((_page) = __readpage_next_page(_mapping, _pages, &(_nr_pages)));\
+ (_nr_pages)--)
+
+static void bch_mark_pages_unalloc(struct bio *bio)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+
+ bio_for_each_segment(bv, bio, iter)
+ page_state(bv.bv_page)->alloc_state = BCH_PAGE_UNALLOCATED;
+}
+
+static void bch_add_page_sectors(struct bio *bio, const struct bkey *k)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+
+ bio_for_each_segment(bv, bio, iter) {
+ struct bch_page_state *s = page_state(bv.bv_page);
+
+ /* sectors in @k from the start of this page: */
+ unsigned k_sectors = k->size - (iter.bi_sector - k->p.offset);
+
+ unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
+
+ BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
+
+ s->sectors += page_sectors;
+ }
+}
+
+static void bchfs_read(struct cache_set *c, struct bch_read_bio *rbio, u64 inode)
+{
+ struct bio *bio = &rbio->bio;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bio_vec *bv;
+ unsigned i;
+ int ret;
+
+ bch_increment_clock(c, bio_sectors(bio), READ);
+
+ /*
+ * Initialize page state:
+ * If a page is partly allocated and partly a hole, we want it to be
+ * marked BCH_PAGE_UNALLOCATED - so we initially mark all pages
+ * allocated and then mark them unallocated as we find holes:
+ *
+ * Note that the bio hasn't been split yet - it's the only bio that
+	 * points to these pages. As we walk extents and split @bio, that will
+	 * no longer necessarily be true - the splits won't necessarily be on
+	 * page boundaries:
+ */
+ bio_for_each_segment_all(bv, bio, i) {
+ struct bch_page_state *s = page_state(bv->bv_page);
+
+ EBUG_ON(s->reserved);
+
+ s->alloc_state = BCH_PAGE_ALLOCATED;
+ s->sectors = 0;
+ }
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode, bio->bi_iter.bi_sector), k) {
+ BKEY_PADDED(k) tmp;
+ struct extent_pick_ptr pick;
+ unsigned bytes, sectors;
+ bool is_last;
+
+ bkey_reassemble(&tmp.k, k);
+ bch_btree_iter_unlock(&iter);
+ k = bkey_i_to_s_c(&tmp.k);
+
+ if (!bkey_extent_is_allocation(k.k) ||
+ bkey_extent_is_compressed(c, k))
+ bch_mark_pages_unalloc(bio);
+
+ bch_extent_pick_ptr(c, k, &pick);
+ if (IS_ERR(pick.ca)) {
+ bcache_io_error(c, bio, "no device to read from");
+ bio_endio(bio);
+ return;
+ }
+
+ sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
+ bio->bi_iter.bi_sector;
+ bytes = sectors << 9;
+ is_last = bytes == bio->bi_iter.bi_size;
+ swap(bio->bi_iter.bi_size, bytes);
+
+ if (bkey_extent_is_allocation(k.k))
+ bch_add_page_sectors(bio, k.k);
+
+ if (pick.ca) {
+ PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
+ c->prio_clock[READ].hand;
+
+ bch_read_extent(c, rbio, k, &pick,
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_PROMOTE|
+ (is_last ? BCH_READ_IS_LAST : 0));
+ } else {
+ zero_fill_bio_iter(bio, bio->bi_iter);
+
+ if (is_last)
+ bio_endio(bio);
+ }
+
+ if (is_last)
+ return;
+
+ swap(bio->bi_iter.bi_size, bytes);
+ bio_advance(bio, bytes);
+ }
+
+ /*
+ * If we get here, it better have been because there was an error
+ * reading a btree node
+ */
+ ret = bch_btree_iter_unlock(&iter);
+ BUG_ON(!ret);
+ bcache_io_error(c, bio, "btree IO error %i", ret);
+ bio_endio(bio);
+}
+
+int bch_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bch_read_bio *rbio = NULL;
+ struct page *page;
+
+ pr_debug("reading %u pages", nr_pages);
+
+ if (current->pagecache_lock != &mapping->add_lock)
+ pagecache_add_get(&mapping->add_lock);
+
+ for_each_readpage_page(mapping, pages, nr_pages, page) {
+again:
+ if (!rbio) {
+ rbio = container_of(bio_alloc_bioset(GFP_NOFS,
+ min_t(unsigned, nr_pages,
+ BIO_MAX_PAGES),
+ &c->bio_read),
+ struct bch_read_bio, bio);
+
+ rbio->bio.bi_end_io = bch_readpages_end_io;
+ }
+
+ if (bio_add_page_contig(&rbio->bio, page)) {
+ bchfs_read(c, rbio, inode->i_ino);
+ rbio = NULL;
+ goto again;
+ }
+ }
+
+ if (rbio)
+ bchfs_read(c, rbio, inode->i_ino);
+
+ if (current->pagecache_lock != &mapping->add_lock)
+ pagecache_add_put(&mapping->add_lock);
+
+ pr_debug("success");
+ return 0;
+}
+
+int bch_readpage(struct file *file, struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bch_read_bio *rbio;
+
+ rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1,
+ &c->bio_read),
+ struct bch_read_bio, bio);
+ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
+ rbio->bio.bi_end_io = bch_readpages_end_io;
+
+ bio_add_page_contig(&rbio->bio, page);
+ bchfs_read(c, rbio, inode->i_ino);
+
+ return 0;
+}
+
+struct bch_writepage_state {
+ struct bch_writepage_io *io;
+};
+
+static void bch_writepage_io_free(struct closure *cl)
+{
+ struct bch_writepage_io *io = container_of(cl,
+ struct bch_writepage_io, cl);
+ struct bio *bio = &io->bio.bio;
+
+ bio_put(bio);
+}
+
+static void bch_writepage_io_done(struct closure *cl)
+{
+ struct bch_writepage_io *io = container_of(cl,
+ struct bch_writepage_io, cl);
+ struct cache_set *c = io->op.op.c;
+ struct bio *bio = &io->bio.bio;
+ struct bio_vec *bvec;
+ unsigned i;
+
+ atomic_sub(bio->bi_vcnt, &c->writeback_pages);
+ wake_up(&c->writeback_wait);
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+
+ if (io->op.op.error) {
+ SetPageError(page);
+ if (page->mapping)
+ set_bit(AS_EIO, &page->mapping->flags);
+ }
+
+ if (io->op.op.written >= PAGE_SECTORS) {
+ struct bch_page_state old, new;
+
+ old = page_state_cmpxchg(page_state(page), new, {
+ new.sectors = PAGE_SECTORS;
+ new.dirty_sectors = 0;
+ });
+
+ io->op.sectors_added -= old.dirty_sectors;
+ io->op.op.written -= PAGE_SECTORS;
+ }
+ }
+
+ /*
+ * racing with fallocate can cause us to add fewer sectors than
+ * expected - but we shouldn't add more sectors than expected:
+ *
+ * (error (due to going RO) halfway through a page can screw that up
+ * slightly)
+ */
+ BUG_ON(io->op.sectors_added >= (s64) PAGE_SECTORS);
+
+ /*
+ * PageWriteback is effectively our ref on the inode - fixup i_blocks
+ * before calling end_page_writeback:
+ */
+ if (io->op.sectors_added) {
+ struct inode *inode = &io->op.ei->vfs_inode;
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += io->op.sectors_added;
+ spin_unlock(&inode->i_lock);
+ }
+
+ bio_for_each_segment_all(bvec, bio, i)
+ end_page_writeback(bvec->bv_page);
+
+ closure_return_with_destructor(&io->cl, bch_writepage_io_free);
+}
+
+static void bch_writepage_do_io(struct bch_writepage_state *w)
+{
+ struct bch_writepage_io *io = w->io;
+
+ w->io = NULL;
+ atomic_add(io->bio.bio.bi_vcnt, &io->op.op.c->writeback_pages);
+
+ io->op.op.pos.offset = io->bio.bio.bi_iter.bi_sector;
+
+ closure_call(&io->op.op.cl, bch_write, NULL, &io->cl);
+ continue_at(&io->cl, bch_writepage_io_done, NULL);
+}
+
+/*
+ * Get a bch_writepage_io and add @page to it - appending to an existing one if
+ * possible, else allocating a new one:
+ */
+static void bch_writepage_io_alloc(struct cache_set *c,
+ struct bch_writepage_state *w,
+ struct bch_inode_info *ei,
+ struct page *page)
+{
+ u64 inum = ei->vfs_inode.i_ino;
+
+ if (!w->io) {
+alloc_io:
+ w->io = container_of(bio_alloc_bioset(GFP_NOFS,
+ BIO_MAX_PAGES,
+ bch_writepage_bioset),
+ struct bch_writepage_io, bio.bio);
+
+ closure_init(&w->io->cl, NULL);
+ w->io->op.ei = ei;
+ w->io->op.sectors_added = 0;
+ w->io->op.is_dio = false;
+ bch_write_op_init(&w->io->op.op, c, &w->io->bio,
+ (struct disk_reservation) {
+ .nr_replicas = c->opts.data_replicas,
+ },
+ foreground_write_point(c, inum),
+ POS(inum, 0),
+ &ei->journal_seq, 0);
+ w->io->op.op.index_update_fn = bchfs_write_index_update;
+ }
+
+ if (bio_add_page_contig(&w->io->bio.bio, page)) {
+ bch_writepage_do_io(w);
+ goto alloc_io;
+ }
+
+ /*
+ * We shouldn't ever be handed pages for multiple inodes in a single
+ * pass - right?
+ */
+ BUG_ON(ei != w->io->op.ei);
+}
+
+static int __bch_writepage(struct cache_set *c, struct page *page,
+ struct writeback_control *wbc,
+ struct bch_writepage_state *w)
+{
+ struct inode *inode = page->mapping->host;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct bch_page_state new, old;
+ unsigned offset;
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
+
+ EBUG_ON(!PageUptodate(page));
+
+ /* Is the page fully inside i_size? */
+ if (page->index < end_index)
+ goto do_io;
+
+ /* Is the page fully outside i_size? (truncate in progress) */
+ offset = i_size & (PAGE_SIZE - 1);
+ if (page->index > end_index || !offset) {
+ unlock_page(page);
+ return 0;
+ }
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ zero_user_segment(page, offset, PAGE_SIZE);
+do_io:
+ bch_writepage_io_alloc(c, w, ei, page);
+
+ /* while page is locked: */
+ w->io->op.new_i_size = i_size;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ w->io->bio.bio.bi_opf |= WRITE_SYNC;
+
+ /* Before unlocking the page, transfer reservation to w->io: */
+ old = page_state_cmpxchg(page_state(page), new, {
+ BUG_ON(!new.reserved &&
+ (new.sectors != PAGE_SECTORS ||
+ new.alloc_state != BCH_PAGE_ALLOCATED));
+
+ if (new.alloc_state == BCH_PAGE_ALLOCATED &&
+ w->io->op.op.compression_type != BCH_COMPRESSION_NONE)
+ new.alloc_state = BCH_PAGE_UNALLOCATED;
+ else if (!new.reserved)
+ goto out;
+ new.reserved = 0;
+ });
+
+ w->io->op.op.res.sectors += PAGE_SECTORS * (old.reserved - new.reserved);
+out:
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+
+ return 0;
+}
+
+int bch_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ struct cache_set *c = mapping->host->i_sb->s_fs_info;
+ struct bch_writepage_state w = { NULL };
+ struct pagecache_iter iter;
+ struct page *page;
+ int ret = 0;
+ int done = 0;
+ pgoff_t uninitialized_var(writeback_index);
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ pgoff_t done_index;
+ int cycled;
+ int range_whole = 0;
+ int tag;
+
+ if (wbc->range_cyclic) {
+ writeback_index = mapping->writeback_index; /* prev offset */
+ index = writeback_index;
+ if (index == 0)
+ cycled = 1;
+ else
+ cycled = 0;
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ cycled = 1; /* ignore range_cyclic tests */
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, index, end);
+
+ done_index = index;
+get_pages:
+ for_each_pagecache_tag(&iter, mapping, tag, index, end, page) {
+ done_index = page->index;
+
+ if (w.io &&
+ !bio_can_add_page_contig(&w.io->bio.bio, page))
+ bch_writepage_do_io(&w);
+
+ if (!w.io &&
+ atomic_read(&c->writeback_pages) >=
+ c->writeback_pages_max) {
+ /* don't sleep with pages pinned: */
+ pagecache_iter_release(&iter);
+
+ __wait_event(c->writeback_wait,
+ atomic_read(&c->writeback_pages) <
+ c->writeback_pages_max);
+ goto get_pages;
+ }
+
+ lock_page(page);
+
+ /*
+ * Page truncated or invalidated. We can freely skip it
+ * then, even for data integrity operations: the page
+ * has disappeared concurrently, so there could be no
+		 * real expectation of this data integrity operation
+ * even if there is now a new, dirty page at the same
+ * pagecache address.
+ */
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+ else
+ goto continue_unlock;
+ }
+
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
+ ret = __bch_writepage(c, page, wbc, &w);
+ if (unlikely(ret)) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ } else {
+ /*
+ * done_index is set past this page,
+ * so media errors will not choke
+ * background writeout for the entire
+ * file. This has consequences for
+ * range_cyclic semantics (ie. it may
+ * not be suitable for data integrity
+ * writeout).
+ */
+ done_index = page->index + 1;
+ done = 1;
+ break;
+ }
+ }
+
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
+ }
+ }
+ pagecache_iter_release(&iter);
+
+ if (w.io)
+ bch_writepage_do_io(&w);
+
+ if (!cycled && !done) {
+ /*
+ * range_cyclic:
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ cycled = 1;
+ index = 0;
+ end = writeback_index - 1;
+ goto retry;
+ }
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = done_index;
+
+ return ret;
+}
+
+int bch_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct cache_set *c = page->mapping->host->i_sb->s_fs_info;
+ struct bch_writepage_state w = { NULL };
+ int ret;
+
+ ret = __bch_writepage(c, page, wbc, &w);
+ if (w.io)
+ bch_writepage_do_io(&w);
+
+ return ret;
+}
+
+static void bch_read_single_page_end_io(struct bio *bio)
+{
+ complete(bio->bi_private);
+}
+
+static int bch_read_single_page(struct page *page,
+ struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bch_read_bio *rbio;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1,
+ &c->bio_read),
+ struct bch_read_bio, bio);
+ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
+ rbio->bio.bi_private = &done;
+ rbio->bio.bi_end_io = bch_read_single_page_end_io;
+ bio_add_page_contig(&rbio->bio, page);
+
+ bchfs_read(c, rbio, inode->i_ino);
+ wait_for_completion(&done);
+
+ ret = rbio->bio.bi_error;
+ bio_put(&rbio->bio);
+
+ if (ret < 0)
+ return ret;
+
+ SetPageUptodate(page);
+ return 0;
+}
+
+int bch_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ struct page *page;
+ int ret = -ENOMEM;
+
+ BUG_ON(inode_unhashed(mapping->host));
+
+ /* Not strictly necessary - same reason as mkwrite(): */
+ pagecache_add_get(&mapping->add_lock);
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ goto err_unlock;
+
+ if (PageUptodate(page))
+ goto out;
+
+ /* If we're writing entire page, don't need to read it in first: */
+ if (len == PAGE_SIZE)
+ goto out;
+
+ if (!offset && pos + len >= inode->i_size) {
+ zero_user_segment(page, len, PAGE_SIZE);
+ flush_dcache_page(page);
+ goto out;
+ }
+
+ if (index > inode->i_size >> PAGE_SHIFT) {
+ zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
+ flush_dcache_page(page);
+ goto out;
+ }
+readpage:
+ ret = bch_read_single_page(page, mapping);
+ if (ret)
+ goto err;
+out:
+ ret = bch_get_page_reservation(c, page, true);
+ if (ret) {
+ if (!PageUptodate(page)) {
+ /*
+ * If the page hasn't been read in, we won't know if we
+ * actually need a reservation - we don't actually need
+ * to read here, we just need to check if the page is
+ * fully backed by uncompressed data:
+ */
+ goto readpage;
+ }
+
+ goto err;
+ }
+
+ *pagep = page;
+ return 0;
+err:
+ unlock_page(page);
+ put_page(page);
+ *pagep = NULL;
+err_unlock:
+ pagecache_add_put(&mapping->add_lock);
+ return ret;
+}
+
+int bch_write_end(struct file *filp, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = page->mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ if (unlikely(copied < len && !PageUptodate(page))) {
+ /*
+ * The page needs to be read in, but that would destroy
+ * our partial write - simplest thing is to just force
+ * userspace to redo the write:
+ */
+ zero_user(page, 0, PAGE_SIZE);
+ flush_dcache_page(page);
+ copied = 0;
+ }
+
+ if (pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
+
+ if (copied) {
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+ if (!PageDirty(page))
+ set_page_dirty(page);
+ } else {
+ bch_put_page_reservation(c, page);
+ }
+
+ unlock_page(page);
+ put_page(page);
+ pagecache_add_put(&mapping->add_lock);
+
+ return copied;
+}
+
+/* O_DIRECT */
+
+static void bch_dio_read_complete(struct closure *cl)
+{
+ struct dio_read *dio = container_of(cl, struct dio_read, cl);
+
+ dio->req->ki_complete(dio->req, dio->ret, 0);
+ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+}
+
+static void bch_direct_IO_read_endio(struct bio *bio)
+{
+ struct dio_read *dio = bio->bi_private;
+
+ if (bio->bi_error)
+ dio->ret = bio->bi_error;
+
+ closure_put(&dio->cl);
+}
+
+static void bch_direct_IO_read_split_endio(struct bio *bio)
+{
+ bch_direct_IO_read_endio(bio);
+ bio_check_pages_dirty(bio); /* transfers ownership */
+}
+
+static int bch_direct_IO_read(struct cache_set *c, struct kiocb *req,
+ struct file *file, struct inode *inode,
+ struct iov_iter *iter, loff_t offset)
+{
+ struct dio_read *dio;
+ struct bio *bio;
+ bool sync = is_sync_kiocb(req);
+ ssize_t ret;
+
+ if ((offset|iter->count) & (block_bytes(c) - 1))
+ return -EINVAL;
+
+ ret = min_t(loff_t, iter->count,
+ max_t(loff_t, 0, i_size_read(inode) - offset));
+ iov_iter_truncate(iter, round_up(ret, block_bytes(c)));
+
+ if (!ret)
+ return ret;
+
+ bio = bio_alloc_bioset(GFP_KERNEL,
+ iov_iter_npages(iter, BIO_MAX_PAGES),
+ bch_dio_read_bioset);
+
+ bio->bi_end_io = bch_direct_IO_read_endio;
+
+ dio = container_of(bio, struct dio_read, rbio.bio);
+ closure_init(&dio->cl, NULL);
+
+ /*
+ * this is a _really_ horrible hack just to avoid an atomic sub at the
+ * end:
+ */
+ if (!sync) {
+ set_closure_fn(&dio->cl, bch_dio_read_complete, NULL);
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_RUNNING +
+ CLOSURE_DESTRUCTOR);
+ } else {
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER + 1);
+ }
+
+ dio->req = req;
+ dio->ret = ret;
+
+ goto start;
+ while (iter->count) {
+ bio = bio_alloc_bioset(GFP_KERNEL,
+ iov_iter_npages(iter, BIO_MAX_PAGES),
+ &c->bio_read);
+ bio->bi_end_io = bch_direct_IO_read_split_endio;
+start:
+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
+ bio->bi_iter.bi_sector = offset >> 9;
+ bio->bi_private = dio;
+
+ ret = bio_get_user_pages(bio, iter, 1);
+ if (ret < 0) {
+ /* XXX: fault inject this path */
+ bio->bi_error = ret;
+ bio_endio(bio);
+ break;
+ }
+
+ offset += bio->bi_iter.bi_size;
+ bio_set_pages_dirty(bio);
+
+ if (iter->count)
+ closure_get(&dio->cl);
+
+ bch_read(c, container_of(bio,
+ struct bch_read_bio, bio),
+ inode->i_ino);
+ }
+
+ if (sync) {
+ closure_sync(&dio->cl);
+ closure_debug_destroy(&dio->cl);
+ ret = dio->ret;
+ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+ return ret;
+ } else {
+ return -EIOCBQUEUED;
+ }
+}
+
+static long __bch_dio_write_complete(struct dio_write *dio)
+{
+ struct file *file = dio->req->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = file->f_inode;
+ long ret = dio->error ?: dio->written;
+
+ bch_disk_reservation_put(dio->c, &dio->res);
+
+ __pagecache_block_put(&mapping->add_lock);
+ inode_dio_end(inode);
+
+ if (dio->iovec && dio->iovec != dio->inline_vecs)
+ kfree(dio->iovec);
+
+ bio_put(&dio->bio.bio);
+ return ret;
+}
+
+static void bch_dio_write_complete(struct closure *cl)
+{
+ struct dio_write *dio = container_of(cl, struct dio_write, cl);
+ struct kiocb *req = dio->req;
+
+ req->ki_complete(req, __bch_dio_write_complete(dio), 0);
+}
+
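+/*
+ * Called after each chunk of the dio write completes: account what was
+ * written, drop the page refs taken by bio_get_user_pages(), and reset the
+ * bio if there's more of the iter left to submit:
+ */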
+static void bch_dio_write_done(struct dio_write *dio)
+{
+ struct bio_vec *bv;
+ int i;
+
+ dio->written += dio->iop.op.written << 9;
+
+ if (dio->iop.op.error)
+ dio->error = dio->iop.op.error;
+
+ bio_for_each_segment_all(bv, &dio->bio.bio, i)
+ put_page(bv->bv_page);
+
+ if (dio->iter.count)
+ bio_reset(&dio->bio.bio);
+}
+
+static void bch_do_direct_IO_write(struct dio_write *dio)
+{
+ struct file *file = dio->req->ki_filp;
+ struct inode *inode = file->f_inode;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct bio *bio = &dio->bio.bio;
+ unsigned flags = 0;
+ int ret;
+
+ if ((dio->req->ki_flags & IOCB_DSYNC) &&
+ !dio->c->opts.journal_flush_disabled)
+ flags |= BCH_WRITE_FLUSH;
+
+ bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9;
+
+ ret = bio_get_user_pages(bio, &dio->iter, 0);
+ if (ret < 0) {
+ /*
+ * these didn't get initialized, but bch_dio_write_done() will
+ * look at them:
+ */
+ dio->iop.op.error = 0;
+ dio->iop.op.written = 0;
+ dio->error = ret;
+ return;
+ }
+
+ dio->iop.ei = ei;
+ dio->iop.sectors_added = 0;
+ dio->iop.is_dio = true;
+ dio->iop.new_i_size = U64_MAX;
+ bch_write_op_init(&dio->iop.op, dio->c, &dio->bio,
+ dio->res,
+ foreground_write_point(dio->c, inode->i_ino),
+ POS(inode->i_ino, bio->bi_iter.bi_sector),
+ &ei->journal_seq, flags);
+ dio->iop.op.index_update_fn = bchfs_write_index_update;
+
+ dio->res.sectors -= bio_sectors(bio);
+ dio->iop.op.res.sectors = bio_sectors(bio);
+
+ task_io_account_write(bio->bi_iter.bi_size);
+
+ closure_call(&dio->iop.op.cl, bch_write, NULL, &dio->cl);
+}
+
+static void bch_dio_write_loop_async(struct closure *cl)
+{
+ struct dio_write *dio =
+ container_of(cl, struct dio_write, cl);
+ struct address_space *mapping = dio->req->ki_filp->f_mapping;
+
+ bch_dio_write_done(dio);
+
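+ /*
+ * We may no longer be running in the context of the task that
+ * submitted the write, so use_mm() its mm in order to pin the next
+ * batch of user pages:
+ */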
+ if (dio->iter.count && !dio->error) {
+ use_mm(dio->mm);
+ pagecache_block_get(&mapping->add_lock);
+
+ bch_do_direct_IO_write(dio);
+
+ pagecache_block_put(&mapping->add_lock);
+ unuse_mm(dio->mm);
+
+ continue_at(&dio->cl, bch_dio_write_loop_async, NULL);
+ } else {
+#if 0
+ closure_return_with_destructor(cl, bch_dio_write_complete);
+#else
+ closure_debug_destroy(cl);
+ bch_dio_write_complete(cl);
+#endif
+ }
+}
+
+static int bch_direct_IO_write(struct cache_set *c, struct kiocb *req,
+ struct file *file, struct inode *inode,
+ struct iov_iter *iter, loff_t offset)
+{
+ struct address_space *mapping = file->f_mapping;
+ struct dio_write *dio;
+ struct bio *bio;
+ ssize_t ret;
+ bool sync = is_sync_kiocb(req);
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ if (unlikely(!iter->count))
+ return 0;
+
+ if (unlikely((offset|iter->count) & (block_bytes(c) - 1)))
+ return -EINVAL;
+
+ bio = bio_alloc_bioset(GFP_KERNEL,
+ iov_iter_npages(iter, BIO_MAX_PAGES),
+ bch_dio_write_bioset);
+ dio = container_of(bio, struct dio_write, bio.bio);
+ dio->req = req;
+ dio->c = c;
+ dio->written = 0;
+ dio->error = 0;
+ dio->offset = offset;
+ dio->iovec = NULL;
+ dio->iter = *iter;
+ dio->mm = current->mm;
+ closure_init(&dio->cl, NULL);
+
+ if (offset + iter->count > inode->i_size)
+ sync = true;
+
+ /*
+ * XXX: we shouldn't return -ENOSPC if we're overwriting existing data -
+ * if getting a reservation fails we should check if we are doing an
+ * overwrite.
+ *
+ * Have to then guard against racing with truncate (deleting data that
+ * we would have been overwriting)
+ */
+ ret = bch_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
+ if (unlikely(ret)) {
+ closure_debug_destroy(&dio->cl);
+ bio_put(bio);
+ return ret;
+ }
+
+ inode_dio_begin(inode);
+ __pagecache_block_get(&mapping->add_lock);
+
+ if (sync) {
+ do {
+ bch_do_direct_IO_write(dio);
+
+ closure_sync(&dio->cl);
+ bch_dio_write_done(dio);
+ } while (dio->iter.count && !dio->error);
+
+ closure_debug_destroy(&dio->cl);
+ return __bch_dio_write_complete(dio);
+ } else {
+ bch_do_direct_IO_write(dio);
+
+ if (dio->iter.count && !dio->error) {
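+ /*
+ * Once we return -EIOCBQUEUED the caller's iovec may go out of
+ * scope, so take our own copy for the async loop to keep
+ * iterating over:
+ */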
+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+ dio->iovec = kmalloc(dio->iter.nr_segs *
+ sizeof(struct iovec),
+ GFP_KERNEL);
+ if (!dio->iovec)
+ dio->error = -ENOMEM;
+ } else {
+ dio->iovec = dio->inline_vecs;
+ }
+
+ if (dio->iovec) {
+ memcpy(dio->iovec,
+ dio->iter.iov,
+ dio->iter.nr_segs * sizeof(struct iovec));
+ dio->iter.iov = dio->iovec;
+ }
+ }
+
+ continue_at_noreturn(&dio->cl, bch_dio_write_loop_async, NULL);
+ return -EIOCBQUEUED;
+ }
+}
+
+ssize_t bch_direct_IO(struct kiocb *req, struct iov_iter *iter)
+{
+ struct file *file = req->ki_filp;
+ struct inode *inode = file->f_inode;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ return ((iov_iter_rw(iter) == WRITE)
+ ? bch_direct_IO_write
+ : bch_direct_IO_read)(c, req, file, inode, iter, req->ki_pos);
+}
+
+static ssize_t
+bch_direct_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_inode;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct address_space *mapping = file->f_mapping;
+ loff_t pos = iocb->ki_pos;
+ ssize_t ret;
+
+ pagecache_block_get(&mapping->add_lock);
+
+ /* Write and invalidate pagecache range that we're writing to: */
+ ret = write_invalidate_inode_pages_range(file->f_mapping, pos,
+ pos + iov_iter_count(iter) - 1);
+ if (unlikely(ret))
+ goto err;
+
+ ret = bch_direct_IO_write(c, iocb, file, inode, iter, pos);
+err:
+ pagecache_block_put(&mapping->add_lock);
+
+ return ret;
+}
+
+static ssize_t __bch_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ goto out;
+
+ ret = file_update_time(file);
+ if (ret)
+ goto out;
+
+ ret = iocb->ki_flags & IOCB_DIRECT
+ ? bch_direct_write(iocb, from)
+ : generic_perform_write(file, from, iocb->ki_pos);
+
+ if (likely(ret > 0))
+ iocb->ki_pos += ret;
+out:
+ current->backing_dev_info = NULL;
+ return ret;
+}
+
+ssize_t bch_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ bool direct = iocb->ki_flags & IOCB_DIRECT;
+ ssize_t ret;
+
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0)
+ ret = __bch_write_iter(iocb, from);
+ inode_unlock(inode);
+
+ if (ret > 0 && !direct)
+ ret = generic_write_sync(iocb, ret);
+
+ return ret;
+}
+
+int bch_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct address_space *mapping = inode->i_mapping;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int ret = VM_FAULT_LOCKED;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+
+ /*
+ * Not strictly necessary, but helps avoid dio writes livelocking in
+ * write_invalidate_inode_pages_range() - can drop this if/when we get
+ * a write_invalidate_inode_pages_range() that works without dropping
+ * page lock before invalidating page
+ */
+ if (current->pagecache_lock != &mapping->add_lock)
+ pagecache_add_get(&mapping->add_lock);
+
+ lock_page(page);
+ if (page->mapping != mapping ||
+ page_offset(page) > i_size_read(inode)) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ if (bch_get_page_reservation(c, page, true)) {
+ unlock_page(page);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ if (!PageDirty(page))
+ set_page_dirty(page);
+ wait_for_stable_page(page);
+out:
+ if (current->pagecache_lock != &mapping->add_lock)
+ pagecache_add_put(&mapping->add_lock);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+void bch_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ EBUG_ON(!PageLocked(page));
+ EBUG_ON(PageWriteback(page));
+
+ if (offset || length < PAGE_SIZE)
+ return;
+
+ bch_clear_page_bits(page);
+}
+
+int bch_releasepage(struct page *page, gfp_t gfp_mask)
+{
+ EBUG_ON(!PageLocked(page));
+ EBUG_ON(PageWriteback(page));
+
+ if (PageDirty(page))
+ return 0;
+
+ bch_clear_page_bits(page);
+ return 1;
+}
+
+#ifdef CONFIG_MIGRATION
+int bch_migrate_page(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ int ret;
+
+ ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ if (ret != MIGRATEPAGE_SUCCESS)
+ return ret;
+
+ if (PagePrivate(page)) {
+ *page_state(newpage) = *page_state(page);
+ ClearPagePrivate(page);
+ }
+
+ migrate_page_copy(newpage, page);
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif
+
+int bch_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int ret;
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
+
+ if (c->opts.journal_flush_disabled)
+ return 0;
+
+ return bch_journal_flush_seq(&c->journal, ei->journal_seq);
+}
+
+static int __bch_truncate_page(struct address_space *mapping,
+ pgoff_t index, loff_t start, loff_t end)
+{
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ unsigned start_offset = start & (PAGE_SIZE - 1);
+ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
+ struct page *page;
+ int ret = 0;
+
+ /* Page boundary? Nothing to do */
+ if (!((index == start >> PAGE_SHIFT && start_offset) ||
+ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
+ return 0;
+
+ /* Above i_size? */
+ if (index << PAGE_SHIFT >= inode->i_size)
+ return 0;
+
+ page = find_lock_page(mapping, index);
+ if (!page) {
+ struct btree_iter iter;
+ struct bkey_s_c k = bkey_s_c_null;
+
+ /*
+ * XXX: we're doing two index lookups when we end up reading the
+ * page
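+ *
+ * If the page isn't in the page cache, check the extents btree first:
+ * if the whole page is a hole or a reservation on disk there's nothing
+ * to zero, so we don't need to create and read in the page at all.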
+ */
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino,
+ index << (PAGE_SHIFT - 9)), k) {
+ if (bkey_cmp(bkey_start_pos(k.k),
+ POS(inode->i_ino,
+ (index + 1) << (PAGE_SHIFT - 9))) >= 0)
+ break;
+
+ if (k.k->type != KEY_TYPE_DISCARD &&
+ k.k->type != BCH_RESERVATION) {
+ bch_btree_iter_unlock(&iter);
+ goto create;
+ }
+ }
+ bch_btree_iter_unlock(&iter);
+ return 0;
+create:
+ page = find_or_create_page(mapping, index, GFP_KERNEL);
+ if (unlikely(!page)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (!PageUptodate(page)) {
+ ret = bch_read_single_page(page, mapping);
+ if (ret)
+ goto unlock;
+ }
+
+ /*
+ * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
+ *
+ * XXX: because we aren't currently tracking whether the page has actual
+ * data in it (vs. just 0s, or only partially written) this is wrong. ick.
+ */
+ ret = bch_get_page_reservation(c, page, false);
+ BUG_ON(ret);
+
+ if (index == start >> PAGE_SHIFT &&
+ index == end >> PAGE_SHIFT)
+ zero_user_segment(page, start_offset, end_offset);
+ else if (index == start >> PAGE_SHIFT)
+ zero_user_segment(page, start_offset, PAGE_SIZE);
+ else if (index == end >> PAGE_SHIFT)
+ zero_user_segment(page, 0, end_offset);
+
+ if (!PageDirty(page))
+ set_page_dirty(page);
+unlock:
+ unlock_page(page);
+ put_page(page);
+out:
+ return ret;
+}
+
+static int bch_truncate_page(struct address_space *mapping, loff_t from)
+{
+ return __bch_truncate_page(mapping, from >> PAGE_SHIFT,
+ from, from + PAGE_SIZE);
+}
+
+int bch_truncate(struct inode *inode, struct iattr *iattr)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ bool shrink = iattr->ia_size <= inode->i_size;
+ int ret = 0;
+
+ inode_dio_wait(inode);
+ pagecache_block_get(&mapping->add_lock);
+
+ truncate_setsize(inode, iattr->ia_size);
+
+ /* sync appends.. */
+ /* XXX what protects ei->i_size? */
+ if (iattr->ia_size > ei->i_size)
+ ret = filemap_write_and_wait_range(mapping, ei->i_size, S64_MAX);
+ if (ret)
+ goto err_put_pagecache;
+
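+ /*
+ * Truncate protocol (see the I_SIZE_DIRTY comment in fs.c): set
+ * I_SIZE_DIRTY and write out the new i_size first, then (if shrinking)
+ * delete the extents past it, then clear I_SIZE_DIRTY:
+ */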
+ mutex_lock(&ei->update_lock);
+ i_size_dirty_get(ei);
+ ret = bch_write_inode_size(c, ei, inode->i_size);
+ mutex_unlock(&ei->update_lock);
+
+ if (unlikely(ret))
+ goto err;
+
+ /*
+ * There might be persistent reservations (from fallocate())
+ * above i_size, which bch_inode_truncate() will discard - we're
+ * only supposed to discard them if we're doing a real truncate
+ * here (new i_size < current i_size):
+ */
+ if (shrink) {
+ struct i_sectors_hook i_sectors_hook;
+ int ret;
+
+ ret = i_sectors_dirty_get(ei, &i_sectors_hook);
+ if (unlikely(ret))
+ goto err;
+
+ ret = bch_truncate_page(inode->i_mapping, iattr->ia_size);
+ if (unlikely(ret)) {
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+ goto err;
+ }
+
+ ret = bch_inode_truncate(c, inode->i_ino,
+ round_up(iattr->ia_size, PAGE_SIZE) >> 9,
+ &i_sectors_hook.hook,
+ &ei->journal_seq);
+
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+
+ if (unlikely(ret))
+ goto err;
+ }
+
+ mutex_lock(&ei->update_lock);
+ setattr_copy(inode, iattr);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ /* clear I_SIZE_DIRTY: */
+ i_size_dirty_put(ei);
+ ret = bch_write_inode_size(c, ei, inode->i_size);
+ mutex_unlock(&ei->update_lock);
+
+ pagecache_block_put(&mapping->add_lock);
+
+ return 0;
+err:
+ i_size_dirty_put(ei);
+err_put_pagecache:
+ pagecache_block_put(&mapping->add_lock);
+ return ret;
+}
+
+static long bch_fpunch(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ u64 ino = inode->i_ino;
+ u64 discard_start = round_up(offset, PAGE_SIZE) >> 9;
+ u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9;
+ int ret = 0;
+
+ inode_lock(inode);
+ inode_dio_wait(inode);
+ pagecache_block_get(&mapping->add_lock);
+
+ ret = __bch_truncate_page(inode->i_mapping,
+ offset >> PAGE_SHIFT,
+ offset, offset + len);
+ if (unlikely(ret))
+ goto out;
+
+ if (offset >> PAGE_SHIFT !=
+ (offset + len) >> PAGE_SHIFT) {
+ ret = __bch_truncate_page(inode->i_mapping,
+ (offset + len) >> PAGE_SHIFT,
+ offset, offset + len);
+ if (unlikely(ret))
+ goto out;
+ }
+
+ truncate_pagecache_range(inode, offset, offset + len - 1);
+
+ if (discard_start < discard_end) {
+ struct disk_reservation disk_res;
+ struct i_sectors_hook i_sectors_hook;
+ int ret;
+
+ BUG_ON(bch_disk_reservation_get(c, &disk_res, 0, 0));
+
+ ret = i_sectors_dirty_get(ei, &i_sectors_hook);
+ if (unlikely(ret))
+ goto out;
+
+ ret = bch_discard(c,
+ POS(ino, discard_start),
+ POS(ino, discard_end),
+ 0,
+ &disk_res,
+ &i_sectors_hook.hook,
+ &ei->journal_seq);
+
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+ bch_disk_reservation_put(c, &disk_res);
+ }
+out:
+ pagecache_block_put(&mapping->add_lock);
+ inode_unlock(inode);
+
+ return ret;
+}
+
+static long bch_fcollapse(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct btree_iter src;
+ struct btree_iter dst;
+ BKEY_PADDED(k) copy;
+ struct bkey_s_c k;
+ struct i_sectors_hook i_sectors_hook;
+ loff_t new_size;
+ int ret;
+
+ if ((offset | len) & (PAGE_SIZE - 1))
+ return -EINVAL;
+
+ bch_btree_iter_init_intent(&dst, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino, offset >> 9));
+ /* position will be set from dst iter's position: */
+ bch_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN);
+ bch_btree_iter_link(&src, &dst);
+
+ /*
+ * We need i_mutex to keep the page cache consistent with the extents
+ * btree, and the btree consistent with i_size - we don't need outside
+ * locking for the extents btree itself, because we're using linked
+ * iterators
+ */
+ inode_lock(inode);
+ inode_dio_wait(inode);
+ pagecache_block_get(&mapping->add_lock);
+
+ ret = -EINVAL;
+ if (offset + len >= inode->i_size)
+ goto err;
+
+ if (inode->i_size < len)
+ goto err;
+
+ new_size = inode->i_size - len;
+
+ ret = write_invalidate_inode_pages_range(inode->i_mapping,
+ offset, LLONG_MAX);
+ if (ret)
+ goto err;
+
+ ret = i_sectors_dirty_get(ei, &i_sectors_hook);
+ if (ret)
+ goto err;
+
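+ /*
+ * Copy each extent from just past the collapsed range (src) to its new
+ * position len bytes earlier (dst), one btree update at a time:
+ */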
+ while (bkey_cmp(dst.pos,
+ POS(inode->i_ino,
+ round_up(new_size, PAGE_SIZE) >> 9)) < 0) {
+ struct disk_reservation disk_res;
+
+ bch_btree_iter_set_pos(&src,
+ POS(dst.pos.inode, dst.pos.offset + (len >> 9)));
+
+ ret = bch_btree_iter_traverse(&dst);
+ if (ret)
+ goto btree_iter_err;
+
+ k = bch_btree_iter_peek_with_holes(&src);
+ if ((ret = btree_iter_err(k)))
+ goto btree_iter_err;
+
+ bkey_reassemble(&copy.k, k);
+
+ if (bkey_deleted(&copy.k.k))
+ copy.k.k.type = KEY_TYPE_DISCARD;
+
+ bch_cut_front(src.pos, &copy.k);
+ copy.k.k.p.offset -= len >> 9;
+
+ BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(&copy.k.k)));
+
+ ret = bch_disk_reservation_get(c, &disk_res, copy.k.k.size,
+ BCH_DISK_RESERVATION_NOFAIL);
+ BUG_ON(ret);
+
+ ret = bch_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
+ &ei->journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&dst, &copy.k));
+ bch_disk_reservation_put(c, &disk_res);
+btree_iter_err:
+ if (ret < 0 && ret != -EINTR)
+ goto err_unwind;
+
+ bch_btree_iter_cond_resched(&src);
+ }
+
+ bch_btree_iter_unlock(&src);
+ bch_btree_iter_unlock(&dst);
+
+ ret = bch_inode_truncate(c, inode->i_ino,
+ round_up(new_size, PAGE_SIZE) >> 9,
+ &i_sectors_hook.hook,
+ &ei->journal_seq);
+ if (ret)
+ goto err_unwind;
+
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+
+ mutex_lock(&ei->update_lock);
+ i_size_write(inode, new_size);
+ ret = bch_write_inode_size(c, ei, inode->i_size);
+ mutex_unlock(&ei->update_lock);
+
+ pagecache_block_put(&mapping->add_lock);
+ inode_unlock(inode);
+
+ return ret;
+err_unwind:
+ /*
+ * XXX: we've left data with multiple pointers... which isn't a _super_
+ * serious problem...
+ */
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+err:
+ bch_btree_iter_unlock(&src);
+ bch_btree_iter_unlock(&dst);
+ pagecache_block_put(&mapping->add_lock);
+ inode_unlock(inode);
+ return ret;
+}
+
+static long bch_fallocate(struct inode *inode, int mode,
+ loff_t offset, loff_t len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct i_sectors_hook i_sectors_hook;
+ struct btree_iter iter;
+ struct bkey_i reservation;
+ struct bkey_s_c k;
+ struct bpos end;
+ loff_t block_start, block_end;
+ loff_t new_size = offset + len;
+ unsigned sectors;
+ int ret;
+
+ bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+ inode_lock(inode);
+ inode_dio_wait(inode);
+ pagecache_block_get(&mapping->add_lock);
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ new_size > inode->i_size) {
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto err;
+ }
+
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = __bch_truncate_page(inode->i_mapping,
+ offset >> PAGE_SHIFT,
+ offset, offset + len);
+
+ if (!ret &&
+ offset >> PAGE_SHIFT !=
+ (offset + len) >> PAGE_SHIFT)
+ ret = __bch_truncate_page(inode->i_mapping,
+ (offset + len) >> PAGE_SHIFT,
+ offset, offset + len);
+
+ if (unlikely(ret))
+ goto err;
+
+ truncate_pagecache_range(inode, offset, offset + len - 1);
+
+ block_start = round_up(offset, PAGE_SIZE);
+ block_end = round_down(offset + len, PAGE_SIZE);
+ } else {
+ block_start = round_down(offset, PAGE_SIZE);
+ block_end = round_up(offset + len, PAGE_SIZE);
+ }
+
+ bch_btree_iter_set_pos(&iter, POS(inode->i_ino, block_start >> 9));
+ end = POS(inode->i_ino, block_end >> 9);
+
+ ret = i_sectors_dirty_get(ei, &i_sectors_hook);
+ if (unlikely(ret))
+ goto err;
+
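+ /*
+ * Insert BCH_RESERVATION keys over the range (skipping existing data
+ * unless FALLOC_FL_ZERO_RANGE was given), so that later writes here
+ * can't fail with -ENOSPC:
+ */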
+ while (bkey_cmp(iter.pos, end) < 0) {
+ struct disk_reservation disk_res = { 0 };
+
+ k = bch_btree_iter_peek_with_holes(&iter);
+ if ((ret = btree_iter_err(k)))
+ goto btree_iter_err;
+
+ /* already reserved */
+ if (k.k->type == BCH_RESERVATION) {
+ bch_btree_iter_advance_pos(&iter);
+ continue;
+ }
+
+ if (bkey_extent_is_data(k.k)) {
+ if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+ bch_btree_iter_advance_pos(&iter);
+ continue;
+ }
+ }
+
+ bkey_init(&reservation.k);
+ reservation.k.type = BCH_RESERVATION;
+ reservation.k.p = k.k->p;
+ reservation.k.size = k.k->size;
+
+ bch_cut_front(iter.pos, &reservation);
+ bch_cut_back(end, &reservation.k);
+
+ sectors = reservation.k.size;
+
+ if (!bkey_extent_is_allocation(k.k) ||
+ bkey_extent_is_compressed(c, k)) {
+ ret = bch_disk_reservation_get(c, &disk_res,
+ sectors, 0);
+ if (ret)
+ goto err_put_sectors_dirty;
+ }
+
+ ret = bch_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
+ &ei->journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&iter, &reservation));
+ bch_disk_reservation_put(c, &disk_res);
+btree_iter_err:
+ if (ret < 0 && ret != -EINTR)
+ goto err_put_sectors_dirty;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ new_size > inode->i_size) {
+ i_size_write(inode, new_size);
+
+ mutex_lock(&ei->update_lock);
+ ret = bch_write_inode_size(c, ei, inode->i_size);
+ mutex_unlock(&ei->update_lock);
+ }
+
+ /* blech */
+ if ((mode & FALLOC_FL_KEEP_SIZE) &&
+ (mode & FALLOC_FL_ZERO_RANGE) &&
+ ei->i_size != inode->i_size) {
+ /* sync appends.. */
+ ret = filemap_write_and_wait_range(mapping, ei->i_size, S64_MAX);
+ if (ret)
+ goto err;
+
+ if (ei->i_size != inode->i_size) {
+ mutex_lock(&ei->update_lock);
+ ret = bch_write_inode_size(c, ei, inode->i_size);
+ mutex_unlock(&ei->update_lock);
+ }
+ }
+
+ pagecache_block_put(&mapping->add_lock);
+ inode_unlock(inode);
+
+ return 0;
+err_put_sectors_dirty:
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+err:
+ bch_btree_iter_unlock(&iter);
+ pagecache_block_put(&mapping->add_lock);
+ inode_unlock(inode);
+ return ret;
+}
+
+long bch_fallocate_dispatch(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(file);
+
+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
+ return bch_fallocate(inode, mode, offset, len);
+
+ if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
+ return bch_fpunch(inode, offset, len);
+
+ if (mode == FALLOC_FL_COLLAPSE_RANGE)
+ return bch_fcollapse(inode, offset, len);
+
+ return -EOPNOTSUPP;
+}
+
+static bool page_is_data(struct page *page)
+{
+ /* XXX: should only have to check PageDirty */
+ return PagePrivate(page) &&
+ (page_state(page)->sectors ||
+ page_state(page)->dirty_sectors);
+}
+
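+/*
+ * Dirty data in the page cache may not have been flushed to the extents btree
+ * yet, so SEEK_DATA/SEEK_HOLE have to look at the page cache as well as the
+ * btree:
+ */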
+static loff_t bch_next_pagecache_data(struct inode *inode,
+ loff_t start_offset,
+ loff_t end_offset)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ pgoff_t index;
+
+ for (index = start_offset >> PAGE_SHIFT;
+ index < end_offset >> PAGE_SHIFT;
+ index++) {
+ if (find_get_pages(mapping, index, 1, &page)) {
+ lock_page(page);
+ index = page->index;
+
+ if (page_is_data(page))
+ end_offset =
+ min(end_offset,
+ max(start_offset,
+ ((loff_t) index) << PAGE_SHIFT));
+ unlock_page(page);
+ put_page(page);
+ } else {
+ break;
+ }
+ }
+
+ return end_offset;
+}
+
+static loff_t bch_seek_data(struct file *file, u64 offset)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 isize, next_data = MAX_LFS_FILESIZE;
+ int ret;
+
+ isize = i_size_read(inode);
+ if (offset >= isize)
+ return -ENXIO;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino, offset >> 9), k) {
+ if (k.k->p.inode != inode->i_ino) {
+ break;
+ } else if (bkey_extent_is_data(k.k)) {
+ next_data = max(offset, bkey_start_offset(k.k) << 9);
+ break;
+ } else if (k.k->p.offset << 9 > isize)
+ break;
+ }
+
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ if (next_data > offset)
+ next_data = bch_next_pagecache_data(inode, offset, next_data);
+
+ if (next_data > isize)
+ return -ENXIO;
+
+ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
+}
+
+static bool page_slot_is_data(struct address_space *mapping, pgoff_t index)
+{
+ struct page *page;
+ bool ret;
+
+ page = find_lock_entry(mapping, index);
+ if (!page || radix_tree_exception(page))
+ return false;
+
+ ret = page_is_data(page);
+ unlock_page(page);
+
+ return ret;
+}
+
+static loff_t bch_next_pagecache_hole(struct inode *inode,
+ loff_t start_offset,
+ loff_t end_offset)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+
+ for (index = start_offset >> PAGE_SHIFT;
+ index < end_offset >> PAGE_SHIFT;
+ index++)
+ if (!page_slot_is_data(mapping, index))
+ end_offset = max(start_offset,
+ ((loff_t) index) << PAGE_SHIFT);
+
+ return end_offset;
+}
+
+static loff_t bch_seek_hole(struct file *file, u64 offset)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 isize, next_hole = MAX_LFS_FILESIZE;
+ int ret;
+
+ isize = i_size_read(inode);
+ if (offset >= isize)
+ return -ENXIO;
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino, offset >> 9), k) {
+ if (k.k->p.inode != inode->i_ino) {
+ next_hole = bch_next_pagecache_hole(inode,
+ offset, MAX_LFS_FILESIZE);
+ break;
+ } else if (!bkey_extent_is_data(k.k)) {
+ next_hole = bch_next_pagecache_hole(inode,
+ max(offset, bkey_start_offset(k.k) << 9),
+ k.k->p.offset << 9);
+
+ if (next_hole < k.k->p.offset << 9)
+ break;
+ } else {
+ offset = max(offset, bkey_start_offset(k.k) << 9);
+ }
+ }
+
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ if (next_hole > isize)
+ next_hole = isize;
+
+ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
+}
+
+loff_t bch_llseek(struct file *file, loff_t offset, int whence)
+{
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ case SEEK_END:
+ return generic_file_llseek(file, offset, whence);
+ case SEEK_DATA:
+ return bch_seek_data(file, offset);
+ case SEEK_HOLE:
+ return bch_seek_hole(file, offset);
+ }
+
+ return -EINVAL;
+}
diff --git a/libbcache/fs-io.h b/libbcache/fs-io.h
new file mode 100644
index 0000000..d598bc8
--- /dev/null
+++ b/libbcache/fs-io.h
@@ -0,0 +1,96 @@
+#ifndef _BCACHE_FS_IO_H
+#define _BCACHE_FS_IO_H
+
+#include "buckets.h"
+#include <linux/uio.h>
+
+int bch_set_page_dirty(struct page *);
+
+int bch_writepage(struct page *, struct writeback_control *);
+int bch_readpage(struct file *, struct page *);
+
+int bch_writepages(struct address_space *, struct writeback_control *);
+int bch_readpages(struct file *, struct address_space *,
+ struct list_head *, unsigned);
+
+int bch_write_begin(struct file *, struct address_space *, loff_t,
+ unsigned, unsigned, struct page **, void **);
+int bch_write_end(struct file *, struct address_space *, loff_t,
+ unsigned, unsigned, struct page *, void *);
+
+ssize_t bch_direct_IO(struct kiocb *, struct iov_iter *);
+
+ssize_t bch_write_iter(struct kiocb *, struct iov_iter *);
+
+int bch_fsync(struct file *, loff_t, loff_t, int);
+
+int bch_truncate(struct inode *, struct iattr *);
+long bch_fallocate_dispatch(struct file *, int, loff_t, loff_t);
+
+loff_t bch_llseek(struct file *, loff_t, int);
+
+int bch_page_mkwrite(struct vm_area_struct *, struct vm_fault *);
+void bch_invalidatepage(struct page *, unsigned int, unsigned int);
+int bch_releasepage(struct page *, gfp_t);
+int bch_migrate_page(struct address_space *, struct page *,
+ struct page *, enum migrate_mode);
+
+struct i_sectors_hook {
+ struct extent_insert_hook hook;
+ s64 sectors;
+ struct bch_inode_info *ei;
+};
+
+struct bchfs_write_op {
+ struct bch_inode_info *ei;
+ s64 sectors_added;
+ bool is_dio;
+ u64 new_i_size;
+ struct bch_write_op op;
+};
+
+struct bch_writepage_io {
+ struct closure cl;
+
+ struct bchfs_write_op op;
+
+ /* must come last: */
+ struct bch_write_bio bio;
+};
+
+extern struct bio_set *bch_writepage_bioset;
+
+struct dio_write {
+ struct closure cl;
+ struct kiocb *req;
+ struct cache_set *c;
+ long written;
+ long error;
+ loff_t offset;
+
+ struct disk_reservation res;
+
+ struct iovec *iovec;
+ struct iovec inline_vecs[UIO_FASTIOV];
+ struct iov_iter iter;
+
+ struct mm_struct *mm;
+
+ struct bchfs_write_op iop;
+
+ /* must be last: */
+ struct bch_write_bio bio;
+};
+
+extern struct bio_set *bch_dio_write_bioset;
+
+struct dio_read {
+ struct closure cl;
+ struct kiocb *req;
+ long ret;
+ struct bch_read_bio rbio;
+};
+
+extern struct bio_set *bch_dio_read_bioset;
+
+#endif /* _BCACHE_FS_IO_H */
diff --git a/libbcache/fs.c b/libbcache/fs.c
new file mode 100644
index 0000000..1f01e48
--- /dev/null
+++ b/libbcache/fs.c
@@ -0,0 +1,1506 @@
+
+#include "bcache.h"
+#include "acl.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "dirent.h"
+#include "extents.h"
+#include "fs.h"
+#include "fs-gc.h"
+#include "fs-io.h"
+#include "inode.h"
+#include "journal.h"
+#include "keylist.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/compat.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/random.h>
+#include <linux/statfs.h>
+#include <linux/xattr.h>
+
+static struct kmem_cache *bch_inode_cache;
+
+static void bch_inode_init(struct bch_inode_info *, struct bkey_s_c_inode);
+
+/*
+ * I_SIZE_DIRTY requires special handling:
+ *
+ * To the recovery code, the flag means that there is stale data past i_size
+ * that needs to be deleted; it's used for implementing atomic appends and
+ * truncates.
+ *
+ * On append, we set I_SIZE_DIRTY before doing the write, then after the write
+ * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size
+ * that exposes the data we just wrote.
+ *
+ * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting
+ * i_size to the new smaller size, then we delete the data that we just made
+ * invisible, and then we clear I_SIZE_DIRTY.
+ *
+ * Because there can be multiple appends in flight at a time, we need a refcount
+ * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero
+ * refcount means I_SIZE_DIRTY is set, zero means it's cleared.
+ *
+ * Because write_inode() can be called at any time, i_size_dirty_count means
+ * something different to the runtime code - to write_inode(), a nonzero count
+ * means "don't update i_size yet".
+ *
+ * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when
+ * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must
+ * be set explicitly.
+ */
+
+int __must_check __bch_write_inode(struct cache_set *c,
+ struct bch_inode_info *ei,
+ inode_set_fn set,
+ void *p)
+{
+ struct btree_iter iter;
+ struct inode *inode = &ei->vfs_inode;
+ struct bkey_i_inode new_inode;
+ struct bch_inode *bi;
+ u64 inum = inode->i_ino;
+ int ret;
+
+ lockdep_assert_held(&ei->update_lock);
+
+ bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(inum, 0));
+
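+ /*
+ * An update with BTREE_INSERT_ATOMIC can fail with -EINTR if it had to
+ * drop locks partway through; reread the inode and retry:
+ */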
+ do {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
+
+ if ((ret = btree_iter_err(k)))
+ goto out;
+
+ if (WARN_ONCE(k.k->type != BCH_INODE_FS,
+ "inode %llu not found when updating", inum)) {
+ bch_btree_iter_unlock(&iter);
+ return -ENOENT;
+ }
+
+ bkey_reassemble(&new_inode.k_i, k);
+ bi = &new_inode.v;
+
+ if (set) {
+ ret = set(ei, bi, p);
+ if (ret)
+ goto out;
+ }
+
+ bi->i_mode = cpu_to_le16(inode->i_mode);
+ bi->i_uid = cpu_to_le32(i_uid_read(inode));
+ bi->i_gid = cpu_to_le32(i_gid_read(inode));
+ bi->i_nlink = cpu_to_le32(inode->i_nlink);
+ bi->i_dev = cpu_to_le32(inode->i_rdev);
+ bi->i_atime = cpu_to_le64(timespec_to_ns(&inode->i_atime));
+ bi->i_mtime = cpu_to_le64(timespec_to_ns(&inode->i_mtime));
+ bi->i_ctime = cpu_to_le64(timespec_to_ns(&inode->i_ctime));
+
+ ret = bch_btree_insert_at(c, NULL, NULL, &ei->journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&iter, &new_inode.k_i));
+ } while (ret == -EINTR);
+
+ if (!ret) {
+ ei->i_size = le64_to_cpu(bi->i_size);
+ ei->i_flags = le32_to_cpu(bi->i_flags);
+ }
+out:
+ bch_btree_iter_unlock(&iter);
+
+ return ret < 0 ? ret : 0;
+}
+
+int __must_check bch_write_inode(struct cache_set *c,
+ struct bch_inode_info *ei)
+{
+ return __bch_write_inode(c, ei, NULL, NULL);
+}
+
+int bch_inc_nlink(struct cache_set *c, struct bch_inode_info *ei)
+{
+ int ret;
+
+ mutex_lock(&ei->update_lock);
+ inc_nlink(&ei->vfs_inode);
+ ret = bch_write_inode(c, ei);
+ mutex_unlock(&ei->update_lock);
+
+ return ret;
+}
+
+int bch_dec_nlink(struct cache_set *c, struct bch_inode_info *ei)
+{
+ int ret;
+
+ mutex_lock(&ei->update_lock);
+ drop_nlink(&ei->vfs_inode);
+ ret = bch_write_inode(c, ei);
+ mutex_unlock(&ei->update_lock);
+
+ return ret;
+}
+
+static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum)
+{
+ struct cache_set *c = sb->s_fs_info;
+ struct inode *inode;
+ struct bch_inode_info *ei;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ pr_debug("inum %llu", inum);
+
+ inode = iget_locked(sb, inum);
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0));
+ k = bch_btree_iter_peek_with_holes(&iter);
+
+ if ((ret = btree_iter_err(k)) || k.k->type != BCH_INODE_FS) {
+ ret = bch_btree_iter_unlock(&iter);
+ iget_failed(inode);
+ return ERR_PTR(ret ?: -ENOENT);
+ }
+
+ ei = to_bch_ei(inode);
+ bch_inode_init(ei, bkey_s_c_to_inode(k));
+
+ ei->journal_seq = bch_inode_journal_seq(&c->journal, inum);
+
+ unlock_new_inode(inode);
+
+ bch_btree_iter_unlock(&iter);
+
+ return inode;
+}
+
+static struct inode *bch_vfs_inode_create(struct cache_set *c,
+ struct inode *parent,
+ umode_t mode, dev_t rdev)
+{
+ struct inode *inode;
+ struct posix_acl *default_acl = NULL, *acl = NULL;
+ struct bch_inode_info *ei;
+ struct bch_inode *bi;
+ struct bkey_i_inode bkey_inode;
+ struct timespec ts = CURRENT_TIME;
+ s64 now = timespec_to_ns(&ts);
+ int ret;
+
+ inode = new_inode(parent->i_sb);
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOMEM);
+
+ inode_init_owner(inode, parent, mode);
+
+ ret = posix_acl_create(parent, &inode->i_mode, &default_acl, &acl);
+ if (ret) {
+ make_bad_inode(inode);
+ goto err;
+ }
+
+ ei = to_bch_ei(inode);
+
+ bi = &bkey_inode_init(&bkey_inode.k_i)->v;
+ bi->i_uid = cpu_to_le32(i_uid_read(inode));
+ bi->i_gid = cpu_to_le32(i_gid_read(inode));
+
+ bi->i_mode = cpu_to_le16(inode->i_mode);
+ bi->i_dev = cpu_to_le32(rdev);
+ bi->i_atime = cpu_to_le64(now);
+ bi->i_mtime = cpu_to_le64(now);
+ bi->i_ctime = cpu_to_le64(now);
+ bi->i_nlink = cpu_to_le32(S_ISDIR(mode) ? 2 : 1);
+
+ get_random_bytes(&bi->i_hash_seed, sizeof(bi->i_hash_seed));
+ SET_INODE_STR_HASH_TYPE(bi, c->sb.str_hash_type);
+
+ ret = bch_inode_create(c, &bkey_inode.k_i,
+ BLOCKDEV_INODE_MAX, 0,
+ &c->unused_inode_hint);
+ if (unlikely(ret)) {
+ /*
+ * indicate to bch_evict_inode that the inode was never actually
+ * created:
+ */
+ make_bad_inode(inode);
+ goto err;
+ }
+
+ bch_inode_init(ei, inode_i_to_s_c(&bkey_inode));
+
+ if (default_acl) {
+ ret = bch_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ if (acl) {
+ ret = bch_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ insert_inode_hash(inode);
+ atomic_long_inc(&c->nr_inodes);
+out:
+ posix_acl_release(default_acl);
+ posix_acl_release(acl);
+ return inode;
+err:
+ clear_nlink(inode);
+ iput(inode);
+ inode = ERR_PTR(ret);
+ goto out;
+}
+
+static int bch_vfs_dirent_create(struct cache_set *c, struct inode *dir,
+ u8 type, const struct qstr *name,
+ struct inode *dst)
+{
+ int ret;
+
+ ret = bch_dirent_create(c, dir, type, name, dst->i_ino);
+ if (unlikely(ret))
+ return ret;
+
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ mark_inode_dirty_sync(dir);
+ return 0;
+}
+
+static int __bch_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ struct bch_inode_info *dir_ei = to_bch_ei(dir);
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode;
+ struct bch_inode_info *ei;
+ int ret;
+
+ inode = bch_vfs_inode_create(c, dir, mode, rdev);
+ if (unlikely(IS_ERR(inode)))
+ return PTR_ERR(inode);
+
+ ei = to_bch_ei(inode);
+
+ ret = bch_vfs_dirent_create(c, dir, mode_to_type(mode),
+ &dentry->d_name, inode);
+ if (unlikely(ret)) {
+ clear_nlink(inode);
+ iput(inode);
+ return ret;
+ }
+
+ if (dir_ei->journal_seq > ei->journal_seq)
+ ei->journal_seq = dir_ei->journal_seq;
+
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+/* methods */
+
+static struct dentry *bch_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode = NULL;
+ u64 inum;
+
+ inum = bch_dirent_lookup(c, dir, &dentry->d_name);
+
+ if (inum)
+ inode = bch_vfs_inode_get(dir->i_sb, inum);
+
+ return d_splice_alias(inode, dentry);
+}
+
+static int bch_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool excl)
+{
+ return __bch_create(dir, dentry, mode|S_IFREG, 0);
+}
+
+static int bch_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode = old_dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ int ret;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ inode->i_ctime = CURRENT_TIME;
+
+ ret = bch_inc_nlink(c, ei);
+ if (ret)
+ return ret;
+
+ ihold(inode);
+
+ ret = bch_vfs_dirent_create(c, dir, mode_to_type(inode->i_mode),
+ &dentry->d_name, inode);
+ if (unlikely(ret)) {
+ bch_dec_nlink(c, ei);
+ iput(inode);
+ return ret;
+ }
+
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+static int bch_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct bch_inode_info *dir_ei = to_bch_ei(dir);
+ struct inode *inode = dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ int ret;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ ret = bch_dirent_delete(c, dir, &dentry->d_name);
+ if (ret)
+ return ret;
+
+ if (dir_ei->journal_seq > ei->journal_seq)
+ ei->journal_seq = dir_ei->journal_seq;
+
+ inode->i_ctime = dir->i_ctime;
+
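+ /*
+ * Directory link counts: the victim directory loses the link for its
+ * own "." and the parent loses the link from the victim's "..":
+ */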
+ if (S_ISDIR(inode->i_mode)) {
+ bch_dec_nlink(c, dir_ei);
+ drop_nlink(inode);
+ }
+
+ drop_nlink(inode);
+ if (inode->i_nlink) {
+ mutex_lock(&ei->update_lock);
+ ret = bch_write_inode(c, ei);
+ mutex_unlock(&ei->update_lock);
+ }
+
+ return 0;
+}
+
+static int bch_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode;
+ struct bch_inode_info *ei, *dir_ei = to_bch_ei(dir);
+ int ret;
+
+ inode = bch_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0);
+ if (unlikely(IS_ERR(inode)))
+ return PTR_ERR(inode);
+
+ ei = to_bch_ei(inode);
+
+ inode_lock(inode);
+ ret = page_symlink(inode, symname, strlen(symname) + 1);
+ inode_unlock(inode);
+
+ if (unlikely(ret))
+ goto err;
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+ if (unlikely(ret))
+ goto err;
+
+ /* XXX: racy */
+ if (dir_ei->journal_seq < ei->journal_seq)
+ dir_ei->journal_seq = ei->journal_seq;
+
+ ret = bch_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name, inode);
+ if (unlikely(ret))
+ goto err;
+
+ d_instantiate(dentry, inode);
+ return 0;
+err:
+ clear_nlink(inode);
+ iput(inode);
+ return ret;
+}
+
+static int bch_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ int ret;
+
+ lockdep_assert_held(&dir->i_rwsem);
+
+ ret = __bch_create(dir, dentry, mode|S_IFDIR, 0);
+ if (unlikely(ret))
+ return ret;
+
+ bch_inc_nlink(c, to_bch_ei(dir));
+
+ return 0;
+}
+
+static int bch_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode = dentry->d_inode;
+
+ if (bch_empty_dir(c, inode->i_ino))
+ return -ENOTEMPTY;
+
+ return bch_unlink(dir, dentry);
+}
+
+static int bch_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ return __bch_create(dir, dentry, mode, rdev);
+}
+
+static int bch_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct cache_set *c = old_dir->i_sb->s_fs_info;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(old_inode);
+ struct inode *new_inode = new_dentry->d_inode;
+ struct timespec now = CURRENT_TIME;
+ int ret;
+
+ lockdep_assert_held(&old_dir->i_rwsem);
+ lockdep_assert_held(&new_dir->i_rwsem);
+
+ if (new_inode)
+ filemap_write_and_wait_range(old_inode->i_mapping,
+ 0, LLONG_MAX);
+
+ if (new_inode && S_ISDIR(old_inode->i_mode)) {
+ lockdep_assert_held(&new_inode->i_rwsem);
+
+ if (!S_ISDIR(new_inode->i_mode))
+ return -ENOTDIR;
+
+ if (bch_empty_dir(c, new_inode->i_ino))
+ return -ENOTEMPTY;
+
+ ret = bch_dirent_rename(c,
+ old_dir, &old_dentry->d_name,
+ new_dir, &new_dentry->d_name,
+ &ei->journal_seq, BCH_RENAME_OVERWRITE);
+ if (unlikely(ret))
+ return ret;
+
+ clear_nlink(new_inode);
+ bch_dec_nlink(c, to_bch_ei(old_dir));
+ } else if (new_inode) {
+ lockdep_assert_held(&new_inode->i_rwsem);
+
+ ret = bch_dirent_rename(c,
+ old_dir, &old_dentry->d_name,
+ new_dir, &new_dentry->d_name,
+ &ei->journal_seq, BCH_RENAME_OVERWRITE);
+ if (unlikely(ret))
+ return ret;
+
+ new_inode->i_ctime = now;
+ bch_dec_nlink(c, to_bch_ei(new_inode));
+ } else if (S_ISDIR(old_inode->i_mode)) {
+ ret = bch_dirent_rename(c,
+ old_dir, &old_dentry->d_name,
+ new_dir, &new_dentry->d_name,
+ &ei->journal_seq, BCH_RENAME);
+ if (unlikely(ret))
+ return ret;
+
+ bch_inc_nlink(c, to_bch_ei(new_dir));
+ bch_dec_nlink(c, to_bch_ei(old_dir));
+ } else {
+ ret = bch_dirent_rename(c,
+ old_dir, &old_dentry->d_name,
+ new_dir, &new_dentry->d_name,
+ &ei->journal_seq, BCH_RENAME);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ old_dir->i_ctime = old_dir->i_mtime = now;
+ new_dir->i_ctime = new_dir->i_mtime = now;
+ mark_inode_dirty_sync(old_dir);
+ mark_inode_dirty_sync(new_dir);
+
+ old_inode->i_ctime = now;
+ mark_inode_dirty_sync(old_inode);
+
+ return 0;
+}
+
+static int bch_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct cache_set *c = old_dir->i_sb->s_fs_info;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(old_inode);
+ struct timespec now = CURRENT_TIME;
+ int ret;
+
+ ret = bch_dirent_rename(c,
+ old_dir, &old_dentry->d_name,
+ new_dir, &new_dentry->d_name,
+ &ei->journal_seq, BCH_RENAME_EXCHANGE);
+ if (unlikely(ret))
+ return ret;
+
+ if (S_ISDIR(old_inode->i_mode) !=
+ S_ISDIR(new_inode->i_mode)) {
+ if (S_ISDIR(old_inode->i_mode)) {
+ bch_inc_nlink(c, to_bch_ei(new_dir));
+ bch_dec_nlink(c, to_bch_ei(old_dir));
+ } else {
+ bch_dec_nlink(c, to_bch_ei(new_dir));
+ bch_inc_nlink(c, to_bch_ei(old_dir));
+ }
+ }
+
+ old_dir->i_ctime = old_dir->i_mtime = now;
+ new_dir->i_ctime = new_dir->i_mtime = now;
+ mark_inode_dirty_sync(old_dir);
+ mark_inode_dirty_sync(new_dir);
+
+ old_inode->i_ctime = now;
+ new_inode->i_ctime = now;
+ mark_inode_dirty_sync(old_inode);
+ mark_inode_dirty_sync(new_inode);
+
+ return 0;
+}
+
+static int bch_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned flags)
+{
+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE)
+ return bch_rename_exchange(old_dir, old_dentry,
+ new_dir, new_dentry);
+
+ return bch_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+static int bch_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int ret = 0;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ pr_debug("i_size was %llu update has %llu",
+ inode->i_size, iattr->ia_size);
+
+ ret = setattr_prepare(dentry, iattr);
+ if (ret)
+ return ret;
+
+ if (iattr->ia_valid & ATTR_SIZE) {
+ ret = bch_truncate(inode, iattr);
+ } else {
+ mutex_lock(&ei->update_lock);
+ setattr_copy(inode, iattr);
+ ret = bch_write_inode(c, ei);
+ mutex_unlock(&ei->update_lock);
+ }
+
+ if (unlikely(ret))
+ return ret;
+
+ if (iattr->ia_valid & ATTR_MODE)
+ ret = posix_acl_chmod(inode, inode->i_mode);
+
+ return ret;
+}
+
+static int bch_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode;
+
+ /* XXX: i_nlink should be 0? */
+ inode = bch_vfs_inode_create(c, dir, mode, 0);
+ if (unlikely(IS_ERR(inode)))
+ return PTR_ERR(inode);
+
+ d_tmpfile(dentry, inode);
+ return 0;
+}
+
+static int bch_fill_extent(struct fiemap_extent_info *info,
+ const struct bkey_i *k, unsigned flags)
+{
+ if (bkey_extent_is_data(&k->k)) {
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+ int ret;
+
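+ /*
+ * An extent may have multiple replicas; report one fiemap extent
+ * per pointer:
+ */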
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ int flags2 = 0;
+ u64 offset = ptr->offset;
+
+ if (crc_compression_type(crc))
+ flags2 |= FIEMAP_EXTENT_ENCODED;
+ else
+ offset += crc_offset(crc);
+
+ if ((offset & (PAGE_SECTORS - 1)) ||
+ (e.k->size & (PAGE_SECTORS - 1)))
+ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
+
+ ret = fiemap_fill_next_extent(info,
+ bkey_start_offset(e.k) << 9,
+ offset << 9,
+ e.k->size << 9, flags|flags2);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+ } else if (k->k.type == BCH_RESERVATION) {
+ return fiemap_fill_next_extent(info,
+ bkey_start_offset(&k->k) << 9,
+ 0, k->k.size << 9,
+ flags|
+ FIEMAP_EXTENT_DELALLOC|
+ FIEMAP_EXTENT_UNWRITTEN);
+ } else {
+ BUG();
+ }
+}
+
+static int bch_fiemap(struct inode *inode, struct fiemap_extent_info *info,
+ u64 start, u64 len)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ BKEY_PADDED(k) tmp;
+ bool have_extent = false;
+ int ret = 0;
+
+ if (start + len < start)
+ return -EINVAL;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino, start >> 9), k)
+ if (bkey_extent_is_data(k.k) ||
+ k.k->type == BCH_RESERVATION) {
+ if (bkey_cmp(bkey_start_pos(k.k),
+ POS(inode->i_ino, (start + len) >> 9)) >= 0)
+ break;
+
+ if (have_extent) {
+ ret = bch_fill_extent(info, &tmp.k, 0);
+ if (ret)
+ goto out;
+ }
+
+ bkey_reassemble(&tmp.k, k);
+ have_extent = true;
+ }
+
+ if (have_extent)
+ ret = bch_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
+out:
+ bch_btree_iter_unlock(&iter);
+ return ret < 0 ? ret : 0;
+}
+
+static const struct vm_operations_struct bch_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = bch_page_mkwrite,
+};
+
+static int bch_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+
+ vma->vm_ops = &bch_vm_ops;
+ return 0;
+}
+
+/* Inode flags: */
+
+static const unsigned bch_inode_flags_to_vfs_flags_map[] = {
+ [__BCH_INODE_SYNC] = S_SYNC,
+ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
+ [__BCH_INODE_APPEND] = S_APPEND,
+ [__BCH_INODE_NOATIME] = S_NOATIME,
+};
+
+static const unsigned bch_inode_flags_to_user_flags_map[] = {
+ [__BCH_INODE_SYNC] = FS_SYNC_FL,
+ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
+ [__BCH_INODE_APPEND] = FS_APPEND_FL,
+ [__BCH_INODE_NODUMP] = FS_NODUMP_FL,
+ [__BCH_INODE_NOATIME] = FS_NOATIME_FL,
+};
+
+/* Set VFS inode flags from bcache inode: */
+static void bch_inode_flags_to_vfs(struct inode *inode)
+{
+ unsigned i, flags = to_bch_ei(inode)->i_flags;
+
+ for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_vfs_flags_map); i++)
+ if (flags & (1 << i))
+ inode->i_flags |= bch_inode_flags_to_vfs_flags_map[i];
+ else
+ inode->i_flags &= ~bch_inode_flags_to_vfs_flags_map[i];
+}
+
+/* Get FS_IOC_GETFLAGS flags from bcache inode: */
+static unsigned bch_inode_flags_to_user_flags(unsigned flags)
+{
+ unsigned i, ret = 0;
+
+ for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_user_flags_map); i++)
+ if (flags & (1 << i))
+ ret |= bch_inode_flags_to_user_flags_map[i];
+
+ return ret;
+}
+
+static int bch_inode_user_flags_set(struct bch_inode_info *ei,
+ struct bch_inode *bi,
+ void *p)
+{
+ /*
+ * We're relying on btree locking here for exclusion with other ioctl
+ * calls - use the flags in the btree (@bi), not ei->i_flags:
+ */
+ unsigned bch_flags = le32_to_cpu(bi->i_flags);
+ unsigned oldflags = bch_inode_flags_to_user_flags(bch_flags);
+ unsigned newflags = *((unsigned *) p);
+ unsigned i;
+
+ if (((newflags ^ oldflags) & (FS_APPEND_FL|FS_IMMUTABLE_FL)) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_user_flags_map); i++) {
+ if (newflags & bch_inode_flags_to_user_flags_map[i])
+ bch_flags |= (1 << i);
+ else
+ bch_flags &= ~(1 << i);
+
+ newflags &= ~bch_inode_flags_to_user_flags_map[i];
+ oldflags &= ~bch_inode_flags_to_user_flags_map[i];
+ }
+
+ if (oldflags != newflags)
+ return -EOPNOTSUPP;
+
+ bi->i_flags = cpu_to_le32(bch_flags);
+ ei->vfs_inode.i_ctime = CURRENT_TIME;
+
+ return 0;
+}
+
+#define FS_IOC_GOINGDOWN _IOR ('X', 125, __u32)
+
+static long bch_fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ struct cache_set *c = sb->s_fs_info;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ unsigned flags;
+ int ret;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ return put_user(bch_inode_flags_to_user_flags(ei->i_flags),
+ (int __user *) arg);
+
+ case FS_IOC_SETFLAGS: {
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ if (!inode_owner_or_capable(inode)) {
+ ret = -EACCES;
+ goto setflags_out;
+ }
+
+ if (get_user(flags, (int __user *) arg)) {
+ ret = -EFAULT;
+ goto setflags_out;
+ }
+
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ (flags & (FS_NODUMP_FL|FS_NOATIME_FL)) != flags) {
+ ret = -EINVAL;
+ goto setflags_out;
+ }
+
+ inode_lock(inode);
+
+ mutex_lock(&ei->update_lock);
+ ret = __bch_write_inode(c, ei, bch_inode_user_flags_set, &flags);
+ mutex_unlock(&ei->update_lock);
+
+ if (!ret)
+ bch_inode_flags_to_vfs(inode);
+
+ inode_unlock(inode);
+setflags_out:
+ mnt_drop_write_file(filp);
+ return ret;
+ }
+
+ case FS_IOC_GETVERSION:
+ return -ENOTTY;
+ case FS_IOC_SETVERSION:
+ return -ENOTTY;
+
+ case FS_IOC_GOINGDOWN:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ down_write(&sb->s_umount);
+ sb->s_flags |= MS_RDONLY;
+ bch_cache_set_emergency_read_only(c);
+ up_write(&sb->s_umount);
+ return 0;
+
+ default:
+ return bch_cache_set_ioctl(c, cmd, (void __user *) arg);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long bch_compat_fs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ /* These are just misnamed, they actually get/put from/to user an int */
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return bch_fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+/* Directories: */
+
+static loff_t bch_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ return generic_file_llseek_size(file, offset, whence,
+ S64_MAX, S64_MAX);
+}
+
+static int bch_vfs_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ return bch_readdir(c, file, ctx);
+}
+
+static const struct file_operations bch_file_operations = {
+ .llseek = bch_llseek,
+ .read_iter = generic_file_read_iter,
+ .write_iter = bch_write_iter,
+ .mmap = bch_mmap,
+ .open = generic_file_open,
+ .fsync = bch_fsync,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .fallocate = bch_fallocate_dispatch,
+ .unlocked_ioctl = bch_fs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = bch_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_file_inode_operations = {
+ .setattr = bch_setattr,
+ .fiemap = bch_fiemap,
+ .listxattr = bch_xattr_list,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static const struct inode_operations bch_dir_inode_operations = {
+ .lookup = bch_lookup,
+ .create = bch_create,
+ .link = bch_link,
+ .unlink = bch_unlink,
+ .symlink = bch_symlink,
+ .mkdir = bch_mkdir,
+ .rmdir = bch_rmdir,
+ .mknod = bch_mknod,
+ .rename = bch_rename2,
+ .setattr = bch_setattr,
+ .tmpfile = bch_tmpfile,
+ .listxattr = bch_xattr_list,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static const struct file_operations bch_dir_file_operations = {
+ .llseek = bch_dir_llseek,
+ .read = generic_read_dir,
+ .iterate = bch_vfs_readdir,
+ .fsync = bch_fsync,
+ .unlocked_ioctl = bch_fs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = bch_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .get_link = page_get_link,
+ .setattr = bch_setattr,
+ .listxattr = bch_xattr_list,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static const struct inode_operations bch_special_inode_operations = {
+ .setattr = bch_setattr,
+ .listxattr = bch_xattr_list,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static const struct address_space_operations bch_address_space_operations = {
+ .writepage = bch_writepage,
+ .readpage = bch_readpage,
+ .writepages = bch_writepages,
+ .readpages = bch_readpages,
+ .set_page_dirty = bch_set_page_dirty,
+ .write_begin = bch_write_begin,
+ .write_end = bch_write_end,
+ .invalidatepage = bch_invalidatepage,
+ .releasepage = bch_releasepage,
+ .direct_IO = bch_direct_IO,
+#ifdef CONFIG_MIGRATION
+ .migratepage = bch_migrate_page,
+#endif
+ .error_remove_page = generic_error_remove_page,
+};
+
+static void bch_inode_init(struct bch_inode_info *ei,
+ struct bkey_s_c_inode bkey_inode)
+{
+ struct inode *inode = &ei->vfs_inode;
+ const struct bch_inode *bi = bkey_inode.v;
+
+ pr_debug("init inode %llu with mode %o",
+ bkey_inode.k->p.inode, bi->i_mode);
+
+ ei->i_flags = le32_to_cpu(bi->i_flags);
+ ei->i_size = le64_to_cpu(bi->i_size);
+
+ inode->i_mode = le16_to_cpu(bi->i_mode);
+ i_uid_write(inode, le32_to_cpu(bi->i_uid));
+ i_gid_write(inode, le32_to_cpu(bi->i_gid));
+
+ atomic64_set(&ei->i_sectors, le64_to_cpu(bi->i_sectors));
+ inode->i_blocks = atomic64_read(&ei->i_sectors);
+
+ inode->i_ino = bkey_inode.k->p.inode;
+ set_nlink(inode, le32_to_cpu(bi->i_nlink));
+ inode->i_rdev = le32_to_cpu(bi->i_dev);
+ inode->i_size = le64_to_cpu(bi->i_size);
+ inode->i_atime = ns_to_timespec(le64_to_cpu(bi->i_atime));
+ inode->i_mtime = ns_to_timespec(le64_to_cpu(bi->i_mtime));
+ inode->i_ctime = ns_to_timespec(le64_to_cpu(bi->i_ctime));
+ bch_inode_flags_to_vfs(inode);
+
+ ei->str_hash.seed = le64_to_cpu(bi->i_hash_seed);
+ ei->str_hash.type = INODE_STR_HASH_TYPE(bi);
+
+ inode->i_mapping->a_ops = &bch_address_space_operations;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &bch_file_inode_operations;
+ inode->i_fop = &bch_file_operations;
+ break;
+ case S_IFDIR:
+ inode->i_op = &bch_dir_inode_operations;
+ inode->i_fop = &bch_dir_file_operations;
+ break;
+ case S_IFLNK:
+ inode_nohighmem(inode);
+ inode->i_op = &bch_symlink_inode_operations;
+ break;
+ default:
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ inode->i_op = &bch_special_inode_operations;
+ break;
+ }
+}
+
+static struct inode *bch_alloc_inode(struct super_block *sb)
+{
+ struct bch_inode_info *ei;
+
+ ei = kmem_cache_alloc(bch_inode_cache, GFP_NOFS);
+ if (!ei)
+ return NULL;
+
+ pr_debug("allocated %p", &ei->vfs_inode);
+
+ inode_init_once(&ei->vfs_inode);
+ mutex_init(&ei->update_lock);
+ ei->journal_seq = 0;
+ atomic_long_set(&ei->i_size_dirty_count, 0);
+ atomic_long_set(&ei->i_sectors_dirty_count, 0);
+
+ return &ei->vfs_inode;
+}
+
+static void bch_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+
+ kmem_cache_free(bch_inode_cache, to_bch_ei(inode));
+}
+
+static void bch_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, bch_i_callback);
+}
+
+static int bch_vfs_write_inode(struct inode *inode,
+ struct writeback_control *wbc)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ int ret;
+
+ mutex_lock(&ei->update_lock);
+ ret = bch_write_inode(c, ei);
+ mutex_unlock(&ei->update_lock);
+
+ if (c->opts.journal_flush_disabled)
+ return ret;
+
+ if (!ret && wbc->sync_mode == WB_SYNC_ALL)
+ ret = bch_journal_flush_seq(&c->journal, ei->journal_seq);
+
+ return ret;
+}
+
+static void bch_evict_inode(struct inode *inode)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ truncate_inode_pages_final(&inode->i_data);
+
+ if (!bch_journal_error(&c->journal) && !is_bad_inode(inode)) {
+ struct bch_inode_info *ei = to_bch_ei(inode);
+
+ /* XXX - we want to check this stuff iff there weren't IO errors: */
+ BUG_ON(atomic_long_read(&ei->i_sectors_dirty_count));
+ BUG_ON(atomic64_read(&ei->i_sectors) != inode->i_blocks);
+ }
+
+ clear_inode(inode);
+
+ if (!inode->i_nlink && !is_bad_inode(inode)) {
+ bch_inode_rm(c, inode->i_ino);
+ atomic_long_dec(&c->nr_inodes);
+ }
+}
+
+static int bch_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct cache_set *c = sb->s_fs_info;
+ u64 fsid;
+
+ buf->f_type = BCACHE_STATFS_MAGIC;
+ buf->f_bsize = sb->s_blocksize;
+ buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT;
+ buf->f_bfree = (c->capacity - cache_set_sectors_used(c)) >> PAGE_SECTOR_SHIFT;
+ buf->f_bavail = buf->f_bfree;
+ buf->f_files = atomic_long_read(&c->nr_inodes);
+ buf->f_ffree = U64_MAX;
+
+ fsid = le64_to_cpup((void *) c->disk_sb.user_uuid.b) ^
+ le64_to_cpup((void *) c->disk_sb.user_uuid.b + sizeof(u64));
+ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_namelen = NAME_MAX;
+
+ return 0;
+}
+
+static int bch_sync_fs(struct super_block *sb, int wait)
+{
+ struct cache_set *c = sb->s_fs_info;
+
+ if (!wait) {
+ bch_journal_flush_async(&c->journal, NULL);
+ return 0;
+ }
+
+ return bch_journal_flush(&c->journal);
+}
+
+static struct cache_set *bdev_to_cache_set(struct block_device *bdev)
+{
+ struct cache_set *c;
+ struct cache *ca;
+ unsigned i;
+
+ rcu_read_lock();
+
+ list_for_each_entry(c, &bch_cache_sets, list)
+ for_each_cache_rcu(ca, c, i)
+ if (ca->disk_sb.bdev == bdev) {
+ rcu_read_unlock();
+ return c;
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
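+/*
+ * Open a cache set given a colon separated list of block device paths: first
+ * try to register the devices as a new cache set; if that fails (e.g. they
+ * are already open), fall back to looking up the running cache set that all
+ * of the devices belong to, and take a ref on it.
+ */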
+static struct cache_set *bch_open_as_blockdevs(const char *_dev_name,
+ struct cache_set_opts opts)
+{
+ size_t nr_devs = 0, i = 0;
+ char *dev_name, *s, **devs;
+ struct cache_set *c = NULL;
+ const char *err;
+
+ dev_name = kstrdup(_dev_name, GFP_KERNEL);
+ if (!dev_name)
+ return NULL;
+
+ for (s = dev_name; s; s = strchr(s + 1, ':'))
+ nr_devs++;
+
+ devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
+ if (!devs)
+ goto err;
+
+ for (i = 0, s = dev_name;
+ s;
+ (s = strchr(s, ':')) && (*s++ = '\0'))
+ devs[i++] = s;
+
+ err = bch_register_cache_set(devs, nr_devs, opts, &c);
+ if (err) {
+ /*
+ * Already open?
+ * Look up each block device, make sure they all belong to a
+ * cache set and they all belong to the _same_ cache set
+ */
+
+ mutex_lock(&bch_register_lock);
+
+ for (i = 0; i < nr_devs; i++) {
+ struct block_device *bdev = lookup_bdev(devs[i]);
+ struct cache_set *c2;
+
+ if (IS_ERR(bdev))
+ goto err_unlock;
+
+ c2 = bdev_to_cache_set(bdev);
+ bdput(bdev);
+
+ if (!c)
+ c = c2;
+
+ if (c != c2)
+ goto err_unlock;
+ }
+
+ if (!c)
+ goto err_unlock;
+
+ if (!test_bit(CACHE_SET_RUNNING, &c->flags)) {
+ err = "incomplete cache set";
+ c = NULL;
+ goto err_unlock;
+ }
+
+ closure_get(&c->cl);
+ mutex_unlock(&bch_register_lock);
+ }
+
+ set_bit(CACHE_SET_BDEV_MOUNTED, &c->flags);
+err:
+ kfree(devs);
+ kfree(dev_name);
+
+ return c;
+err_unlock:
+ mutex_unlock(&bch_register_lock);
+ pr_err("register_cache_set err %s", err);
+ goto err;
+}
+
+static int bch_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct cache_set *c = sb->s_fs_info;
+ struct cache_set_opts opts;
+ int ret;
+
+ ret = bch_parse_options(&opts, *flags, data);
+ if (ret)
+ return ret;
+
+ mutex_lock(&bch_register_lock);
+
+ if (opts.read_only >= 0 &&
+ opts.read_only != c->opts.read_only) {
+ const char *err = NULL;
+
+ if (opts.read_only) {
+ bch_cache_set_read_only_sync(c);
+
+ sb->s_flags |= MS_RDONLY;
+ } else {
+ err = bch_cache_set_read_write(c);
+ if (err) {
+ bch_err(c, "error going rw: %s", err);
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ sb->s_flags &= ~MS_RDONLY;
+ }
+
+ c->opts.read_only = opts.read_only;
+ }
+
+ if (opts.errors >= 0)
+ c->opts.errors = opts.errors;
+
+unlock:
+ mutex_unlock(&bch_register_lock);
+
+ return ret;
+}
+
+static const struct super_operations bch_super_operations = {
+ .alloc_inode = bch_alloc_inode,
+ .destroy_inode = bch_destroy_inode,
+ .write_inode = bch_vfs_write_inode,
+ .evict_inode = bch_evict_inode,
+ .sync_fs = bch_sync_fs,
+ .statfs = bch_statfs,
+ .show_options = generic_show_options,
+ .remount_fs = bch_remount,
+#if 0
+ .put_super = bch_put_super,
+ .freeze_fs = bch_freeze,
+ .unfreeze_fs = bch_unfreeze,
+#endif
+};
+
+static int bch_test_super(struct super_block *s, void *data)
+{
+ return s->s_fs_info == data;
+}
+
+static int bch_set_super(struct super_block *s, void *data)
+{
+ s->s_fs_info = data;
+ return 0;
+}
+
+static struct dentry *bch_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ struct cache_set *c;
+ struct cache *ca;
+ struct super_block *sb;
+ struct inode *inode;
+ struct cache_set_opts opts;
+ unsigned i;
+ int ret;
+
+ ret = bch_parse_options(&opts, flags, data);
+ if (ret)
+ return ERR_PTR(ret);
+
+ c = bch_open_as_blockdevs(dev_name, opts);
+ if (!c)
+ return ERR_PTR(-ENOENT);
+
+ sb = sget(fs_type, bch_test_super, bch_set_super, flags|MS_NOSEC, c);
+ if (IS_ERR(sb)) {
+ closure_put(&c->cl);
+ return ERR_CAST(sb);
+ }
+
+ BUG_ON(sb->s_fs_info != c);
+
+ if (sb->s_root) {
+ closure_put(&c->cl);
+
+ if ((flags ^ sb->s_flags) & MS_RDONLY) {
+ ret = -EBUSY;
+ goto err_put_super;
+ }
+ goto out;
+ }
+
+ /* XXX: blocksize */
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &bch_super_operations;
+ sb->s_xattr = bch_xattr_handlers;
+ sb->s_magic = BCACHE_STATFS_MAGIC;
+ sb->s_time_gran = 1;
+ c->vfs_sb = sb;
+ sb->s_bdi = &c->bdi;
+
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i) {
+ struct block_device *bdev = ca->disk_sb.bdev;
+
+ BUILD_BUG_ON(sizeof(sb->s_id) < BDEVNAME_SIZE);
+
+ bdevname(bdev, sb->s_id);
+
+ /* XXX: do we even need s_bdev? */
+ sb->s_bdev = bdev;
+ sb->s_dev = bdev->bd_dev;
+ break;
+ }
+ rcu_read_unlock();
+
+ if (opts.posix_acl < 0)
+ sb->s_flags |= MS_POSIXACL;
+ else
+ sb->s_flags |= opts.posix_acl ? MS_POSIXACL : 0;
+
+ inode = bch_vfs_inode_get(sb, BCACHE_ROOT_INO);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto err_put_super;
+ }
+
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root) {
+ ret = -ENOMEM;
+ goto err_put_super;
+ }
+
+ sb->s_flags |= MS_ACTIVE;
+out:
+ return dget(sb->s_root);
+
+err_put_super:
+ deactivate_locked_super(sb);
+ return ERR_PTR(ret);
+}
+
+static void bch_kill_sb(struct super_block *sb)
+{
+ struct cache_set *c = sb->s_fs_info;
+
+ generic_shutdown_super(sb);
+
+ if (test_bit(CACHE_SET_BDEV_MOUNTED, &c->flags)) {
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ c->stop_completion = &complete;
+ bch_cache_set_stop(c);
+ closure_put(&c->cl);
+
+ /* Killable? */
+ wait_for_completion(&complete);
+ } else
+ closure_put(&c->cl);
+}
+
+static struct file_system_type bcache_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "bcache",
+ .mount = bch_mount,
+ .kill_sb = bch_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+MODULE_ALIAS_FS("bcache");
+
+void bch_fs_exit(void)
+{
+ unregister_filesystem(&bcache_fs_type);
+ if (bch_dio_write_bioset)
+ bioset_free(bch_dio_write_bioset);
+ if (bch_dio_read_bioset)
+ bioset_free(bch_dio_read_bioset);
+ if (bch_writepage_bioset)
+ bioset_free(bch_writepage_bioset);
+ if (bch_inode_cache)
+ kmem_cache_destroy(bch_inode_cache);
+}
+
+int __init bch_fs_init(void)
+{
+ int ret = -ENOMEM;
+
+ bch_inode_cache = KMEM_CACHE(bch_inode_info, 0);
+ if (!bch_inode_cache)
+ goto err;
+
+ bch_writepage_bioset =
+ bioset_create(4, offsetof(struct bch_writepage_io, bio.bio));
+ if (!bch_writepage_bioset)
+ goto err;
+
+ bch_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio));
+ if (!bch_dio_read_bioset)
+ goto err;
+
+ bch_dio_write_bioset = bioset_create(4, offsetof(struct dio_write, bio.bio));
+ if (!bch_dio_write_bioset)
+ goto err;
+
+ ret = register_filesystem(&bcache_fs_type);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ bch_fs_exit();
+ return ret;
+}
diff --git a/libbcache/fs.h b/libbcache/fs.h
new file mode 100644
index 0000000..c982024
--- /dev/null
+++ b/libbcache/fs.h
@@ -0,0 +1,49 @@
+#ifndef _BCACHE_FS_H
+#define _BCACHE_FS_H
+
+#include "str_hash.h"
+
+#include <linux/seqlock.h>
+
+struct bch_inode_info {
+ struct inode vfs_inode;
+
+ struct mutex update_lock;
+ u64 journal_seq;
+
+ atomic_long_t i_size_dirty_count;
+
+ /*
+	 * these are updated whenever we update the inode in the btree -
+	 * e.g. for fsync
+ */
+ u64 i_size;
+ u32 i_flags;
+
+ atomic_long_t i_sectors_dirty_count;
+ atomic64_t i_sectors;
+
+ struct bch_hash_info str_hash;
+};
+
+#define to_bch_ei(_inode) \
+ container_of(_inode, struct bch_inode_info, vfs_inode)
+
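+/* the S_IFMT file type field of i_mode, shifted down (matches the DT_* values) */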
+static inline u8 mode_to_type(umode_t mode)
+{
+ return (mode >> 12) & 15;
+}
+
+/* returns 0 if we want to do the update; a nonzero error is passed back up */
+typedef int (*inode_set_fn)(struct bch_inode_info *,
+ struct bch_inode *, void *);
+
+int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *,
+ inode_set_fn, void *);
+int __must_check bch_write_inode(struct cache_set *,
+ struct bch_inode_info *);
+
+void bch_fs_exit(void);
+int bch_fs_init(void);
+
+#endif /* _BCACHE_FS_H */
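As an illustration of the inode_set_fn contract above (editor's sketch, not
part of this patch; the callback name and calling context are hypothetical), a
callback mutates the on-disk inode and returns 0 to let __bch_write_inode()
write the result back to the btree, or a negative error to abort the update:

	static int example_set_size(struct bch_inode_info *ei,
				    struct bch_inode *bi, void *p)
	{
		/* on-disk fields are little endian: */
		bi->i_size = cpu_to_le64(*(u64 *) p);
		return 0;
	}

	/* caller, with c, ei and new_size in scope and ei->update_lock held: */
	ret = __bch_write_inode(c, ei, example_set_size, &new_size);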
diff --git a/libbcache/inode.c b/libbcache/inode.c
new file mode 100644
index 0000000..d36de43
--- /dev/null
+++ b/libbcache/inode.c
@@ -0,0 +1,283 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "inode.h"
+#include "io.h"
+#include "keylist.h"
+
+ssize_t bch_inode_status(char *buf, size_t len, const struct bkey *k)
+{
+ if (k->p.offset)
+ return scnprintf(buf, len, "offset nonzero: %llu", k->p.offset);
+
+ if (k->size)
+ return scnprintf(buf, len, "size nonzero: %u", k->size);
+
+ switch (k->type) {
+ case KEY_TYPE_DELETED:
+ return scnprintf(buf, len, "deleted");
+ case KEY_TYPE_DISCARD:
+ return scnprintf(buf, len, "discarded");
+ case KEY_TYPE_ERROR:
+ return scnprintf(buf, len, "error");
+ case KEY_TYPE_COOKIE:
+ return scnprintf(buf, len, "cookie");
+
+ case BCH_INODE_FS:
+ if (bkey_val_bytes(k) != sizeof(struct bch_inode))
+ return scnprintf(buf, len, "bad size: %zu",
+ bkey_val_bytes(k));
+
+ if (k->p.inode < BLOCKDEV_INODE_MAX)
+ return scnprintf(buf, len,
+ "fs inode in blockdev range: %llu",
+ k->p.inode);
+ return 0;
+
+ case BCH_INODE_BLOCKDEV:
+ if (bkey_val_bytes(k) != sizeof(struct bch_inode_blockdev))
+ return scnprintf(buf, len, "bad size: %zu",
+ bkey_val_bytes(k));
+
+ if (k->p.inode >= BLOCKDEV_INODE_MAX)
+ return scnprintf(buf, len,
+ "blockdev inode in fs range: %llu",
+ k->p.inode);
+ return 0;
+
+ default:
+ return scnprintf(buf, len, "unknown inode type: %u", k->type);
+ }
+}
+
+static const char *bch_inode_invalid(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ if (k.k->p.offset)
+ return "nonzero offset";
+
+ switch (k.k->type) {
+ case BCH_INODE_FS: {
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode))
+ return "incorrect value size";
+
+ if (k.k->p.inode < BLOCKDEV_INODE_MAX)
+ return "fs inode in blockdev range";
+
+ if (INODE_STR_HASH_TYPE(inode.v) >= BCH_STR_HASH_NR)
+ return "invalid str hash type";
+
+ return NULL;
+ }
+ case BCH_INODE_BLOCKDEV:
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev))
+ return "incorrect value size";
+
+ if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
+ return "blockdev inode in fs range";
+
+ return NULL;
+ default:
+ return "invalid type";
+ }
+}
+
+static void bch_inode_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ struct bkey_s_c_inode inode;
+
+ switch (k.k->type) {
+ case BCH_INODE_FS:
+ inode = bkey_s_c_to_inode(k);
+
+ scnprintf(buf, size, "i_size %llu", inode.v->i_size);
+ break;
+ }
+}
+
+const struct bkey_ops bch_bkey_inode_ops = {
+ .key_invalid = bch_inode_invalid,
+ .val_to_text = bch_inode_to_text,
+};
+
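+/*
+ * Allocate a new inode number between @min and @max: scan forward from *hint
+ * for an unused slot, retrying once from @min if we started partway through
+ * and ran out of space.  On success the new inode key is inserted and *hint
+ * is set just past the allocated number.
+ */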
+int bch_inode_create(struct cache_set *c, struct bkey_i *inode,
+ u64 min, u64 max, u64 *hint)
+{
+ struct btree_iter iter;
+ bool searched_from_start = false;
+ int ret;
+
+ if (!max)
+ max = ULLONG_MAX;
+
+ if (c->opts.inodes_32bit)
+ max = min_t(u64, max, U32_MAX);
+
+ if (*hint >= max || *hint < min)
+ *hint = min;
+
+ if (*hint == min)
+ searched_from_start = true;
+again:
+ bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(*hint, 0));
+
+ while (1) {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
+
+ ret = btree_iter_err(k);
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+ return ret;
+ }
+
+ if (k.k->type < BCH_INODE_FS) {
+ inode->k.p = k.k->p;
+
+ pr_debug("inserting inode %llu (size %u)",
+ inode->k.p.inode, inode->k.u64s);
+
+ ret = bch_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&iter, inode));
+
+ if (ret == -EINTR)
+ continue;
+
+ bch_btree_iter_unlock(&iter);
+ if (!ret)
+ *hint = k.k->p.inode + 1;
+
+ return ret;
+ } else {
+ if (iter.pos.inode == max)
+ break;
+ /* slot used */
+ bch_btree_iter_advance_pos(&iter);
+ }
+ }
+ bch_btree_iter_unlock(&iter);
+
+ if (!searched_from_start) {
+ /* Retry from start */
+ *hint = min;
+ searched_from_start = true;
+ goto again;
+ }
+
+ return -ENOSPC;
+}
+
+int bch_inode_truncate(struct cache_set *c, u64 inode_nr, u64 new_size,
+ struct extent_insert_hook *hook, u64 *journal_seq)
+{
+ return bch_discard(c, POS(inode_nr, new_size), POS(inode_nr + 1, 0),
+ 0, NULL, hook, journal_seq);
+}
+
+int bch_inode_rm(struct cache_set *c, u64 inode_nr)
+{
+ struct bkey_i delete;
+ int ret;
+
+ ret = bch_inode_truncate(c, inode_nr, 0, NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = bch_btree_delete_range(c, BTREE_ID_XATTRS,
+ POS(inode_nr, 0),
+ POS(inode_nr + 1, 0),
+ 0, NULL, NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * If this was a directory, there shouldn't be any real dirents left -
+ * but there could be whiteouts (from hash collisions) that we should
+ * delete:
+ *
+	 * XXX: the dirent code could ideally delete whiteouts when they're no
+	 * longer needed
+ */
+ ret = bch_btree_delete_range(c, BTREE_ID_DIRENTS,
+ POS(inode_nr, 0),
+ POS(inode_nr + 1, 0),
+ 0, NULL, NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ bkey_init(&delete.k);
+ delete.k.p.inode = inode_nr;
+
+ return bch_btree_insert(c, BTREE_ID_INODES, &delete, NULL,
+ NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+int bch_inode_update(struct cache_set *c, struct bkey_i *inode,
+ u64 *journal_seq)
+{
+ return bch_btree_update(c, BTREE_ID_INODES, inode, journal_seq);
+}
+
+int bch_inode_find_by_inum(struct cache_set *c, u64 inode_nr,
+ struct bkey_i_inode *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = -ENOENT;
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES,
+ POS(inode_nr, 0), k) {
+ switch (k.k->type) {
+ case BCH_INODE_FS:
+ ret = 0;
+ bkey_reassemble(&inode->k_i, k);
+ break;
+ default:
+ /* hole, not found */
+ break;
+ }
+
+ break;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+int bch_cached_dev_inode_find_by_uuid(struct cache_set *c, uuid_le *uuid,
+ struct bkey_i_inode_blockdev *ret)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(0, 0), k) {
+ if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
+ break;
+
+ if (k.k->type == BCH_INODE_BLOCKDEV) {
+ struct bkey_s_c_inode_blockdev inode =
+ bkey_s_c_to_inode_blockdev(k);
+
+ pr_debug("found inode %llu: %pU (u64s %u)",
+ inode.k->p.inode, inode.v->i_uuid.b,
+ inode.k->u64s);
+
+ if (CACHED_DEV(inode.v) &&
+ !memcmp(uuid, &inode.v->i_uuid, 16)) {
+ bkey_reassemble(&ret->k_i, k);
+ bch_btree_iter_unlock(&iter);
+ return 0;
+ }
+ }
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+ bch_btree_iter_unlock(&iter);
+ return -ENOENT;
+}
diff --git a/libbcache/inode.h b/libbcache/inode.h
new file mode 100644
index 0000000..d8b28c7
--- /dev/null
+++ b/libbcache/inode.h
@@ -0,0 +1,18 @@
+#ifndef _BCACHE_INODE_H
+#define _BCACHE_INODE_H
+
+extern const struct bkey_ops bch_bkey_inode_ops;
+
+ssize_t bch_inode_status(char *, size_t, const struct bkey *);
+
+int bch_inode_create(struct cache_set *, struct bkey_i *, u64, u64, u64 *);
+int bch_inode_truncate(struct cache_set *, u64, u64,
+ struct extent_insert_hook *, u64 *);
+int bch_inode_rm(struct cache_set *, u64);
+int bch_inode_update(struct cache_set *, struct bkey_i *, u64 *);
+
+int bch_inode_find_by_inum(struct cache_set *, u64, struct bkey_i_inode *);
+int bch_cached_dev_inode_find_by_uuid(struct cache_set *, uuid_le *,
+ struct bkey_i_inode_blockdev *);
+
+#endif
diff --git a/libbcache/io.c b/libbcache/io.c
new file mode 100644
index 0000000..7219b65
--- /dev/null
+++ b/libbcache/io.c
@@ -0,0 +1,1378 @@
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "alloc.h"
+#include "bset.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "compress.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "notify.h"
+#include "stats.h"
+#include "super.h"
+
+#include <linux/blkdev.h>
+#include <linux/random.h>
+
+#include <trace/events/bcache.h>
+
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+ bio_set_flag(bio, BIO_CHAIN);
+ smp_mb__before_atomic();
+ atomic_inc(&bio->__bi_remaining);
+}
+
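+/*
+ * Submit a bio, punting to a workqueue if we're already inside
+ * generic_make_request() (current->bio_list is non-NULL), where submitting
+ * recursively could deadlock.
+ */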
+void bch_generic_make_request(struct bio *bio, struct cache_set *c)
+{
+ if (current->bio_list) {
+ spin_lock(&c->bio_submit_lock);
+ bio_list_add(&c->bio_submit_list, bio);
+ spin_unlock(&c->bio_submit_lock);
+ queue_work(bcache_io_wq, &c->bio_submit_work);
+ } else {
+ generic_make_request(bio);
+ }
+}
+
+void bch_bio_submit_work(struct work_struct *work)
+{
+ struct cache_set *c = container_of(work, struct cache_set,
+ bio_submit_work);
+ struct bio_list bl;
+ struct bio *bio;
+
+ spin_lock(&c->bio_submit_lock);
+ bl = c->bio_submit_list;
+ bio_list_init(&c->bio_submit_list);
+ spin_unlock(&c->bio_submit_lock);
+
+ while ((bio = bio_list_pop(&bl)))
+ generic_make_request(bio);
+}
+
+/* Allocate, free from mempool: */
+
+void bch_bio_free_pages_pool(struct cache_set *c, struct bio *bio)
+{
+ struct bio_vec *bv;
+ unsigned i;
+
+ bio_for_each_segment_all(bv, bio, i)
+ if (bv->bv_page != ZERO_PAGE(0))
+ mempool_free(bv->bv_page, &c->bio_bounce_pages);
+ bio->bi_vcnt = 0;
+}
+
+static void bch_bio_alloc_page_pool(struct cache_set *c, struct bio *bio,
+ bool *using_mempool)
+{
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
+
+ if (likely(!*using_mempool)) {
+ bv->bv_page = alloc_page(GFP_NOIO);
+ if (unlikely(!bv->bv_page)) {
+ mutex_lock(&c->bio_bounce_pages_lock);
+ *using_mempool = true;
+ goto pool_alloc;
+
+ }
+ } else {
+pool_alloc:
+ bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
+ }
+
+ bv->bv_len = PAGE_SIZE;
+ bv->bv_offset = 0;
+}
+
+void bch_bio_alloc_pages_pool(struct cache_set *c, struct bio *bio,
+ size_t bytes)
+{
+ bool using_mempool = false;
+
+ bio->bi_iter.bi_size = bytes;
+
+ while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
+ bch_bio_alloc_page_pool(c, bio, &using_mempool);
+
+ if (using_mempool)
+ mutex_unlock(&c->bio_bounce_pages_lock);
+}
+
+/* Bios with headers */
+
+static void bch_submit_wbio(struct cache_set *c, struct bch_write_bio *wbio,
+ struct cache *ca, const struct bch_extent_ptr *ptr,
+ bool punt)
+{
+ wbio->ca = ca;
+ wbio->submit_time_us = local_clock_us();
+ wbio->bio.bi_iter.bi_sector = ptr->offset;
+ wbio->bio.bi_bdev = ca ? ca->disk_sb.bdev : NULL;
+
+ if (!ca)
+ bcache_io_error(c, &wbio->bio, "device has been removed");
+ else if (punt)
+ bch_generic_make_request(&wbio->bio, c);
+ else
+ generic_make_request(&wbio->bio);
+}
+
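+/*
+ * Submit a write to each device @k points to: the original wbio goes to the
+ * last pointer, and is cloned for every additional replica; the clones
+ * complete into the original via its remaining count.
+ */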
+void bch_submit_wbio_replicas(struct bch_write_bio *wbio, struct cache_set *c,
+ const struct bkey_i *k, bool punt)
+{
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct bch_write_bio *n;
+ struct cache *ca;
+
+ wbio->split = false;
+ wbio->c = c;
+
+ extent_for_each_ptr(e, ptr) {
+ rcu_read_lock();
+ ca = PTR_CACHE(c, ptr);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ if (!ca) {
+ bch_submit_wbio(c, wbio, ca, ptr, punt);
+ break;
+ }
+
+ if (ptr + 1 < &extent_entry_last(e)->ptr) {
+ n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
+ &ca->replica_set));
+
+ n->bio.bi_end_io = wbio->bio.bi_end_io;
+ n->bio.bi_private = wbio->bio.bi_private;
+ n->c = c;
+ n->orig = &wbio->bio;
+ n->bounce = false;
+ n->split = true;
+ n->put_bio = true;
+ n->bio.bi_opf = wbio->bio.bi_opf;
+ __bio_inc_remaining(n->orig);
+ } else {
+ n = wbio;
+ }
+
+ if (!journal_flushes_device(ca))
+ n->bio.bi_opf |= REQ_FUA;
+
+ bch_submit_wbio(c, n, ca, ptr, punt);
+ }
+}
+
+/* IO errors */
+
+/* Writes */
+
+static struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+ return op->alloc_reserve == RESERVE_MOVINGGC
+ ? op->c->copygc_wq
+ : op->c->wq;
+}
+
+static void __bch_write(struct closure *);
+
+static void bch_write_done(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+
+ BUG_ON(!(op->flags & BCH_WRITE_DONE));
+
+ if (!op->error && (op->flags & BCH_WRITE_FLUSH))
+ op->error = bch_journal_error(&op->c->journal);
+
+ bch_disk_reservation_put(op->c, &op->res);
+ percpu_ref_put(&op->c->writes);
+ bch_keylist_free(&op->insert_keys, op->inline_keys);
+ closure_return(cl);
+}
+
+static u64 keylist_sectors(struct keylist *keys)
+{
+ struct bkey_i *k;
+ u64 ret = 0;
+
+ for_each_keylist_key(keys, k)
+ ret += k->k.size;
+
+ return ret;
+}
+
+static int bch_write_index_default(struct bch_write_op *op)
+{
+ struct keylist *keys = &op->insert_keys;
+ struct btree_iter iter;
+ int ret;
+
+ bch_btree_iter_init_intent(&iter, op->c, BTREE_ID_EXTENTS,
+ bkey_start_pos(&bch_keylist_front(keys)->k));
+
+ ret = bch_btree_insert_list_at(&iter, keys, &op->res,
+ NULL, op_journal_seq(op),
+ BTREE_INSERT_NOFAIL);
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+/**
+ * bch_write_index - after a write, update index to point to new data
+ */
+static void bch_write_index(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct cache_set *c = op->c;
+ struct keylist *keys = &op->insert_keys;
+ unsigned i;
+
+ op->flags |= BCH_WRITE_LOOPED;
+
+ if (!bch_keylist_empty(keys)) {
+ u64 sectors_start = keylist_sectors(keys);
+ int ret = op->index_update_fn(op);
+
+ BUG_ON(keylist_sectors(keys) && !ret);
+
+ op->written += sectors_start - keylist_sectors(keys);
+
+ if (ret) {
+ __bcache_io_error(c, "btree IO error %i", ret);
+ op->error = ret;
+ }
+ }
+
+ for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++)
+ if (op->open_buckets[i]) {
+ bch_open_bucket_put(c,
+ c->open_buckets +
+ op->open_buckets[i]);
+ op->open_buckets[i] = 0;
+ }
+
+ if (!(op->flags & BCH_WRITE_DONE))
+ continue_at(cl, __bch_write, op->io_wq);
+
+ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
+ bch_journal_flush_seq_async(&c->journal,
+ *op_journal_seq(op),
+ cl);
+ continue_at(cl, bch_write_done, index_update_wq(op));
+ } else {
+ continue_at_nobarrier(cl, bch_write_done, NULL);
+ }
+}
+
+/**
+ * bch_write_discard - discard range of keys
+ *
+ * Used to implement discard, and to handle when a writethrough write hits
+ * a write error on the cache device.
+ */
+static void bch_write_discard(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bio *bio = &op->bio->bio;
+ struct bpos end = op->pos;
+
+ end.offset += bio_sectors(bio);
+
+ op->error = bch_discard(op->c, op->pos, end, op->version,
+ &op->res, NULL, NULL);
+}
+
+/*
+ * Convert extents to be inserted to discards after an error:
+ */
+static void bch_write_io_error(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+
+ if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
+ struct bkey_i *src = bch_keylist_front(&op->insert_keys);
+ struct bkey_i *dst = bch_keylist_front(&op->insert_keys);
+
+ /*
+ * Our data write just errored, which means we've got a bunch
+ * of keys to insert that point to data that wasn't
+ * successfully written.
+ *
+ * We don't have to insert those keys but we still have to
+ * invalidate that region of the cache - so, if we just strip
+ * off all the pointers from the keys we'll accomplish just
+ * that.
+ */
+
+ while (src != op->insert_keys.top) {
+ struct bkey_i *n = bkey_next(src);
+
+ set_bkey_val_u64s(&src->k, 0);
+ src->k.type = KEY_TYPE_DISCARD;
+ bkey_copy(dst, src);
+
+ dst = bkey_next(dst);
+ src = n;
+ }
+
+ op->insert_keys.top = dst;
+ op->flags |= BCH_WRITE_DISCARD;
+ } else {
+ /* TODO: We could try to recover from this. */
+ while (!bch_keylist_empty(&op->insert_keys))
+ bch_keylist_pop_front(&op->insert_keys);
+
+ op->error = -EIO;
+ op->flags |= BCH_WRITE_DONE;
+ }
+
+ bch_write_index(cl);
+}
+
+static void bch_write_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct cache_set *c = wbio->c;
+ struct bio *orig = wbio->orig;
+ struct cache *ca = wbio->ca;
+
+ if (cache_nonfatal_io_err_on(bio->bi_error, ca,
+ "data write"))
+ set_closure_fn(cl, bch_write_io_error, index_update_wq(op));
+
+ bch_account_io_completion_time(ca, wbio->submit_time_us,
+ REQ_OP_WRITE);
+ if (ca)
+ percpu_ref_put(&ca->ref);
+
+ if (bio->bi_error && orig)
+ orig->bi_error = bio->bi_error;
+
+ if (wbio->bounce)
+ bch_bio_free_pages_pool(c, bio);
+
+ if (wbio->put_bio)
+ bio_put(bio);
+
+ if (orig)
+ bio_endio(orig);
+ else
+ closure_put(cl);
+}
+
+static void init_append_extent(struct bch_write_op *op,
+ unsigned compressed_size,
+ unsigned uncompressed_size,
+ unsigned compression_type,
+ u64 csum, unsigned csum_type,
+ struct open_bucket *ob)
+{
+ struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
+
+ op->pos.offset += uncompressed_size;
+ e->k.p = op->pos;
+ e->k.size = uncompressed_size;
+
+ bch_extent_crc_append(e, compressed_size,
+ uncompressed_size,
+ compression_type,
+ csum, csum_type);
+
+ bch_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas,
+ ob, compressed_size);
+
+ bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED));
+ bch_keylist_push(&op->insert_keys);
+}
+
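+/*
+ * Write out as much of @orig as fits in the open bucket @ob, appending the
+ * resulting extent key(s) to op->insert_keys.  Returns > 0 if some of @orig
+ * is still left to write, 0 if this was the last extent, or a negative error.
+ */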
+static int bch_write_extent(struct bch_write_op *op,
+ struct open_bucket *ob,
+ struct bio *orig)
+{
+ struct cache_set *c = op->c;
+ struct bio *bio;
+ struct bch_write_bio *wbio;
+ unsigned key_to_write_offset = op->insert_keys.top_p -
+ op->insert_keys.keys_p;
+ struct bkey_i *key_to_write;
+ unsigned csum_type = c->opts.data_checksum;
+ unsigned compression_type = op->compression_type;
+ int ret;
+
+ /* don't refetch csum type/compression type */
+ barrier();
+
+ /* Need to decompress data? */
+ if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
+ (op->crc.uncompressed_size != op->size ||
+ op->crc.compressed_size > ob->sectors_free)) {
+ int ret;
+
+ ret = bch_bio_uncompress_inplace(c, orig, op->size, op->crc);
+ if (ret)
+ return ret;
+
+ op->flags &= ~BCH_WRITE_DATA_COMPRESSED;
+ }
+
+ if (op->flags & BCH_WRITE_DATA_COMPRESSED) {
+ init_append_extent(op,
+ op->crc.compressed_size,
+ op->crc.uncompressed_size,
+ op->crc.compression_type,
+ op->crc.csum,
+ op->crc.csum_type,
+ ob);
+
+ bio = orig;
+ wbio = to_wbio(bio);
+ wbio->orig = NULL;
+ wbio->bounce = false;
+ wbio->put_bio = false;
+ ret = 0;
+ } else if (csum_type != BCH_CSUM_NONE ||
+ compression_type != BCH_COMPRESSION_NONE) {
+ /* all units here in bytes */
+ unsigned total_output = 0, output_available =
+ min(ob->sectors_free << 9, orig->bi_iter.bi_size);
+ u64 csum;
+
+ bio = bio_alloc_bioset(GFP_NOIO,
+ DIV_ROUND_UP(output_available, PAGE_SIZE),
+ &c->bio_write);
+ /*
+ * XXX: can't use mempool for more than
+ * BCH_COMPRESSED_EXTENT_MAX worth of pages
+ */
+ bch_bio_alloc_pages_pool(c, bio, output_available);
+
+ /* copy WRITE_SYNC flag */
+ bio->bi_opf = orig->bi_opf;
+ wbio = to_wbio(bio);
+ wbio->orig = NULL;
+ wbio->bounce = true;
+ wbio->put_bio = true;
+
+ do {
+ unsigned fragment_compression_type = compression_type;
+ size_t dst_len, src_len;
+
+ bch_bio_compress(c, bio, &dst_len,
+ orig, &src_len,
+ &fragment_compression_type);
+
+ BUG_ON(!dst_len || dst_len > bio->bi_iter.bi_size);
+ BUG_ON(!src_len || src_len > orig->bi_iter.bi_size);
+ BUG_ON(dst_len & (block_bytes(c) - 1));
+ BUG_ON(src_len & (block_bytes(c) - 1));
+
+ swap(bio->bi_iter.bi_size, dst_len);
+ csum = bch_checksum_bio(bio, csum_type);
+ swap(bio->bi_iter.bi_size, dst_len);
+
+ init_append_extent(op,
+ dst_len >> 9, src_len >> 9,
+ fragment_compression_type,
+ csum, csum_type, ob);
+
+ total_output += dst_len;
+ bio_advance(bio, dst_len);
+ bio_advance(orig, src_len);
+ } while (bio->bi_iter.bi_size &&
+ orig->bi_iter.bi_size &&
+ !bch_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX));
+
+ BUG_ON(total_output > output_available);
+
+ memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
+ bio->bi_iter.bi_size = total_output;
+
+ /*
+ * Free unneeded pages after compressing:
+ */
+ while (bio->bi_vcnt * PAGE_SIZE >
+ round_up(bio->bi_iter.bi_size, PAGE_SIZE))
+ mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page,
+ &c->bio_bounce_pages);
+
+ ret = orig->bi_iter.bi_size != 0;
+ } else {
+ bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO,
+ &c->bio_write);
+
+ wbio = to_wbio(bio);
+ wbio->orig = NULL;
+ wbio->bounce = false;
+ wbio->put_bio = bio != orig;
+
+ init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
+ compression_type, 0, csum_type, ob);
+
+ ret = bio != orig;
+ }
+
+ bio->bi_end_io = bch_write_endio;
+ bio->bi_private = &op->cl;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ closure_get(bio->bi_private);
+
+ /* might have done a realloc... */
+
+ key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
+
+ if (!(op->flags & BCH_WRITE_CACHED))
+ bch_check_mark_super(c, key_to_write, false);
+
+#ifndef CONFIG_BCACHE_NO_IO
+ bch_submit_wbio_replicas(to_wbio(bio), c, key_to_write, false);
+#else
+ to_wbio(bio)->ca = NULL;
+ bio_endio(bio);
+#endif
+ return ret;
+}
+
+static void __bch_write(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct cache_set *c = op->c;
+ struct bio *bio = &op->bio->bio;
+ unsigned open_bucket_nr = 0;
+ struct open_bucket *b;
+ int ret;
+
+ memset(op->open_buckets, 0, sizeof(op->open_buckets));
+
+ if (op->flags & BCH_WRITE_DISCARD) {
+ op->flags |= BCH_WRITE_DONE;
+ bch_write_discard(cl);
+ bio_put(bio);
+ continue_at(cl, bch_write_done, index_update_wq(op));
+ }
+
+ /*
+ * Journal writes are marked REQ_PREFLUSH; if the original write was a
+ * flush, it'll wait on the journal write.
+ */
+ bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
+
+ do {
+ EBUG_ON(bio->bi_iter.bi_sector != op->pos.offset);
+ EBUG_ON(!bio_sectors(bio));
+
+ if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
+ continue_at(cl, bch_write_index, index_update_wq(op));
+
+		/* for the device pointers and 1 for the checksum */
+ if (bch_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX))
+ continue_at(cl, bch_write_index, index_update_wq(op));
+
+ b = bch_alloc_sectors_start(c, op->wp, op->nr_replicas,
+ op->alloc_reserve,
+ (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
+ EBUG_ON(!b);
+
+ if (unlikely(IS_ERR(b))) {
+ if (unlikely(PTR_ERR(b) != -EAGAIN)) {
+ ret = PTR_ERR(b);
+ goto err;
+ }
+
+ /*
+ * If we already have some keys, must insert them first
+ * before allocating another open bucket. We only hit
+ * this case if open_bucket_nr > 1.
+ */
+ if (!bch_keylist_empty(&op->insert_keys))
+ continue_at(cl, bch_write_index,
+ index_update_wq(op));
+
+ /*
+ * If we've looped, we're running out of a workqueue -
+ * not the bch_write() caller's context - and we don't
+ * want to block the workqueue:
+ */
+ if (op->flags & BCH_WRITE_LOOPED)
+ continue_at(cl, __bch_write, op->io_wq);
+
+ /*
+ * Otherwise, we do want to block the caller on alloc
+ * failure instead of letting it queue up more and more
+ * writes:
+ * XXX: this technically needs a try_to_freeze() -
+ * except that that's not safe because caller may have
+ * issued other IO... hmm..
+ */
+ closure_sync(cl);
+ continue;
+ }
+
+ BUG_ON(b - c->open_buckets == 0 ||
+ b - c->open_buckets > U8_MAX);
+ op->open_buckets[open_bucket_nr++] = b - c->open_buckets;
+
+ ret = bch_write_extent(op, b, bio);
+
+ bch_alloc_sectors_done(c, op->wp, b);
+
+ if (ret < 0)
+ goto err;
+ } while (ret);
+
+ op->flags |= BCH_WRITE_DONE;
+ continue_at(cl, bch_write_index, index_update_wq(op));
+err:
+ if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
+ /*
+ * If we were writing cached data, not doing the write is fine
+ * so long as we discard whatever would have been overwritten -
+ * then it's equivalent to doing the write and immediately
+ * reclaiming it.
+ */
+
+ bch_write_discard(cl);
+ } else {
+ /*
+ * Right now we can only error here if we went RO - the
+ * allocation failed, but we already checked for -ENOSPC when we
+ * got our reservation.
+ *
+ * XXX capacity might have changed, but we don't check for that
+ * yet:
+ */
+ op->error = ret;
+ }
+
+ op->flags |= BCH_WRITE_DONE;
+
+ /*
+ * No reason not to insert keys for whatever data was successfully
+ * written (especially for a cmpxchg operation that's moving data
+ * around)
+ */
+ continue_at(cl, !bch_keylist_empty(&op->insert_keys)
+ ? bch_write_index
+ : bch_write_done, index_update_wq(op));
+}
+
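+/*
+ * Timer callback for foreground write throttling: release (closure_put) the
+ * delayed writes whose delay has expired, re-arming the timer for the first
+ * one that hasn't.
+ */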
+void bch_wake_delayed_writes(unsigned long data)
+{
+ struct cache_set *c = (void *) data;
+ struct bch_write_op *op;
+ unsigned long flags;
+
+ spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
+
+ while ((op = c->write_wait_head)) {
+ if (!test_bit(CACHE_SET_RO, &c->flags) &&
+ !test_bit(CACHE_SET_STOPPING, &c->flags) &&
+ time_after(op->expires, jiffies)) {
+ mod_timer(&c->foreground_write_wakeup, op->expires);
+ break;
+ }
+
+ c->write_wait_head = op->next;
+ if (!c->write_wait_head)
+ c->write_wait_tail = NULL;
+
+ closure_put(&op->cl);
+ }
+
+ spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
+}
+
+/**
+ * bch_write - handle a write to a cache device or flash only volume
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data won't fit in a single open bucket, there will be multiple keys);
+ * after the data is written it calls bch_journal, and after the keys have been
+ * added to the next journal write they're inserted into the btree.
+ *
+ * It inserts the data in op->bio; bi_sector is used for the key offset, and
+ * op->inode is used for the key inode.
+ *
+ * If op->discard is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
+ */
+void bch_write(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bio *bio = &op->bio->bio;
+ struct cache_set *c = op->c;
+ u64 inode = op->pos.inode;
+
+ trace_bcache_write(c, inode, bio,
+ !(op->flags & BCH_WRITE_CACHED),
+ op->flags & BCH_WRITE_DISCARD);
+
+ if (!percpu_ref_tryget(&c->writes)) {
+ __bcache_io_error(c, "read only");
+ op->error = -EROFS;
+ bch_disk_reservation_put(c, &op->res);
+ closure_return(cl);
+ }
+
+ if (!(op->flags & BCH_WRITE_DISCARD))
+ bch_increment_clock(c, bio_sectors(bio), WRITE);
+
+ if (!(op->flags & BCH_WRITE_DISCARD))
+ bch_mark_foreground_write(c, bio_sectors(bio));
+ else
+ bch_mark_discard(c, bio_sectors(bio));
+
+	/* Don't bother throttling if the rate limit is >= 1 GB/sec */
+
+ if (c->foreground_write_ratelimit_enabled &&
+ c->foreground_write_pd.rate.rate < (1 << 30) &&
+ !(op->flags & BCH_WRITE_DISCARD) && op->wp->throttle) {
+ unsigned long flags;
+ u64 delay;
+
+ spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
+ bch_ratelimit_increment(&c->foreground_write_pd.rate,
+ bio->bi_iter.bi_size);
+
+ delay = bch_ratelimit_delay(&c->foreground_write_pd.rate);
+
+ if (delay >= HZ / 100) {
+ trace_bcache_write_throttle(c, inode, bio, delay);
+
+ closure_get(&op->cl); /* list takes a ref */
+
+ op->expires = jiffies + delay;
+ op->next = NULL;
+
+ if (c->write_wait_tail)
+ c->write_wait_tail->next = op;
+ else
+ c->write_wait_head = op;
+ c->write_wait_tail = op;
+
+ if (!timer_pending(&c->foreground_write_wakeup))
+ mod_timer(&c->foreground_write_wakeup,
+ op->expires);
+
+ spin_unlock_irqrestore(&c->foreground_write_pd_lock,
+ flags);
+ continue_at(cl, __bch_write, index_update_wq(op));
+ }
+
+ spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
+ }
+
+ continue_at_nobarrier(cl, __bch_write, NULL);
+}
+
+void bch_write_op_init(struct bch_write_op *op, struct cache_set *c,
+ struct bch_write_bio *bio, struct disk_reservation res,
+ struct write_point *wp, struct bpos pos,
+ u64 *journal_seq, unsigned flags)
+{
+ op->c = c;
+ op->io_wq = index_update_wq(op);
+ op->bio = bio;
+ op->written = 0;
+ op->error = 0;
+ op->flags = flags;
+ op->compression_type = c->opts.compression;
+ op->nr_replicas = res.nr_replicas;
+ op->alloc_reserve = RESERVE_NONE;
+ op->pos = pos;
+ op->version = 0;
+ op->res = res;
+ op->wp = wp;
+
+ if (journal_seq) {
+ op->journal_seq_p = journal_seq;
+ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
+ } else {
+ op->journal_seq = 0;
+ }
+
+ op->index_update_fn = bch_write_index_default;
+
+ bch_keylist_init(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys));
+
+ if (version_stress_test(c))
+ get_random_bytes(&op->version, sizeof(op->version));
+}
+
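A minimal caller-side sketch of the write path described above (editor's
illustration, not part of this patch: the function and variable names are
hypothetical, the disk reservation is assumed to have been taken elsewhere,
and op/wbio would be embedded in a longer-lived structure by a real caller):

	static void example_submit_write(struct cache_set *c,
					 struct bch_write_op *op,
					 struct bch_write_bio *wbio,
					 struct disk_reservation res,
					 u64 inode, struct closure *parent)
	{
		/* key inode comes from @inode, key offset from bi_sector: */
		bch_write_op_init(op, c, wbio, res,
				  foreground_write_point(c, (unsigned long) inode),
				  POS(inode, wbio->bio.bi_iter.bi_sector),
				  NULL, BCH_WRITE_FLUSH);

		/* run bch_write() as a closure; @parent is released on completion: */
		closure_call(&op->cl, bch_write, c->wq, parent);
	}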
+/* Discard */
+
+/* bch_discard - discard a range of keys from start to end.
+ * @c		cache set
+ * @start	position to start discarding at
+ * @end		position to stop discarding at (exclusive)
+ * @version version of discard (0ULL if none)
+ *
+ * Returns:
+ * 0 on success
+ * <0 on error
+ *
+ * XXX: this needs to be refactored with inode_truncate, or more
+ * appropriately inode_truncate should call this
+ */
+int bch_discard(struct cache_set *c, struct bpos start,
+ struct bpos end, u64 version,
+ struct disk_reservation *disk_res,
+ struct extent_insert_hook *hook,
+ u64 *journal_seq)
+{
+ return bch_btree_delete_range(c, BTREE_ID_EXTENTS, start, end, version,
+ disk_res, hook, journal_seq);
+}
+
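For example (illustration only, mirroring the bch_inode_truncate() call in
inode.c), discarding everything from sector @from to the end of inode
@inode_nr, with no insert hook and no journal sequence tracking:

	ret = bch_discard(c, POS(inode_nr, from), POS(inode_nr + 1, 0),
			  0, NULL, NULL, NULL);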
+/* Cache promotion on read */
+
+struct cache_promote_op {
+ struct closure cl;
+ struct migrate_write write;
+ struct bio_vec bi_inline_vecs[0]; /* must be last */
+};
+
+/* Read */
+
+static int bio_checksum_uncompress(struct cache_set *c,
+ struct bch_read_bio *rbio)
+{
+ struct bio *src = &rbio->bio;
+ struct bio *dst = &bch_rbio_parent(rbio)->bio;
+ struct bvec_iter dst_iter = rbio->parent_iter;
+ u64 csum;
+ int ret = 0;
+
+ /*
+ * reset iterator for checksumming and copying bounced data: here we've
+	 * set rbio->crc.compressed_size to the amount of data we actually read,
+ * which was not necessarily the full extent if we were only bouncing
+ * in order to promote
+ */
+ if (rbio->bounce) {
+ src->bi_iter.bi_size = rbio->crc.compressed_size << 9;
+ src->bi_iter.bi_idx = 0;
+ src->bi_iter.bi_bvec_done = 0;
+ } else {
+ src->bi_iter = rbio->parent_iter;
+ }
+
+ csum = bch_checksum_bio(src, rbio->crc.csum_type);
+ if (cache_nonfatal_io_err_on(rbio->crc.csum != csum, rbio->ca,
+ "data checksum error, inode %llu offset %llu: expected %0llx got %0llx (type %u)",
+ rbio->inode, (u64) rbio->parent_iter.bi_sector << 9,
+ rbio->crc.csum, csum, rbio->crc.csum_type))
+ ret = -EIO;
+
+ /*
+ * If there was a checksum error, still copy the data back - unless it
+ * was compressed, we don't want to decompress bad data:
+ */
+ if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) {
+ if (!ret) {
+ ret = bch_bio_uncompress(c, src, dst,
+ dst_iter, rbio->crc);
+ if (ret)
+ __bcache_io_error(c, "decompression error");
+ }
+ } else if (rbio->bounce) {
+ bio_advance(src, rbio->crc.offset << 9);
+ bio_copy_data_iter(dst, dst_iter,
+ src, src->bi_iter);
+ }
+
+ return ret;
+}
+
+static void bch_rbio_free(struct cache_set *c, struct bch_read_bio *rbio)
+{
+ struct bio *bio = &rbio->bio;
+
+ BUG_ON(rbio->ca);
+ BUG_ON(!rbio->split);
+
+ if (rbio->promote)
+ kfree(rbio->promote);
+ if (rbio->bounce)
+ bch_bio_free_pages_pool(c, bio);
+
+ bio_put(bio);
+}
+
+static void bch_rbio_done(struct cache_set *c, struct bch_read_bio *rbio)
+{
+ struct bio *orig = &bch_rbio_parent(rbio)->bio;
+
+ percpu_ref_put(&rbio->ca->ref);
+ rbio->ca = NULL;
+
+ if (rbio->split) {
+ if (rbio->bio.bi_error)
+ orig->bi_error = rbio->bio.bi_error;
+
+ bio_endio(orig);
+ bch_rbio_free(c, rbio);
+ } else {
+ if (rbio->promote)
+ kfree(rbio->promote);
+
+ orig->bi_end_io = rbio->orig_bi_end_io;
+ bio_endio_nodec(orig);
+ }
+}
+
+/*
+ * Decide if we want to retry the read: if so, requeue the rbio on the cache
+ * set's retry list; otherwise propagate the error to the parent bio
+ */
+static void bch_read_error_maybe_retry(struct cache_set *c,
+ struct bch_read_bio *rbio,
+ int error)
+{
+ unsigned long flags;
+
+ if ((error == -EINTR) &&
+ (rbio->flags & BCH_READ_RETRY_IF_STALE)) {
+ atomic_long_inc(&c->cache_read_races);
+ goto retry;
+ }
+
+ if (error == -EIO) {
+ /* io error - do we have another replica? */
+ }
+
+ bch_rbio_parent(rbio)->bio.bi_error = error;
+ bch_rbio_done(c, rbio);
+ return;
+retry:
+ percpu_ref_put(&rbio->ca->ref);
+ rbio->ca = NULL;
+
+ spin_lock_irqsave(&c->read_retry_lock, flags);
+ bio_list_add(&c->read_retry_list, &rbio->bio);
+ spin_unlock_irqrestore(&c->read_retry_lock, flags);
+ queue_work(c->wq, &c->read_retry_work);
+}
+
+static void cache_promote_done(struct closure *cl)
+{
+ struct cache_promote_op *op =
+ container_of(cl, struct cache_promote_op, cl);
+
+ bch_bio_free_pages_pool(op->write.op.c, &op->write.wbio.bio);
+ kfree(op);
+}
+
+/* Inner part that may run in process context */
+static void __bch_read_endio(struct cache_set *c, struct bch_read_bio *rbio)
+{
+ int ret;
+
+ ret = bio_checksum_uncompress(c, rbio);
+ if (ret) {
+ bch_read_error_maybe_retry(c, rbio, ret);
+ return;
+ }
+
+ if (rbio->promote &&
+ !test_bit(CACHE_SET_RO, &c->flags) &&
+ !test_bit(CACHE_SET_STOPPING, &c->flags)) {
+ struct cache_promote_op *promote = rbio->promote;
+ struct closure *cl = &promote->cl;
+
+ BUG_ON(!rbio->split || !rbio->bounce);
+
+ /* we now own pages: */
+ swap(promote->write.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt);
+ rbio->promote = NULL;
+
+ bch_rbio_done(c, rbio);
+
+ closure_init(cl, &c->cl);
+ closure_call(&promote->write.op.cl, bch_write, c->wq, cl);
+ closure_return_with_destructor(cl, cache_promote_done);
+ } else {
+ bch_rbio_done(c, rbio);
+ }
+}
+
+void bch_bio_decompress_work(struct work_struct *work)
+{
+ struct bio_decompress_worker *d =
+ container_of(work, struct bio_decompress_worker, work);
+ struct llist_node *list, *next;
+ struct bch_read_bio *rbio;
+
+ while ((list = llist_del_all(&d->bio_list)))
+ for (list = llist_reverse_order(list);
+ list;
+ list = next) {
+ next = llist_next(list);
+ rbio = container_of(list, struct bch_read_bio, list);
+
+ __bch_read_endio(d->c, rbio);
+ }
+}
+
+static void bch_read_endio(struct bio *bio)
+{
+ struct bch_read_bio *rbio =
+ container_of(bio, struct bch_read_bio, bio);
+ struct cache_set *c = rbio->ca->set;
+ int stale = ((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+ ptr_stale(rbio->ca, &rbio->ptr) ? -EINTR : 0;
+ int error = bio->bi_error ?: stale;
+
+ bch_account_io_completion_time(rbio->ca, rbio->submit_time_us, REQ_OP_READ);
+
+ cache_nonfatal_io_err_on(bio->bi_error, rbio->ca, "data read");
+
+ if (error) {
+ bch_read_error_maybe_retry(c, rbio, error);
+ return;
+ }
+
+ if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) {
+ struct bio_decompress_worker *d;
+
+ preempt_disable();
+ d = this_cpu_ptr(c->bio_decompress_worker);
+ llist_add(&rbio->list, &d->bio_list);
+ queue_work(system_unbound_wq, &d->work);
+ preempt_enable();
+ } else {
+ __bch_read_endio(c, rbio);
+ }
+}
+
+void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
+ struct bvec_iter iter, struct bkey_s_c k,
+ struct extent_pick_ptr *pick, unsigned flags)
+{
+ struct bch_read_bio *rbio;
+ struct cache_promote_op *promote_op = NULL;
+ unsigned skip = iter.bi_sector - bkey_start_offset(k.k);
+ bool bounce = false, split, read_full = false;
+
+ EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
+ k.k->p.offset < bvec_iter_end_sector(iter));
+
+ /* only promote if we're not reading from the fastest tier: */
+
+ /*
+ * XXX: multiple promotes can race with each other, wastefully. Keep a
+ * list of outstanding promotes?
+ */
+ if ((flags & BCH_READ_PROMOTE) && pick->ca->mi.tier) {
+ /*
+ * biovec needs to be big enough to hold decompressed data, if
+ * the bch_write_extent() has to decompress/recompress it:
+ */
+ unsigned sectors =
+ max_t(unsigned, k.k->size,
+ pick->crc.uncompressed_size);
+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+
+ promote_op = kmalloc(sizeof(*promote_op) +
+ sizeof(struct bio_vec) * pages, GFP_NOIO);
+ if (promote_op) {
+ struct bio *promote_bio = &promote_op->write.wbio.bio;
+
+ bio_init(promote_bio);
+ promote_bio->bi_max_vecs = pages;
+ promote_bio->bi_io_vec = promote_bio->bi_inline_vecs;
+ bounce = true;
+ /* could also set read_full */
+ }
+ }
+
+ /*
+ * note: if compression_type and crc_type both == none, then
+ * compressed/uncompressed size is zero
+ */
+ if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
+ (pick->crc.csum_type != BCH_CSUM_NONE &&
+ (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
+ (flags & BCH_READ_FORCE_BOUNCE)))) {
+ read_full = true;
+ bounce = true;
+ }
+
+ if (bounce) {
+ unsigned sectors = read_full
+ ? (pick->crc.compressed_size ?: k.k->size)
+ : bvec_iter_sectors(iter);
+
+ rbio = container_of(bio_alloc_bioset(GFP_NOIO,
+ DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ &c->bio_read_split),
+ struct bch_read_bio, bio);
+
+ bch_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
+ split = true;
+ } else if (!(flags & BCH_READ_MAY_REUSE_BIO) ||
+ !(flags & BCH_READ_IS_LAST)) {
+ /*
+		 * Have to clone if there were any splits, due to error
+		 * reporting issues: if a split errored and retrying didn't
+		 * work, when it reports the error to its parent (us) we don't
+		 * know whether the error was from our bio (in which case we
+		 * should retry) or from the whole bio (in which case we don't
+		 * want to retry and lose the error)
+ */
+ rbio = container_of(bio_clone_fast(&orig->bio,
+ GFP_NOIO, &c->bio_read_split),
+ struct bch_read_bio, bio);
+ rbio->bio.bi_iter = iter;
+ split = true;
+ } else {
+ rbio = orig;
+ rbio->bio.bi_iter = iter;
+ split = false;
+ BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+ }
+
+ if (!(flags & BCH_READ_IS_LAST))
+ __bio_inc_remaining(&orig->bio);
+
+ if (split)
+ rbio->parent = orig;
+ else
+ rbio->orig_bi_end_io = orig->bio.bi_end_io;
+ rbio->parent_iter = iter;
+
+ rbio->inode = k.k->p.inode;
+ rbio->flags = flags;
+ rbio->bounce = bounce;
+ rbio->split = split;
+ rbio->crc = pick->crc;
+ /*
+ * crc.compressed_size will be 0 if there wasn't any checksum
+	 * information; also, we need to stash the original size of the bio if we
+ * bounced (which isn't necessarily the original key size, if we bounced
+ * only for promoting)
+ */
+ rbio->crc.compressed_size = bio_sectors(&rbio->bio);
+ rbio->ptr = pick->ptr;
+ rbio->ca = pick->ca;
+ rbio->promote = promote_op;
+
+ rbio->bio.bi_bdev = pick->ca->disk_sb.bdev;
+ rbio->bio.bi_opf = orig->bio.bi_opf;
+ rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
+ rbio->bio.bi_end_io = bch_read_endio;
+
+ if (promote_op) {
+ struct bio *promote_bio = &promote_op->write.wbio.bio;
+
+ promote_bio->bi_iter = rbio->bio.bi_iter;
+ memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec,
+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+
+ bch_migrate_write_init(c, &promote_op->write,
+ &c->promote_write_point,
+ k, NULL,
+ BCH_WRITE_ALLOC_NOWAIT);
+ promote_op->write.promote = true;
+
+ if (rbio->crc.compression_type) {
+ promote_op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED;
+ promote_op->write.op.crc = rbio->crc;
+ promote_op->write.op.size = k.k->size;
+ } else if (read_full) {
+ /*
+ * Adjust bio to correspond to _live_ portion of @k -
+ * which might be less than what we're actually reading:
+ */
+ bio_advance(promote_bio, rbio->crc.offset << 9);
+ BUG_ON(bio_sectors(promote_bio) < k.k->size);
+ promote_bio->bi_iter.bi_size = k.k->size << 9;
+ } else {
+ /*
+ * Set insert pos to correspond to what we're actually
+ * reading:
+ */
+ promote_op->write.op.pos.offset = iter.bi_sector;
+ }
+
+ promote_bio->bi_iter.bi_sector =
+ promote_op->write.op.pos.offset;
+ }
+
+	/* must come _after_ the promote code has looked at rbio->crc.offset */
+ if (read_full)
+ rbio->crc.offset += skip;
+ else
+ rbio->bio.bi_iter.bi_sector += skip;
+
+ rbio->submit_time_us = local_clock_us();
+
+#ifndef CONFIG_BCACHE_NO_IO
+ generic_make_request(&rbio->bio);
+#else
+ bio_endio(&rbio->bio);
+#endif
+}
+
+static void bch_read_iter(struct cache_set *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ unsigned flags)
+{
+ struct bio *bio = &rbio->bio;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode, bvec_iter.bi_sector), k) {
+ BKEY_PADDED(k) tmp;
+ struct extent_pick_ptr pick;
+ unsigned bytes, sectors;
+ bool is_last;
+
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+ bch_btree_iter_unlock(&iter);
+
+ bch_extent_pick_ptr(c, k, &pick);
+ if (IS_ERR(pick.ca)) {
+ bcache_io_error(c, bio, "no device to read from");
+ bio_endio(bio);
+ return;
+ }
+
+ sectors = min_t(u64, k.k->p.offset,
+ bvec_iter_end_sector(bvec_iter)) -
+ bvec_iter.bi_sector;
+ bytes = sectors << 9;
+ is_last = bytes == bvec_iter.bi_size;
+ swap(bvec_iter.bi_size, bytes);
+
+ if (is_last)
+ flags |= BCH_READ_IS_LAST;
+
+ if (pick.ca) {
+ PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
+ c->prio_clock[READ].hand;
+
+ bch_read_extent_iter(c, rbio, bvec_iter,
+ k, &pick, flags);
+
+ flags &= ~BCH_READ_MAY_REUSE_BIO;
+ } else {
+ zero_fill_bio_iter(bio, bvec_iter);
+
+ if (is_last)
+ bio_endio(bio);
+ }
+
+ if (is_last)
+ return;
+
+ swap(bvec_iter.bi_size, bytes);
+ bio_advance_iter(bio, &bvec_iter, bytes);
+ }
+
+ /*
+ * If we get here, it better have been because there was an error
+ * reading a btree node
+ */
+ ret = bch_btree_iter_unlock(&iter);
+ BUG_ON(!ret);
+ bcache_io_error(c, bio, "btree IO error %i", ret);
+ bio_endio(bio);
+}
+
+void bch_read(struct cache_set *c, struct bch_read_bio *bio, u64 inode)
+{
+ bch_increment_clock(c, bio_sectors(&bio->bio), READ);
+
+ bch_read_iter(c, bio, bio->bio.bi_iter, inode,
+ BCH_READ_FORCE_BOUNCE|
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_PROMOTE|
+ BCH_READ_MAY_REUSE_BIO);
+}
+EXPORT_SYMBOL(bch_read);
+
+/**
+ * bch_read_retry - re-submit a bio originally from bch_read()
+ */
+static void bch_read_retry(struct cache_set *c, struct bch_read_bio *rbio)
+{
+ struct bch_read_bio *parent = bch_rbio_parent(rbio);
+ struct bvec_iter iter = rbio->parent_iter;
+ u64 inode = rbio->inode;
+
+ trace_bcache_read_retry(&rbio->bio);
+
+ if (rbio->split)
+ bch_rbio_free(c, rbio);
+ else
+ rbio->bio.bi_end_io = rbio->orig_bi_end_io;
+
+ bch_read_iter(c, parent, iter, inode,
+ BCH_READ_FORCE_BOUNCE|
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_PROMOTE);
+}
+
+void bch_read_retry_work(struct work_struct *work)
+{
+ struct cache_set *c = container_of(work, struct cache_set,
+ read_retry_work);
+ struct bch_read_bio *rbio;
+ struct bio *bio;
+ unsigned long flags;
+
+ while (1) {
+ spin_lock_irqsave(&c->read_retry_lock, flags);
+ bio = bio_list_pop(&c->read_retry_list);
+ spin_unlock_irqrestore(&c->read_retry_lock, flags);
+
+ if (!bio)
+ break;
+
+ rbio = container_of(bio, struct bch_read_bio, bio);
+ bch_read_retry(c, rbio);
+ }
+}
diff --git a/libbcache/io.h b/libbcache/io.h
new file mode 100644
index 0000000..b7668b4
--- /dev/null
+++ b/libbcache/io.h
@@ -0,0 +1,90 @@
+#ifndef _BCACHE_IO_H
+#define _BCACHE_IO_H
+
+#include "io_types.h"
+
+#define to_wbio(_bio) \
+ container_of((_bio), struct bch_write_bio, bio)
+
+#define to_rbio(_bio) \
+ container_of((_bio), struct bch_read_bio, bio)
+
+void bch_bio_free_pages_pool(struct cache_set *, struct bio *);
+void bch_bio_alloc_pages_pool(struct cache_set *, struct bio *, size_t);
+
+enum bch_write_flags {
+ BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
+ BCH_WRITE_DISCARD = (1 << 1),
+ BCH_WRITE_CACHED = (1 << 2),
+ BCH_WRITE_FLUSH = (1 << 3),
+ BCH_WRITE_DISCARD_ON_ERROR = (1 << 4),
+ BCH_WRITE_DATA_COMPRESSED = (1 << 5),
+
+ /* Internal: */
+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6),
+ BCH_WRITE_DONE = (1 << 7),
+ BCH_WRITE_LOOPED = (1 << 8),
+};
+
+static inline u64 *op_journal_seq(struct bch_write_op *op)
+{
+ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
+ ? op->journal_seq_p : &op->journal_seq;
+}
+
+static inline struct write_point *foreground_write_point(struct cache_set *c,
+ unsigned long v)
+{
+ return c->write_points +
+ hash_long(v, ilog2(ARRAY_SIZE(c->write_points)));
+}
+
+void bch_write_op_init(struct bch_write_op *, struct cache_set *,
+ struct bch_write_bio *,
+ struct disk_reservation, struct write_point *,
+ struct bpos, u64 *, unsigned);
+void bch_write(struct closure *);
+
+struct cache_promote_op;
+
+struct extent_pick_ptr;
+
+void bch_read_extent_iter(struct cache_set *, struct bch_read_bio *,
+ struct bvec_iter, struct bkey_s_c k,
+ struct extent_pick_ptr *, unsigned);
+
+static inline void bch_read_extent(struct cache_set *c,
+ struct bch_read_bio *orig,
+ struct bkey_s_c k,
+ struct extent_pick_ptr *pick,
+ unsigned flags)
+{
+ bch_read_extent_iter(c, orig, orig->bio.bi_iter,
+ k, pick, flags);
+}
+
+enum bch_read_flags {
+ BCH_READ_FORCE_BOUNCE = 1 << 0,
+ BCH_READ_RETRY_IF_STALE = 1 << 1,
+ BCH_READ_PROMOTE = 1 << 2,
+ BCH_READ_IS_LAST = 1 << 3,
+ BCH_READ_MAY_REUSE_BIO = 1 << 4,
+};
+
+void bch_read(struct cache_set *, struct bch_read_bio *, u64);
+
+void bch_generic_make_request(struct bio *, struct cache_set *);
+void bch_bio_submit_work(struct work_struct *);
+void bch_submit_wbio_replicas(struct bch_write_bio *, struct cache_set *,
+ const struct bkey_i *, bool);
+
+int bch_discard(struct cache_set *, struct bpos, struct bpos,
+ u64, struct disk_reservation *,
+ struct extent_insert_hook *, u64 *);
+
+void bch_read_retry_work(struct work_struct *);
+void bch_wake_delayed_writes(unsigned long data);
+
+void bch_bio_decompress_work(struct work_struct *);
+
+#endif /* _BCACHE_IO_H */
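To tie the read side together (editor's sketch, not part of this patch: the
bioset, end_io function and variables are hypothetical, and error handling is
omitted), a caller allocates a bio from a bioset whose front padding covers
struct bch_read_bio, fills it in as usual, and hands it to bch_read() along
with the target inode number:

	struct bio *bio = bio_alloc_bioset(GFP_NOIO, nr_pages, &my_read_bioset);
	struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio);

	bio->bi_iter.bi_sector	= sector;	/* offset within the inode, in 512-byte sectors */
	bio->bi_end_io		= my_read_endio;
	/* add data pages with bio_add_page() ... */

	bch_read(c, rbio, inode_nr);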
diff --git a/libbcache/io_types.h b/libbcache/io_types.h
new file mode 100644
index 0000000..f7d99cd
--- /dev/null
+++ b/libbcache/io_types.h
@@ -0,0 +1,148 @@
+#ifndef _BCACHE_IO_TYPES_H
+#define _BCACHE_IO_TYPES_H
+
+#include "btree_types.h"
+#include "buckets_types.h"
+#include "keylist_types.h"
+
+#include <linux/llist.h>
+#include <linux/workqueue.h>
+
+struct bch_read_bio {
+ /*
+ * Reads will often have to be split, and if the extent being read from
+ * was checksummed or compressed we'll also have to allocate bounce
+ * buffers and copy the data back into the original bio.
+ *
+ * If we didn't have to split, we have to save and restore the original
+ * bi_end_io - @split below indicates which:
+ */
+ union {
+ struct bch_read_bio *parent;
+ bio_end_io_t *orig_bi_end_io;
+ };
+
+ /*
+ * Saved copy of parent->bi_iter, from submission time - allows us to
+ * resubmit on IO error, and also to copy data back to the original bio
+ * when we're bouncing:
+ */
+ struct bvec_iter parent_iter;
+
+ /*
+	 * If we have to retry the read (IO error, checksum failure, or we read
+	 * stale data (raced with the allocator)), we retry the portion of the
+	 * parent bio that failed (i.e. this bio's portion, parent_iter).
+ *
+ * But we need to stash the inode somewhere:
+ */
+ u64 inode;
+
+ unsigned submit_time_us;
+ u16 flags;
+ u8 bounce:1,
+ split:1;
+
+ struct bch_extent_crc64 crc;
+ struct bch_extent_ptr ptr;
+ struct cache *ca;
+
+ struct cache_promote_op *promote;
+
+ /* bio_decompress_worker list */
+ struct llist_node list;
+
+ struct bio bio;
+};
+
+static inline struct bch_read_bio *
+bch_rbio_parent(struct bch_read_bio *rbio)
+{
+ return rbio->split ? rbio->parent : rbio;
+}
+
+struct bch_write_bio {
+ struct cache_set *c;
+ struct cache *ca;
+ union {
+ struct bio *orig;
+ struct closure *cl;
+ };
+
+ unsigned submit_time_us;
+ unsigned split:1,
+ bounce:1,
+ put_bio:1;
+
+ /* Only for btree writes: */
+ unsigned used_mempool:1;
+ u8 order;
+
+ struct bio bio;
+};
+
+struct bch_replace_info {
+ struct extent_insert_hook hook;
+ /* How many insertions succeeded */
+ unsigned successes;
+ /* How many insertions failed */
+ unsigned failures;
+ BKEY_PADDED(key);
+};
+
+struct bch_write_op {
+ struct closure cl;
+ struct cache_set *c;
+ struct workqueue_struct *io_wq;
+ struct bch_write_bio *bio;
+
+ unsigned written; /* sectors */
+
+ short error;
+
+ u16 flags;
+ unsigned compression_type:4;
+ unsigned nr_replicas:4;
+ unsigned alloc_reserve:4;
+
+ struct bpos pos;
+ unsigned version;
+
+ /* For BCH_WRITE_DATA_COMPRESSED: */
+ struct bch_extent_crc64 crc;
+ unsigned size;
+
+ struct disk_reservation res;
+
+ struct write_point *wp;
+
+ union {
+ u8 open_buckets[16];
+ struct {
+ struct bch_write_op *next;
+ unsigned long expires;
+ };
+ };
+
+ /*
+ * If caller wants to flush but hasn't passed us a journal_seq ptr, we
+ * still need to stash the journal_seq somewhere:
+ */
+ union {
+ u64 *journal_seq_p;
+ u64 journal_seq;
+ };
+
+ int (*index_update_fn)(struct bch_write_op *);
+
+ struct keylist insert_keys;
+ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
+};
+
+struct bio_decompress_worker {
+ struct cache_set *c;
+ struct work_struct work;
+ struct llist_head bio_list;
+};
+
+#endif /* _BCACHE_IO_TYPES_H */
diff --git a/libbcache/journal.c b/libbcache/journal.c
new file mode 100644
index 0000000..ffc9573
--- /dev/null
+++ b/libbcache/journal.c
@@ -0,0 +1,2585 @@
+/*
+ * bcache journalling code, for btree insertions
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "alloc.h"
+#include "bkey_methods.h"
+#include "buckets.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_io.h"
+#include "checksum.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "journal.h"
+#include "super.h"
+
+#include <trace/events/bcache.h>
+
+static void journal_write(struct closure *);
+static void journal_reclaim_fast(struct journal *);
+static void journal_pin_add_entry(struct journal *,
+ struct journal_entry_pin_list *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
+
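+/*
+ * The journal is double buffered; j->reservations.idx selects the buffer
+ * currently being filled, the other buffer holds the previous journal entry
+ * (which may still be being written out):
+ */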
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
+{
+ return j->buf + j->reservations.idx;
+}
+
+static inline struct journal_buf *journal_prev_buf(struct journal *j)
+{
+ return j->buf + !j->reservations.idx;
+}
+
+/* Sequence number of oldest dirty journal entry */
+
+static inline u64 last_seq(struct journal *j)
+{
+ return atomic64_read(&j->seq) - fifo_used(&j->pin) + 1;
+}
+
+static inline u64 journal_pin_seq(struct journal *j,
+ struct journal_entry_pin_list *pin_list)
+{
+ return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
+}
+
+#define for_each_jset_entry(entry, jset) \
+ for (entry = (jset)->start; \
+ entry < bkey_idx(jset, le32_to_cpu((jset)->u64s)); \
+ entry = jset_keys_next(entry))
+
+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
+ struct jset_entry *entry, unsigned type)
+{
+ while (entry < bkey_idx(jset, le32_to_cpu(jset->u64s))) {
+ if (JOURNAL_ENTRY_TYPE(entry) == type)
+ return entry;
+
+ entry = jset_keys_next(entry);
+ }
+
+ return NULL;
+}
+
+#define for_each_jset_entry_type(entry, jset, type) \
+ for (entry = (jset)->start; \
+ (entry = __jset_entry_type_next(jset, entry, type)); \
+ entry = jset_keys_next(entry))
+
+#define for_each_jset_key(k, _n, entry, jset) \
+ for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
+ for (k = (entry)->start; \
+ (k < bkey_idx(entry, le16_to_cpu((entry)->u64s)) &&\
+ (_n = bkey_next(k), 1)); \
+ k = _n)
+
+static inline void bch_journal_add_entry(struct journal_buf *buf,
+ const void *data, size_t u64s,
+ unsigned type, enum btree_id id,
+ unsigned level)
+{
+ struct jset *jset = buf->data;
+
+ bch_journal_add_entry_at(buf, data, u64s, type, id, level,
+ le32_to_cpu(jset->u64s));
+ le32_add_cpu(&jset->u64s, jset_u64s(u64s));
+}
+
+static struct jset_entry *bch_journal_find_entry(struct jset *j, unsigned type,
+ enum btree_id id)
+{
+ struct jset_entry *entry;
+
+ for_each_jset_entry_type(entry, j, type)
+ if (entry->btree_id == id)
+ return entry;
+
+ return NULL;
+}
+
+struct bkey_i *bch_journal_find_btree_root(struct cache_set *c, struct jset *j,
+ enum btree_id id, unsigned *level)
+{
+ struct bkey_i *k;
+ struct jset_entry *entry =
+ bch_journal_find_entry(j, JOURNAL_ENTRY_BTREE_ROOT, id);
+
+ if (!entry)
+ return NULL;
+
+ k = entry->start;
+ *level = entry->level;
+ return k;
+}
+
+static void bch_journal_add_btree_root(struct journal_buf *buf,
+ enum btree_id id, struct bkey_i *k,
+ unsigned level)
+{
+ bch_journal_add_entry(buf, k, k->k.u64s,
+ JOURNAL_ENTRY_BTREE_ROOT, id, level);
+}
+
+static inline void bch_journal_add_prios(struct journal *j,
+ struct journal_buf *buf)
+{
+ /*
+ * no prio bucket ptrs yet... XXX should change the allocator so this
+ * can't happen:
+ */
+ if (!buf->nr_prio_buckets)
+ return;
+
+ bch_journal_add_entry(buf, j->prio_buckets, buf->nr_prio_buckets,
+ JOURNAL_ENTRY_PRIO_PTRS, 0, 0);
+}
+
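+/*
+ * Journal pin flush fn for a blacklist entry: rewrite the btree nodes that
+ * referenced blacklisted sequence numbers (waiting for any pending frees of
+ * those nodes), then drop the blacklist entry:
+ */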
+static void journal_seq_blacklist_flush(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ struct cache_set *c =
+ container_of(j, struct cache_set, journal);
+ struct journal_seq_blacklist *bl =
+ container_of(pin, struct journal_seq_blacklist, pin);
+ struct blacklisted_node n;
+ struct closure cl;
+ unsigned i;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ for (i = 0;; i++) {
+ struct btree_iter iter;
+ struct btree *b;
+
+ mutex_lock(&j->blacklist_lock);
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
+ break;
+ }
+ n = bl->entries[i];
+ mutex_unlock(&j->blacklist_lock);
+
+ bch_btree_iter_init(&iter, c, n.btree_id, n.pos);
+ iter.is_extents = false;
+redo_peek:
+ b = bch_btree_iter_peek_node(&iter);
+
+ /* The node might have already been rewritten: */
+
+ if (b->data->keys.seq == n.seq &&
+ !bkey_cmp(b->key.k.p, n.pos)) {
+ ret = bch_btree_node_rewrite(&iter, b, &cl);
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+ closure_sync(&cl);
+
+ if (ret == -EAGAIN ||
+ ret == -EINTR)
+ goto redo_peek;
+
+ /* -EROFS or perhaps -ENOSPC - bail out: */
+ /* XXX warn here */
+ return;
+ }
+ }
+
+ bch_btree_iter_unlock(&iter);
+ }
+
+ closure_sync(&cl);
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ for (i = 0;; i++) {
+ struct btree_interior_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&j->blacklist_lock);
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
+ break;
+ }
+ n = bl->entries[i];
+ mutex_unlock(&j->blacklist_lock);
+
+ /*
+ * Is the node on the list of pending interior node updates -
+ * being freed? If so, wait for that to finish:
+ */
+ for_each_pending_btree_node_free(c, as, d)
+ if (n.seq == d->seq &&
+ n.btree_id == d->btree_id &&
+ !d->level &&
+ !bkey_cmp(n.pos, d->key.k.p)) {
+ closure_wait(&as->wait, &cl);
+ mutex_unlock(&c->btree_interior_update_lock);
+ closure_sync(&cl);
+ break;
+ }
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ mutex_lock(&j->blacklist_lock);
+
+ bch_journal_pin_drop(j, &bl->pin);
+ list_del(&bl->list);
+ kfree(bl->entries);
+ kfree(bl);
+
+ mutex_unlock(&j->blacklist_lock);
+}
+
+static struct journal_seq_blacklist *
+journal_seq_blacklist_find(struct journal *j, u64 seq)
+{
+ struct journal_seq_blacklist *bl;
+
+ lockdep_assert_held(&j->blacklist_lock);
+
+ list_for_each_entry(bl, &j->seq_blacklist, list)
+ if (seq == bl->seq)
+ return bl;
+
+ return NULL;
+}
+
+static struct journal_seq_blacklist *
+bch_journal_seq_blacklisted_new(struct journal *j, u64 seq)
+{
+ struct journal_seq_blacklist *bl;
+
+ lockdep_assert_held(&j->blacklist_lock);
+
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl)
+ return NULL;
+
+ bl->seq = seq;
+ list_add_tail(&bl->list, &j->seq_blacklist);
+ return bl;
+}
+
+/*
+ * Returns true if @seq is newer than the most recent journal entry that got
+ * written, and data corresponding to @seq should be ignored - also marks @seq
+ * as blacklisted so that on future restarts the corresponding data will still
+ * be ignored:
+ */
+int bch_journal_seq_should_ignore(struct cache_set *c, u64 seq, struct btree *b)
+{
+ struct journal *j = &c->journal;
+ struct journal_seq_blacklist *bl = NULL;
+ struct blacklisted_node *n;
+ u64 journal_seq, i;
+ int ret = 0;
+
+ if (!seq)
+ return 0;
+
+ journal_seq = atomic64_read(&j->seq);
+
+ /* Interior updates aren't journalled: */
+ BUG_ON(b->level);
+ BUG_ON(seq > journal_seq && test_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags));
+
+ if (seq <= journal_seq) {
+ if (list_empty_careful(&j->seq_blacklist))
+ return 0;
+
+ mutex_lock(&j->blacklist_lock);
+ ret = journal_seq_blacklist_find(j, seq) != NULL;
+ mutex_unlock(&j->blacklist_lock);
+ return ret;
+ }
+
+ /*
+ * Decrease this back to j->seq + 2 when we next rev the on disk format:
+ * increasing it temporarily to work around bug in old kernels
+ */
+ cache_set_inconsistent_on(seq > journal_seq + 4, c,
+ "bset journal seq too far in the future: %llu > %llu",
+ seq, journal_seq);
+
+ bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
+ b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
+
+ /*
+ * When we start the journal, bch_journal_start() will skip over @seq:
+ */
+
+ mutex_lock(&j->blacklist_lock);
+
+ for (i = journal_seq + 1; i <= seq; i++) {
+ bl = journal_seq_blacklist_find(j, i) ?:
+ bch_journal_seq_blacklisted_new(j, i);
+
+ if (!bl) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
+ if (b->data->keys.seq == n->seq &&
+ b->btree_id == n->btree_id &&
+ !bkey_cmp(b->key.k.p, n->pos))
+ goto found_entry;
+
+ if (!bl->nr_entries ||
+ is_power_of_2(bl->nr_entries)) {
+ n = krealloc(bl->entries,
+ max(bl->nr_entries * 2, 8UL) * sizeof(*n),
+ GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ bl->entries = n;
+ }
+
+ bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
+ .seq = b->data->keys.seq,
+ .btree_id = b->btree_id,
+ .pos = b->key.k.p,
+ };
+found_entry:
+ ret = 1;
+out:
+ mutex_unlock(&j->blacklist_lock);
+ return ret;
+}
+
+/*
+ * Journal replay/recovery:
+ *
+ * This code is all driven from run_cache_set(); we first read the journal
+ * entries, do some other stuff, then we mark all the keys in the journal
+ * entries (same as garbage collection would), then we replay them - reinserting
+ * them into the cache in precisely the same order as they appear in the
+ * journal.
+ *
+ * We only journal keys that go in leaf nodes, which simplifies things quite a
+ * bit.
+ */
+
+struct journal_list {
+ struct closure cl;
+ struct mutex lock;
+ struct mutex cache_set_buffer_lock;
+ struct list_head *head;
+ int ret;
+};
+
+#define JOURNAL_ENTRY_ADD_OK 0
+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
+
+/*
+ * Given a journal entry we just read, add it to the list of journal entries to
+ * be replayed:
+ */
+static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
+ struct jset *j)
+{
+ struct journal_replay *i, *pos;
+ struct list_head *where;
+ size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
+ __le64 last_seq;
+ int ret;
+
+ mutex_lock(&jlist->lock);
+
+ last_seq = !list_empty(jlist->head)
+ ? list_last_entry(jlist->head, struct journal_replay,
+ list)->j.last_seq
+ : 0;
+
+ /* Is this entry older than the range we need? */
+ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
+ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+ goto out;
+ }
+
+ /* Drop entries we don't need anymore */
+ list_for_each_entry_safe(i, pos, jlist->head, list) {
+ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+ break;
+ list_del(&i->list);
+ kfree(i);
+ }
+
+ list_for_each_entry_reverse(i, jlist->head, list) {
+ /* Duplicate? */
+ if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
+ fsck_err_on(bytes != __set_bytes(&i->j,
+ le32_to_cpu(i->j.u64s)) ||
+ memcmp(j, &i->j, bytes), c,
+ "found duplicate but non identical journal entries (seq %llu)",
+ le64_to_cpu(j->seq));
+
+ ret = JOURNAL_ENTRY_ADD_OK;
+ goto out;
+ }
+
+ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
+ where = &i->list;
+ goto add;
+ }
+ }
+
+ where = jlist->head;
+add:
+ i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ if (!i) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(&i->j, j, bytes);
+ list_add(&i->list, where);
+ ret = JOURNAL_ENTRY_ADD_OK;
+out:
+fsck_err:
+ mutex_unlock(&jlist->lock);
+ return ret;
+}
+
+static void journal_entry_null_range(void *start, void *end)
+{
+ struct jset_entry *entry;
+
+ for (entry = start; entry != end; entry = jset_keys_next(entry)) {
+ entry->u64s = 0;
+ entry->btree_id = 0;
+ entry->level = 0;
+ entry->flags = 0;
+ SET_JOURNAL_ENTRY_TYPE(entry, 0);
+ }
+}
+
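+/*
+ * Validate a single key in a journal entry: invalid keys are dropped (or the
+ * entry truncated) rather than failing the whole journal read:
+ */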
+static int journal_validate_key(struct cache_set *c, struct jset *j,
+ struct jset_entry *entry,
+ struct bkey_i *k, enum bkey_type key_type,
+ const char *type)
+{
+ void *next = jset_keys_next(entry);
+ const char *invalid;
+ char buf[160];
+ int ret = 0;
+
+ if (fsck_err_on(!k->k.u64s, c,
+ "invalid %s in journal: k->u64s 0", type)) {
+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+ journal_entry_null_range(jset_keys_next(entry), next);
+ return 0;
+ }
+
+ if (fsck_err_on((void *) bkey_next(k) >
+ (void *) jset_keys_next(entry), c,
+ "invalid %s in journal: extends past end of journal entry",
+ type)) {
+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+ journal_entry_null_range(jset_keys_next(entry), next);
+ return 0;
+ }
+
+ if (fsck_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
+ "invalid %s in journal: bad format %u",
+ type, k->k.format)) {
+ le16_add_cpu(&entry->u64s, -k->k.u64s);
+ memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+ journal_entry_null_range(jset_keys_next(entry), next);
+ return 0;
+ }
+
+ if (JSET_BIG_ENDIAN(j) != CPU_BIG_ENDIAN)
+ bch_bkey_swab(key_type, NULL, bkey_to_packed(k));
+
+ invalid = bkey_invalid(c, key_type, bkey_i_to_s_c(k));
+ if (invalid) {
+ bch_bkey_val_to_text(c, key_type, buf, sizeof(buf),
+ bkey_i_to_s_c(k));
+ fsck_err(c, "invalid %s in journal: %s", type, buf);
+
+ le16_add_cpu(&entry->u64s, -k->k.u64s);
+ memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+ journal_entry_null_range(jset_keys_next(entry), next);
+ return 0;
+ }
+fsck_err:
+ return ret;
+}
+
+#define JOURNAL_ENTRY_REREAD 5
+#define JOURNAL_ENTRY_NONE 6
+#define JOURNAL_ENTRY_BAD 7
+
+static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 sector,
+ unsigned bucket_sectors_left,
+ unsigned sectors_read)
+{
+ struct jset_entry *entry;
+ size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
+ u64 got, expect;
+ int ret = 0;
+
+ if (le64_to_cpu(j->magic) != jset_magic(&c->disk_sb))
+ return JOURNAL_ENTRY_NONE;
+
+ if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
+ bch_err(c, "unknown journal entry version %u",
+ le32_to_cpu(j->version));
+ return BCH_FSCK_UNKNOWN_VERSION;
+ }
+
+ if (fsck_err_on(bytes > bucket_sectors_left << 9 ||
+ bytes > c->journal.entry_size_max, c,
+ "journal entry too big (%zu bytes), sector %lluu",
+ bytes, sector)) {
+ /* XXX: note we might have missing journal entries */
+ return JOURNAL_ENTRY_BAD;
+ }
+
+ if (bytes > sectors_read << 9)
+ return JOURNAL_ENTRY_REREAD;
+
+ got = le64_to_cpu(j->csum);
+ expect = __csum_set(j, le32_to_cpu(j->u64s), JSET_CSUM_TYPE(j));
+ if (fsck_err_on(got != expect, c,
+ "journal checksum bad (got %llu expect %llu), sector %lluu",
+ got, expect, sector)) {
+ /* XXX: retry IO, when we start retrying checksum errors */
+ /* XXX: note we might have missing journal entries */
+ return JOURNAL_ENTRY_BAD;
+ }
+
+ if (fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
+ "invalid journal entry: last_seq > seq"))
+ j->last_seq = j->seq;
+
+ for_each_jset_entry(entry, j) {
+ struct bkey_i *k;
+
+ if (fsck_err_on(jset_keys_next(entry) >
+ bkey_idx(j, le32_to_cpu(j->u64s)), c,
+ "journal entry extents past end of jset")) {
+ j->u64s = cpu_to_le32((u64 *) entry - j->_data);
+ break;
+ }
+
+ switch (JOURNAL_ENTRY_TYPE(entry)) {
+ case JOURNAL_ENTRY_BTREE_KEYS:
+ for (k = entry->start;
+ k < bkey_idx(entry, le16_to_cpu(entry->u64s));
+ k = bkey_next(k)) {
+ ret = journal_validate_key(c, j, entry, k,
+ bkey_type(entry->level,
+ entry->btree_id),
+ "key");
+ if (ret)
+ goto fsck_err;
+ }
+ break;
+
+ case JOURNAL_ENTRY_BTREE_ROOT:
+ k = entry->start;
+
+ if (fsck_err_on(!entry->u64s ||
+ le16_to_cpu(entry->u64s) != k->k.u64s, c,
+ "invalid btree root journal entry: wrong number of keys")) {
+ journal_entry_null_range(entry,
+ jset_keys_next(entry));
+ continue;
+ }
+
+ ret = journal_validate_key(c, j, entry, k,
+ BKEY_TYPE_BTREE, "btree root");
+ if (ret)
+ goto fsck_err;
+ break;
+
+ case JOURNAL_ENTRY_PRIO_PTRS:
+ break;
+
+ case JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED:
+ if (fsck_err_on(le16_to_cpu(entry->u64s) != 1, c,
+ "invalid journal seq blacklist entry: bad size")) {
+ journal_entry_null_range(entry,
+ jset_keys_next(entry));
+ }
+
+ break;
+ default:
+ fsck_err(c, "invalid journal entry type %llu",
+ JOURNAL_ENTRY_TYPE(entry));
+ journal_entry_null_range(entry, jset_keys_next(entry));
+ break;
+ }
+ }
+
+fsck_err:
+ return ret;
+}
+
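+/*
+ * Read and validate all the journal entries in a single journal bucket,
+ * adding the good ones to @jlist; updates *seq to the highest sequence number
+ * seen:
+ */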
+static int journal_read_bucket(struct cache *ca, struct journal_list *jlist,
+ unsigned bucket, u64 *seq, bool *entries_found)
+{
+ struct cache_set *c = ca->set;
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = ja->bio;
+ struct jset *j, *data;
+ unsigned blocks, sectors_read, bucket_offset = 0;
+ unsigned max_entry_sectors = c->journal.entry_size_max >> 9;
+ u64 sector = bucket_to_sector(ca,
+ journal_bucket(ca->disk_sb.sb, bucket));
+ bool saw_bad = false;
+ int ret = 0;
+
+ data = (void *) __get_free_pages(GFP_KERNEL,
+ get_order(c->journal.entry_size_max));
+ if (!data) {
+ mutex_lock(&jlist->cache_set_buffer_lock);
+ data = c->journal.buf[0].data;
+ }
+
+ pr_debug("reading %u", bucket);
+
+ while (bucket_offset < ca->mi.bucket_size) {
+reread:
+ sectors_read = min_t(unsigned,
+ ca->mi.bucket_size - bucket_offset,
+ max_entry_sectors);
+
+ bio_reset(bio);
+ bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_iter.bi_sector = sector + bucket_offset;
+ bio->bi_iter.bi_size = sectors_read << 9;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bch_bio_map(bio, data);
+
+ ret = submit_bio_wait(bio);
+
+ if (cache_fatal_io_err_on(ret, ca,
+ "journal read from sector %llu",
+ sector + bucket_offset) ||
+ bch_meta_read_fault("journal")) {
+ ret = -EIO;
+ goto err;
+ }
+
+ /* This function could be simpler now since we no longer write
+ * journal entries that overlap bucket boundaries; this means
+ * the start of a bucket will always have a valid journal entry
+ * if it has any journal entries at all.
+ */
+
+ j = data;
+ while (sectors_read) {
+ ret = journal_entry_validate(c, j,
+ sector + bucket_offset,
+ ca->mi.bucket_size - bucket_offset,
+ sectors_read);
+ switch (ret) {
+ case BCH_FSCK_OK:
+ break;
+ case JOURNAL_ENTRY_REREAD:
+ goto reread;
+ case JOURNAL_ENTRY_NONE:
+ if (!saw_bad)
+ goto out;
+ blocks = 1;
+ goto next_block;
+ case JOURNAL_ENTRY_BAD:
+ saw_bad = true;
+ blocks = 1;
+ goto next_block;
+ default:
+ goto err;
+ }
+
+ /*
+ * This happens sometimes if we don't have discards on -
+ * when we've partially overwritten a bucket with new
+ * journal entries. We don't need the rest of the
+ * bucket:
+ */
+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+ goto out;
+
+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
+
+ ret = journal_entry_add(c, jlist, j);
+ switch (ret) {
+ case JOURNAL_ENTRY_ADD_OK:
+ *entries_found = true;
+ break;
+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+ break;
+ default:
+ goto err;
+ }
+
+ if (le64_to_cpu(j->seq) > *seq)
+ *seq = le64_to_cpu(j->seq);
+next_block:
+ blocks = __set_blocks(j, le32_to_cpu(j->u64s),
+ block_bytes(c));
+
+ pr_debug("next");
+ bucket_offset += blocks * c->sb.block_size;
+ sectors_read -= blocks * c->sb.block_size;
+ j = ((void *) j) + blocks * block_bytes(c);
+ }
+ }
+out:
+ ret = 0;
+err:
+ if (data == c->journal.buf[0].data)
+ mutex_unlock(&jlist->cache_set_buffer_lock);
+ else
+ free_pages((unsigned long) data,
+ get_order(c->journal.entry_size_max));
+
+ return ret;
+}
+
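+/*
+ * Per device journal read, run as a closure: reads the device's journal
+ * buckets, adds the valid entries found to the journal list, and works out
+ * which bucket the next journal write to this device should go to:
+ */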
+static void bch_journal_read_device(struct closure *cl)
+{
+#define read_bucket(b) \
+ ({ \
+ bool entries_found = false; \
+ int ret = journal_read_bucket(ca, jlist, b, \
+ &seq, &entries_found); \
+ __set_bit(b, bitmap); \
+ if (ret) { \
+ mutex_lock(&jlist->lock); \
+ jlist->ret = ret; \
+ mutex_unlock(&jlist->lock); \
+ closure_return(cl); \
+ } \
+ entries_found; \
+ })
+
+ struct journal_device *ja =
+ container_of(cl, struct journal_device, read);
+ struct cache *ca = container_of(ja, struct cache, journal);
+ struct journal_list *jlist =
+ container_of(cl->parent, struct journal_list, cl);
+ struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
+
+ unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
+ DECLARE_BITMAP(bitmap, nr_buckets);
+ unsigned i, l, r;
+ u64 seq = 0;
+
+ if (!nr_buckets)
+ closure_return(cl);
+
+ bitmap_zero(bitmap, nr_buckets);
+ pr_debug("%u journal buckets", nr_buckets);
+
+ /*
+ * If the device supports discard but not secure discard, we can't do
+ * the fancy fibonacci hash/binary search because the live journal
+ * entries might not form a contiguous range:
+ */
+ for (i = 0; i < nr_buckets; i++)
+ read_bucket(i);
+ goto search_done;
+
+ if (!blk_queue_nonrot(q))
+ goto linear_scan;
+
+ /*
+ * Read journal buckets ordered by golden ratio hash to quickly
+ * find a sequence of buckets with valid journal entries
+ */
+ for (i = 0; i < nr_buckets; i++) {
+ l = (i * 2654435769U) % nr_buckets;
+
+ if (test_bit(l, bitmap))
+ break;
+
+ if (read_bucket(l))
+ goto bsearch;
+ }
+
+ /*
+ * If that fails, check all the buckets we haven't checked
+ * already
+ */
+ pr_debug("falling back to linear search");
+linear_scan:
+ for (l = find_first_zero_bit(bitmap, nr_buckets);
+ l < nr_buckets;
+ l = find_next_zero_bit(bitmap, nr_buckets, l + 1))
+ if (read_bucket(l))
+ goto bsearch;
+
+ /* no journal entries on this device? */
+ if (l == nr_buckets)
+ closure_return(cl);
+bsearch:
+ /* Binary search */
+ r = find_next_bit(bitmap, nr_buckets, l + 1);
+ pr_debug("starting binary search, l %u r %u", l, r);
+
+ while (l + 1 < r) {
+ unsigned m = (l + r) >> 1;
+ u64 cur_seq = seq;
+
+ read_bucket(m);
+
+ if (cur_seq != seq)
+ l = m;
+ else
+ r = m;
+ }
+
+search_done:
+ /*
+ * Find the journal bucket with the highest sequence number:
+ *
+ * If there are duplicate journal entries in multiple buckets (which
+ * definitely isn't supposed to happen, but...) - make sure to start
+ * cur_idx at the last of those buckets, so we don't deadlock trying to
+ * allocate
+ */
+ seq = 0;
+
+ for (i = 0; i < nr_buckets; i++)
+ if (ja->bucket_seq[i] >= seq &&
+ ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % nr_buckets]) {
+ /*
+ * When journal_next_bucket() goes to allocate for
+ * the first time, it'll use the bucket after
+ * ja->cur_idx
+ */
+ ja->cur_idx = i;
+ seq = ja->bucket_seq[i];
+ }
+
+ /*
+ * Set last_idx to indicate the entire journal is full and needs to be
+ * reclaimed - journal reclaim will immediately reclaim whatever isn't
+ * pinned when it first runs:
+ */
+ ja->last_idx = (ja->cur_idx + 1) % nr_buckets;
+
+ /*
+ * Read buckets in reverse order until we stop finding more journal
+ * entries:
+ */
+ for (i = (ja->cur_idx + nr_buckets - 1) % nr_buckets;
+ i != ja->cur_idx;
+ i = (i + nr_buckets - 1) % nr_buckets)
+ if (!test_bit(i, bitmap) &&
+ !read_bucket(i))
+ break;
+
+ closure_return(cl);
+#undef read_bucket
+}
+
+void bch_journal_entries_free(struct list_head *list)
+{
+ while (!list_empty(list)) {
+ struct journal_replay *i =
+ list_first_entry(list, struct journal_replay, list);
+ list_del(&i->list);
+ kvfree(i);
+ }
+}
+
+static int journal_seq_blacklist_read(struct journal *j,
+ struct journal_replay *i,
+ struct journal_entry_pin_list *p)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct jset_entry *entry;
+ struct journal_seq_blacklist *bl;
+ u64 seq;
+
+ for_each_jset_entry_type(entry, &i->j,
+ JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
+ seq = le64_to_cpu(entry->_data[0]);
+
+ bch_verbose(c, "blacklisting existing journal seq %llu", seq);
+
+ bl = bch_journal_seq_blacklisted_new(j, seq);
+ if (!bl)
+ return -ENOMEM;
+
+ journal_pin_add_entry(j, p, &bl->pin,
+ journal_seq_blacklist_flush);
+ bl->written = true;
+ }
+
+ return 0;
+}
+
+int bch_journal_read(struct cache_set *c, struct list_head *list)
+{
+ struct jset_entry *prio_ptrs;
+ struct journal_list jlist;
+ struct journal_replay *i;
+ struct jset *j;
+ struct journal_entry_pin_list *p;
+ struct cache *ca;
+ u64 cur_seq, end_seq;
+ unsigned iter;
+ int ret = 0;
+
+ closure_init_stack(&jlist.cl);
+ mutex_init(&jlist.lock);
+ mutex_init(&jlist.cache_set_buffer_lock);
+ jlist.head = list;
+ jlist.ret = 0;
+
+ for_each_cache(ca, c, iter)
+ closure_call(&ca->journal.read,
+ bch_journal_read_device,
+ system_unbound_wq,
+ &jlist.cl);
+
+ closure_sync(&jlist.cl);
+
+ if (jlist.ret)
+ return jlist.ret;
+
+ if (list_empty(list)) {
+ bch_err(c, "no journal entries found");
+ return BCH_FSCK_REPAIR_IMPOSSIBLE;
+ }
+
+ j = &list_entry(list->prev, struct journal_replay, list)->j;
+
+ unfixable_fsck_err_on(le64_to_cpu(j->seq) -
+ le64_to_cpu(j->last_seq) + 1 >
+ c->journal.pin.size, c,
+ "too many journal entries open for refcount fifo");
+
+ c->journal.pin.back = le64_to_cpu(j->seq) -
+ le64_to_cpu(j->last_seq) + 1;
+
+ atomic64_set(&c->journal.seq, le64_to_cpu(j->seq));
+ c->journal.last_seq_ondisk = le64_to_cpu(j->last_seq);
+
+ BUG_ON(last_seq(&c->journal) != le64_to_cpu(j->last_seq));
+
+ i = list_first_entry(list, struct journal_replay, list);
+
+ mutex_lock(&c->journal.blacklist_lock);
+
+ fifo_for_each_entry_ptr(p, &c->journal.pin, iter) {
+ u64 seq = journal_pin_seq(&c->journal, p);
+
+ INIT_LIST_HEAD(&p->list);
+
+ if (i && le64_to_cpu(i->j.seq) == seq) {
+ atomic_set(&p->count, 1);
+
+ if (journal_seq_blacklist_read(&c->journal, i, p)) {
+ mutex_unlock(&c->journal.blacklist_lock);
+ return -ENOMEM;
+ }
+
+ i = list_is_last(&i->list, list)
+ ? NULL
+ : list_next_entry(i, list);
+ } else {
+ atomic_set(&p->count, 0);
+ }
+ }
+
+ mutex_unlock(&c->journal.blacklist_lock);
+
+ cur_seq = last_seq(&c->journal);
+ end_seq = le64_to_cpu(list_last_entry(list,
+ struct journal_replay, list)->j.seq);
+
+ list_for_each_entry(i, list, list) {
+ bool blacklisted;
+
+ mutex_lock(&c->journal.blacklist_lock);
+ while (cur_seq < le64_to_cpu(i->j.seq) &&
+ journal_seq_blacklist_find(&c->journal, cur_seq))
+ cur_seq++;
+
+ blacklisted = journal_seq_blacklist_find(&c->journal,
+ le64_to_cpu(i->j.seq));
+ mutex_unlock(&c->journal.blacklist_lock);
+
+ fsck_err_on(blacklisted, c,
+ "found blacklisted journal entry %llu",
+ le64_to_cpu(i->j.seq));
+
+ fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
+ "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ cur_seq, le64_to_cpu(i->j.seq) - 1,
+ last_seq(&c->journal), end_seq);
+
+ cur_seq = le64_to_cpu(i->j.seq) + 1;
+ }
+
+ prio_ptrs = bch_journal_find_entry(j, JOURNAL_ENTRY_PRIO_PTRS, 0);
+ if (prio_ptrs) {
+ memcpy_u64s(c->journal.prio_buckets,
+ prio_ptrs->_data,
+ le16_to_cpu(prio_ptrs->u64s));
+ c->journal.nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
+ }
+fsck_err:
+ return ret;
+}
+
+void bch_journal_mark(struct cache_set *c, struct list_head *list)
+{
+ struct bkey_i *k, *n;
+ struct jset_entry *j;
+ struct journal_replay *r;
+
+ list_for_each_entry(r, list, list)
+ for_each_jset_key(k, n, j, &r->j) {
+ enum bkey_type type = bkey_type(j->level, j->btree_id);
+ struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
+
+ if (btree_type_has_ptrs(type))
+ __bch_btree_mark_key(c, type, k_s_c);
+ }
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+ return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+void bch_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+
+ if (!need_write_just_set &&
+ test_bit(JOURNAL_NEED_WRITE, &j->flags))
+ __bch_time_stats_update(j->delay_time,
+ j->need_write_time);
+#if 0
+ closure_call(&j->io, journal_write, NULL, &c->cl);
+#else
+ /* Shut sparse up: */
+ closure_init(&j->io, &c->cl);
+ set_closure_fn(&j->io, journal_write, NULL);
+ journal_write(&j->io);
+#endif
+}
+
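+/*
+ * Advance to the next journal entry: bump j->seq, push a new pin list, and
+ * initialize the now current buffer:
+ */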
+static void __bch_journal_next_entry(struct journal *j)
+{
+ struct journal_entry_pin_list pin_list, *p;
+ struct journal_buf *buf;
+
+ /*
+ * The fifo_push() needs to happen at the same time as j->seq is
+ * incremented for last_seq() to be calculated correctly
+ */
+ atomic64_inc(&j->seq);
+ BUG_ON(!fifo_push(&j->pin, pin_list));
+ p = &fifo_peek_back(&j->pin);
+
+ INIT_LIST_HEAD(&p->list);
+ atomic_set(&p->count, 1);
+
+ if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) {
+ smp_wmb();
+ j->cur_pin_list = p;
+ }
+
+ buf = journal_cur_buf(j);
+ memset(buf->has_inode, 0, sizeof(buf->has_inode));
+
+ memset(buf->data, 0, sizeof(*buf->data));
+ buf->data->seq = cpu_to_le64(atomic64_read(&j->seq));
+ buf->data->u64s = 0;
+
+ BUG_ON(journal_pin_seq(j, p) != atomic64_read(&j->seq));
+}
+
+static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
+{
+ unsigned ret = BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
+
+ if (buf->nr_prio_buckets)
+ ret += JSET_KEYS_U64s + buf->nr_prio_buckets;
+
+ return ret;
+}
+
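+/*
+ * Close the currently open journal entry and switch to the other buffer so it
+ * can be written out; returns JOURNAL_UNLOCKED (with j->lock dropped) if we
+ * switched, otherwise a code indicating why we couldn't:
+ */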
+static enum {
+ JOURNAL_ENTRY_ERROR,
+ JOURNAL_ENTRY_INUSE,
+ JOURNAL_ENTRY_CLOSED,
+ JOURNAL_UNLOCKED,
+} journal_buf_switch(struct journal *j, bool need_write_just_set)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct journal_buf *buf;
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ do {
+ old.v = new.v = v;
+ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
+ return JOURNAL_ENTRY_CLOSED;
+
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+ return JOURNAL_ENTRY_ERROR;
+
+ if (new.prev_buf_unwritten)
+ return JOURNAL_ENTRY_INUSE;
+
+ /*
+ * avoid race between setting buf->data->u64s and
+ * journal_res_put starting write:
+ */
+ journal_state_inc(&new);
+
+ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
+ new.idx++;
+ new.prev_buf_unwritten = 1;
+
+ BUG_ON(journal_state_count(new, new.idx));
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ journal_reclaim_fast(j);
+
+ clear_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+ buf = &j->buf[old.idx];
+ buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ buf->data->last_seq = cpu_to_le64(last_seq(j));
+
+ j->prev_buf_sectors =
+ __set_blocks(buf->data,
+ le32_to_cpu(buf->data->u64s) +
+ journal_entry_u64s_reserve(buf),
+ block_bytes(c)) * c->sb.block_size;
+
+ BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
+
+ atomic_dec_bug(&fifo_peek_back(&j->pin).count);
+ __bch_journal_next_entry(j);
+
+ cancel_delayed_work(&j->write_work);
+ spin_unlock(&j->lock);
+
+ if (c->bucket_journal_seq > 1 << 14) {
+ c->bucket_journal_seq = 0;
+ bch_bucket_seq_cleanup(c);
+ }
+
+ /* ugh - might be called from __journal_res_get() under wait_event() */
+ __set_current_state(TASK_RUNNING);
+ bch_journal_buf_put(j, old.idx, need_write_just_set);
+
+ return JOURNAL_UNLOCKED;
+}
+
+void bch_journal_halt(struct journal *j)
+{
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ do {
+ old.v = new.v = v;
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+ return;
+
+ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ wake_up(&j->wait);
+ closure_wake_up(&journal_cur_buf(j)->wait);
+ closure_wake_up(&journal_prev_buf(j)->wait);
+}
+
+static unsigned journal_dev_buckets_available(struct journal *j,
+ struct cache *ca)
+{
+ struct journal_device *ja = &ca->journal;
+ unsigned nr = bch_nr_journal_buckets(ca->disk_sb.sb);
+ unsigned next = (ja->cur_idx + 1) % nr;
+ unsigned available = (ja->last_idx + nr - next) % nr;
+
+ /*
+ * Hack to avoid a deadlock during journal replay:
+ * journal replay might require setting a new btree
+ * root, which requires writing another journal entry -
+ * thus, if the journal is full (and this happens when
+ * replaying the first journal bucket's entries) we're
+ * screwed.
+ *
+ * So don't let the journal fill up unless we're in
+ * replay:
+ */
+ if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
+ available = max((int) available - 2, 0);
+
+ /*
+ * Don't use the last bucket unless writing the new last_seq
+ * will make another bucket available:
+ */
+ if (ja->bucket_seq[ja->last_idx] >= last_seq(j))
+ available = max((int) available - 1, 0);
+
+ return available;
+}
+
+/* returns number of sectors available for next journal entry: */
+static int journal_entry_sectors(struct journal *j)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct cache *ca;
+ struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
+ unsigned sectors_available = j->entry_size_max >> 9;
+ unsigned i, nr_online = 0, nr_devs = 0;
+
+ lockdep_assert_held(&j->lock);
+
+ rcu_read_lock();
+ group_for_each_cache_rcu(ca, &j->devs, i) {
+ unsigned buckets_required = 0;
+
+ sectors_available = min_t(unsigned, sectors_available,
+ ca->mi.bucket_size);
+
+ /*
+ * Note that we don't allocate the space for a journal entry
+ * until we write it out - thus, if we haven't started the write
+ * for the previous entry we have to make sure we have space for
+ * it too:
+ */
+ if (bch_extent_has_device(e.c, ca->sb.nr_this_dev)) {
+ if (j->prev_buf_sectors > ca->journal.sectors_free)
+ buckets_required++;
+
+ if (j->prev_buf_sectors + sectors_available >
+ ca->journal.sectors_free)
+ buckets_required++;
+ } else {
+ if (j->prev_buf_sectors + sectors_available >
+ ca->mi.bucket_size)
+ buckets_required++;
+
+ buckets_required++;
+ }
+
+ if (journal_dev_buckets_available(j, ca) >= buckets_required)
+ nr_devs++;
+ nr_online++;
+ }
+ rcu_read_unlock();
+
+ if (nr_online < c->opts.metadata_replicas)
+ return -EROFS;
+
+ if (nr_devs < c->opts.metadata_replicas)
+ return 0;
+
+ return sectors_available;
+}
+
+/*
+ * should _only_ be called from journal_res_get() - when we actually want a
+ * journal reservation - journal entry is open means journal is dirty:
+ */
+static int journal_entry_open(struct journal *j)
+{
+ struct journal_buf *buf = journal_cur_buf(j);
+ ssize_t u64s;
+ int ret = 0, sectors;
+
+ lockdep_assert_held(&j->lock);
+ BUG_ON(journal_entry_is_open(j));
+
+ if (!fifo_free(&j->pin))
+ return 0;
+
+ sectors = journal_entry_sectors(j);
+ if (sectors <= 0)
+ return sectors;
+
+ j->cur_buf_sectors = sectors;
+ buf->nr_prio_buckets = j->nr_prio_buckets;
+
+ u64s = (sectors << 9) / sizeof(u64);
+
+ /* Subtract the journal header */
+ u64s -= sizeof(struct jset) / sizeof(u64);
+ /*
+ * Btree roots, prio pointers don't get added until right before we do
+ * the write:
+ */
+ u64s -= journal_entry_u64s_reserve(buf);
+ u64s = max_t(ssize_t, 0L, u64s);
+
+ BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
+
+ if (u64s > le32_to_cpu(buf->data->u64s)) {
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ /*
+ * Must be set before marking the journal entry as open:
+ */
+ j->cur_entry_u64s = u64s;
+
+ do {
+ old.v = new.v = v;
+
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+ return false;
+
+ /* Handle any already added entries */
+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+ ret = 1;
+
+ wake_up(&j->wait);
+
+ if (j->res_get_blocked_start) {
+ __bch_time_stats_update(j->blocked_time,
+ j->res_get_blocked_start);
+ j->res_get_blocked_start = 0;
+ }
+
+ mod_delayed_work(system_freezable_wq,
+ &j->write_work,
+ msecs_to_jiffies(j->write_delay_ms));
+ }
+
+ return ret;
+}
+
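+/*
+ * Start the journal at mount time: skip over blacklisted sequence numbers,
+ * open the first journal entry, and add any not yet written blacklist entries
+ * to it:
+ */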
+void bch_journal_start(struct cache_set *c)
+{
+ struct journal *j = &c->journal;
+ struct journal_seq_blacklist *bl;
+ struct cache *ca;
+ u64 new_seq = 0;
+ unsigned i;
+
+ for_each_cache(ca, c, i)
+ if (is_journal_device(ca))
+ bch_cache_group_add_cache(&c->journal.devs, ca);
+
+ list_for_each_entry(bl, &j->seq_blacklist, list)
+ new_seq = max(new_seq, bl->seq);
+
+ spin_lock(&j->lock);
+
+ set_bit(JOURNAL_STARTED, &j->flags);
+
+ while (atomic64_read(&j->seq) < new_seq) {
+ struct journal_entry_pin_list pin_list, *p;
+
+ BUG_ON(!fifo_push(&j->pin, pin_list));
+ p = &fifo_peek_back(&j->pin);
+
+ INIT_LIST_HEAD(&p->list);
+ atomic_set(&p->count, 0);
+ atomic64_inc(&j->seq);
+ }
+
+ /*
+ * journal_buf_switch() only inits the next journal entry when it
+ * closes an open journal entry - the very first journal entry gets
+ * initialized here:
+ */
+ __bch_journal_next_entry(j);
+
+ /*
+ * Adding entries to the next journal entry before allocating space on
+ * disk for the next journal entry - this is ok, because these entries
+ * only have to go down with the next journal entry we write:
+ */
+ list_for_each_entry(bl, &j->seq_blacklist, list)
+ if (!bl->written) {
+ bch_journal_add_entry(journal_cur_buf(j), &bl->seq, 1,
+ JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED,
+ 0, 0);
+
+ journal_pin_add_entry(j,
+ &fifo_peek_back(&j->pin),
+ &bl->pin,
+ journal_seq_blacklist_flush);
+ bl->written = true;
+ }
+
+ spin_unlock(&j->lock);
+
+ queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+}
+
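+/*
+ * Replay keys from the journal into the btree, in the order they appear in
+ * the journal, dropping the corresponding journal pins as we go:
+ */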
+int bch_journal_replay(struct cache_set *c, struct list_head *list)
+{
+ int ret = 0, keys = 0, entries = 0;
+ struct journal *j = &c->journal;
+ struct bkey_i *k, *_n;
+ struct jset_entry *entry;
+ struct journal_replay *i, *n;
+
+ list_for_each_entry_safe(i, n, list, list) {
+ j->cur_pin_list =
+ &j->pin.data[((j->pin.back - 1 -
+ (atomic64_read(&j->seq) -
+ le64_to_cpu(i->j.seq))) &
+ j->pin.mask)];
+
+ for_each_jset_key(k, _n, entry, &i->j) {
+ struct disk_reservation disk_res;
+
+ /*
+ * We might cause compressed extents to be split, so we
+ * need to pass in a disk_reservation:
+ */
+ BUG_ON(bch_disk_reservation_get(c, &disk_res, 0, 0));
+
+ trace_bcache_journal_replay_key(&k->k);
+
+ ret = bch_btree_insert(c, entry->btree_id, k,
+ &disk_res, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_REPLAY);
+ bch_disk_reservation_put(c, &disk_res);
+
+ if (ret)
+ goto err;
+
+ cond_resched();
+ keys++;
+ }
+
+ if (atomic_dec_and_test(&j->cur_pin_list->count))
+ wake_up(&j->wait);
+
+ entries++;
+ }
+
+ bch_info(c, "journal replay done, %i keys in %i entries, seq %llu",
+ keys, entries, (u64) atomic64_read(&j->seq));
+
+ fsck_err_on(c->sb.clean && keys, c,
+ "filesystem marked clean, but journal had keys to replay");
+
+ bch_journal_set_replay_done(&c->journal);
+err:
+ if (ret)
+ bch_err(c, "journal replay error: %d", ret);
+fsck_err:
+ bch_journal_entries_free(list);
+
+ return ret;
+}
+
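+/*
+ * Resize the superblock's journal bucket list and the in-memory bucket_seq
+ * array to hold @nr buckets:
+ */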
+static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
+{
+ unsigned u64s = bch_journal_buckets_offset(ca->disk_sb.sb) + nr;
+ u64 *p;
+ int ret;
+
+ ret = bch_super_realloc(&ca->disk_sb, u64s);
+ if (ret)
+ return ret;
+
+ p = krealloc(ca->journal.bucket_seq,
+ nr * sizeof(u64),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!p)
+ return -ENOMEM;
+
+ ca->journal.bucket_seq = p;
+ ca->disk_sb.sb->u64s = cpu_to_le16(u64s);
+
+ return 0;
+}
+
+int bch_cache_journal_alloc(struct cache *ca)
+{
+ int ret;
+ unsigned i;
+
+ if (ca->mi.tier != 0)
+ return 0;
+
+ if (dynamic_fault("bcache:add:journal_alloc"))
+ return -ENOMEM;
+
+ /*
+ * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
+ * is smaller:
+ */
+ ret = bch_set_nr_journal_buckets(ca,
+ clamp_t(unsigned, ca->mi.nbuckets >> 8,
+ BCH_JOURNAL_BUCKETS_MIN,
+ min(1 << 10,
+ (1 << 20) / ca->mi.bucket_size)));
+ if (ret)
+ return ret;
+
+ for (i = 0; i < bch_nr_journal_buckets(ca->disk_sb.sb); i++) {
+ unsigned long r = ca->mi.first_bucket + i;
+
+ bch_mark_metadata_bucket(ca, &ca->buckets[r], true);
+ set_journal_bucket(ca->disk_sb.sb, i, r);
+ }
+
+ return 0;
+}
+
+/* Journalling */
+
+/**
+ * journal_reclaim_fast - do the fast part of journal reclaim
+ *
+ * Called from IO submission context, does not block. Pops journal pin fifo
+ * entries whose reference counts have dropped to zero - i.e. journal entries
+ * all of whose btree nodes have been written out - and wakes up waiters.
+ */
+static void journal_reclaim_fast(struct journal *j)
+{
+ struct journal_entry_pin_list temp;
+ bool popped = false;
+
+ lockdep_assert_held(&j->lock);
+
+ /*
+ * Unpin journal entries whose reference counts reached zero, meaning
+ * all btree nodes got written out
+ */
+ while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+ BUG_ON(!fifo_pop(&j->pin, temp));
+ popped = true;
+ }
+
+ if (popped)
+ wake_up(&j->wait);
+}
+
+/*
+ * Journal entry pinning - machinery for holding a reference on a given journal
+ * entry, marking it as dirty:
+ */
+
+static inline void __journal_pin_add(struct journal *j,
+ struct journal_entry_pin_list *pin_list,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ BUG_ON(journal_pin_active(pin));
+
+ atomic_inc(&pin_list->count);
+ pin->pin_list = pin_list;
+ pin->flush = flush_fn;
+
+ if (flush_fn)
+ list_add(&pin->list, &pin_list->list);
+ else
+ INIT_LIST_HEAD(&pin->list);
+}
+
+static void journal_pin_add_entry(struct journal *j,
+ struct journal_entry_pin_list *pin_list,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock_irq(&j->pin_lock);
+ __journal_pin_add(j, pin_list, pin, flush_fn);
+ spin_unlock_irq(&j->pin_lock);
+}
+
+void bch_journal_pin_add(struct journal *j,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock_irq(&j->pin_lock);
+ __journal_pin_add(j, j->cur_pin_list, pin, flush_fn);
+ spin_unlock_irq(&j->pin_lock);
+}
+
+static inline bool __journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ struct journal_entry_pin_list *pin_list = pin->pin_list;
+
+ pin->pin_list = NULL;
+
+ /* journal_reclaim_work() might have already taken us off the list */
+ if (!list_empty_careful(&pin->list))
+ list_del_init(&pin->list);
+
+ return atomic_dec_and_test(&pin_list->count);
+}
+
+void bch_journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ unsigned long flags;
+ bool wakeup;
+
+ if (!journal_pin_active(pin))
+ return;
+
+ spin_lock_irqsave(&j->pin_lock, flags);
+ wakeup = __journal_pin_drop(j, pin);
+ spin_unlock_irqrestore(&j->pin_lock, flags);
+
+ /*
+ * Unpinning a journal entry may make journal_next_bucket() succeed, if
+ * writing a new last_seq will now make another bucket available:
+ *
+ * Nested irqsave is expensive, don't do the wakeup with lock held:
+ */
+ if (wakeup)
+ wake_up(&j->wait);
+}
+
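+/*
+ * Move @pin to the journal entry that @src_pin pins, if that entry is older
+ * than the one @pin currently pins (or if @pin isn't active):
+ */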
+void bch_journal_pin_add_if_older(struct journal *j,
+ struct journal_entry_pin *src_pin,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock_irq(&j->pin_lock);
+
+ if (journal_pin_active(src_pin) &&
+ (!journal_pin_active(pin) ||
+ fifo_entry_idx(&j->pin, src_pin->pin_list) <
+ fifo_entry_idx(&j->pin, pin->pin_list))) {
+ if (journal_pin_active(pin))
+ __journal_pin_drop(j, pin);
+ __journal_pin_add(j, src_pin->pin_list,
+ pin, NULL);
+ }
+
+ spin_unlock_irq(&j->pin_lock);
+}
+
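+/*
+ * Return the next journal pin that needs flushing, for journal entries up to
+ * and including @seq_to_flush - or NULL if there's nothing left to flush:
+ */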
+static struct journal_entry_pin *
+journal_get_next_pin(struct journal *j, u64 seq_to_flush)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *ret = NULL;
+ unsigned iter;
+
+ /* so we don't iterate over empty fifo entries below: */
+ if (!atomic_read(&fifo_peek_front(&j->pin).count)) {
+ spin_lock(&j->lock);
+ journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
+ }
+
+ spin_lock_irq(&j->pin_lock);
+ fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
+ if (journal_pin_seq(j, pin_list) > seq_to_flush)
+ break;
+
+ ret = list_first_entry_or_null(&pin_list->list,
+ struct journal_entry_pin, list);
+ if (ret) {
+ /* must be list_del_init(), see bch_journal_pin_drop() */
+ list_del_init(&ret->list);
+ break;
+ }
+ }
+ spin_unlock_irq(&j->pin_lock);
+
+ return ret;
+}
+
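+/*
+ * True if the oldest journal bucket on @ja only contains entries that have
+ * already been written out (and can thus be discarded and reused):
+ */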
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = (ja->last_idx != ja->cur_idx &&
+ ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/**
+ * journal_reclaim_work - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+static void journal_reclaim_work(struct work_struct *work)
+{
+ struct cache_set *c = container_of(to_delayed_work(work),
+ struct cache_set, journal.reclaim_work);
+ struct journal *j = &c->journal;
+ struct cache *ca;
+ struct journal_entry_pin *pin;
+ u64 seq_to_flush = 0;
+ unsigned iter, nr, bucket_to_flush;
+ unsigned long next_flush;
+ bool reclaim_lock_held = false, need_flush;
+
+ /*
+ * Advance last_idx to point to the oldest journal entry containing
+ * btree node updates that have not yet been written out
+ */
+ group_for_each_cache(ca, &j->devs, iter) {
+ struct journal_device *ja = &ca->journal;
+
+ while (should_discard_bucket(j, ja)) {
+ if (!reclaim_lock_held) {
+ /*
+ * ugh:
+ * might be called from __journal_res_get()
+ * under wait_event() - have to go back to
+ * TASK_RUNNING before doing something that
+ * would block, but only if we're doing work:
+ */
+ __set_current_state(TASK_RUNNING);
+
+ mutex_lock(&j->reclaim_lock);
+ reclaim_lock_held = true;
+ /* recheck under reclaim_lock: */
+ continue;
+ }
+
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca,
+ journal_bucket(ca->disk_sb.sb,
+ ja->last_idx)),
+ ca->mi.bucket_size, GFP_NOIO, 0);
+
+ spin_lock(&j->lock);
+ ja->last_idx = (ja->last_idx + 1) %
+ bch_nr_journal_buckets(ca->disk_sb.sb);
+ spin_unlock(&j->lock);
+
+ wake_up(&j->wait);
+ }
+
+ /*
+ * Write out enough btree nodes to free up 50% journal
+ * buckets
+ */
+ spin_lock(&j->lock);
+ nr = bch_nr_journal_buckets(ca->disk_sb.sb);
+ bucket_to_flush = (ja->cur_idx + (nr >> 1)) % nr;
+ seq_to_flush = max_t(u64, seq_to_flush,
+ ja->bucket_seq[bucket_to_flush]);
+ spin_unlock(&j->lock);
+ }
+
+ if (reclaim_lock_held)
+ mutex_unlock(&j->reclaim_lock);
+
+ /* Also flush if the pin fifo is more than half full */
+ seq_to_flush = max_t(s64, seq_to_flush,
+ (s64) atomic64_read(&j->seq) -
+ (j->pin.size >> 1));
+
+ /*
+ * If it's been longer than j->reclaim_delay_ms since we last flushed,
+ * make sure to flush at least one journal pin:
+ */
+ next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
+ need_flush = time_after(jiffies, next_flush);
+
+ while ((pin = journal_get_next_pin(j, need_flush
+ ? U64_MAX
+ : seq_to_flush))) {
+ __set_current_state(TASK_RUNNING);
+ pin->flush(j, pin);
+ need_flush = false;
+
+ j->last_flushed = jiffies;
+ }
+
+ if (!test_bit(CACHE_SET_RO, &c->flags))
+ queue_delayed_work(system_freezable_wq, &j->reclaim_work,
+ msecs_to_jiffies(j->reclaim_delay_ms));
+}
+
+/**
+ * journal_write_alloc - move on to the next journal bucket if possible
+ */
+static int journal_write_alloc(struct journal *j, unsigned sectors)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
+ struct bch_extent_ptr *ptr;
+ struct cache *ca;
+ unsigned iter, replicas, replicas_want =
+ READ_ONCE(c->opts.metadata_replicas);
+
+ spin_lock(&j->lock);
+ rcu_read_lock();
+
+ /*
+ * Drop any pointers to devices that have been removed, are no longer
+ * empty, or filled up their current journal bucket:
+ *
+ * Note that a device may have had a small amount of free space (perhaps
+ * one sector) that wasn't enough for the smallest possible journal
+ * entry - that's why we drop pointers to devices <= current free space,
+ * i.e. whichever device was limiting the current journal entry size.
+ */
+ extent_for_each_ptr_backwards(e, ptr)
+ if (!(ca = PTR_CACHE(c, ptr)) ||
+ ca->mi.state != CACHE_ACTIVE ||
+ ca->journal.sectors_free <= sectors)
+ __bch_extent_drop_ptr(e, ptr);
+ else
+ ca->journal.sectors_free -= sectors;
+
+ replicas = bch_extent_nr_ptrs(e.c);
+
+ /*
+ * Determine location of the next journal write:
+ * XXX: sort caches by free journal space
+ */
+ group_for_each_cache_rcu(ca, &j->devs, iter) {
+ struct journal_device *ja = &ca->journal;
+ unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
+
+ if (replicas >= replicas_want)
+ break;
+
+ /*
+ * Check that we can use this device, and aren't already using
+ * it:
+ */
+ if (bch_extent_has_device(e.c, ca->sb.nr_this_dev) ||
+ !journal_dev_buckets_available(j, ca) ||
+ sectors > ca->mi.bucket_size)
+ continue;
+
+ ja->sectors_free = ca->mi.bucket_size - sectors;
+ ja->cur_idx = (ja->cur_idx + 1) % nr_buckets;
+ ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq);
+
+ extent_ptr_append(bkey_i_to_extent(&j->key),
+ (struct bch_extent_ptr) {
+ .offset = bucket_to_sector(ca,
+ journal_bucket(ca->disk_sb.sb,
+ ja->cur_idx)),
+ .dev = ca->sb.nr_this_dev,
+ });
+ replicas++;
+
+ trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
+ }
+
+ rcu_read_unlock();
+
+ j->prev_buf_sectors = 0;
+ spin_unlock(&j->lock);
+
+ if (replicas < replicas_want)
+ return -EROFS;
+
+ return 0;
+}
+
+static void journal_write_compact(struct jset *jset)
+{
+ struct jset_entry *i, *next, *prev = NULL;
+
+ /*
+ * Simple compaction, dropping empty jset_entries (from journal
+ * reservations that weren't fully used) and merging jset_entries that
+ * can be.
+ *
+ * If we wanted to be really fancy here, we could sort all the keys in
+ * the jset and drop keys that were overwritten - probably not worth it:
+ */
+ for (i = jset->start;
+ i < (struct jset_entry *) bkey_idx(jset, le32_to_cpu(jset->u64s)) &&
+ (next = jset_keys_next(i), true);
+ i = next) {
+ unsigned u64s = le16_to_cpu(i->u64s);
+
+ /* Empty entry: */
+ if (!u64s)
+ continue;
+
+ /* Can we merge with previous entry? */
+ if (prev &&
+ i->btree_id == prev->btree_id &&
+ i->level == prev->level &&
+ JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) &&
+ JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS &&
+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
+ memmove_u64s_down(jset_keys_next(prev),
+ i->_data,
+ u64s);
+ le16_add_cpu(&prev->u64s, u64s);
+ continue;
+ }
+
+ /* Couldn't merge, move i into new position (after prev): */
+ prev = prev ? jset_keys_next(prev) : jset->start;
+ if (i != prev)
+ memmove_u64s_down(prev, i, jset_u64s(u64s));
+ }
+
+ prev = prev ? jset_keys_next(prev) : jset->start;
+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+}
+
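+/* Per device journal write completion; a fatal IO error shuts the journal down: */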
+static void journal_write_endio(struct bio *bio)
+{
+ struct cache *ca = bio->bi_private;
+ struct journal *j = &ca->set->journal;
+
+ if (cache_fatal_io_err_on(bio->bi_error, ca, "journal write") ||
+ bch_meta_write_fault("journal"))
+ bch_journal_halt(j);
+
+ closure_put(&j->io);
+ percpu_ref_put(&ca->ref);
+}
+
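+/*
+ * Run when the journal write has completed on all devices: records
+ * last_seq_ondisk, clears prev_buf_unwritten and wakes up waiters:
+ */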
+static void journal_write_done(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct journal_buf *w = journal_prev_buf(j);
+
+ j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
+
+ __bch_time_stats_update(j->write_time, j->write_start_time);
+
+ BUG_ON(!j->reservations.prev_buf_unwritten);
+ atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
+ &j->reservations.counter);
+
+ /*
+ * XXX: this is racy, we could technically end up doing the wake up
+ * after the journal_buf struct has been reused for the next write
+ * (because we're clearing JOURNAL_IO_IN_FLIGHT) and wake up things that
+ * are waiting on the _next_ write, not this one.
+ *
+ * The wake up can't come before, because journal_flush_seq_async() is
+ * looking at JOURNAL_IO_IN_FLIGHT when it has to wait on a journal
+ * write that was already in flight.
+ *
+ * The right fix is to use a lock here, but using j.lock here means it
+ * has to be a spin_lock_irqsave() lock which then requires propagating
+ * the irq()ness to other locks and it's all kinds of nastiness.
+ */
+
+ closure_wake_up(&w->wait);
+ wake_up(&j->wait);
+
+ /*
+ * Updating last_seq_ondisk may let journal_reclaim_work() discard more
+ * buckets:
+ */
+ mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+}
+
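+/*
+ * Assemble the previous journal buffer (prio pointers, btree roots, checksum),
+ * pick which devices to write it to and submit the writes, plus flush-only
+ * writes to the remaining devices:
+ */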
+static void journal_write(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct cache *ca;
+ struct journal_buf *w = journal_prev_buf(j);
+ struct bio *bio;
+ struct bch_extent_ptr *ptr;
+ unsigned i, sectors, bytes;
+
+ j->write_start_time = local_clock();
+
+ bch_journal_add_prios(j, w);
+
+ mutex_lock(&c->btree_root_lock);
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_root *r = &c->btree_roots[i];
+
+ if (r->alive)
+ bch_journal_add_btree_root(w, i, &r->key, r->level);
+ }
+ mutex_unlock(&c->btree_root_lock);
+
+ journal_write_compact(w->data);
+
+ w->data->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
+ w->data->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
+ w->data->magic = cpu_to_le64(jset_magic(&c->disk_sb));
+ w->data->version = cpu_to_le32(BCACHE_JSET_VERSION);
+
+ SET_JSET_BIG_ENDIAN(w->data, CPU_BIG_ENDIAN);
+ SET_JSET_CSUM_TYPE(w->data, c->opts.metadata_checksum);
+ w->data->csum = cpu_to_le64(__csum_set(w->data,
+ le32_to_cpu(w->data->u64s),
+ JSET_CSUM_TYPE(w->data)));
+
+ sectors = __set_blocks(w->data, le32_to_cpu(w->data->u64s),
+ block_bytes(c)) * c->sb.block_size;
+ BUG_ON(sectors > j->prev_buf_sectors);
+
+ bytes = __set_bytes(w->data, le32_to_cpu(w->data->u64s));
+ memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
+
+ if (journal_write_alloc(j, sectors)) {
+ bch_journal_halt(j);
+ bch_err(c, "Unable to allocate journal write");
+ bch_fatal_error(c);
+ closure_return_with_destructor(cl, journal_write_done);
+ }
+
+ bch_check_mark_super(c, &j->key, true);
+
+ extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
+ rcu_read_lock();
+ ca = PTR_CACHE(c, ptr);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ if (!ca) {
+ /* XXX: fix this */
+ bch_err(c, "missing device for journal write\n");
+ continue;
+ }
+
+ atomic64_add(sectors, &ca->meta_sectors_written);
+
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio->bi_iter.bi_sector = ptr->offset;
+ bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_iter.bi_size = sectors << 9;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ bio_set_op_attrs(bio, REQ_OP_WRITE,
+ REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+ bch_bio_map(bio, w->data);
+
+ trace_bcache_journal_write(bio);
+ closure_bio_submit_punt(bio, cl, c);
+
+ ptr->offset += sectors;
+
+ ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
+ }
+
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == CACHE_ACTIVE &&
+ journal_flushes_device(ca) &&
+ !bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
+ percpu_ref_get(&ca->ref);
+
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+ closure_bio_submit_punt(bio, cl, c);
+ }
+
+ closure_return_with_destructor(cl, journal_write_done);
+}
+
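+/*
+ * Delayed work scheduled when a journal entry is opened: closes and writes out
+ * the current journal entry if it's still open after j->write_delay_ms:
+ */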
+static void journal_write_work(struct work_struct *work)
+{
+ struct journal *j = container_of(to_delayed_work(work),
+ struct journal, write_work);
+ spin_lock(&j->lock);
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+ if (journal_buf_switch(j, false) != JOURNAL_UNLOCKED)
+ spin_unlock(&j->lock);
+}
+
+/*
+ * Given an inode number, if that inode number has data in the journal that
+ * hasn't yet been flushed, return the journal sequence number that needs to be
+ * flushed:
+ */
+u64 bch_inode_journal_seq(struct journal *j, u64 inode)
+{
+ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
+ u64 seq = 0;
+
+ if (!test_bit(h, j->buf[0].has_inode) &&
+ !test_bit(h, j->buf[1].has_inode))
+ return 0;
+
+ spin_lock(&j->lock);
+ if (test_bit(h, journal_cur_buf(j)->has_inode))
+ seq = atomic64_read(&j->seq);
+ else if (test_bit(h, journal_prev_buf(j)->has_inode))
+ seq = atomic64_read(&j->seq) - 1;
+ spin_unlock(&j->lock);
+
+ return seq;
+}
+
+static int __journal_res_get(struct journal *j, struct journal_res *res,
+ unsigned u64s_min, unsigned u64s_max)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ int ret;
+retry:
+ ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
+ if (ret)
+ return ret;
+
+ spin_lock(&j->lock);
+ /*
+ * Recheck after taking the lock, so we don't race with another thread
+ * that just did journal_entry_open() and call journal_entry_close()
+ * unnecessarily
+ */
+ ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
+ if (ret) {
+ spin_unlock(&j->lock);
+ return 1;
+ }
+
+ /*
+ * Ok, no more room in the current journal entry - try to start a new
+ * one:
+ */
+ switch (journal_buf_switch(j, false)) {
+ case JOURNAL_ENTRY_ERROR:
+ spin_unlock(&j->lock);
+ return -EIO;
+ case JOURNAL_ENTRY_INUSE:
+ /* haven't finished writing out the previous one: */
+ spin_unlock(&j->lock);
+ trace_bcache_journal_entry_full(c);
+ goto blocked;
+ case JOURNAL_ENTRY_CLOSED:
+ break;
+ case JOURNAL_UNLOCKED:
+ goto retry;
+ }
+
+ /* We now have a new, closed journal buf - see if we can open it: */
+ ret = journal_entry_open(j);
+ spin_unlock(&j->lock);
+
+ if (ret < 0)
+ return ret;
+ if (ret)
+ goto retry;
+
+ /* Journal's full, we have to wait */
+
+ /*
+ * Direct reclaim - can't rely on reclaim from work item
+ * due to freezing..
+ */
+ journal_reclaim_work(&j->reclaim_work.work);
+
+ trace_bcache_journal_full(c);
+blocked:
+ if (!j->res_get_blocked_start)
+ j->res_get_blocked_start = local_clock() ?: 1;
+ return 0;
+}
+
+/*
+ * Essentially the entry function to the journaling code. When bcache is doing
+ * a btree insert, it calls this function to get the current journal write.
+ * The journal write is the structure used to set up journal writes. The calling
+ * function will then add its keys to the structure, queuing them for the
+ * next write.
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks.
+ */
+int bch_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
+ unsigned u64s_min, unsigned u64s_max)
+{
+ int ret;
+
+ wait_event(j->wait,
+ (ret = __journal_res_get(j, res, u64s_min,
+ u64s_max)));
+ return ret < 0 ? ret : 0;
+}
+
+void bch_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent)
+{
+ spin_lock(&j->lock);
+
+ BUG_ON(seq > atomic64_read(&j->seq));
+
+ if (bch_journal_error(j)) {
+ spin_unlock(&j->lock);
+ return;
+ }
+
+ if (seq == atomic64_read(&j->seq)) {
+ if (!closure_wait(&journal_cur_buf(j)->wait, parent))
+ BUG();
+ } else if (seq + 1 == atomic64_read(&j->seq) &&
+ j->reservations.prev_buf_unwritten) {
+ if (!closure_wait(&journal_prev_buf(j)->wait, parent))
+ BUG();
+
+ smp_mb();
+
+ /* check if raced with write completion (or failure) */
+ if (!j->reservations.prev_buf_unwritten ||
+ bch_journal_error(j))
+ closure_wake_up(&journal_prev_buf(j)->wait);
+ }
+
+ spin_unlock(&j->lock);
+}
+
+void bch_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent)
+{
+ spin_lock(&j->lock);
+
+ BUG_ON(seq > atomic64_read(&j->seq));
+
+ if (bch_journal_error(j)) {
+ spin_unlock(&j->lock);
+ return;
+ }
+
+ if (seq == atomic64_read(&j->seq)) {
+ bool set_need_write = false;
+
+ if (parent &&
+ !closure_wait(&journal_cur_buf(j)->wait, parent))
+ BUG();
+
+ if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
+ j->need_write_time = local_clock();
+ set_need_write = true;
+ }
+
+ switch (journal_buf_switch(j, set_need_write)) {
+ case JOURNAL_ENTRY_ERROR:
+ if (parent)
+ closure_wake_up(&journal_cur_buf(j)->wait);
+ break;
+ case JOURNAL_ENTRY_CLOSED:
+ /*
+ * Journal entry hasn't been opened yet, but caller
+ * claims it has something (seq == j->seq):
+ */
+ BUG();
+ case JOURNAL_ENTRY_INUSE:
+ break;
+ case JOURNAL_UNLOCKED:
+ return;
+ }
+ } else if (parent &&
+ seq + 1 == atomic64_read(&j->seq) &&
+ j->reservations.prev_buf_unwritten) {
+ if (!closure_wait(&journal_prev_buf(j)->wait, parent))
+ BUG();
+
+ smp_mb();
+
+ /* check if raced with write completion (or failure) */
+ if (!j->reservations.prev_buf_unwritten ||
+ bch_journal_error(j))
+ closure_wake_up(&journal_prev_buf(j)->wait);
+ }
+
+ spin_unlock(&j->lock);
+}
+
+int bch_journal_flush_seq(struct journal *j, u64 seq)
+{
+ struct closure cl;
+ u64 start_time = local_clock();
+
+ closure_init_stack(&cl);
+ bch_journal_flush_seq_async(j, seq, &cl);
+ closure_sync(&cl);
+
+ bch_time_stats_update(j->flush_seq_time, start_time);
+
+ return bch_journal_error(j);
+}
+
+void bch_journal_meta_async(struct journal *j, struct closure *parent)
+{
+ struct journal_res res;
+ unsigned u64s = jset_u64s(0);
+
+ memset(&res, 0, sizeof(res));
+
+ bch_journal_res_get(j, &res, u64s, u64s);
+ bch_journal_res_put(j, &res);
+
+ bch_journal_flush_seq_async(j, res.seq, parent);
+}
+
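+/*
+ * Force a (possibly empty) journal entry to be written out and wait for the
+ * write to complete; used, for example, by bch_journal_move() below to get new
+ * journal writes onto devices other than the one being evacuated.
+ */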
+int bch_journal_meta(struct journal *j)
+{
+ struct journal_res res;
+ unsigned u64s = jset_u64s(0);
+ int ret;
+
+ memset(&res, 0, sizeof(res));
+
+ ret = bch_journal_res_get(j, &res, u64s, u64s);
+ if (ret)
+ return ret;
+
+ bch_journal_res_put(j, &res);
+
+ return bch_journal_flush_seq(j, res.seq);
+}
+
+void bch_journal_flush_async(struct journal *j, struct closure *parent)
+{
+ u64 seq, journal_seq;
+
+ spin_lock(&j->lock);
+ journal_seq = atomic64_read(&j->seq);
+
+ if (journal_entry_is_open(j)) {
+ seq = journal_seq;
+ } else if (journal_seq) {
+ seq = journal_seq - 1;
+ } else {
+ spin_unlock(&j->lock);
+ return;
+ }
+ spin_unlock(&j->lock);
+
+ bch_journal_flush_seq_async(j, seq, parent);
+}
+
+int bch_journal_flush(struct journal *j)
+{
+ u64 seq, journal_seq;
+
+ spin_lock(&j->lock);
+ journal_seq = atomic64_read(&j->seq);
+
+ if (journal_entry_is_open(j)) {
+ seq = journal_seq;
+ } else if (journal_seq) {
+ seq = journal_seq - 1;
+ } else {
+ spin_unlock(&j->lock);
+ return 0;
+ }
+ spin_unlock(&j->lock);
+
+ return bch_journal_flush_seq(j, seq);
+}
+
+void bch_journal_free(struct journal *j)
+{
+ unsigned order = get_order(j->entry_size_max);
+
+ free_pages((unsigned long) j->buf[1].data, order);
+ free_pages((unsigned long) j->buf[0].data, order);
+ free_fifo(&j->pin);
+}
+
+int bch_journal_alloc(struct journal *j, unsigned entry_size_max)
+{
+ static struct lock_class_key res_key;
+ unsigned order = get_order(entry_size_max);
+
+ spin_lock_init(&j->lock);
+ spin_lock_init(&j->pin_lock);
+ init_waitqueue_head(&j->wait);
+ INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+ INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
+ mutex_init(&j->blacklist_lock);
+ INIT_LIST_HEAD(&j->seq_blacklist);
+ spin_lock_init(&j->devs.lock);
+ mutex_init(&j->reclaim_lock);
+
+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+ j->entry_size_max = entry_size_max;
+ j->write_delay_ms = 100;
+ j->reclaim_delay_ms = 100;
+
+ bkey_extent_init(&j->key);
+
+ atomic64_set(&j->reservations.counter,
+ ((union journal_res_state)
+ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
+ !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
+ return -ENOMEM;
+
+ return 0;
+}
+
+ssize_t bch_journal_print_debug(struct journal *j, char *buf)
+{
+ union journal_res_state *s = &j->reservations;
+ struct cache *ca;
+ unsigned iter;
+ ssize_t ret = 0;
+
+ rcu_read_lock();
+ spin_lock(&j->lock);
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "active journal entries:\t%zu\n"
+ "seq:\t\t\t%llu\n"
+ "last_seq:\t\t%llu\n"
+ "last_seq_ondisk:\t%llu\n"
+ "reservation count:\t%u\n"
+ "reservation offset:\t%u\n"
+ "current entry u64s:\t%u\n"
+ "io in flight:\t\t%i\n"
+ "need write:\t\t%i\n"
+ "dirty:\t\t\t%i\n"
+ "replay done:\t\t%i\n",
+ fifo_used(&j->pin),
+ (u64) atomic64_read(&j->seq),
+ last_seq(j),
+ j->last_seq_ondisk,
+ journal_state_count(*s, s->idx),
+ s->cur_entry_offset,
+ j->cur_entry_u64s,
+ s->prev_buf_unwritten,
+ test_bit(JOURNAL_NEED_WRITE, &j->flags),
+ journal_entry_is_open(j),
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags));
+
+ group_for_each_cache_rcu(ca, &j->devs, iter) {
+ struct journal_device *ja = &ca->journal;
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "dev %u:\n"
+ "\tnr\t\t%u\n"
+ "\tcur_idx\t\t%u (seq %llu)\n"
+ "\tlast_idx\t%u (seq %llu)\n",
+ iter, bch_nr_journal_buckets(ca->disk_sb.sb),
+ ja->cur_idx, ja->bucket_seq[ja->cur_idx],
+ ja->last_idx, ja->bucket_seq[ja->last_idx]);
+ }
+
+ spin_unlock(&j->lock);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool bch_journal_writing_to_device(struct cache *ca)
+{
+ struct journal *j = &ca->set->journal;
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = bch_extent_has_device(bkey_i_to_s_c_extent(&j->key),
+ ca->sb.nr_this_dev);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/*
+ * This assumes that ca has already been marked read-only so that
+ * journal_next_bucket won't pick buckets out of ca any more.
+ * Hence, if the journal is not currently pointing to ca, there
+ * will be no new writes to journal entries in ca after all the
+ * pending ones have been flushed to disk.
+ *
+ * If the journal is being written to ca, write a new record, and
+ * journal_next_bucket will notice that the device is no longer
+ * writeable and pick a new set of devices to write to.
+ */
+
+int bch_journal_move(struct cache *ca)
+{
+ unsigned i, nr_buckets;
+ u64 last_flushed_seq;
+ struct cache_set *c = ca->set;
+ struct journal *j = &c->journal;
+ int ret = 0; /* Success */
+
+ if (bch_journal_writing_to_device(ca)) {
+ /*
+ * bch_journal_meta will write a record and we'll wait
+ * for the write to complete.
+ * Actually writing the journal (journal_write_locked)
+ * will call journal_next_bucket which notices that the
+ * device is no longer writeable, and picks a new one.
+ */
+ bch_journal_meta(j);
+ BUG_ON(bch_journal_writing_to_device(ca));
+ }
+
+ /*
+ * Flush all btree updates to backing store so that any
+ * journal entries written to ca become stale and are no
+ * longer needed.
+ */
+
+ /*
+ * XXX: switch to normal journal reclaim machinery
+ */
+ bch_btree_flush(c);
+
+ /*
+ * Force a meta-data journal entry to be written so that
+ * we have newer journal entries in devices other than ca,
+ * and wait for the meta data write to complete.
+ */
+ bch_journal_meta(j);
+
+ /*
+ * Verify that we no longer need any of the journal entries in
+ * the device
+ */
+ spin_lock(&j->lock);
+ last_flushed_seq = last_seq(j);
+ spin_unlock(&j->lock);
+
+ nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
+
+ for (i = 0; i < nr_buckets; i += 1)
+ BUG_ON(ca->journal.bucket_seq[i] > last_flushed_seq);
+
+ return ret;
+}
diff --git a/libbcache/journal.h b/libbcache/journal.h
new file mode 100644
index 0000000..759ed60
--- /dev/null
+++ b/libbcache/journal.h
@@ -0,0 +1,387 @@
+#ifndef _BCACHE_JOURNAL_H
+#define _BCACHE_JOURNAL_H
+
+/*
+ * THE JOURNAL:
+ *
+ * The primary purpose of the journal is to log updates (insertions) to the
+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
+ *
+ * Even without the journal, the b-tree is always internally consistent on
+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal
+ * but did handle unclean shutdowns by doing all index updates synchronously
+ * (with coalescing).
+ *
+ * Updates to interior nodes still happen synchronously and without the journal
+ * (for simplicity) - this may change eventually but updates to interior nodes
+ * are rare enough it's not a huge priority.
+ *
+ * This means the journal is relatively separate from the b-tree; it consists of
+ * just a list of keys and journal replay consists of just redoing those
+ * insertions in same order that they appear in the journal.
+ *
+ * PERSISTENCE:
+ *
+ * For synchronous updates (where we're waiting on the index update to hit
+ * disk), the journal entry will be written out immediately (or as soon as
+ * possible, if the write for the previous journal entry was still in flight).
+ *
+ * Synchronous updates are specified by passing a closure (@flush_cl) to
+ * bch_btree_insert() or bch_btree_insert_node(), which then pass that parameter
+ * down to the journalling code. That closure will wait on the journal
+ * write to complete (via closure_wait()).
+ *
+ * If the index update wasn't synchronous, the journal entry will be
+ * written out after the write delay has elapsed - 100 ms by default (the
+ * write_delay_ms field in struct journal).
+ *
+ * JOURNAL ENTRIES:
+ *
+ * A journal entry is variable size (struct jset): it has a fixed length
+ * header and then a variable number of struct jset_entry entries.
+ *
+ * Journal entries are identified by monotonically increasing 64 bit sequence
+ * numbers - jset->seq; other places in the code refer to this sequence number.
+ *
+ * A jset_entry entry contains one or more bkeys (which is what gets inserted
+ * into the b-tree). We need a container to indicate which b-tree the key is
+ * for; also, the roots of the various b-trees are stored in jset_entry entries
+ * (one for each b-tree) - this lets us add new b-tree types without changing
+ * the on disk format.
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
+ *
+ * JOURNAL LAYOUT ON DISK:
+ *
+ * The journal is written to a ringbuffer of buckets (which is kept in the
+ * superblock); the individual buckets are not necessarily contiguous on disk
+ * which means that journal entries are not allowed to span buckets, but also
+ * that we can resize the journal at runtime if desired (unimplemented).
+ *
+ * The journal buckets exist in the same pool as all the other buckets that are
+ * managed by the allocator and garbage collection - garbage collection marks
+ * the journal buckets as metadata buckets.
+ *
+ * OPEN/DIRTY JOURNAL ENTRIES:
+ *
+ * Open/dirty journal entries are journal entries that contain b-tree updates
+ * that have not yet been written out to the b-tree on disk. We have to track
+ * which journal entries are dirty, and we also have to avoid wrapping around
+ * the journal and overwriting old but still dirty journal entries with new
+ * journal entries.
+ *
+ * On disk, this is represented with the "last_seq" field of struct jset;
+ * last_seq is the first sequence number that journal replay has to replay.
+ *
+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
+ * journal_device->bucket_seq) from each journal bucket to the highest sequence
+ * number of any journal entry it contains. Then, by comparing that against
+ * last_seq we can determine whether that journal bucket contains dirty journal
+ * entries or not.
+ *
+ * To track which journal entries are dirty, we maintain a fifo of refcounts
+ * (where each entry corresponds to a specific sequence number) - when a ref
+ * goes to 0, that journal entry is no longer dirty.
+ *
+ * Journalling of index updates is done at the same time as the b-tree itself is
+ * being modified (see btree_insert_key()); when we add the key to the journal
+ * the pending b-tree write takes a ref on the journal entry the key was added
+ * to. If a pending b-tree write would need to take refs on multiple dirty
+ * journal entries, it only keeps the ref on the oldest one (since a newer
+ * journal entry will still be replayed if an older entry was dirty).
+ *
+ * JOURNAL FILLING UP:
+ *
+ * There are two ways the journal could fill up; either we could run out of
+ * space to write to, or we could have too many open journal entries and run out
+ * of room in the fifo of refcounts. Since those refcounts are decremented
+ * without any locking we can't safely resize that fifo, so we handle running
+ * out of refcounts the same way as running out of space.
+ *
+ * If the journal fills up, we start flushing dirty btree nodes until we can
+ * allocate space for a journal write again - preferentially flushing btree
+ * nodes that are pinning the oldest journal entries first.
+ */
+
+#include <linux/hash.h>
+
+#include "journal_types.h"
+
+static inline struct jset_entry *jset_keys_next(struct jset_entry *j)
+{
+ return (void *) __bkey_idx(j, le16_to_cpu(j->u64s));
+}
+
+/*
+ * Only used for holding the journal entries we read in bch_journal_read()
+ * during cache set registration
+ */
+struct journal_replay {
+ struct list_head list;
+ struct jset j;
+};
+
+#define JOURNAL_PIN ((32 * 1024) - 1)
+
+static inline bool journal_pin_active(struct journal_entry_pin *pin)
+{
+ return pin->pin_list != NULL;
+}
+
+void bch_journal_pin_add(struct journal *, struct journal_entry_pin *,
+ journal_pin_flush_fn);
+void bch_journal_pin_drop(struct journal *, struct journal_entry_pin *);
+void bch_journal_pin_add_if_older(struct journal *,
+ struct journal_entry_pin *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
+
+struct closure;
+struct cache_set;
+struct keylist;
+
+struct bkey_i *bch_journal_find_btree_root(struct cache_set *, struct jset *,
+ enum btree_id, unsigned *);
+
+int bch_journal_seq_should_ignore(struct cache_set *, u64, struct btree *);
+
+u64 bch_inode_journal_seq(struct journal *, u64);
+
+static inline int journal_state_count(union journal_res_state s, int idx)
+{
+ return idx == 0 ? s.buf0_count : s.buf1_count;
+}
+
+static inline void journal_state_inc(union journal_res_state *s)
+{
+ s->buf0_count += s->idx == 0;
+ s->buf1_count += s->idx == 1;
+}
+
+static inline void bch_journal_set_has_inode(struct journal_buf *buf, u64 inum)
+{
+ set_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)), buf->has_inode);
+}
+
+/*
+ * Amount of space that will be taken up by some keys in the journal (i.e.
+ * including the jset_entry header)
+ */
+static inline unsigned jset_u64s(unsigned u64s)
+{
+ return u64s + sizeof(struct jset_entry) / sizeof(u64);
+}
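+
+/*
+ * For example (illustrative only): each key journalled via
+ * bch_journal_add_keys() below gets its own jset_entry header, so a key of
+ * 3 u64s costs jset_u64s(3) u64s of reservation, and two such keys cost
+ * 2 * jset_u64s(3).
+ */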
+
+static inline void bch_journal_add_entry_at(struct journal_buf *buf,
+ const void *data, size_t u64s,
+ unsigned type, enum btree_id id,
+ unsigned level, unsigned offset)
+{
+ struct jset_entry *entry = bkey_idx(buf->data, offset);
+
+ entry->u64s = cpu_to_le16(u64s);
+ entry->btree_id = id;
+ entry->level = level;
+ entry->flags = 0;
+ SET_JOURNAL_ENTRY_TYPE(entry, type);
+
+ memcpy_u64s(entry->_data, data, u64s);
+}
+
+static inline void bch_journal_add_keys(struct journal *j, struct journal_res *res,
+ enum btree_id id, const struct bkey_i *k)
+{
+ struct journal_buf *buf = &j->buf[res->idx];
+ unsigned actual = jset_u64s(k->k.u64s);
+
+ EBUG_ON(!res->ref);
+ BUG_ON(actual > res->u64s);
+
+ bch_journal_set_has_inode(buf, k->k.p.inode);
+
+ bch_journal_add_entry_at(buf, k, k->k.u64s,
+ JOURNAL_ENTRY_BTREE_KEYS, id,
+ 0, res->offset);
+
+ res->offset += actual;
+ res->u64s -= actual;
+}
+
+void bch_journal_buf_put_slowpath(struct journal *, bool);
+
+static inline void bch_journal_buf_put(struct journal *j, unsigned idx,
+ bool need_write_just_set)
+{
+ union journal_res_state s;
+
+ s.v = atomic64_sub_return(((union journal_res_state) {
+ .buf0_count = idx == 0,
+ .buf1_count = idx == 1,
+ }).v, &j->reservations.counter);
+
+ EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
+
+ /*
+ * Do not initiate a journal write if the journal is in an error state
+ * (previous journal entry write may have failed)
+ */
+ if (s.idx != idx &&
+ !journal_state_count(s, idx) &&
+ s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
+ bch_journal_buf_put_slowpath(j, need_write_just_set);
+}
+
+/*
+ * This function releases the journal write structure so other threads can
+ * then proceed to add their keys as well.
+ */
+static inline void bch_journal_res_put(struct journal *j,
+ struct journal_res *res)
+{
+ if (!res->ref)
+ return;
+
+ lock_release(&j->res_map, 0, _RET_IP_);
+
+ while (res->u64s) {
+ bch_journal_add_entry_at(&j->buf[res->idx], NULL, 0,
+ JOURNAL_ENTRY_BTREE_KEYS,
+ 0, 0, res->offset);
+ res->offset += jset_u64s(0);
+ res->u64s -= jset_u64s(0);
+ }
+
+ bch_journal_buf_put(j, res->idx, false);
+
+ res->ref = 0;
+}
+
+int bch_journal_res_get_slowpath(struct journal *, struct journal_res *,
+ unsigned, unsigned);
+
+static inline int journal_res_get_fast(struct journal *j,
+ struct journal_res *res,
+ unsigned u64s_min,
+ unsigned u64s_max)
+{
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ do {
+ old.v = new.v = v;
+
+ /*
+ * Check if there is still room in the current journal
+ * entry:
+ */
+ if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
+ return 0;
+
+ res->offset = old.cur_entry_offset;
+ res->u64s = min(u64s_max, j->cur_entry_u64s -
+ old.cur_entry_offset);
+
+ journal_state_inc(&new);
+ new.cur_entry_offset += res->u64s;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ res->ref = true;
+ res->idx = new.idx;
+ res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
+ return 1;
+}
+
+static inline int bch_journal_res_get(struct journal *j, struct journal_res *res,
+ unsigned u64s_min, unsigned u64s_max)
+{
+ int ret;
+
+ EBUG_ON(res->ref);
+ EBUG_ON(u64s_max < u64s_min);
+
+ if (journal_res_get_fast(j, res, u64s_min, u64s_max))
+ goto out;
+
+ ret = bch_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
+ if (ret)
+ return ret;
+out:
+ lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
+ EBUG_ON(!res->ref);
+ return 0;
+}
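+
+/*
+ * Putting the helpers above together - an illustrative sketch only (the real
+ * callers live in the btree update path; @j, @id and @k stand for the caller's
+ * journal, btree id and key) - journalling a single key looks roughly like:
+ *
+ *	struct journal_res res;
+ *	unsigned u64s = jset_u64s(k->k.u64s);
+ *	int ret;
+ *
+ *	memset(&res, 0, sizeof(res));
+ *
+ *	ret = bch_journal_res_get(j, &res, u64s, u64s);
+ *	if (ret)
+ *		return ret;
+ *
+ *	bch_journal_add_keys(j, &res, id, k);
+ *	bch_journal_res_put(j, &res);
+ *
+ * bch_journal_res_put() pads any unused part of the reservation with empty
+ * entries and drops the buffer reference, which may kick off the actual
+ * journal write.
+ */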
+
+void bch_journal_wait_on_seq(struct journal *, u64, struct closure *);
+void bch_journal_flush_seq_async(struct journal *, u64, struct closure *);
+void bch_journal_flush_async(struct journal *, struct closure *);
+void bch_journal_meta_async(struct journal *, struct closure *);
+
+int bch_journal_flush_seq(struct journal *, u64);
+int bch_journal_flush(struct journal *);
+int bch_journal_meta(struct journal *);
+
+void bch_journal_halt(struct journal *);
+
+static inline int bch_journal_error(struct journal *j)
+{
+ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
+ ? -EIO : 0;
+}
+
+static inline bool is_journal_device(struct cache *ca)
+{
+ return ca->mi.state == CACHE_ACTIVE && ca->mi.tier == 0;
+}
+
+static inline bool journal_flushes_device(struct cache *ca)
+{
+ return true;
+}
+
+void bch_journal_start(struct cache_set *);
+void bch_journal_mark(struct cache_set *, struct list_head *);
+void bch_journal_entries_free(struct list_head *);
+int bch_journal_read(struct cache_set *, struct list_head *);
+int bch_journal_replay(struct cache_set *, struct list_head *);
+
+static inline void bch_journal_set_replay_done(struct journal *j)
+{
+ spin_lock(&j->lock);
+ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+
+ set_bit(JOURNAL_REPLAY_DONE, &j->flags);
+ j->cur_pin_list = &fifo_peek_back(&j->pin);
+ spin_unlock(&j->lock);
+}
+
+void bch_journal_free(struct journal *);
+int bch_journal_alloc(struct journal *, unsigned);
+
+ssize_t bch_journal_print_debug(struct journal *, char *);
+
+int bch_cache_journal_alloc(struct cache *);
+
+static inline __le64 *__journal_buckets(struct cache_sb *sb)
+{
+ return sb->_data + bch_journal_buckets_offset(sb);
+}
+
+static inline u64 journal_bucket(struct cache_sb *sb, unsigned nr)
+{
+ return le64_to_cpu(__journal_buckets(sb)[nr]);
+}
+
+static inline void set_journal_bucket(struct cache_sb *sb, unsigned nr, u64 bucket)
+{
+ __journal_buckets(sb)[nr] = cpu_to_le64(bucket);
+}
+
+int bch_journal_move(struct cache *);
+
+#endif /* _BCACHE_JOURNAL_H */
diff --git a/libbcache/journal_types.h b/libbcache/journal_types.h
new file mode 100644
index 0000000..e3698b5
--- /dev/null
+++ b/libbcache/journal_types.h
@@ -0,0 +1,240 @@
+#ifndef _BCACHE_JOURNAL_TYPES_H
+#define _BCACHE_JOURNAL_TYPES_H
+
+#include <linux/cache.h>
+#include <linux/workqueue.h>
+
+#include "alloc_types.h"
+#include "fifo.h"
+
+struct journal_res;
+
+/*
+ * We put two of these in struct journal; we use them for writes to the
+ * journal that are being staged or in flight.
+ */
+struct journal_buf {
+ struct jset *data;
+ struct closure_waitlist wait;
+
+ /*
+ * ugh, prio_buckets are stupid - need to convert them to new
+ * transaction machinery when it arrives
+ */
+ unsigned nr_prio_buckets;
+
+ /* bloom filter: */
+ unsigned long has_inode[1024 / sizeof(unsigned long)];
+};
+
+/*
+ * Something that makes a journal entry dirty - i.e. a btree node that has to be
+ * flushed:
+ */
+
+struct journal_entry_pin_list {
+ struct list_head list;
+ atomic_t count;
+};
+
+struct journal;
+struct journal_entry_pin;
+typedef void (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *);
+
+struct journal_entry_pin {
+ struct list_head list;
+ journal_pin_flush_fn flush;
+ struct journal_entry_pin_list *pin_list;
+};
+
+/* corresponds to a btree node with a blacklisted bset: */
+struct blacklisted_node {
+ __le64 seq;
+ enum btree_id btree_id;
+ struct bpos pos;
+};
+
+struct journal_seq_blacklist {
+ struct list_head list;
+ u64 seq;
+ bool written;
+ struct journal_entry_pin pin;
+
+ struct blacklisted_node *entries;
+ size_t nr_entries;
+};
+
+struct journal_res {
+ bool ref;
+ u8 idx;
+ u16 u64s;
+ u32 offset;
+ u64 seq;
+};
+
+union journal_res_state {
+ struct {
+ atomic64_t counter;
+ };
+
+ struct {
+ u64 v;
+ };
+
+ struct {
+ u64 cur_entry_offset:20,
+ idx:1,
+ prev_buf_unwritten:1,
+ buf0_count:21,
+ buf1_count:21;
+ };
+};
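+
+/*
+ * The bitfields above are meant to exactly fill the 64 bit counter they alias
+ * (20 + 1 + 1 + 21 + 21 == 64); illustratively, that could be asserted at
+ * build time with something like:
+ *
+ *	BUILD_BUG_ON(sizeof(union journal_res_state) != sizeof(u64));
+ */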
+
+/* 4 MB, in bytes: */
+#define JOURNAL_ENTRY_SIZE_MAX (4U << 20)
+
+/*
+ * We stash some journal state as sentinel values in cur_entry_offset:
+ */
+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
+
+#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
+#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
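+
+/*
+ * Note that the sentinels can't collide with a real offset: cur_entry_offset
+ * counts u64s, and even a maximum size journal entry
+ * (JOURNAL_ENTRY_SIZE_MAX = 4 MB) is only 512k u64s, well below
+ * JOURNAL_ENTRY_CLOSED_VAL.
+ */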
+
+/*
+ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
+ * either because something's waiting on the write to complete or because it's
+ * been dirty too long and the timer's expired.
+ */
+
+enum {
+ JOURNAL_REPLAY_DONE,
+ JOURNAL_STARTED,
+ JOURNAL_NEED_WRITE,
+};
+
+/* Embedded in struct cache_set */
+struct journal {
+ /* Fastpath stuff up front: */
+
+ unsigned long flags;
+
+ union journal_res_state reservations;
+ unsigned cur_entry_u64s;
+ unsigned prev_buf_sectors;
+ unsigned cur_buf_sectors;
+ unsigned entry_size_max; /* bytes */
+
+ /*
+ * Two journal entries -- one is currently open for new entries, the
+ * other is possibly being written out.
+ */
+ struct journal_buf buf[2];
+
+ spinlock_t lock;
+
+ /* Used when waiting because the journal was full */
+ wait_queue_head_t wait;
+
+ struct closure io;
+ struct delayed_work write_work;
+
+ /* Sequence number of most recent journal entry (last entry in @pin) */
+ atomic64_t seq;
+
+ /* last_seq from the most recent journal entry written */
+ u64 last_seq_ondisk;
+
+ /*
+ * FIFO of journal entries whose btree updates have not yet been
+ * written out.
+ *
+ * Each entry is a reference count. The position in the FIFO is the
+ * entry's sequence number relative to @seq.
+ *
+ * The journal entry itself holds a reference count, put when the
+ * journal entry is written out. Each btree node modified by the journal
+ * entry also holds a reference count, put when the btree node is
+ * written.
+ *
+ * When a reference count reaches zero, the journal entry is no longer
+ * needed. When all journal entries in the oldest journal bucket are no
+ * longer needed, the bucket can be discarded and reused.
+ */
+ DECLARE_FIFO(struct journal_entry_pin_list, pin);
+ struct journal_entry_pin_list *cur_pin_list;
+
+ /*
+ * Protects the pin lists - the fifo itself is still protected by
+ * j->lock though:
+ */
+ spinlock_t pin_lock;
+
+ struct mutex blacklist_lock;
+ struct list_head seq_blacklist;
+
+ BKEY_PADDED(key);
+ struct cache_group devs;
+
+ struct delayed_work reclaim_work;
+ unsigned long last_flushed;
+
+ /* protects advancing ja->last_idx: */
+ struct mutex reclaim_lock;
+
+ /*
+ * ugh: need to get prio_buckets converted over to the eventual new
+ * transaction machinery
+ */
+ __le64 prio_buckets[MAX_CACHES_PER_SET];
+ unsigned nr_prio_buckets;
+
+ unsigned write_delay_ms;
+ unsigned reclaim_delay_ms;
+
+ u64 res_get_blocked_start;
+ u64 need_write_time;
+ u64 write_start_time;
+
+ struct time_stats *write_time;
+ struct time_stats *delay_time;
+ struct time_stats *blocked_time;
+ struct time_stats *flush_seq_time;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map res_map;
+#endif
+};
+
+/*
+ * Embedded in struct cache. First three fields refer to the array of journal
+ * buckets, in cache_sb.
+ */
+struct journal_device {
+ /*
+ * For each journal bucket, contains the max sequence number of the
+ * journal writes it contains - so we know when a bucket can be reused.
+ */
+ u64 *bucket_seq;
+
+ unsigned sectors_free;
+
+ /* Journal bucket we're currently writing to */
+ unsigned cur_idx;
+
+ /* Last journal bucket that still contains an open journal entry */
+
+ /*
+ * j->lock and j->reclaim_lock must both be held to modify, j->lock
+ * sufficient to read:
+ */
+ unsigned last_idx;
+
+ /* Bio for journal reads/writes to this device */
+ struct bio *bio;
+
+ /* for bch_journal_read_device */
+ struct closure read;
+};
+
+#endif /* _BCACHE_JOURNAL_TYPES_H */
diff --git a/libbcache/keybuf.c b/libbcache/keybuf.c
new file mode 100644
index 0000000..a3c6b03
--- /dev/null
+++ b/libbcache/keybuf.c
@@ -0,0 +1,195 @@
+
+#include "bcache.h"
+#include "btree_gc.h"
+#include "btree_iter.h"
+#include "keybuf.h"
+
+#include <trace/events/bcache.h>
+
+/*
+ * For buffered iteration over the btree, with predicates and ratelimiting and
+ * whatnot
+ */
+
+static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
+{
+ /* Overlapping keys compare equal */
+ if (bkey_cmp(l->key.k.p, bkey_start_pos(&r->key.k)) <= 0)
+ return -1;
+ if (bkey_cmp(bkey_start_pos(&l->key.k), r->key.k.p) >= 0)
+ return 1;
+ return 0;
+}
+
+static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
+ struct keybuf_key *r)
+{
+ return clamp_t(s64, bkey_cmp(l->key.k.p, r->key.k.p), -1, 1);
+}
+
+void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
+ struct bpos end, keybuf_pred_fn *pred)
+{
+ struct bpos start = buf->last_scanned;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned nr_found = 0;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, buf->last_scanned, k) {
+ if (bkey_cmp(k.k->p, end) >= 0) {
+ buf->last_scanned = k.k->p;
+ goto done;
+ }
+
+ if (pred(buf, k)) {
+ struct keybuf_key *w;
+
+ spin_lock(&buf->lock);
+
+ w = array_alloc(&buf->freelist);
+ if (!w) {
+ spin_unlock(&buf->lock);
+ goto done;
+ }
+
+ bkey_reassemble(&w->key, k);
+ atomic_set(&w->ref, -1); /* -1 means hasn't started */
+
+ if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
+ array_free(&buf->freelist, w);
+ else
+ nr_found++;
+
+ spin_unlock(&buf->lock);
+ }
+
+ buf->last_scanned = k.k->p;
+ bch_btree_iter_cond_resched(&iter);
+ }
+
+ /* If we end up here, it means:
+ * - we didn't fill up the keybuf
+ * - we never saw the end key
+ * - there were no more keys to scan
+ * Therefore, we are at the end of the key space */
+ buf->last_scanned = POS_MAX;
+done:
+ bch_btree_iter_unlock(&iter);
+
+ trace_bcache_keyscan(nr_found,
+ start.inode, start.offset,
+ buf->last_scanned.inode,
+ buf->last_scanned.offset);
+
+ spin_lock(&buf->lock);
+
+ if (!RB_EMPTY_ROOT(&buf->keys)) {
+ struct keybuf_key *w;
+
+ w = RB_FIRST(&buf->keys, struct keybuf_key, node);
+ buf->start = bkey_start_pos(&w->key.k);
+
+ w = RB_LAST(&buf->keys, struct keybuf_key, node);
+ buf->end = w->key.k.p;
+ } else {
+ buf->start = POS_MAX;
+ buf->end = POS_MAX;
+ }
+
+ spin_unlock(&buf->lock);
+}
+
+static void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
+{
+ rb_erase(&w->node, &buf->keys);
+ array_free(&buf->freelist, w);
+}
+
+void bch_keybuf_put(struct keybuf *buf, struct keybuf_key *w)
+{
+ BUG_ON(atomic_read(&w->ref) <= 0);
+
+ if (atomic_dec_and_test(&w->ref)) {
+ up(&buf->in_flight);
+
+ spin_lock(&buf->lock);
+ bch_keybuf_del(buf, w);
+ spin_unlock(&buf->lock);
+ }
+}
+
+void bch_keybuf_recalc_oldest_gens(struct cache_set *c, struct keybuf *buf)
+{
+ struct keybuf_key *w, *n;
+
+ spin_lock(&buf->lock);
+ rbtree_postorder_for_each_entry_safe(w, n,
+ &buf->keys, node)
+ bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(&w->key));
+ spin_unlock(&buf->lock);
+}
+
+bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bpos start,
+ struct bpos end)
+{
+ bool ret = false;
+ struct keybuf_key *w, *next, s = { .key.k.p = start };
+
+ if (bkey_cmp(end, buf->start) <= 0 ||
+ bkey_cmp(start, buf->end) >= 0)
+ return false;
+
+ spin_lock(&buf->lock);
+
+ for (w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
+ w && bkey_cmp(bkey_start_pos(&w->key.k), end) < 0;
+ w = next) {
+ next = RB_NEXT(w, node);
+
+ if (atomic_read(&w->ref) == -1)
+ bch_keybuf_del(buf, w);
+ else
+ ret = true;
+ }
+
+ spin_unlock(&buf->lock);
+ return ret;
+}
+
+struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
+{
+ struct keybuf_key *w;
+
+ spin_lock(&buf->lock);
+
+ w = RB_FIRST(&buf->keys, struct keybuf_key, node);
+
+ while (w && atomic_read(&w->ref) != -1)
+ w = RB_NEXT(w, node);
+
+ if (!w) {
+ spin_unlock(&buf->lock);
+ return NULL;
+ }
+
+ atomic_set(&w->ref, 1);
+ spin_unlock(&buf->lock);
+
+ down(&buf->in_flight);
+
+ return w;
+}
+
+void bch_keybuf_init(struct keybuf *buf)
+{
+ sema_init(&buf->in_flight, KEYBUF_REFILL_BATCH / 2);
+
+ buf->last_scanned = POS_MAX;
+ buf->start = POS_MIN;
+ buf->end = POS_MIN;
+
+ buf->keys = RB_ROOT;
+
+ spin_lock_init(&buf->lock);
+ array_allocator_init(&buf->freelist);
+}
diff --git a/libbcache/keybuf.h b/libbcache/keybuf.h
new file mode 100644
index 0000000..d6fdda9
--- /dev/null
+++ b/libbcache/keybuf.h
@@ -0,0 +1,16 @@
+#ifndef _BCACHE_KEYBUF_H
+#define _BCACHE_KEYBUF_H
+
+#include "keybuf_types.h"
+
+typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey_s_c);
+
+void bch_keybuf_init(struct keybuf *);
+void bch_refill_keybuf(struct cache_set *, struct keybuf *,
+ struct bpos, keybuf_pred_fn *);
+void bch_keybuf_recalc_oldest_gens(struct cache_set *, struct keybuf *);
+bool bch_keybuf_check_overlapping(struct keybuf *, struct bpos, struct bpos);
+void bch_keybuf_put(struct keybuf *, struct keybuf_key *);
+struct keybuf_key *bch_keybuf_next(struct keybuf *);
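+
+/*
+ * Typical consumer pattern - an illustrative sketch only (the writeback/moving
+ * code that drives this lives elsewhere):
+ *
+ *	bch_refill_keybuf(c, buf, end, pred);
+ *
+ *	while ((w = bch_keybuf_next(buf))) {
+ *		issue IO for w->key, then on completion:
+ *		bch_keybuf_put(buf, w);
+ *	}
+ */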
+
+#endif /* _BCACHE_KEYBUF_H */
diff --git a/libbcache/keybuf_types.h b/libbcache/keybuf_types.h
new file mode 100644
index 0000000..3facc4a
--- /dev/null
+++ b/libbcache/keybuf_types.h
@@ -0,0 +1,33 @@
+#ifndef _BCACHE_KEYBUF_TYPES_H
+#define _BCACHE_KEYBUF_TYPES_H
+
+struct keybuf_key {
+ struct rb_node node;
+ BKEY_PADDED(key);
+ atomic_t ref;
+};
+
+#define KEYBUF_REFILL_BATCH 500
+
+struct keybuf {
+ struct bpos last_scanned;
+ spinlock_t lock;
+
+ /*
+ * Beginning and end of range in the rb tree - so that we can skip
+ * taking the lock and checking the rb tree when we need to check for
+ * overlapping keys.
+ */
+ struct bpos start;
+ struct bpos end;
+
+ struct rb_root keys;
+
+ unsigned max_in_flight;
+ struct semaphore in_flight;
+
+ DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist,
+ KEYBUF_REFILL_BATCH);
+};
+
+#endif /* _BCACHE_KEYBUF_TYPES_H */
diff --git a/libbcache/keylist.c b/libbcache/keylist.c
new file mode 100644
index 0000000..adf5eeb
--- /dev/null
+++ b/libbcache/keylist.c
@@ -0,0 +1,55 @@
+
+#include "bcache.h"
+#include "keylist.h"
+
+int bch_keylist_realloc(struct keylist *l, u64 *inline_u64s,
+ size_t nr_inline_u64s, size_t new_u64s)
+{
+ size_t oldsize = bch_keylist_u64s(l);
+ size_t newsize = oldsize + new_u64s;
+ u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
+ u64 *new_keys;
+
+ newsize = roundup_pow_of_two(newsize);
+
+ if (newsize <= nr_inline_u64s ||
+ (old_buf && roundup_pow_of_two(oldsize) == newsize))
+ return 0;
+
+ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO);
+ if (!new_keys)
+ return -ENOMEM;
+
+ if (!old_buf)
+ memcpy_u64s(new_keys, inline_u64s, oldsize);
+
+ l->keys_p = new_keys;
+ l->top_p = new_keys + oldsize;
+
+ return 0;
+}
+
+void bch_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
+{
+ struct bkey_i *where;
+
+ for_each_keylist_key(l, where)
+ if (bkey_cmp(insert->k.p, where->k.p) < 0)
+ break;
+
+ memmove_u64s_up((u64 *) where + insert->k.u64s,
+ where,
+ ((u64 *) l->top) - ((u64 *) where));
+
+ l->top_p += insert->k.u64s;
+ bkey_copy(where, insert);
+}
+
+void bch_keylist_pop_front(struct keylist *l)
+{
+ l->top_p -= bch_keylist_front(l)->k.u64s;
+
+ memmove_u64s_down(l->keys,
+ bkey_next(l->keys),
+ bch_keylist_u64s(l));
+}
diff --git a/libbcache/keylist.h b/libbcache/keylist.h
new file mode 100644
index 0000000..1166f94
--- /dev/null
+++ b/libbcache/keylist.h
@@ -0,0 +1,62 @@
+#ifndef _BCACHE_KEYLIST_H
+#define _BCACHE_KEYLIST_H
+
+#include "keylist_types.h"
+
+int bch_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
+void bch_keylist_add_in_order(struct keylist *, struct bkey_i *);
+void bch_keylist_pop_front(struct keylist *);
+
+static inline void bch_keylist_init(struct keylist *l, u64 *inline_keys,
+ size_t nr_inline_u64s)
+{
+ l->top_p = l->keys_p = inline_keys;
+}
+
+static inline void bch_keylist_free(struct keylist *l, u64 *inline_keys)
+{
+ if (l->keys_p != inline_keys)
+ kfree(l->keys_p);
+ memset(l, 0, sizeof(*l));
+}
+
+static inline void bch_keylist_push(struct keylist *l)
+{
+ l->top = bkey_next(l->top);
+}
+
+static inline void bch_keylist_add(struct keylist *l, const struct bkey_i *k)
+{
+ bkey_copy(l->top, k);
+ bch_keylist_push(l);
+}
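+
+/*
+ * Illustrative usage (a sketch; the inline buffer size below is arbitrary):
+ * callers keep a small on-stack buffer and let bch_keylist_realloc() spill to
+ * the heap as keys are appended:
+ *
+ *	u64 inline_keys[16];
+ *	struct keylist keys;
+ *
+ *	bch_keylist_init(&keys, inline_keys, ARRAY_SIZE(inline_keys));
+ *
+ *	if (bch_keylist_realloc(&keys, inline_keys,
+ *				ARRAY_SIZE(inline_keys), k->k.u64s))
+ *		return -ENOMEM;
+ *	bch_keylist_add(&keys, k);
+ *	...
+ *	bch_keylist_free(&keys, inline_keys);
+ */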
+
+static inline bool bch_keylist_empty(struct keylist *l)
+{
+ return l->top == l->keys;
+}
+
+static inline size_t bch_keylist_u64s(struct keylist *l)
+{
+ return l->top_p - l->keys_p;
+}
+
+static inline size_t bch_keylist_bytes(struct keylist *l)
+{
+ return bch_keylist_u64s(l) * sizeof(u64);
+}
+
+static inline struct bkey_i *bch_keylist_front(struct keylist *l)
+{
+ return l->keys;
+}
+
+#define for_each_keylist_key(_keylist, _k) \
+ for (_k = (_keylist)->keys; \
+ _k != (_keylist)->top; \
+ _k = bkey_next(_k))
+
+#define keylist_single(k) \
+ ((struct keylist) { .keys = k, .top = bkey_next(k) })
+
+#endif /* _BCACHE_KEYLIST_H */
diff --git a/libbcache/keylist_types.h b/libbcache/keylist_types.h
new file mode 100644
index 0000000..195785b
--- /dev/null
+++ b/libbcache/keylist_types.h
@@ -0,0 +1,15 @@
+#ifndef _BCACHE_KEYLIST_TYPES_H
+#define _BCACHE_KEYLIST_TYPES_H
+
+struct keylist {
+ union {
+ struct bkey_i *keys;
+ u64 *keys_p;
+ };
+ union {
+ struct bkey_i *top;
+ u64 *top_p;
+ };
+};
+
+#endif /* _BCACHE_KEYLIST_TYPES_H */
diff --git a/libbcache/migrate.c b/libbcache/migrate.c
new file mode 100644
index 0000000..5a26e22
--- /dev/null
+++ b/libbcache/migrate.c
@@ -0,0 +1,369 @@
+/*
+ * Code for moving data off a device.
+ */
+
+#include "bcache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "migrate.h"
+#include "move.h"
+
+static int issue_migration_move(struct cache *ca,
+ struct moving_context *ctxt,
+ struct bkey_s_c k)
+{
+ struct cache_set *c = ca->set;
+ struct disk_reservation res;
+ const struct bch_extent_ptr *ptr;
+ int ret;
+
+ if (bch_disk_reservation_get(c, &res, k.k->size, 0))
+ return -ENOSPC;
+
+ extent_for_each_ptr(bkey_s_c_to_extent(k), ptr)
+ if (ptr->dev == ca->sb.nr_this_dev)
+ goto found;
+
+ BUG();
+found:
+ /* XXX: we need to be doing something with the disk reservation */
+
+ ret = bch_data_move(c, ctxt, &c->migration_write_point, k, ptr);
+ if (ret)
+ bch_disk_reservation_put(c, &res);
+ return ret;
+}
+
+#define MAX_DATA_OFF_ITER 10
+
+/*
+ * This moves only the data off, leaving the meta-data (if any) in place.
+ * It walks the key space, and for any key with a valid pointer to the
+ * relevant device, it copies it elsewhere, updating the key to point to
+ * the copy.
+ * The meta-data is moved off by bch_move_meta_data_off_device.
+ *
+ * Note: If the number of data replicas desired is > 1, ideally, new
+ * copies would not be made on a device that already has a copy (if
+ * there are enough devices).
+ * This is _not_ currently implemented. The multiple replicas can
+ * land in the same device even if there are others available.
+ */
+
+int bch_move_data_off_device(struct cache *ca)
+{
+ struct moving_context ctxt;
+ struct cache_set *c = ca->set;
+ unsigned pass = 0;
+ u64 seen_key_count;
+ int ret = 0;
+
+ BUG_ON(ca->mi.state == CACHE_ACTIVE);
+
+ bch_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
+ ctxt.avoid = ca;
+
+ /*
+ * In theory, only one pass should be necessary as we've
+ * quiesced all writes before calling this.
+ *
+ * However, in practice, more than one pass may be necessary:
+ * - Some move fails due to an error. We can find this out
+ * from the moving_context.
+ * - Some key swap failed because some of the pointers in the
+ * key in the tree changed due to caching behavior, btree gc
+ * pruning stale pointers, or tiering (if the device being
+ * removed is in tier 0). A smarter bkey_cmpxchg would
+ * handle these cases.
+ *
+ * Thus this scans the tree one more time than strictly necessary,
+ * but that can be viewed as a verification pass.
+ */
+
+ do {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ seen_key_count = 0;
+ atomic_set(&ctxt.error_count, 0);
+ atomic_set(&ctxt.error_flags, 0);
+
+ bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+ while (!bch_move_ctxt_wait(&ctxt) &&
+ (k = bch_btree_iter_peek(&iter)).k &&
+ !(ret = btree_iter_err(k))) {
+ if (!bkey_extent_is_data(k.k) ||
+ !bch_extent_has_device(bkey_s_c_to_extent(k),
+ ca->sb.nr_this_dev))
+ goto next;
+
+ ret = issue_migration_move(ca, &ctxt, k);
+ if (ret == -ENOMEM) {
+ bch_btree_iter_unlock(&iter);
+
+ /*
+ * memory allocation failure, wait for some IO
+ * to finish
+ */
+ bch_move_ctxt_wait_for_io(&ctxt);
+ continue;
+ }
+ if (ret == -ENOSPC)
+ break;
+ BUG_ON(ret);
+
+ seen_key_count++;
+next:
+ bch_btree_iter_advance_pos(&iter);
+ bch_btree_iter_cond_resched(&iter);
+
+ }
+ bch_btree_iter_unlock(&iter);
+ bch_move_ctxt_exit(&ctxt);
+
+ if (ret)
+ return ret;
+ } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
+
+ if (seen_key_count) {
+ pr_err("Unable to migrate all data in %d iterations.",
+ MAX_DATA_OFF_ITER);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * This walks the btree, and for any node on the relevant device it moves the
+ * node elsewhere.
+ */
+static int bch_move_btree_off(struct cache *ca, enum btree_id id)
+{
+ struct cache_set *c = ca->set;
+ struct btree_iter iter;
+ struct closure cl;
+ struct btree *b;
+ int ret;
+
+ BUG_ON(ca->mi.state == CACHE_ACTIVE);
+
+ closure_init_stack(&cl);
+
+ for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
+retry:
+ if (!bch_extent_has_device(e, ca->sb.nr_this_dev))
+ continue;
+
+ ret = bch_btree_node_rewrite(&iter, b, &cl);
+ if (ret == -EINTR || ret == -ENOSPC) {
+ /*
+ * Drop locks to upgrade locks or wait on
+ * reserve: after retaking, recheck in case we
+ * raced.
+ */
+ bch_btree_iter_unlock(&iter);
+ closure_sync(&cl);
+ b = bch_btree_iter_peek_node(&iter);
+ goto retry;
+ }
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+ return ret;
+ }
+
+ bch_btree_iter_set_locks_want(&iter, 0);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret; /* btree IO error */
+
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
+
+ BUG_ON(bch_extent_has_device(e, ca->sb.nr_this_dev));
+ }
+ bch_btree_iter_unlock(&iter);
+ }
+
+ return 0;
+}
+
+/*
+ * This moves only the meta-data off, leaving the data (if any) in place.
+ * The data is moved off by bch_move_data_off_device, if desired, and
+ * called first.
+ *
+ * Before calling this, allocation of buckets to the device must have
+ * been disabled, as otherwise we'll continue to write meta-data to the device
+ * when new buckets are picked for meta-data writes.
+ * In addition, the copying gc and allocator threads for the device
+ * must have been stopped. The allocator thread is the only thread
+ * that writes prio/gen information.
+ *
+ * Meta-data consists of:
+ * - Btree nodes
+ * - Prio/gen information
+ * - Journal entries
+ * - Superblock
+ *
+ * This has to move the btree nodes and the journal only:
+ * - prio/gen information is not written once the allocator thread is stopped.
+ *   Also, as the prio/gen information is per-device, it is not moved.
+ * - the superblock will be written by the caller once after everything
+ * is stopped.
+ *
+ * Note that currently there is no way to stop btree node and journal
+ * meta-data writes to a device without moving the meta-data because
+ * once a bucket is open for a btree node, unless a replacement btree
+ * node is allocated (and the tree updated), the bucket will continue
+ * to be written with updates. Similarly for the journal (it gets
+ * written until filled).
+ *
+ * This routine leaves the data (if any) in place. Whether the data
+ * should be moved off is a decision independent of whether the meta
+ * data should be moved off and stopped:
+ *
+ * - For device removal, both data and meta-data are moved off, in
+ * that order.
+ *
+ * - However, for turning a device read-only without removing it, only
+ * meta-data is moved off since that's the only way to prevent it
+ * from being written. Data is left in the device, but no new data
+ * is written.
+ */
+
+int bch_move_meta_data_off_device(struct cache *ca)
+{
+ unsigned i;
+ int ret;
+
+ /* 1st, Move the btree nodes off the device */
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ ret = bch_move_btree_off(ca, i);
+ if (ret)
+ return ret;
+ }
+
+ /* There are no prios/gens to move -- they are already in the device. */
+
+ /* 2nd. Move the journal off the device */
+
+ ret = bch_journal_move(ca);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+/*
+ * Flagging data bad when forcibly removing a device after failing to
+ * migrate the data off the device.
+ */
+
+static int bch_flag_key_bad(struct btree_iter *iter,
+ struct cache *ca,
+ struct bkey_s_c_extent orig)
+{
+ BKEY_PADDED(key) tmp;
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct cache_set *c = ca->set;
+
+ bkey_reassemble(&tmp.key, orig.s_c);
+ e = bkey_i_to_s_extent(&tmp.key);
+
+ extent_for_each_ptr_backwards(e, ptr)
+ if (ptr->dev == ca->sb.nr_this_dev)
+ bch_extent_drop_ptr(e, ptr);
+
+ /*
+ * If the new extent no longer has any pointers, bch_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
+ */
+ bch_extent_normalize(c, e.s);
+
+ return bch_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(iter, &tmp.key));
+}
+
+/*
+ * This doesn't actually move any data -- it marks keys as bad if they
+ * point to the device being forcibly removed and have no other valid
+ * pointers. If a key does have other valid pointers, the pointers to
+ * the removed device are simply dropped.
+ * This is only called if bch_move_data_off_device above failed, meaning
+ * that we've already tried to move the data MAX_DATA_OFF_ITER times and
+ * are not likely to succeed if we try again.
+ */
+
+int bch_flag_data_bad(struct cache *ca)
+{
+ int ret = 0;
+ struct bkey_s_c k;
+ struct bkey_s_c_extent e;
+ struct btree_iter iter;
+
+ bch_btree_iter_init(&iter, ca->set, BTREE_ID_EXTENTS, POS_MIN);
+
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(ret = btree_iter_err(k))) {
+ if (!bkey_extent_is_data(k.k))
+ goto advance;
+
+ e = bkey_s_c_to_extent(k);
+ if (!bch_extent_has_device(e, ca->sb.nr_this_dev))
+ goto advance;
+
+ ret = bch_flag_key_bad(&iter, ca, e);
+
+ /*
+ * don't want to leave ret == -EINTR, since if we raced and
+ * something else overwrote the key we could spuriously return
+ * -EINTR below:
+ */
+ if (ret == -EINTR)
+ ret = 0;
+ if (ret)
+ break;
+
+ /*
+ * If the replica we're dropping was dirty and there is an
+ * additional cached replica, the cached replica will now be
+ * considered dirty - upon inserting the new version of the key,
+ * the bucket accounting will be updated to reflect the fact
+ * that the cached data is now dirty and everything works out as
+ * if by magic without us having to do anything.
+ *
+ * The one thing we need to be concerned with here is there's a
+ * race between when we drop any stale pointers from the key
+ * we're about to insert, and when the key actually gets
+ * inserted and the cached data is marked as dirty - we could
+ * end up trying to insert a key with a pointer that should be
+ * dirty, but points to stale data.
+ *
+ * If that happens the insert code just bails out and doesn't do
+ * the insert - however, it doesn't return an error. Hence we
+ * need to always recheck the current key before advancing to
+ * the next:
+ */
+ continue;
+advance:
+ bch_btree_iter_advance_pos(&iter);
+ }
+
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
diff --git a/libbcache/migrate.h b/libbcache/migrate.h
new file mode 100644
index 0000000..55636e0
--- /dev/null
+++ b/libbcache/migrate.h
@@ -0,0 +1,8 @@
+#ifndef _BCACHE_MIGRATE_H
+#define _BCACHE_MIGRATE_H
+
+int bch_move_data_off_device(struct cache *);
+int bch_move_meta_data_off_device(struct cache *);
+int bch_flag_data_bad(struct cache *);
+
+#endif /* _BCACHE_MIGRATE_H */
diff --git a/libbcache/move.c b/libbcache/move.c
new file mode 100644
index 0000000..f3ab9e8
--- /dev/null
+++ b/libbcache/move.c
@@ -0,0 +1,388 @@
+
+#include "bcache.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "io.h"
+#include "move.h"
+#include "super.h"
+#include "keylist.h"
+
+#include <linux/ioprio.h>
+
+#include <trace/events/bcache.h>
+
+static struct bch_extent_ptr *bkey_find_ptr(struct cache_set *c,
+ struct bkey_s_extent e,
+ struct bch_extent_ptr ptr)
+{
+ struct bch_extent_ptr *ptr2;
+ struct cache_member_rcu *mi;
+ unsigned bucket_bits;
+
+ mi = cache_member_info_get(c);
+ bucket_bits = ilog2(mi->m[ptr.dev].bucket_size);
+ cache_member_info_put();
+
+ extent_for_each_ptr(e, ptr2)
+ if (ptr2->dev == ptr.dev &&
+ ptr2->gen == ptr.gen &&
+ (ptr2->offset >> bucket_bits) ==
+ (ptr.offset >> bucket_bits))
+ return ptr2;
+
+ return NULL;
+}
+
+static struct bch_extent_ptr *bch_migrate_matching_ptr(struct migrate_write *m,
+ struct bkey_s_extent e)
+{
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_ptr *ret;
+
+ if (m->move)
+ ret = bkey_find_ptr(m->op.c, e, m->move_ptr);
+ else
+ extent_for_each_ptr(bkey_i_to_s_c_extent(&m->key), ptr)
+ if ((ret = bkey_find_ptr(m->op.c, e, *ptr)))
+ break;
+
+ return ret;
+}
+
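+/*
+ * Once the data has been written to its new location, update the index: for
+ * each extent in the btree covered by the keys we wrote, splice in the new
+ * pointer(s) - dropping the pointer that was moved, if this is a move rather
+ * than a promote - and skip extents that no longer match (e.g. because they
+ * were overwritten, or because a racing promote already added a copy on the
+ * same device).
+ */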
+static int bch_migrate_index_update(struct bch_write_op *op)
+{
+ struct cache_set *c = op->c;
+ struct migrate_write *m =
+ container_of(op, struct migrate_write, op);
+ struct keylist *keys = &op->insert_keys;
+ struct btree_iter iter;
+ int ret = 0;
+
+ bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS,
+ bkey_start_pos(&bch_keylist_front(keys)->k));
+
+ while (1) {
+ struct bkey_i *insert = bch_keylist_front(keys);
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
+ struct bch_extent_ptr *ptr;
+ struct bkey_s_extent e;
+ BKEY_PADDED(k) new;
+
+ if (!k.k) {
+ ret = bch_btree_iter_unlock(&iter);
+ break;
+ }
+
+ if (!bkey_extent_is_data(k.k))
+ goto nomatch;
+
+ bkey_reassemble(&new.k, k);
+ bch_cut_front(iter.pos, &new.k);
+ bch_cut_back(insert->k.p, &new.k.k);
+ e = bkey_i_to_s_extent(&new.k);
+
+ /* hack - promotes can race: */
+ if (m->promote)
+ extent_for_each_ptr(bkey_i_to_s_extent(insert), ptr)
+ if (bch_extent_has_device(e.c, ptr->dev))
+ goto nomatch;
+
+ ptr = bch_migrate_matching_ptr(m, e);
+ if (ptr) {
+ unsigned insert_flags =
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL;
+
+ /* copygc uses btree node reserve: */
+ if (m->move)
+ insert_flags |= BTREE_INSERT_USE_RESERVE;
+
+ if (m->move)
+ __bch_extent_drop_ptr(e, ptr);
+
+ memcpy_u64s(extent_entry_last(e),
+ &insert->v,
+ bkey_val_u64s(&insert->k));
+ e.k->u64s += bkey_val_u64s(&insert->k);
+
+ bch_extent_narrow_crcs(e);
+ bch_extent_drop_redundant_crcs(e);
+ bch_extent_normalize(c, e.s);
+
+ ret = bch_btree_insert_at(c, &op->res,
+ NULL, op_journal_seq(op),
+ insert_flags,
+ BTREE_INSERT_ENTRY(&iter, &new.k));
+ if (ret && ret != -EINTR)
+ break;
+ } else {
+nomatch:
+ bch_btree_iter_advance_pos(&iter);
+ }
+
+ while (bkey_cmp(iter.pos, bch_keylist_front(keys)->k.p) >= 0) {
+ bch_keylist_pop_front(keys);
+ if (bch_keylist_empty(keys))
+ goto out;
+ }
+
+ bch_cut_front(iter.pos, bch_keylist_front(keys));
+ }
+out:
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
+
+void bch_migrate_write_init(struct cache_set *c,
+ struct migrate_write *m,
+ struct write_point *wp,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *move_ptr,
+ unsigned flags)
+{
+ bkey_reassemble(&m->key, k);
+
+ m->promote = false;
+ m->move = move_ptr != NULL;
+ if (move_ptr)
+ m->move_ptr = *move_ptr;
+
+ if (bkey_extent_is_cached(k.k))
+ flags |= BCH_WRITE_CACHED;
+
+ bch_write_op_init(&m->op, c, &m->wbio,
+ (struct disk_reservation) { 0 },
+ wp,
+ bkey_start_pos(k.k),
+ NULL, flags);
+
+ if (m->move)
+ m->op.alloc_reserve = RESERVE_MOVINGGC;
+
+ m->op.nr_replicas = 1;
+ m->op.index_update_fn = bch_migrate_index_update;
+}
+
+static void migrate_bio_init(struct moving_io *io, struct bio *bio,
+ unsigned sectors)
+{
+ bio_init(bio);
+ bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+ bio->bi_iter.bi_size = sectors << 9;
+ bio->bi_max_vecs = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+ bio->bi_private = &io->cl;
+ bio->bi_io_vec = io->bi_inline_vecs;
+ bch_bio_map(bio, NULL);
+}
+
+static void moving_io_destructor(struct closure *cl)
+{
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct moving_context *ctxt = io->ctxt;
+ struct bio_vec *bv;
+ int i;
+
+ //if (io->replace.failures)
+ // trace_bcache_copy_collision(q, &io->key.k);
+
+ atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
+ wake_up(&ctxt->wait);
+
+ bio_for_each_segment_all(bv, &io->write.wbio.bio, i)
+ if (bv->bv_page)
+ __free_page(bv->bv_page);
+
+ kfree(io);
+}
+
+static void moving_error(struct moving_context *ctxt, unsigned flag)
+{
+ atomic_inc(&ctxt->error_count);
+ //atomic_or(flag, &ctxt->error_flags);
+}
+
+static void moving_io_after_write(struct closure *cl)
+{
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct moving_context *ctxt = io->ctxt;
+
+ if (io->write.op.error)
+ moving_error(ctxt, MOVING_FLAG_WRITE);
+
+ moving_io_destructor(cl);
+}
+
+static void write_moving(struct moving_io *io)
+{
+ struct bch_write_op *op = &io->write.op;
+
+ if (op->error) {
+ closure_return_with_destructor(&io->cl, moving_io_destructor);
+ } else {
+ closure_call(&op->cl, bch_write, NULL, &io->cl);
+ closure_return_with_destructor(&io->cl, moving_io_after_write);
+ }
+}
+
+static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
+{
+ struct moving_io *io =
+ list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
+
+ return io && io->read_completed ? io : NULL;
+}
+
+static void read_moving_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct moving_context *ctxt = io->ctxt;
+
+ trace_bcache_move_read_done(&io->write.key.k);
+
+ if (bio->bi_error) {
+ io->write.op.error = bio->bi_error;
+ moving_error(io->ctxt, MOVING_FLAG_READ);
+ }
+
+ io->read_completed = true;
+ if (next_pending_write(ctxt))
+ wake_up(&ctxt->wait);
+
+ closure_put(&ctxt->cl);
+}
+
+static void __bch_data_move(struct closure *cl)
+{
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct cache_set *c = io->write.op.c;
+ struct extent_pick_ptr pick;
+
+ bch_extent_pick_ptr_avoiding(c, bkey_i_to_s_c(&io->write.key),
+ io->ctxt->avoid, &pick);
+ if (IS_ERR_OR_NULL(pick.ca))
+ closure_return_with_destructor(cl, moving_io_destructor);
+
+ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->write.key.k);
+ io->rbio.bio.bi_end_io = read_moving_endio;
+
+ /*
+ * dropped by read_moving_endio() - guards against use after free of
+ * ctxt when doing wakeup
+ */
+ closure_get(&io->ctxt->cl);
+
+ bch_read_extent(c, &io->rbio,
+ bkey_i_to_s_c(&io->write.key),
+ &pick, BCH_READ_IS_LAST);
+}
+
+int bch_data_move(struct cache_set *c,
+ struct moving_context *ctxt,
+ struct write_point *wp,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *move_ptr)
+{
+ struct moving_io *io;
+
+ io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(k.k->size, PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io)
+ return -ENOMEM;
+
+ io->ctxt = ctxt;
+
+ migrate_bio_init(io, &io->rbio.bio, k.k->size);
+
+ if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) {
+ kfree(io);
+ return -ENOMEM;
+ }
+
+ migrate_bio_init(io, &io->write.wbio.bio, k.k->size);
+ bio_get(&io->write.wbio.bio);
+ io->write.wbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
+
+ bch_migrate_write_init(c, &io->write, wp, k, move_ptr, 0);
+
+ trace_bcache_move_read(&io->write.key.k);
+
+ ctxt->keys_moved++;
+ ctxt->sectors_moved += k.k->size;
+ if (ctxt->rate)
+ bch_ratelimit_increment(ctxt->rate, k.k->size);
+
+ atomic_add(k.k->size, &ctxt->sectors_in_flight);
+ list_add_tail(&io->list, &ctxt->reads);
+
+ closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl);
+ return 0;
+}
+
+static void do_pending_writes(struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ while ((io = next_pending_write(ctxt))) {
+ list_del(&io->list);
+ trace_bcache_move_write(&io->write.key.k);
+ write_moving(io);
+ }
+}
+
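+/*
+ * Issue any writes whose reads have already completed, then wait until either
+ * another read completes or @_cond becomes true:
+ */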
+#define move_ctxt_wait_event(_ctxt, _cond) \
+do { \
+ do_pending_writes(_ctxt); \
+ \
+ if (_cond) \
+ break; \
+ __wait_event((_ctxt)->wait, \
+ next_pending_write(_ctxt) || (_cond)); \
+} while (1)
+
+int bch_move_ctxt_wait(struct moving_context *ctxt)
+{
+ move_ctxt_wait_event(ctxt,
+ atomic_read(&ctxt->sectors_in_flight) <
+ ctxt->max_sectors_in_flight);
+
+ return ctxt->rate
+ ? bch_ratelimit_wait_freezable_stoppable(ctxt->rate)
+ : 0;
+}
+
+void bch_move_ctxt_wait_for_io(struct moving_context *ctxt)
+{
+ unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight);
+
+ move_ctxt_wait_event(ctxt,
+ !atomic_read(&ctxt->sectors_in_flight) ||
+ atomic_read(&ctxt->sectors_in_flight) != sectors_pending);
+}
+
+void bch_move_ctxt_exit(struct moving_context *ctxt)
+{
+ move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight));
+ closure_sync(&ctxt->cl);
+
+ EBUG_ON(!list_empty(&ctxt->reads));
+ EBUG_ON(atomic_read(&ctxt->sectors_in_flight));
+}
+
+void bch_move_ctxt_init(struct moving_context *ctxt,
+ struct bch_ratelimit *rate,
+ unsigned max_sectors_in_flight)
+{
+ memset(ctxt, 0, sizeof(*ctxt));
+ closure_init_stack(&ctxt->cl);
+
+ ctxt->rate = rate;
+ ctxt->max_sectors_in_flight = max_sectors_in_flight;
+
+ INIT_LIST_HEAD(&ctxt->reads);
+ init_waitqueue_head(&ctxt->wait);
+}
diff --git a/libbcache/move.h b/libbcache/move.h
new file mode 100644
index 0000000..787023e
--- /dev/null
+++ b/libbcache/move.h
@@ -0,0 +1,87 @@
+#ifndef _BCACHE_MOVE_H
+#define _BCACHE_MOVE_H
+
+#include "buckets.h"
+#include "io_types.h"
+#include "move_types.h"
+
+enum moving_flag_bitnos {
+ MOVING_FLAG_BITNO_READ = 0,
+ MOVING_FLAG_BITNO_WRITE,
+};
+
+#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ)
+#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE)
+
+struct migrate_write {
+ BKEY_PADDED(key);
+ bool promote;
+ bool move;
+ struct bch_extent_ptr move_ptr;
+ struct bch_write_op op;
+ struct bch_write_bio wbio;
+};
+
+void bch_migrate_write_init(struct cache_set *,
+ struct migrate_write *,
+ struct write_point *,
+ struct bkey_s_c,
+ const struct bch_extent_ptr *,
+ unsigned);
+
+#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
+
+struct moving_context {
+ /* Closure for waiting on all reads and writes to complete */
+ struct closure cl;
+
+ /* Number and types of errors reported */
+ atomic_t error_count;
+ atomic_t error_flags;
+
+ /* Key and sector moves issued, updated from submission context */
+ u64 keys_moved;
+ u64 sectors_moved;
+
+ /* Rate-limiter counting submitted reads */
+ struct bch_ratelimit *rate;
+
+ /* Device to avoid reading from, if possible: */
+ struct cache *avoid;
+
+ struct list_head reads;
+
+ /* Configuration */
+ unsigned max_sectors_in_flight;
+ atomic_t sectors_in_flight;
+
+ wait_queue_head_t wait;
+};
+
+struct moving_io {
+ struct list_head list;
+ struct rb_node node;
+ struct closure cl;
+ struct moving_context *ctxt;
+ struct migrate_write write;
+ bool read_completed;
+
+ struct bch_read_bio rbio;
+ /* Must be last since it is variable size */
+ struct bio_vec bi_inline_vecs[0];
+};
+
+int bch_data_move(struct cache_set *,
+ struct moving_context *,
+ struct write_point *,
+ struct bkey_s_c,
+ const struct bch_extent_ptr *);
+
+int bch_move_ctxt_wait(struct moving_context *);
+void bch_move_ctxt_wait_for_io(struct moving_context *);
+
+void bch_move_ctxt_exit(struct moving_context *);
+void bch_move_ctxt_init(struct moving_context *, struct bch_ratelimit *,
+ unsigned);
+
+#endif /* _BCACHE_MOVE_H */
diff --git a/libbcache/move_types.h b/libbcache/move_types.h
new file mode 100644
index 0000000..0e2275e
--- /dev/null
+++ b/libbcache/move_types.h
@@ -0,0 +1,4 @@
+#ifndef _BCACHE_MOVE_TYPES_H
+#define _BCACHE_MOVE_TYPES_H
+
+#endif /* _BCACHE_MOVE_TYPES_H */
diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c
new file mode 100644
index 0000000..3c85d49
--- /dev/null
+++ b/libbcache/movinggc.c
@@ -0,0 +1,298 @@
+/*
+ * Moving/copying garbage collector
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "movinggc.h"
+
+#include <trace/events/bcache.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+
+/* Moving GC - IO loop */
+
+static const struct bch_extent_ptr *moving_pred(struct cache *ca,
+ struct bkey_s_c k)
+{
+ const struct bch_extent_ptr *ptr;
+
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+
+ extent_for_each_ptr(e, ptr)
+ if ((ca->sb.nr_this_dev == ptr->dev) &&
+ PTR_BUCKET(ca, ptr)->mark.copygc)
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static int issue_moving_gc_move(struct cache *ca,
+ struct moving_context *ctxt,
+ struct bkey_s_c k)
+{
+ struct cache_set *c = ca->set;
+ const struct bch_extent_ptr *ptr;
+ int ret;
+
+ ptr = moving_pred(ca, k);
+ if (!ptr) /* We raced - bucket's been reused */
+ return 0;
+
+ ret = bch_data_move(c, ctxt, &ca->copygc_write_point, k, ptr);
+ if (!ret)
+ trace_bcache_gc_copy(k.k);
+ else
+ trace_bcache_moving_gc_alloc_fail(c, k.k->size);
+ return ret;
+}
+
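+/*
+ * Walk the extents btree and issue a move for every extent that still has a
+ * pointer into a bucket marked for copygc, throttled by the device's moving
+ * gc rate limiter and the in-flight sector limit:
+ */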
+static void read_moving(struct cache *ca, size_t buckets_to_move,
+ u64 sectors_to_move)
+{
+ struct cache_set *c = ca->set;
+ struct bucket *g;
+ struct moving_context ctxt;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 sectors_not_moved = 0;
+ size_t buckets_not_moved = 0;
+
+ bch_ratelimit_reset(&ca->moving_gc_pd.rate);
+ bch_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate,
+ SECTORS_IN_FLIGHT_PER_DEVICE);
+ bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+ while (1) {
+ if (kthread_should_stop())
+ goto out;
+ if (bch_move_ctxt_wait(&ctxt))
+ goto out;
+ k = bch_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+ if (btree_iter_err(k))
+ goto out;
+
+ if (!moving_pred(ca, k))
+ goto next;
+
+ if (issue_moving_gc_move(ca, &ctxt, k)) {
+ bch_btree_iter_unlock(&iter);
+
+ /* memory allocation failure, wait for some IO to finish */
+ bch_move_ctxt_wait_for_io(&ctxt);
+ continue;
+ }
+next:
+ bch_btree_iter_advance_pos(&iter);
+ //bch_btree_iter_cond_resched(&iter);
+
+ /* unlock before waiting in bch_move_ctxt_wait() at the top of the loop */
+ bch_btree_iter_unlock(&iter);
+ cond_resched();
+ }
+
+ bch_btree_iter_unlock(&iter);
+ bch_move_ctxt_exit(&ctxt);
+ trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
+ buckets_to_move);
+
+ /* don't check this if we bailed out early: */
+ for_each_bucket(g, ca)
+ if (g->mark.copygc && bucket_sectors_used(g)) {
+ sectors_not_moved += bucket_sectors_used(g);
+ buckets_not_moved++;
+ }
+
+ if (sectors_not_moved)
+ bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved",
+ sectors_not_moved, sectors_to_move,
+ buckets_not_moved, buckets_to_move);
+ return;
+out:
+ bch_btree_iter_unlock(&iter);
+ bch_move_ctxt_exit(&ctxt);
+ trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
+ buckets_to_move);
+}
+
+static bool have_copygc_reserve(struct cache *ca)
+{
+ bool ret;
+
+ spin_lock(&ca->freelist_lock);
+ ret = fifo_used(&ca->free[RESERVE_MOVINGGC]) >=
+ COPYGC_BUCKETS_PER_ITER(ca);
+ spin_unlock(&ca->freelist_lock);
+
+ return ret;
+}
+
+static void bch_moving_gc(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct bucket *g;
+ struct bucket_mark new;
+ u64 sectors_to_move;
+ size_t buckets_to_move, buckets_unused = 0;
+ struct bucket_heap_entry e;
+ unsigned sectors_used, i;
+ int reserve_sectors;
+
+ if (!have_copygc_reserve(ca)) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+ while (1) {
+ closure_wait(&c->freelist_wait, &cl);
+ if (have_copygc_reserve(ca))
+ break;
+ closure_sync(&cl);
+ }
+ closure_wake_up(&c->freelist_wait);
+ }
+
+ reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);
+
+ trace_bcache_moving_gc_start(ca);
+
+ /*
+ * Find buckets with lowest sector counts, skipping completely
+ * empty buckets, by building a maxheap sorted by sector count,
+ * and repeatedly replacing the maximum element until all
+ * buckets have been visited.
+ */
+
+ /*
+ * We need bucket marks to be up to date, so gc can't be recalculating
+ * them, and we don't want the allocator invalidating a bucket after
+ * we've decided to evacuate it but before we set copygc:
+ */
+ down_read(&c->gc_lock);
+ mutex_lock(&ca->heap_lock);
+ mutex_lock(&ca->set->bucket_lock);
+
+ ca->heap.used = 0;
+ for_each_bucket(g, ca) {
+ bucket_cmpxchg(g, new, new.copygc = 0);
+
+ if (bucket_unused(g)) {
+ buckets_unused++;
+ continue;
+ }
+
+ if (g->mark.owned_by_allocator ||
+ g->mark.is_metadata)
+ continue;
+
+ sectors_used = bucket_sectors_used(g);
+
+ if (sectors_used >= ca->mi.bucket_size)
+ continue;
+
+ bucket_heap_push(ca, g, sectors_used);
+ }
+
+ sectors_to_move = 0;
+ for (i = 0; i < ca->heap.used; i++)
+ sectors_to_move += ca->heap.data[i].val;
+
+ while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
+ BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp));
+ sectors_to_move -= e.val;
+ }
+
+ for (i = 0; i < ca->heap.used; i++)
+ bucket_cmpxchg(ca->heap.data[i].g, new, new.copygc = 1);
+
+ buckets_to_move = ca->heap.used;
+
+ mutex_unlock(&ca->set->bucket_lock);
+ mutex_unlock(&ca->heap_lock);
+ up_read(&c->gc_lock);
+
+ read_moving(ca, buckets_to_move, sectors_to_move);
+}
+
+static int bch_moving_gc_thread(void *arg)
+{
+ struct cache *ca = arg;
+ struct cache_set *c = ca->set;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ unsigned long last;
+ u64 available, want, next;
+
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ if (kthread_wait_freezable(c->copy_gc_enabled))
+ break;
+
+ last = atomic_long_read(&clock->now);
+ /*
+ * don't start copygc until less than half the gc reserve is
+ * available:
+ */
+ available = buckets_available_cache(ca);
+ want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
+ c->opts.gc_reserve_percent, 200);
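+ /*
+ * e.g. (illustrative numbers): with gc_reserve_percent = 10 and
+ * 10000 usable buckets, want = 500 buckets, so copygc only runs
+ * once fewer than 500 buckets are still available.
+ */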
+ if (available > want) {
+ next = last + (available - want) *
+ ca->mi.bucket_size;
+ bch_kthread_io_clock_wait(clock, next);
+ continue;
+ }
+
+ bch_moving_gc(ca);
+ }
+
+ return 0;
+}
+
+void bch_moving_init_cache(struct cache *ca)
+{
+ bch_pd_controller_init(&ca->moving_gc_pd);
+ ca->moving_gc_pd.d_term = 0;
+}
+
+int bch_moving_gc_thread_start(struct cache *ca)
+{
+ struct task_struct *t;
+
+ /* The moving gc read thread must not already be running */
+ BUG_ON(ca->moving_gc_read != NULL);
+
+ if (cache_set_init_fault("moving_gc_start"))
+ return -ENOMEM;
+
+ t = kthread_create(bch_moving_gc_thread, ca, "bch_copygc_read");
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+
+ ca->moving_gc_read = t;
+ wake_up_process(ca->moving_gc_read);
+
+ return 0;
+}
+
+void bch_moving_gc_stop(struct cache *ca)
+{
+ ca->moving_gc_pd.rate.rate = UINT_MAX;
+ bch_ratelimit_reset(&ca->moving_gc_pd.rate);
+
+ if (ca->moving_gc_read)
+ kthread_stop(ca->moving_gc_read);
+ ca->moving_gc_read = NULL;
+}
diff --git a/libbcache/movinggc.h b/libbcache/movinggc.h
new file mode 100644
index 0000000..5f15308
--- /dev/null
+++ b/libbcache/movinggc.h
@@ -0,0 +1,30 @@
+#ifndef _BCACHE_MOVINGGC_H
+#define _BCACHE_MOVINGGC_H
+
+/*
+ * We can't use the entire copygc reserve in one iteration of copygc: we may
+ * need the buckets we're freeing up to go back into the copygc reserve to make
+ * forward progress, but if the copygc reserve is full they'll be available for
+ * any allocation - and it's possible that in a given iteration, we free up most
+ * of the buckets we're going to free before we allocate most of the buckets
+ * we're going to allocate.
+ *
+ * If we only use half of the reserve per iteration, then in steady state we'll
+ * always have room in the reserve for the buckets we're going to need in the
+ * next iteration:
+ */
+#define COPYGC_BUCKETS_PER_ITER(ca) \
+ ((ca)->free[RESERVE_MOVINGGC].size / 2)
+
+/*
+ * Max sectors to move per iteration: Have to take into account internal
+ * fragmentation from the multiple write points for each generation:
+ */
+#define COPYGC_SECTORS_PER_ITER(ca) \
+ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
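+
+/*
+ * Illustrative numbers (not taken from the code): with a 128 bucket movinggc
+ * reserve and 1024 sector (512k) buckets, each iteration moves at most 64
+ * buckets' worth of data, i.e. 65536 sectors (32M).
+ */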
+
+void bch_moving_init_cache(struct cache *);
+void bch_moving_gc_stop(struct cache *);
+int bch_moving_gc_thread_start(struct cache *);
+
+#endif /* _BCACHE_MOVINGGC_H */
diff --git a/libbcache/notify.c b/libbcache/notify.c
new file mode 100644
index 0000000..e9b5568
--- /dev/null
+++ b/libbcache/notify.c
@@ -0,0 +1,133 @@
+/*
+ * Code for sending uevent notifications to user-space.
+ *
+ * Copyright 2015 Datera, Inc.
+ */
+
+#include "bcache.h"
+#include "notify.h"
+
+#include <linux/kobject.h>
+
+#define notify_var(c, format, ...) \
+({ \
+ int ret; \
+ lockdep_assert_held(&(c)->uevent_lock); \
+ ret = add_uevent_var(&(c)->uevent_env, format, ##__VA_ARGS__); \
+ WARN_ON_ONCE(ret); \
+})
+
+static void notify_get(struct cache_set *c)
+{
+ struct kobj_uevent_env *env = &c->uevent_env;
+
+ mutex_lock(&c->uevent_lock);
+ env->envp_idx = 0;
+ env->buflen = 0;
+
+ notify_var(c, "SET_UUID=%pU", c->disk_sb.user_uuid.b);
+}
+
+static void notify_get_cache(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ char buf[BDEVNAME_SIZE];
+
+ notify_get(c);
+ notify_var(c, "UUID=%pU", ca->disk_sb.sb->disk_uuid.b);
+ notify_var(c, "BLOCKDEV=%s", bdevname(ca->disk_sb.bdev, buf));
+}
+
+static void notify_put(struct cache_set *c)
+{
+ struct kobj_uevent_env *env = &c->uevent_env;
+
+ env->envp[env->envp_idx] = NULL;
+ kobject_uevent_env(&c->kobj, KOBJ_CHANGE, env->envp);
+ mutex_unlock(&c->uevent_lock);
+}
+
+void bch_notify_cache_set_read_write(struct cache_set *c)
+{
+ notify_get(c);
+ notify_var(c, "STATE=active");
+ notify_put(c);
+}
+
+void bch_notify_cache_set_read_only(struct cache_set *c)
+{
+ notify_get(c);
+ notify_var(c, "STATE=readonly");
+ notify_put(c);
+}
+
+void bch_notify_cache_set_stopped(struct cache_set *c)
+{
+ notify_get(c);
+ notify_var(c, "STATE=stopped");
+ notify_put(c);
+}
+
+void bch_notify_cache_read_write(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=active");
+ notify_put(c);
+}
+
+void bch_notify_cache_read_only(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=readonly");
+ notify_put(c);
+}
+
+void bch_notify_cache_added(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=added");
+ notify_put(c);
+}
+
+void bch_notify_cache_removing(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=removing");
+ notify_put(c);
+}
+
+void bch_notify_cache_remove_failed(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=remove_failed");
+ notify_put(c);
+}
+
+void bch_notify_cache_removed(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=removed");
+ notify_put(c);
+}
+
+void bch_notify_cache_error(struct cache *ca, bool fatal)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=error");
+ notify_var(c, "FATAL=%d", fatal);
+ notify_put(c);
+}
diff --git a/libbcache/notify.h b/libbcache/notify.h
new file mode 100644
index 0000000..80d6587
--- /dev/null
+++ b/libbcache/notify.h
@@ -0,0 +1,22 @@
+/*
+ * Code for sending uevent notifications to user-space.
+ *
+ * Copyright 2015 Datera, Inc.
+ */
+
+#ifndef _NOTIFY_H
+#define _NOTIFY_H
+
+void bch_notify_cache_set_read_write(struct cache_set *);
+void bch_notify_cache_set_read_only(struct cache_set *);
+void bch_notify_cache_set_stopped(struct cache_set *);
+
+void bch_notify_cache_read_write(struct cache *);
+void bch_notify_cache_read_only(struct cache *);
+void bch_notify_cache_added(struct cache *);
+void bch_notify_cache_removing(struct cache *);
+void bch_notify_cache_removed(struct cache *);
+void bch_notify_cache_remove_failed(struct cache *);
+void bch_notify_cache_error(struct cache *, bool);
+
+#endif /* _NOTIFY_H */
diff --git a/libbcache/opts.c b/libbcache/opts.c
new file mode 100644
index 0000000..249dd5d
--- /dev/null
+++ b/libbcache/opts.c
@@ -0,0 +1,179 @@
+
+#include <linux/kernel.h>
+
+#include "opts.h"
+#include "util.h"
+
+const char * const bch_bool_opt[] = {
+ "0",
+ "1",
+ NULL
+};
+
+const char * const bch_uint_opt[] = {
+ NULL
+};
+
+const char * const bch_error_actions[] = {
+ "continue",
+ "remount-ro",
+ "panic",
+ NULL
+};
+
+const char * const bch_csum_types[] = {
+ "none",
+ "crc32c",
+ "crc64",
+ NULL
+};
+
+const char * const bch_compression_types[] = {
+ "none",
+ "lz4",
+ "gzip",
+ NULL
+};
+
+const char * const bch_str_hash_types[] = {
+ "crc32c",
+ "crc64",
+ "siphash",
+ "sha1",
+ NULL
+};
+
+enum bch_opts {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ Opt_##_name,
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+ Opt_bad_opt,
+};
+
+struct bch_option {
+ const char *name;
+ const char * const *opts;
+ unsigned long min, max;
+};
+
+struct bch_opt_result {
+ enum bch_opts opt;
+ unsigned val;
+};
+
+static int parse_bool_opt(const struct bch_option *opt, const char *s)
+{
+ if (!strcmp(opt->name, s))
+ return true;
+
+ if (!strncmp("no", s, 2) && !strcmp(opt->name, s + 2))
+ return false;
+
+ return -1;
+}
+
+static int parse_uint_opt(const struct bch_option *opt, const char *s)
+{
+ unsigned long v;
+ int ret;
+
+ if (strncmp(opt->name, s, strlen(opt->name)))
+ return -1;
+
+ s += strlen(opt->name);
+
+ if (*s != '=')
+ return -1;
+
+ s++;
+
+ ret = kstrtoul(s, 10, &v);
+ if (ret)
+ return ret;
+
+ if (v < opt->min || v >= opt->max)
+ return -ERANGE;
+
+ return v;
+}
+
+static int parse_string_opt(const struct bch_option *opt, const char *s)
+{
+ if (strncmp(opt->name, s, strlen(opt->name)))
+ return -1;
+
+ s += strlen(opt->name);
+
+ if (*s != '=')
+ return -1;
+
+ s++;
+
+ return bch_read_string_list(s, opt->opts);
+}
+
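+/*
+ * Parse a single mount option: each entry of the table generated from
+ * CACHE_SET_VISIBLE_OPTS() is tried in turn, dispatching to the bool, uint or
+ * string list parser based on the option's choices array; the first parser
+ * that succeeds determines the option index and its value.
+ */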
+static struct bch_opt_result parse_one_opt(const char *opt)
+{
+ static const struct bch_option opt_table[] = {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ [Opt_##_name] = { \
+ .name = #_name, \
+ .opts = _choices, \
+ .min = _min, \
+ .max = _max, \
+ },
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+ }, *i;
+
+ for (i = opt_table;
+ i < opt_table + ARRAY_SIZE(opt_table);
+ i++) {
+ int res = i->opts == bch_bool_opt ? parse_bool_opt(i, opt)
+ : i->opts == bch_uint_opt ? parse_uint_opt(i, opt)
+ : parse_string_opt(i, opt);
+
+ if (res >= 0)
+ return (struct bch_opt_result) {
+ i - opt_table, res
+ };
+ }
+
+ return (struct bch_opt_result) { Opt_bad_opt };
+}
+
+int bch_parse_options(struct cache_set_opts *opts, int flags, char *options)
+{
+ char *p;
+
+ *opts = cache_set_opts_empty();
+
+ opts->read_only = (flags & MS_RDONLY) != 0;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ struct bch_opt_result res = parse_one_opt(p);
+
+ switch (res.opt) {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ case Opt_##_name: \
+ opts->_name = res.val; \
+ break;
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+ case Opt_bad_opt:
+ return -EINVAL;
+ default:
+ BUG();
+ }
+ }
+
+ return 0;
+}
diff --git a/libbcache/opts.h b/libbcache/opts.h
new file mode 100644
index 0000000..1d19ac6
--- /dev/null
+++ b/libbcache/opts.h
@@ -0,0 +1,100 @@
+#ifndef _BCACHE_OPTS_H
+#define _BCACHE_OPTS_H
+
+#include <linux/bcache.h>
+#include <linux/bug.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+
+/*
+ * Mount options; we also store defaults in the superblock.
+ *
+ * Also exposed via sysfs: if an option is writeable, and it's also stored in
+ * the superblock, changing it via sysfs (currently? might change this) also
+ * updates the superblock.
+ *
+ * We store options as signed integers, where -1 means undefined. This means we
+ * can pass the mount options to cache_set_alloc() as a whole struct, and then
+ * only apply the options from that struct that are defined.
+ */
+
+extern const char * const bch_bool_opt[];
+extern const char * const bch_uint_opt[];
+extern const char * const bch_error_actions[];
+extern const char * const bch_csum_types[];
+extern const char * const bch_compression_types[];
+extern const char * const bch_str_hash_types[];
+
+/* dummy option, for options that aren't stored in the superblock */
+LE64_BITMASK(NO_SB_OPT, struct cache_sb, flags, 0, 0);
+
+#define CACHE_SET_VISIBLE_OPTS() \
+ CACHE_SET_OPT(verbose_recovery, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, false) \
+ CACHE_SET_OPT(posix_acl, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, false) \
+ CACHE_SET_OPT(journal_flush_disabled, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, true) \
+ CACHE_SET_OPT(nofsck, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, true) \
+ CACHE_SET_OPT(fix_errors, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, true) \
+ CACHE_SET_SB_OPTS()
+
+#define CACHE_SET_OPTS() \
+ CACHE_SET_OPT(read_only, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, 0) \
+ CACHE_SET_VISIBLE_OPTS()
+
+struct cache_set_opts {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+ s8 _name;
+
+ CACHE_SET_OPTS()
+#undef CACHE_SET_OPT
+};
+
+static inline struct cache_set_opts cache_set_opts_empty(void)
+{
+ struct cache_set_opts ret;
+
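+ /* 0xff in every byte sets each s8 option to -1, i.e. undefined: */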
+ memset(&ret, 255, sizeof(ret));
+ return ret;
+}
+
+/*
+ * Initial options from superblock - here we don't want any options undefined,
+ * any options the superblock doesn't specify are set to 0:
+ */
+static inline struct cache_set_opts cache_superblock_opts(struct cache_sb *sb)
+{
+ return (struct cache_set_opts) {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+ ._name = _sb_opt##_BITS ? _sb_opt(sb) : 0,
+
+ CACHE_SET_OPTS()
+#undef CACHE_SET_OPT
+ };
+}
+
+static inline void cache_set_opts_apply(struct cache_set_opts *dst,
+ struct cache_set_opts src)
+{
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+ BUILD_BUG_ON(_max > S8_MAX); \
+ if (src._name >= 0) \
+ dst->_name = src._name;
+
+ CACHE_SET_OPTS()
+#undef CACHE_SET_OPT
+}
+
+int bch_parse_options(struct cache_set_opts *, int, char *);
+
+#endif /* _BCACHE_OPTS_H */
diff --git a/libbcache/request.c b/libbcache/request.c
new file mode 100644
index 0000000..b41d472
--- /dev/null
+++ b/libbcache/request.c
@@ -0,0 +1,825 @@
+/*
+ * Handle a read or a write request and decide what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ *
+ * Main pieces here:
+ *
+ * 1) Data insert path, via bch_data_insert() -- writes data to cache and
+ * updates extents btree
+ * 2) Read path, via bch_read() -- for now only used by bcachefs and ioctl
+ * interface
+ * 3) Read path, via cache_lookup() and struct search -- used by block device
+ * make_request functions
+ * 4) Cache promotion -- used by bch_read() and cache_lookup() to copy data to
+ * the cache, either from a backing device or a cache device in a higher tier
+ *
+ * One tricky thing that comes up is a race condition where a bucket may be
+ * re-used while reads from it are still in flight. To guard against this, we
+ * save the ptr that is being read and check if it is stale once the read
+ * completes. If the ptr is stale, the read is retried.
+ *
+ * #2 and #3 will be unified further in the future.
+ */
+
+#include "bcache.h"
+#include "blockdev.h"
+#include "btree_update.h"
+#include "btree_iter.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "keybuf.h"
+#include "request.h"
+#include "writeback.h"
+#include "stats.h"
+
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/random.h>
+#include <linux/backing-dev.h>
+
+#include <trace/events/bcache.h>
+
+#define CUTOFF_CACHE_ADD 10
+#define CUTOFF_CACHE_READA 15
+
+/* Congested? */
+
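+/*
+ * Returns 0 if the cache isn't congested; otherwise a sector count (with a
+ * little random jitter) that check_should_bypass() compares against a
+ * request's sequential length when deciding whether to bypass the cache:
+ */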
+unsigned bch_get_congested(struct cache_set *c)
+{
+ int i;
+ long rand;
+
+ if (!c->congested_read_threshold_us &&
+ !c->congested_write_threshold_us)
+ return 0;
+
+ i = (local_clock_us() - c->congested_last_us) / 1024;
+ if (i < 0)
+ return 0;
+
+ i += atomic_read(&c->congested);
+ if (i >= 0)
+ return 0;
+
+ i += CONGESTED_MAX;
+
+ if (i > 0)
+ i = fract_exp_two(i, 6);
+
+ rand = get_random_int();
+ i -= bitmap_weight(&rand, BITS_PER_LONG);
+
+ return i > 0 ? i : 1;
+}
+
+static void add_sequential(struct task_struct *t)
+{
+ t->sequential_io_avg = ewma_add(t->sequential_io_avg,
+ t->sequential_io, 3);
+ t->sequential_io = 0;
+}
+
+static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
+{
+ return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
+}
+
+static bool check_should_bypass(struct cached_dev *dc, struct bio *bio, int rw)
+{
+ struct cache_set *c = dc->disk.c;
+ unsigned mode = BDEV_CACHE_MODE(dc->disk_sb.sb);
+ unsigned sectors, congested = bch_get_congested(c);
+ struct task_struct *task = current;
+ struct io *i;
+
+ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
+ sectors_available(c) * 100 < c->capacity * CUTOFF_CACHE_ADD ||
+ (bio_op(bio) == REQ_OP_DISCARD))
+ goto skip;
+
+ if (mode == CACHE_MODE_NONE ||
+ (mode == CACHE_MODE_WRITEAROUND &&
+ op_is_write(bio_op(bio))))
+ goto skip;
+
+ if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
+ bio_sectors(bio) & (c->sb.block_size - 1)) {
+ pr_debug("skipping unaligned io");
+ goto skip;
+ }
+
+ if (bypass_torture_test(dc)) {
+ if ((get_random_int() & 3) == 3)
+ goto skip;
+ else
+ goto rescale;
+ }
+
+ if (!congested && !dc->sequential_cutoff)
+ goto rescale;
+
+ if (!congested &&
+ mode == CACHE_MODE_WRITEBACK &&
+ op_is_write(bio_op(bio)) &&
+ (bio->bi_opf & REQ_SYNC))
+ goto rescale;
+
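+ /*
+ * Sequential IO detection: look for a recently issued IO that ended at
+ * this bio's start sector; if found, extend that stream's length,
+ * otherwise recycle the least recently used tracking entry.
+ */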
+ spin_lock(&dc->io_lock);
+
+ hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
+ if (i->last == bio->bi_iter.bi_sector &&
+ time_before(jiffies, i->last_io))
+ goto found;
+
+ i = list_first_entry(&dc->io_lru, struct io, lru);
+
+ add_sequential(task);
+ i->sequential = 0;
+found:
+ if (i->sequential + bio->bi_iter.bi_size > i->sequential)
+ i->sequential += bio->bi_iter.bi_size;
+
+ i->last = bio_end_sector(bio);
+ i->last_io = jiffies + msecs_to_jiffies(5000);
+ task->sequential_io = i->sequential;
+
+ hlist_del(&i->hash);
+ hlist_add_head(&i->hash, iohash(dc, i->last));
+ list_move_tail(&i->lru, &dc->io_lru);
+
+ spin_unlock(&dc->io_lock);
+
+ sectors = max(task->sequential_io,
+ task->sequential_io_avg) >> 9;
+
+ if (dc->sequential_cutoff &&
+ sectors >= dc->sequential_cutoff >> 9) {
+ trace_bcache_bypass_sequential(bio);
+ goto skip;
+ }
+
+ if (congested && sectors >= congested) {
+ trace_bcache_bypass_congested(bio);
+ goto skip;
+ }
+
+rescale:
+ return false;
+skip:
+ bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
+ return true;
+}
+
+/* Common code for the make_request functions */
+
+/**
+ * request_endio - endio function for backing device bios
+ */
+static void request_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+
+ if (bio->bi_error) {
+ struct search *s = container_of(cl, struct search, cl);
+ s->iop.error = bio->bi_error;
+ /* Only cache read errors are recoverable */
+ s->recoverable = false;
+ }
+
+ bio_put(bio);
+ closure_put(cl);
+}
+
+static void bio_complete(struct search *s)
+{
+ if (s->orig_bio) {
+ generic_end_io_acct(bio_data_dir(s->orig_bio),
+ &s->d->disk->part0, s->start_time);
+
+ trace_bcache_request_end(s->d, s->orig_bio);
+ s->orig_bio->bi_error = s->iop.error;
+ bio_endio(s->orig_bio);
+ s->orig_bio = NULL;
+ }
+}
+
+static void do_bio_hook(struct search *s, struct bio *orig_bio)
+{
+ int rw = bio_data_dir(orig_bio);
+ struct bio *bio = rw ? &s->wbio.bio : &s->rbio.bio;
+
+ bio_init(bio);
+ __bio_clone_fast(bio, orig_bio);
+ bio->bi_end_io = request_endio;
+ bio->bi_private = &s->cl;
+
+ bio_cnt_set(bio, 3);
+}
+
+static void search_free(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+
+ bio_complete(s);
+
+ if (s->iop.bio)
+ bio_put(&s->iop.bio->bio);
+
+ closure_debug_destroy(cl);
+ mempool_free(s, &s->d->c->search);
+}
+
+static inline struct search *search_alloc(struct bio *bio,
+ struct bcache_device *d)
+{
+ struct search *s;
+
+ s = mempool_alloc(&d->c->search, GFP_NOIO);
+
+ closure_init(&s->cl, NULL);
+ do_bio_hook(s, bio);
+
+ s->orig_bio = bio;
+ s->d = d;
+ s->recoverable = 1;
+ s->bypass = 0;
+ s->write = op_is_write(bio_op(bio));
+ s->read_dirty_data = 0;
+ s->cache_miss = 0;
+ s->start_time = jiffies;
+ s->inode = bcache_dev_inum(d);
+
+ s->iop.c = d->c;
+ s->iop.bio = NULL;
+ s->iop.error = 0;
+
+ return s;
+}
+
+/* Cached devices */
+
+static void cached_dev_bio_complete(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ search_free(cl);
+ cached_dev_put(dc);
+}
+
+/* Process reads */
+
+static void cached_dev_read_error(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct bio *bio = &s->rbio.bio;
+
+ if (s->recoverable) {
+ /* Read bucket invalidate races are handled here, also plain
+ * old IO errors from the cache that can be retried from the
+ * backing device (reads of clean data) */
+ trace_bcache_read_retry(s->orig_bio);
+
+ s->iop.error = 0;
+ do_bio_hook(s, s->orig_bio);
+
+ /* XXX: invalidate cache, don't count twice */
+
+ closure_bio_submit(bio, cl);
+ }
+
+ continue_at(cl, cached_dev_bio_complete, NULL);
+}
+
+static void cached_dev_read_done(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ if (dc->verify && s->recoverable && !s->read_dirty_data)
+ bch_data_verify(dc, s->orig_bio);
+
+ continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
+}
+
+static void cached_dev_read_done_bh(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ bch_mark_cache_accounting(s->iop.c, dc, !s->cache_miss, s->bypass);
+ trace_bcache_read(s->orig_bio, !s->cache_miss, s->bypass);
+
+ if (s->iop.error)
+ continue_at_nobarrier(cl, cached_dev_read_error, s->iop.c->wq);
+ else if (dc->verify)
+ continue_at_nobarrier(cl, cached_dev_read_done, s->iop.c->wq);
+ else
+ continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
+}
+
+/**
+ * __cache_promote -- insert result of read bio into cache
+ *
+ * Used for backing devices and flash-only volumes.
+ *
+ * @orig_bio must actually be a bbio with a valid key.
+ */
+void __cache_promote(struct cache_set *c, struct bch_read_bio *orig_bio,
+ struct bkey_s_c old,
+ struct bkey_s_c new,
+ unsigned write_flags)
+{
+#if 0
+ struct cache_promote_op *op;
+ struct bio *bio;
+ unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
+
+ /* XXX: readahead? */
+
+ op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+ if (!op)
+ goto out_submit;
+
+ /* clone the bbio */
+ memcpy(&op->bio, orig_bio, offsetof(struct bbio, bio));
+
+ bio = &op->bio.bio.bio;
+ bio_init(bio);
+ bio_get(bio);
+ bio->bi_bdev = orig_bio->bio.bi_bdev;
+ bio->bi_iter.bi_sector = orig_bio->bio.bi_iter.bi_sector;
+ bio->bi_iter.bi_size = orig_bio->bio.bi_iter.bi_size;
+ bio->bi_end_io = cache_promote_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_io_vec = bio->bi_inline_vecs;
+ bch_bio_map(bio, NULL);
+
+ if (bio_alloc_pages(bio, __GFP_NOWARN|GFP_NOIO))
+ goto out_free;
+
+ orig_bio->ca = NULL;
+
+ closure_init(&op->cl, &c->cl);
+ op->orig_bio = &orig_bio->bio;
+ op->stale = 0;
+
+ bch_write_op_init(&op->iop, c, &op->bio, &c->promote_write_point,
+ new, old,
+ BCH_WRITE_ALLOC_NOWAIT|write_flags);
+ op->iop.nr_replicas = 1;
+
+ //bch_cut_front(bkey_start_pos(&orig_bio->key.k), &op->iop.insert_key);
+ //bch_cut_back(orig_bio->key.k.p, &op->iop.insert_key.k);
+
+ trace_bcache_promote(&orig_bio->bio);
+
+ op->bio.bio.submit_time_us = local_clock_us();
+ closure_bio_submit(bio, &op->cl);
+
+ continue_at(&op->cl, cache_promote_write, c->wq);
+out_free:
+ kfree(op);
+out_submit:
+ generic_make_request(&orig_bio->bio);
+#endif
+}
+
+/**
+ * cached_dev_cache_miss - populate cache with data from backing device
+ *
+ * We don't write to the cache if s->bypass is set.
+ */
+static int cached_dev_cache_miss(struct btree_iter *iter, struct search *s,
+ struct bio *bio, unsigned sectors)
+{
+ int ret;
+ unsigned reada = 0;
+ struct bio *miss;
+ BKEY_PADDED(key) replace;
+
+ s->cache_miss = 1;
+
+ if (s->bypass)
+ goto nopromote;
+#if 0
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ /* XXX: broken */
+ if (!(bio->bi_opf & REQ_RAHEAD) &&
+ !(bio->bi_opf & REQ_META) &&
+ ((u64) sectors_available(dc->disk.c) * 100 <
+ (u64) iter->c->capacity * CUTOFF_CACHE_READA))
+ reada = min_t(sector_t, dc->readahead >> 9,
+ bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
+#endif
+ sectors = min(sectors, bio_sectors(bio) + reada);
+
+ replace.key.k = KEY(s->inode,
+ bio->bi_iter.bi_sector + sectors,
+ sectors);
+
+ ret = bch_btree_insert_check_key(iter, &replace.key);
+ if (ret == -EINTR)
+ return ret;
+
+ miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
+
+ miss->bi_end_io = request_endio;
+ miss->bi_private = &s->cl;
+
+ //to_bbio(miss)->key.k = KEY(s->inode,
+ // bio_end_sector(miss),
+ // bio_sectors(miss));
+ to_rbio(miss)->ca = NULL;
+
+ closure_get(&s->cl);
+ __cache_promote(s->iop.c, to_rbio(miss),
+ bkey_i_to_s_c(&replace.key),
+ bkey_to_s_c(&KEY(replace.key.k.p.inode,
+ replace.key.k.p.offset,
+ replace.key.k.size)),
+ BCH_WRITE_CACHED);
+
+ return 0;
+nopromote:
+ miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
+
+ miss->bi_end_io = request_endio;
+ miss->bi_private = &s->cl;
+ closure_bio_submit(miss, &s->cl);
+
+ return 0;
+}
+
+static void cached_dev_read(struct cached_dev *dc, struct search *s)
+{
+ struct cache_set *c = s->iop.c;
+ struct closure *cl = &s->cl;
+ struct bio *bio = &s->rbio.bio;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch_increment_clock(c, bio_sectors(bio), READ);
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
+ POS(s->inode, bio->bi_iter.bi_sector), k) {
+ BKEY_PADDED(k) tmp;
+ struct extent_pick_ptr pick;
+ unsigned sectors, bytes;
+ bool is_last;
+retry:
+ bkey_reassemble(&tmp.k, k);
+ bch_btree_iter_unlock(&iter);
+ k = bkey_i_to_s_c(&tmp.k);
+
+ bch_extent_pick_ptr(c, k, &pick);
+ if (IS_ERR(pick.ca)) {
+ bcache_io_error(c, bio, "no device to read from");
+ goto out;
+ }
+
+ sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
+ bio->bi_iter.bi_sector;
+ bytes = sectors << 9;
+ is_last = bytes == bio->bi_iter.bi_size;
+ swap(bio->bi_iter.bi_size, bytes);
+
+ if (pick.ca) {
+ PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
+ c->prio_clock[READ].hand;
+
+ if (!bkey_extent_is_cached(k.k))
+ s->read_dirty_data = true;
+
+ bch_read_extent(c, &s->rbio, k, &pick,
+ BCH_READ_FORCE_BOUNCE|
+ BCH_READ_RETRY_IF_STALE|
+ (!s->bypass ? BCH_READ_PROMOTE : 0)|
+ (is_last ? BCH_READ_IS_LAST : 0));
+ } else {
+ /* not present (hole), or stale cached data */
+ if (cached_dev_cache_miss(&iter, s, bio, sectors)) {
+ k = bch_btree_iter_peek_with_holes(&iter);
+ if (btree_iter_err(k))
+ break;
+ goto retry;
+ }
+ }
+
+ swap(bio->bi_iter.bi_size, bytes);
+ bio_advance(bio, bytes);
+
+ if (is_last) {
+ bch_btree_iter_unlock(&iter);
+ goto out;
+ }
+ }
+
+ /*
+ * If we get here, it better have been because there was an error
+ * reading a btree node
+ */
+ ret = bch_btree_iter_unlock(&iter);
+ BUG_ON(!ret);
+ bcache_io_error(c, bio, "btree IO error %i", ret);
+out:
+ continue_at(cl, cached_dev_read_done_bh, NULL);
+}
+
+/* Process writes */
+
+static void cached_dev_write_complete(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ up_read_non_owner(&dc->writeback_lock);
+ cached_dev_bio_complete(cl);
+}
+
+static void cached_dev_write(struct cached_dev *dc, struct search *s)
+{
+ struct closure *cl = &s->cl;
+ struct bio *bio = &s->wbio.bio;
+ bool writeback = false;
+ bool bypass = s->bypass;
+ struct bkey insert_key = KEY(s->inode,
+ bio_end_sector(bio),
+ bio_sectors(bio));
+ unsigned flags = BCH_WRITE_DISCARD_ON_ERROR;
+
+ down_read_non_owner(&dc->writeback_lock);
+ if (bch_keybuf_check_overlapping(&dc->writeback_keys,
+ bkey_start_pos(&insert_key),
+ insert_key.p)) {
+ /*
+ * We overlap with some dirty data undergoing background
+ * writeback, force this write to writeback
+ */
+ bypass = false;
+ writeback = true;
+ }
+
+ /*
+ * Discards aren't _required_ to do anything, so skipping if
+ * check_overlapping returned true is ok
+ *
+ * But check_overlapping drops dirty keys for which io hasn't started,
+ * so we still want to call it.
+ */
+ if (bio_op(bio) == REQ_OP_DISCARD)
+ bypass = true;
+
+ if (should_writeback(dc, bio, BDEV_CACHE_MODE(dc->disk_sb.sb),
+ bypass)) {
+ bypass = false;
+ writeback = true;
+ }
+
+ if (bypass) {
+ /*
+ * If this is a bypass-write (as opposed to a discard), send
+ * it down to the backing device. If this is a discard, only
+ * send it to the backing device if the backing device
+ * supports discards. Otherwise, we simply discard the key
+ * range from the cache and don't touch the backing device.
+ */
+ if ((bio_op(bio) != REQ_OP_DISCARD) ||
+ blk_queue_discard(bdev_get_queue(dc->disk_sb.bdev)))
+ closure_bio_submit(s->orig_bio, cl);
+ } else if (writeback) {
+ bch_writeback_add(dc);
+
+ if (bio->bi_opf & REQ_PREFLUSH) {
+ /* Also need to send a flush to the backing device */
+ struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
+ &dc->disk.bio_split);
+
+ flush->bi_bdev = bio->bi_bdev;
+ flush->bi_end_io = request_endio;
+ flush->bi_private = cl;
+ bio_set_op_attrs(flush, REQ_OP_WRITE, WRITE_FLUSH);
+
+ closure_bio_submit(flush, cl);
+ }
+ } else {
+ struct bio *writethrough =
+ bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split);
+
+ closure_bio_submit(writethrough, cl);
+
+ flags |= BCH_WRITE_CACHED;
+ flags |= BCH_WRITE_ALLOC_NOWAIT;
+ }
+
+ if (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
+ flags |= BCH_WRITE_FLUSH;
+ if (bypass)
+ flags |= BCH_WRITE_DISCARD;
+
+ bch_write_op_init(&s->iop, dc->disk.c, &s->wbio,
+ (struct disk_reservation) { 0 },
+ foreground_write_point(dc->disk.c,
+ (unsigned long) current),
+ bkey_start_pos(&insert_key),
+ NULL, flags);
+
+ closure_call(&s->iop.cl, bch_write, NULL, cl);
+ continue_at(cl, cached_dev_write_complete, NULL);
+}
+
+/* Cached devices - read & write stuff */
+
+static void __cached_dev_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct search *s;
+ struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ int rw = bio_data_dir(bio);
+
+ generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
+
+ bio->bi_bdev = dc->disk_sb.bdev;
+ bio->bi_iter.bi_sector += le64_to_cpu(dc->disk_sb.sb->data_offset);
+
+ if (cached_dev_get(dc)) {
+ struct bio *clone;
+
+ s = search_alloc(bio, d);
+ trace_bcache_request_start(s->d, bio);
+
+ clone = rw ? &s->wbio.bio : &s->rbio.bio;
+
+ if (!bio->bi_iter.bi_size) {
+ if (s->orig_bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
+ bch_journal_flush_async(&s->iop.c->journal,
+ &s->cl);
+
+ /*
+ * If it's a flush, we send the flush to the backing
+ * device too
+ */
+ closure_bio_submit(clone, &s->cl);
+
+ continue_at(&s->cl, cached_dev_bio_complete, NULL);
+ } else {
+ s->bypass = check_should_bypass(dc, bio, rw);
+
+ if (rw)
+ cached_dev_write(dc, s);
+ else
+ cached_dev_read(dc, s);
+ }
+ } else {
+ if ((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(dc->disk_sb.bdev)))
+ bio_endio(bio);
+ else
+ generic_make_request(bio);
+ }
+}
+
+static blk_qc_t cached_dev_make_request(struct request_queue *q,
+ struct bio *bio)
+{
+ __cached_dev_make_request(q, bio);
+ return BLK_QC_T_NONE;
+}
+
+static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ return __blkdev_driver_ioctl(dc->disk_sb.bdev, mode, cmd, arg);
+}
+
+static int cached_dev_congested(void *data, int bits)
+{
+ struct bcache_device *d = data;
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ struct request_queue *q = bdev_get_queue(dc->disk_sb.bdev);
+ int ret = 0;
+
+ if (bdi_congested(&q->backing_dev_info, bits))
+ return 1;
+
+ if (cached_dev_get(dc)) {
+ unsigned i;
+ struct cache *ca;
+
+ for_each_cache(ca, d->c, i) {
+ q = bdev_get_queue(ca->disk_sb.bdev);
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+
+ cached_dev_put(dc);
+ }
+
+ return ret;
+}
+
+void bch_cached_dev_request_init(struct cached_dev *dc)
+{
+ struct gendisk *g = dc->disk.disk;
+
+ g->queue->make_request_fn = cached_dev_make_request;
+ g->queue->backing_dev_info.congested_fn = cached_dev_congested;
+ dc->disk.ioctl = cached_dev_ioctl;
+}
+
+/* Blockdev volumes */
+
+static void __blockdev_volume_make_request(struct request_queue *q,
+ struct bio *bio)
+{
+ struct search *s;
+ struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+ int rw = bio_data_dir(bio);
+
+ generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
+
+ trace_bcache_request_start(d, bio);
+
+ s = search_alloc(bio, d);
+
+ if (!bio->bi_iter.bi_size) {
+ if (s->orig_bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
+ bch_journal_flush_async(&s->iop.c->journal,
+ &s->cl);
+
+ continue_at(&s->cl, search_free, NULL);
+ } else if (rw) {
+ struct disk_reservation res = { 0 };
+ unsigned flags = 0;
+
+ if (bio_op(bio) != REQ_OP_DISCARD &&
+ bch_disk_reservation_get(d->c, &res, bio_sectors(bio), 0)) {
+ s->iop.error = -ENOSPC;
+ continue_at(&s->cl, search_free, NULL);
+ return;
+ }
+
+ if (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
+ flags |= BCH_WRITE_FLUSH;
+ if (bio_op(bio) == REQ_OP_DISCARD)
+ flags |= BCH_WRITE_DISCARD;
+
+ bch_write_op_init(&s->iop, d->c, &s->wbio, res,
+ foreground_write_point(d->c,
+ (unsigned long) current),
+ POS(s->inode, bio->bi_iter.bi_sector),
+ NULL, flags);
+
+ closure_call(&s->iop.cl, bch_write, NULL, &s->cl);
+ } else {
+ closure_get(&s->cl);
+ bch_read(d->c, &s->rbio, bcache_dev_inum(d));
+ }
+ continue_at(&s->cl, search_free, NULL);
+}
+
+static blk_qc_t blockdev_volume_make_request(struct request_queue *q,
+ struct bio *bio)
+{
+ __blockdev_volume_make_request(q, bio);
+ return BLK_QC_T_NONE;
+}
+
+static int blockdev_volume_ioctl(struct bcache_device *d, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ return -ENOTTY;
+}
+
+static int blockdev_volume_congested(void *data, int bits)
+{
+ struct bcache_device *d = data;
+ struct request_queue *q;
+ struct cache *ca;
+ unsigned i;
+ int ret = 0;
+
+ for_each_cache(ca, d->c, i) {
+ q = bdev_get_queue(ca->disk_sb.bdev);
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+
+ return ret;
+}
+
+void bch_blockdev_volume_request_init(struct bcache_device *d)
+{
+ struct gendisk *g = d->disk;
+
+ g->queue->make_request_fn = blockdev_volume_make_request;
+ g->queue->backing_dev_info.congested_fn = blockdev_volume_congested;
+ d->ioctl = blockdev_volume_ioctl;
+}
diff --git a/libbcache/request.h b/libbcache/request.h
new file mode 100644
index 0000000..cd3fe12
--- /dev/null
+++ b/libbcache/request.h
@@ -0,0 +1,16 @@
+#ifndef _BCACHE_REQUEST_H_
+#define _BCACHE_REQUEST_H_
+
+#include "stats.h"
+
+struct cache_set;
+struct cached_dev;
+struct bcache_device;
+struct kmem_cache;
+
+unsigned bch_get_congested(struct cache_set *);
+
+void bch_cached_dev_request_init(struct cached_dev *dc);
+void bch_blockdev_volume_request_init(struct bcache_device *d);
+
+#endif /* _BCACHE_REQUEST_H_ */
diff --git a/libbcache/siphash.c b/libbcache/siphash.c
new file mode 100644
index 0000000..5ba80b5
--- /dev/null
+++ b/libbcache/siphash.c
@@ -0,0 +1,185 @@
+/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
+
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
+ * are the number of compression rounds and the number of finalization rounds.
+ * A compression round is identical to a finalization round and this round
+ * function is called SipRound. Given a 128-bit key k and a (possibly empty)
+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
+ *
+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
+ * by Jean-Philippe Aumasson and Daniel J. Bernstein,
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
+ * https://131002.net/siphash/siphash.pdf
+ * https://131002.net/siphash/
+ */
+
+//#include <sys/param.h>
+//#include <sys/systm.h>
+
+#include <asm/byteorder.h>
+#include <linux/string.h>
+
+#include "siphash.h"
+
+static void SipHash_CRounds(SIPHASH_CTX *, int);
+static void SipHash_Rounds(SIPHASH_CTX *, int);
+
+void
+SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+{
+ u64 k0, k1;
+
+ k0 = le64_to_cpu(key->k0);
+ k1 = le64_to_cpu(key->k1);
+
+ ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
+ ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
+ ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
+ ctx->v[3] = 0x7465646279746573ULL ^ k1;
+
+ memset(ctx->buf, 0, sizeof(ctx->buf));
+ ctx->bytes = 0;
+}
+
+void
+SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
+{
+ const u8 *ptr = src;
+ size_t left, used;
+
+ if (len == 0)
+ return;
+
+ used = ctx->bytes % sizeof(ctx->buf);
+ ctx->bytes += len;
+
+ if (used > 0) {
+ left = sizeof(ctx->buf) - used;
+
+ if (len >= left) {
+ memcpy(&ctx->buf[used], ptr, left);
+ SipHash_CRounds(ctx, rc);
+ len -= left;
+ ptr += left;
+ /* buffer flushed: any remaining tail bytes start at offset 0 */
+ used = 0;
+ } else {
+ memcpy(&ctx->buf[used], ptr, len);
+ return;
+ }
+ }
+
+ while (len >= sizeof(ctx->buf)) {
+ memcpy(ctx->buf, ptr, sizeof(ctx->buf));
+ SipHash_CRounds(ctx, rc);
+ len -= sizeof(ctx->buf);
+ ptr += sizeof(ctx->buf);
+ }
+
+ if (len > 0)
+ memcpy(&ctx->buf[used], ptr, len);
+}
+
+void
+SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+{
+ u64 r;
+
+ r = SipHash_End(ctx, rc, rf);
+
+ *((__le64 *) dst) = cpu_to_le64(r);
+}
+
+u64
+SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+{
+ u64 r;
+ size_t left, used;
+
+ used = ctx->bytes % sizeof(ctx->buf);
+ left = sizeof(ctx->buf) - used;
+ memset(&ctx->buf[used], 0, left - 1);
+ ctx->buf[7] = ctx->bytes;
+
+ SipHash_CRounds(ctx, rc);
+ ctx->v[2] ^= 0xff;
+ SipHash_Rounds(ctx, rf);
+
+ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
+ memset(ctx, 0, sizeof(*ctx));
+ return (r);
+}
+
+u64
+SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+{
+ SIPHASH_CTX ctx;
+
+ SipHash_Init(&ctx, key);
+ SipHash_Update(&ctx, rc, rf, src, len);
+ return (SipHash_End(&ctx, rc, rf));
+}
+
+#define SIP_ROTL(x, b) (((x) << (b)) | ((x) >> (64 - (b))))
+
+static void
+SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+ while (rounds--) {
+ ctx->v[0] += ctx->v[1];
+ ctx->v[2] += ctx->v[3];
+ ctx->v[1] = SIP_ROTL(ctx->v[1], 13);
+ ctx->v[3] = SIP_ROTL(ctx->v[3], 16);
+
+ ctx->v[1] ^= ctx->v[0];
+ ctx->v[3] ^= ctx->v[2];
+ ctx->v[0] = SIP_ROTL(ctx->v[0], 32);
+
+ ctx->v[2] += ctx->v[1];
+ ctx->v[0] += ctx->v[3];
+ ctx->v[1] = SIP_ROTL(ctx->v[1], 17);
+ ctx->v[3] = SIP_ROTL(ctx->v[3], 21);
+
+ ctx->v[1] ^= ctx->v[2];
+ ctx->v[3] ^= ctx->v[0];
+ ctx->v[2] = SIP_ROTL(ctx->v[2], 32);
+ }
+}
+
+static void
+SipHash_CRounds(SIPHASH_CTX *ctx, int rounds)
+{
+ u64 m = le64_to_cpu(*((__le64 *)ctx->buf));
+
+ ctx->v[3] ^= m;
+ SipHash_Rounds(ctx, rounds);
+ ctx->v[0] ^= m;
+}
diff --git a/libbcache/siphash.h b/libbcache/siphash.h
new file mode 100644
index 0000000..7a4b224
--- /dev/null
+++ b/libbcache/siphash.h
@@ -0,0 +1,86 @@
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
+ * optimized for speed on short messages, returning a 64-bit hash/digest value.
+ *
+ * The number of rounds is selected via the wrapper macros below:
+ *  SipHash24_*() for the fast and reasonably strong version
+ *  SipHash48_*() for the stronger version (half as fast)
+ *
+ * SIPHASH_CTX ctx;
+ * SIPHASH_KEY key;        (16 byte key)
+ * SipHash24_Init(&ctx, &key);
+ * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
+ * SipHash24_Final(output, &ctx);
+ */
+
+#ifndef _SIPHASH_H_
+#define _SIPHASH_H_
+
+#include <linux/types.h>
+
+#define SIPHASH_BLOCK_LENGTH 8
+#define SIPHASH_KEY_LENGTH 16
+#define SIPHASH_DIGEST_LENGTH 8
+
+typedef struct _SIPHASH_CTX {
+ u64 v[4];
+ u8 buf[SIPHASH_BLOCK_LENGTH];
+ u32 bytes;
+} SIPHASH_CTX;
+
+typedef struct {
+ __le64 k0;
+ __le64 k1;
+} SIPHASH_KEY;
+
+void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
+void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
+u64 SipHash_End(SIPHASH_CTX *, int, int);
+void SipHash_Final(void *, SIPHASH_CTX *, int, int);
+u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
+
+#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k))
+#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l))
+#define SipHash24_End(_d) SipHash_End((_d), 2, 4)
+#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4)
+#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l))
+
+#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k))
+#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l))
+#define SipHash48_End(_d) SipHash_End((_d), 4, 8)
+#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8)
+#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l))
+
+#endif /* _SIPHASH_H_ */
diff --git a/libbcache/six.c b/libbcache/six.c
new file mode 100644
index 0000000..1bb8bfc
--- /dev/null
+++ b/libbcache/six.c
@@ -0,0 +1,396 @@
+
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+
+#include "six.h"
+
+#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
+#define six_release(l) lock_release(l, 0, _RET_IP_)
+
+#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0)
+#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
+#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1)
+
+struct six_lock_vals {
+ /* Value we add to the lock in order to take the lock: */
+ u64 lock_val;
+
+ /* If the lock has this value (used as a mask), taking the lock fails: */
+ u64 lock_fail;
+
+ /* Value we add to the lock in order to release the lock: */
+ u64 unlock_val;
+
+ /* Mask that indicates lock is held for this type: */
+ u64 held_mask;
+
+ /* Waitlist we wakeup when releasing the lock: */
+ enum six_lock_type unlock_wakeup;
+};
+
+#define LOCK_VALS { \
+ [SIX_LOCK_read] = { \
+ .lock_val = __SIX_VAL(read_lock, 1), \
+ .lock_fail = __SIX_LOCK_HELD_write, \
+ .unlock_val = -__SIX_VAL(read_lock, 1), \
+ .held_mask = __SIX_LOCK_HELD_read, \
+ .unlock_wakeup = SIX_LOCK_write, \
+ }, \
+ [SIX_LOCK_intent] = { \
+ .lock_val = __SIX_VAL(intent_lock, 1), \
+ .lock_fail = __SIX_LOCK_HELD_intent, \
+ .unlock_val = -__SIX_VAL(intent_lock, 1), \
+ .held_mask = __SIX_LOCK_HELD_intent, \
+ .unlock_wakeup = SIX_LOCK_intent, \
+ }, \
+ [SIX_LOCK_write] = { \
+ .lock_val = __SIX_VAL(seq, 1), \
+ .lock_fail = __SIX_LOCK_HELD_read, \
+ .unlock_val = __SIX_VAL(seq, 1), \
+ .held_mask = __SIX_LOCK_HELD_write, \
+ .unlock_wakeup = SIX_LOCK_read, \
+ }, \
+}
+
+static void six_set_owner(struct six_lock *lock, enum six_lock_type type)
+{
+ if (type == SIX_LOCK_intent)
+ lock->owner = current;
+}
+
+static void six_clear_owner(struct six_lock *lock, enum six_lock_type type)
+{
+ if (type == SIX_LOCK_intent)
+ lock->owner = NULL;
+}
+
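+/*
+ * Trylock: fail if any of this lock type's lock_fail bits are set, otherwise
+ * add lock_val to the lock state with an atomic cmpxchg loop:
+ */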
+static inline bool __six_trylock_type(struct six_lock *lock,
+ enum six_lock_type type)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state old;
+ u64 v = READ_ONCE(lock->state.v);
+
+ do {
+ old.v = v;
+
+ EBUG_ON(type == SIX_LOCK_write &&
+ ((old.v & __SIX_LOCK_HELD_write) ||
+ !(old.v & __SIX_LOCK_HELD_intent)));
+
+ if (old.v & l[type].lock_fail)
+ return false;
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v,
+ old.v + l[type].lock_val)) != old.v);
+ return true;
+}
+
+bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ bool ret = __six_trylock_type(lock, type);
+
+ if (ret) {
+ six_acquire(&lock->dep_map, 1);
+ six_set_owner(lock, type);
+ }
+
+ return ret;
+}
+
+bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state old;
+ u64 v = READ_ONCE(lock->state.v);
+
+ do {
+ old.v = v;
+
+ if (old.seq != seq || old.v & l[type].lock_fail)
+ return false;
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v,
+ old.v + l[type].lock_val)) != old.v);
+
+ six_acquire(&lock->dep_map, 1);
+ six_set_owner(lock, type);
+ return true;
+}
+
+struct six_lock_waiter {
+ struct list_head list;
+ struct task_struct *task;
+};
+
+/* This is probably up there with the more evil things I've done */
+#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
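+/*
+ * (Illustrative note: all it computes is the bit number, within the lock
+ * state word, of the waiters bit for a given lock type, so it can be used
+ * with set_bit()/clear_bit() on &lock->state.v - see six_lock_wakeup() below.)
+ */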
+
+static inline int six_can_spin_on_owner(struct six_lock *lock)
+{
+ struct task_struct *owner;
+ int retval = 1;
+
+ if (need_resched())
+ return 0;
+
+ rcu_read_lock();
+ owner = READ_ONCE(lock->owner);
+ if (owner)
+ retval = owner->on_cpu;
+ rcu_read_unlock();
+ /*
+ * If lock->owner is not set, the owner may have just acquired the lock
+ * and not set the owner field yet, or the lock may have been released.
+ */
+ return retval;
+}
+
+static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner)
+{
+ bool ret = true;
+
+ rcu_read_lock();
+ while (lock->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (!owner->on_cpu || need_resched()) {
+ ret = false;
+ break;
+ }
+
+ cpu_relax_lowlatency();
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+ struct task_struct *task = current;
+
+ if (type == SIX_LOCK_write)
+ return false;
+
+ preempt_disable();
+ if (!six_can_spin_on_owner(lock))
+ goto fail;
+
+ if (!osq_lock(&lock->osq))
+ goto fail;
+
+ while (1) {
+ struct task_struct *owner;
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = READ_ONCE(lock->owner);
+ if (owner && !six_spin_on_owner(lock, owner))
+ break;
+
+ if (__six_trylock_type(lock, type)) {
+ osq_unlock(&lock->osq);
+ preempt_enable();
+ return true;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task, we'll live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax_lowlatency();
+ }
+
+ osq_unlock(&lock->osq);
+fail:
+ preempt_enable();
+
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock again. This avoids getting
+ * scheduled out right after we obtained the lock.
+ */
+ if (need_resched())
+ schedule();
+
+ return false;
+}
+
+void six_lock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state old, new;
+ struct six_lock_waiter wait;
+ u64 v;
+
+ six_acquire(&lock->dep_map, 0);
+
+ if (__six_trylock_type(lock, type))
+ goto done;
+
+ if (six_optimistic_spin(lock, type))
+ goto done;
+
+ lock_contended(&lock->dep_map, _RET_IP_);
+
+ INIT_LIST_HEAD(&wait.list);
+ wait.task = current;
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (list_empty_careful(&wait.list)) {
+ raw_spin_lock(&lock->wait_lock);
+ list_add_tail(&wait.list, &lock->wait_list[type]);
+ raw_spin_unlock(&lock->wait_lock);
+ }
+
+ v = READ_ONCE(lock->state.v);
+ do {
+ new.v = old.v = v;
+
+ if (!(old.v & l[type].lock_fail))
+ new.v += l[type].lock_val;
+ else if (!(new.waiters & (1 << type)))
+ new.waiters |= 1 << type;
+ else
+ break; /* waiting bit already set */
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v, new.v)) != old.v);
+
+ if (!(old.v & l[type].lock_fail))
+ break;
+
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ if (!list_empty_careful(&wait.list)) {
+ raw_spin_lock(&lock->wait_lock);
+ list_del_init(&wait.list);
+ raw_spin_unlock(&lock->wait_lock);
+ }
+done:
+ lock_acquired(&lock->dep_map, _RET_IP_);
+ six_set_owner(lock, type);
+}
+
+static inline void six_lock_wakeup(struct six_lock *lock,
+ union six_lock_state state,
+ unsigned waitlist_id)
+{
+ struct list_head *wait_list = &lock->wait_list[waitlist_id];
+ struct six_lock_waiter *w, *next;
+
+ if (waitlist_id == SIX_LOCK_write && state.read_lock)
+ return;
+
+ if (!(state.waiters & (1 << waitlist_id)))
+ return;
+
+ clear_bit(waitlist_bitnr(waitlist_id),
+ (unsigned long *) &lock->state.v);
+
+ raw_spin_lock(&lock->wait_lock);
+
+ list_for_each_entry_safe(w, next, wait_list, list) {
+ list_del_init(&w->list);
+
+ if (wake_up_process(w->task) &&
+ waitlist_id != SIX_LOCK_read) {
+ if (!list_empty(wait_list))
+ set_bit(waitlist_bitnr(waitlist_id),
+ (unsigned long *) &lock->state.v);
+ break;
+ }
+ }
+
+ raw_spin_unlock(&lock->wait_lock);
+}
+
+void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state state;
+
+ six_clear_owner(lock, type);
+
+ EBUG_ON(!(lock->state.v & l[type].held_mask));
+ EBUG_ON(type == SIX_LOCK_write &&
+ !(lock->state.v & __SIX_LOCK_HELD_intent));
+
+ state.v = atomic64_add_return_release(l[type].unlock_val,
+ &lock->state.counter);
+ six_release(&lock->dep_map);
+ six_lock_wakeup(lock, state, l[type].unlock_wakeup);
+}
+
+bool six_trylock_convert(struct six_lock *lock,
+ enum six_lock_type from,
+ enum six_lock_type to)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state old, new;
+ u64 v = READ_ONCE(lock->state.v);
+
+ do {
+ new.v = old.v = v;
+ new.v += l[from].unlock_val;
+
+ if (new.v & l[to].lock_fail)
+ return false;
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v,
+ new.v + l[to].lock_val)) != old.v);
+
+ six_clear_owner(lock, from);
+ six_set_owner(lock, to);
+
+ six_lock_wakeup(lock, new, l[from].unlock_wakeup);
+
+ return true;
+}
+
+/*
+ * Increment read/intent lock count, assuming we already have it read or intent
+ * locked:
+ */
+void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+
+ EBUG_ON(type == SIX_LOCK_write);
+ six_acquire(&lock->dep_map, 0);
+
+ /* XXX: assert already locked, and that we don't overflow: */
+
+ atomic64_add(l[type].lock_val, &lock->state.counter);
+}
+
+/* Convert from intent to read: */
+void six_lock_downgrade(struct six_lock *lock)
+{
+ six_lock_increment(lock, SIX_LOCK_read);
+ six_unlock_intent(lock);
+}
diff --git a/libbcache/six.h b/libbcache/six.h
new file mode 100644
index 0000000..01ed338
--- /dev/null
+++ b/libbcache/six.h
@@ -0,0 +1,136 @@
+
+#ifndef _BCACHE_SIX_H
+#define _BCACHE_SIX_H
+
+#include <linux/lockdep.h>
+#include <linux/osq_lock.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#include "util.h"
+
+/*
+ * LOCK STATES:
+ *
+ * read, intent, write (i.e. shared/intent/exclusive, hence the name)
+ *
+ * read and write work as with normal read/write locks - a lock can have
+ * multiple readers, but write excludes reads and other write locks.
+ *
+ * Intent does not block read, but it does block other intent locks. The idea
+ * is that by taking an intent lock, you can then later upgrade to a write lock
+ * without dropping your read lock and without deadlocking - because no other
+ * thread has the intent lock, and thus no other thread could be trying to take
+ * the write lock.
+ */
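+
+/*
+ * Typical usage (sketch only; uses the six_lock_*()/six_unlock_*() wrappers
+ * generated by __SIX_LOCK() at the bottom of this header):
+ *
+ *   six_lock_intent(&lock);   // excludes other intent/write holders;
+ *                             // readers may still take the lock
+ *   six_lock_write(&lock);    // requires intent held; waits out readers
+ *   ... modify the protected structure ...
+ *   six_unlock_write(&lock);
+ *   six_unlock_intent(&lock);
+ */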
+
+union six_lock_state {
+ struct {
+ atomic64_t counter;
+ };
+
+ struct {
+ u64 v;
+ };
+
+ struct {
+ /* for waitlist_bitnr() */
+ unsigned long l;
+ };
+
+ struct {
+ unsigned read_lock:26;
+ unsigned intent_lock:3;
+ unsigned waiters:3;
+ /*
+ * seq works much like in seqlocks: it's incremented every time
+ * we lock and unlock for write.
+ *
+ * If it's odd, the write lock is held; if even, it's unlocked.
+ *
+ * Thus readers can unlock, and then lock again later iff it
+ * hasn't been modified in the meantime.
+ */
+ u32 seq;
+ };
+};
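+
+/*
+ * Sketch of the unlock-and-relock pattern the seq field enables (illustrative
+ * only, error handling elided):
+ *
+ *   u32 seq = lock->state.seq;
+ *   six_unlock_read(lock);
+ *   ... do something that may sleep ...
+ *   if (!six_relock_read(lock, seq))
+ *           goto lock_from_scratch;  // lock was written in the meantime
+ */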
+
+#define SIX_LOCK_MAX_RECURSE ((1 << 3) - 1)
+
+enum six_lock_type {
+ SIX_LOCK_read,
+ SIX_LOCK_intent,
+ SIX_LOCK_write,
+};
+
+struct six_lock {
+ union six_lock_state state;
+ struct task_struct *owner;
+ struct optimistic_spin_queue osq;
+
+ raw_spinlock_t wait_lock;
+ struct list_head wait_list[3];
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+static __always_inline void __six_lock_init(struct six_lock *lock,
+ const char *name,
+ struct lock_class_key *key)
+{
+ atomic64_set(&lock->state.counter, 0);
+ raw_spin_lock_init(&lock->wait_lock);
+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_write]);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+}
+
+#define six_lock_init(lock) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __six_lock_init((lock), #lock, &__key); \
+} while (0)
+
+bool six_trylock_type(struct six_lock *, enum six_lock_type);
+bool six_relock_type(struct six_lock *, enum six_lock_type, unsigned);
+void six_lock_type(struct six_lock *, enum six_lock_type);
+void six_unlock_type(struct six_lock *, enum six_lock_type);
+bool six_trylock_convert(struct six_lock *, enum six_lock_type,
+ enum six_lock_type);
+void six_lock_increment(struct six_lock *, enum six_lock_type);
+void six_lock_downgrade(struct six_lock *);
+
+#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
+
+#define __SIX_LOCK(type) \
+static __always_inline bool six_trylock_##type(struct six_lock *lock) \
+{ \
+ return six_trylock_type(lock, SIX_LOCK_##type); \
+} \
+ \
+static __always_inline bool six_relock_##type(struct six_lock *lock, u32 seq)\
+{ \
+ return six_relock_type(lock, SIX_LOCK_##type, seq); \
+} \
+ \
+static __always_inline void six_lock_##type(struct six_lock *lock) \
+{ \
+ six_lock_type(lock, SIX_LOCK_##type); \
+} \
+ \
+static __always_inline void six_unlock_##type(struct six_lock *lock) \
+{ \
+ six_unlock_type(lock, SIX_LOCK_##type); \
+}
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+
+#endif /* _BCACHE_SIX_H */
diff --git a/libbcache/stats.c b/libbcache/stats.c
new file mode 100644
index 0000000..a8a4eb3
--- /dev/null
+++ b/libbcache/stats.c
@@ -0,0 +1,219 @@
+/*
+ * bcache stats code
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "stats.h"
+#include "sysfs.h"
+
+/*
+ * We keep absolute totals of various statistics, and additionally a set of three
+ * rolling averages.
+ *
+ * Every so often, a timer goes off and rescales the rolling averages.
+ * accounting_rescale[] is how many times the timer has to go off before we
+ * rescale each set of numbers; that gets us half lives of 5 minutes, one hour,
+ * and one day.
+ *
+ * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
+ * and accounting_weight is what we use to rescale:
+ *
+ * pow(31 / 32, 22) ~= 1/2
+ *
+ * So that we don't have to increment each set of numbers every time we (say)
+ * get a cache hit, we increment a single atomic_t in acc->collector, and when
+ * the rescale function runs it resets the atomic counter to 0 and adds its
+ * old value to each of the exported numbers.
+ *
+ * To reduce rounding error, the numbers in struct cache_stats are all
+ * stored left shifted by 16, and scaled back in the sysfs show() function.
+ */
+
+static const unsigned DAY_RESCALE = 288;
+static const unsigned HOUR_RESCALE = 12;
+static const unsigned FIVE_MINUTE_RESCALE = 1;
+static const unsigned accounting_delay = (HZ * 300) / 22;
+static const unsigned accounting_weight = 5;
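+
+/*
+ * Worked through (illustrative): the timer fires 22 times per 5 minutes, and
+ * each rescale multiplies by 31/32 (ewma_add() with accounting_weight == 5).
+ * The five minute stats rescale on every firing, so per 5 minutes they're
+ * multiplied by (31/32)^22 ~= 1/2; the hour stats rescale every 12 firings
+ * (22 rescales per hour) and the day stats every 288 firings (22 rescales per
+ * day) - giving the 5 minute/hour/day half lives described above.
+ */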
+
+/* sysfs reading/writing */
+
+read_attribute(cache_hits);
+read_attribute(cache_misses);
+read_attribute(cache_bypass_hits);
+read_attribute(cache_bypass_misses);
+read_attribute(cache_hit_ratio);
+read_attribute(cache_readaheads);
+read_attribute(cache_miss_collisions);
+read_attribute(bypassed);
+read_attribute(foreground_write_ratio);
+read_attribute(foreground_writes);
+read_attribute(gc_writes);
+read_attribute(discards);
+
+SHOW(bch_stats)
+{
+ struct cache_stats *s =
+ container_of(kobj, struct cache_stats, kobj);
+#define var(stat) (s->stat >> 16)
+ var_print(cache_hits);
+ var_print(cache_misses);
+ var_print(cache_bypass_hits);
+ var_print(cache_bypass_misses);
+
+ sysfs_print(cache_hit_ratio,
+ DIV_SAFE(var(cache_hits) * 100,
+ var(cache_hits) + var(cache_misses)));
+
+ var_print(cache_readaheads);
+ var_print(cache_miss_collisions);
+
+ sysfs_hprint(bypassed, var(sectors_bypassed) << 9);
+ sysfs_hprint(foreground_writes, var(foreground_write_sectors) << 9);
+ sysfs_hprint(gc_writes, var(gc_write_sectors) << 9);
+ sysfs_hprint(discards, var(discard_sectors) << 9);
+
+ sysfs_print(foreground_write_ratio,
+ DIV_SAFE(var(foreground_write_sectors) * 100,
+ var(foreground_write_sectors) +
+ var(gc_write_sectors)));
+#undef var
+ return 0;
+}
+
+STORE(bch_stats)
+{
+ return size;
+}
+
+static void bch_stats_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_stats_files[] = {
+ &sysfs_cache_hits,
+ &sysfs_cache_misses,
+ &sysfs_cache_bypass_hits,
+ &sysfs_cache_bypass_misses,
+ &sysfs_cache_hit_ratio,
+ &sysfs_cache_readaheads,
+ &sysfs_cache_miss_collisions,
+ &sysfs_bypassed,
+ &sysfs_foreground_write_ratio,
+ &sysfs_foreground_writes,
+ &sysfs_gc_writes,
+ &sysfs_discards,
+ NULL
+};
+static KTYPE(bch_stats);
+
+int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
+ struct kobject *parent)
+{
+ int ret = kobject_add(&acc->total.kobj, parent,
+ "stats_total");
+ ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
+ "stats_five_minute");
+ ret = ret ?: kobject_add(&acc->hour.kobj, parent,
+ "stats_hour");
+ ret = ret ?: kobject_add(&acc->day.kobj, parent,
+ "stats_day");
+ return ret;
+}
+
+void bch_cache_accounting_clear(struct cache_accounting *acc)
+{
+ memset(&acc->total.cache_hits,
+ 0,
+ sizeof(unsigned long) * 9);
+}
+
+void bch_cache_accounting_destroy(struct cache_accounting *acc)
+{
+ kobject_put(&acc->total.kobj);
+ kobject_put(&acc->five_minute.kobj);
+ kobject_put(&acc->hour.kobj);
+ kobject_put(&acc->day.kobj);
+
+ atomic_set(&acc->closing, 1);
+ if (del_timer_sync(&acc->timer))
+ closure_return(&acc->cl);
+}
+
+/* EWMA scaling */
+
+static void scale_stat(unsigned long *stat)
+{
+ *stat = ewma_add(*stat, 0, accounting_weight);
+}
+
+static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
+{
+ if (++stats->rescale == rescale_at) {
+ stats->rescale = 0;
+ scale_stat(&stats->cache_hits);
+ scale_stat(&stats->cache_misses);
+ scale_stat(&stats->cache_bypass_hits);
+ scale_stat(&stats->cache_bypass_misses);
+ scale_stat(&stats->cache_readaheads);
+ scale_stat(&stats->cache_miss_collisions);
+ scale_stat(&stats->sectors_bypassed);
+ scale_stat(&stats->foreground_write_sectors);
+ scale_stat(&stats->gc_write_sectors);
+ scale_stat(&stats->discard_sectors);
+ }
+}
+
+static void scale_accounting(unsigned long data)
+{
+ struct cache_accounting *acc = (struct cache_accounting *) data;
+
+#define move_stat(name) do { \
+ unsigned t = atomic_xchg(&acc->collector.name, 0); \
+ t <<= 16; \
+ acc->five_minute.name += t; \
+ acc->hour.name += t; \
+ acc->day.name += t; \
+ acc->total.name += t; \
+} while (0)
+
+ move_stat(cache_hits);
+ move_stat(cache_misses);
+ move_stat(cache_bypass_hits);
+ move_stat(cache_bypass_misses);
+ move_stat(cache_readaheads);
+ move_stat(cache_miss_collisions);
+ move_stat(sectors_bypassed);
+ move_stat(foreground_write_sectors);
+ move_stat(gc_write_sectors);
+ move_stat(discard_sectors);
+
+ scale_stats(&acc->total, 0);
+ scale_stats(&acc->day, DAY_RESCALE);
+ scale_stats(&acc->hour, HOUR_RESCALE);
+ scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
+
+ acc->timer.expires += accounting_delay;
+
+ if (!atomic_read(&acc->closing))
+ add_timer(&acc->timer);
+ else
+ closure_return(&acc->cl);
+}
+
+void bch_cache_accounting_init(struct cache_accounting *acc,
+ struct closure *parent)
+{
+ kobject_init(&acc->total.kobj, &bch_stats_ktype);
+ kobject_init(&acc->five_minute.kobj, &bch_stats_ktype);
+ kobject_init(&acc->hour.kobj, &bch_stats_ktype);
+ kobject_init(&acc->day.kobj, &bch_stats_ktype);
+
+ closure_init(&acc->cl, parent);
+ init_timer(&acc->timer);
+ acc->timer.expires = jiffies + accounting_delay;
+ acc->timer.data = (unsigned long) acc;
+ acc->timer.function = scale_accounting;
+ add_timer(&acc->timer);
+}
diff --git a/libbcache/stats.h b/libbcache/stats.h
new file mode 100644
index 0000000..39877f9
--- /dev/null
+++ b/libbcache/stats.h
@@ -0,0 +1,52 @@
+#ifndef _BCACHE_STATS_H_
+#define _BCACHE_STATS_H_
+
+#include "stats_types.h"
+
+struct cache_set;
+struct cached_dev;
+struct bcache_device;
+
+void bch_cache_accounting_init(struct cache_accounting *, struct closure *);
+int bch_cache_accounting_add_kobjs(struct cache_accounting *, struct kobject *);
+void bch_cache_accounting_clear(struct cache_accounting *);
+void bch_cache_accounting_destroy(struct cache_accounting *);
+
+static inline void mark_cache_stats(struct cache_stat_collector *stats,
+ bool hit, bool bypass)
+{
+ atomic_inc(&stats->cache_hit_array[!bypass][!hit]);
+}
+
+static inline void bch_mark_cache_accounting(struct cache_set *c,
+ struct cached_dev *dc,
+ bool hit, bool bypass)
+{
+ mark_cache_stats(&dc->accounting.collector, hit, bypass);
+ mark_cache_stats(&c->accounting.collector, hit, bypass);
+}
+
+static inline void bch_mark_sectors_bypassed(struct cache_set *c,
+ struct cached_dev *dc,
+ unsigned sectors)
+{
+ atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
+ atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
+}
+
+static inline void bch_mark_gc_write(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.gc_write_sectors);
+}
+
+static inline void bch_mark_foreground_write(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.foreground_write_sectors);
+}
+
+static inline void bch_mark_discard(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.discard_sectors);
+}
+
+#endif /* _BCACHE_STATS_H_ */
diff --git a/libbcache/stats_types.h b/libbcache/stats_types.h
new file mode 100644
index 0000000..28e4c69
--- /dev/null
+++ b/libbcache/stats_types.h
@@ -0,0 +1,56 @@
+#ifndef _BCACHE_STATS_TYPES_H_
+#define _BCACHE_STATS_TYPES_H_
+
+struct cache_stat_collector {
+ union {
+ struct {
+ atomic_t cache_hits;
+ atomic_t cache_misses;
+ atomic_t cache_bypass_hits;
+ atomic_t cache_bypass_misses;
+ };
+
+ /* cache_hit_array[!bypass][!hit]: */
+ atomic_t cache_hit_array[2][2];
+ };
+
+
+ atomic_t cache_readaheads;
+ atomic_t cache_miss_collisions;
+ atomic_t sectors_bypassed;
+ atomic_t foreground_write_sectors;
+ atomic_t gc_write_sectors;
+ atomic_t discard_sectors;
+};
+
+struct cache_stats {
+ struct kobject kobj;
+
+ unsigned long cache_hits;
+ unsigned long cache_misses;
+ unsigned long cache_bypass_hits;
+ unsigned long cache_bypass_misses;
+ unsigned long cache_readaheads;
+ unsigned long cache_miss_collisions;
+ unsigned long sectors_bypassed;
+ unsigned long foreground_write_sectors;
+ unsigned long gc_write_sectors;
+ unsigned long discard_sectors;
+
+ unsigned rescale;
+};
+
+struct cache_accounting {
+ struct closure cl;
+ struct timer_list timer;
+ atomic_t closing;
+
+ struct cache_stat_collector collector;
+
+ struct cache_stats total;
+ struct cache_stats five_minute;
+ struct cache_stats hour;
+ struct cache_stats day;
+};
+
+#endif /* _BCACHE_STATS_TYPES_H_ */
diff --git a/libbcache/str_hash.h b/libbcache/str_hash.h
new file mode 100644
index 0000000..9a718a8
--- /dev/null
+++ b/libbcache/str_hash.h
@@ -0,0 +1,352 @@
+#ifndef _BCACHE_STR_HASH_H
+#define _BCACHE_STR_HASH_H
+
+#include "btree_iter.h"
+#include "checksum.h"
+#include "siphash.h"
+#include "super.h"
+
+#include <crypto/sha1_base.h>
+#include <linux/crc32c.h>
+
+static const SIPHASH_KEY bch_siphash_key = {
+ .k0 = cpu_to_le64(0x5a9585fd80087730ULL),
+ .k1 = cpu_to_le64(0xc8de666d50b45664ULL),
+};
+
+struct bch_str_hash_ctx {
+ union {
+ u32 crc32c;
+ u64 crc64;
+ SIPHASH_CTX siphash;
+ };
+};
+
+static inline void bch_str_hash_init(struct bch_str_hash_ctx *ctx,
+ enum bch_str_hash_type type)
+{
+ switch (type) {
+ case BCH_STR_HASH_CRC32C:
+ ctx->crc32c = ~0;
+ break;
+ case BCH_STR_HASH_CRC64:
+ ctx->crc64 = ~0;
+ break;
+ case BCH_STR_HASH_SIPHASH:
+ SipHash24_Init(&ctx->siphash, &bch_siphash_key);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline void bch_str_hash_update(struct bch_str_hash_ctx *ctx,
+ enum bch_str_hash_type type,
+ const void *data, size_t len)
+{
+ switch (type) {
+ case BCH_STR_HASH_CRC32C:
+ ctx->crc32c = crc32c(ctx->crc32c, data, len);
+ break;
+ case BCH_STR_HASH_CRC64:
+ ctx->crc64 = bch_crc64_update(ctx->crc64, data, len);
+ break;
+ case BCH_STR_HASH_SIPHASH:
+ SipHash24_Update(&ctx->siphash, data, len);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline u64 bch_str_hash_end(struct bch_str_hash_ctx *ctx,
+ enum bch_str_hash_type type)
+{
+ switch (type) {
+ case BCH_STR_HASH_CRC32C:
+ return ctx->crc32c;
+ case BCH_STR_HASH_CRC64:
+ return ctx->crc64 >> 1;
+ case BCH_STR_HASH_SIPHASH:
+ return SipHash24_End(&ctx->siphash) >> 1;
+ default:
+ BUG();
+ }
+}
+
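+/*
+ * Example (illustrative only - this helper isn't used elsewhere in the patch):
+ * hashing a buffer in one shot with a chosen hash type:
+ */
+static inline u64 bch_str_hash_example(enum bch_str_hash_type type,
+					const void *data, size_t len)
+{
+	struct bch_str_hash_ctx ctx;
+
+	bch_str_hash_init(&ctx, type);
+	bch_str_hash_update(&ctx, type, data, len);
+	return bch_str_hash_end(&ctx, type);
+}
+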
+struct bch_hash_info {
+ u64 seed;
+ u8 type;
+};
+
+struct bch_hash_desc {
+ enum btree_id btree_id;
+ u8 key_type;
+ u8 whiteout_type;
+
+ u64 (*hash_key)(const struct bch_hash_info *, const void *);
+ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
+ bool (*cmp_key)(struct bkey_s_c, const void *);
+ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
+};
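+
+/*
+ * A user of this interface supplies a constant descriptor tying together its
+ * btree and key types, e.g. (sketch - field values here are illustrative):
+ *
+ *   static const struct bch_hash_desc dirent_hash_desc = {
+ *           .btree_id      = BTREE_ID_DIRENTS,
+ *           .key_type      = BCH_DIRENT,
+ *           .whiteout_type = BCH_DIRENT_WHITEOUT,
+ *           .hash_key      = dirent_hash_key,
+ *           .hash_bkey     = dirent_hash_bkey,
+ *           .cmp_key       = dirent_cmp_key,
+ *           .cmp_bkey      = dirent_cmp_bkey,
+ *   };
+ */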
+
+static inline struct bkey_s_c
+bch_hash_lookup_at(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter, const void *search)
+{
+ u64 inode = iter->pos.inode;
+
+ do {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
+
+ if (btree_iter_err(k))
+ return k;
+
+ if (k.k->type == desc.key_type) {
+ if (!desc.cmp_key(k, search))
+ return k;
+ } else if (k.k->type == desc.whiteout_type) {
+ ;
+ } else {
+ /* hole, not found */
+ break;
+ }
+
+ bch_btree_iter_advance_pos(iter);
+ } while (iter->pos.inode == inode);
+
+ return bkey_s_c_err(-ENOENT);
+}
+
+static inline struct bkey_s_c
+bch_hash_lookup_bkey_at(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter, struct bkey_s_c search)
+{
+ u64 inode = iter->pos.inode;
+
+ do {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
+
+ if (btree_iter_err(k))
+ return k;
+
+ if (k.k->type == desc.key_type) {
+ if (!desc.cmp_bkey(k, search))
+ return k;
+ } else if (k.k->type == desc.whiteout_type) {
+ ;
+ } else {
+ /* hole, not found */
+ break;
+ }
+
+ bch_btree_iter_advance_pos(iter);
+ } while (iter->pos.inode == inode);
+
+ return bkey_s_c_err(-ENOENT);
+}
+
+static inline struct bkey_s_c
+bch_hash_lookup(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct cache_set *c, u64 inode,
+ struct btree_iter *iter, const void *key)
+{
+ bch_btree_iter_init(iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)));
+
+ return bch_hash_lookup_at(desc, info, iter, key);
+}
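+
+/*
+ * Typical lookup pattern (sketch; the caller owns the iterator and must unlock
+ * it when done with the returned key):
+ *
+ *   struct btree_iter iter;
+ *   struct bkey_s_c k = bch_hash_lookup(desc, info, c, inode, &iter, name);
+ *
+ *   if (!btree_iter_err(k))
+ *           ... use k ...
+ *   bch_btree_iter_unlock(&iter);
+ */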
+
+static inline struct bkey_s_c
+bch_hash_lookup_intent(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct cache_set *c, u64 inode,
+ struct btree_iter *iter, const void *key)
+{
+ bch_btree_iter_init_intent(iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)));
+
+ return bch_hash_lookup_at(desc, info, iter, key);
+}
+
+static inline struct bkey_s_c
+bch_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter)
+{
+ while (1) {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
+
+ if (btree_iter_err(k))
+ return k;
+
+ if (k.k->type != desc.key_type)
+ return k;
+
+ /* hash collision, keep going */
+ bch_btree_iter_advance_pos(iter);
+ if (iter->pos.inode != k.k->p.inode)
+ return bkey_s_c_err(-ENOENT);
+ }
+}
+
+static inline struct bkey_s_c bch_hash_hole(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct cache_set *c, u64 inode,
+ struct btree_iter *iter,
+ const void *key)
+{
+ bch_btree_iter_init_intent(iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)));
+
+ return bch_hash_hole_at(desc, iter);
+}
+
+static inline int bch_hash_needs_whiteout(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter,
+ struct btree_iter *start)
+{
+ bch_btree_iter_set_pos(iter,
+ btree_type_successor(start->btree_id, start->pos));
+
+ while (1) {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
+ int ret = btree_iter_err(k);
+
+ if (ret)
+ return ret;
+
+ if (k.k->type != desc.key_type &&
+ k.k->type != desc.whiteout_type)
+ return false;
+
+ if (k.k->type == desc.key_type &&
+ desc.hash_bkey(info, k) <= start->pos.offset)
+ return true;
+
+ bch_btree_iter_advance_pos(iter);
+ }
+}
+
+#define BCH_HASH_SET_MUST_CREATE 1
+#define BCH_HASH_SET_MUST_REPLACE 2
+
+static inline int bch_hash_set(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct cache_set *c, u64 inode,
+ u64 *journal_seq,
+ struct bkey_i *insert, int flags)
+{
+ struct btree_iter iter, hashed_slot;
+ struct bkey_s_c k;
+ int ret;
+
+ bch_btree_iter_init_intent(&hashed_slot, c, desc.btree_id,
+ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))));
+ bch_btree_iter_init_intent(&iter, c, desc.btree_id, hashed_slot.pos);
+ bch_btree_iter_link(&hashed_slot, &iter);
+retry:
+ /*
+ * On hash collision, we have to keep the slot we hashed to locked while
+ * we do the insert - to avoid racing with another thread deleting
+ * whatever's in the slot we hashed to:
+ */
+ ret = bch_btree_iter_traverse(&hashed_slot);
+ if (ret)
+ goto err;
+
+ /*
+ * On -EINTR/retry, we dropped locks - always restart from the slot we
+ * hashed to:
+ */
+ bch_btree_iter_copy(&iter, &hashed_slot);
+
+ k = bch_hash_lookup_bkey_at(desc, info, &iter, bkey_i_to_s_c(insert));
+
+ ret = btree_iter_err(k);
+ if (ret == -ENOENT) {
+ if (flags & BCH_HASH_SET_MUST_REPLACE) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /*
+ * Not found, so we're now looking for any open
+ * slot - we might have skipped over a whiteout
+ * that we could have used, so restart from the
+ * slot we hashed to:
+ */
+ bch_btree_iter_copy(&iter, &hashed_slot);
+ k = bch_hash_hole_at(desc, &iter);
+ if ((ret = btree_iter_err(k)))
+ goto err;
+ } else if (!ret) {
+ if (flags & BCH_HASH_SET_MUST_CREATE) {
+ ret = -EEXIST;
+ goto err;
+ }
+ } else {
+ goto err;
+ }
+
+ insert->k.p = iter.pos;
+ ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&iter, insert));
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ /*
+ * On successful insert, we don't want to clobber ret with error from
+ * iter:
+ */
+ bch_btree_iter_unlock(&iter);
+ bch_btree_iter_unlock(&hashed_slot);
+ return ret;
+}
+
+static inline int bch_hash_delete(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct cache_set *c, u64 inode,
+ u64 *journal_seq, const void *key)
+{
+ struct btree_iter iter, whiteout_iter;
+ struct bkey_s_c k;
+ struct bkey_i delete;
+ int ret = -ENOENT;
+
+ bch_btree_iter_init_intent(&iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)));
+ bch_btree_iter_init(&whiteout_iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)));
+ bch_btree_iter_link(&iter, &whiteout_iter);
+retry:
+ k = bch_hash_lookup_at(desc, info, &iter, key);
+ if ((ret = btree_iter_err(k)))
+ goto err;
+
+ ret = bch_hash_needs_whiteout(desc, info, &whiteout_iter, &iter);
+ if (ret < 0)
+ goto err;
+
+ bkey_init(&delete.k);
+ delete.k.p = k.k->p;
+ delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
+
+ ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&iter, &delete));
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ bch_btree_iter_unlock(&whiteout_iter);
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
+
+#endif /* _BCACHE_STR_HASH_H */
diff --git a/libbcache/super.c b/libbcache/super.c
new file mode 100644
index 0000000..5f6a85e
--- /dev/null
+++ b/libbcache/super.c
@@ -0,0 +1,2503 @@
+/*
+ * bcache setup/teardown code, and some metadata io - read a superblock and
+ * figure out what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "blockdev.h"
+#include "alloc.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_io.h"
+#include "chardev.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "debug.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-gc.h"
+#include "inode.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "migrate.h"
+#include "movinggc.h"
+#include "notify.h"
+#include "stats.h"
+#include "super.h"
+#include "tier.h"
+#include "writeback.h"
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/sysfs.h>
+#include <crypto/hash.h>
+
+#include <trace/events/bcache.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+
+static const uuid_le invalid_uuid = {
+ .b = {
+ 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
+ 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
+ }
+};
+
+static struct kset *bcache_kset;
+struct mutex bch_register_lock;
+LIST_HEAD(bch_cache_sets);
+
+static int bch_chardev_major;
+static struct class *bch_chardev_class;
+static struct device *bch_chardev;
+static DEFINE_IDR(bch_chardev_minor);
+static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
+struct workqueue_struct *bcache_io_wq;
+struct crypto_shash *bch_sha1;
+
+static void bch_cache_stop(struct cache *);
+static int bch_cache_online(struct cache *);
+
+static bool bch_is_open_cache(struct block_device *bdev)
+{
+ struct cache_set *c;
+ struct cache *ca;
+ unsigned i;
+
+ rcu_read_lock();
+ list_for_each_entry(c, &bch_cache_sets, list)
+ for_each_cache_rcu(ca, c, i)
+ if (ca->disk_sb.bdev == bdev) {
+ rcu_read_unlock();
+ return true;
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+static bool bch_is_open(struct block_device *bdev)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+}
+
+static const char *bch_blkdev_open(const char *path, void *holder,
+ struct block_device **ret)
+{
+ struct block_device *bdev;
+ const char *err;
+
+ *ret = NULL;
+ bdev = blkdev_get_by_path(path, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+ holder);
+
+ if (bdev == ERR_PTR(-EBUSY)) {
+ bdev = lookup_bdev(path);
+ if (IS_ERR(bdev))
+ return "device busy";
+
+ err = bch_is_open(bdev)
+ ? "device already registered"
+ : "device busy";
+
+ bdput(bdev);
+ return err;
+ }
+
+ if (IS_ERR(bdev))
+ return "failed to open device";
+
+ bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
+
+ *ret = bdev;
+ return NULL;
+}
+
+static int bch_congested_fn(void *data, int bdi_bits)
+{
+ struct backing_dev_info *bdi;
+ struct cache_set *c = data;
+ struct cache *ca;
+ unsigned i;
+ int ret = 0;
+
+ rcu_read_lock();
+ if (bdi_bits & (1 << WB_sync_congested)) {
+ /* Reads - check all devices: */
+ for_each_cache_rcu(ca, c, i) {
+ bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
+
+ if (bdi_congested(bdi, bdi_bits)) {
+ ret = 1;
+ break;
+ }
+ }
+ } else {
+ /* Writes only go to tier 0: */
+ group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
+ bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
+
+ if (bdi_congested(bdi, bdi_bits)) {
+ ret = 1;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* Superblock */
+
+static struct cache_member_cpu cache_mi_to_cpu_mi(struct cache_member *mi)
+{
+ return (struct cache_member_cpu) {
+ .nbuckets = le64_to_cpu(mi->nbuckets),
+ .first_bucket = le16_to_cpu(mi->first_bucket),
+ .bucket_size = le16_to_cpu(mi->bucket_size),
+ .state = CACHE_STATE(mi),
+ .tier = CACHE_TIER(mi),
+ .replication_set = CACHE_REPLICATION_SET(mi),
+ .has_metadata = CACHE_HAS_METADATA(mi),
+ .has_data = CACHE_HAS_DATA(mi),
+ .replacement = CACHE_REPLACEMENT(mi),
+ .discard = CACHE_DISCARD(mi),
+ .valid = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)),
+ };
+}
+
+static const char *validate_cache_super(struct bcache_superblock *disk_sb)
+{
+ struct cache_sb *sb = disk_sb->sb;
+ struct cache_member_cpu mi;
+ u16 block_size;
+ unsigned i;
+
+ switch (le64_to_cpu(sb->version)) {
+ case BCACHE_SB_VERSION_CDEV_V0:
+ case BCACHE_SB_VERSION_CDEV_WITH_UUID:
+ case BCACHE_SB_VERSION_CDEV_V2:
+ case BCACHE_SB_VERSION_CDEV_V3:
+ break;
+ default:
+ return"Unsupported superblock version";
+ }
+
+ if (CACHE_SET_SYNC(sb) &&
+ le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V3)
+ return "Unsupported superblock version";
+
+ block_size = le16_to_cpu(sb->block_size);
+
+ if (!is_power_of_2(block_size) ||
+ block_size > PAGE_SECTORS)
+ return "Bad block size";
+
+ if (bch_is_zero(sb->disk_uuid.b, sizeof(uuid_le)))
+ return "Bad disk UUID";
+
+ if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
+ return "Bad user UUID";
+
+ if (bch_is_zero(sb->set_uuid.b, sizeof(uuid_le)))
+ return "Bad set UUID";
+
+ if (!sb->nr_in_set ||
+ sb->nr_in_set <= sb->nr_this_dev ||
+ sb->nr_in_set > MAX_CACHES_PER_SET)
+ return "Bad cache device number in set";
+
+ if (!CACHE_SET_META_REPLICAS_WANT(sb) ||
+ CACHE_SET_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+ return "Invalid number of metadata replicas";
+
+ if (!CACHE_SET_META_REPLICAS_HAVE(sb) ||
+ CACHE_SET_META_REPLICAS_HAVE(sb) >
+ CACHE_SET_META_REPLICAS_WANT(sb))
+ return "Invalid number of metadata replicas";
+
+ if (!CACHE_SET_DATA_REPLICAS_WANT(sb) ||
+ CACHE_SET_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+ return "Invalid number of data replicas";
+
+ if (!CACHE_SET_DATA_REPLICAS_HAVE(sb) ||
+ CACHE_SET_DATA_REPLICAS_HAVE(sb) >
+ CACHE_SET_DATA_REPLICAS_WANT(sb))
+ return "Invalid number of data replicas";
+
+ if (CACHE_SB_CSUM_TYPE(sb) >= BCH_CSUM_NR)
+ return "Invalid checksum type";
+
+ if (!CACHE_SET_BTREE_NODE_SIZE(sb))
+ return "Btree node size not set";
+
+ if (!is_power_of_2(CACHE_SET_BTREE_NODE_SIZE(sb)))
+ return "Btree node size not a power of two";
+
+ if (CACHE_SET_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
+ return "Btree node size too large";
+
+ /* Default value, for old filesystems: */
+ if (!CACHE_SET_GC_RESERVE(sb))
+ SET_CACHE_SET_GC_RESERVE(sb, 10);
+
+ if (CACHE_SET_GC_RESERVE(sb) < 5)
+ return "gc reserve percentage too small";
+
+ if (!CACHE_SET_JOURNAL_ENTRY_SIZE(sb))
+ SET_CACHE_SET_JOURNAL_ENTRY_SIZE(sb, 9);
+
+ /* 4 mb max: */
+ if (512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX)
+ return "max journal entry size too big";
+
+ if (le16_to_cpu(sb->u64s) < bch_journal_buckets_offset(sb))
+ return "Invalid superblock: member info area missing";
+
+ mi = cache_mi_to_cpu_mi(sb->members + sb->nr_this_dev);
+
+ if (mi.nbuckets > LONG_MAX)
+ return "Too many buckets";
+
+ if (mi.nbuckets < 1 << 8)
+ return "Not enough buckets";
+
+ if (!is_power_of_2(mi.bucket_size) ||
+ mi.bucket_size < PAGE_SECTORS ||
+ mi.bucket_size < block_size)
+ return "Bad bucket size";
+
+ if (get_capacity(disk_sb->bdev->bd_disk) <
+ mi.bucket_size * mi.nbuckets)
+ return "Invalid superblock: device too small";
+
+ if (le64_to_cpu(sb->offset) +
+ (__set_blocks(sb, le16_to_cpu(sb->u64s),
+ block_size << 9) * block_size) >
+ mi.first_bucket * mi.bucket_size)
+ return "Invalid superblock: first bucket comes before end of super";
+
+ for (i = 0; i < bch_nr_journal_buckets(sb); i++)
+ if (journal_bucket(sb, i) < mi.first_bucket ||
+ journal_bucket(sb, i) >= mi.nbuckets)
+ return "bad journal bucket";
+
+ return NULL;
+}
+
+void free_super(struct bcache_superblock *sb)
+{
+ if (sb->bio)
+ bio_put(sb->bio);
+ if (!IS_ERR_OR_NULL(sb->bdev))
+ blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+
+ free_pages((unsigned long) sb->sb, sb->page_order);
+ memset(sb, 0, sizeof(*sb));
+}
+
+static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
+{
+ struct cache_sb *new_sb;
+ struct bio *bio;
+
+ if (sb->page_order >= order && sb->sb)
+ return 0;
+
+ new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
+ if (!new_sb)
+ return -ENOMEM;
+
+ bio = (dynamic_fault("bcache:add:super_realloc")
+ ? NULL
+ : bio_kmalloc(GFP_KERNEL, 1 << order));
+ if (!bio) {
+ free_pages((unsigned long) new_sb, order);
+ return -ENOMEM;
+ }
+
+ if (sb->sb)
+ memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
+
+ free_pages((unsigned long) sb->sb, sb->page_order);
+ sb->sb = new_sb;
+
+ if (sb->bio)
+ bio_put(sb->bio);
+ sb->bio = bio;
+
+ sb->page_order = order;
+
+ return 0;
+}
+
+int bch_super_realloc(struct bcache_superblock *sb, unsigned u64s)
+{
+ struct cache_member *mi = sb->sb->members + sb->sb->nr_this_dev;
+ char buf[BDEVNAME_SIZE];
+ size_t bytes = __set_bytes((struct cache_sb *) NULL, u64s);
+ u64 want = bytes + (SB_SECTOR << 9);
+
+ u64 first_bucket_offset = (u64) le16_to_cpu(mi->first_bucket) *
+ ((u64) le16_to_cpu(mi->bucket_size) << 9);
+
+ if (want > first_bucket_offset) {
+ pr_err("%s: superblock too big: want %llu but have %llu",
+ bdevname(sb->bdev, buf), want, first_bucket_offset);
+ return -ENOSPC;
+ }
+
+ return __bch_super_realloc(sb, get_order(bytes));
+}
+
+static const char *read_super(struct bcache_superblock *sb,
+ const char *path)
+{
+ const char *err;
+ unsigned order = 0;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ memset(sb, 0, sizeof(*sb));
+
+ err = bch_blkdev_open(path, &sb, &sb->bdev);
+ if (err)
+ return err;
+retry:
+ err = "cannot allocate memory";
+ if (__bch_super_realloc(sb, order))
+ goto err;
+
+ err = "dynamic fault";
+ if (cache_set_init_fault("read_super"))
+ goto err;
+
+ bio_reset(sb->bio);
+ sb->bio->bi_bdev = sb->bdev;
+ sb->bio->bi_iter.bi_sector = SB_SECTOR;
+ sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+ bch_bio_map(sb->bio, sb->sb);
+
+ err = "IO error";
+ if (submit_bio_wait(sb->bio))
+ goto err;
+
+ err = "Not a bcache superblock";
+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
+ goto err;
+
+ err = "Superblock has incorrect offset";
+ if (le64_to_cpu(sb->sb->offset) != SB_SECTOR)
+ goto err;
+
+ pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
+ le64_to_cpu(sb->sb->version),
+ le64_to_cpu(sb->sb->flags),
+ le64_to_cpu(sb->sb->seq),
+ le16_to_cpu(sb->sb->u64s));
+
+ err = "Superblock block size smaller than device block size";
+ if (le16_to_cpu(sb->sb->block_size) << 9 <
+ bdev_logical_block_size(sb->bdev))
+ goto err;
+
+ order = get_order(__set_bytes(sb->sb, le16_to_cpu(sb->sb->u64s)));
+ if (order > sb->page_order)
+ goto retry;
+
+ err = "bad checksum reading superblock";
+ if (le64_to_cpu(sb->sb->csum) !=
+ __csum_set(sb->sb, le16_to_cpu(sb->sb->u64s),
+ le64_to_cpu(sb->sb->version) <
+ BCACHE_SB_VERSION_CDEV_V3
+ ? BCH_CSUM_CRC64
+ : CACHE_SB_CSUM_TYPE(sb->sb)))
+ goto err;
+
+ return NULL;
+err:
+ free_super(sb);
+ return err;
+}
+
+void __write_super(struct cache_set *c, struct bcache_superblock *disk_sb)
+{
+ struct cache_sb *sb = disk_sb->sb;
+ struct bio *bio = disk_sb->bio;
+
+ bio->bi_bdev = disk_sb->bdev;
+ bio->bi_iter.bi_sector = SB_SECTOR;
+ bio->bi_iter.bi_size =
+ roundup(__set_bytes(sb, le16_to_cpu(sb->u64s)),
+ bdev_logical_block_size(disk_sb->bdev));
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
+ bch_bio_map(bio, sb);
+
+ pr_debug("ver %llu, flags %llu, seq %llu",
+ le64_to_cpu(sb->version),
+ le64_to_cpu(sb->flags),
+ le64_to_cpu(sb->seq));
+
+ bch_generic_make_request(bio, c);
+}
+
+static void write_super_endio(struct bio *bio)
+{
+ struct cache *ca = bio->bi_private;
+
+ /* XXX: return errors directly */
+
+ cache_fatal_io_err_on(bio->bi_error, ca, "superblock write");
+
+ bch_account_io_completion(ca);
+
+ closure_put(&ca->set->sb_write);
+ percpu_ref_put(&ca->ref);
+}
+
+static void bcache_write_super_unlock(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, sb_write);
+
+ up(&c->sb_write_mutex);
+}
+
+/* Update cached mi: */
+static int cache_set_mi_update(struct cache_set *c,
+ struct cache_member *mi,
+ unsigned nr_in_set)
+{
+ struct cache_member_rcu *new, *old;
+ struct cache *ca;
+ unsigned i;
+
+ mutex_lock(&c->mi_lock);
+
+ new = kzalloc(sizeof(struct cache_member_rcu) +
+ sizeof(struct cache_member_cpu) * nr_in_set,
+ GFP_KERNEL);
+ if (!new) {
+ mutex_unlock(&c->mi_lock);
+ return -ENOMEM;
+ }
+
+ new->nr_in_set = nr_in_set;
+
+ for (i = 0; i < nr_in_set; i++)
+ new->m[i] = cache_mi_to_cpu_mi(&mi[i]);
+
+ rcu_read_lock();
+ for_each_cache(ca, c, i)
+ ca->mi = new->m[i];
+ rcu_read_unlock();
+
+ old = rcu_dereference_protected(c->members,
+ lockdep_is_held(&c->mi_lock));
+
+ rcu_assign_pointer(c->members, new);
+ if (old)
+ kfree_rcu(old, rcu);
+
+ mutex_unlock(&c->mi_lock);
+ return 0;
+}
+
+/* doesn't copy member info */
+static void __copy_super(struct cache_sb *dst, struct cache_sb *src)
+{
+ dst->version = src->version;
+ dst->seq = src->seq;
+ dst->user_uuid = src->user_uuid;
+ dst->set_uuid = src->set_uuid;
+ memcpy(dst->label, src->label, SB_LABEL_SIZE);
+ dst->flags = src->flags;
+ dst->flags2 = src->flags2;
+ dst->nr_in_set = src->nr_in_set;
+ dst->block_size = src->block_size;
+}
+
+static int cache_sb_to_cache_set(struct cache_set *c, struct cache_sb *src)
+{
+ struct cache_member *new;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ new = kzalloc(sizeof(struct cache_member) * src->nr_in_set,
+ GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ memcpy(new, src->members,
+ src->nr_in_set * sizeof(struct cache_member));
+
+ if (cache_set_mi_update(c, new, src->nr_in_set)) {
+ kfree(new);
+ return -ENOMEM;
+ }
+
+ kfree(c->disk_mi);
+ c->disk_mi = new;
+
+ __copy_super(&c->disk_sb, src);
+
+ c->sb.block_size = le16_to_cpu(src->block_size);
+ c->sb.btree_node_size = CACHE_SET_BTREE_NODE_SIZE(src);
+ c->sb.nr_in_set = src->nr_in_set;
+ c->sb.clean = CACHE_SET_CLEAN(src);
+ c->sb.meta_replicas_have = CACHE_SET_META_REPLICAS_HAVE(src);
+ c->sb.data_replicas_have = CACHE_SET_DATA_REPLICAS_HAVE(src);
+ c->sb.str_hash_type = CACHE_SET_STR_HASH_TYPE(src);
+
+ return 0;
+}
+
+static int cache_sb_from_cache_set(struct cache_set *c, struct cache *ca)
+{
+ struct cache_sb *src = &c->disk_sb, *dst = ca->disk_sb.sb;
+
+ if (src->nr_in_set != dst->nr_in_set) {
+ /*
+ * We have to preserve the list of journal buckets on the
+ * cache's superblock:
+ */
+ unsigned old_offset = bch_journal_buckets_offset(dst);
+ unsigned u64s = bch_journal_buckets_offset(src)
+ + bch_nr_journal_buckets(dst);
+ int ret = bch_super_realloc(&ca->disk_sb, u64s);
+
+ if (ret)
+ return ret;
+
+ dst->nr_in_set = src->nr_in_set;
+ dst->u64s = cpu_to_le16(u64s);
+
+ memmove(dst->_data + bch_journal_buckets_offset(dst),
+ dst->_data + old_offset,
+ bch_nr_journal_buckets(dst) * sizeof(u64));
+ }
+
+ memcpy(dst->_data,
+ c->disk_mi,
+ src->nr_in_set * sizeof(struct cache_member));
+
+ __copy_super(dst, src);
+
+ return 0;
+}
+
+static void __bcache_write_super(struct cache_set *c)
+{
+ struct closure *cl = &c->sb_write;
+ struct cache *ca;
+ unsigned i;
+
+ cache_set_mi_update(c, c->disk_mi, c->sb.nr_in_set);
+
+ closure_init(cl, &c->cl);
+
+ le64_add_cpu(&c->disk_sb.seq, 1);
+
+ for_each_cache(ca, c, i) {
+ struct cache_sb *sb = ca->disk_sb.sb;
+ struct bio *bio = ca->disk_sb.bio;
+
+ cache_sb_from_cache_set(c, ca);
+
+ SET_CACHE_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
+ sb->csum = cpu_to_le64(__csum_set(sb,
+ le16_to_cpu(sb->u64s),
+ CACHE_SB_CSUM_TYPE(sb)));
+
+ bio_reset(bio);
+ bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_end_io = write_super_endio;
+ bio->bi_private = ca;
+
+ closure_get(cl);
+ percpu_ref_get(&ca->ref);
+ __write_super(c, &ca->disk_sb);
+ }
+
+ closure_return_with_destructor(cl, bcache_write_super_unlock);
+}
+
+void bcache_write_super(struct cache_set *c)
+{
+ down(&c->sb_write_mutex);
+ __bcache_write_super(c);
+}
+
+void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k,
+ bool meta)
+{
+ struct cache_member *mi;
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
+
+ if (!CACHE_SET_SYNC(&c->disk_sb))
+ return;
+
+ down(&c->sb_write_mutex);
+
+ /* recheck, might have raced */
+ if (bch_check_super_marked(c, k, meta)) {
+ up(&c->sb_write_mutex);
+ return;
+ }
+
+ mi = c->disk_mi;
+
+ extent_for_each_ptr(e, ptr)
+ if (bch_extent_ptr_is_dirty(c, e, ptr))
+ (meta
+ ? SET_CACHE_HAS_METADATA
+ : SET_CACHE_HAS_DATA)(mi + ptr->dev, true);
+
+ __bcache_write_super(c);
+}
+
+/* Cache set RO/RW: */
+
+/*
+ * For startup/shutdown of RW stuff, the dependencies are:
+ *
+ * - foreground writes depend on copygc and tiering (to free up space)
+ *
+ * - copygc and tiering depend on mark and sweep gc (they actually probably
+ * don't because they either reserve ahead of time or don't block if
+ * allocations fail, but allocations can require mark and sweep gc to run
+ * because of generation number wraparound)
+ *
+ * - all of the above depends on the allocator threads
+ *
+ * - allocator depends on the journal (when it rewrites prios and gens)
+ */
+
+static void __bch_cache_set_read_only(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ c->tiering_pd.rate.rate = UINT_MAX;
+ bch_ratelimit_reset(&c->tiering_pd.rate);
+ bch_tiering_read_stop(c);
+
+ for_each_cache(ca, c, i)
+ bch_moving_gc_stop(ca);
+
+ bch_gc_thread_stop(c);
+
+ bch_btree_flush(c);
+
+ for_each_cache(ca, c, i)
+ bch_cache_allocator_stop(ca);
+
+ /*
+ * Write a journal entry after flushing the btree, so we don't end up
+ * replaying everything we just flushed:
+ */
+ if (test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+ int ret;
+
+ bch_journal_flush_async(&c->journal, NULL);
+ ret = bch_journal_meta(&c->journal);
+ BUG_ON(ret && !bch_journal_error(&c->journal));
+ }
+
+ cancel_delayed_work_sync(&c->journal.write_work);
+ cancel_delayed_work_sync(&c->journal.reclaim_work);
+}
+
+static void bch_writes_disabled(struct percpu_ref *writes)
+{
+ struct cache_set *c = container_of(writes, struct cache_set, writes);
+
+ set_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
+ wake_up(&bch_read_only_wait);
+}
+
+static void bch_cache_set_read_only_work(struct work_struct *work)
+{
+ struct cache_set *c =
+ container_of(work, struct cache_set, read_only_work);
+
+ percpu_ref_put(&c->writes);
+
+ del_timer(&c->foreground_write_wakeup);
+ cancel_delayed_work(&c->pd_controllers_update);
+
+ c->foreground_write_pd.rate.rate = UINT_MAX;
+ bch_wake_delayed_writes((unsigned long) c);
+
+ if (!test_bit(CACHE_SET_EMERGENCY_RO, &c->flags)) {
+ /*
+ * If we're not doing an emergency shutdown, we want to wait on
+ * outstanding writes to complete so they don't see spurious
+ * errors due to shutting down the allocator:
+ */
+ wait_event(bch_read_only_wait,
+ test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
+
+ __bch_cache_set_read_only(c);
+
+ if (!bch_journal_error(&c->journal) &&
+ !test_bit(CACHE_SET_ERROR, &c->flags)) {
+ SET_CACHE_SET_CLEAN(&c->disk_sb, true);
+ bcache_write_super(c);
+ }
+ } else {
+ /*
+ * If we are doing an emergency shutdown, outstanding writes may
+ * hang until we shut down the allocator, so we don't want to wait
+ * on outstanding writes before shutting everything down - but
+ * we do need to wait on them before returning and signalling
+ * that going RO is complete:
+ */
+ __bch_cache_set_read_only(c);
+
+ wait_event(bch_read_only_wait,
+ test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
+ }
+
+ bch_notify_cache_set_read_only(c);
+ trace_bcache_cache_set_read_only_done(c);
+
+ set_bit(CACHE_SET_RO_COMPLETE, &c->flags);
+ wake_up(&bch_read_only_wait);
+}
+
+bool bch_cache_set_read_only(struct cache_set *c)
+{
+ if (test_and_set_bit(CACHE_SET_RO, &c->flags))
+ return false;
+
+ trace_bcache_cache_set_read_only(c);
+
+ percpu_ref_get(&c->writes);
+
+ /*
+ * Block new foreground-end write operations from starting - any new
+ * writes will return -EROFS:
+ *
+ * (This is really blocking new _allocations_; writes to previously
+ * allocated space can still happen until we stop the allocator in
+ * bch_cache_allocator_stop()).
+ */
+ percpu_ref_kill(&c->writes);
+
+ queue_work(system_freezable_wq, &c->read_only_work);
+ return true;
+}
+
+bool bch_cache_set_emergency_read_only(struct cache_set *c)
+{
+ bool ret = !test_and_set_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
+
+ bch_cache_set_read_only(c);
+ bch_journal_halt(&c->journal);
+
+ wake_up(&bch_read_only_wait);
+ return ret;
+}
+
+void bch_cache_set_read_only_sync(struct cache_set *c)
+{
+ /* so we don't race with bch_cache_set_read_write() */
+ lockdep_assert_held(&bch_register_lock);
+
+ bch_cache_set_read_only(c);
+
+ wait_event(bch_read_only_wait,
+ test_bit(CACHE_SET_RO_COMPLETE, &c->flags) &&
+ test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
+}
+
+static const char *__bch_cache_set_read_write(struct cache_set *c)
+{
+ struct cache *ca;
+ const char *err;
+ unsigned i;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == CACHE_ACTIVE &&
+ bch_cache_allocator_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
+ err = "error starting btree GC thread";
+ if (bch_gc_thread_start(c))
+ goto err;
+
+ for_each_cache(ca, c, i) {
+ if (ca->mi.state != CACHE_ACTIVE)
+ continue;
+
+ err = "error starting moving GC thread";
+ if (bch_moving_gc_thread_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+ }
+
+ err = "error starting tiering thread";
+ if (bch_tiering_read_start(c))
+ goto err;
+
+ schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
+
+ return NULL;
+err:
+ __bch_cache_set_read_only(c);
+ return err;
+}
+
+const char *bch_cache_set_read_write(struct cache_set *c)
+{
+ const char *err;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (!test_bit(CACHE_SET_RO_COMPLETE, &c->flags))
+ return NULL;
+
+ err = __bch_cache_set_read_write(c);
+ if (err)
+ return err;
+
+ percpu_ref_reinit(&c->writes);
+
+ clear_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
+ clear_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
+ clear_bit(CACHE_SET_RO_COMPLETE, &c->flags);
+ clear_bit(CACHE_SET_RO, &c->flags);
+ return NULL;
+}
+
+/* Cache set startup/shutdown: */
+
+static void cache_set_free(struct cache_set *c)
+{
+ del_timer_sync(&c->foreground_write_wakeup);
+ cancel_delayed_work_sync(&c->pd_controllers_update);
+ cancel_work_sync(&c->read_only_work);
+ cancel_work_sync(&c->bio_submit_work);
+ cancel_work_sync(&c->read_retry_work);
+
+ bch_btree_cache_free(c);
+ bch_journal_free(&c->journal);
+ bch_io_clock_exit(&c->io_clock[WRITE]);
+ bch_io_clock_exit(&c->io_clock[READ]);
+ bch_compress_free(c);
+ bdi_destroy(&c->bdi);
+ lg_lock_free(&c->bucket_stats_lock);
+ free_percpu(c->bucket_stats_percpu);
+ mempool_exit(&c->btree_bounce_pool);
+ mempool_exit(&c->bio_bounce_pages);
+ bioset_exit(&c->bio_write);
+ bioset_exit(&c->bio_read_split);
+ bioset_exit(&c->bio_read);
+ bioset_exit(&c->btree_read_bio);
+ mempool_exit(&c->btree_interior_update_pool);
+ mempool_exit(&c->btree_reserve_pool);
+ mempool_exit(&c->fill_iter);
+ mempool_exit(&c->search);
+ percpu_ref_exit(&c->writes);
+
+ if (c->copygc_wq)
+ destroy_workqueue(c->copygc_wq);
+ if (c->wq)
+ destroy_workqueue(c->wq);
+
+ kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */
+ kfree(c->disk_mi);
+ kfree(c);
+ module_put(THIS_MODULE);
+}
+
+/*
+ * should be __cache_set_stop4 - block devices are closed, now we can finally
+ * free it
+ */
+void bch_cache_set_release(struct kobject *kobj)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+ struct completion *stop_completion = c->stop_completion;
+
+ bch_notify_cache_set_stopped(c);
+ bch_info(c, "stopped");
+
+ cache_set_free(c);
+
+ if (stop_completion)
+ complete(stop_completion);
+}
+
+/*
+ * All activity on the cache_set should have stopped now - close devices:
+ */
+static void __cache_set_stop3(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, cl);
+ struct cache *ca;
+ unsigned i;
+
+ mutex_lock(&bch_register_lock);
+ for_each_cache(ca, c, i)
+ bch_cache_stop(ca);
+ mutex_unlock(&bch_register_lock);
+
+ mutex_lock(&bch_register_lock);
+ list_del(&c->list);
+ if (c->minor >= 0)
+ idr_remove(&bch_chardev_minor, c->minor);
+ mutex_unlock(&bch_register_lock);
+
+ closure_debug_destroy(&c->cl);
+ kobject_put(&c->kobj);
+}
+
+/*
+ * Openers (i.e. block devices) should have exited, shutdown all userspace
+ * interfaces and wait for &c->cl to hit 0
+ */
+static void __cache_set_stop2(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, caching);
+
+ bch_debug_exit_cache_set(c);
+
+ if (!IS_ERR_OR_NULL(c->chardev))
+ device_unregister(c->chardev);
+
+ if (c->kobj.state_in_sysfs)
+ kobject_del(&c->kobj);
+
+ bch_cache_accounting_destroy(&c->accounting);
+
+ kobject_put(&c->time_stats);
+ kobject_put(&c->opts_dir);
+ kobject_put(&c->internal);
+
+ mutex_lock(&bch_register_lock);
+ bch_cache_set_read_only_sync(c);
+ mutex_unlock(&bch_register_lock);
+
+ closure_return(cl);
+}
+
+/*
+ * First phase of the shutdown process that's kicked off by bch_cache_set_stop(); we
+ * haven't waited for anything to stop yet, we're just punting to process
+ * context to shut down block devices:
+ */
+static void __cache_set_stop1(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, caching);
+
+ bch_blockdevs_stop(c);
+
+ continue_at(cl, __cache_set_stop2, system_wq);
+}
+
+void bch_cache_set_stop(struct cache_set *c)
+{
+ if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
+ closure_queue(&c->caching);
+}
+
+void bch_cache_set_unregister(struct cache_set *c)
+{
+ if (!test_and_set_bit(CACHE_SET_UNREGISTERING, &c->flags))
+ bch_cache_set_stop(c);
+}
+
+static unsigned cache_set_nr_devices(struct cache_set *c)
+{
+ unsigned i, nr = 0;
+ struct cache_member *mi = c->disk_mi;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ for (i = 0; i < c->disk_sb.nr_in_set; i++)
+ if (!bch_is_zero(mi[i].uuid.b, sizeof(uuid_le)))
+ nr++;
+
+ return nr;
+}
+
+static unsigned cache_set_nr_online_devices(struct cache_set *c)
+{
+ unsigned i, nr = 0;
+
+ for (i = 0; i < c->sb.nr_in_set; i++)
+ if (c->cache[i])
+ nr++;
+
+ return nr;
+}
+
+#define alloc_bucket_pages(gfp, ca) \
+ ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
+
+static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
+ struct cache_set_opts opts)
+{
+ struct cache_set *c;
+ unsigned iter_size, journal_entry_bytes;
+
+ c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
+ if (!c)
+ return NULL;
+
+ __module_get(THIS_MODULE);
+
+ c->minor = -1;
+
+ sema_init(&c->sb_write_mutex, 1);
+ INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
+ mutex_init(&c->btree_cache_lock);
+ mutex_init(&c->bucket_lock);
+ mutex_init(&c->btree_root_lock);
+ INIT_WORK(&c->read_only_work, bch_cache_set_read_only_work);
+ mutex_init(&c->mi_lock);
+
+ init_rwsem(&c->gc_lock);
+
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ spin_lock_init(&c->name##_time.lock);
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+
+ bch_open_buckets_init(c);
+ bch_tiering_init_cache_set(c);
+
+ INIT_LIST_HEAD(&c->list);
+ INIT_LIST_HEAD(&c->cached_devs);
+ INIT_LIST_HEAD(&c->btree_cache);
+ INIT_LIST_HEAD(&c->btree_cache_freeable);
+ INIT_LIST_HEAD(&c->btree_cache_freed);
+
+ INIT_LIST_HEAD(&c->btree_interior_update_list);
+ mutex_init(&c->btree_reserve_cache_lock);
+ mutex_init(&c->btree_interior_update_lock);
+
+ mutex_init(&c->bio_bounce_pages_lock);
+ INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
+ spin_lock_init(&c->bio_submit_lock);
+ bio_list_init(&c->read_retry_list);
+ spin_lock_init(&c->read_retry_lock);
+ INIT_WORK(&c->read_retry_work, bch_read_retry_work);
+ mutex_init(&c->zlib_workspace_lock);
+
+ seqcount_init(&c->gc_pos_lock);
+
+ c->prio_clock[READ].hand = 1;
+ c->prio_clock[READ].min_prio = 0;
+ c->prio_clock[WRITE].hand = 1;
+ c->prio_clock[WRITE].min_prio = 0;
+
+ c->congested_read_threshold_us = 2000;
+ c->congested_write_threshold_us = 20000;
+ c->error_limit = 16 << IO_ERROR_SHIFT;
+ init_waitqueue_head(&c->writeback_wait);
+
+ c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
+
+ c->copy_gc_enabled = 1;
+ c->tiering_enabled = 1;
+ c->tiering_percent = 10;
+
+ c->foreground_target_percent = 20;
+
+ c->journal.write_time = &c->journal_write_time;
+ c->journal.delay_time = &c->journal_delay_time;
+ c->journal.blocked_time = &c->journal_blocked_time;
+ c->journal.flush_seq_time = &c->journal_flush_seq_time;
+
+ mutex_init(&c->uevent_lock);
+
+ if (cache_sb_to_cache_set(c, sb))
+ goto err;
+
+ scnprintf(c->name, sizeof(c->name), "%pU", &c->disk_sb.user_uuid);
+
+ c->opts = cache_superblock_opts(sb);
+ cache_set_opts_apply(&c->opts, opts);
+
+ c->block_bits = ilog2(c->sb.block_size);
+
+ if (cache_set_init_fault("cache_set_alloc"))
+ goto err;
+
+ iter_size = (btree_blocks(c) + 1) * 2 *
+ sizeof(struct btree_node_iter_set);
+
+ journal_entry_bytes = 512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb);
+
+ if (!(c->wq = alloc_workqueue("bcache",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
+ !(c->copygc_wq = alloc_workqueue("bcache_copygc",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
+ percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
+ mempool_init_slab_pool(&c->search, 1, bch_search_cache) ||
+ mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
+ sizeof(struct btree_reserve)) ||
+ mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
+ sizeof(struct btree_interior_update)) ||
+ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
+ bioset_init(&c->btree_read_bio, 1, 0) ||
+ bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
+ mempool_init_page_pool(&c->bio_bounce_pages,
+ max_t(unsigned,
+ c->sb.btree_node_size,
+ CRC32_EXTENT_SIZE_MAX) /
+ PAGE_SECTORS, 0) ||
+ !(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) ||
+ lg_lock_init(&c->bucket_stats_lock) ||
+ mempool_init_page_pool(&c->btree_bounce_pool, 1,
+ ilog2(btree_pages(c))) ||
+ bdi_setup_and_register(&c->bdi, "bcache") ||
+ bch_io_clock_init(&c->io_clock[READ]) ||
+ bch_io_clock_init(&c->io_clock[WRITE]) ||
+ bch_journal_alloc(&c->journal, journal_entry_bytes) ||
+ bch_btree_cache_alloc(c) ||
+ bch_compress_init(c))
+ goto err;
+
+ c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ c->bdi.congested_fn = bch_congested_fn;
+ c->bdi.congested_data = c;
+
+ /*
+ * Now that all allocations have succeeded, init various refcounty
+ * things that let us shutdown:
+ */
+ closure_init(&c->cl, NULL);
+
+ c->kobj.kset = bcache_kset;
+ kobject_init(&c->kobj, &bch_cache_set_ktype);
+ kobject_init(&c->internal, &bch_cache_set_internal_ktype);
+ kobject_init(&c->opts_dir, &bch_cache_set_opts_dir_ktype);
+ kobject_init(&c->time_stats, &bch_cache_set_time_stats_ktype);
+
+ bch_cache_accounting_init(&c->accounting, &c->cl);
+
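+	/*
+	 * c->caching is a child of c->cl: when its refcount hits zero,
+	 * __cache_set_stop1() kicks off the first phase of shutdown, and once
+	 * c->cl itself hits zero __cache_set_stop3() runs to finish the
+	 * teardown:
+	 */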
+ closure_init(&c->caching, &c->cl);
+ set_closure_fn(&c->caching, __cache_set_stop1, system_wq);
+
+ continue_at_noreturn(&c->cl, __cache_set_stop3, system_wq);
+ return c;
+err:
+ cache_set_free(c);
+ return NULL;
+}
+
+static int bch_cache_set_online(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (c->kobj.state_in_sysfs)
+ return 0;
+
+ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
+ if (c->minor < 0)
+ return c->minor;
+
+ c->chardev = device_create(bch_chardev_class, NULL,
+ MKDEV(bch_chardev_major, c->minor), NULL,
+ "bcache%u-ctl", c->minor);
+ if (IS_ERR(c->chardev))
+ return PTR_ERR(c->chardev);
+
+ if (kobject_add(&c->kobj, NULL, "%pU", c->disk_sb.user_uuid.b) ||
+ kobject_add(&c->internal, &c->kobj, "internal") ||
+ kobject_add(&c->opts_dir, &c->kobj, "options") ||
+ kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
+ bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
+ return -1;
+
+ for_each_cache(ca, c, i)
+ if (bch_cache_online(ca)) {
+ percpu_ref_put(&ca->ref);
+ return -1;
+ }
+
+ list_add(&c->list, &bch_cache_sets);
+ return 0;
+}
+
+static const char *run_cache_set(struct cache_set *c)
+{
+ const char *err = "cannot allocate memory";
+ struct cache *ca;
+ unsigned i, id;
+ time64_t now;
+ LIST_HEAD(journal);
+ struct jset *j;
+ int ret = -EINVAL;
+
+ lockdep_assert_held(&bch_register_lock);
+ BUG_ON(test_bit(CACHE_SET_RUNNING, &c->flags));
+
+ /* We don't want bch_fatal_error() to free underneath us */
+ closure_get(&c->caching);
+
+ /*
+ * Make sure that each cache object's mi is up to date before
+ * we start testing it.
+ */
+ for_each_cache(ca, c, i)
+ cache_sb_from_cache_set(c, ca);
+
+	/*
+	 * CACHE_SET_SYNC is true if the cache set has already been run and
+	 * potentially has data; it is false the first time the cache set is
+	 * run.
+	 */
+
+ if (CACHE_SET_SYNC(&c->disk_sb)) {
+ ret = bch_journal_read(c, &journal);
+ if (ret)
+ goto err;
+
+ pr_debug("btree_journal_read() done");
+
+ j = &list_entry(journal.prev, struct journal_replay, list)->j;
+
+ err = "error reading priorities";
+ for_each_cache(ca, c, i) {
+ ret = bch_prio_read(ca);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+ }
+
+ c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
+ c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
+
+ for_each_cache(ca, c, i) {
+ bch_recalc_min_prio(ca, READ);
+ bch_recalc_min_prio(ca, WRITE);
+ }
+
+	/*
+	 * If bch_prio_read() fails it'll call cache_set_error() and we'll
+	 * tear everything down right away; if we checked for the failure
+	 * sooner, we could avoid doing journal replay at all.
+	 */
+
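+		/* Find and read each btree root recorded in the journal: */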
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ unsigned level;
+ struct bkey_i *k;
+
+ err = "bad btree root";
+ k = bch_journal_find_btree_root(c, j, id, &level);
+ if (!k && id == BTREE_ID_EXTENTS)
+ goto err;
+ if (!k) {
+ pr_debug("missing btree root: %d", id);
+ continue;
+ }
+
+ err = "error reading btree root";
+ if (bch_btree_root_read(c, id, k, level))
+ goto err;
+ }
+
+ bch_verbose(c, "starting mark and sweep:");
+
+ err = "error in recovery";
+ if (bch_initial_gc(c, &journal))
+ goto err;
+
+ bch_verbose(c, "mark and sweep done");
+
+ /*
+ * bch_journal_start() can't happen sooner, or btree_gc_finish()
+ * will give spurious errors about oldest_gen > bucket_gen -
+ * this is a hack but oh well.
+ */
+ bch_journal_start(c);
+
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == CACHE_ACTIVE &&
+ bch_cache_allocator_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
+ bch_verbose(c, "starting journal replay:");
+
+ err = "journal replay failed";
+ ret = bch_journal_replay(c, &journal);
+ if (ret)
+ goto err;
+
+ bch_verbose(c, "journal replay done");
+
+ /*
+ * Write a new journal entry _before_ we start journalling new
+ * data - otherwise, we could end up with btree node bsets with
+ * journal seqs arbitrarily far in the future vs. the most
+ * recently written journal entry on disk, if we crash before
+ * writing the next journal entry:
+ */
+ err = "error writing journal entry";
+ if (bch_journal_meta(&c->journal))
+ goto err;
+
+ bch_verbose(c, "starting fs gc:");
+ err = "error in fs gc";
+ ret = bch_gc_inode_nlinks(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "fs gc done");
+
+ if (!c->opts.nofsck) {
+ bch_verbose(c, "starting fsck:");
+ err = "error in fsck";
+ ret = bch_fsck(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "fsck done");
+ }
+ } else {
+ struct bkey_i_inode inode;
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ bch_notice(c, "initializing new filesystem");
+
+ err = "unable to allocate journal buckets";
+ for_each_cache(ca, c, i)
+ if (bch_cache_journal_alloc(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
+ bch_initial_gc(c, NULL);
+
+ /*
+ * journal_res_get() will crash if called before this has
+ * set up the journal.pin FIFO and journal.cur pointer:
+ */
+ bch_journal_start(c);
+ bch_journal_set_replay_done(&c->journal);
+
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == CACHE_ACTIVE &&
+ bch_cache_allocator_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
+ err = "cannot allocate new btree root";
+ for (id = 0; id < BTREE_ID_NR; id++)
+ if (bch_btree_root_alloc(c, id, &cl)) {
+ closure_sync(&cl);
+ goto err;
+ }
+
+ /* Wait for new btree roots to be written: */
+ closure_sync(&cl);
+
+ bkey_inode_init(&inode.k_i);
+ inode.k.p.inode = BCACHE_ROOT_INO;
+ inode.v.i_mode = cpu_to_le16(S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO);
+ inode.v.i_nlink = cpu_to_le32(2);
+ get_random_bytes(&inode.v.i_hash_seed, sizeof(inode.v.i_hash_seed));
+ SET_INODE_STR_HASH_TYPE(&inode.v, c->sb.str_hash_type);
+
+ err = "error creating root directory";
+ if (bch_btree_insert(c, BTREE_ID_INODES, &inode.k_i,
+ NULL, NULL, NULL, 0))
+ goto err;
+
+ err = "error writing first journal entry";
+ if (bch_journal_meta(&c->journal))
+ goto err;
+ }
+
+ if (c->opts.read_only) {
+ bch_cache_set_read_only_sync(c);
+ } else {
+ err = __bch_cache_set_read_write(c);
+ if (err)
+ goto err;
+ }
+
+ now = ktime_get_seconds();
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i)
+ c->disk_mi[ca->sb.nr_this_dev].last_mount = cpu_to_le64(now);
+ rcu_read_unlock();
+
+ /* Mark cache set as initialized: */
+ SET_CACHE_SET_SYNC(&c->disk_sb, true);
+ SET_CACHE_SET_CLEAN(&c->disk_sb, false);
+ bcache_write_super(c);
+
+ err = "dynamic fault";
+ if (cache_set_init_fault("run_cache_set"))
+ goto err;
+
+ err = "error creating kobject";
+ if (bch_cache_set_online(c))
+ goto err;
+
+ err = "can't bring up blockdev volumes";
+ if (bch_blockdev_volumes_start(c))
+ goto err;
+
+ bch_debug_init_cache_set(c);
+ set_bit(CACHE_SET_RUNNING, &c->flags);
+ bch_attach_backing_devs(c);
+
+ closure_put(&c->caching);
+
+ bch_notify_cache_set_read_write(c);
+
+ BUG_ON(!list_empty(&journal));
+ return NULL;
+err:
+ switch (ret) {
+ case BCH_FSCK_ERRORS_NOT_FIXED:
+ bch_err(c, "filesystem contains errors: please report this to the developers");
+ pr_cont("mount with -o fix_errors to repair");
+ err = "fsck error";
+ break;
+ case BCH_FSCK_REPAIR_UNIMPLEMENTED:
+ bch_err(c, "filesystem contains errors: please report this to the developers");
+ pr_cont("repair unimplemented: inform the developers so that it can be added");
+ err = "fsck error";
+ break;
+ case BCH_FSCK_REPAIR_IMPOSSIBLE:
+ bch_err(c, "filesystem contains errors, but repair impossible");
+ err = "fsck error";
+ break;
+ case BCH_FSCK_UNKNOWN_VERSION:
+		err = "unknown metadata version";
+ break;
+ case -ENOMEM:
+ err = "cannot allocate memory";
+ break;
+ case -EIO:
+ err = "IO error";
+ break;
+ }
+
+ BUG_ON(!err);
+
+ bch_journal_entries_free(&journal);
+ set_bit(CACHE_SET_ERROR, &c->flags);
+ bch_cache_set_unregister(c);
+ closure_put(&c->caching);
+ return err;
+}
+
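+/*
+ * Check that a device's superblock is compatible with the cache set we're
+ * adding it to:
+ */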
+static const char *can_add_cache(struct cache_sb *sb,
+ struct cache_set *c)
+{
+ if (le16_to_cpu(sb->block_size) != c->sb.block_size)
+ return "mismatched block size";
+
+ if (le16_to_cpu(sb->members[sb->nr_this_dev].bucket_size) <
+ CACHE_SET_BTREE_NODE_SIZE(&c->disk_sb))
+ return "new cache bucket_size is too small";
+
+ return NULL;
+}
+
+static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c)
+{
+ const char *err;
+ bool match;
+
+ err = can_add_cache(sb, c);
+ if (err)
+ return err;
+
+ /*
+ * When attaching an existing device, the cache set superblock must
+ * already contain member_info with a matching UUID
+ */
+ match = le64_to_cpu(sb->seq) <= le64_to_cpu(c->disk_sb.seq)
+ ? (sb->nr_this_dev < c->disk_sb.nr_in_set &&
+ !memcmp(&c->disk_mi[sb->nr_this_dev].uuid,
+ &sb->disk_uuid, sizeof(uuid_le)))
+ : (sb->nr_this_dev < sb->nr_in_set &&
+ !memcmp(&sb->members[sb->nr_this_dev].uuid,
+ &sb->disk_uuid, sizeof(uuid_le)));
+
+ if (!match)
+ return "cache sb does not match set";
+
+ return NULL;
+}
+
+/* Cache device */
+
+bool bch_cache_read_only(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ char buf[BDEVNAME_SIZE];
+
+ bdevname(ca->disk_sb.bdev, buf);
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (ca->mi.state != CACHE_ACTIVE)
+ return false;
+
+ if (!bch_cache_may_remove(ca)) {
+ bch_err(c, "required member %s going RO, forcing fs RO", buf);
+ bch_cache_set_read_only_sync(c);
+ }
+
+ trace_bcache_cache_read_only(ca);
+
+ bch_moving_gc_stop(ca);
+
+ /*
+ * This stops new data writes (e.g. to existing open data
+ * buckets) and then waits for all existing writes to
+ * complete.
+ */
+ bch_cache_allocator_stop(ca);
+
+ bch_cache_group_remove_cache(&c->journal.devs, ca);
+
+ /*
+ * Device data write barrier -- no non-meta-data writes should
+ * occur after this point. However, writes to btree buckets,
+ * journal buckets, and the superblock can still occur.
+ */
+ trace_bcache_cache_read_only_done(ca);
+
+ bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf));
+ bch_notify_cache_read_only(ca);
+
+ SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_RO);
+ bcache_write_super(c);
+ return true;
+}
+
+static const char *__bch_cache_read_write(struct cache_set *c, struct cache *ca)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ if (ca->mi.state == CACHE_ACTIVE)
+ return NULL;
+
+ if (test_bit(CACHE_DEV_REMOVING, &ca->flags))
+ return "removing";
+
+ trace_bcache_cache_read_write(ca);
+
+ if (bch_cache_allocator_start(ca))
+ return "error starting allocator thread";
+
+ if (bch_moving_gc_thread_start(ca))
+ return "error starting moving GC thread";
+
+ bch_cache_group_add_cache(&c->journal.devs, ca);
+
+ wake_up_process(c->tiering_read);
+
+ bch_notify_cache_read_write(ca);
+ trace_bcache_cache_read_write_done(ca);
+
+ return NULL;
+}
+
+const char *bch_cache_read_write(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ const char *err;
+
+ err = __bch_cache_read_write(c, ca);
+ if (err)
+ return err;
+
+ SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_ACTIVE);
+ bcache_write_super(c);
+
+ return NULL;
+}
+
+/*
+ * bch_cache_stop has already returned, so we no longer hold the register
+ * lock at the point this is called.
+ */
+
+void bch_cache_release(struct kobject *kobj)
+{
+ struct cache *ca = container_of(kobj, struct cache, kobj);
+
+ percpu_ref_exit(&ca->ref);
+ kfree(ca);
+}
+
+static void bch_cache_free_work(struct work_struct *work)
+{
+ struct cache *ca = container_of(work, struct cache, free_work);
+ struct cache_set *c = ca->set;
+ unsigned i;
+
+ cancel_work_sync(&ca->io_error_work);
+
+ if (c && c->kobj.state_in_sysfs) {
+ char buf[12];
+
+ sprintf(buf, "cache%u", ca->sb.nr_this_dev);
+ sysfs_remove_link(&c->kobj, buf);
+ }
+
+ if (ca->kobj.state_in_sysfs)
+ kobject_del(&ca->kobj);
+
+ free_super(&ca->disk_sb);
+
+ /*
+ * bch_cache_stop can be called in the middle of initialization
+ * of the struct cache object.
+ * As such, not all the sub-structures may be initialized.
+ * However, they were zeroed when the object was allocated.
+ */
+
+ free_percpu(ca->sectors_written);
+ bioset_exit(&ca->replica_set);
+ free_percpu(ca->bucket_stats_percpu);
+ kfree(ca->journal.bucket_seq);
+ free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
+ kfree(ca->prio_buckets);
+ kfree(ca->bio_prio);
+ kfree(ca->journal.bio);
+ vfree(ca->buckets);
+ vfree(ca->oldest_gens);
+ free_heap(&ca->heap);
+ free_fifo(&ca->free_inc);
+
+ for (i = 0; i < RESERVE_NR; i++)
+ free_fifo(&ca->free[i]);
+
+ kobject_put(&ca->kobj);
+
+ if (c)
+ kobject_put(&c->kobj);
+}
+
+static void bch_cache_percpu_ref_release(struct percpu_ref *ref)
+{
+ struct cache *ca = container_of(ref, struct cache, ref);
+
+ schedule_work(&ca->free_work);
+}
+
+static void bch_cache_free_rcu(struct rcu_head *rcu)
+{
+ struct cache *ca = container_of(rcu, struct cache, free_rcu);
+
+	/*
+	 * This kills ca's percpu refcount: once the outstanding references
+	 * are dropped (in-flight bios take a ref and release it on
+	 * completion/error), bch_cache_percpu_ref_release() is called, which
+	 * schedules bch_cache_free_work(), which in turn ends up calling
+	 * bch_cache_release().
+	 *
+	 * In particular, none of that happens until there are no bios
+	 * outstanding (all the per-cpu refs have been dropped), so it's safe
+	 * to remove the actual sysfs device at that point and report success
+	 * to the user.
+	 */
+
+ percpu_ref_kill(&ca->ref);
+}
+
+static void bch_cache_stop(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (c) {
+ BUG_ON(rcu_access_pointer(c->cache[ca->sb.nr_this_dev]) != ca);
+ rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], NULL);
+ }
+
+ call_rcu(&ca->free_rcu, bch_cache_free_rcu);
+}
+
+static void bch_cache_remove_work(struct work_struct *work)
+{
+ struct cache *ca = container_of(work, struct cache, remove_work);
+ struct cache_set *c = ca->set;
+ char name[BDEVNAME_SIZE];
+ bool force = test_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags);
+ unsigned dev = ca->sb.nr_this_dev;
+
+ bdevname(ca->disk_sb.bdev, name);
+
+ /*
+ * Device should already be RO, now migrate data off:
+ *
+ * XXX: locking is sketchy, bch_cache_read_write() has to check
+ * CACHE_DEV_REMOVING bit
+ */
+ if (!ca->mi.has_data) {
+ /* Nothing to do: */
+ } else if (!bch_move_data_off_device(ca)) {
+ lockdep_assert_held(&bch_register_lock);
+ SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+
+ bcache_write_super(c);
+ } else if (force) {
+ bch_flag_data_bad(ca);
+
+ lockdep_assert_held(&bch_register_lock);
+ SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+
+ bcache_write_super(c);
+ } else {
+ bch_err(c, "Remove of %s failed, unable to migrate data off",
+ name);
+ clear_bit(CACHE_DEV_REMOVING, &ca->flags);
+ return;
+ }
+
+ /* Now metadata: */
+
+ if (!ca->mi.has_metadata) {
+ /* Nothing to do: */
+ } else if (!bch_move_meta_data_off_device(ca)) {
+ lockdep_assert_held(&bch_register_lock);
+ SET_CACHE_HAS_METADATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+
+ bcache_write_super(c);
+ } else {
+ bch_err(c, "Remove of %s failed, unable to migrate metadata off",
+ name);
+ clear_bit(CACHE_DEV_REMOVING, &ca->flags);
+ return;
+ }
+
+ /*
+ * Ok, really doing the remove:
+ * Drop device's prio pointer before removing it from superblock:
+ */
+ bch_notify_cache_removed(ca);
+
+ spin_lock(&c->journal.lock);
+ c->journal.prio_buckets[dev] = 0;
+ spin_unlock(&c->journal.lock);
+
+ bch_journal_meta(&c->journal);
+
+ /*
+ * Stop device before removing it from the cache set's list of devices -
+ * and get our own ref on cache set since ca is going away:
+ */
+ closure_get(&c->cl);
+
+ mutex_lock(&bch_register_lock);
+ bch_cache_stop(ca);
+
+	/*
+	 * RCU barrier between dropping the device from c->cache and dropping
+	 * it from the member info:
+	 */
+ synchronize_rcu();
+
+ lockdep_assert_held(&bch_register_lock);
+
+ /*
+ * Free this device's slot in the cache_member array - all pointers to
+ * this device must be gone:
+ */
+ memset(&c->disk_mi[dev].uuid, 0, sizeof(c->disk_mi[dev].uuid));
+
+ bcache_write_super(c);
+ mutex_unlock(&bch_register_lock);
+
+ closure_put(&c->cl);
+}
+
+bool bch_cache_remove(struct cache *ca, bool force)
+{
+ mutex_lock(&bch_register_lock);
+
+	if (test_bit(CACHE_DEV_REMOVING, &ca->flags)) {
+		mutex_unlock(&bch_register_lock);
+		return false;
+	}
+
+ if (!bch_cache_may_remove(ca)) {
+ bch_err(ca->set, "Can't remove last device in tier %u",
+ ca->mi.tier);
+ bch_notify_cache_remove_failed(ca);
+		mutex_unlock(&bch_register_lock);
+		return false;
+ }
+
+ /* First, go RO before we try to migrate data off: */
+ bch_cache_read_only(ca);
+
+ if (force)
+ set_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags);
+ set_bit(CACHE_DEV_REMOVING, &ca->flags);
+ bch_notify_cache_removing(ca);
+
+ mutex_unlock(&bch_register_lock);
+
+ /* Migrate the data and finish removal asynchronously: */
+
+ queue_work(system_long_wq, &ca->remove_work);
+ return true;
+}
+
+static int bch_cache_online(struct cache *ca)
+{
+ char buf[12];
+
+ lockdep_assert_held(&bch_register_lock);
+
+ sprintf(buf, "cache%u", ca->sb.nr_this_dev);
+
+ if (kobject_add(&ca->kobj,
+ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
+ "bcache") ||
+ sysfs_create_link(&ca->kobj, &ca->set->kobj, "set") ||
+ sysfs_create_link(&ca->set->kobj, &ca->kobj, buf))
+ return -1;
+
+ return 0;
+}
+
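+/*
+ * Allocate and initialize a struct cache for a member device, and attach it
+ * to the given cache set:
+ */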
+static const char *cache_alloc(struct bcache_superblock *sb,
+ struct cache_set *c,
+ struct cache **ret)
+{
+ size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
+ size_t heap_size;
+ unsigned i, journal_entry_pages;
+ const char *err = "cannot allocate memory";
+ struct cache *ca;
+
+ if (c->sb.nr_in_set == 1)
+ bdevname(sb->bdev, c->name);
+
+ if (cache_set_init_fault("cache_alloc"))
+ return err;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ return err;
+
+ if (percpu_ref_init(&ca->ref, bch_cache_percpu_ref_release,
+ 0, GFP_KERNEL)) {
+ kfree(ca);
+ return err;
+ }
+
+ kobject_init(&ca->kobj, &bch_cache_ktype);
+
+ spin_lock_init(&ca->self.lock);
+ ca->self.nr_devices = 1;
+ rcu_assign_pointer(ca->self.d[0].dev, ca);
+ ca->sb.nr_this_dev = sb->sb->nr_this_dev;
+
+ INIT_WORK(&ca->free_work, bch_cache_free_work);
+ INIT_WORK(&ca->remove_work, bch_cache_remove_work);
+ spin_lock_init(&ca->freelist_lock);
+ spin_lock_init(&ca->prio_buckets_lock);
+ mutex_init(&ca->heap_lock);
+ bch_moving_init_cache(ca);
+
+ ca->disk_sb = *sb;
+ ca->disk_sb.bdev->bd_holder = ca;
+ memset(sb, 0, sizeof(*sb));
+
+ INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
+
+ err = "dynamic fault";
+ if (cache_set_init_fault("cache_alloc"))
+ goto err;
+
+ ca->mi = cache_mi_to_cpu_mi(ca->disk_sb.sb->members +
+ ca->disk_sb.sb->nr_this_dev);
+ ca->bucket_bits = ilog2(ca->mi.bucket_size);
+
+ /* XXX: tune these */
+ movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
+ reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
+ /*
+ * free_inc must be smaller than the copygc reserve: if it was bigger,
+ * one copygc iteration might not make enough buckets available to fill
+ * up free_inc and allow the allocator to make forward progress
+ */
+ free_inc_reserve = movinggc_reserve / 2;
+ heap_size = movinggc_reserve * 8;
+
+ journal_entry_pages =
+ DIV_ROUND_UP(1U << CACHE_SET_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
+ PAGE_SECTORS);
+
+ if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
+ !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
+ !init_fifo(&ca->free[RESERVE_MOVINGGC],
+ movinggc_reserve, GFP_KERNEL) ||
+ !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
+ !init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) ||
+ !init_heap(&ca->heap, heap_size, GFP_KERNEL) ||
+ !(ca->oldest_gens = vzalloc(sizeof(u8) *
+ ca->mi.nbuckets)) ||
+ !(ca->buckets = vzalloc(sizeof(struct bucket) *
+ ca->mi.nbuckets)) ||
+ !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
+ 2, GFP_KERNEL)) ||
+ !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
+ !(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) ||
+ !(ca->journal.bucket_seq = kcalloc(bch_nr_journal_buckets(ca->disk_sb.sb),
+ sizeof(u64), GFP_KERNEL)) ||
+ !(ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages)) ||
+ !(ca->bio_prio = bio_kmalloc(GFP_KERNEL, bucket_pages(ca))) ||
+ bioset_init(&ca->replica_set, 4,
+ offsetof(struct bch_write_bio, bio)) ||
+ !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
+ goto err;
+
+ ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
+
+ total_reserve = ca->free_inc.size;
+ for (i = 0; i < RESERVE_NR; i++)
+ total_reserve += ca->free[i].size;
+ pr_debug("%zu buckets reserved", total_reserve);
+
+ ca->copygc_write_point.group = &ca->self;
+ ca->tiering_write_point.group = &ca->self;
+
+ kobject_get(&c->kobj);
+ ca->set = c;
+
+ kobject_get(&ca->kobj);
+ rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], ca);
+
+ if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb.seq))
+ cache_sb_to_cache_set(c, ca->disk_sb.sb);
+
+	/*
+	 * Increase the journal write delay if flushes to this device are
+	 * expensive:
+	 */
+ if (!blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)) &&
+ journal_flushes_device(ca))
+ c->journal.write_delay_ms =
+ max(c->journal.write_delay_ms, 1000U);
+
+ err = "error creating kobject";
+ if (c->kobj.state_in_sysfs &&
+ bch_cache_online(ca))
+ goto err;
+
+ if (ret)
+ *ret = ca;
+ else
+ kobject_put(&ca->kobj);
+ return NULL;
+err:
+ bch_cache_stop(ca);
+ return err;
+}
+
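+/*
+ * Look up a registered cache set by set UUID; caller must hold
+ * bch_register_lock:
+ */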
+static struct cache_set *cache_set_lookup(uuid_le uuid)
+{
+ struct cache_set *c;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ list_for_each_entry(c, &bch_cache_sets, list)
+ if (!memcmp(&c->disk_sb.set_uuid, &uuid, sizeof(uuid_le)))
+ return c;
+
+ return NULL;
+}
+
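+/*
+ * Register a single cache device: attach it to an existing cache set with a
+ * matching set UUID if one is already registered, otherwise allocate a new
+ * cache set. The cache set is run once all of its member devices are present:
+ */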
+static const char *register_cache(struct bcache_superblock *sb,
+ struct cache_set_opts opts)
+{
+ char name[BDEVNAME_SIZE];
+ const char *err = "cannot allocate memory";
+ struct cache_set *c;
+
+ err = validate_cache_super(sb);
+ if (err)
+ return err;
+
+ bdevname(sb->bdev, name);
+
+ c = cache_set_lookup(sb->sb->set_uuid);
+ if (c) {
+ if ((err = (can_attach_cache(sb->sb, c) ?:
+ cache_alloc(sb, c, NULL))))
+ return err;
+
+ if (cache_set_nr_online_devices(c) == cache_set_nr_devices(c)) {
+ err = run_cache_set(c);
+ if (err)
+ return err;
+ }
+ goto out;
+ }
+
+ c = bch_cache_set_alloc(sb->sb, opts);
+ if (!c)
+ return err;
+
+ err = cache_alloc(sb, c, NULL);
+ if (err)
+ goto err_stop;
+
+ if (cache_set_nr_online_devices(c) == cache_set_nr_devices(c)) {
+ err = run_cache_set(c);
+ if (err)
+ goto err_stop;
+ }
+
+ err = "error creating kobject";
+ if (bch_cache_set_online(c))
+ goto err_stop;
+out:
+
+ bch_info(c, "started");
+ return NULL;
+err_stop:
+ bch_cache_set_stop(c);
+ return err;
+}
+
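+/*
+ * Hot-add a device to a running cache set: find a free slot in the member
+ * info, grow the superblock if necessary, then bring the new device online:
+ */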
+int bch_cache_set_add_cache(struct cache_set *c, const char *path)
+{
+ struct bcache_superblock sb;
+ const char *err;
+ struct cache *ca;
+ struct cache_member *new_mi = NULL;
+ struct cache_member mi;
+ unsigned nr_this_dev, nr_in_set, u64s;
+ int ret = -EINVAL;
+
+ mutex_lock(&bch_register_lock);
+
+ err = read_super(&sb, path);
+ if (err)
+ goto err_unlock;
+
+ err = validate_cache_super(&sb);
+ if (err)
+ goto err_unlock;
+
+ err = can_add_cache(sb.sb, c);
+ if (err)
+ goto err_unlock;
+
+ /*
+ * Preserve the old cache member information (esp. tier)
+ * before we start bashing the disk stuff.
+ */
+ mi = sb.sb->members[sb.sb->nr_this_dev];
+ mi.last_mount = cpu_to_le64(ktime_get_seconds());
+
+ down_read(&c->gc_lock);
+
+ if (dynamic_fault("bcache:add:no_slot"))
+ goto no_slot;
+
+ if (test_bit(CACHE_SET_GC_FAILURE, &c->flags))
+ goto no_slot;
+
+ for (nr_this_dev = 0; nr_this_dev < MAX_CACHES_PER_SET; nr_this_dev++)
+ if (nr_this_dev >= c->sb.nr_in_set ||
+ bch_is_zero(c->disk_mi[nr_this_dev].uuid.b,
+ sizeof(uuid_le)))
+ goto have_slot;
+no_slot:
+ up_read(&c->gc_lock);
+
+ err = "no slots available in superblock";
+ ret = -ENOSPC;
+ goto err_unlock;
+
+have_slot:
+ nr_in_set = max_t(unsigned, nr_this_dev + 1, c->sb.nr_in_set);
+ up_read(&c->gc_lock);
+
+ u64s = nr_in_set * (sizeof(struct cache_member) / sizeof(u64));
+ err = "no space in superblock for member info";
+ if (bch_super_realloc(&sb, u64s))
+ goto err_unlock;
+
+ new_mi = dynamic_fault("bcache:add:member_info_realloc")
+ ? NULL
+ : kmalloc(sizeof(struct cache_member) * nr_in_set,
+ GFP_KERNEL);
+ if (!new_mi) {
+ err = "cannot allocate memory";
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ memcpy(new_mi, c->disk_mi,
+ sizeof(struct cache_member) * nr_in_set);
+ new_mi[nr_this_dev] = mi;
+
+ sb.sb->nr_this_dev = nr_this_dev;
+ sb.sb->nr_in_set = nr_in_set;
+ sb.sb->u64s = cpu_to_le16(u64s);
+ memcpy(sb.sb->members, new_mi,
+ sizeof(struct cache_member) * nr_in_set);
+
+ if (cache_set_mi_update(c, new_mi, nr_in_set)) {
+ err = "cannot allocate memory";
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ /* commit new member info */
+ swap(c->disk_mi, new_mi);
+ kfree(new_mi);
+ new_mi = NULL;
+ c->disk_sb.nr_in_set = nr_in_set;
+ c->sb.nr_in_set = nr_in_set;
+
+ err = cache_alloc(&sb, c, &ca);
+ if (err)
+ goto err_unlock;
+
+ bcache_write_super(c);
+
+ err = "journal alloc failed";
+ if (bch_cache_journal_alloc(ca))
+ goto err_put;
+
+ bch_notify_cache_added(ca);
+
+ if (ca->mi.state == CACHE_ACTIVE) {
+ err = __bch_cache_read_write(c, ca);
+ if (err)
+ goto err_put;
+ }
+
+ kobject_put(&ca->kobj);
+ mutex_unlock(&bch_register_lock);
+ return 0;
+err_put:
+ bch_cache_stop(ca);
+err_unlock:
+ kfree(new_mi);
+ free_super(&sb);
+ mutex_unlock(&bch_register_lock);
+
+ bch_err(c, "Unable to add device: %s", err);
+ return ret ?: -EINVAL;
+}
+
+const char *bch_register_cache_set(char * const *devices, unsigned nr_devices,
+ struct cache_set_opts opts,
+ struct cache_set **ret)
+{
+ const char *err;
+ struct cache_set *c = NULL;
+ struct bcache_superblock *sb;
+ uuid_le uuid;
+ unsigned i;
+
+ memset(&uuid, 0, sizeof(uuid_le));
+
+ if (!nr_devices)
+ return "need at least one device";
+
+ if (!try_module_get(THIS_MODULE))
+ return "module unloading";
+
+ err = "cannot allocate memory";
+ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
+ if (!sb)
+ goto err;
+
+ /*
+ * read_super() needs to happen under register_lock, so that the
+ * exclusive open is atomic with adding the new cache set to the list of
+ * cache sets:
+ */
+ mutex_lock(&bch_register_lock);
+
+ for (i = 0; i < nr_devices; i++) {
+ err = read_super(&sb[i], devices[i]);
+ if (err)
+ goto err_unlock;
+
+ err = "attempting to register backing device";
+ if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
+ goto err_unlock;
+
+ err = validate_cache_super(&sb[i]);
+ if (err)
+ goto err_unlock;
+ }
+
+ err = "cache set already registered";
+ if (cache_set_lookup(sb->sb->set_uuid))
+ goto err_unlock;
+
+ err = "cannot allocate memory";
+ c = bch_cache_set_alloc(sb[0].sb, opts);
+ if (!c)
+ goto err_unlock;
+
+ for (i = 0; i < nr_devices; i++) {
+ err = cache_alloc(&sb[i], c, NULL);
+ if (err)
+ goto err_unlock;
+ }
+
+ err = "insufficient devices";
+ if (cache_set_nr_online_devices(c) != cache_set_nr_devices(c))
+ goto err_unlock;
+
+ err = run_cache_set(c);
+ if (err)
+ goto err_unlock;
+
+ err = "error creating kobject";
+ if (bch_cache_set_online(c))
+ goto err_unlock;
+
+ if (ret) {
+ closure_get(&c->cl);
+ *ret = c;
+ }
+
+ mutex_unlock(&bch_register_lock);
+
+ err = NULL;
+out:
+ kfree(sb);
+ module_put(THIS_MODULE);
+ return err;
+err_unlock:
+ if (c)
+ bch_cache_set_stop(c);
+ mutex_unlock(&bch_register_lock);
+err:
+	if (sb)
+		for (i = 0; i < nr_devices; i++)
+			free_super(&sb[i]);
+ goto out;
+}
+
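+/*
+ * Register a single device by path - dispatches to the backing device or
+ * cache device code depending on the superblock version:
+ */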
+const char *bch_register_one(const char *path)
+{
+ struct bcache_superblock sb;
+ const char *err;
+
+ mutex_lock(&bch_register_lock);
+
+ err = read_super(&sb, path);
+ if (err)
+ goto err;
+
+ if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
+ err = bch_backing_dev_register(&sb);
+ else
+ err = register_cache(&sb, cache_set_opts_empty());
+
+ free_super(&sb);
+err:
+ mutex_unlock(&bch_register_lock);
+ return err;
+}
+
+/* Global interfaces/init */
+
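+/*
+ * Devices are registered from userspace by writing a device path to the sysfs
+ * register file created below under the "bcache" kset, e.g.:
+ *
+ *	echo /dev/sdb > /sys/fs/bcache/register
+ */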
+#define kobj_attribute_write(n, fn) \
+ static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
+
+#define kobj_attribute_rw(n, show, store) \
+ static struct kobj_attribute ksysfs_##n = \
+ __ATTR(n, S_IWUSR|S_IRUSR, show, store)
+
+static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
+ const char *, size_t);
+
+kobj_attribute_write(register, register_bcache);
+kobj_attribute_write(register_quiet, register_bcache);
+
+static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ const char *buffer, size_t size)
+{
+ ssize_t ret = -EINVAL;
+ const char *err = "cannot allocate memory";
+ char *path = NULL;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EBUSY;
+
+ if (!(path = kstrndup(skip_spaces(buffer), size, GFP_KERNEL)))
+ goto err;
+
+ err = bch_register_one(strim(path));
+ if (err)
+ goto err;
+
+ ret = size;
+out:
+ kfree(path);
+ module_put(THIS_MODULE);
+ return ret;
+err:
+ pr_err("error opening %s: %s", path, err);
+ goto out;
+}
+
+static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
+{
+ if (code == SYS_DOWN ||
+ code == SYS_HALT ||
+ code == SYS_POWER_OFF) {
+ struct cache_set *c;
+
+ mutex_lock(&bch_register_lock);
+
+ if (!list_empty(&bch_cache_sets))
+ pr_info("Setting all devices read only:");
+
+ list_for_each_entry(c, &bch_cache_sets, list)
+ bch_cache_set_read_only(c);
+
+ list_for_each_entry(c, &bch_cache_sets, list)
+ bch_cache_set_read_only_sync(c);
+
+ mutex_unlock(&bch_register_lock);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block reboot = {
+ .notifier_call = bcache_reboot,
+ .priority = INT_MAX, /* before any real devices */
+};
+
+static ssize_t reboot_test(struct kobject *k, struct kobj_attribute *attr,
+ const char *buffer, size_t size)
+{
+ bcache_reboot(NULL, SYS_DOWN, NULL);
+ return size;
+}
+
+kobj_attribute_write(reboot, reboot_test);
+
+static void bcache_exit(void)
+{
+ bch_debug_exit();
+ bch_fs_exit();
+ bch_blockdev_exit();
+ if (bcache_kset)
+ kset_unregister(bcache_kset);
+ if (bcache_io_wq)
+ destroy_workqueue(bcache_io_wq);
+ if (!IS_ERR_OR_NULL(bch_chardev_class))
+ device_destroy(bch_chardev_class,
+ MKDEV(bch_chardev_major, 0));
+ if (!IS_ERR_OR_NULL(bch_chardev_class))
+ class_destroy(bch_chardev_class);
+ if (bch_chardev_major > 0)
+ unregister_chrdev(bch_chardev_major, "bcache");
+ if (!IS_ERR_OR_NULL(bch_sha1))
+ crypto_free_shash(bch_sha1);
+ unregister_reboot_notifier(&reboot);
+}
+
+static int __init bcache_init(void)
+{
+ static const struct attribute *files[] = {
+ &ksysfs_register.attr,
+ &ksysfs_register_quiet.attr,
+ &ksysfs_reboot.attr,
+ NULL
+ };
+
+ mutex_init(&bch_register_lock);
+ register_reboot_notifier(&reboot);
+ closure_debug_init();
+ bkey_pack_test();
+
+ bch_sha1 = crypto_alloc_shash("sha1", 0, 0);
+ if (IS_ERR(bch_sha1))
+ goto err;
+
+ bch_chardev_major = register_chrdev(0, "bcache-ctl", &bch_chardev_fops);
+ if (bch_chardev_major < 0)
+ goto err;
+
+ bch_chardev_class = class_create(THIS_MODULE, "bcache");
+ if (IS_ERR(bch_chardev_class))
+ goto err;
+
+ bch_chardev = device_create(bch_chardev_class, NULL,
+ MKDEV(bch_chardev_major, 255),
+ NULL, "bcache-ctl");
+ if (IS_ERR(bch_chardev))
+ goto err;
+
+ if (!(bcache_io_wq = create_freezable_workqueue("bcache_io")) ||
+ !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) ||
+ sysfs_create_files(&bcache_kset->kobj, files) ||
+ bch_blockdev_init() ||
+ bch_fs_init() ||
+ bch_debug_init())
+ goto err;
+
+ return 0;
+err:
+ bcache_exit();
+ return -ENOMEM;
+}
+
+#define BCH_DEBUG_PARAM(name, description) \
+ bool bch_##name; \
+ module_param_named(name, bch_##name, bool, 0644); \
+ MODULE_PARM_DESC(name, description);
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+module_exit(bcache_exit);
+module_init(bcache_init);
diff --git a/libbcache/super.h b/libbcache/super.h
new file mode 100644
index 0000000..635e1a6
--- /dev/null
+++ b/libbcache/super.h
@@ -0,0 +1,160 @@
+#ifndef _BCACHE_SUPER_H
+#define _BCACHE_SUPER_H
+
+#include "extents.h"
+
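+/* Conversions between 512-byte sectors and this device's buckets: */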
+static inline size_t sector_to_bucket(const struct cache *ca, sector_t s)
+{
+ return s >> ca->bucket_bits;
+}
+
+static inline sector_t bucket_to_sector(const struct cache *ca, size_t b)
+{
+ return ((sector_t) b) << ca->bucket_bits;
+}
+
+static inline sector_t bucket_remainder(const struct cache *ca, sector_t s)
+{
+ return s & (ca->mi.bucket_size - 1);
+}
+
+#define cache_member_info_get(_c) \
+ (rcu_read_lock(), rcu_dereference((_c)->members))
+
+#define cache_member_info_put() rcu_read_unlock()
+
+static inline struct cache *bch_next_cache_rcu(struct cache_set *c,
+ unsigned *iter)
+{
+ struct cache *ret = NULL;
+
+ while (*iter < c->sb.nr_in_set &&
+ !(ret = rcu_dereference(c->cache[*iter])))
+ (*iter)++;
+
+ return ret;
+}
+
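+/* Iterate over member devices; caller must hold rcu_read_lock(): */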
+#define for_each_cache_rcu(ca, c, iter) \
+ for ((iter) = 0; ((ca) = bch_next_cache_rcu((c), &(iter))); (iter)++)
+
+static inline struct cache *bch_get_next_cache(struct cache_set *c,
+ unsigned *iter)
+{
+ struct cache *ret;
+
+ rcu_read_lock();
+ if ((ret = bch_next_cache_rcu(c, iter)))
+ percpu_ref_get(&ret->ref);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/*
+ * If you break early, you must drop your ref on the current cache
+ */
+#define for_each_cache(ca, c, iter) \
+ for ((iter) = 0; \
+ (ca = bch_get_next_cache(c, &(iter))); \
+ percpu_ref_put(&ca->ref), (iter)++)
+
+void bch_check_mark_super_slowpath(struct cache_set *,
+ const struct bkey_i *, bool);
+
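+/*
+ * Check whether every device that this key has a dirty pointer to is already
+ * marked in the superblock member info as having data (or metadata, if @meta):
+ */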
+static inline bool bch_check_super_marked(struct cache_set *c,
+ const struct bkey_i *k, bool meta)
+{
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct cache_member_cpu *mi = cache_member_info_get(c)->m;
+ bool ret = true;
+
+ extent_for_each_ptr(e, ptr)
+ if (!(meta
+ ? mi[ptr->dev].has_metadata
+ : mi[ptr->dev].has_data) &&
+ bch_extent_ptr_is_dirty(c, e, ptr)) {
+ ret = false;
+ break;
+ }
+
+ cache_member_info_put();
+
+ return ret;
+}
+
+static inline void bch_check_mark_super(struct cache_set *c,
+ const struct bkey_i *k, bool meta)
+{
+ if (bch_check_super_marked(c, k, meta))
+ return;
+
+ bch_check_mark_super_slowpath(c, k, meta);
+}
+
+static inline bool bch_cache_may_remove(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+
+ /*
+ * Right now, we can't remove the last device from a tier,
+ * - For tier 0, because all metadata lives in tier 0 and because
+ * there is no way to have foreground writes go directly to tier 1.
+ * - For tier 1, because the code doesn't completely support an
+ * empty tier 1.
+ */
+
+ /*
+ * Turning a device read-only removes it from the cache group,
+ * so there may only be one read-write device in a tier, and yet
+ * the device we are removing is in the same tier, so we have
+ * to check for identity.
+ * Removing the last RW device from a tier requires turning the
+ * whole cache set RO.
+ */
+
+ return tier->nr_devices != 1 ||
+ rcu_access_pointer(tier->d[0].dev) != ca;
+}
+
+void free_super(struct bcache_superblock *);
+int bch_super_realloc(struct bcache_superblock *, unsigned);
+void bcache_write_super(struct cache_set *);
+void __write_super(struct cache_set *, struct bcache_superblock *);
+
+void bch_cache_set_release(struct kobject *);
+void bch_cache_release(struct kobject *);
+
+void bch_cache_set_unregister(struct cache_set *);
+void bch_cache_set_stop(struct cache_set *);
+
+const char *bch_register_one(const char *path);
+const char *bch_register_cache_set(char * const *, unsigned,
+ struct cache_set_opts,
+ struct cache_set **);
+
+bool bch_cache_set_read_only(struct cache_set *);
+bool bch_cache_set_emergency_read_only(struct cache_set *);
+void bch_cache_set_read_only_sync(struct cache_set *);
+const char *bch_cache_set_read_write(struct cache_set *);
+
+bool bch_cache_read_only(struct cache *);
+const char *bch_cache_read_write(struct cache *);
+bool bch_cache_remove(struct cache *, bool force);
+int bch_cache_set_add_cache(struct cache_set *, const char *);
+
+extern struct mutex bch_register_lock;
+extern struct list_head bch_cache_sets;
+extern struct idr bch_cache_set_minor;
+extern struct workqueue_struct *bcache_io_wq;
+extern struct crypto_shash *bch_sha1;
+
+extern struct kobj_type bch_cache_set_ktype;
+extern struct kobj_type bch_cache_set_internal_ktype;
+extern struct kobj_type bch_cache_set_time_stats_ktype;
+extern struct kobj_type bch_cache_set_opts_dir_ktype;
+extern struct kobj_type bch_cache_ktype;
+
+#endif /* _BCACHE_SUPER_H */
diff --git a/libbcache/super_types.h b/libbcache/super_types.h
new file mode 100644
index 0000000..d89f780
--- /dev/null
+++ b/libbcache/super_types.h
@@ -0,0 +1,11 @@
+#ifndef _BCACHE_SUPER_TYPES_H
+#define _BCACHE_SUPER_TYPES_H
+
+struct bcache_superblock {
+ struct cache_sb *sb;
+ struct block_device *bdev;
+ struct bio *bio;
+ unsigned page_order;
+};
+
+#endif /* _BCACHE_SUPER_TYPES_H */
diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c
new file mode 100644
index 0000000..40d006b
--- /dev/null
+++ b/libbcache/sysfs.c
@@ -0,0 +1,1397 @@
+/*
+ * bcache sysfs interfaces
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "alloc.h"
+#include "blockdev.h"
+#include "sysfs.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "inode.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "opts.h"
+#include "request.h"
+#include "writeback.h"
+
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+
+static const char * const cache_replacement_policies[] = {
+ "lru",
+ "fifo",
+ "random",
+ NULL
+};
+
+/* Default is -1; we skip past it for struct cached_dev's cache mode */
+static const char * const bch_cache_modes[] = {
+ "default",
+ "writethrough",
+ "writeback",
+ "writearound",
+ "none",
+ NULL
+};
+
+static const char * const bch_cache_state[] = {
+ "active",
+ "readonly",
+ "failed",
+ "spare",
+ NULL
+};
+
+write_attribute(attach);
+write_attribute(detach);
+write_attribute(unregister);
+write_attribute(stop);
+write_attribute(clear_stats);
+write_attribute(trigger_btree_coalesce);
+write_attribute(trigger_gc);
+write_attribute(prune_cache);
+write_attribute(blockdev_volume_create);
+write_attribute(add_device);
+
+read_attribute(uuid);
+read_attribute(minor);
+read_attribute(bucket_size);
+read_attribute(bucket_size_bytes);
+read_attribute(block_size);
+read_attribute(block_size_bytes);
+read_attribute(btree_node_size);
+read_attribute(btree_node_size_bytes);
+read_attribute(first_bucket);
+read_attribute(nbuckets);
+read_attribute(tree_depth);
+read_attribute(root_usage_percent);
+read_attribute(read_priority_stats);
+read_attribute(write_priority_stats);
+read_attribute(fragmentation_stats);
+read_attribute(oldest_gen_stats);
+read_attribute(reserve_stats);
+read_attribute(btree_cache_size);
+read_attribute(cache_available_percent);
+read_attribute(compression_stats);
+read_attribute(written);
+read_attribute(btree_written);
+read_attribute(metadata_written);
+read_attribute(journal_debug);
+write_attribute(journal_flush);
+read_attribute(internal_uuid);
+
+read_attribute(btree_gc_running);
+
+read_attribute(btree_nodes);
+read_attribute(btree_used_percent);
+read_attribute(average_key_size);
+read_attribute(available_buckets);
+read_attribute(free_buckets);
+read_attribute(dirty_data);
+read_attribute(dirty_bytes);
+read_attribute(dirty_buckets);
+read_attribute(cached_data);
+read_attribute(cached_bytes);
+read_attribute(cached_buckets);
+read_attribute(meta_buckets);
+read_attribute(alloc_buckets);
+read_attribute(has_data);
+read_attribute(has_metadata);
+read_attribute(bset_tree_stats);
+read_attribute(alloc_debug);
+
+read_attribute(state);
+read_attribute(cache_read_races);
+read_attribute(writeback_keys_done);
+read_attribute(writeback_keys_failed);
+read_attribute(io_errors);
+rw_attribute(io_error_limit);
+rw_attribute(io_error_halflife);
+read_attribute(congested);
+rw_attribute(congested_read_threshold_us);
+rw_attribute(congested_write_threshold_us);
+
+rw_attribute(sequential_cutoff);
+rw_attribute(cache_mode);
+rw_attribute(writeback_metadata);
+rw_attribute(writeback_running);
+rw_attribute(writeback_percent);
+sysfs_pd_controller_attribute(writeback);
+
+read_attribute(stripe_size);
+read_attribute(partial_stripes_expensive);
+
+rw_attribute(journal_write_delay_ms);
+rw_attribute(journal_reclaim_delay_ms);
+read_attribute(journal_entry_size_max);
+
+rw_attribute(discard);
+rw_attribute(running);
+rw_attribute(label);
+rw_attribute(readahead);
+rw_attribute(verify);
+rw_attribute(bypass_torture_test);
+rw_attribute(cache_replacement_policy);
+
+rw_attribute(foreground_write_ratelimit_enabled);
+rw_attribute(copy_gc_enabled);
+sysfs_pd_controller_attribute(copy_gc);
+rw_attribute(tiering_enabled);
+rw_attribute(tiering_percent);
+sysfs_pd_controller_attribute(tiering);
+
+sysfs_pd_controller_attribute(foreground_write);
+
+rw_attribute(pd_controllers_update_seconds);
+
+rw_attribute(foreground_target_percent);
+
+rw_attribute(size);
+read_attribute(meta_replicas_have);
+read_attribute(data_replicas_have);
+read_attribute(tier);
+
+#define BCH_DEBUG_PARAM(name, description) \
+ rw_attribute(name);
+
+ BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ static struct attribute sysfs_opt_##_name = { \
+ .name = #_name, \
+ .mode = S_IRUGO|(_perm ? S_IWUSR : 0) \
+ };
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ sysfs_time_stats_attribute(name, frequency_units, duration_units);
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+
+static struct attribute sysfs_state_rw = {
+ .name = "state",
+ .mode = S_IRUGO|S_IWUSR
+};
+
+SHOW(bch_cached_dev)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+ const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
+
+#define var(stat) (dc->stat)
+
+ if (attr == &sysfs_cache_mode)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ bch_cache_modes + 1,
+ BDEV_CACHE_MODE(dc->disk_sb.sb));
+
+ var_printf(verify, "%i");
+ var_printf(bypass_torture_test, "%i");
+ var_printf(writeback_metadata, "%i");
+ var_printf(writeback_running, "%i");
+ var_print(writeback_percent);
+ sysfs_pd_controller_show(writeback, &dc->writeback_pd);
+
+ sysfs_hprint(dirty_data,
+ bcache_dev_sectors_dirty(&dc->disk) << 9);
+ sysfs_print(dirty_bytes,
+ bcache_dev_sectors_dirty(&dc->disk) << 9);
+
+ sysfs_hprint(stripe_size, dc->disk.stripe_size << 9);
+ var_printf(partial_stripes_expensive, "%u");
+
+ var_hprint(sequential_cutoff);
+ var_hprint(readahead);
+
+ sysfs_print(running, atomic_read(&dc->running));
+ sysfs_print(state, states[BDEV_STATE(dc->disk_sb.sb)]);
+
+ if (attr == &sysfs_label) {
+ memcpy(buf, dc->disk_sb.sb->label, SB_LABEL_SIZE);
+		buf[SB_LABEL_SIZE] = '\0';
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
+#undef var
+ return 0;
+}
+
+STORE(__cached_dev)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+ unsigned v = size;
+ struct cache_set *c;
+ struct kobj_uevent_env *env;
+
+#define d_strtoul(var) sysfs_strtoul(var, dc->var)
+#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
+#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
+
+ d_strtoul(verify);
+ d_strtoul(bypass_torture_test);
+ d_strtoul(writeback_metadata);
+ d_strtoul(writeback_running);
+ sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+ sysfs_pd_controller_store(writeback, &dc->writeback_pd);
+
+ d_strtoi_h(sequential_cutoff);
+ d_strtoi_h(readahead);
+
+ if (attr == &sysfs_clear_stats)
+ bch_cache_accounting_clear(&dc->accounting);
+
+ if (attr == &sysfs_running &&
+ strtoul_or_return(buf))
+ bch_cached_dev_run(dc);
+
+ if (attr == &sysfs_cache_mode) {
+ ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+
+ if (v < 0)
+ return v;
+
+ if ((unsigned) v != BDEV_CACHE_MODE(dc->disk_sb.sb)) {
+ SET_BDEV_CACHE_MODE(dc->disk_sb.sb, v);
+ bch_write_bdev_super(dc, NULL);
+ }
+ }
+
+ if (attr == &sysfs_label) {
+ u64 journal_seq = 0;
+ int ret = 0;
+
+ if (size > SB_LABEL_SIZE)
+ return -EINVAL;
+
+ mutex_lock(&dc->disk.inode_lock);
+
+ memcpy(dc->disk_sb.sb->label, buf, size);
+ if (size < SB_LABEL_SIZE)
+ dc->disk_sb.sb->label[size] = '\0';
+ if (size && dc->disk_sb.sb->label[size - 1] == '\n')
+ dc->disk_sb.sb->label[size - 1] = '\0';
+
+ memcpy(dc->disk.inode.v.i_label,
+ dc->disk_sb.sb->label, SB_LABEL_SIZE);
+
+ bch_write_bdev_super(dc, NULL);
+
+ if (dc->disk.c)
+ ret = bch_inode_update(dc->disk.c, &dc->disk.inode.k_i,
+ &journal_seq);
+
+ mutex_unlock(&dc->disk.inode_lock);
+
+ if (ret)
+ return ret;
+
+ if (dc->disk.c)
+ ret = bch_journal_flush_seq(&dc->disk.c->journal,
+ journal_seq);
+ if (ret)
+ return ret;
+
+ env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+ add_uevent_var(env, "DRIVER=bcache");
+		add_uevent_var(env, "CACHED_UUID=%pU", dc->disk_sb.sb->disk_uuid.b);
+ add_uevent_var(env, "CACHED_LABEL=%s", buf);
+ kobject_uevent_env(
+ &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
+ kfree(env);
+ }
+
+ if (attr == &sysfs_attach) {
+ if (uuid_parse(buf, &dc->disk_sb.sb->user_uuid))
+ return -EINVAL;
+
+ list_for_each_entry(c, &bch_cache_sets, list) {
+ v = bch_cached_dev_attach(dc, c);
+ if (!v)
+ return size;
+ }
+
+ pr_err("Can't attach %s: cache set not found", buf);
+ size = v;
+ }
+
+ if (attr == &sysfs_detach && dc->disk.c)
+ bch_cached_dev_detach(dc);
+
+ if (attr == &sysfs_stop)
+ bch_blockdev_stop(&dc->disk);
+
+ return size;
+}
+
+STORE(bch_cached_dev)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+
+ mutex_lock(&bch_register_lock);
+ size = __cached_dev_store(kobj, attr, buf, size);
+
+ if (attr == &sysfs_writeback_running)
+ bch_writeback_queue(dc);
+
+ if (attr == &sysfs_writeback_percent)
+ schedule_delayed_work(&dc->writeback_pd_update,
+ dc->writeback_pd_update_seconds * HZ);
+
+ mutex_unlock(&bch_register_lock);
+ return size;
+}
+
+static struct attribute *bch_cached_dev_files[] = {
+ &sysfs_attach,
+ &sysfs_detach,
+ &sysfs_stop,
+ &sysfs_cache_mode,
+ &sysfs_writeback_metadata,
+ &sysfs_writeback_running,
+ &sysfs_writeback_percent,
+ sysfs_pd_controller_files(writeback),
+ &sysfs_dirty_data,
+ &sysfs_dirty_bytes,
+ &sysfs_stripe_size,
+ &sysfs_partial_stripes_expensive,
+ &sysfs_sequential_cutoff,
+ &sysfs_clear_stats,
+ &sysfs_running,
+ &sysfs_state,
+ &sysfs_label,
+ &sysfs_readahead,
+#ifdef CONFIG_BCACHE_DEBUG
+ &sysfs_verify,
+ &sysfs_bypass_torture_test,
+#endif
+ NULL
+};
+KTYPE(bch_cached_dev);
+
+SHOW(bch_blockdev_volume)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+
+ sysfs_hprint(size, le64_to_cpu(d->inode.v.i_size));
+
+ if (attr == &sysfs_label) {
+ memcpy(buf, d->inode.v.i_label, SB_LABEL_SIZE);
+		buf[SB_LABEL_SIZE] = '\0';
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
+ return 0;
+}
+
+STORE(__bch_blockdev_volume)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+
+ if (attr == &sysfs_size) {
+ u64 journal_seq = 0;
+ u64 v = strtoi_h_or_return(buf);
+ int ret;
+
+ mutex_lock(&d->inode_lock);
+
+		if (v < le64_to_cpu(d->inode.v.i_size)) {
+ ret = bch_inode_truncate(d->c, d->inode.k.p.inode,
+ v >> 9, NULL, NULL);
+ if (ret) {
+ mutex_unlock(&d->inode_lock);
+ return ret;
+ }
+ }
+ d->inode.v.i_size = cpu_to_le64(v);
+ ret = bch_inode_update(d->c, &d->inode.k_i, &journal_seq);
+
+ mutex_unlock(&d->inode_lock);
+
+ if (ret)
+ return ret;
+
+ ret = bch_journal_flush_seq(&d->c->journal, journal_seq);
+ if (ret)
+ return ret;
+
+ set_capacity(d->disk, v >> 9);
+ }
+
+ if (attr == &sysfs_label) {
+ u64 journal_seq = 0;
+ int ret;
+
+ mutex_lock(&d->inode_lock);
+
+ memcpy(d->inode.v.i_label, buf, SB_LABEL_SIZE);
+ ret = bch_inode_update(d->c, &d->inode.k_i, &journal_seq);
+
+ mutex_unlock(&d->inode_lock);
+
+ return ret ?: bch_journal_flush_seq(&d->c->journal, journal_seq);
+ }
+
+ if (attr == &sysfs_unregister) {
+ set_bit(BCACHE_DEV_DETACHING, &d->flags);
+ bch_blockdev_stop(d);
+ }
+
+ return size;
+}
+STORE_LOCKED(bch_blockdev_volume)
+
+static struct attribute *bch_blockdev_volume_files[] = {
+ &sysfs_unregister,
+ &sysfs_label,
+ &sysfs_size,
+ NULL
+};
+KTYPE(bch_blockdev_volume);
+
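+/* Aggregate bset statistics across all btree nodes currently in the cache: */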
+static int bch_bset_print_stats(struct cache_set *c, char *buf)
+{
+ struct bset_stats stats;
+ size_t nodes = 0;
+ struct btree *b;
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ unsigned iter;
+
+ memset(&stats, 0, sizeof(stats));
+
+ rcu_read_lock();
+ for_each_cached_btree(b, c, tbl, iter, pos) {
+ bch_btree_keys_stats(b, &stats);
+ nodes++;
+ }
+ rcu_read_unlock();
+
+ return snprintf(buf, PAGE_SIZE,
+ "btree nodes: %zu\n"
+ "written sets: %zu\n"
+ "written key bytes: %zu\n"
+ "unwritten sets: %zu\n"
+ "unwritten key bytes: %zu\n"
+ "no table sets: %zu\n"
+ "no table key bytes: %zu\n"
+ "floats: %zu\n"
+ "failed unpacked: %zu\n"
+ "failed prev: %zu\n"
+ "failed overflow: %zu\n",
+ nodes,
+ stats.sets[BSET_RO_AUX_TREE].nr,
+ stats.sets[BSET_RO_AUX_TREE].bytes,
+ stats.sets[BSET_RW_AUX_TREE].nr,
+ stats.sets[BSET_RW_AUX_TREE].bytes,
+ stats.sets[BSET_NO_AUX_TREE].nr,
+ stats.sets[BSET_NO_AUX_TREE].bytes,
+ stats.floats,
+ stats.failed_unpacked,
+ stats.failed_prev,
+ stats.failed_overflow);
+}
+
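+/*
+ * How much of the extents btree root node is in use, as a percentage of the
+ * btree node size:
+ */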
+static unsigned bch_root_usage(struct cache_set *c)
+{
+ unsigned bytes = 0;
+ struct bkey_packed *k;
+ struct btree *b;
+ struct btree_node_iter iter;
+
+ goto lock_root;
+
+ do {
+ six_unlock_read(&b->lock);
+lock_root:
+ b = c->btree_roots[BTREE_ID_EXTENTS].b;
+ six_lock_read(&b->lock);
+ } while (b != c->btree_roots[BTREE_ID_EXTENTS].b);
+
+ for_each_btree_node_key(b, k, &iter, btree_node_is_extents(b))
+ bytes += bkey_bytes(k);
+
+ six_unlock_read(&b->lock);
+
+ return (bytes * 100) / btree_bytes(c);
+}
+
+static size_t bch_cache_size(struct cache_set *c)
+{
+ size_t ret = 0;
+ struct btree *b;
+
+ mutex_lock(&c->btree_cache_lock);
+ list_for_each_entry(b, &c->btree_cache, list)
+ ret += btree_bytes(c);
+
+ mutex_unlock(&c->btree_cache_lock);
+ return ret;
+}
+
+static unsigned bch_cache_available_percent(struct cache_set *c)
+{
+ return div64_u64((u64) sectors_available(c) * 100,
+ c->capacity ?: 1);
+}
+
+#if 0
+static unsigned bch_btree_used(struct cache_set *c)
+{
+ return div64_u64(c->gc_stats.key_bytes * 100,
+ (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+}
+
+static unsigned bch_average_key_size(struct cache_set *c)
+{
+ return c->gc_stats.nkeys
+ ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
+ : 0;
+}
+#endif
+
+static ssize_t show_cache_set_alloc_debug(struct cache_set *c, char *buf)
+{
+ struct bucket_stats_cache_set stats = bch_bucket_stats_read_cache_set(c);
+
+ return scnprintf(buf, PAGE_SIZE,
+ "capacity:\t\t%llu\n"
+ "compressed:\n"
+ "\tmeta:\t\t%llu\n"
+ "\tdirty:\t\t%llu\n"
+ "\tcached:\t\t%llu\n"
+ "uncompressed:\n"
+ "\tmeta:\t\t%llu\n"
+ "\tdirty:\t\t%llu\n"
+ "\tcached:\t\t%llu\n"
+ "persistent reserved sectors:\t%llu\n"
+ "online reserved sectors:\t%llu\n",
+ c->capacity,
+ stats.s[S_COMPRESSED][S_META],
+ stats.s[S_COMPRESSED][S_DIRTY],
+ stats.s[S_COMPRESSED][S_CACHED],
+ stats.s[S_UNCOMPRESSED][S_META],
+ stats.s[S_UNCOMPRESSED][S_DIRTY],
+ stats.s[S_UNCOMPRESSED][S_CACHED],
+ stats.persistent_reserved,
+ stats.online_reserved);
+}
+
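+/*
+ * Walk the extents btree and tally compressed vs. uncompressed extent sizes;
+ * only the first pointer of each extent is considered:
+ */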
+static ssize_t bch_compression_stats(struct cache_set *c, char *buf)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
+ nr_compressed_extents = 0,
+ compressed_sectors_compressed = 0,
+ compressed_sectors_uncompressed = 0;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k)
+ if (k.k->type == BCH_EXTENT) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ if (crc_compression_type(crc) == BCH_COMPRESSION_NONE) {
+ nr_uncompressed_extents++;
+ uncompressed_sectors += e.k->size;
+ } else {
+ nr_compressed_extents++;
+ compressed_sectors_compressed +=
+ crc_compressed_size(e.k, crc);
+ compressed_sectors_uncompressed +=
+ crc_uncompressed_size(e.k, crc);
+ }
+
+ /* only looking at the first ptr */
+ break;
+ }
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return snprintf(buf, PAGE_SIZE,
+ "uncompressed data:\n"
+ " nr extents: %llu\n"
+ " size (bytes): %llu\n"
+ "compressed data:\n"
+ " nr extents: %llu\n"
+ " compressed size (bytes): %llu\n"
+ " uncompressed size (bytes): %llu\n",
+ nr_uncompressed_extents,
+ uncompressed_sectors << 9,
+ nr_compressed_extents,
+ compressed_sectors_compressed << 9,
+ compressed_sectors_uncompressed << 9);
+}
+
+SHOW(bch_cache_set)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+ sysfs_print(minor, c->minor);
+
+ sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
+ sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+ sysfs_hprint(journal_entry_size_max, c->journal.entry_size_max);
+
+ sysfs_hprint(block_size, block_bytes(c));
+ sysfs_print(block_size_bytes, block_bytes(c));
+ sysfs_hprint(btree_node_size, c->sb.btree_node_size << 9);
+ sysfs_print(btree_node_size_bytes, c->sb.btree_node_size << 9);
+
+ sysfs_hprint(btree_cache_size, bch_cache_size(c));
+ sysfs_print(cache_available_percent, bch_cache_available_percent(c));
+
+ sysfs_print(btree_gc_running, c->gc_pos.phase != GC_PHASE_DONE);
+
+#if 0
+ /* XXX: reimplement */
+ sysfs_print(btree_used_percent, bch_btree_used(c));
+ sysfs_print(btree_nodes, c->gc_stats.nodes);
+ sysfs_hprint(average_key_size, bch_average_key_size(c));
+#endif
+
+ sysfs_print(cache_read_races,
+ atomic_long_read(&c->cache_read_races));
+
+ sysfs_print(writeback_keys_done,
+ atomic_long_read(&c->writeback_keys_done));
+ sysfs_print(writeback_keys_failed,
+ atomic_long_read(&c->writeback_keys_failed));
+
+ /* See count_io_errors for why 88 */
+ sysfs_print(io_error_halflife, c->error_decay * 88);
+ sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);
+
+ sysfs_hprint(congested,
+ ((uint64_t) bch_get_congested(c)) << 9);
+ sysfs_print(congested_read_threshold_us,
+ c->congested_read_threshold_us);
+ sysfs_print(congested_write_threshold_us,
+ c->congested_write_threshold_us);
+
+ sysfs_printf(foreground_write_ratelimit_enabled, "%i",
+ c->foreground_write_ratelimit_enabled);
+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+ sysfs_pd_controller_show(foreground_write, &c->foreground_write_pd);
+
+ sysfs_print(pd_controllers_update_seconds,
+ c->pd_controllers_update_seconds);
+ sysfs_print(foreground_target_percent, c->foreground_target_percent);
+
+ sysfs_printf(tiering_enabled, "%i", c->tiering_enabled);
+ sysfs_print(tiering_percent, c->tiering_percent);
+ sysfs_pd_controller_show(tiering, &c->tiering_pd);
+
+ sysfs_printf(meta_replicas_have, "%llu",
+ CACHE_SET_META_REPLICAS_HAVE(&c->disk_sb));
+ sysfs_printf(data_replicas_have, "%llu",
+ CACHE_SET_DATA_REPLICAS_HAVE(&c->disk_sb));
+
+ /* Debugging: */
+
+ if (attr == &sysfs_journal_debug)
+ return bch_journal_print_debug(&c->journal, buf);
+
+#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
+ BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+ if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+ return -EPERM;
+
+ if (attr == &sysfs_bset_tree_stats)
+ return bch_bset_print_stats(c, buf);
+ if (attr == &sysfs_alloc_debug)
+ return show_cache_set_alloc_debug(c, buf);
+
+ sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS].b->level);
+ sysfs_print(root_usage_percent, bch_root_usage(c));
+
+ if (attr == &sysfs_compression_stats)
+ return bch_compression_stats(c, buf);
+
+ sysfs_printf(internal_uuid, "%pU", c->disk_sb.set_uuid.b);
+
+ return 0;
+}
+
+STORE(__bch_cache_set)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+ if (attr == &sysfs_unregister) {
+ bch_cache_set_unregister(c);
+ return size;
+ }
+
+ if (attr == &sysfs_stop) {
+ bch_cache_set_stop(c);
+ return size;
+ }
+
+ if (attr == &sysfs_clear_stats) {
+ atomic_long_set(&c->writeback_keys_done, 0);
+ atomic_long_set(&c->writeback_keys_failed, 0);
+ bch_cache_accounting_clear(&c->accounting);
+
+ return size;
+ }
+
+ sysfs_strtoul(congested_read_threshold_us,
+ c->congested_read_threshold_us);
+ sysfs_strtoul(congested_write_threshold_us,
+ c->congested_write_threshold_us);
+
+ if (attr == &sysfs_io_error_limit) {
+ c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
+ return size;
+ }
+
+ /* See count_io_errors() for why 88 */
+ if (attr == &sysfs_io_error_halflife) {
+ c->error_decay = strtoul_or_return(buf) / 88;
+ return size;
+ }
+
+ sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
+ sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+
+ sysfs_strtoul(foreground_write_ratelimit_enabled,
+ c->foreground_write_ratelimit_enabled);
+
+ if (attr == &sysfs_copy_gc_enabled) {
+ struct cache *ca;
+ unsigned i;
+ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
+ ?: (ssize_t) size;
+
+ for_each_cache(ca, c, i)
+ if (ca->moving_gc_read)
+ wake_up_process(ca->moving_gc_read);
+ return ret;
+ }
+
+ if (attr == &sysfs_tiering_enabled) {
+ ssize_t ret = strtoul_safe(buf, c->tiering_enabled)
+ ?: (ssize_t) size;
+
+ if (c->tiering_read)
+ wake_up_process(c->tiering_read);
+ return ret;
+ }
+
+ sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);
+
+ if (attr == &sysfs_journal_flush) {
+ bch_journal_meta_async(&c->journal, NULL);
+
+ return size;
+ }
+
+ sysfs_strtoul(pd_controllers_update_seconds,
+ c->pd_controllers_update_seconds);
+ sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
+
+ sysfs_strtoul(tiering_percent, c->tiering_percent);
+ sysfs_pd_controller_store(tiering, &c->tiering_pd);
+
+ /* Debugging: */
+
+#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
+ BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+ if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+ return -EPERM;
+
+ if (test_bit(CACHE_SET_STOPPING, &c->flags))
+ return -EINTR;
+
+ if (attr == &sysfs_blockdev_volume_create) {
+ u64 v = strtoi_h_or_return(buf);
+ int r = bch_blockdev_volume_create(c, v);
+
+ if (r)
+ return r;
+ }
+
+ if (attr == &sysfs_trigger_btree_coalesce)
+ bch_coalesce(c);
+
+ /* Debugging: */
+
+ if (attr == &sysfs_trigger_gc)
+ bch_gc(c);
+
+ if (attr == &sysfs_prune_cache) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->btree_cache_shrink.scan_objects(&c->btree_cache_shrink, &sc);
+ }
+
+ return size;
+}
+
+STORE(bch_cache_set)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+ mutex_lock(&bch_register_lock);
+ size = __bch_cache_set_store(kobj, attr, buf, size);
+ mutex_unlock(&bch_register_lock);
+
+ if (attr == &sysfs_add_device) {
+ char *path = kstrdup(buf, GFP_KERNEL);
+ int r = bch_cache_set_add_cache(c, strim(path));
+
+ kfree(path);
+ if (r)
+ return r;
+ }
+
+ return size;
+}
+
+static struct attribute *bch_cache_set_files[] = {
+ &sysfs_unregister,
+ &sysfs_stop,
+ &sysfs_journal_write_delay_ms,
+ &sysfs_journal_reclaim_delay_ms,
+ &sysfs_journal_entry_size_max,
+ &sysfs_blockdev_volume_create,
+ &sysfs_add_device,
+
+ &sysfs_block_size,
+ &sysfs_block_size_bytes,
+ &sysfs_btree_node_size,
+ &sysfs_btree_node_size_bytes,
+ &sysfs_tree_depth,
+ &sysfs_root_usage_percent,
+ &sysfs_btree_cache_size,
+ &sysfs_cache_available_percent,
+ &sysfs_compression_stats,
+
+ &sysfs_average_key_size,
+
+ &sysfs_io_error_limit,
+ &sysfs_io_error_halflife,
+ &sysfs_congested,
+ &sysfs_congested_read_threshold_us,
+ &sysfs_congested_write_threshold_us,
+ &sysfs_clear_stats,
+
+ &sysfs_meta_replicas_have,
+ &sysfs_data_replicas_have,
+
+ &sysfs_foreground_target_percent,
+ &sysfs_tiering_percent,
+
+ &sysfs_journal_flush,
+ NULL
+};
+KTYPE(bch_cache_set);
+
+/* internal dir - just a wrapper */
+
+SHOW(bch_cache_set_internal)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ return bch_cache_set_show(&c->kobj, attr, buf);
+}
+
+STORE(bch_cache_set_internal)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ return bch_cache_set_store(&c->kobj, attr, buf, size);
+}
+
+static void bch_cache_set_internal_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_cache_set_internal_files[] = {
+ &sysfs_journal_debug,
+
+ &sysfs_alloc_debug,
+
+ &sysfs_btree_gc_running,
+
+ &sysfs_btree_nodes,
+ &sysfs_btree_used_percent,
+
+ &sysfs_bset_tree_stats,
+ &sysfs_cache_read_races,
+ &sysfs_writeback_keys_done,
+ &sysfs_writeback_keys_failed,
+
+ &sysfs_trigger_btree_coalesce,
+ &sysfs_trigger_gc,
+ &sysfs_prune_cache,
+ &sysfs_foreground_write_ratelimit_enabled,
+ &sysfs_copy_gc_enabled,
+ &sysfs_tiering_enabled,
+ sysfs_pd_controller_files(tiering),
+ sysfs_pd_controller_files(foreground_write),
+ &sysfs_internal_uuid,
+
+#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
+ BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+ NULL
+};
+KTYPE(bch_cache_set_internal);
+
+/* options */
+
+SHOW(bch_cache_set_opts_dir)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, opts_dir);
+
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ if (attr == &sysfs_opt_##_name) \
+ return _choices == bch_bool_opt || _choices == bch_uint_opt\
+ ? snprintf(buf, PAGE_SIZE, "%i\n", c->opts._name)\
+ : bch_snprint_string_list(buf, PAGE_SIZE, \
+ _choices, c->opts._name);\
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+ return 0;
+}
+
+STORE(bch_cache_set_opts_dir)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, opts_dir);
+
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ if (attr == &sysfs_opt_##_name) { \
+ ssize_t v = (_choices == bch_bool_opt || \
+ _choices == bch_uint_opt) \
+ ? strtoul_restrict_or_return(buf, _min, _max - 1)\
+ : bch_read_string_list(buf, _choices); \
+ \
+ if (v < 0) \
+ return v; \
+ \
+ c->opts._name = v; \
+ \
+ if (_sb_opt##_BITS && v != _sb_opt(&c->disk_sb)) { \
+ SET_##_sb_opt(&c->disk_sb, v); \
+ bcache_write_super(c); \
+ } \
+ \
+ return size; \
+ }
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+ return size;
+}
+
+static void bch_cache_set_opts_dir_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_cache_set_opts_dir_files[] = {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ &sysfs_opt_##_name,
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+ NULL
+};
+KTYPE(bch_cache_set_opts_dir);
+
+/* time stats */
+
+SHOW(bch_cache_set_time_stats)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, time_stats);
+
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ sysfs_print_time_stats(&c->name##_time, name, \
+ frequency_units, duration_units);
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+
+ return 0;
+}
+
+STORE(bch_cache_set_time_stats)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, time_stats);
+
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ sysfs_clear_time_stats(&c->name##_time, name);
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+
+ return size;
+}
+
+static void bch_cache_set_time_stats_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_cache_set_time_stats_files[] = {
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ sysfs_time_stats_attribute_list(name, frequency_units, duration_units)
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+
+ NULL
+};
+KTYPE(bch_cache_set_time_stats);
+
+typedef unsigned (bucket_map_fn)(struct cache *, struct bucket *, void *);
+
+static unsigned bucket_priority_fn(struct cache *ca, struct bucket *g,
+ void *private)
+{
+ int rw = (private ? 1 : 0);
+
+ return ca->set->prio_clock[rw].hand - g->prio[rw];
+}
+
+static unsigned bucket_sectors_used_fn(struct cache *ca, struct bucket *g,
+ void *private)
+{
+ return bucket_sectors_used(g);
+}
+
+static unsigned bucket_oldest_gen_fn(struct cache *ca, struct bucket *g,
+ void *private)
+{
+ return bucket_gc_gen(ca, g);
+}
+
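+/*
+ * Map fn() over each bucket, sort the results in descending order, drop the
+ * trailing zeroes and print 31 evenly spaced quantiles of what remains:
+ */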
+static ssize_t show_quantiles(struct cache *ca, char *buf,
+ bucket_map_fn *fn, void *private)
+{
+ int cmp(const void *l, const void *r)
+ { return *((unsigned *) r) - *((unsigned *) l); }
+
+ size_t n = ca->mi.nbuckets, i;
+ /* Compute 31 quantiles */
+ unsigned q[31], *p;
+ ssize_t ret = 0;
+
+ p = vzalloc(ca->mi.nbuckets * sizeof(unsigned));
+ if (!p)
+ return -ENOMEM;
+
+ for (i = ca->mi.first_bucket; i < n; i++)
+ p[i] = fn(ca, &ca->buckets[i], private);
+
+ sort(p, n, sizeof(unsigned), cmp, NULL);
+
+ while (n &&
+ !p[n - 1])
+ --n;
+
+ for (i = 0; i < ARRAY_SIZE(q); i++)
+ q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)];
+
+ vfree(p);
+
+ for (i = 0; i < ARRAY_SIZE(q); i++)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "%u ", q[i]);
+ buf[ret - 1] = '\n';
+
+	return ret;
+}
+
+static ssize_t show_reserve_stats(struct cache *ca, char *buf)
+{
+ enum alloc_reserve i;
+ ssize_t ret;
+
+ spin_lock(&ca->freelist_lock);
+
+ ret = scnprintf(buf, PAGE_SIZE,
+ "free_inc:\t%zu\t%zu\n",
+ fifo_used(&ca->free_inc),
+ ca->free_inc.size);
+
+ for (i = 0; i < RESERVE_NR; i++)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "free[%u]:\t%zu\t%zu\n", i,
+ fifo_used(&ca->free[i]),
+ ca->free[i].size);
+
+ spin_unlock(&ca->freelist_lock);
+
+ return ret;
+}
+
+static ssize_t show_cache_alloc_debug(struct cache *ca, char *buf)
+{
+ struct cache_set *c = ca->set;
+ struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
+
+ return scnprintf(buf, PAGE_SIZE,
+ "free_inc: %zu/%zu\n"
+ "free[RESERVE_PRIO]: %zu/%zu\n"
+ "free[RESERVE_BTREE]: %zu/%zu\n"
+ "free[RESERVE_MOVINGGC]: %zu/%zu\n"
+ "free[RESERVE_NONE]: %zu/%zu\n"
+ "alloc: %llu/%llu\n"
+ "meta: %llu/%llu\n"
+ "dirty: %llu/%llu\n"
+ "available: %llu/%llu\n"
+ "freelist_wait: %s\n"
+ "open buckets: %u/%u (reserved %u)\n"
+ "open_buckets_wait: %s\n",
+ fifo_used(&ca->free_inc), ca->free_inc.size,
+ fifo_used(&ca->free[RESERVE_PRIO]), ca->free[RESERVE_PRIO].size,
+ fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
+ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
+ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
+ stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket,
+ stats.buckets_meta, ca->mi.nbuckets - ca->mi.first_bucket,
+ stats.buckets_dirty, ca->mi.nbuckets - ca->mi.first_bucket,
+ __buckets_available_cache(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket,
+ c->freelist_wait.list.first ? "waiting" : "empty",
+ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
+ c->open_buckets_wait.list.first ? "waiting" : "empty");
+}
+
+static u64 sectors_written(struct cache *ca)
+{
+ u64 ret = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ ret += *per_cpu_ptr(ca->sectors_written, cpu);
+
+ return ret;
+}
+
+SHOW(bch_cache)
+{
+ struct cache *ca = container_of(kobj, struct cache, kobj);
+ struct cache_set *c = ca->set;
+ struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
+
+ sysfs_printf(uuid, "%pU\n", ca->disk_sb.sb->disk_uuid.b);
+
+ sysfs_hprint(bucket_size, bucket_bytes(ca));
+ sysfs_print(bucket_size_bytes, bucket_bytes(ca));
+ sysfs_hprint(block_size, block_bytes(c));
+ sysfs_print(block_size_bytes, block_bytes(c));
+ sysfs_print(first_bucket, ca->mi.first_bucket);
+ sysfs_print(nbuckets, ca->mi.nbuckets);
+ sysfs_print(discard, ca->mi.discard);
+ sysfs_hprint(written, sectors_written(ca) << 9);
+ sysfs_hprint(btree_written,
+ atomic64_read(&ca->btree_sectors_written) << 9);
+ sysfs_hprint(metadata_written,
+ (atomic64_read(&ca->meta_sectors_written) +
+ atomic64_read(&ca->btree_sectors_written)) << 9);
+
+ sysfs_print(io_errors,
+ atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
+
+ sysfs_hprint(dirty_data, stats.sectors_dirty << 9);
+ sysfs_print(dirty_bytes, stats.sectors_dirty << 9);
+ sysfs_print(dirty_buckets, stats.buckets_dirty);
+ sysfs_hprint(cached_data, stats.sectors_cached << 9);
+ sysfs_print(cached_bytes, stats.sectors_cached << 9);
+ sysfs_print(cached_buckets, stats.buckets_cached);
+ sysfs_print(meta_buckets, stats.buckets_meta);
+ sysfs_print(alloc_buckets, stats.buckets_alloc);
+ sysfs_print(available_buckets, buckets_available_cache(ca));
+ sysfs_print(free_buckets, buckets_free_cache(ca));
+ sysfs_print(has_data, ca->mi.has_data);
+ sysfs_print(has_metadata, ca->mi.has_metadata);
+
+ sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
+
+ if (attr == &sysfs_cache_replacement_policy)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ cache_replacement_policies,
+ ca->mi.replacement);
+
+ sysfs_print(tier, ca->mi.tier);
+
+ if (attr == &sysfs_state_rw)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ bch_cache_state,
+ ca->mi.state);
+
+ if (attr == &sysfs_read_priority_stats)
+ return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0);
+ if (attr == &sysfs_write_priority_stats)
+ return show_quantiles(ca, buf, bucket_priority_fn, (void *) 1);
+ if (attr == &sysfs_fragmentation_stats)
+ return show_quantiles(ca, buf, bucket_sectors_used_fn, NULL);
+ if (attr == &sysfs_oldest_gen_stats)
+ return show_quantiles(ca, buf, bucket_oldest_gen_fn, NULL);
+ if (attr == &sysfs_reserve_stats)
+ return show_reserve_stats(ca, buf);
+ if (attr == &sysfs_alloc_debug)
+ return show_cache_alloc_debug(ca, buf);
+
+ return 0;
+}
+
+STORE(__bch_cache)
+{
+ struct cache *ca = container_of(kobj, struct cache, kobj);
+ struct cache_set *c = ca->set;
+ struct cache_member *mi = &c->disk_mi[ca->sb.nr_this_dev];
+
+ sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd);
+
+ if (attr == &sysfs_discard) {
+ bool v = strtoul_or_return(buf);
+
+ if (v != CACHE_DISCARD(mi)) {
+ SET_CACHE_DISCARD(mi, v);
+ bcache_write_super(c);
+ }
+ }
+
+ if (attr == &sysfs_cache_replacement_policy) {
+ ssize_t v = bch_read_string_list(buf, cache_replacement_policies);
+
+ if (v < 0)
+ return v;
+
+ if ((unsigned) v != CACHE_REPLACEMENT(mi)) {
+ SET_CACHE_REPLACEMENT(mi, v);
+ bcache_write_super(c);
+ }
+ }
+
+ if (attr == &sysfs_state_rw) {
+ char name[BDEVNAME_SIZE];
+ const char *err = NULL;
+ ssize_t v = bch_read_string_list(buf, bch_cache_state);
+
+ if (v < 0)
+ return v;
+
+ if (v == ca->mi.state)
+ return size;
+
+ switch (v) {
+ case CACHE_ACTIVE:
+ err = bch_cache_read_write(ca);
+ break;
+ case CACHE_RO:
+ bch_cache_read_only(ca);
+ break;
+ case CACHE_FAILED:
+ case CACHE_SPARE:
+ /*
+ * XXX: need to migrate data off and set correct state
+ */
+ pr_err("can't set %s %s: not supported",
+ bdevname(ca->disk_sb.bdev, name),
+ bch_cache_state[v]);
+ return -EINVAL;
+ }
+
+ if (err) {
+ pr_err("can't set %s %s: %s",
+ bdevname(ca->disk_sb.bdev, name),
+ bch_cache_state[v], err);
+ return -EINVAL;
+ }
+ }
+
+ if (attr == &sysfs_unregister) {
+ bool force = false;
+
+ if (!strncmp(buf, "force", 5) &&
+ (buf[5] == '\0' || buf[5] == '\n'))
+ force = true;
+ bch_cache_remove(ca, force);
+ }
+
+ if (attr == &sysfs_clear_stats) {
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(ca->sectors_written, cpu) = 0;
+
+ atomic64_set(&ca->btree_sectors_written, 0);
+ atomic64_set(&ca->meta_sectors_written, 0);
+ atomic_set(&ca->io_count, 0);
+ atomic_set(&ca->io_errors, 0);
+ }
+
+ return size;
+}
+STORE_LOCKED(bch_cache)
+
+static struct attribute *bch_cache_files[] = {
+ &sysfs_uuid,
+ &sysfs_unregister,
+ &sysfs_bucket_size,
+ &sysfs_bucket_size_bytes,
+ &sysfs_block_size,
+ &sysfs_block_size_bytes,
+ &sysfs_first_bucket,
+ &sysfs_nbuckets,
+ &sysfs_read_priority_stats,
+ &sysfs_write_priority_stats,
+ &sysfs_fragmentation_stats,
+ &sysfs_oldest_gen_stats,
+ &sysfs_reserve_stats,
+ &sysfs_available_buckets,
+ &sysfs_free_buckets,
+ &sysfs_dirty_data,
+ &sysfs_dirty_bytes,
+ &sysfs_dirty_buckets,
+ &sysfs_cached_data,
+ &sysfs_cached_bytes,
+ &sysfs_cached_buckets,
+ &sysfs_meta_buckets,
+ &sysfs_alloc_buckets,
+ &sysfs_has_data,
+ &sysfs_has_metadata,
+ &sysfs_discard,
+ &sysfs_written,
+ &sysfs_btree_written,
+ &sysfs_metadata_written,
+ &sysfs_io_errors,
+ &sysfs_clear_stats,
+ &sysfs_cache_replacement_policy,
+ &sysfs_tier,
+ &sysfs_state_rw,
+ &sysfs_alloc_debug,
+
+ sysfs_pd_controller_files(copy_gc),
+ NULL
+};
+KTYPE(bch_cache);
diff --git a/libbcache/sysfs.h b/libbcache/sysfs.h
new file mode 100644
index 0000000..9d58458
--- /dev/null
+++ b/libbcache/sysfs.h
@@ -0,0 +1,113 @@
+#ifndef _BCACHE_SYSFS_H_
+#define _BCACHE_SYSFS_H_
+
+#include "util.h"
+
+#define KTYPE(type) \
+struct kobj_type type ## _ktype = { \
+ .release = type ## _release, \
+ .sysfs_ops = &((const struct sysfs_ops) { \
+ .show = type ## _show, \
+ .store = type ## _store \
+ }), \
+ .default_attrs = type ## _files \
+}
+
+#define SHOW(fn) \
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
+ char *buf) \
+
+#define STORE(fn) \
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
+ const char *buf, size_t size) \
+
+#define STORE_LOCKED(fn) \
+STORE(fn) \
+{ \
+ ssize_t ret; \
+ mutex_lock(&bch_register_lock); \
+ ret = __ ## fn ## _store(kobj, attr, buf, size); \
+ mutex_unlock(&bch_register_lock); \
+ return ret; \
+}
+
+#define __sysfs_attribute(_name, _mode) \
+ static struct attribute sysfs_##_name = \
+ { .name = #_name, .mode = _mode }
+
+#define write_attribute(n) __sysfs_attribute(n, S_IWUSR)
+#define read_attribute(n) __sysfs_attribute(n, S_IRUGO)
+#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR)
+
+#define sysfs_printf(file, fmt, ...) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \
+} while (0)
+
+#define sysfs_print(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return snprint(buf, PAGE_SIZE, var); \
+} while (0)
+
+#define sysfs_hprint(file, val) \
+do { \
+ if (attr == &sysfs_ ## file) { \
+ ssize_t ret = bch_hprint(buf, val); \
+ strcat(buf, "\n"); \
+ return ret + 1; \
+ } \
+} while (0)
+
+#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
+#define var_print(_var) sysfs_print(_var, var(_var))
+#define var_hprint(_var) sysfs_hprint(_var, var(_var))
+
+#define sysfs_strtoul(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe(buf, var) ?: (ssize_t) size; \
+} while (0)
+
+#define sysfs_strtoul_clamp(file, var, min, max) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe_clamp(buf, var, min, max) \
+ ?: (ssize_t) size; \
+} while (0)
+
+#define strtoul_or_return(cp) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (_r) \
+ return _r; \
+ _v; \
+})
+
+#define strtoul_restrict_or_return(cp, min, max) \
+({ \
+ unsigned long __v = 0; \
+ int _r = strtoul_safe_restrict(cp, __v, min, max); \
+ if (_r) \
+ return _r; \
+ __v; \
+})
+
+#define strtoi_h_or_return(cp) \
+({ \
+ u64 _v; \
+ int _r = strtoi_h(cp, &_v); \
+ if (_r) \
+ return _r; \
+ _v; \
+})
+
+#define sysfs_hatoi(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoi_h(buf, &var) ?: (ssize_t) size; \
+} while (0)
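+
+/*
+ * Illustrative sketch (not part of this interface): a minimal user of the
+ * macros above.  The "foo" type, its fields and the attribute name are
+ * hypothetical.
+ *
+ *	rw_attribute(foo_count);
+ *
+ *	SHOW(foo)
+ *	{
+ *		struct foo *f = container_of(kobj, struct foo, kobj);
+ *
+ *		sysfs_print(foo_count, f->count);
+ *		return 0;
+ *	}
+ *
+ *	STORE(foo)
+ *	{
+ *		struct foo *f = container_of(kobj, struct foo, kobj);
+ *
+ *		sysfs_strtoul(foo_count, f->count);
+ *		return size;
+ *	}
+ *
+ *	static void foo_release(struct kobject *k) {}
+ *
+ *	static struct attribute *foo_files[] = {
+ *		&sysfs_foo_count,
+ *		NULL
+ *	};
+ *	KTYPE(foo);
+ */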
+
+#endif /* _BCACHE_SYSFS_H_ */
diff --git a/libbcache/tier.c b/libbcache/tier.c
new file mode 100644
index 0000000..2b568e1
--- /dev/null
+++ b/libbcache/tier.c
@@ -0,0 +1,243 @@
+
+#include "bcache.h"
+#include "alloc.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "tier.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <trace/events/bcache.h>
+
+struct tiering_state {
+ struct cache_group *tier;
+ unsigned tier_idx;
+ unsigned sectors;
+ unsigned stripe_size;
+ unsigned dev_idx;
+ struct cache *ca;
+};
+
+static bool tiering_pred(struct cache_set *c,
+ struct tiering_state *s,
+ struct bkey_s_c k)
+{
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct cache_member_rcu *mi;
+ unsigned replicas = 0;
+
+ /* Make sure we have room to add a new pointer: */
+ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+ BKEY_EXTENT_VAL_U64s_MAX)
+ return false;
+
+ mi = cache_member_info_get(c);
+ extent_for_each_ptr(e, ptr)
+ if (ptr->dev < mi->nr_in_set &&
+ mi->m[ptr->dev].tier >= s->tier_idx)
+ replicas++;
+ cache_member_info_put();
+
+ return replicas < c->opts.data_replicas;
+ }
+
+ return false;
+}
+
+static void tier_put_device(struct tiering_state *s)
+{
+ if (s->ca)
+ percpu_ref_put(&s->ca->ref);
+ s->ca = NULL;
+}
+
+/**
+ * tier_next_device - once the current device's write stripe is full, advance
+ * to the next cache device in the tier, in round robin order
+ */
+static void tier_next_device(struct cache_set *c, struct tiering_state *s)
+{
+ if (!s->ca || s->sectors > s->stripe_size) {
+ tier_put_device(s);
+ s->sectors = 0;
+ s->dev_idx++;
+
+ spin_lock(&s->tier->lock);
+ if (s->dev_idx >= s->tier->nr_devices)
+ s->dev_idx = 0;
+
+ if (s->tier->nr_devices) {
+ s->ca = s->tier->d[s->dev_idx].dev;
+ percpu_ref_get(&s->ca->ref);
+ }
+ spin_unlock(&s->tier->lock);
+ }
+}
+
+static int issue_tiering_move(struct cache_set *c,
+ struct tiering_state *s,
+ struct moving_context *ctxt,
+ struct bkey_s_c k)
+{
+ int ret;
+
+ ret = bch_data_move(c, ctxt, &s->ca->tiering_write_point, k, NULL);
+ if (!ret) {
+ trace_bcache_tiering_copy(k.k);
+ s->sectors += k.k->size;
+ } else {
+ trace_bcache_tiering_alloc_fail(c, k.k->size);
+ }
+
+ return ret;
+}
+
+/**
+ * read_tiering - walk the extent btree and issue moves for extents that still
+ * need a copy in this tier, spreading the writes across the tier's devices in
+ * round robin order
+ */
+static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
+{
+ struct moving_context ctxt;
+ struct tiering_state s;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned nr_devices = READ_ONCE(tier->nr_devices);
+ int ret;
+
+ if (!nr_devices)
+ return 0;
+
+ trace_bcache_tiering_start(c);
+
+ memset(&s, 0, sizeof(s));
+ s.tier = tier;
+ s.tier_idx = tier - c->cache_tiers;
+ s.stripe_size = 2048; /* 1 mb for now */
+
+ bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate,
+ nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
+ bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+ while (!kthread_should_stop() &&
+ !bch_move_ctxt_wait(&ctxt) &&
+ (k = bch_btree_iter_peek(&iter)).k &&
+ !btree_iter_err(k)) {
+ if (!tiering_pred(c, &s, k))
+ goto next;
+
+ tier_next_device(c, &s);
+ if (!s.ca)
+ break;
+
+ ret = issue_tiering_move(c, &s, &ctxt, k);
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+
+ /* memory allocation failure, wait for some IO to finish */
+ bch_move_ctxt_wait_for_io(&ctxt);
+ continue;
+ }
+next:
+ bch_btree_iter_advance_pos(&iter);
+ //bch_btree_iter_cond_resched(&iter);
+
+ /* unlock before calling moving_context_wait() */
+ bch_btree_iter_unlock(&iter);
+ cond_resched();
+ }
+
+ bch_btree_iter_unlock(&iter);
+ tier_put_device(&s);
+ bch_move_ctxt_exit(&ctxt);
+ trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
+
+ return ctxt.sectors_moved;
+}
+
+static int bch_tiering_thread(void *arg)
+{
+ struct cache_set *c = arg;
+ struct cache_group *tier = &c->cache_tiers[1];
+ struct io_clock *clock = &c->io_clock[WRITE];
+ struct cache *ca;
+ u64 tier_capacity, available_sectors;
+ unsigned long last;
+ unsigned i;
+
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ if (kthread_wait_freezable(c->tiering_enabled &&
+ tier->nr_devices))
+ break;
+
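+		/*
+		 * Block on the write IO clock until the faster tier(s) are
+		 * more than half full, then copy data into this tier:
+		 */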
+ while (1) {
+ struct cache_group *faster_tier;
+
+ last = atomic_long_read(&clock->now);
+
+ tier_capacity = available_sectors = 0;
+ rcu_read_lock();
+ for (faster_tier = c->cache_tiers;
+ faster_tier != tier;
+ faster_tier++) {
+ group_for_each_cache_rcu(ca, faster_tier, i) {
+ tier_capacity +=
+ (ca->mi.nbuckets -
+ ca->mi.first_bucket) << ca->bucket_bits;
+ available_sectors +=
+ buckets_available_cache(ca) << ca->bucket_bits;
+ }
+ }
+ rcu_read_unlock();
+
+ if (available_sectors < (tier_capacity >> 1))
+ break;
+
+ bch_kthread_io_clock_wait(clock,
+ last +
+ available_sectors -
+ (tier_capacity >> 1));
+ if (kthread_should_stop())
+ return 0;
+ }
+
+ read_tiering(c, tier);
+ }
+
+ return 0;
+}
+
+void bch_tiering_init_cache_set(struct cache_set *c)
+{
+ bch_pd_controller_init(&c->tiering_pd);
+}
+
+int bch_tiering_read_start(struct cache_set *c)
+{
+ struct task_struct *t;
+
+ t = kthread_create(bch_tiering_thread, c, "bch_tier_read");
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+
+ c->tiering_read = t;
+ wake_up_process(c->tiering_read);
+
+ return 0;
+}
+
+void bch_tiering_read_stop(struct cache_set *c)
+{
+ if (!IS_ERR_OR_NULL(c->tiering_read)) {
+ kthread_stop(c->tiering_read);
+ c->tiering_read = NULL;
+ }
+}
diff --git a/libbcache/tier.h b/libbcache/tier.h
new file mode 100644
index 0000000..89c2bff
--- /dev/null
+++ b/libbcache/tier.h
@@ -0,0 +1,8 @@
+#ifndef _BCACHE_TIER_H
+#define _BCACHE_TIER_H
+
+void bch_tiering_init_cache_set(struct cache_set *);
+int bch_tiering_read_start(struct cache_set *);
+void bch_tiering_read_stop(struct cache_set *);
+
+#endif
diff --git a/libbcache/trace.c b/libbcache/trace.c
new file mode 100644
index 0000000..def525d
--- /dev/null
+++ b/libbcache/trace.c
@@ -0,0 +1,11 @@
+#include "bcache.h"
+#include "alloc_types.h"
+#include "blockdev_types.h"
+#include "buckets.h"
+#include "btree_types.h"
+#include "keylist.h"
+
+#include <linux/blktrace_api.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/bcache.h>
diff --git a/libbcache/util.c b/libbcache/util.c
new file mode 100644
index 0000000..5f81659
--- /dev/null
+++ b/libbcache/util.c
@@ -0,0 +1,418 @@
+/*
+ * random utility code, for bcache but in theory not specific to bcache
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/types.h>
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+
+#include "util.h"
+
+#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
+#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
+
+#define STRTO_H(name, type) \
+int bch_ ## name ## _h(const char *cp, type *res) \
+{ \
+ int u = 0; \
+ char *e; \
+ type i = simple_ ## name(cp, &e, 10); \
+ \
+ switch (tolower(*e)) { \
+ default: \
+ return -EINVAL; \
+ case 'y': \
+ case 'z': \
+ u++; \
+ case 'e': \
+ u++; \
+ case 'p': \
+ u++; \
+ case 't': \
+ u++; \
+ case 'g': \
+ u++; \
+ case 'm': \
+ u++; \
+ case 'k': \
+ u++; \
+ if (e++ == cp) \
+ return -EINVAL; \
+ case '\n': \
+ case '\0': \
+ if (*e == '\n') \
+ e++; \
+ } \
+ \
+ if (*e) \
+ return -EINVAL; \
+ \
+ while (u--) { \
+ if ((type) ~0 > 0 && \
+ (type) ~0 / 1024 <= i) \
+ return -EINVAL; \
+ if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \
+ (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \
+ return -EINVAL; \
+ i *= 1024; \
+ } \
+ \
+ *res = i; \
+ return 0; \
+} \
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
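+
+/*
+ * Illustrative examples: these parse human-readable integers, where each
+ * suffix multiplies the value by a further factor of 1024 and a trailing
+ * newline is accepted:
+ *
+ *	unsigned long long v;
+ *
+ *	bch_strtoull_h("4k\n", &v)	returns 0, v == 4096
+ *	bch_strtoull_h("1m", &v)	returns 0, v == 1048576
+ *	bch_strtoull_h("4x", &v)	returns -EINVAL
+ */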
+
+ssize_t bch_hprint(char *buf, s64 v)
+{
+ static const char units[] = "?kMGTPEZY";
+ char dec[4] = "";
+ int u, t = 0;
+
+ for (u = 0; v >= 1024 || v <= -1024; u++) {
+ t = v & ~(~0 << 10);
+ v >>= 10;
+ }
+
+ if (!u)
+ return sprintf(buf, "%lli", v);
+
+ /*
+ * 103 is magic: t is in the range [-1023, 1023] and we want
+ * to turn it into [-9, 9]
+ */
+ if (v < 100 && v > -100)
+ snprintf(dec, sizeof(dec), ".%i", t / 103);
+
+ return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+}
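+
+/*
+ * For example: bch_hprint(buf, 100) writes "100", bch_hprint(buf, 2048)
+ * writes "2.0k" and bch_hprint(buf, 3ULL << 30) writes "3.0G"; the decimal
+ * digit is truncated rather than rounded.
+ */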
+
+ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+ size_t selected)
+{
+ char *out = buf;
+ size_t i;
+
+ for (i = 0; list[i]; i++)
+ out += snprintf(out, buf + size - out,
+ i == selected ? "[%s] " : "%s ", list[i]);
+
+ out[-1] = '\n';
+ return out - buf;
+}
+
+ssize_t bch_read_string_list(const char *buf, const char * const list[])
+{
+ size_t i;
+ char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ s = strim(d);
+
+ for (i = 0; list[i]; i++)
+ if (!strcmp(list[i], s))
+ break;
+
+ kfree(d);
+
+ if (!list[i])
+ return -EINVAL;
+
+ return i;
+}
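+
+/*
+ * Illustrative example, with a hypothetical
+ *
+ *	static const char * const modes[] = { "lru", "fifo", "random", NULL };
+ *
+ * bch_snprint_string_list(buf, PAGE_SIZE, modes, 1) writes
+ * "lru [fifo] random\n", bch_read_string_list("fifo\n", modes) returns 1
+ * (leading/trailing whitespace is trimmed), and an unrecognized string
+ * returns -EINVAL.
+ */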
+
+bool bch_is_zero(const void *_p, size_t n)
+{
+ const char *p = _p;
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ if (p[i])
+ return false;
+ return true;
+}
+
+void bch_time_stats_clear(struct time_stats *stats)
+{
+ spin_lock(&stats->lock);
+
+ stats->count = 0;
+ stats->last_duration = 0;
+ stats->max_duration = 0;
+ stats->average_duration = 0;
+ stats->average_frequency = 0;
+ stats->last = 0;
+
+ spin_unlock(&stats->lock);
+}
+
+void __bch_time_stats_update(struct time_stats *stats, u64 start_time)
+{
+ u64 now, duration, last;
+
+ stats->count++;
+
+ now = local_clock();
+ duration = time_after64(now, start_time)
+ ? now - start_time : 0;
+ last = time_after64(now, stats->last)
+ ? now - stats->last : 0;
+
+ stats->last_duration = duration;
+ stats->max_duration = max(stats->max_duration, duration);
+
+ if (stats->last) {
+ stats->average_duration = ewma_add(stats->average_duration,
+ duration << 8, 3);
+
+ if (stats->average_frequency)
+ stats->average_frequency =
+ ewma_add(stats->average_frequency,
+ last << 8, 3);
+ else
+ stats->average_frequency = last << 8;
+ } else {
+ stats->average_duration = duration << 8;
+ }
+
+ stats->last = now ?: 1;
+}
+
+void bch_time_stats_update(struct time_stats *stats, u64 start_time)
+{
+ spin_lock(&stats->lock);
+ __bch_time_stats_update(stats, start_time);
+ spin_unlock(&stats->lock);
+}
+
+/**
+ * bch_ratelimit_delay() - return how long to delay until the next time to do
+ * some work
+ *
+ * @d - the struct bch_ratelimit to query
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+u64 bch_ratelimit_delay(struct bch_ratelimit *d)
+{
+ u64 now = local_clock();
+
+ return time_after64(d->next, now)
+ ? nsecs_to_jiffies(d->next - now)
+ : 0;
+}
+
+/**
+ * bch_ratelimit_increment() - increment @d by the amount of work done
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ */
+void bch_ratelimit_increment(struct bch_ratelimit *d, u64 done)
+{
+ u64 now = local_clock();
+
+ d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+ if (time_before64(now + NSEC_PER_SEC, d->next))
+ d->next = now + NSEC_PER_SEC;
+
+ if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+ d->next = now - NSEC_PER_SEC * 2;
+}
+
+int bch_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
+{
+ while (1) {
+ u64 delay = bch_ratelimit_delay(d);
+
+ if (delay)
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop())
+ return 1;
+
+ if (!delay)
+ return 0;
+
+ schedule_timeout(delay);
+ try_to_freeze();
+ }
+}
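+
+/*
+ * Illustrative use of the ratelimit interface (the writeback code later in
+ * this patch follows this pattern): account completed work with
+ * bch_ratelimit_increment() and let bch_ratelimit_wait_freezable_stoppable()
+ * (or bch_ratelimit_delay()) pace the loop.  do_some_work() and n are
+ * hypothetical:
+ *
+ *	while (!bch_ratelimit_wait_freezable_stoppable(&d)) {
+ *		if (!do_some_work(n))
+ *			break;
+ *		bch_ratelimit_increment(&d, n);
+ *	}
+ */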
+
+/*
+ * Updates the pd_controller. Attempts to scale the input values to units per
+ * second.
+ * @target: desired value
+ * @actual: current value
+ *
+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
+ * it makes actual go down.
+ */
+void bch_pd_controller_update(struct bch_pd_controller *pd,
+ s64 target, s64 actual, int sign)
+{
+ s64 proportional, derivative, change;
+
+ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;
+
+ if (seconds_since_update == 0)
+ return;
+
+ pd->last_update = jiffies;
+
+ proportional = actual - target;
+ proportional *= seconds_since_update;
+ proportional = div_s64(proportional, pd->p_term_inverse);
+
+ derivative = actual - pd->last_actual;
+ derivative = div_s64(derivative, seconds_since_update);
+ derivative = ewma_add(pd->smoothed_derivative, derivative,
+ (pd->d_term / seconds_since_update) ?: 1);
+ derivative = derivative * pd->d_term;
+ derivative = div_s64(derivative, pd->p_term_inverse);
+
+ change = proportional + derivative;
+
+ /* Don't increase rate if not keeping up */
+ if (change > 0 &&
+ pd->backpressure &&
+ time_after64(local_clock(),
+ pd->rate.next + NSEC_PER_MSEC))
+ change = 0;
+
+ change *= (sign * -1);
+
+ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
+ 1, UINT_MAX);
+
+ pd->last_actual = actual;
+ pd->last_derivative = derivative;
+ pd->last_proportional = proportional;
+ pd->last_change = change;
+ pd->last_target = target;
+}
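+
+/*
+ * Note (illustrative): callers feed this a target and a measured value in the
+ * same units, and the resulting rate is consumed through the bch_ratelimit
+ * interface above.  For instance, the writeback code later in this patch does
+ *
+ *	bch_pd_controller_update(&dc->writeback_pd, target << 9, dirty << 9, -1);
+ *
+ * with sign == -1 because raising the writeback rate makes the amount of
+ * dirty data go down.
+ */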
+
+void bch_pd_controller_init(struct bch_pd_controller *pd)
+{
+ pd->rate.rate = 1024;
+ pd->last_update = jiffies;
+ pd->p_term_inverse = 6000;
+ pd->d_term = 30;
+ pd->d_smooth = pd->d_term;
+ pd->backpressure = 1;
+}
+
+size_t bch_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
+{
+ /* 2^64 - 1 is 20 digits, plus null byte */
+ char rate[21];
+ char actual[21];
+ char target[21];
+ char proportional[21];
+ char derivative[21];
+ char change[21];
+ s64 next_io;
+
+ bch_hprint(rate, pd->rate.rate);
+ bch_hprint(actual, pd->last_actual);
+ bch_hprint(target, pd->last_target);
+ bch_hprint(proportional, pd->last_proportional);
+ bch_hprint(derivative, pd->last_derivative);
+ bch_hprint(change, pd->last_change);
+
+ next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC);
+
+ return sprintf(buf,
+ "rate:\t\t%s/sec\n"
+ "target:\t\t%s\n"
+ "actual:\t\t%s\n"
+ "proportional:\t%s\n"
+ "derivative:\t%s\n"
+ "change:\t\t%s/sec\n"
+ "next io:\t%llims\n",
+ rate, target, actual, proportional,
+ derivative, change, next_io);
+}
+
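+/*
+ * bch_bio_map - populate bio->bi_io_vec for bi_size bytes starting at @base
+ *
+ * Splits the range into page sized segments (the first may be shorter if
+ * @base is not page aligned) and bumps bi_vcnt accordingly; when @base is
+ * NULL only the segment lengths and offsets are filled in and the caller is
+ * expected to supply the pages.
+ */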
+void bch_bio_map(struct bio *bio, void *base)
+{
+ size_t size = bio->bi_iter.bi_size;
+ struct bio_vec *bv = bio->bi_io_vec;
+
+ BUG_ON(!bio->bi_iter.bi_size);
+ BUG_ON(bio->bi_vcnt);
+
+ bv->bv_offset = base ? offset_in_page(base) : 0;
+ goto start;
+
+ for (; size; bio->bi_vcnt++, bv++) {
+ bv->bv_offset = 0;
+start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
+ size);
+ BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
+ if (base) {
+ bv->bv_page = is_vmalloc_addr(base)
+ ? vmalloc_to_page(base)
+ : virt_to_page(base);
+
+ base += bv->bv_len;
+ }
+
+ size -= bv->bv_len;
+ }
+}
+
+size_t bch_rand_range(size_t max)
+{
+ size_t rand;
+
+ do {
+ get_random_bytes(&rand, sizeof(rand));
+ rand &= roundup_pow_of_two(max) - 1;
+ } while (rand >= max);
+
+ return rand;
+}
+
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ __bio_for_each_segment(bv, dst, iter, dst_iter) {
+ void *dstp = kmap_atomic(bv.bv_page);
+ memcpy(dstp + bv.bv_offset, src, bv.bv_len);
+ kunmap_atomic(dstp);
+
+ src += bv.bv_len;
+ }
+}
+
+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ __bio_for_each_segment(bv, src, iter, src_iter) {
+ void *srcp = kmap_atomic(bv.bv_page);
+ memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
+ kunmap_atomic(srcp);
+
+ dst += bv.bv_len;
+ }
+}
diff --git a/libbcache/util.h b/libbcache/util.h
new file mode 100644
index 0000000..2b171a1
--- /dev/null
+++ b/libbcache/util.h
@@ -0,0 +1,725 @@
+#ifndef _BCACHE_UTIL_H
+#define _BCACHE_UTIL_H
+
+#include <linux/blkdev.h>
+#include <linux/errno.h>
+#include <linux/blkdev.h>
+#include <linux/freezer.h>
+#include <linux/kernel.h>
+#include <linux/llist.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+#include "closure.h"
+
+#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9)
+#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT)
+
+struct closure;
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+#define EBUG_ON(cond) BUG_ON(cond)
+#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
+#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
+#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0)
+#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0)
+#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0)
+#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0)
+#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0)
+#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i)
+#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
+#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
+
+#define memcpy(_dst, _src, _len) \
+do { \
+ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
+ (void *) (_dst) + (_len) <= (void *) (_src))); \
+ memcpy(_dst, _src, _len); \
+} while (0)
+
+#else /* DEBUG */
+
+#define EBUG_ON(cond)
+#define atomic_dec_bug(v) atomic_dec(v)
+#define atomic_inc_bug(v, i) atomic_inc(v)
+#define atomic_sub_bug(i, v) atomic_sub(i, v)
+#define atomic_add_bug(i, v) atomic_add(i, v)
+#define atomic_long_dec_bug(v) atomic_long_dec(v)
+#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v)
+#define atomic64_dec_bug(v) atomic64_dec(v)
+#define atomic64_inc_bug(v, i) atomic64_inc(v)
+#define atomic64_sub_bug(i, v) atomic64_sub(i, v)
+#define atomic64_add_bug(i, v) atomic64_add(i, v)
+
+#endif
+
+#ifndef __CHECKER__
+#define __flatten __attribute__((flatten))
+#else
+/* sparse doesn't know about attribute((flatten)) */
+#define __flatten
+#endif
+
+#ifdef __LITTLE_ENDIAN
+#define CPU_BIG_ENDIAN 0
+#else
+#define CPU_BIG_ENDIAN 1
+#endif
+
+/* type hackery */
+
+#define type_is_exact(_val, _type) \
+ __builtin_types_compatible_p(typeof(_val), _type)
+
+#define type_is(_val, _type) \
+ (__builtin_types_compatible_p(typeof(_val), _type) || \
+ __builtin_types_compatible_p(typeof(_val), const _type))
+
+static inline void *kvmalloc(size_t bytes, gfp_t gfp)
+{
+ if (bytes <= PAGE_SIZE ||
+ !(gfp & GFP_KERNEL))
+ return kmalloc(bytes, gfp);
+
+ return ((bytes <= KMALLOC_MAX_SIZE)
+ ? kmalloc(bytes, gfp|__GFP_NOWARN)
+ : NULL) ?:
+ vmalloc(bytes);
+}
+
+#define DECLARE_HEAP(type, name) \
+ struct { \
+ size_t size, used; \
+ type *data; \
+ } name
+
+#define init_heap(heap, _size, gfp) \
+({ \
+ size_t _bytes; \
+ (heap)->used = 0; \
+ (heap)->size = (_size); \
+ _bytes = (heap)->size * sizeof(*(heap)->data); \
+ (heap)->data = kvmalloc(_bytes, (gfp)); \
+ (heap)->data; \
+})
+
+#define free_heap(heap) \
+do { \
+ kvfree((heap)->data); \
+ (heap)->data = NULL; \
+} while (0)
+
+#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
+
+#define heap_sift(h, i, cmp) \
+do { \
+ size_t _r, _j = i; \
+ \
+ for (; _j * 2 + 1 < (h)->used; _j = _r) { \
+ _r = _j * 2 + 1; \
+ if (_r + 1 < (h)->used && \
+ cmp((h)->data[_r], (h)->data[_r + 1])) \
+ _r++; \
+ \
+ if (cmp((h)->data[_r], (h)->data[_j])) \
+ break; \
+ heap_swap(h, _r, _j); \
+ } \
+} while (0)
+
+#define heap_sift_down(h, i, cmp) \
+do { \
+ while (i) { \
+ size_t p = (i - 1) / 2; \
+ if (cmp((h)->data[i], (h)->data[p])) \
+ break; \
+ heap_swap(h, i, p); \
+ i = p; \
+ } \
+} while (0)
+
+#define heap_add(h, d, cmp) \
+({ \
+ bool _r = !heap_full(h); \
+ if (_r) { \
+ size_t _i = (h)->used++; \
+ (h)->data[_i] = d; \
+ \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_del(h, i, cmp) \
+do { \
+ size_t _i = (i); \
+ \
+ BUG_ON(_i >= (h)->used); \
+ (h)->used--; \
+ heap_swap(h, _i, (h)->used); \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+} while (0)
+
+#define heap_pop(h, d, cmp) \
+({ \
+ bool _r = (h)->used; \
+ if (_r) { \
+ (d) = (h)->data[0]; \
+ heap_del(h, 0, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_peek(h) \
+({ \
+ EBUG_ON(!(h)->used); \
+ (h)->data[0]; \
+})
+
+#define heap_full(h) ((h)->used == (h)->size)
+
+#define heap_resort(heap, cmp) \
+do { \
+ ssize_t _i; \
+ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \
+ heap_sift(heap, _i, cmp); \
+} while (0)
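+
+/*
+ * Illustrative example (error handling omitted): cmp(l, r) must return true
+ * when l is allowed to sit below r in the heap, so a comparator returning
+ * l >= r yields a min-heap:
+ *
+ *	#define cmp_min(l, r)	((l) >= (r))
+ *
+ *	DECLARE_HEAP(unsigned, h);
+ *	unsigned v;
+ *
+ *	init_heap(&h, 128, GFP_KERNEL);
+ *	heap_add(&h, 3, cmp_min);
+ *	heap_add(&h, 1, cmp_min);
+ *	heap_add(&h, 2, cmp_min);
+ *	heap_pop(&h, v, cmp_min);	v is now 1
+ *	free_heap(&h);
+ */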
+
+/*
+ * Simple array based allocator - preallocates a number of elements; you can
+ * never allocate more than that, and it does no locking.
+ *
+ * Handy because if you know you only need a fixed number of elements you don't
+ * have to worry about memory allocation failure, and sometimes a mempool isn't
+ * what you want.
+ *
+ * We treat the free elements as entries in a singly linked list, and the
+ * freelist as a stack - allocating and freeing push and pop off the freelist.
+ */
+
+#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \
+ struct { \
+ type *freelist; \
+ type data[size]; \
+ } name
+
+#define array_alloc(array) \
+({ \
+ typeof((array)->freelist) _ret = (array)->freelist; \
+ \
+ if (_ret) \
+ (array)->freelist = *((typeof((array)->freelist) *) _ret);\
+ \
+ _ret; \
+})
+
+#define array_free(array, ptr) \
+do { \
+ typeof((array)->freelist) _ptr = ptr; \
+ \
+ *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \
+ (array)->freelist = _ptr; \
+} while (0)
+
+#define array_allocator_init(array) \
+do { \
+ typeof((array)->freelist) _i; \
+ \
+ BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \
+ (array)->freelist = NULL; \
+ \
+ for (_i = (array)->data; \
+ _i < (array)->data + ARRAY_SIZE((array)->data); \
+ _i++) \
+ array_free(array, _i); \
+} while (0)
+
+#define array_freelist_empty(array) ((array)->freelist == NULL)
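+
+/*
+ * Illustrative example ("struct foo" is hypothetical; elements must be at
+ * least pointer sized):
+ *
+ *	static DECLARE_ARRAY_ALLOCATOR(struct foo, foo_pool, 16);
+ *
+ *	array_allocator_init(&foo_pool);
+ *
+ *	struct foo *f = array_alloc(&foo_pool);		NULL when exhausted
+ *	if (f)
+ *		array_free(&foo_pool, f);
+ */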
+
+#define ANYSINT_MAX(t) \
+ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+
+int bch_strtoint_h(const char *, int *);
+int bch_strtouint_h(const char *, unsigned int *);
+int bch_strtoll_h(const char *, long long *);
+int bch_strtoull_h(const char *, unsigned long long *);
+
+static inline int bch_strtol_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+ return bch_strtoint_h(cp, (int *) res);
+#else
+ return bch_strtoll_h(cp, (long long *) res);
+#endif
+}
+
+static inline int bch_strtoul_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+ return bch_strtouint_h(cp, (unsigned int *) res);
+#else
+ return bch_strtoull_h(cp, (unsigned long long *) res);
+#endif
+}
+
+#define strtoi_h(cp, res) \
+ ( type_is(*res, int) ? bch_strtoint_h(cp, (void *) res)\
+ : type_is(*res, long) ? bch_strtol_h(cp, (void *) res)\
+ : type_is(*res, long long) ? bch_strtoll_h(cp, (void *) res)\
+ : type_is(*res, unsigned) ? bch_strtouint_h(cp, (void *) res)\
+ : type_is(*res, unsigned long) ? bch_strtoul_h(cp, (void *) res)\
+ : type_is(*res, unsigned long long) ? bch_strtoull_h(cp, (void *) res)\
+ : -EINVAL)
+
+#define strtoul_safe(cp, var) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = _v; \
+ _r; \
+})
+
+#define strtoul_safe_clamp(cp, var, min, max) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = clamp_t(typeof(var), _v, min, max); \
+ _r; \
+})
+
+#define strtoul_safe_restrict(cp, var, min, max) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r && _v >= min && _v <= max) \
+ var = _v; \
+ else \
+ _r = -EINVAL; \
+ _r; \
+})
+
+#define snprint(buf, size, var) \
+ snprintf(buf, size, \
+ type_is(var, int) ? "%i\n" \
+ : type_is(var, unsigned) ? "%u\n" \
+ : type_is(var, long) ? "%li\n" \
+ : type_is(var, unsigned long) ? "%lu\n" \
+ : type_is(var, s64) ? "%lli\n" \
+ : type_is(var, u64) ? "%llu\n" \
+ : type_is(var, char *) ? "%s\n" \
+ : "%i\n", var)
+
+ssize_t bch_hprint(char *buf, s64 v);
+
+bool bch_is_zero(const void *, size_t);
+
+ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+ size_t selected);
+
+ssize_t bch_read_string_list(const char *buf, const char * const list[]);
+
+struct time_stats {
+ spinlock_t lock;
+ u64 count;
+ /*
+	 * all fields are in nanoseconds; averages are EWMAs stored left
+	 * shifted by 8
+ */
+ u64 last_duration;
+ u64 max_duration;
+ u64 average_duration;
+ u64 average_frequency;
+ u64 last;
+};
+
+void bch_time_stats_clear(struct time_stats *stats);
+void __bch_time_stats_update(struct time_stats *stats, u64 time);
+void bch_time_stats_update(struct time_stats *stats, u64 time);
+
+static inline unsigned local_clock_us(void)
+{
+ return local_clock() >> 10;
+}
+
+#define NSEC_PER_ns 1L
+#define NSEC_PER_us NSEC_PER_USEC
+#define NSEC_PER_ms NSEC_PER_MSEC
+#define NSEC_PER_sec NSEC_PER_SEC
+
+#define __print_time_stat(stats, name, stat, units) \
+ sysfs_print(name ## _ ## stat ## _ ## units, \
+ div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
+
+#define sysfs_print_time_stats(stats, name, \
+ frequency_units, \
+ duration_units) \
+do { \
+ __print_time_stat(stats, name, \
+ average_frequency, frequency_units); \
+ __print_time_stat(stats, name, \
+ average_duration, duration_units); \
+ sysfs_print(name ## _ ##count, (stats)->count); \
+ sysfs_print(name ## _ ##last_duration ## _ ## duration_units, \
+ div_u64((stats)->last_duration, \
+ NSEC_PER_ ## duration_units)); \
+ sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
+ div_u64((stats)->max_duration, \
+ NSEC_PER_ ## duration_units)); \
+ \
+ sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
+ ? div_s64(local_clock() - (stats)->last, \
+ NSEC_PER_ ## frequency_units) \
+ : -1LL); \
+} while (0)
+
+#define sysfs_clear_time_stats(stats, name) \
+do { \
+ if (attr == &sysfs_ ## name ## _clear) \
+ bch_time_stats_clear(stats); \
+} while (0)
+
+#define sysfs_time_stats_attribute(name, \
+ frequency_units, \
+ duration_units) \
+write_attribute(name ## _clear); \
+read_attribute(name ## _count); \
+read_attribute(name ## _average_frequency_ ## frequency_units); \
+read_attribute(name ## _average_duration_ ## duration_units); \
+read_attribute(name ## _last_duration_ ## duration_units); \
+read_attribute(name ## _max_duration_ ## duration_units); \
+read_attribute(name ## _last_ ## frequency_units)
+
+#define sysfs_time_stats_attribute_list(name, \
+ frequency_units, \
+ duration_units) \
+&sysfs_ ## name ## _clear, \
+&sysfs_ ## name ## _count, \
+&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
+&sysfs_ ## name ## _average_duration_ ## duration_units, \
+&sysfs_ ## name ## _last_duration_ ## duration_units, \
+&sysfs_ ## name ## _max_duration_ ## duration_units, \
+&sysfs_ ## name ## _last_ ## frequency_units,
+
+#define ewma_add(ewma, val, weight) \
+({ \
+ typeof(ewma) _ewma = (ewma); \
+ typeof(weight) _weight = (weight); \
+ \
+ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \
+})
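+
+/*
+ * i.e. new = old + (val - old) / 2^weight in integer arithmetic, e.g.
+ * ewma_add(800, 0, 3) == 700.  The time_stats averages above are kept left
+ * shifted by 8 to retain some fractional precision.
+ */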
+
+struct bch_ratelimit {
+ /* Next time we want to do some work, in nanoseconds */
+ u64 next;
+
+ /*
+	 * Rate at which we want to do work, in units per second
+ * The units here correspond to the units passed to
+ * bch_ratelimit_increment()
+ */
+ unsigned rate;
+};
+
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
+{
+ d->next = local_clock();
+}
+
+u64 bch_ratelimit_delay(struct bch_ratelimit *);
+void bch_ratelimit_increment(struct bch_ratelimit *, u64);
+int bch_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *);
+
+struct bch_pd_controller {
+ struct bch_ratelimit rate;
+ unsigned long last_update;
+
+ s64 last_actual;
+ s64 smoothed_derivative;
+
+ unsigned p_term_inverse;
+ unsigned d_smooth;
+ unsigned d_term;
+
+ /* for exporting to sysfs (no effect on behavior) */
+ s64 last_derivative;
+ s64 last_proportional;
+ s64 last_change;
+ s64 last_target;
+
+	/*
+	 * If true, the rate will not increase if bch_ratelimit_delay()
+	 * is not being called often enough.
+	 */
+ bool backpressure;
+};
+
+void bch_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
+void bch_pd_controller_init(struct bch_pd_controller *);
+size_t bch_pd_controller_print_debug(struct bch_pd_controller *, char *);
+
+#define sysfs_pd_controller_attribute(name) \
+ rw_attribute(name##_rate); \
+ rw_attribute(name##_rate_bytes); \
+ rw_attribute(name##_rate_d_term); \
+ rw_attribute(name##_rate_p_term_inverse); \
+ read_attribute(name##_rate_debug)
+
+#define sysfs_pd_controller_files(name) \
+ &sysfs_##name##_rate, \
+ &sysfs_##name##_rate_bytes, \
+ &sysfs_##name##_rate_d_term, \
+ &sysfs_##name##_rate_p_term_inverse, \
+ &sysfs_##name##_rate_debug
+
+#define sysfs_pd_controller_show(name, var) \
+do { \
+ sysfs_hprint(name##_rate, (var)->rate.rate); \
+ sysfs_print(name##_rate_bytes, (var)->rate.rate); \
+ sysfs_print(name##_rate_d_term, (var)->d_term); \
+ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
+ \
+ if (attr == &sysfs_##name##_rate_debug) \
+ return bch_pd_controller_print_debug(var, buf); \
+} while (0)
+
+#define sysfs_pd_controller_store(name, var) \
+do { \
+ sysfs_strtoul_clamp(name##_rate, \
+ (var)->rate.rate, 1, UINT_MAX); \
+ sysfs_strtoul_clamp(name##_rate_bytes, \
+ (var)->rate.rate, 1, UINT_MAX); \
+ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \
+ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \
+ (var)->p_term_inverse, 1, INT_MAX); \
+} while (0)
+
+#define __DIV_SAFE(n, d, zero) \
+({ \
+ typeof(n) _n = (n); \
+ typeof(d) _d = (d); \
+ _d ? _n / _d : zero; \
+})
+
+#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0)
+
+#define container_of_or_null(ptr, type, member) \
+({ \
+ typeof(ptr) _ptr = ptr; \
+ _ptr ? container_of(_ptr, type, member) : NULL; \
+})
+
+#define RB_INSERT(root, new, member, cmp) \
+({ \
+ __label__ dup; \
+ struct rb_node **n = &(root)->rb_node, *parent = NULL; \
+ typeof(new) this; \
+ int res, ret = -1; \
+ \
+ while (*n) { \
+ parent = *n; \
+ this = container_of(*n, typeof(*(new)), member); \
+ res = cmp(new, this); \
+ if (!res) \
+ goto dup; \
+ n = res < 0 \
+ ? &(*n)->rb_left \
+ : &(*n)->rb_right; \
+ } \
+ \
+ rb_link_node(&(new)->member, parent, n); \
+ rb_insert_color(&(new)->member, root); \
+ ret = 0; \
+dup: \
+ ret; \
+})
+
+#define RB_SEARCH(root, search, member, cmp) \
+({ \
+ struct rb_node *n = (root)->rb_node; \
+ typeof(&(search)) this, ret = NULL; \
+ int res; \
+ \
+ while (n) { \
+ this = container_of(n, typeof(search), member); \
+ res = cmp(&(search), this); \
+ if (!res) { \
+ ret = this; \
+ break; \
+ } \
+ n = res < 0 \
+ ? n->rb_left \
+ : n->rb_right; \
+ } \
+ ret; \
+})
+
+#define RB_GREATER(root, search, member, cmp) \
+({ \
+ struct rb_node *n = (root)->rb_node; \
+ typeof(&(search)) this, ret = NULL; \
+ int res; \
+ \
+ while (n) { \
+ this = container_of(n, typeof(search), member); \
+ res = cmp(&(search), this); \
+ if (res < 0) { \
+ ret = this; \
+ n = n->rb_left; \
+ } else \
+ n = n->rb_right; \
+ } \
+ ret; \
+})
+
+#define RB_FIRST(root, type, member) \
+ container_of_or_null(rb_first(root), type, member)
+
+#define RB_LAST(root, type, member) \
+ container_of_or_null(rb_last(root), type, member)
+
+#define RB_NEXT(ptr, member) \
+ container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
+
+#define RB_PREV(ptr, member) \
+ container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
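+
+/*
+ * Illustrative example (the struct, comparator and inserted node are
+ * hypothetical): the comparators take pointers and return <0, 0 or >0;
+ * RB_INSERT() returns -1 if an equal node already exists, 0 otherwise:
+ *
+ *	struct foo { struct rb_node node; u64 key; };
+ *
+ *	static int foo_cmp(struct foo *l, struct foo *r)
+ *	{
+ *		return l->key < r->key ? -1 : l->key > r->key ? 1 : 0;
+ *	}
+ *
+ *	struct rb_root root = RB_ROOT;
+ *	struct foo search = { .key = 42 }, *f;
+ *
+ *	RB_INSERT(&root, new_foo, node, foo_cmp);
+ *	f = RB_SEARCH(&root, search, node, foo_cmp);
+ */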
+
+/* Does linear interpolation between powers of two */
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+ unsigned fract = x & ~(~0 << fract_bits);
+
+ x >>= fract_bits;
+ x = 1 << x;
+ x += (x * fract) >> fract_bits;
+
+ return x;
+}
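+
+/*
+ * The low fract_bits of x are the fractional part: e.g. fract_exp_two(6, 2)
+ * treats 6 as 1.5 and returns 3, halfway between 2^1 and 2^2.
+ */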
+
+void bch_bio_map(struct bio *bio, void *base);
+
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+ return bdev->bd_inode->i_size >> 9;
+}
+
+#define closure_bio_submit(bio, cl) \
+do { \
+ closure_get(cl); \
+ generic_make_request(bio); \
+} while (0)
+
+#define closure_bio_submit_punt(bio, cl, c) \
+do { \
+ closure_get(cl); \
+ bch_generic_make_request(bio, c); \
+} while (0)
+
+#define kthread_wait_freezable(cond) \
+({ \
+ int _ret = 0; \
+ while (1) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (kthread_should_stop()) { \
+ _ret = -1; \
+ break; \
+ } \
+ \
+ if (cond) \
+ break; \
+ \
+ schedule(); \
+ try_to_freeze(); \
+ } \
+ set_current_state(TASK_RUNNING); \
+ _ret; \
+})
+
+size_t bch_rand_range(size_t);
+
+void memcpy_to_bio(struct bio *, struct bvec_iter, void *);
+void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+
+static inline void __memcpy_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
+#ifdef CONFIG_X86_64
+ long d0, d1, d2;
+ asm volatile("rep ; movsq"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (u64s), "1" (dst), "2" (src)
+ : "memory");
+#else
+ u64 *d = dst;
+ const u64 *s = src;
+
+ while (u64s--)
+ *d++ = *s++;
+#endif
+}
+
+static inline void memcpy_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
+ dst + u64s * sizeof(u64) <= src));
+
+ __memcpy_u64s(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_down(void *dst, const void *src,
+ unsigned u64s)
+{
+ __memcpy_u64s(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst > src);
+
+ __memmove_u64s_down(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_up(void *_dst, const void *_src,
+ unsigned u64s)
+{
+ u64 *dst = (u64 *) _dst + u64s - 1;
+ u64 *src = (u64 *) _src + u64s - 1;
+
+#ifdef CONFIG_X86_64
+ long d0, d1, d2;
+ asm volatile("std ;\n"
+ "rep ; movsq\n"
+ "cld ;\n"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (u64s), "1" (dst), "2" (src)
+ : "memory");
+#else
+ while (u64s--)
+ *dst-- = *src--;
+#endif
+}
+
+static inline void memmove_u64s_up(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst < src);
+
+ __memmove_u64s_up(dst, src, u64s);
+}
+
+static inline void memmove_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
+ if (dst < src)
+ __memmove_u64s_down(dst, src, u64s);
+ else
+ __memmove_u64s_up(dst, src, u64s);
+}
+
+#endif /* _BCACHE_UTIL_H */
diff --git a/libbcache/writeback.c b/libbcache/writeback.c
new file mode 100644
index 0000000..600bfbf
--- /dev/null
+++ b/libbcache/writeback.c
@@ -0,0 +1,657 @@
+/*
+ * background writeback - scan btree for dirty data and write it to the backing
+ * device
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree_update.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "keybuf.h"
+#include "keylist.h"
+#include "writeback.h"
+
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <trace/events/bcache.h>
+
+/* Rate limiting */
+
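+/*
+ * The dirty data target for a backing device is its share, proportional to
+ * device size, of capacity * writeback_percent / 100; the PD controller is
+ * then fed target and actual dirty data in bytes with sign -1, since raising
+ * the writeback rate makes the amount of dirty data go down.
+ */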
+static void __update_writeback_rate(struct cached_dev *dc)
+{
+ struct cache_set *c = dc->disk.c;
+ u64 cache_dirty_target =
+ div_u64(c->capacity * dc->writeback_percent, 100);
+ s64 target = div64_u64(cache_dirty_target *
+ bdev_sectors(dc->disk_sb.bdev),
+ c->cached_dev_sectors);
+ s64 dirty = bcache_dev_sectors_dirty(&dc->disk);
+
+ bch_pd_controller_update(&dc->writeback_pd, target << 9,
+ dirty << 9, -1);
+}
+
+static void update_writeback_rate(struct work_struct *work)
+{
+ struct cached_dev *dc = container_of(to_delayed_work(work),
+ struct cached_dev,
+ writeback_pd_update);
+
+ down_read(&dc->writeback_lock);
+
+ if (atomic_read(&dc->has_dirty) &&
+ dc->writeback_percent &&
+ !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+ __update_writeback_rate(dc);
+ else
+ dc->writeback_pd.rate.rate = UINT_MAX;
+
+ up_read(&dc->writeback_lock);
+
+ schedule_delayed_work(&dc->writeback_pd_update,
+ dc->writeback_pd_update_seconds * HZ);
+}
+
+struct dirty_io {
+ struct closure cl;
+ struct bch_replace_info replace;
+ struct cached_dev *dc;
+ struct cache *ca;
+ struct keybuf_key *w;
+ struct bch_extent_ptr ptr;
+ int error;
+ bool from_mempool;
+ /* Must be last */
+ struct bio bio;
+};
+
+#define DIRTY_IO_MEMPOOL_BVECS 64
+#define DIRTY_IO_MEMPOOL_SECTORS (DIRTY_IO_MEMPOOL_BVECS * PAGE_SECTORS)
+
+static void dirty_init(struct dirty_io *io)
+{
+ struct bio *bio = &io->bio;
+
+ bio_init(bio);
+ if (!io->dc->writeback_percent)
+ bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+ bio->bi_iter.bi_size = io->replace.key.k.size << 9;
+ bio->bi_max_vecs =
+ DIV_ROUND_UP(io->replace.key.k.size, PAGE_SECTORS);
+ bio->bi_io_vec = bio->bi_inline_vecs;
+ bch_bio_map(bio, NULL);
+}
+
+static void dirty_io_destructor(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+ if (io->from_mempool)
+ mempool_free(io, &io->dc->writeback_io_pool);
+ else
+ kfree(io);
+}
+
+static void write_dirty_finish(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ struct cached_dev *dc = io->dc;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, &io->bio, i)
+ mempool_free(bv->bv_page, &dc->writeback_page_pool);
+
+ if (!io->error) {
+ BKEY_PADDED(k) tmp;
+ int ret;
+
+ bkey_copy(&tmp.k, &io->replace.key);
+ io->replace.hook.fn = bch_extent_cmpxchg;
+ bkey_extent_set_cached(&tmp.k.k, true);
+
+ ret = bch_btree_insert(dc->disk.c, BTREE_ID_EXTENTS, &tmp.k,
+ NULL, &io->replace.hook, NULL, 0);
+ if (io->replace.successes == 0)
+ trace_bcache_writeback_collision(&io->replace.key.k);
+
+ atomic_long_inc(ret
+ ? &dc->disk.c->writeback_keys_failed
+ : &dc->disk.c->writeback_keys_done);
+ }
+
+ bch_keybuf_put(&dc->writeback_keys, io->w);
+
+ closure_return_with_destructor(cl, dirty_io_destructor);
+}
+
+static void dirty_endio(struct bio *bio)
+{
+ struct dirty_io *io = container_of(bio, struct dirty_io, bio);
+
+ if (bio->bi_error) {
+ trace_bcache_writeback_error(&io->replace.key.k,
+ op_is_write(bio_op(&io->bio)),
+ bio->bi_error);
+ io->error = bio->bi_error;
+ }
+
+ closure_put(&io->cl);
+}
+
+static void write_dirty(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+ if (!io->error) {
+ dirty_init(io);
+ bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+ io->bio.bi_iter.bi_sector =
+ bkey_start_offset(&io->replace.key.k);
+ io->bio.bi_bdev = io->dc->disk_sb.bdev;
+ io->bio.bi_end_io = dirty_endio;
+
+ closure_bio_submit(&io->bio, cl);
+ }
+
+ continue_at(cl, write_dirty_finish, io->dc->disk.c->wq);
+}
+
+static void read_dirty_endio(struct bio *bio)
+{
+ struct dirty_io *io = container_of(bio, struct dirty_io, bio);
+
+ cache_nonfatal_io_err_on(bio->bi_error, io->ca, "writeback read");
+
+ bch_account_io_completion(io->ca);
+
+ if (ptr_stale(io->ca, &io->ptr))
+ bio->bi_error = -EINTR;
+
+ dirty_endio(bio);
+}
+
+static void read_dirty_submit(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+ closure_bio_submit(&io->bio, cl);
+
+ continue_at(cl, write_dirty, system_freezable_wq);
+}
+
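+/*
+ * Main writeback loop: pull keys from the writeback keybuf (throttled by the
+ * rate limiter), read each dirty extent from the cache, then continue into
+ * write_dirty()/write_dirty_finish(), which write the data to the backing
+ * device and reinsert the extent as cached (clean) via cmpxchg. Allocation
+ * failures fall back to a bounded mempool, shrinking the key to
+ * DIRTY_IO_MEMPOOL_SECTORS if needed.
+ */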
+static u64 read_dirty(struct cached_dev *dc)
+{
+ struct keybuf_key *w;
+ struct dirty_io *io;
+ struct closure cl;
+ unsigned i;
+ struct bio_vec *bv;
+ u64 sectors_written = 0;
+ BKEY_PADDED(k) tmp;
+
+ closure_init_stack(&cl);
+
+ while (!bch_ratelimit_wait_freezable_stoppable(&dc->writeback_pd.rate)) {
+ w = bch_keybuf_next(&dc->writeback_keys);
+ if (!w)
+ break;
+
+ sectors_written += w->key.k.size;
+ bkey_copy(&tmp.k, &w->key);
+
+ while (tmp.k.k.size) {
+ struct extent_pick_ptr pick;
+
+ bch_extent_pick_ptr(dc->disk.c,
+ bkey_i_to_s_c(&tmp.k),
+ &pick);
+ if (IS_ERR_OR_NULL(pick.ca))
+ break;
+
+ io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(tmp.k.k.size,
+ PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io) {
+ trace_bcache_writeback_alloc_fail(pick.ca->set,
+ tmp.k.k.size);
+ io = mempool_alloc(&dc->writeback_io_pool,
+ GFP_KERNEL);
+ memset(io, 0, sizeof(*io) +
+ sizeof(struct bio_vec) *
+ DIRTY_IO_MEMPOOL_BVECS);
+ io->from_mempool = true;
+
+ bkey_copy(&io->replace.key, &tmp.k);
+
+ if (DIRTY_IO_MEMPOOL_SECTORS <
+ io->replace.key.k.size)
+ bch_key_resize(&io->replace.key.k,
+ DIRTY_IO_MEMPOOL_SECTORS);
+ } else {
+ bkey_copy(&io->replace.key, &tmp.k);
+ }
+
+ io->dc = dc;
+ io->ca = pick.ca;
+ io->w = w;
+ io->ptr = pick.ptr;
+ atomic_inc(&w->ref);
+
+ dirty_init(io);
+ bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
+ io->bio.bi_iter.bi_sector = pick.ptr.offset;
+ io->bio.bi_bdev = pick.ca->disk_sb.bdev;
+ io->bio.bi_end_io = read_dirty_endio;
+
+ bio_for_each_segment_all(bv, &io->bio, i) {
+ bv->bv_page =
+ mempool_alloc(&dc->writeback_page_pool,
+ i ? GFP_NOWAIT
+ : GFP_KERNEL);
+ if (!bv->bv_page) {
+ BUG_ON(!i);
+ io->bio.bi_vcnt = i;
+
+ io->bio.bi_iter.bi_size =
+ io->bio.bi_vcnt * PAGE_SIZE;
+
+ bch_key_resize(&io->replace.key.k,
+ bio_sectors(&io->bio));
+ break;
+ }
+ }
+
+ bch_cut_front(io->replace.key.k.p, &tmp.k);
+ trace_bcache_writeback(&io->replace.key.k);
+
+ bch_ratelimit_increment(&dc->writeback_pd.rate,
+ io->replace.key.k.size << 9);
+
+ closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+ }
+
+ bch_keybuf_put(&dc->writeback_keys, w);
+ }
+
+ /*
+ * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+ * freed) before refilling again
+ */
+ closure_sync(&cl);
+
+ return sectors_written;
+}
+
+/* Scan for dirty data */
+
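+/*
+ * Update the per-stripe dirty sector counts for [offset, offset + nr_sectors),
+ * setting a stripe's bit in full_dirty_stripes when it becomes entirely dirty
+ * and clearing it otherwise.
+ */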
+static void __bcache_dev_sectors_dirty_add(struct bcache_device *d,
+ u64 offset, int nr_sectors)
+{
+ unsigned stripe_offset, stripe, sectors_dirty;
+
+ if (!d)
+ return;
+
+ if (!d->stripe_sectors_dirty)
+ return;
+
+ stripe = offset_to_stripe(d, offset);
+ stripe_offset = offset & (d->stripe_size - 1);
+
+ while (nr_sectors) {
+ int s = min_t(unsigned, abs(nr_sectors),
+ d->stripe_size - stripe_offset);
+
+ if (nr_sectors < 0)
+ s = -s;
+
+ if (stripe >= d->nr_stripes)
+ return;
+
+ sectors_dirty = atomic_add_return(s,
+ d->stripe_sectors_dirty + stripe);
+ if (sectors_dirty == d->stripe_size)
+ set_bit(stripe, d->full_dirty_stripes);
+ else
+ clear_bit(stripe, d->full_dirty_stripes);
+
+ nr_sectors -= s;
+ stripe_offset = 0;
+ stripe++;
+ }
+}
+
+void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
+ u64 offset, int nr_sectors)
+{
+ struct bcache_device *d;
+
+ rcu_read_lock();
+ d = bch_dev_find(c, inode);
+ if (d)
+ __bcache_dev_sectors_dirty_add(d, offset, nr_sectors);
+ rcu_read_unlock();
+}
+
+static bool dirty_pred(struct keybuf *buf, struct bkey_s_c k)
+{
+ struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);
+
+ BUG_ON(k.k->p.inode != bcache_dev_inum(&dc->disk));
+
+ return bkey_extent_is_data(k.k) &&
+ !bkey_extent_is_cached(k.k);
+}
+
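+/*
+ * Refill the writeback keybuf from completely dirty stripes: scan
+ * full_dirty_stripes starting at the last scanned position, refill from each
+ * run of set bits, and wrap around the device once, stopping early if the
+ * keybuf's freelist runs out.
+ */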
+static void refill_full_stripes(struct cached_dev *dc)
+{
+ struct keybuf *buf = &dc->writeback_keys;
+ unsigned inode = bcache_dev_inum(&dc->disk);
+ unsigned start_stripe, stripe, next_stripe;
+ bool wrapped = false;
+
+ stripe = offset_to_stripe(&dc->disk, buf->last_scanned.offset);
+
+ if (stripe >= dc->disk.nr_stripes)
+ stripe = 0;
+
+ start_stripe = stripe;
+
+ while (1) {
+ stripe = find_next_bit(dc->disk.full_dirty_stripes,
+ dc->disk.nr_stripes, stripe);
+
+ if (stripe == dc->disk.nr_stripes)
+ goto next;
+
+ next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
+ dc->disk.nr_stripes, stripe);
+
+ buf->last_scanned = POS(inode,
+ stripe * dc->disk.stripe_size);
+
+ bch_refill_keybuf(dc->disk.c, buf,
+ POS(inode,
+ next_stripe * dc->disk.stripe_size),
+ dirty_pred);
+
+ if (array_freelist_empty(&buf->freelist))
+ return;
+
+ stripe = next_stripe;
+next:
+ if (wrapped && stripe > start_stripe)
+ return;
+
+ if (stripe == dc->disk.nr_stripes) {
+ stripe = 0;
+ wrapped = true;
+ }
+ }
+}
+
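+/*
+ * One writeback pass over this device's extents: refill the keybuf (preferring
+ * full stripes when partial stripe writes are expensive), wrapping the scan
+ * back around to where it started; if no dirty keys remain, clear has_dirty
+ * and mark the backing device clean in its superblock, otherwise write out
+ * what was found via read_dirty(). Returns the number of sectors written.
+ */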
+static u64 bch_writeback(struct cached_dev *dc)
+{
+ struct keybuf *buf = &dc->writeback_keys;
+ unsigned inode = bcache_dev_inum(&dc->disk);
+ struct bpos start = POS(inode, 0);
+ struct bpos end = POS(inode, KEY_OFFSET_MAX);
+ struct bpos start_pos;
+ u64 sectors_written = 0;
+
+ buf->last_scanned = POS(inode, 0);
+
+ while (bkey_cmp(buf->last_scanned, end) < 0 &&
+ !kthread_should_stop()) {
+ down_write(&dc->writeback_lock);
+
+ if (!atomic_read(&dc->has_dirty)) {
+ up_write(&dc->writeback_lock);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop())
+ return sectors_written;
+
+ schedule();
+ try_to_freeze();
+ return sectors_written;
+ }
+
+ if (bkey_cmp(buf->last_scanned, end) >= 0)
+ buf->last_scanned = POS(inode, 0);
+
+ if (dc->partial_stripes_expensive) {
+ refill_full_stripes(dc);
+ if (array_freelist_empty(&buf->freelist))
+ goto refill_done;
+ }
+
+ start_pos = buf->last_scanned;
+ bch_refill_keybuf(dc->disk.c, buf, end, dirty_pred);
+
+ if (bkey_cmp(buf->last_scanned, end) >= 0) {
+ /*
+ * If we get to the end start scanning again from the
+ * beginning, and only scan up to where we initially
+ * started scanning from:
+ */
+ buf->last_scanned = start;
+ bch_refill_keybuf(dc->disk.c, buf, start_pos,
+ dirty_pred);
+ }
+
+ if (RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
+ atomic_set(&dc->has_dirty, 0);
+ cached_dev_put(dc);
+ SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);
+ bch_write_bdev_super(dc, NULL);
+ }
+
+refill_done:
+ up_write(&dc->writeback_lock);
+
+ bch_ratelimit_reset(&dc->writeback_pd.rate);
+ sectors_written += read_dirty(dc);
+ }
+
+ return sectors_written;
+}
+
+static int bch_writeback_thread(void *arg)
+{
+ struct cached_dev *dc = arg;
+ struct cache_set *c = dc->disk.c;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ unsigned long last;
+ u64 sectors_written;
+
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ if (kthread_wait_freezable(dc->writeback_running ||
+ test_bit(BCACHE_DEV_DETACHING,
+ &dc->disk.flags)))
+ break;
+
+ last = atomic_long_read(&clock->now);
+
+ sectors_written = bch_writeback(dc);
+
+ if (sectors_written < c->capacity >> 4)
+ bch_kthread_io_clock_wait(clock,
+ last + (c->capacity >> 5));
+ }
+
+ return 0;
+}
+
+/**
+ * bch_writeback_recalc_oldest_gens - update oldest_gen pointers from writeback keys
+ *
+ * This prevents us from wrapping around gens for a bucket only referenced from
+ * writeback keybufs. We don't actually care that the data in those buckets is
+ * marked live, only that we don't wrap the gens.
+ */
+void bch_writeback_recalc_oldest_gens(struct cache_set *c)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+
+ rcu_read_lock();
+
+ radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
+ struct bcache_device *d;
+ struct cached_dev *dc;
+
+ d = radix_tree_deref_slot(slot);
+
+ if (!CACHED_DEV(&d->inode.v))
+ continue;
+ dc = container_of(d, struct cached_dev, disk);
+
+ bch_keybuf_recalc_oldest_gens(c, &dc->writeback_keys);
+ }
+
+ rcu_read_unlock();
+}
+
+/* Init */
+
+void bch_sectors_dirty_init(struct cached_dev *dc, struct cache_set *c)
+{
+ struct bcache_device *d = &dc->disk;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ /*
+ * We have to do this before the disk is added to the radix tree or we
+ * race with moving GC
+ */
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(bcache_dev_inum(d), 0), k) {
+ if (k.k->p.inode > bcache_dev_inum(d))
+ break;
+
+ if (bkey_extent_is_data(k.k) &&
+ !bkey_extent_is_cached(k.k))
+ __bcache_dev_sectors_dirty_add(d,
+ bkey_start_offset(k.k),
+ k.k->size);
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+ bch_btree_iter_unlock(&iter);
+
+ dc->writeback_pd.last_actual = bcache_dev_sectors_dirty(d);
+}
+
+void bch_cached_dev_writeback_stop(struct cached_dev *dc)
+{
+ cancel_delayed_work_sync(&dc->writeback_pd_update);
+ if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
+ kthread_stop(dc->writeback_thread);
+ dc->writeback_thread = NULL;
+ }
+}
+
+void bch_cached_dev_writeback_free(struct cached_dev *dc)
+{
+ struct bcache_device *d = &dc->disk;
+
+ mempool_exit(&dc->writeback_page_pool);
+ mempool_exit(&dc->writeback_io_pool);
+ kvfree(d->full_dirty_stripes);
+ kvfree(d->stripe_sectors_dirty);
+}
+
+int bch_cached_dev_writeback_init(struct cached_dev *dc)
+{
+ struct bcache_device *d = &dc->disk;
+ sector_t sectors;
+ size_t n;
+
+ sectors = get_capacity(dc->disk.disk);
+
+ if (!d->stripe_size) {
+#ifdef CONFIG_BCACHE_DEBUG
+ d->stripe_size = 1 << 0;
+#else
+ d->stripe_size = 1 << 31;
+#endif
+ }
+
+ pr_debug("stripe size: %d sectors", d->stripe_size);
+ d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
+
+ if (!d->nr_stripes ||
+ d->nr_stripes > INT_MAX ||
+ d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
+ pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
+ (unsigned)d->nr_stripes);
+ return -ENOMEM;
+ }
+
+ n = d->nr_stripes * sizeof(atomic_t);
+ d->stripe_sectors_dirty = n < PAGE_SIZE << 6
+ ? kzalloc(n, GFP_KERNEL)
+ : vzalloc(n);
+ if (!d->stripe_sectors_dirty) {
+ pr_err("cannot allocate stripe_sectors_dirty");
+ return -ENOMEM;
+ }
+
+ n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
+ d->full_dirty_stripes = n < PAGE_SIZE << 6
+ ? kzalloc(n, GFP_KERNEL)
+ : vzalloc(n);
+ if (!d->full_dirty_stripes) {
+ pr_err("cannot allocate full_dirty_stripes");
+ return -ENOMEM;
+ }
+
+ if (mempool_init_kmalloc_pool(&dc->writeback_io_pool, 4,
+ sizeof(struct dirty_io) +
+ sizeof(struct bio_vec) *
+ DIRTY_IO_MEMPOOL_BVECS) ||
+ mempool_init_page_pool(&dc->writeback_page_pool,
+ (64 << 10) / PAGE_SIZE, 0))
+ return -ENOMEM;
+
+ init_rwsem(&dc->writeback_lock);
+ bch_keybuf_init(&dc->writeback_keys);
+
+ dc->writeback_metadata = true;
+ dc->writeback_running = true;
+ dc->writeback_percent = 10;
+ dc->writeback_pd_update_seconds = 5;
+
+ bch_pd_controller_init(&dc->writeback_pd);
+ INIT_DELAYED_WORK(&dc->writeback_pd_update, update_writeback_rate);
+
+ return 0;
+}
+
+int bch_cached_dev_writeback_start(struct cached_dev *dc)
+{
+ dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
+ "bcache_writeback");
+ if (IS_ERR(dc->writeback_thread))
+ return PTR_ERR(dc->writeback_thread);
+
+ schedule_delayed_work(&dc->writeback_pd_update,
+ dc->writeback_pd_update_seconds * HZ);
+
+ bch_writeback_queue(dc);
+
+ return 0;
+}
diff --git a/libbcache/writeback.h b/libbcache/writeback.h
new file mode 100644
index 0000000..77e5965
--- /dev/null
+++ b/libbcache/writeback.h
@@ -0,0 +1,100 @@
+#ifndef _BCACHE_WRITEBACK_H
+#define _BCACHE_WRITEBACK_H
+
+#include "blockdev.h"
+#include "buckets.h"
+
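+/*
+ * Cutoffs are percentages of cache capacity that must remain available:
+ * should_writeback() refuses to writeback-cache anything below
+ * CUTOFF_WRITEBACK_SYNC percent free, and otherwise writes back sync requests
+ * unconditionally and other requests only below CUTOFF_WRITEBACK percent free
+ * (already-dirty stripes are an exception when partial stripe writes are
+ * expensive).
+ */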
+#define CUTOFF_WRITEBACK 60
+#define CUTOFF_WRITEBACK_SYNC 30
+
+static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
+{
+ uint64_t i, ret = 0;
+
+ for (i = 0; i < d->nr_stripes; i++)
+ ret += atomic_read(d->stripe_sectors_dirty + i);
+
+ return ret;
+}
+
+static inline unsigned offset_to_stripe(struct bcache_device *d,
+ uint64_t offset)
+{
+ do_div(offset, d->stripe_size);
+ return offset;
+}
+
+static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
+ uint64_t offset,
+ unsigned nr_sectors)
+{
+ unsigned stripe = offset_to_stripe(&dc->disk, offset);
+
+ while (1) {
+ if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
+ return true;
+
+ if (nr_sectors <= dc->disk.stripe_size)
+ return false;
+
+ nr_sectors -= dc->disk.stripe_size;
+ stripe++;
+ }
+}
+
+static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
+ unsigned cache_mode, bool would_skip)
+{
+ struct cache_set *c = dc->disk.c;
+ u64 available = sectors_available(c);
+
+ if (cache_mode != CACHE_MODE_WRITEBACK ||
+ test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
+ available * 100 < c->capacity * CUTOFF_WRITEBACK_SYNC)
+ return false;
+
+ if (dc->partial_stripes_expensive &&
+ bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector,
+ bio_sectors(bio)))
+ return true;
+
+ if (would_skip)
+ return false;
+
+ return bio->bi_opf & REQ_SYNC ||
+ available * 100 < c->capacity * CUTOFF_WRITEBACK;
+}
+
+static inline void bch_writeback_queue(struct cached_dev *dc)
+{
+ if (!IS_ERR_OR_NULL(dc->writeback_thread))
+ wake_up_process(dc->writeback_thread);
+}
+
+static inline void bch_writeback_add(struct cached_dev *dc)
+{
+ if (!atomic_read(&dc->has_dirty) &&
+ !atomic_xchg(&dc->has_dirty, 1)) {
+ atomic_inc(&dc->count);
+
+ if (BDEV_STATE(dc->disk_sb.sb) != BDEV_STATE_DIRTY) {
+ SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_DIRTY);
+ /* XXX: should do this synchronously */
+ bch_write_bdev_super(dc, NULL);
+ }
+
+ bch_writeback_queue(dc);
+ }
+}
+
+void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, u64, int);
+
+void bch_writeback_recalc_oldest_gens(struct cache_set *);
+void bch_sectors_dirty_init(struct cached_dev *, struct cache_set *c);
+
+void bch_cached_dev_writeback_stop(struct cached_dev *);
+void bch_cached_dev_writeback_free(struct cached_dev *);
+int bch_cached_dev_writeback_init(struct cached_dev *);
+int bch_cached_dev_writeback_start(struct cached_dev *);
+
+#endif /* _BCACHE_WRITEBACK_H */
diff --git a/libbcache/xattr.c b/libbcache/xattr.c
new file mode 100644
index 0000000..e9e0a9a
--- /dev/null
+++ b/libbcache/xattr.c
@@ -0,0 +1,379 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "fs.h"
+#include "str_hash.h"
+#include "xattr.h"
+
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+#include <crypto/hash.h>
+
+struct xattr_search_key {
+ u8 type;
+ struct qstr name;
+};
+
+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \
+ { .type = _type, .name = QSTR_INIT(_name, _len) })
+
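+/*
+ * Hash the seed from the str_hash info, the xattr type byte and the name,
+ * either with SHA1 via the crypto shash API or with the cache set's
+ * configured string hash.
+ */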
+static u64 bch_xattr_hash(const struct bch_hash_info *info,
+ const struct xattr_search_key *key)
+{
+ switch (info->type) {
+ case BCH_STR_HASH_SHA1: {
+ SHASH_DESC_ON_STACK(desc, bch_sha1);
+ u8 digest[SHA1_DIGEST_SIZE];
+ u64 ret;
+
+ desc->tfm = bch_sha1;
+ desc->flags = 0;
+ crypto_shash_init(desc);
+
+ crypto_shash_update(desc, (void *) &info->seed, sizeof(info->seed));
+
+ crypto_shash_update(desc, (void *) &key->type, sizeof(key->type));
+ crypto_shash_update(desc, (void *) key->name.name, key->name.len);
+
+ crypto_shash_final(desc, digest);
+ memcpy(&ret, &digest, sizeof(ret));
+ return ret >> 1;
+ }
+ default: {
+ struct bch_str_hash_ctx ctx;
+
+ bch_str_hash_init(&ctx, info->type);
+ bch_str_hash_update(&ctx, info->type, &info->seed, sizeof(info->seed));
+
+ bch_str_hash_update(&ctx, info->type, &key->type, sizeof(key->type));
+ bch_str_hash_update(&ctx, info->type, key->name.name, key->name.len);
+
+ return bch_str_hash_end(&ctx, info->type);
+ }
+ }
+}
+
+#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len)
+
+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
+{
+ return bch_xattr_hash(info, key);
+}
+
+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
+
+ return bch_xattr_hash(info,
+ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
+}
+
+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+ const struct xattr_search_key *r = _r;
+
+ return l.v->x_type != r->type ||
+ l.v->x_name_len != r->name.len ||
+ memcmp(l.v->x_name, r->name.name, r->name.len);
+}
+
+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
+
+ return l.v->x_type != r.v->x_type ||
+ l.v->x_name_len != r.v->x_name_len ||
+ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
+}
+
+static const struct bch_hash_desc xattr_hash_desc = {
+ .btree_id = BTREE_ID_XATTRS,
+ .key_type = BCH_XATTR,
+ .whiteout_type = BCH_XATTR_WHITEOUT,
+ .hash_key = xattr_hash_key,
+ .hash_bkey = xattr_hash_bkey,
+ .cmp_key = xattr_cmp_key,
+ .cmp_bkey = xattr_cmp_bkey,
+};
+
+static const char *bch_xattr_invalid(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_XATTR:
+ return bkey_val_bytes(k.k) < sizeof(struct bch_xattr)
+ ? "value too small"
+ : NULL;
+
+ case BCH_XATTR_WHITEOUT:
+ return bkey_val_bytes(k.k) != 0
+ ? "value size should be zero"
+ : NULL;
+
+ default:
+ return "invalid type";
+ }
+}
+
+static void bch_xattr_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ struct bkey_s_c_xattr xattr;
+ int n;
+
+ switch (k.k->type) {
+ case BCH_XATTR:
+ xattr = bkey_s_c_to_xattr(k);
+
+ if (size) {
+ n = min_t(unsigned, size, xattr.v->x_name_len);
+ memcpy(buf, xattr.v->x_name, n);
+ buf[size - 1] = '\0';
+ buf += n;
+ size -= n;
+ }
+
+ n = scnprintf(buf, size, " -> ");
+ buf += n;
+ size -= n;
+
+ if (size) {
+ n = min_t(unsigned, size,
+ le16_to_cpu(xattr.v->x_val_len));
+ memcpy(buf, xattr_val(xattr.v), n);
+ buf[size - 1] = '\0';
+ buf += n;
+ size -= n;
+ }
+
+ break;
+ case BCH_XATTR_WHITEOUT:
+ scnprintf(buf, size, "whiteout");
+ break;
+ }
+}
+
+const struct bkey_ops bch_bkey_xattr_ops = {
+ .key_invalid = bch_xattr_invalid,
+ .val_to_text = bch_xattr_to_text,
+};
+
+int bch_xattr_get(struct cache_set *c, struct inode *inode,
+ const char *name, void *buffer, size_t size, int type)
+{
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_xattr xattr;
+ int ret;
+
+ k = bch_hash_lookup(xattr_hash_desc, &ei->str_hash, c,
+ ei->vfs_inode.i_ino, &iter,
+ &X_SEARCH(type, name, strlen(name)));
+ if (IS_ERR(k.k))
+ return bch_btree_iter_unlock(&iter) ?: -ENODATA;
+
+ xattr = bkey_s_c_to_xattr(k);
+ ret = le16_to_cpu(xattr.v->x_val_len);
+ if (buffer) {
+ if (ret > size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, xattr_val(xattr.v), ret);
+ }
+
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
+
+int bch_xattr_set(struct cache_set *c, struct inode *inode,
+ const char *name, const void *value, size_t size,
+ int flags, int type)
+{
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
+ int ret;
+
+ if (!value) {
+ ret = bch_hash_delete(xattr_hash_desc, &ei->str_hash,
+ c, ei->vfs_inode.i_ino,
+ &ei->journal_seq, &search);
+ } else {
+ struct bkey_i_xattr *xattr;
+ unsigned u64s = BKEY_U64s +
+ DIV_ROUND_UP(sizeof(struct bch_xattr) +
+ search.name.len + size,
+ sizeof(u64));
+
+ if (u64s > U8_MAX)
+ return -ERANGE;
+
+ xattr = kmalloc(u64s * sizeof(u64), GFP_NOFS);
+ if (!xattr)
+ return -ENOMEM;
+
+ bkey_xattr_init(&xattr->k_i);
+ xattr->k.u64s = u64s;
+ xattr->v.x_type = type;
+ xattr->v.x_name_len = search.name.len;
+ xattr->v.x_val_len = cpu_to_le16(size);
+ memcpy(xattr->v.x_name, search.name.name, search.name.len);
+ memcpy(xattr_val(&xattr->v), value, size);
+
+ ret = bch_hash_set(xattr_hash_desc, &ei->str_hash, c,
+ ei->vfs_inode.i_ino, &ei->journal_seq,
+ &xattr->k_i,
+ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
+ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+ kfree(xattr);
+ }
+
+ if (ret == -ENOENT)
+ ret = flags & XATTR_REPLACE ? -ENODATA : 0;
+
+ return ret;
+}
+
+static const struct xattr_handler *bch_xattr_type_to_handler(unsigned);
+
+static size_t bch_xattr_emit(struct dentry *dentry,
+ const struct bch_xattr *xattr,
+ char *buffer, size_t buffer_size)
+{
+ const struct xattr_handler *handler =
+ bch_xattr_type_to_handler(xattr->x_type);
+
+ if (handler && (!handler->list || handler->list(dentry))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ const size_t prefix_len = strlen(prefix);
+ const size_t total_len = prefix_len + xattr->x_name_len + 1;
+
+ if (buffer && total_len <= buffer_size) {
+ memcpy(buffer, prefix, prefix_len);
+ memcpy(buffer + prefix_len,
+ xattr->x_name, xattr->x_name_len);
+ buffer[prefix_len + xattr->x_name_len] = '\0';
+ }
+
+ return total_len;
+ } else {
+ return 0;
+ }
+}
+
+ssize_t bch_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ struct cache_set *c = dentry->d_sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ const struct bch_xattr *xattr;
+ u64 inum = dentry->d_inode->i_ino;
+ ssize_t ret = 0;
+ size_t len;
+
+ for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), k) {
+ BUG_ON(k.k->p.inode < inum);
+
+ if (k.k->p.inode > inum)
+ break;
+
+ if (k.k->type != BCH_XATTR)
+ continue;
+
+ xattr = bkey_s_c_to_xattr(k).v;
+
+ len = bch_xattr_emit(dentry, xattr, buffer, buffer_size);
+ if (buffer) {
+ if (len > buffer_size) {
+ bch_btree_iter_unlock(&iter);
+ return -ERANGE;
+ }
+
+ buffer += len;
+ buffer_size -= len;
+ }
+
+ ret += len;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+static int bch_xattr_get_handler(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ return bch_xattr_get(c, inode, name, buffer, size, handler->flags);
+}
+
+static int bch_xattr_set_handler(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ return bch_xattr_set(c, inode, name, value, size, flags,
+ handler->flags);
+}
+
+static const struct xattr_handler bch_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = bch_xattr_get_handler,
+ .set = bch_xattr_set_handler,
+ .flags = BCH_XATTR_INDEX_USER,
+};
+
+static bool bch_xattr_trusted_list(struct dentry *dentry)
+{
+ return capable(CAP_SYS_ADMIN);
+}
+
+static const struct xattr_handler bch_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = bch_xattr_trusted_list,
+ .get = bch_xattr_get_handler,
+ .set = bch_xattr_set_handler,
+ .flags = BCH_XATTR_INDEX_TRUSTED,
+};
+
+static const struct xattr_handler bch_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = bch_xattr_get_handler,
+ .set = bch_xattr_set_handler,
+ .flags = BCH_XATTR_INDEX_SECURITY,
+};
+
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+ [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
+ &posix_acl_access_xattr_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+ &posix_acl_default_xattr_handler,
+ [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
+ [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+};
+
+const struct xattr_handler *bch_xattr_handlers[] = {
+ &bch_xattr_user_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+ &bch_xattr_trusted_handler,
+ &bch_xattr_security_handler,
+ NULL
+};
+
+static const struct xattr_handler *bch_xattr_type_to_handler(unsigned type)
+{
+ return type < ARRAY_SIZE(bch_xattr_handler_map)
+ ? bch_xattr_handler_map[type]
+ : NULL;
+}
diff --git a/libbcache/xattr.h b/libbcache/xattr.h
new file mode 100644
index 0000000..54eb920
--- /dev/null
+++ b/libbcache/xattr.h
@@ -0,0 +1,17 @@
+#ifndef _BCACHE_XATTR_H
+#define _BCACHE_XATTR_H
+
+extern const struct bkey_ops bch_bkey_xattr_ops;
+
+struct dentry;
+struct xattr_handler;
+
+int bch_xattr_get(struct cache_set *, struct inode *,
+ const char *, void *, size_t, int);
+int bch_xattr_set(struct cache_set *, struct inode *,
+ const char *, const void *, size_t, int, int);
+ssize_t bch_xattr_list(struct dentry *, char *, size_t);
+
+extern const struct xattr_handler *bch_xattr_handlers[];
+
+#endif /* _BCACHE_XATTR_H */
diff --git a/linux/bio.c b/linux/bio.c
new file mode 100644
index 0000000..966f227
--- /dev/null
+++ b/linux/bio.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+
+void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
+ struct bio *src, struct bvec_iter src_iter)
+{
+ struct bio_vec src_bv, dst_bv;
+ void *src_p, *dst_p;
+ unsigned bytes;
+
+ while (1) {
+ if (!src_iter.bi_size) {
+ src = src->bi_next;
+ if (!src)
+ break;
+
+ src_iter = src->bi_iter;
+ }
+
+ if (!dst_iter.bi_size) {
+ dst = dst->bi_next;
+ if (!dst)
+ break;
+
+ dst_iter = dst->bi_iter;
+ }
+
+ src_bv = bio_iter_iovec(src, src_iter);
+ dst_bv = bio_iter_iovec(dst, dst_iter);
+
+ bytes = min(src_bv.bv_len, dst_bv.bv_len);
+
+ src_p = kmap_atomic(src_bv.bv_page);
+ dst_p = kmap_atomic(dst_bv.bv_page);
+
+ memcpy(dst_p + dst_bv.bv_offset,
+ src_p + src_bv.bv_offset,
+ bytes);
+
+ kunmap_atomic(dst_p);
+ kunmap_atomic(src_p);
+
+ bio_advance_iter(src, &src_iter, bytes);
+ bio_advance_iter(dst, &dst_iter, bytes);
+ }
+}
+
+void bio_copy_data(struct bio *dst, struct bio *src)
+{
+ bio_copy_data_iter(dst, dst->bi_iter,
+ src, src->bi_iter);
+}
+
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
+{
+ unsigned long flags;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ __bio_for_each_segment(bv, bio, iter, start) {
+ char *data = bvec_kmap_irq(&bv, &flags);
+ memset(data, 0, bv.bv_len);
+ bvec_kunmap_irq(data, &flags);
+ }
+}
+
+void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
+{
+ /*
+ * most users will be overriding ->bi_bdev with a new target,
+ * so we don't set nor calculate new physical/hw segment counts here
+ */
+ bio->bi_bdev = bio_src->bi_bdev;
+ bio_set_flag(bio, BIO_CLONED);
+ bio->bi_opf = bio_src->bi_opf;
+ bio->bi_iter = bio_src->bi_iter;
+ bio->bi_io_vec = bio_src->bi_io_vec;
+}
+
+struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
+{
+ struct bio *b;
+
+ b = bio_alloc_bioset(gfp_mask, 0, bs);
+ if (!b)
+ return NULL;
+
+ __bio_clone_fast(b, bio);
+ return b;
+}
+
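+/*
+ * Split off and return a clone covering the first @sectors sectors of @bio,
+ * advancing @bio past them.
+ */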
+struct bio *bio_split(struct bio *bio, int sectors,
+ gfp_t gfp, struct bio_set *bs)
+{
+ struct bio *split = NULL;
+
+ BUG_ON(sectors <= 0);
+ BUG_ON(sectors >= bio_sectors(bio));
+
+ /*
+ * Discards need a mutable bio_vec to accommodate the payload
+ * required by the DSM TRIM and UNMAP commands.
+ */
+ if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
+ split = bio_clone_bioset(bio, gfp, bs);
+ else
+ split = bio_clone_fast(bio, gfp, bs);
+
+ if (!split)
+ return NULL;
+
+ split->bi_iter.bi_size = sectors << 9;
+
+ bio_advance(bio, split->bi_iter.bi_size);
+
+ return split;
+}
+
+int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+ int i;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ bv->bv_page = alloc_page(gfp_mask);
+ if (!bv->bv_page) {
+ while (--bv >= bio->bi_io_vec)
+ __free_page(bv->bv_page);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+void bio_advance(struct bio *bio, unsigned bytes)
+{
+ bio_advance_iter(bio, &bio->bi_iter, bytes);
+}
+
+static void bio_free(struct bio *bio)
+{
+ unsigned front_pad = bio->bi_pool ? bio->bi_pool->front_pad : 0;
+
+ kfree((void *) bio - front_pad);
+}
+
+void bio_put(struct bio *bio)
+{
+ if (!bio_flagged(bio, BIO_REFFED))
+ bio_free(bio);
+ else {
+ BUG_ON(!atomic_read(&bio->__bi_cnt));
+
+ /*
+ * last put frees it
+ */
+ if (atomic_dec_and_test(&bio->__bi_cnt))
+ bio_free(bio);
+ }
+}
+
+static inline bool bio_remaining_done(struct bio *bio)
+{
+ /*
+ * If we're not chaining, then ->__bi_remaining is always 1 and
+ * we always end io on the first invocation.
+ */
+ if (!bio_flagged(bio, BIO_CHAIN))
+ return true;
+
+ BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
+
+ if (atomic_dec_and_test(&bio->__bi_remaining)) {
+ bio_clear_flag(bio, BIO_CHAIN);
+ return true;
+ }
+
+ return false;
+}
+
+static struct bio *__bio_chain_endio(struct bio *bio)
+{
+ struct bio *parent = bio->bi_private;
+
+ if (!parent->bi_error)
+ parent->bi_error = bio->bi_error;
+ bio_put(bio);
+ return parent;
+}
+
+static void bio_chain_endio(struct bio *bio)
+{
+ bio_endio(__bio_chain_endio(bio));
+}
+
+void bio_endio(struct bio *bio)
+{
+again:
+ if (!bio_remaining_done(bio))
+ return;
+
+ /*
+ * Need to have a real endio function for chained bios, otherwise
+ * various corner cases will break (like stacking block devices that
+ * save/restore bi_end_io) - however, we want to avoid unbounded
+ * recursion and blowing the stack. Tail call optimization would
+ * handle this, but compiling with frame pointers also disables
+ * gcc's sibling call optimization.
+ */
+ if (bio->bi_end_io == bio_chain_endio) {
+ bio = __bio_chain_endio(bio);
+ goto again;
+ }
+
+ if (bio->bi_end_io)
+ bio->bi_end_io(bio);
+}
+
+void bio_endio_nodec(struct bio *bio)
+{
+ goto nodec;
+
+ while (bio) {
+ if (unlikely(!bio_remaining_done(bio)))
+ break;
+nodec:
+ if (bio->bi_end_io == bio_chain_endio) {
+ struct bio *parent = bio->bi_private;
+ parent->bi_error = bio->bi_error;
+ bio_put(bio);
+ bio = parent;
+ } else {
+ if (bio->bi_end_io)
+ bio->bi_end_io(bio);
+ bio = NULL;
+ }
+ }
+}
+
+void bio_reset(struct bio *bio)
+{
+ unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
+
+ memset(bio, 0, BIO_RESET_BYTES);
+ bio->bi_flags = flags;
+ atomic_set(&bio->__bi_remaining, 1);
+}
+
+struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
+{
+ unsigned front_pad = bs ? bs->front_pad : 0;
+ struct bio *bio;
+ void *p;
+
+ p = kmalloc(front_pad +
+ sizeof(struct bio) +
+ nr_iovecs * sizeof(struct bio_vec),
+ gfp_mask);
+
+ if (unlikely(!p))
+ return NULL;
+
+ bio = p + front_pad;
+ bio_init(bio);
+ bio->bi_pool = bs;
+ bio->bi_max_vecs = nr_iovecs;
+ bio->bi_io_vec = bio->bi_inline_vecs;
+
+ return bio;
+}
+
+struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+ struct bio_set *bs)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
+ if (!bio)
+ return NULL;
+
+ bio->bi_bdev = bio_src->bi_bdev;
+ bio->bi_opf = bio_src->bi_opf;
+ bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
+ bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
+
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ break;
+ case REQ_OP_WRITE_SAME:
+ bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
+ break;
+ default:
+ bio_for_each_segment(bv, bio_src, iter)
+ bio->bi_io_vec[bio->bi_vcnt++] = bv;
+ break;
+ }
+
+ return bio;
+}
diff --git a/linux/bitrev.c b/linux/bitrev.c
new file mode 100644
index 0000000..61207bb
--- /dev/null
+++ b/linux/bitrev.c
@@ -0,0 +1,37 @@
+#include <linux/types.h>
+#include <linux/bitrev.h>
+
+const u8 byte_rev_table[256] = {
+ 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
+ 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
+ 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
+ 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
+ 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
+ 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
+ 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
+ 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
+ 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
+ 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
+ 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
+ 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
+ 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
+ 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
+ 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
+ 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
+ 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
+ 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
+ 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
+ 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
+ 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
+ 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
+ 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
+ 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
+ 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
+ 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
+ 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
+ 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
+ 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
+ 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
+ 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
+ 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
+};
diff --git a/linux/blkdev.c b/linux/blkdev.c
new file mode 100644
index 0000000..b4ff451
--- /dev/null
+++ b/linux/blkdev.c
@@ -0,0 +1,155 @@
+
+#include <alloca.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+
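+/*
+ * Userspace implementation: translate the bio's segments into an iovec array
+ * and issue a single preadv()/pwritev() at the bio's sector offset;
+ * REQ_PREFLUSH and REQ_FUA are approximated with fdatasync() before and after
+ * the transfer.
+ */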
+int submit_bio_wait(struct bio *bio)
+{
+ struct iovec *iov;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ ssize_t ret;
+ unsigned i;
+
+ if (bio->bi_opf & REQ_PREFLUSH)
+ fdatasync(bio->bi_bdev->bd_fd);
+
+ i = 0;
+ bio_for_each_segment(bv, bio, iter)
+ i++;
+
+ iov = alloca(sizeof(*iov) * i);
+
+ i = 0;
+ bio_for_each_segment(bv, bio, iter)
+ iov[i++] = (struct iovec) {
+ .iov_base = page_address(bv.bv_page) + bv.bv_offset,
+ .iov_len = bv.bv_len,
+ };
+
+ switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ ret = preadv(bio->bi_bdev->bd_fd, iov, i,
+ bio->bi_iter.bi_sector << 9);
+ break;
+ case REQ_OP_WRITE:
+ ret = pwritev(bio->bi_bdev->bd_fd, iov, i,
+ bio->bi_iter.bi_sector << 9);
+ break;
+ default:
+ BUG();
+ }
+
+ if (bio->bi_opf & REQ_FUA)
+ fdatasync(bio->bi_bdev->bd_fd);
+
+ return ret == bio->bi_iter.bi_size ? 0 : -EIO;
+}
+
+void generic_make_request(struct bio *bio)
+{
+ bio->bi_error = submit_bio_wait(bio);
+ bio_endio(bio);
+}
+
+int blkdev_issue_discard(struct block_device *bdev,
+ sector_t sector, sector_t nr_sects,
+ gfp_t gfp_mask, unsigned long flags)
+{
+ return 0;
+}
+
+unsigned bdev_logical_block_size(struct block_device *bdev)
+{
+ struct stat statbuf;
+ unsigned blksize;
+ int ret;
+
+ ret = fstat(bdev->bd_fd, &statbuf);
+ BUG_ON(ret);
+
+ if (!S_ISBLK(statbuf.st_mode))
+ return statbuf.st_blksize >> 9;
+
+ ret = ioctl(bdev->bd_fd, BLKPBSZGET, &blksize);
+ BUG_ON(ret);
+
+ return blksize >> 9;
+}
+
+sector_t get_capacity(struct gendisk *disk)
+{
+ struct block_device *bdev =
+ container_of(disk, struct block_device, __bd_disk);
+ struct stat statbuf;
+ u64 bytes;
+ int ret;
+
+ ret = fstat(bdev->bd_fd, &statbuf);
+ BUG_ON(ret);
+
+ if (!S_ISBLK(statbuf.st_mode))
+ return statbuf.st_size >> 9;
+
+ ret = ioctl(bdev->bd_fd, BLKGETSIZE64, &bytes);
+ BUG_ON(ret);
+
+ return bytes >> 9;
+}
+
+void blkdev_put(struct block_device *bdev, fmode_t mode)
+{
+ fdatasync(bdev->bd_fd);
+ close(bdev->bd_fd);
+ free(bdev);
+}
+
+struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+ void *holder)
+{
+ struct block_device *bdev;
+ int flags = O_DIRECT;
+
+ if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE))
+ flags = O_RDWR;
+ else if (mode & FMODE_READ)
+ flags = O_RDONLY;
+ else if (mode & FMODE_WRITE)
+ flags = O_WRONLY;
+
+ if (mode & FMODE_EXCL)
+ flags |= O_EXCL;
+
+ bdev = malloc(sizeof(*bdev));
+ memset(bdev, 0, sizeof(*bdev));
+
+ strncpy(bdev->name, path, sizeof(bdev->name));
+ bdev->name[sizeof(bdev->name) - 1] = '\0';
+
+ bdev->bd_fd = open(path, flags);
+ bdev->bd_holder = holder;
+ bdev->bd_disk = &bdev->__bd_disk;
+
+ BUG_ON(bdev->bd_fd < 0);
+
+ return bdev;
+}
+
+void bdput(struct block_device *bdev)
+{
+ BUG();
+}
+
+struct block_device *lookup_bdev(const char *path)
+{
+ return ERR_PTR(-EINVAL);
+}
diff --git a/linux/completion.c b/linux/completion.c
new file mode 100644
index 0000000..fcc77a0
--- /dev/null
+++ b/linux/completion.c
@@ -0,0 +1,311 @@
+/*
+ * Generic wait-for-completion handler;
+ *
+ * It differs from semaphores in that the default case is the opposite:
+ * wait_for_completion() blocks by default, whereas a semaphore does not. The
+ * interface also makes it easy to 'complete' multiple waiting threads,
+ * something which isn't entirely natural for semaphores.
+ *
+ * But more importantly, the primitive documents the usage. Semaphores would
+ * typically be used for exclusion which gives rise to priority inversion.
+ * Waiting for a completion is typically a synchronization point, not an exclusion point.
+ */
+
+#include <linux/sched.h>
+#include <linux/completion.h>
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done++;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 1);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_all(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done += UINT_MAX/2;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 0);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+static inline long __sched
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ __add_wait_queue_tail_exclusive(&x->wait, &wait);
+ do {
+ __set_current_state(state);
+ spin_unlock_irq(&x->wait.lock);
+ timeout = action(timeout);
+ spin_lock_irq(&x->wait.lock);
+ } while (!x->done && timeout);
+ __remove_wait_queue(&x->wait, &wait);
+ if (!x->done)
+ return timeout;
+ }
+ x->done--;
+ return timeout ?: 1;
+}
+
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ timeout = do_wait_for_common(x, action, timeout, state);
+ spin_unlock_irq(&x->wait.lock);
+ return timeout;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO (which traditionally means blkio only).
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO (which traditionally
+ * means blkio only).
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
+
+/**
+ * try_wait_for_completion - try to decrement a completion without blocking
+ * @x: completion structure
+ *
+ * Return: 0 if a decrement cannot be done without blocking
+ * 1 if a decrement succeeded.
+ *
+ * If a completion is being used as a counting completion,
+ * attempt to decrement the counter without blocking. This
+ * enables us to avoid waiting if the resource the completion
+ * is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ /*
+ * Since x->done will need to be locked only
+ * in the non-blocking case, we check x->done
+ * first without taking the lock so we can
+ * return early in the blocking case.
+ */
+ if (!READ_ONCE(x->done))
+ return 0;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ else
+ x->done--;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ * completion_done - Test to see if a completion has any waiters
+ * @x: completion structure
+ *
+ * Return: 0 if there are waiters (wait_for_completion() in progress)
+ * 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+ if (!READ_ONCE(x->done))
+ return false;
+
+ /*
+ * If ->done, we need to wait for complete() to release ->wait.lock
+ * otherwise we can end up freeing the completion before complete()
+ * is done referencing it.
+ *
+ * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+ * the loads of ->done and ->wait.lock such that we cannot observe
+ * the lock before complete() acquires it while observing the ->done
+ * after it's acquired the lock.
+ */
+ smp_rmb();
+ //spin_unlock_wait(&x->wait.lock);
+ spin_lock(&x->wait.lock);
+ spin_unlock(&x->wait.lock);
+ return true;
+}
+EXPORT_SYMBOL(completion_done);
diff --git a/linux/crypto/algapi.c b/linux/crypto/algapi.c
new file mode 100644
index 0000000..5e8e97b
--- /dev/null
+++ b/linux/crypto/algapi.c
@@ -0,0 +1,315 @@
+/*
+ * Cryptographic API for algorithms (i.e., low-level API).
+ *
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/byteorder.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "internal.h"
+
+static inline int crypto_set_driver_name(struct crypto_alg *alg)
+{
+ static const char suffix[] = "-generic";
+ char *driver_name = alg->cra_driver_name;
+ int len;
+
+ if (*driver_name)
+ return 0;
+
+ len = strlcpy(driver_name, alg->cra_name, CRYPTO_MAX_ALG_NAME);
+ if (len + sizeof(suffix) > CRYPTO_MAX_ALG_NAME)
+ return -ENAMETOOLONG;
+
+ memcpy(driver_name + len, suffix, sizeof(suffix));
+ return 0;
+}
+
+static int crypto_check_alg(struct crypto_alg *alg)
+{
+ if (alg->cra_alignmask & (alg->cra_alignmask + 1))
+ return -EINVAL;
+
+ if (alg->cra_blocksize > PAGE_SIZE / 8)
+ return -EINVAL;
+
+ if (alg->cra_priority < 0)
+ return -EINVAL;
+
+ atomic_set(&alg->cra_refcnt, 1);
+
+ return crypto_set_driver_name(alg);
+}
+
+static int __crypto_register_alg(struct crypto_alg *alg)
+{
+ struct crypto_alg *q;
+ int ret = -EAGAIN;
+
+ INIT_LIST_HEAD(&alg->cra_users);
+
+ ret = -EEXIST;
+
+ list_for_each_entry(q, &crypto_alg_list, cra_list) {
+ if (q == alg)
+ goto err;
+
+ if (!strcmp(q->cra_driver_name, alg->cra_name) ||
+ !strcmp(q->cra_name, alg->cra_driver_name))
+ goto err;
+ }
+
+ list_add(&alg->cra_list, &crypto_alg_list);
+ return 0;
+err:
+ return ret;
+}
+
+void crypto_remove_final(struct list_head *list)
+{
+ struct crypto_alg *alg;
+ struct crypto_alg *n;
+
+ list_for_each_entry_safe(alg, n, list, cra_list) {
+ list_del_init(&alg->cra_list);
+ crypto_alg_put(alg);
+ }
+}
+
+int crypto_register_alg(struct crypto_alg *alg)
+{
+ int err;
+
+ err = crypto_check_alg(alg);
+ if (err)
+ return err;
+
+ down_write(&crypto_alg_sem);
+ err = __crypto_register_alg(alg);
+ up_write(&crypto_alg_sem);
+
+ return err;
+}
+
+static int crypto_remove_alg(struct crypto_alg *alg, struct list_head *list)
+{
+ if (unlikely(list_empty(&alg->cra_list)))
+ return -ENOENT;
+
+ list_del_init(&alg->cra_list);
+ return 0;
+}
+
+int crypto_unregister_alg(struct crypto_alg *alg)
+{
+ int ret;
+ LIST_HEAD(list);
+
+ down_write(&crypto_alg_sem);
+ ret = crypto_remove_alg(alg, &list);
+ up_write(&crypto_alg_sem);
+
+ if (ret)
+ return ret;
+
+ BUG_ON(atomic_read(&alg->cra_refcnt) != 1);
+ if (alg->cra_destroy)
+ alg->cra_destroy(alg);
+
+ crypto_remove_final(&list);
+ return 0;
+}
+
+int crypto_register_algs(struct crypto_alg *algs, int count)
+{
+ int i, ret;
+
+ for (i = 0; i < count; i++) {
+ ret = crypto_register_alg(&algs[i]);
+ if (ret)
+ goto err;
+ }
+
+ return 0;
+
+err:
+ for (--i; i >= 0; --i)
+ crypto_unregister_alg(&algs[i]);
+
+ return ret;
+}
+
+int crypto_unregister_algs(struct crypto_alg *algs, int count)
+{
+ int i, ret;
+
+ for (i = 0; i < count; i++) {
+ ret = crypto_unregister_alg(&algs[i]);
+ if (ret)
+ pr_err("Failed to unregister %s %s: %d\n",
+ algs[i].cra_driver_name, algs[i].cra_name, ret);
+ }
+
+ return 0;
+}
+
+struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb)
+{
+ struct rtattr *rta = tb[0];
+ struct crypto_attr_type *algt;
+
+ if (!rta)
+ return ERR_PTR(-ENOENT);
+ if (RTA_PAYLOAD(rta) < sizeof(*algt))
+ return ERR_PTR(-EINVAL);
+ if (rta->rta_type != CRYPTOA_TYPE)
+ return ERR_PTR(-EINVAL);
+
+ algt = RTA_DATA(rta);
+
+ return algt;
+}
+
+int crypto_check_attr_type(struct rtattr **tb, u32 type)
+{
+ struct crypto_attr_type *algt;
+
+ algt = crypto_get_attr_type(tb);
+ if (IS_ERR(algt))
+ return PTR_ERR(algt);
+
+ if ((algt->type ^ type) & algt->mask)
+ return -EINVAL;
+
+ return 0;
+}
+
+const char *crypto_attr_alg_name(struct rtattr *rta)
+{
+ struct crypto_attr_alg *alga;
+
+ if (!rta)
+ return ERR_PTR(-ENOENT);
+ if (RTA_PAYLOAD(rta) < sizeof(*alga))
+ return ERR_PTR(-EINVAL);
+ if (rta->rta_type != CRYPTOA_ALG)
+ return ERR_PTR(-EINVAL);
+
+ alga = RTA_DATA(rta);
+ alga->name[CRYPTO_MAX_ALG_NAME - 1] = 0;
+
+ return alga->name;
+}
+
+struct crypto_alg *crypto_attr_alg2(struct rtattr *rta,
+ const struct crypto_type *frontend,
+ u32 type, u32 mask)
+{
+ const char *name;
+
+ name = crypto_attr_alg_name(rta);
+ if (IS_ERR(name))
+ return ERR_CAST(name);
+
+ return crypto_find_alg(name, frontend, type, mask);
+}
+
+int crypto_attr_u32(struct rtattr *rta, u32 *num)
+{
+ struct crypto_attr_u32 *nu32;
+
+ if (!rta)
+ return -ENOENT;
+ if (RTA_PAYLOAD(rta) < sizeof(*nu32))
+ return -EINVAL;
+ if (rta->rta_type != CRYPTOA_U32)
+ return -EINVAL;
+
+ nu32 = RTA_DATA(rta);
+ *num = nu32->num;
+
+ return 0;
+}
+
+static inline void crypto_inc_byte(u8 *a, unsigned int size)
+{
+ u8 *b = (a + size);
+ u8 c;
+
+ for (; size; size--) {
+ c = *--b + 1;
+ *b = c;
+ if (c)
+ break;
+ }
+}
+
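+/*
+ * Increment a big-endian counter of @size bytes: work in 32-bit words from
+ * the end of the buffer, stopping as soon as there is no carry, and fall
+ * back to byte-at-a-time for any remaining prefix shorter than a word.
+ */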
+void crypto_inc(u8 *a, unsigned int size)
+{
+ __be32 *b = (__be32 *)(a + size);
+ u32 c;
+
+ for (; size >= 4; size -= 4) {
+ c = be32_to_cpu(*--b) + 1;
+ *b = cpu_to_be32(c);
+ if (c)
+ return;
+ }
+
+ crypto_inc_byte(a, size);
+}
+
+static inline void crypto_xor_byte(u8 *a, const u8 *b, unsigned int size)
+{
+ for (; size; size--)
+ *a++ ^= *b++;
+}
+
+void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
+{
+ u32 *a = (u32 *)dst;
+ u32 *b = (u32 *)src;
+
+ for (; size >= 4; size -= 4)
+ *a++ ^= *b++;
+
+ crypto_xor_byte((u8 *)a, (u8 *)b, size);
+}
+
+unsigned int crypto_alg_extsize(struct crypto_alg *alg)
+{
+ return alg->cra_ctxsize +
+ (alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1));
+}
+
+int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
+ u32 type, u32 mask)
+{
+ int ret = 0;
+ struct crypto_alg *alg = crypto_find_alg(name, frontend, type, mask);
+
+ if (!IS_ERR(alg)) {
+ crypto_alg_put(alg);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Cryptographic algorithms API");
diff --git a/linux/crypto/api.c b/linux/crypto/api.c
new file mode 100644
index 0000000..513a48a
--- /dev/null
+++ b/linux/crypto/api.c
@@ -0,0 +1,326 @@
+/*
+ * Scatterlist Cryptographic API.
+ *
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
+ * and Nettle, by Niels Möller.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/param.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "internal.h"
+
+LIST_HEAD(crypto_alg_list);
+DECLARE_RWSEM(crypto_alg_sem);
+
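+/*
+ * Look up an algorithm by name: an exact cra_driver_name match wins
+ * immediately; otherwise the highest-priority algorithm whose cra_name
+ * matches is used. A reference is taken on the returned algorithm.
+ */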
+static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type,
+ u32 mask)
+{
+ struct crypto_alg *q, *alg = NULL;
+ int best = -2;
+
+ list_for_each_entry(q, &crypto_alg_list, cra_list) {
+ int exact, fuzzy;
+
+ if ((q->cra_flags ^ type) & mask)
+ continue;
+
+ exact = !strcmp(q->cra_driver_name, name);
+ fuzzy = !strcmp(q->cra_name, name);
+ if (!exact && !(fuzzy && q->cra_priority > best))
+ continue;
+
+ if (unlikely(!crypto_alg_get(q)))
+ continue;
+
+ best = q->cra_priority;
+ if (alg)
+ crypto_alg_put(alg);
+ alg = q;
+
+ if (exact)
+ break;
+ }
+
+ return alg;
+}
+
+struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask)
+{
+ struct crypto_alg *alg;
+
+ /*
+	 * If the internal flag is set for a cipher, require the caller to
+	 * invoke the cipher with the internal flag to use that cipher.
+ * Also, if a caller wants to allocate a cipher that may or may
+ * not be an internal cipher, use type | CRYPTO_ALG_INTERNAL and
+ * !(mask & CRYPTO_ALG_INTERNAL).
+ */
+ if (!((type | mask) & CRYPTO_ALG_INTERNAL))
+ mask |= CRYPTO_ALG_INTERNAL;
+
+ down_read(&crypto_alg_sem);
+ alg = __crypto_alg_lookup(name, type, mask);
+ up_read(&crypto_alg_sem);
+
+ return alg ?: ERR_PTR(-ENOENT);
+}
+
+static int crypto_init_ops(struct crypto_tfm *tfm, u32 type, u32 mask)
+{
+ const struct crypto_type *type_obj = tfm->__crt_alg->cra_type;
+
+ if (type_obj)
+ return type_obj->init(tfm, type, mask);
+
+ switch (crypto_tfm_alg_type(tfm)) {
+ case CRYPTO_ALG_TYPE_CIPHER:
+ return crypto_init_cipher_ops(tfm);
+ default:
+ break;
+ }
+
+ BUG();
+ return -EINVAL;
+}
+
+static void crypto_exit_ops(struct crypto_tfm *tfm)
+{
+ const struct crypto_type *type = tfm->__crt_alg->cra_type;
+
+ if (type) {
+ if (tfm->exit)
+ tfm->exit(tfm);
+ return;
+ }
+
+ switch (crypto_tfm_alg_type(tfm)) {
+ case CRYPTO_ALG_TYPE_CIPHER:
+ crypto_exit_cipher_ops(tfm);
+ break;
+
+ default:
+ BUG();
+ }
+}
+
+static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask)
+{
+ const struct crypto_type *type_obj = alg->cra_type;
+ unsigned int len;
+
+ len = alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1);
+ if (type_obj)
+ return len + type_obj->ctxsize(alg, type, mask);
+
+ switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) {
+ default:
+ BUG();
+
+ case CRYPTO_ALG_TYPE_CIPHER:
+ len += crypto_cipher_ctxsize(alg);
+ break;
+ }
+
+ return len;
+}
+
+struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
+ u32 mask)
+{
+ struct crypto_tfm *tfm = NULL;
+ unsigned int tfm_size;
+ int err = -ENOMEM;
+
+ tfm_size = sizeof(*tfm) + crypto_ctxsize(alg, type, mask);
+ tfm = kzalloc(tfm_size, GFP_KERNEL);
+ if (tfm == NULL)
+ goto out_err;
+
+ tfm->__crt_alg = alg;
+
+ err = crypto_init_ops(tfm, type, mask);
+ if (err)
+ goto out_free_tfm;
+
+ if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm)))
+ goto cra_init_failed;
+
+ goto out;
+
+cra_init_failed:
+ crypto_exit_ops(tfm);
+out_free_tfm:
+ kfree(tfm);
+out_err:
+ tfm = ERR_PTR(err);
+out:
+ return tfm;
+}
+
+/**
+ * crypto_alloc_base - Locate algorithm and allocate transform
+ * @alg_name: Name of algorithm
+ * @type: Type of algorithm
+ * @mask: Mask for type comparison
+ *
+ * This function should not be used by new algorithm types.
+ * Please use crypto_alloc_tfm instead.
+ *
+ * crypto_alloc_base() looks up an already-registered algorithm by name.
+ * Unlike the kernel implementation, this userspace shim does not try to
+ * load modules or ask a crypto manager to construct an algorithm on the
+ * fly. A refcount is grabbed on the algorithm, which is then associated
+ * with the new transform.
+ *
+ * The returned transform is of a non-determinate type. Most people
+ * should use one of the more specific allocation functions such as
+ * crypto_alloc_blkcipher.
+ *
+ * In case of error the return value is an error pointer.
+ */
+struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask)
+{
+ struct crypto_alg *alg;
+ struct crypto_tfm *tfm;
+
+ alg = crypto_alg_mod_lookup(alg_name, type, mask);
+ if (IS_ERR(alg)) {
+ fprintf(stderr, "unknown cipher %s\n", alg_name);
+ return ERR_CAST(alg);
+ }
+
+ tfm = __crypto_alloc_tfm(alg, type, mask);
+ if (IS_ERR(tfm)) {
+ crypto_alg_put(alg);
+ return tfm;
+ }
+
+ return tfm;
+}
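+
+/*
+ * Usage sketch (illustrative only; crypto_free_tfm() is assumed to be
+ * provided by the crypto.h shim as in the kernel):
+ *
+ *	struct crypto_tfm *tfm = crypto_alloc_base("sha1", 0, 0);
+ *
+ *	if (IS_ERR(tfm))
+ *		return PTR_ERR(tfm);
+ *	...
+ *	crypto_free_tfm(tfm);
+ *
+ * The reference taken on the algorithm by the lookup is dropped again when
+ * the transform is freed.
+ */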
+
+void *crypto_create_tfm(struct crypto_alg *alg,
+ const struct crypto_type *frontend)
+{
+ char *mem;
+ struct crypto_tfm *tfm = NULL;
+ unsigned int tfmsize;
+ unsigned int total;
+ int err = -ENOMEM;
+
+ tfmsize = frontend->tfmsize;
+ total = tfmsize + sizeof(*tfm) + frontend->extsize(alg);
+
+ mem = kzalloc(total, GFP_KERNEL);
+ if (mem == NULL)
+ goto out_err;
+
+ tfm = (struct crypto_tfm *)(mem + tfmsize);
+ tfm->__crt_alg = alg;
+
+ err = frontend->init_tfm(tfm);
+ if (err)
+ goto out_free_tfm;
+
+ if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm)))
+ goto cra_init_failed;
+
+ goto out;
+
+cra_init_failed:
+ crypto_exit_ops(tfm);
+out_free_tfm:
+ kfree(mem);
+out_err:
+ mem = ERR_PTR(err);
+out:
+ return mem;
+}
+
+struct crypto_alg *crypto_find_alg(const char *alg_name,
+ const struct crypto_type *frontend,
+ u32 type, u32 mask)
+{
+ struct crypto_alg *(*lookup)(const char *name, u32 type, u32 mask) =
+ crypto_alg_mod_lookup;
+
+ if (frontend) {
+ type &= frontend->maskclear;
+ mask &= frontend->maskclear;
+ type |= frontend->type;
+ mask |= frontend->maskset;
+
+ if (frontend->lookup)
+ lookup = frontend->lookup;
+ }
+
+ return lookup(alg_name, type, mask);
+}
+
+void *crypto_alloc_tfm(const char *alg_name,
+ const struct crypto_type *frontend, u32 type, u32 mask)
+{
+ struct crypto_alg *alg;
+ void *tfm;
+
+ alg = crypto_find_alg(alg_name, frontend, type, mask);
+ if (IS_ERR(alg))
+ return ERR_CAST(alg);
+
+ tfm = crypto_create_tfm(alg, frontend);
+ if (IS_ERR(tfm)) {
+ crypto_alg_put(alg);
+ return tfm;
+ }
+
+ return tfm;
+}
+
+void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm)
+{
+ struct crypto_alg *alg;
+
+ if (unlikely(!mem))
+ return;
+
+ alg = tfm->__crt_alg;
+
+ if (!tfm->exit && alg->cra_exit)
+ alg->cra_exit(tfm);
+ crypto_exit_ops(tfm);
+ crypto_alg_put(alg);
+ kzfree(mem);
+}
+
+int crypto_has_alg(const char *name, u32 type, u32 mask)
+{
+ int ret = 0;
+ struct crypto_alg *alg = crypto_alg_mod_lookup(name, type, mask);
+
+ if (!IS_ERR(alg)) {
+ crypto_alg_put(alg);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+MODULE_DESCRIPTION("Cryptographic core API");
+MODULE_LICENSE("GPL");
diff --git a/linux/crypto/cipher.c b/linux/crypto/cipher.c
new file mode 100644
index 0000000..6f47ac6
--- /dev/null
+++ b/linux/crypto/cipher.c
@@ -0,0 +1,123 @@
+/*
+ * Cryptographic API.
+ *
+ * Cipher operations.
+ *
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "internal.h"
+
+static int setkey_unaligned(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct cipher_alg *cia = &tfm->__crt_alg->cra_cipher;
+ unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
+ int ret;
+ u8 *buffer, *alignbuffer;
+ unsigned long absize;
+
+ absize = keylen + alignmask;
+ buffer = kmalloc(absize, GFP_ATOMIC);
+ if (!buffer)
+ return -ENOMEM;
+
+ alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
+ memcpy(alignbuffer, key, keylen);
+ ret = cia->cia_setkey(tfm, alignbuffer, keylen);
+ memset(alignbuffer, 0, keylen);
+ kfree(buffer);
+ return ret;
+}
+
+static int setkey_default(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct cipher_alg *cia = &tfm->__crt_alg->cra_cipher;
+ unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
+
+ tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+ if (keylen < cia->cia_min_keysize || keylen > cia->cia_max_keysize) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ if ((unsigned long)key & alignmask)
+ return setkey_unaligned(tfm, key, keylen);
+
+ return cia->cia_setkey(tfm, key, keylen);
+}
+
+static void cipher_crypt_unaligned(void (*fn)(struct crypto_tfm *, u8 *,
+ const u8 *),
+ struct crypto_tfm *tfm,
+ u8 *dst, const u8 *src)
+{
+ unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
+ unsigned int size = crypto_tfm_alg_blocksize(tfm);
+ u8 buffer[size + alignmask];
+ u8 *tmp = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
+
+ memcpy(tmp, src, size);
+ fn(tfm, tmp, tmp);
+ memcpy(dst, tmp, size);
+}
+
+static void cipher_encrypt_unaligned(struct crypto_tfm *tfm,
+ u8 *dst, const u8 *src)
+{
+ unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
+ struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
+
+ if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) {
+ cipher_crypt_unaligned(cipher->cia_encrypt, tfm, dst, src);
+ return;
+ }
+
+ cipher->cia_encrypt(tfm, dst, src);
+}
+
+static void cipher_decrypt_unaligned(struct crypto_tfm *tfm,
+ u8 *dst, const u8 *src)
+{
+ unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
+ struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
+
+ if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) {
+ cipher_crypt_unaligned(cipher->cia_decrypt, tfm, dst, src);
+ return;
+ }
+
+ cipher->cia_decrypt(tfm, dst, src);
+}
+
+int crypto_init_cipher_ops(struct crypto_tfm *tfm)
+{
+ struct cipher_tfm *ops = &tfm->crt_cipher;
+ struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
+
+ ops->cit_setkey = setkey_default;
+ ops->cit_encrypt_one = crypto_tfm_alg_alignmask(tfm) ?
+ cipher_encrypt_unaligned : cipher->cia_encrypt;
+ ops->cit_decrypt_one = crypto_tfm_alg_alignmask(tfm) ?
+ cipher_decrypt_unaligned : cipher->cia_decrypt;
+
+ return 0;
+}
+
+void crypto_exit_cipher_ops(struct crypto_tfm *tfm)
+{
+}
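+
+/*
+ * Usage sketch (illustrative; assumes the crypto.h shim provides the usual
+ * crypto_alloc_cipher()/crypto_cipher_setkey()/crypto_cipher_encrypt_one()
+ * wrappers around these ops, and that a single-block cipher by the given
+ * name is registered):
+ *
+ *	struct crypto_cipher *tfm = crypto_alloc_cipher("aes", 0, 0);
+ *
+ *	if (IS_ERR(tfm))
+ *		return PTR_ERR(tfm);
+ *	crypto_cipher_setkey(tfm, key, keylen);
+ *	crypto_cipher_encrypt_one(tfm, dst, src);
+ *	crypto_free_cipher(tfm);
+ *
+ * crypto_cipher_encrypt_one() processes exactly one cipher block per call.
+ */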
diff --git a/linux/crypto/internal.h b/linux/crypto/internal.h
new file mode 100644
index 0000000..b00dcea
--- /dev/null
+++ b/linux/crypto/internal.h
@@ -0,0 +1,78 @@
+/*
+ * Cryptographic API.
+ *
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#ifndef _CRYPTO_INTERNAL_H
+#define _CRYPTO_INTERNAL_H
+
+#include <crypto/algapi.h>
+#include <linux/completion.h>
+#include <linux/mm.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/notifier.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+
+struct crypto_instance;
+struct crypto_template;
+
+struct crypto_larval {
+ struct crypto_alg alg;
+ struct crypto_alg *adult;
+ struct completion completion;
+ u32 mask;
+};
+
+extern struct list_head crypto_alg_list;
+extern struct rw_semaphore crypto_alg_sem;
+
+static inline unsigned int crypto_cipher_ctxsize(struct crypto_alg *alg)
+{
+ return alg->cra_ctxsize;
+}
+
+int crypto_init_cipher_ops(struct crypto_tfm *tfm);
+void crypto_exit_cipher_ops(struct crypto_tfm *tfm);
+
+void crypto_remove_final(struct list_head *list);
+struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
+ u32 mask);
+void *crypto_create_tfm(struct crypto_alg *alg,
+ const struct crypto_type *frontend);
+struct crypto_alg *crypto_find_alg(const char *alg_name,
+ const struct crypto_type *frontend,
+ u32 type, u32 mask);
+void *crypto_alloc_tfm(const char *alg_name,
+ const struct crypto_type *frontend, u32 type, u32 mask);
+
+int crypto_register_notifier(struct notifier_block *nb);
+int crypto_unregister_notifier(struct notifier_block *nb);
+
+unsigned int crypto_alg_extsize(struct crypto_alg *alg);
+
+int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
+ u32 type, u32 mask);
+
+static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg)
+{
+ atomic_inc(&alg->cra_refcnt);
+ return alg;
+}
+
+static inline void crypto_alg_put(struct crypto_alg *alg)
+{
+ if (atomic_dec_and_test(&alg->cra_refcnt) && alg->cra_destroy)
+ alg->cra_destroy(alg);
+}
+
+#endif /* _CRYPTO_INTERNAL_H */
+
diff --git a/linux/crypto/sha1_generic.c b/linux/crypto/sha1_generic.c
new file mode 100644
index 0000000..b0b9cd1
--- /dev/null
+++ b/linux/crypto/sha1_generic.c
@@ -0,0 +1,92 @@
+/*
+ * Cryptographic API.
+ *
+ * SHA1 Secure Hash Algorithm.
+ *
+ * Derived from cryptoapi implementation, adapted for in-place
+ * scatterlist interface.
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <crypto/internal/hash.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <crypto/sha1_base.h>
+#include <asm/byteorder.h>
+
+const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = {
+ 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d,
+ 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90,
+ 0xaf, 0xd8, 0x07, 0x09
+};
+
+static void sha1_generic_block_fn(struct sha1_state *sst, u8 const *src,
+ int blocks)
+{
+ u32 temp[SHA_WORKSPACE_WORDS];
+
+ while (blocks--) {
+ sha_transform(sst->state, src, temp);
+ src += SHA1_BLOCK_SIZE;
+ }
+ memzero_explicit(temp, sizeof(temp));
+}
+
+int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_base_do_update(desc, data, len, sha1_generic_block_fn);
+}
+
+static int sha1_final(struct shash_desc *desc, u8 *out)
+{
+ sha1_base_do_finalize(desc, sha1_generic_block_fn);
+ return sha1_base_finish(desc, out);
+}
+
+int crypto_sha1_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ sha1_base_do_update(desc, data, len, sha1_generic_block_fn);
+ return sha1_final(desc, out);
+}
+
+static struct shash_alg alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = crypto_sha1_update,
+ .final = sha1_final,
+ .finup = crypto_sha1_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name= "sha1-generic",
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init sha1_generic_mod_init(void)
+{
+ return crypto_register_shash(&alg);
+}
+
+static void __exit sha1_generic_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(sha1_generic_mod_init);
+module_exit(sha1_generic_mod_fini);
diff --git a/linux/crypto/shash.c b/linux/crypto/shash.c
new file mode 100644
index 0000000..406ddfe
--- /dev/null
+++ b/linux/crypto/shash.c
@@ -0,0 +1,294 @@
+/*
+ * Synchronous Cryptographic Hash operations.
+ *
+ * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+static int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return -ENOSYS;
+}
+
+static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct shash_alg *shash = crypto_shash_alg(tfm);
+ unsigned long alignmask = crypto_shash_alignmask(tfm);
+ unsigned long absize;
+ u8 *buffer, *alignbuffer;
+ int err;
+
+ absize = keylen + (alignmask & ~(crypto_tfm_ctx_alignment() - 1));
+ buffer = kmalloc(absize, GFP_KERNEL);
+ if (!buffer)
+ return -ENOMEM;
+
+ alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
+ memcpy(alignbuffer, key, keylen);
+ err = shash->setkey(tfm, alignbuffer, keylen);
+ kzfree(buffer);
+ return err;
+}
+
+int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct shash_alg *shash = crypto_shash_alg(tfm);
+ unsigned long alignmask = crypto_shash_alignmask(tfm);
+
+ if ((unsigned long)key & alignmask)
+ return shash_setkey_unaligned(tfm, key, keylen);
+
+ return shash->setkey(tfm, key, keylen);
+}
+
+static inline unsigned int shash_align_buffer_size(unsigned len,
+ unsigned long mask)
+{
+ typedef u8 __attribute__ ((aligned)) u8_aligned;
+ return len + (mask & ~(__alignof__(u8_aligned) - 1));
+}
+
+static int shash_update_unaligned(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct crypto_shash *tfm = desc->tfm;
+ struct shash_alg *shash = crypto_shash_alg(tfm);
+ unsigned long alignmask = crypto_shash_alignmask(tfm);
+ unsigned int unaligned_len = alignmask + 1 -
+ ((unsigned long)data & alignmask);
+ u8 ubuf[shash_align_buffer_size(unaligned_len, alignmask)]
+ __attribute__ ((aligned));
+ u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
+ int err;
+
+ if (unaligned_len > len)
+ unaligned_len = len;
+
+ memcpy(buf, data, unaligned_len);
+ err = shash->update(desc, buf, unaligned_len);
+ memset(buf, 0, unaligned_len);
+
+ return err ?:
+ shash->update(desc, data + unaligned_len, len - unaligned_len);
+}
+
+int crypto_shash_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct crypto_shash *tfm = desc->tfm;
+ struct shash_alg *shash = crypto_shash_alg(tfm);
+ unsigned long alignmask = crypto_shash_alignmask(tfm);
+
+ if ((unsigned long)data & alignmask)
+ return shash_update_unaligned(desc, data, len);
+
+ return shash->update(desc, data, len);
+}
+
+static int shash_final_unaligned(struct shash_desc *desc, u8 *out)
+{
+ struct crypto_shash *tfm = desc->tfm;
+ unsigned long alignmask = crypto_shash_alignmask(tfm);
+ struct shash_alg *shash = crypto_shash_alg(tfm);
+ unsigned int ds = crypto_shash_digestsize(tfm);
+ u8 ubuf[shash_align_buffer_size(ds, alignmask)]
+ __attribute__ ((aligned));
+ u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
+ int err;
+
+ err = shash->final(desc, buf);
+ if (err)
+ goto out;
+
+ memcpy(out, buf, ds);
+
+out:
+ memset(buf, 0, ds);
+ return err;
+}
+
+int crypto_shash_final(struct shash_desc *desc, u8 *out)
+{
+ struct crypto_shash *tfm = desc->tfm;
+ struct shash_alg *shash = crypto_shash_alg(tfm);
+ unsigned long alignmask = crypto_shash_alignmask(tfm);
+
+ if ((unsigned long)out & alignmask)
+ return shash_final_unaligned(desc, out);
+
+ return shash->final(desc, out);
+}
+
+static int shash_finup_unaligned(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return crypto_shash_update(desc, data, len) ?:
+ crypto_shash_final(desc, out);
+}
+
+int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ struct crypto_shash *tfm = desc->tfm;
+ struct shash_alg *shash = crypto_shash_alg(tfm);
+ unsigned long alignmask = crypto_shash_alignmask(tfm);
+
+ if (((unsigned long)data | (unsigned long)out) & alignmask)
+ return shash_finup_unaligned(desc, data, len, out);
+
+ return shash->finup(desc, data, len, out);
+}
+
+static int shash_digest_unaligned(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return crypto_shash_init(desc) ?:
+ crypto_shash_finup(desc, data, len, out);
+}
+
+int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ struct crypto_shash *tfm = desc->tfm;
+ struct shash_alg *shash = crypto_shash_alg(tfm);
+ unsigned long alignmask = crypto_shash_alignmask(tfm);
+
+ if (((unsigned long)data | (unsigned long)out) & alignmask)
+ return shash_digest_unaligned(desc, data, len, out);
+
+ return shash->digest(desc, data, len, out);
+}
+
+static int shash_default_export(struct shash_desc *desc, void *out)
+{
+ memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(desc->tfm));
+ return 0;
+}
+
+static int shash_default_import(struct shash_desc *desc, const void *in)
+{
+ memcpy(shash_desc_ctx(desc), in, crypto_shash_descsize(desc->tfm));
+ return 0;
+}
+
+static int crypto_shash_init_tfm(struct crypto_tfm *tfm)
+{
+ struct crypto_shash *hash = __crypto_shash_cast(tfm);
+
+ hash->descsize = crypto_shash_alg(hash)->descsize;
+ return 0;
+}
+
+static const struct crypto_type crypto_shash_type = {
+ .extsize = crypto_alg_extsize,
+ .init_tfm = crypto_shash_init_tfm,
+ .maskclear = ~CRYPTO_ALG_TYPE_MASK,
+ .maskset = CRYPTO_ALG_TYPE_MASK,
+ .type = CRYPTO_ALG_TYPE_SHASH,
+ .tfmsize = offsetof(struct crypto_shash, base),
+};
+
+struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
+ u32 mask)
+{
+ return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask);
+}
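+
+/*
+ * Usage sketch (illustrative; crypto_shash_descsize() and
+ * crypto_free_shash() are assumed to come from the hash.h shim as in the
+ * kernel, and data/len are the caller's buffer):
+ *
+ *	struct crypto_shash *tfm = crypto_alloc_shash("sha1", 0, 0);
+ *	char buf[sizeof(struct shash_desc) + crypto_shash_descsize(tfm)];
+ *	struct shash_desc *desc = (struct shash_desc *) buf;
+ *	u8 digest[SHA1_DIGEST_SIZE];
+ *
+ *	desc->tfm = tfm;
+ *	crypto_shash_digest(desc, data, len, digest);
+ *	crypto_free_shash(tfm);
+ */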
+
+static int shash_prepare_alg(struct shash_alg *alg)
+{
+ struct crypto_alg *base = &alg->base;
+
+ if (alg->digestsize > PAGE_SIZE / 8 ||
+ alg->descsize > PAGE_SIZE / 8 ||
+ alg->statesize > PAGE_SIZE / 8)
+ return -EINVAL;
+
+ base->cra_type = &crypto_shash_type;
+ base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
+ base->cra_flags |= CRYPTO_ALG_TYPE_SHASH;
+
+ if (!alg->finup)
+ alg->finup = shash_finup_unaligned;
+ if (!alg->digest)
+ alg->digest = shash_digest_unaligned;
+ if (!alg->export) {
+ alg->export = shash_default_export;
+ alg->import = shash_default_import;
+ alg->statesize = alg->descsize;
+ }
+ if (!alg->setkey)
+ alg->setkey = shash_no_setkey;
+
+ return 0;
+}
+
+int crypto_register_shash(struct shash_alg *alg)
+{
+ struct crypto_alg *base = &alg->base;
+ int err;
+
+ err = shash_prepare_alg(alg);
+ if (err)
+ return err;
+
+ return crypto_register_alg(base);
+}
+
+int crypto_unregister_shash(struct shash_alg *alg)
+{
+ return crypto_unregister_alg(&alg->base);
+}
+
+int crypto_register_shashes(struct shash_alg *algs, int count)
+{
+ int i, ret;
+
+ for (i = 0; i < count; i++) {
+ ret = crypto_register_shash(&algs[i]);
+ if (ret)
+ goto err;
+ }
+
+ return 0;
+
+err:
+ for (--i; i >= 0; --i)
+ crypto_unregister_shash(&algs[i]);
+
+ return ret;
+}
+
+int crypto_unregister_shashes(struct shash_alg *algs, int count)
+{
+ int i, ret;
+
+ for (i = count - 1; i >= 0; --i) {
+ ret = crypto_unregister_shash(&algs[i]);
+ if (ret)
+ pr_err("Failed to unregister %s %s: %d\n",
+ algs[i].base.cra_driver_name,
+ algs[i].base.cra_name, ret);
+ }
+
+ return 0;
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Synchronous cryptographic hash type");
diff --git a/linux/fs.c b/linux/fs.c
new file mode 100644
index 0000000..0002846
--- /dev/null
+++ b/linux/fs.c
@@ -0,0 +1,14 @@
+#include <linux/fs.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+const struct xattr_handler posix_acl_access_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+};
+
+const struct xattr_handler posix_acl_default_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT,
+};
diff --git a/linux/generic-radix-tree.c b/linux/generic-radix-tree.c
new file mode 100644
index 0000000..5c4a275
--- /dev/null
+++ b/linux/generic-radix-tree.c
@@ -0,0 +1,167 @@
+
+#include <linux/export.h>
+#include <linux/generic-radix-tree.h>
+#include <linux/gfp.h>
+
+#define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *))
+#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
+
+struct genradix_node {
+ union {
+ /* Interior node: */
+ struct genradix_node *children[GENRADIX_ARY];
+
+ /* Leaf: */
+ u8 data[PAGE_SIZE];
+ };
+};
+
+static inline unsigned genradix_depth_shift(unsigned depth)
+{
+ return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
+}
+
+/*
+ * Returns size (of data, in bytes) that a tree of a given depth holds:
+ */
+static inline size_t genradix_depth_size(unsigned depth)
+{
+ return 1UL << genradix_depth_shift(depth);
+}
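+
+/*
+ * Worked example (assuming 4 KiB pages and 8-byte pointers, so
+ * GENRADIX_ARY = 512 and GENRADIX_ARY_SHIFT = 9):
+ *
+ *	depth 0: 1 << 12	   = 4 KiB (a single leaf page)
+ *	depth 1: 1 << (12 + 9)	   = 2 MiB
+ *	depth 2: 1 << (12 + 9 * 2) = 1 GiB
+ *
+ * i.e. each additional level multiplies the addressable size by
+ * GENRADIX_ARY.
+ */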
+
+/*
+ * Returns pointer to the specified byte @offset within @radix, or NULL if not
+ * allocated
+ */
+void *__genradix_ptr(struct __genradix *radix, size_t offset)
+{
+ size_t level = radix->depth;
+ struct genradix_node *n = radix->root;
+
+ if (offset >= genradix_depth_size(radix->depth))
+ return NULL;
+
+ while (1) {
+ if (!n)
+ return NULL;
+ if (!level)
+ break;
+
+ level--;
+
+ n = n->children[offset >> genradix_depth_shift(level)];
+ offset &= genradix_depth_size(level) - 1;
+ }
+
+ return &n->data[offset];
+}
+EXPORT_SYMBOL(__genradix_ptr);
+
+/*
+ * Returns pointer to the specified byte @offset within @radix, allocating it if
+ * necessary - newly allocated slots are always zeroed out:
+ */
+void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
+ gfp_t gfp_mask)
+{
+ struct genradix_node **n;
+ size_t level;
+
+ /* Increase tree depth if necessary: */
+
+ while (offset >= genradix_depth_size(radix->depth)) {
+ struct genradix_node *new_root =
+ (void *) __get_free_page(gfp_mask|__GFP_ZERO);
+
+ if (!new_root)
+ return NULL;
+
+ new_root->children[0] = radix->root;
+ radix->root = new_root;
+ radix->depth++;
+ }
+
+ n = &radix->root;
+ level = radix->depth;
+
+ while (1) {
+ if (!*n) {
+ *n = (void *) __get_free_page(gfp_mask|__GFP_ZERO);
+ if (!*n)
+ return NULL;
+ }
+
+ if (!level)
+ break;
+
+ level--;
+
+ n = &(*n)->children[offset >> genradix_depth_shift(level)];
+ offset &= genradix_depth_size(level) - 1;
+ }
+
+ return &(*n)->data[offset];
+}
+EXPORT_SYMBOL(__genradix_ptr_alloc);
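+
+/*
+ * Usage sketch (illustrative; callers normally go through the typed wrappers
+ * in generic-radix-tree.h rather than the __genradix_* functions, and
+ * struct foo is hypothetical):
+ *
+ *	struct __genradix r = {};
+ *	struct foo *p;
+ *
+ *	p = __genradix_ptr_alloc(&r, 100 * sizeof(*p), GFP_KERNEL);
+ *	if (!p)
+ *		return -ENOMEM;
+ *	...
+ *	__genradix_free(&r);
+ *
+ * The offset is in bytes, so entry 100 of an array of struct foo lives at
+ * byte 100 * sizeof(struct foo); the tree grows as deep as needed to cover
+ * that offset.
+ */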
+
+void *__genradix_iter_peek(struct genradix_iter *iter,
+ struct __genradix *radix,
+ size_t objs_per_page)
+{
+ struct genradix_node *n;
+ size_t level, i;
+
+ if (!radix->root)
+ return NULL;
+restart:
+ if (iter->offset >= genradix_depth_size(radix->depth))
+ return NULL;
+
+ n = radix->root;
+ level = radix->depth;
+
+ while (level) {
+ level--;
+
+ i = (iter->offset >> genradix_depth_shift(level)) &
+ (GENRADIX_ARY - 1);
+
+ while (!n->children[i]) {
+ i++;
+ iter->offset = round_down(iter->offset +
+ genradix_depth_size(level),
+ genradix_depth_size(level));
+ iter->pos = (iter->offset >> PAGE_SHIFT) *
+ objs_per_page;
+ if (i == GENRADIX_ARY)
+ goto restart;
+ }
+
+ n = n->children[i];
+ }
+
+ return &n->data[iter->offset & (PAGE_SIZE - 1)];
+}
+EXPORT_SYMBOL(__genradix_iter_peek);
+
+static void genradix_free_recurse(struct genradix_node *n, unsigned level)
+{
+ if (level) {
+ unsigned i;
+
+ for (i = 0; i < GENRADIX_ARY; i++)
+ if (n->children[i])
+ genradix_free_recurse(n->children[i], level - 1);
+ }
+
+ free_page((unsigned long) n);
+}
+
+void __genradix_free(struct __genradix *radix)
+{
+ genradix_free_recurse(radix->root, radix->depth);
+
+ radix->root = NULL;
+ radix->depth = 0;
+}
+EXPORT_SYMBOL(__genradix_free);
diff --git a/linux/kstrtox.c b/linux/kstrtox.c
new file mode 100644
index 0000000..af6b222
--- /dev/null
+++ b/linux/kstrtox.c
@@ -0,0 +1,368 @@
+/*
+ * Convert integer string representation to an integer.
+ * If an integer doesn't fit into specified type, -E is returned.
+ *
+ * Integer starts with optional sign.
+ * kstrtou*() functions do not accept sign "-".
+ *
+ * Radix 0 means autodetection: leading "0x" implies radix 16,
+ * leading "0" implies radix 8, otherwise radix is 10.
+ * Autodetection hints work after optional sign, but not before.
+ *
+ * If -E is returned, result is not touched.
+ */
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/math64.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include "kstrtox.h"
+
+const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
+{
+ if (*base == 0) {
+ if (s[0] == '0') {
+ if (_tolower(s[1]) == 'x' && isxdigit(s[2]))
+ *base = 16;
+ else
+ *base = 8;
+ } else
+ *base = 10;
+ }
+ if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x')
+ s += 2;
+ return s;
+}
+
+/*
+ * Convert non-negative integer string representation in explicitly given radix
+ * to an integer.
+ * Returns the number of characters consumed, possibly OR-ed with the
+ * KSTRTOX_OVERFLOW bit. If overflow occurs, the (incorrect) result is still
+ * written to *p.
+ *
+ * Don't you dare use this function.
+ */
+unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *p)
+{
+ unsigned long long res;
+ unsigned int rv;
+ int overflow;
+
+ res = 0;
+ rv = 0;
+ overflow = 0;
+ while (*s) {
+ unsigned int val;
+
+ if ('0' <= *s && *s <= '9')
+ val = *s - '0';
+ else if ('a' <= _tolower(*s) && _tolower(*s) <= 'f')
+ val = _tolower(*s) - 'a' + 10;
+ else
+ break;
+
+ if (val >= base)
+ break;
+ /*
+ * Check for overflow only if we are within range of
+ * it in the max base we support (16)
+ */
+ if (unlikely(res & (~0ull << 60))) {
+ if (res > div_u64(ULLONG_MAX - val, base))
+ overflow = 1;
+ }
+ res = res * base + val;
+ rv++;
+ s++;
+ }
+ *p = res;
+ if (overflow)
+ rv |= KSTRTOX_OVERFLOW;
+ return rv;
+}
+
+static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res)
+{
+ unsigned long long _res;
+ unsigned int rv;
+
+ s = _parse_integer_fixup_radix(s, &base);
+ rv = _parse_integer(s, base, &_res);
+ if (rv & KSTRTOX_OVERFLOW)
+ return -ERANGE;
+ if (rv == 0)
+ return -EINVAL;
+ s += rv;
+ if (*s == '\n')
+ s++;
+ if (*s)
+ return -EINVAL;
+ *res = _res;
+ return 0;
+}
+
+/**
+ * kstrtoull - convert a string to an unsigned long long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ * include a single newline before its terminating null. The first character
+ * may also be a plus sign, but not a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ * given as 0, then the base of the string is automatically detected with the
+ * conventional semantics - If it begins with 0x the number will be parsed as a
+ * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ * parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Used as a replacement for the obsolete simple_strtoull. Return code must
+ * be checked.
+ */
+int kstrtoull(const char *s, unsigned int base, unsigned long long *res)
+{
+ if (s[0] == '+')
+ s++;
+ return _kstrtoull(s, base, res);
+}
+EXPORT_SYMBOL(kstrtoull);
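+
+/*
+ * Examples (illustrative): with base 0 the radix is auto-detected, so
+ *
+ *	kstrtoull("0x2a", 0, &v)  sets v = 42 and returns 0
+ *	kstrtoull("052", 0, &v)   sets v = 42 and returns 0
+ *	kstrtoull("42\n", 10, &v) sets v = 42 (one trailing newline is allowed)
+ *	kstrtoull("-1", 10, &v)   returns -EINVAL (no minus sign accepted)
+ *	kstrtoull("99999999999999999999", 10, &v) returns -ERANGE
+ */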
+
+/**
+ * kstrtoll - convert a string to a long long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ * include a single newline before its terminating null. The first character
+ * may also be a plus sign or a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ * given as 0, then the base of the string is automatically detected with the
+ * conventional semantics - If it begins with 0x the number will be parsed as a
+ * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ * parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Used as a replacement for the obsolete simple_strtoull. Return code must
+ * be checked.
+ */
+int kstrtoll(const char *s, unsigned int base, long long *res)
+{
+ unsigned long long tmp;
+ int rv;
+
+ if (s[0] == '-') {
+ rv = _kstrtoull(s + 1, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if ((long long)-tmp > 0)
+ return -ERANGE;
+ *res = -tmp;
+ } else {
+ rv = kstrtoull(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if ((long long)tmp < 0)
+ return -ERANGE;
+ *res = tmp;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(kstrtoll);
+
+/* Internal, do not use. */
+int _kstrtoul(const char *s, unsigned int base, unsigned long *res)
+{
+ unsigned long long tmp;
+ int rv;
+
+ rv = kstrtoull(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (unsigned long long)(unsigned long)tmp)
+ return -ERANGE;
+ *res = tmp;
+ return 0;
+}
+EXPORT_SYMBOL(_kstrtoul);
+
+/* Internal, do not use. */
+int _kstrtol(const char *s, unsigned int base, long *res)
+{
+ long long tmp;
+ int rv;
+
+ rv = kstrtoll(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (long long)(long)tmp)
+ return -ERANGE;
+ *res = tmp;
+ return 0;
+}
+EXPORT_SYMBOL(_kstrtol);
+
+/**
+ * kstrtouint - convert a string to an unsigned int
+ * @s: The start of the string. The string must be null-terminated, and may also
+ * include a single newline before its terminating null. The first character
+ * may also be a plus sign, but not a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ * given as 0, then the base of the string is automatically detected with the
+ * conventional semantics - If it begins with 0x the number will be parsed as a
+ * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ * parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Used as a replacement for the obsolete simple_strtoull. Return code must
+ * be checked.
+ */
+int kstrtouint(const char *s, unsigned int base, unsigned int *res)
+{
+ unsigned long long tmp;
+ int rv;
+
+ rv = kstrtoull(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (unsigned long long)(unsigned int)tmp)
+ return -ERANGE;
+ *res = tmp;
+ return 0;
+}
+EXPORT_SYMBOL(kstrtouint);
+
+/**
+ * kstrtoint - convert a string to an int
+ * @s: The start of the string. The string must be null-terminated, and may also
+ * include a single newline before its terminating null. The first character
+ * may also be a plus sign or a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ * given as 0, then the base of the string is automatically detected with the
+ * conventional semantics - If it begins with 0x the number will be parsed as a
+ * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ * parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Used as a replacement for the obsolete simple_strtoull. Return code must
+ * be checked.
+ */
+int kstrtoint(const char *s, unsigned int base, int *res)
+{
+ long long tmp;
+ int rv;
+
+ rv = kstrtoll(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (long long)(int)tmp)
+ return -ERANGE;
+ *res = tmp;
+ return 0;
+}
+EXPORT_SYMBOL(kstrtoint);
+
+int kstrtou16(const char *s, unsigned int base, u16 *res)
+{
+ unsigned long long tmp;
+ int rv;
+
+ rv = kstrtoull(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (unsigned long long)(u16)tmp)
+ return -ERANGE;
+ *res = tmp;
+ return 0;
+}
+EXPORT_SYMBOL(kstrtou16);
+
+int kstrtos16(const char *s, unsigned int base, s16 *res)
+{
+ long long tmp;
+ int rv;
+
+ rv = kstrtoll(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (long long)(s16)tmp)
+ return -ERANGE;
+ *res = tmp;
+ return 0;
+}
+EXPORT_SYMBOL(kstrtos16);
+
+int kstrtou8(const char *s, unsigned int base, u8 *res)
+{
+ unsigned long long tmp;
+ int rv;
+
+ rv = kstrtoull(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (unsigned long long)(u8)tmp)
+ return -ERANGE;
+ *res = tmp;
+ return 0;
+}
+EXPORT_SYMBOL(kstrtou8);
+
+int kstrtos8(const char *s, unsigned int base, s8 *res)
+{
+ long long tmp;
+ int rv;
+
+ rv = kstrtoll(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (long long)(s8)tmp)
+ return -ERANGE;
+ *res = tmp;
+ return 0;
+}
+EXPORT_SYMBOL(kstrtos8);
+
+/**
+ * kstrtobool - convert common user inputs into boolean values
+ * @s: input string
+ * @res: result
+ *
+ * This routine returns 0 iff the first character is one of 'Yy1Nn0', or
+ * [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value
+ * pointed to by res is updated upon finding a match.
+ */
+int kstrtobool(const char *s, bool *res)
+{
+ if (!s)
+ return -EINVAL;
+
+ switch (s[0]) {
+ case 'y':
+ case 'Y':
+ case '1':
+ *res = true;
+ return 0;
+ case 'n':
+ case 'N':
+ case '0':
+ *res = false;
+ return 0;
+ case 'o':
+ case 'O':
+ switch (s[1]) {
+ case 'n':
+ case 'N':
+ *res = true;
+ return 0;
+ case 'f':
+ case 'F':
+ *res = false;
+ return 0;
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(kstrtobool);
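+
+/*
+ * Examples (illustrative): kstrtobool("yes", &v), kstrtobool("1", &v) and
+ * kstrtobool("on", &v) all set v to true; "no", "0" and "off" set v to
+ * false; anything else returns -EINVAL and leaves v untouched.
+ */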
diff --git a/linux/kstrtox.h b/linux/kstrtox.h
new file mode 100644
index 0000000..f13eeea
--- /dev/null
+++ b/linux/kstrtox.h
@@ -0,0 +1,8 @@
+#ifndef _LIB_KSTRTOX_H
+#define _LIB_KSTRTOX_H
+
+#define KSTRTOX_OVERFLOW (1U << 31)
+const char *_parse_integer_fixup_radix(const char *s, unsigned int *base);
+unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *res);
+
+#endif
diff --git a/linux/kthread.c b/linux/kthread.c
new file mode 100644
index 0000000..0f4b571
--- /dev/null
+++ b/linux/kthread.c
@@ -0,0 +1,117 @@
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <linux/bitops.h>
+#include <linux/kthread.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+
+enum KTHREAD_BITS {
+ KTHREAD_IS_PER_CPU = 0,
+ KTHREAD_SHOULD_STOP,
+ KTHREAD_SHOULD_PARK,
+ KTHREAD_IS_PARKED,
+};
+
+static void *kthread_start_fn(void *data)
+{
+ rcu_register_thread();
+
+ current = data;
+ schedule();
+ current->thread_fn(current->thread_data);
+
+ complete(&current->exited);
+ put_task_struct(current);
+ rcu_unregister_thread();
+ return NULL;
+}
+
+/**
+ * kthread_create - create a kthread (userspace pthread shim).
+ * @thread_fn: the function to run until kthread_should_stop() returns true.
+ * @thread_data: data ptr for @thread_fn.
+ * @namefmt: printf-style name for the thread.
+ *
+ * This helper creates and names a kernel-style thread backed by a pthread.
+ * The thread is created stopped (TASK_UNINTERRUPTIBLE, blocked in
+ * schedule()): use wake_up_process() to start it.
+ *
+ * When woken, the thread runs @thread_fn() with @thread_data as its
+ * argument. @thread_fn() should return when kthread_should_stop() is true,
+ * which means kthread_stop() has been called.
+ *
+ * Returns the new task_struct.
+ */
+struct task_struct *kthread_create(int (*thread_fn)(void *data),
+ void *thread_data,
+ const char namefmt[], ...)
+{
+ va_list args;
+ struct task_struct *p = malloc(sizeof(*p));
+
+ memset(p, 0, sizeof(*p));
+
+ va_start(args, namefmt);
+ vsnprintf(p->comm, sizeof(p->comm), namefmt, args);
+ va_end(args);
+
+ p->thread_fn = thread_fn;
+ p->thread_data = thread_data;
+ p->state = TASK_UNINTERRUPTIBLE;
+ pthread_mutex_init(&p->lock, NULL);
+ pthread_cond_init(&p->wait, NULL);
+ atomic_set(&p->usage, 1);
+ init_completion(&p->exited);
+
+ pthread_create(&p->thread, NULL, kthread_start_fn, p);
+ return p;
+}
+
+/**
+ * kthread_should_stop - should this kthread return now?
+ *
+ * When someone calls kthread_stop() on your kthread, it will be woken
+ * and this will return true. You should then return from the thread
+ * function (in this userspace shim the return value is discarded).
+ */
+bool kthread_should_stop(void)
+{
+ return test_bit(KTHREAD_SHOULD_STOP, &current->kthread_flags);
+}
+
+/**
+ * kthread_stop - stop a thread created by kthread_create().
+ * @p: thread created by kthread_create().
+ *
+ * Sets kthread_should_stop() for @p to return true, wakes it, and waits
+ * for it to exit. It may also be called right after kthread_create(),
+ * without an intervening wake_up_process(); the thread function then runs
+ * but should return immediately, since kthread_should_stop() is already
+ * true.
+ *
+ * Unlike the kernel version, this shim does not propagate the thread
+ * function's return value: kthread_stop() always returns 0.
+ */
+int kthread_stop(struct task_struct *p)
+{
+ get_task_struct(p);
+
+ set_bit(KTHREAD_SHOULD_STOP, &p->kthread_flags);
+ wake_up_process(p);
+ wait_for_completion(&p->exited);
+
+ put_task_struct(p);
+
+ return 0;
+}
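+
+/*
+ * Lifecycle sketch (illustrative; do_work() is a placeholder): a thread
+ * created here starts out blocked in schedule() and must be woken
+ * explicitly:
+ *
+ *	static int worker(void *arg)
+ *	{
+ *		while (!kthread_should_stop())
+ *			do_work(arg);
+ *		return 0;
+ *	}
+ *
+ *	p = kthread_create(worker, arg, "worker");
+ *	wake_up_process(p);
+ *	...
+ *	kthread_stop(p);
+ */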
diff --git a/linux/llist.c b/linux/llist.c
new file mode 100644
index 0000000..ae5872b
--- /dev/null
+++ b/linux/llist.c
@@ -0,0 +1,104 @@
+/*
+ * Lock-less NULL terminated single linked list
+ *
+ * The basic atomic operation of this list is cmpxchg on long. On
+ * architectures that don't have NMI-safe cmpxchg implementation, the
+ * list can NOT be used in NMI handlers. So code that uses the list in
+ * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ *
+ * Copyright 2010,2011 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/llist.h>
+
+
+/**
+ * llist_add_batch - add several linked entries in batch
+ * @new_first: first entry in batch to be added
+ * @new_last: last entry in batch to be added
+ * @head: the head for your lock-less list
+ *
+ * Return whether list is empty before adding.
+ */
+bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
+ struct llist_head *head)
+{
+ struct llist_node *first;
+
+ do {
+ new_last->next = first = ACCESS_ONCE(head->first);
+ } while (cmpxchg(&head->first, first, new_first) != first);
+
+ return !first;
+}
+EXPORT_SYMBOL_GPL(llist_add_batch);
+
+/**
+ * llist_del_first - delete the first entry of lock-less list
+ * @head: the head for your lock-less list
+ *
+ * If list is empty, return NULL, otherwise, return the first entry
+ * deleted, this is the newest added one.
+ *
+ * Only one llist_del_first user can be used simultaneously with
+ * multiple llist_add users without lock. Because otherwise
+ * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add,
+ * llist_add) sequence in another user may change @head->first->next,
+ * but keep @head->first. If multiple consumers are needed, please
+ * use llist_del_all or use lock between consumers.
+ */
+struct llist_node *llist_del_first(struct llist_head *head)
+{
+ struct llist_node *entry, *old_entry, *next;
+
+ entry = smp_load_acquire(&head->first);
+ for (;;) {
+ if (entry == NULL)
+ return NULL;
+ old_entry = entry;
+ next = READ_ONCE(entry->next);
+ entry = cmpxchg(&head->first, old_entry, next);
+ if (entry == old_entry)
+ break;
+ }
+
+ return entry;
+}
+EXPORT_SYMBOL_GPL(llist_del_first);
+
+/**
+ * llist_reverse_order - reverse order of a llist chain
+ * @head: first item of the list to be reversed
+ *
+ * Reverse the order of a chain of llist entries and return the
+ * new first entry.
+ */
+struct llist_node *llist_reverse_order(struct llist_node *head)
+{
+ struct llist_node *new_head = NULL;
+
+ while (head) {
+ struct llist_node *tmp = head;
+ head = head->next;
+ tmp->next = new_head;
+ new_head = tmp;
+ }
+
+ return new_head;
+}
+EXPORT_SYMBOL_GPL(llist_reverse_order);
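+
+/*
+ * Usage sketch (illustrative; llist_add(), llist_del_all() and llist_entry()
+ * come from llist.h, and struct item is hypothetical). Producers push
+ * entries without locking:
+ *
+ *	llist_add(&item->list, &head);
+ *
+ * A single consumer then grabs the whole list and processes it in FIFO
+ * order by reversing it first:
+ *
+ *	struct llist_node *n = llist_reverse_order(llist_del_all(&head));
+ *
+ *	while (n) {
+ *		struct item *i = llist_entry(n, struct item, list);
+ *
+ *		n = n->next;
+ *		process(i);
+ *	}
+ */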
diff --git a/linux/lz4_compress.c b/linux/lz4_compress.c
new file mode 100644
index 0000000..65243c7
--- /dev/null
+++ b/linux/lz4_compress.c
@@ -0,0 +1,258 @@
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ *
+ * Changed for kernel use by:
+ * Chanho Min <chanho.min@lge.com>
+ */
+
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/lz4.h>
+#include <asm/unaligned.h>
+#include "lz4defs.h"
+
+#define LZ4_HASH_VALUE(p, _table) \
+ __HASH_VALUE(p, MEMORY_USAGE - ilog2(sizeof(_table[0])))
+
+struct lz4_hash_table {
+ const u8 *(*add)(const struct lz4_hash_table, const u8 *);
+ void *ctx;
+ const u8 *base;
+};
+
+#if __SIZEOF_POINTER__ == 4
+static inline const u8 *hash_table_add32(const struct lz4_hash_table hash,
+ const u8 *ip)
+{
+ const u8 **table = hash.ctx;
+
+ swap(table[LZ4_HASH_VALUE(ip, table)], ip);
+ return ip;
+}
+#else
+static inline const u8 *hash_table_add32(const struct lz4_hash_table hash,
+ const u8 *ip)
+{
+ u32 *table = hash.ctx;
+ size_t offset = ip - hash.base;
+
+ swap(table[LZ4_HASH_VALUE(ip, table)], offset);
+ return hash.base + offset;
+}
+#endif
+
+static inline const u8 *hash_table_add16(const struct lz4_hash_table hash,
+ const u8 *ip)
+{
+ u16 *table = hash.ctx;
+ size_t offset = ip - hash.base;
+
+ swap(table[LZ4_HASH_VALUE(ip, table)], offset);
+ return hash.base + offset;
+}
+
+static inline const u8 *try_match(const struct lz4_hash_table hash,
+ const u8 *ip)
+{
+ const u8 *ref = hash.add(hash, ip);
+
+ return ref >= ip - MAX_DISTANCE &&
+ A32(ref) == A32(ip) ? ref : NULL;
+}
+
+static inline const u8 *find_match(const struct lz4_hash_table hash,
+ const u8 **ip, const u8 *anchor,
+ const u8 *start, const u8 *end)
+{
+ int findmatchattempts = (1U << SKIPSTRENGTH) + 3;
+ const u8 *next_ip = *ip, *ref;
+
+ do {
+ *ip = next_ip;
+ next_ip += findmatchattempts++ >> SKIPSTRENGTH;
+
+ if (unlikely(next_ip > end))
+ return NULL;
+ } while (!(ref = try_match(hash, *ip)));
+
+ /* Catch up */
+ while (*ip > anchor &&
+ ref > start &&
+ unlikely((*ip)[-1] == ref[-1])) {
+ (*ip)--;
+ ref--;
+ }
+
+ return ref;
+}
+
+/*
+ * lz4_compressctx():
+ * ------------------
+ * Compress @src_len bytes from @src into the output buffer @dst, whose
+ * capacity is given by *@dst_len. On success, *@dst_len is updated to the
+ * number of bytes written and 0 is returned. If the output does not fit,
+ * compression stops and a negative value is returned.
+ */
+static inline int lz4_compressctx(const struct lz4_hash_table hash,
+ const u8 *src, size_t src_len,
+ u8 *dst, size_t *dst_len)
+{
+ const u8 *ip = src;
+ const u8 *anchor = ip, *ref;
+ const u8 *const iend = ip + src_len;
+ const u8 *const mflimit = iend - MFLIMIT;
+ const u8 *const matchlimit = iend - LASTLITERALS;
+ size_t maxoutputsize = *dst_len;
+ u8 *op = dst;
+ u8 *const oend = op + maxoutputsize;
+ int length;
+ u8 *token;
+
+ /* Init */
+ if (src_len < MINLENGTH)
+ goto _last_literals;
+
+ memset(hash.ctx, 0, LZ4_MEM_COMPRESS);
+ hash.add(hash, ip);
+
+ /* Main Loop */
+ while (1) {
+ /* Starting a literal: */
+ anchor = ip++;
+ ref = find_match(hash, &ip, anchor, src, mflimit);
+ if (!ref)
+ goto _last_literals;
+
+ /*
+ * We found a match; @ip now points to the match and @ref points
+ * to the prior part of the input we matched with. Everything up
+ * to @anchor has been encoded; the range from @anchor to @ip
+ * didn't match and now has to be encoded as a literal:
+ */
+ length = ip - anchor;
+ token = op++;
+
+ /* check output limit */
+ if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
+ (length >> 8) > oend))
+ return -(ip - src);
+
+ *token = encode_length(&op, length) << ML_BITS;
+
+ /* Copy Literals */
+ MEMCPY_ADVANCE_CHUNKED(op, anchor, length);
+
+ /* Encode matches: */
+ while (1) {
+ /* Match offset: */
+ PUT_LE16_ADVANCE(op, ip - ref);
+
+ /* MINMATCH bytes already matched from find_match(): */
+ ip += MINMATCH;
+ ref += MINMATCH;
+
+ length = common_length(ip, ref, matchlimit);
+
+ /* Check output limit */
+ if (unlikely(op + (1 + LASTLITERALS) +
+ (length >> 8) > oend))
+ return -(ip - src);
+
+ ip += length;
+
+ *token += encode_length(&op, length);
+
+ /* Test end of chunk */
+ if (ip > mflimit) {
+ anchor = ip;
+ break;
+ }
+
+ /* Fill table */
+ hash.add(hash, ip - 2);
+
+ /* Test next position */
+ ref = try_match(hash, ip);
+ if (!ref)
+ break;
+
+ token = op++;
+ *token = 0;
+ }
+ }
+
+_last_literals:
+ /* Encode Last Literals */
+ length = iend - anchor;
+ if ((op - dst) + length + 1 +
+ ((length + 255 - RUN_MASK) / 255) > (u32)maxoutputsize)
+ return -(ip - src);
+
+ token = op++;
+ *token = encode_length(&op, length) << ML_BITS;
+ MEMCPY_ADVANCE(op, anchor, iend - anchor);
+
+ /* End */
+ *dst_len = op - dst;
+ return 0;
+}
+
+__attribute__((flatten))
+int lz4_compress(const unsigned char *src, size_t src_len,
+ unsigned char *dst, size_t *dst_len, void *wrkmem)
+{
+ if (src_len < LZ4_64KLIMIT) {
+ const struct lz4_hash_table hash = {
+ .add = hash_table_add16,
+ .ctx = wrkmem,
+ .base = src,
+ };
+
+ return lz4_compressctx(hash, src, src_len, dst, dst_len);
+ } else {
+ const struct lz4_hash_table hash = {
+ .add = hash_table_add32,
+ .ctx = wrkmem,
+ .base = src,
+ };
+
+ return lz4_compressctx(hash, src, src_len, dst, dst_len);
+ }
+}
+EXPORT_SYMBOL(lz4_compress);
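+
+/*
+ * Usage sketch (illustrative; LZ4_MEM_COMPRESS and lz4_compressbound() are
+ * assumed to come from lz4.h as in the kernel, and write_out() is a
+ * placeholder):
+ *
+ *	void *wrkmem = malloc(LZ4_MEM_COMPRESS);
+ *	size_t dst_len = lz4_compressbound(src_len);
+ *	unsigned char *dst = malloc(dst_len);
+ *
+ *	if (!lz4_compress(src, src_len, dst, &dst_len, wrkmem))
+ *		write_out(dst, dst_len);
+ *
+ * On success dst_len is updated to the compressed size; a negative return
+ * means the output buffer was too small.
+ */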
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("LZ4 compressor");
diff --git a/linux/lz4_decompress.c b/linux/lz4_decompress.c
new file mode 100644
index 0000000..0f3e42d
--- /dev/null
+++ b/linux/lz4_decompress.c
@@ -0,0 +1,316 @@
+/*
+ * LZ4 Decompressor for Linux kernel
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * Based on LZ4 implementation by Yann Collet.
+ *
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ */
+
+#ifndef STATIC
+#include <linux/module.h>
+#include <linux/kernel.h>
+#endif
+#include <linux/lz4.h>
+
+#include "lz4defs.h"
+
+static const int dec32table[8] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+static const int dec64table[8] = {0, 0, 0, -1, 0, 1, 2, 3};
+#else
+static const int dec64table[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+
+static inline size_t get_length(const u8 **ip, size_t length)
+{
+ if (length == LENGTH_LONG) {
+ size_t len;
+
+ do {
+ length += (len = *(*ip)++);
+ } while (len == 255);
+ }
+
+ return length;
+}
+
+static int lz4_uncompress(const u8 *source, u8 *dest, int osize)
+{
+ const u8 *ip = source;
+ const u8 *ref;
+ u8 *op = dest;
+ u8 * const oend = op + osize;
+ u8 *cpy;
+ unsigned token, offset;
+ ssize_t length;
+
+ while (1) {
+ /* get runlength */
+ token = *ip++;
+ length = get_length(&ip, token >> ML_BITS);
+
+ /* copy literals */
+ if (unlikely(op + length > oend - COPYLENGTH)) {
+ /*
+ * Error: not enough place for another match
+ * (min 4) + 5 literals
+ */
+ if (op + length != oend)
+ goto _output_error;
+
+ MEMCPY_ADVANCE(op, ip, length);
+ break; /* EOF */
+ }
+ MEMCPY_ADVANCE_CHUNKED(op, ip, length);
+
+ /* get match offset */
+ offset = GET_LE16_ADVANCE(ip);
+ ref = op - offset;
+
+		/* Error: offset creates a reference outside the destination buffer */
+ if (unlikely(ref < (u8 *const) dest))
+ goto _output_error;
+
+ /* get match length */
+ length = get_length(&ip, token & ML_MASK);
+ length += MINMATCH;
+
+ /* copy first STEPSIZE bytes of match: */
+ if (unlikely(offset < STEPSIZE)) {
+ MEMCPY_ADVANCE_BYTES(op, ref, 4);
+ ref -= dec32table[offset];
+
+ memcpy(op, ref, 4);
+ op += STEPSIZE - 4;
+ ref -= dec64table[offset];
+ } else {
+ MEMCPY_ADVANCE(op, ref, STEPSIZE);
+ }
+ length -= STEPSIZE;
+ /*
+ * Note - length could have been < STEPSIZE; that's ok, length
+ * will now be negative and we'll just end up rewinding op:
+ */
+
+ /* copy rest of match: */
+ cpy = op + length;
+ if (cpy > oend - COPYLENGTH) {
+ /* Error: request to write beyond destination buffer */
+ if (cpy > oend ||
+ ref + COPYLENGTH > oend)
+ goto _output_error;
+#if !LZ4_ARCH64
+ if (op + COPYLENGTH > oend)
+ goto _output_error;
+#endif
+ MEMCPY_ADVANCE_CHUNKED_NOFIXUP(op, ref, oend - COPYLENGTH);
+ /* op could be > cpy here */
+ while (op < cpy)
+ *op++ = *ref++;
+ op = cpy;
+ /*
+ * Check EOF (should never happen, since last 5 bytes
+ * are supposed to be literals)
+ */
+ if (op == oend)
+ goto _output_error;
+ } else {
+ MEMCPY_ADVANCE_CHUNKED(op, ref, length);
+ }
+ }
+ /* end of decoding */
+ return ip - source;
+
+ /* write overflow error detected */
+_output_error:
+ return -1;
+}
+
+static inline ssize_t get_length_safe(const u8 **ip, ssize_t length)
+{
+	if (length == LENGTH_LONG) {
+		size_t len;
+
+		do {
+			/* as in get_length(), but checking for overflow: */
+			length += (len = *(*ip)++);
+			if (unlikely((ssize_t) length < 0))
+				return -1;
+		} while (len == 255);
+ }
+
+ return length;
+}
+
+static int lz4_uncompress_unknownoutputsize(const u8 *source, u8 *dest,
+ int isize, size_t maxoutputsize)
+{
+ const u8 *ip = source;
+ const u8 *const iend = ip + isize;
+ const u8 *ref;
+ u8 *op = dest;
+ u8 * const oend = op + maxoutputsize;
+ u8 *cpy;
+ unsigned token, offset;
+ size_t length;
+
+ /* Main Loop */
+ while (ip < iend) {
+ /* get runlength */
+ token = *ip++;
+ length = get_length_safe(&ip, token >> ML_BITS);
+ if (unlikely((ssize_t) length < 0))
+ goto _output_error;
+
+ /* copy literals */
+ if ((op + length > oend - COPYLENGTH) ||
+ (ip + length > iend - COPYLENGTH)) {
+
+			/* Error: write beyond end of output buffer */
+			if (op + length > oend)
+				goto _output_error;
+
+			/*
+			 * Error: the LZ4 format requires all input to be
+			 * consumed at this stage
+			 */
+			if (ip + length != iend)
+				goto _output_error;
+ MEMCPY_ADVANCE(op, ip, length);
+ break;/* Necessarily EOF, due to parsing restrictions */
+ }
+ MEMCPY_ADVANCE_CHUNKED(op, ip, length);
+
+ /* get match offset */
+ offset = GET_LE16_ADVANCE(ip);
+ ref = op - offset;
+
+ /* Error: offset creates a reference outside the destination buffer */
+ if (ref < (u8 * const) dest)
+ goto _output_error;
+
+ /* get match length */
+ length = get_length_safe(&ip, token & ML_MASK);
+ if (unlikely((ssize_t) length < 0))
+ goto _output_error;
+
+ length += MINMATCH;
+
+ /* copy first STEPSIZE bytes of match: */
+ if (unlikely(offset < STEPSIZE)) {
+ MEMCPY_ADVANCE_BYTES(op, ref, 4);
+ ref -= dec32table[offset];
+
+ memcpy(op, ref, 4);
+ op += STEPSIZE - 4;
+ ref -= dec64table[offset];
+ } else {
+ MEMCPY_ADVANCE(op, ref, STEPSIZE);
+ }
+ length -= STEPSIZE;
+
+ /* copy rest of match: */
+ cpy = op + length;
+ if (cpy > oend - COPYLENGTH) {
+ /* Error: request to write beyond destination buffer */
+ if (cpy > oend ||
+ ref + COPYLENGTH > oend)
+ goto _output_error;
+#if !LZ4_ARCH64
+ if (op + COPYLENGTH > oend)
+ goto _output_error;
+#endif
+ MEMCPY_ADVANCE_CHUNKED_NOFIXUP(op, ref, oend - COPYLENGTH);
+ while (op < cpy)
+ *op++ = *ref++;
+ op = cpy;
+ /*
+ * Check EOF (should never happen, since last 5 bytes
+ * are supposed to be literals)
+ */
+ if (op == oend)
+ goto _output_error;
+ } else {
+ MEMCPY_ADVANCE_CHUNKED(op, ref, length);
+ }
+ }
+ /* end of decoding */
+ return op - dest;
+
+ /* write overflow error detected */
+_output_error:
+ return -1;
+}
+
+int lz4_decompress(const unsigned char *src, size_t *src_len,
+ unsigned char *dest, size_t actual_dest_len)
+{
+ int ret = -1;
+ int input_len = 0;
+
+ input_len = lz4_uncompress(src, dest, actual_dest_len);
+ if (input_len < 0)
+ goto exit_0;
+ *src_len = input_len;
+
+ return 0;
+exit_0:
+ return ret;
+}
+#ifndef STATIC
+EXPORT_SYMBOL(lz4_decompress);
+#endif
+
+int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
+ unsigned char *dest, size_t *dest_len)
+{
+ int ret = -1;
+ int out_len = 0;
+
+ out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len,
+ *dest_len);
+ if (out_len < 0)
+ goto exit_0;
+ *dest_len = out_len;
+
+ return 0;
+exit_0:
+ return ret;
+}
+#ifndef STATIC
+EXPORT_SYMBOL(lz4_decompress_unknownoutputsize);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("LZ4 Decompressor");
+#endif
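For reference, a minimal caller-side sketch of the decompression entry point above (not part of the patch; the function and buffer names are hypothetical, and it assumes this tree's <linux/lz4.h> declares lz4_decompress() with the signature used here):

#include <linux/lz4.h>

/* Hypothetical caller: out_len must be the exact decompressed size. */
static int decompress_extent(const unsigned char *compressed,
			     unsigned char *out, size_t out_len)
{
	size_t compressed_used = 0;
	int ret;

	ret = lz4_decompress(compressed, &compressed_used, out, out_len);

	/* On success (ret == 0), compressed_used is the input bytes consumed. */
	return ret;
}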
diff --git a/linux/lz4defs.h b/linux/lz4defs.h
new file mode 100644
index 0000000..586b217
--- /dev/null
+++ b/linux/lz4defs.h
@@ -0,0 +1,181 @@
+/*
+ * lz4defs.h -- architecture specific defines
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Detect 64-bit mode
+ */
+#if __SIZEOF_POINTER__ == 8
+#define LZ4_ARCH64 1
+#else
+#define LZ4_ARCH64 0
+#endif
+
+#include <linux/string.h>
+#include <asm/unaligned.h>
+
+#define A32(_p) get_unaligned((u32 *) (_p))
+#define A16(_p) get_unaligned((u16 *) (_p))
+
+#define GET_LE16_ADVANCE(_src) \
+({ \
+ u16 _r = get_unaligned_le16(_src); \
+ (_src) += 2; \
+ _r; \
+})
+
+#define PUT_LE16_ADVANCE(_dst, _v) \
+do { \
+ put_unaligned_le16((_v), (_dst)); \
+ (_dst) += 2; \
+} while (0)
+
+#define LENGTH_LONG 15
+#define COPYLENGTH 8
+#define ML_BITS 4
+#define ML_MASK ((1U << ML_BITS) - 1)
+#define RUN_BITS (8 - ML_BITS)
+#define RUN_MASK ((1U << RUN_BITS) - 1)
+#define MEMORY_USAGE 14
+#define MINMATCH 4
+#define SKIPSTRENGTH 6
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH + MINMATCH)
+#define MINLENGTH (MFLIMIT + 1)
+#define MAXD_LOG 16
+#define MAXD (1 << MAXD_LOG)
+#define MAXD_MASK (u32)(MAXD - 1)
+#define MAX_DISTANCE (MAXD - 1)
+#define HASH_LOG (MAXD_LOG - 1)
+#define HASHTABLESIZE (1 << HASH_LOG)
+#define MAX_NB_ATTEMPTS 256
+#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH)
+#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1))
+
+#define __HASH_VALUE(p, bits) \
+ (((A32(p)) * 2654435761U) >> (32 - (bits)))
+
+#define HASH_VALUE(p) __HASH_VALUE(p, HASH_LOG)
+
+#define MEMCPY_ADVANCE(_dst, _src, length) \
+do { \
+ typeof(length) _length = (length); \
+ memcpy(_dst, _src, _length); \
+ _src += _length; \
+ _dst += _length; \
+} while (0)
+
+#define MEMCPY_ADVANCE_BYTES(_dst, _src, _length) \
+do { \
+ const u8 *_end = (_src) + (_length); \
+ while ((_src) < _end) \
+ *_dst++ = *_src++; \
+} while (0)
+
+#define STEPSIZE __SIZEOF_LONG__
+
+#define LZ4_COPYPACKET(_src, _dst) \
+do { \
+ MEMCPY_ADVANCE(_dst, _src, STEPSIZE); \
+ MEMCPY_ADVANCE(_dst, _src, COPYLENGTH - STEPSIZE);\
+} while (0)
+
+/*
+ * Equivalent to MEMCPY_ADVANCE - except may overrun @_dst and @_src by
+ * COPYLENGTH:
+ *
+ * Note: src and dst may overlap (with src < dst) - we must do the copy in
+ * STEPSIZE chunks for correctness
+ *
+ * Note also: length may be negative - we must not call memcpy if length is
+ * negative, but still adjust dst and src by length
+ */
+#define MEMCPY_ADVANCE_CHUNKED(_dst, _src, _length) \
+do { \
+ u8 *_end = (_dst) + (_length); \
+ while ((_dst) < _end) \
+ LZ4_COPYPACKET(_src, _dst); \
+ _src -= (_dst) - _end; \
+ _dst = _end; \
+} while (0)
+
+#define MEMCPY_ADVANCE_CHUNKED_NOFIXUP(_dst, _src, _end)\
+do { \
+ while ((_dst) < (_end)) \
+ LZ4_COPYPACKET((_src), (_dst)); \
+} while (0)
+
+struct lz4_hashtable {
+#if LZ4_ARCH64
+ const u8 * const base;
+ u32 *table;
+#else
+ const int base;
+ const u8 *table;
+#endif
+};
+
+#if LZ4_ARCH64
+#define HTYPE u32
+#else /* 32-bit */
+#define HTYPE const u8*
+#endif
+
+#ifdef __BIG_ENDIAN
+#define LZ4_NBCOMMONBYTES(val) (__builtin_clzl(val) >> 3)
+#else
+#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzl(val) >> 3)
+#endif
+
+static inline unsigned common_length(const u8 *l, const u8 *r,
+ const u8 *const l_end)
+{
+ const u8 *l_start = l;
+
+ while (likely(l <= l_end - sizeof(long))) {
+ unsigned long diff =
+ get_unaligned((unsigned long *) l) ^
+ get_unaligned((unsigned long *) r);
+
+ if (diff)
+ return l + LZ4_NBCOMMONBYTES(diff) - l_start;
+
+ l += sizeof(long);
+ r += sizeof(long);
+ }
+#if LZ4_ARCH64
+ if (l <= l_end - 4 && A32(r) == A32(l)) {
+ l += 4;
+ r += 4;
+ }
+#endif
+ if (l <= l_end - 2 && A16(r) == A16(l)) {
+ l += 2;
+ r += 2;
+ }
+ if (l <= l_end - 1 && *r == *l) {
+ l++;
+ r++;
+ }
+
+ return l - l_start;
+}
+
+static inline unsigned encode_length(u8 **op, unsigned length)
+{
+ if (length >= LENGTH_LONG) {
+ length -= LENGTH_LONG;
+
+ for (; length > 254 ; length -= 255)
+ *(*op)++ = 255;
+ *(*op)++ = length;
+ return LENGTH_LONG;
+ } else
+ return length;
+}
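For reference, a standalone worked example of the length coding that encode_length() and get_length() above implement: a 4-bit nibble in the token, with values >= 15 spilling into extension bytes terminated by the first byte below 255. This mirrors the lz4defs.h logic for illustration only and is not part of the patch:

#include <stdio.h>

int main(void)
{
	unsigned length = 300, nibble, decoded;
	unsigned char ext[8], *op = ext;
	const unsigned char *ip = ext;

	/* encode_length(): lengths >= 15 spill into extension bytes */
	if (length >= 15) {
		unsigned rest = length - 15;

		nibble = 15;
		for (; rest > 254; rest -= 255)
			*op++ = 255;
		*op++ = rest;
	} else {
		nibble = length;
	}

	/* get_length(): start from the nibble, keep adding bytes while they read 255 */
	decoded = nibble;
	if (nibble == 15) {
		unsigned char b;

		do {
			decoded += (b = *ip++);
		} while (b == 255);
	}

	/* Prints: nibble 15, 2 extension bytes (255, 30), decoded 300 */
	printf("nibble %u, %d extension bytes, decoded %u\n",
	       nibble, (int)(op - ext), decoded);
	return 0;
}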
diff --git a/linux/lz4hc_compress.c b/linux/lz4hc_compress.c
new file mode 100644
index 0000000..b64ded0
--- /dev/null
+++ b/linux/lz4hc_compress.c
@@ -0,0 +1,454 @@
+/*
+ * LZ4 HC - High Compression Mode of LZ4
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ *
+ * Changed for kernel use by:
+ * Chanho Min <chanho.min@lge.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/lz4.h>
+#include <asm/unaligned.h>
+#include "lz4defs.h"
+
+struct lz4hc_data {
+ const u8 *base;
+ HTYPE hashtable[HASHTABLESIZE];
+ u16 chaintable[MAXD];
+ const u8 *nexttoupdate;
+} __attribute__((__packed__));
+
+static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base)
+{
+ memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable));
+ memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable));
+
+#if LZ4_ARCH64
+ hc4->nexttoupdate = base + 1;
+#else
+ hc4->nexttoupdate = base;
+#endif
+ hc4->base = base;
+ return 1;
+}
+
+/* Update chains up to ip (excluded) */
+static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip)
+{
+ u16 *chaintable = hc4->chaintable;
+ HTYPE *hashtable = hc4->hashtable;
+#if LZ4_ARCH64
+ const u8 * const base = hc4->base;
+#else
+ const int base = 0;
+#endif
+
+ while (hc4->nexttoupdate < ip) {
+ const u8 *p = hc4->nexttoupdate;
+ size_t delta = p - (hashtable[HASH_VALUE(p)] + base);
+ if (delta > MAX_DISTANCE)
+ delta = MAX_DISTANCE;
+ chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta;
+ hashtable[HASH_VALUE(p)] = (p) - base;
+ hc4->nexttoupdate++;
+ }
+}
+
+static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4,
+ const u8 *ip, const u8 *const matchlimit, const u8 **matchpos)
+{
+ u16 *const chaintable = hc4->chaintable;
+ HTYPE *const hashtable = hc4->hashtable;
+ const u8 *ref;
+#if LZ4_ARCH64
+ const u8 * const base = hc4->base;
+#else
+ const int base = 0;
+#endif
+ int nbattempts = MAX_NB_ATTEMPTS;
+ size_t repl = 0, ml = 0;
+ u16 delta;
+
+ /* HC4 match finder */
+ lz4hc_insert(hc4, ip);
+ ref = hashtable[HASH_VALUE(ip)] + base;
+
+ /* potential repetition */
+ if (ref >= ip-4) {
+ /* confirmed */
+ if (A32(ref) == A32(ip)) {
+ delta = (u16)(ip-ref);
+ repl = ml = common_length(ip + MINMATCH,
+ ref + MINMATCH, matchlimit) + MINMATCH;
+ *matchpos = ref;
+ }
+ ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+ }
+
+ while ((ref >= ip - MAX_DISTANCE) && nbattempts) {
+ nbattempts--;
+ if (*(ref + ml) == *(ip + ml)) {
+ if (A32(ref) == A32(ip)) {
+ size_t mlt =
+ common_length(ip + MINMATCH,
+ ref + MINMATCH, matchlimit) + MINMATCH;
+ if (mlt > ml) {
+ ml = mlt;
+ *matchpos = ref;
+ }
+ }
+ }
+ ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+ }
+
+ /* Complete table */
+ if (repl) {
+ const u8 *ptr = ip;
+ const u8 *end;
+ end = ip + repl - (MINMATCH-1);
+ /* Pre-Load */
+ while (ptr < end - delta) {
+ chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
+ ptr++;
+ }
+ do {
+ chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
+ /* Head of chain */
+ hashtable[HASH_VALUE(ptr)] = (ptr) - base;
+ ptr++;
+ } while (ptr < end);
+ hc4->nexttoupdate = end;
+ }
+
+ return (int)ml;
+}
+
+static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4,
+ const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest,
+ const u8 **matchpos, const u8 **startpos)
+{
+ u16 *const chaintable = hc4->chaintable;
+ HTYPE *const hashtable = hc4->hashtable;
+#if LZ4_ARCH64
+ const u8 * const base = hc4->base;
+#else
+ const int base = 0;
+#endif
+ const u8 *ref;
+ int nbattempts = MAX_NB_ATTEMPTS;
+ int delta = (int)(ip - startlimit);
+
+ /* First Match */
+ lz4hc_insert(hc4, ip);
+ ref = hashtable[HASH_VALUE(ip)] + base;
+
+ while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base)
+ && (nbattempts)) {
+ nbattempts--;
+ if (*(startlimit + longest) == *(ref - delta + longest)) {
+ if (A32(ref) == A32(ip)) {
+ const u8 *reft = ref;
+ const u8 *startt = ip;
+ unsigned length =
+ common_length(ip + MINMATCH,
+ ref + MINMATCH,
+ matchlimit);
+
+ while ((startt > startlimit)
+ && (reft > hc4->base)
+ && (startt[-1] == reft[-1])) {
+ startt--;
+ reft--;
+ length++;
+ }
+
+ if (length > longest) {
+ longest = length;
+ *matchpos = reft;
+ *startpos = startt;
+ }
+ }
+ }
+ ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+ }
+ return longest;
+}
+
+static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor,
+ int ml, const u8 *ref)
+{
+ unsigned length;
+ u8 *token;
+
+ /* Encode Literal length */
+ length = *ip - *anchor;
+ token = (*op)++;
+ *token = encode_length(op, length) << ML_BITS;
+
+ /* Copy Literals */
+ MEMCPY_ADVANCE_CHUNKED(*op, *anchor, length);
+
+ /* Encode Offset */
+ PUT_LE16_ADVANCE(*op, (u16)(*ip - ref));
+
+ *token += encode_length(op, ml - MINMATCH);
+
+ /* Prepare next loop */
+ *ip += ml;
+ *anchor = *ip;
+
+ return 0;
+}
+
+static int lz4_compresshcctx(struct lz4hc_data *ctx,
+ const char *source,
+ char *dest,
+ int isize)
+{
+ const u8 *ip = (const u8 *)source;
+ const u8 *anchor = ip;
+ const u8 *const iend = ip + isize;
+ const u8 *const mflimit = iend - MFLIMIT;
+ const u8 *const matchlimit = (iend - LASTLITERALS);
+
+ u8 *op = (u8 *)dest;
+
+ int ml, ml2, ml3, ml0;
+ const u8 *ref = NULL;
+ const u8 *start2 = NULL;
+ const u8 *ref2 = NULL;
+ const u8 *start3 = NULL;
+ const u8 *ref3 = NULL;
+ const u8 *start0;
+ const u8 *ref0;
+ int lastrun;
+
+ ip++;
+
+ /* Main Loop */
+ while (ip < mflimit) {
+ ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref));
+ if (!ml) {
+ ip++;
+ continue;
+ }
+
+ /* saved, in case we would skip too much */
+ start0 = ip;
+ ref0 = ref;
+ ml0 = ml;
+_search2:
+ if (ip+ml < mflimit)
+ ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2,
+ ip + 1, matchlimit, ml, &ref2, &start2);
+ else
+ ml2 = ml;
+ /* No better match */
+ if (ml2 == ml) {
+ lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+ continue;
+ }
+
+ if (start0 < ip) {
+ /* empirical */
+ if (start2 < ip + ml0) {
+ ip = start0;
+ ref = ref0;
+ ml = ml0;
+ }
+ }
+ /*
+ * Here, start0==ip
+ * First Match too small : removed
+ */
+ if ((start2 - ip) < 3) {
+ ml = ml2;
+ ip = start2;
+ ref = ref2;
+ goto _search2;
+ }
+
+_search3:
+ /*
+ * Currently we have :
+ * ml2 > ml1, and
+ * ip1+3 <= ip2 (usually < ip1+ml1)
+ */
+ if ((start2 - ip) < OPTIMAL_ML) {
+ int correction;
+ int new_ml = ml;
+ if (new_ml > OPTIMAL_ML)
+ new_ml = OPTIMAL_ML;
+ if (ip + new_ml > start2 + ml2 - MINMATCH)
+ new_ml = (int)(start2 - ip) + ml2 - MINMATCH;
+ correction = new_ml - (int)(start2 - ip);
+ if (correction > 0) {
+ start2 += correction;
+ ref2 += correction;
+ ml2 -= correction;
+ }
+ }
+ /*
+ * Now, we have start2 = ip+new_ml,
+ * with new_ml=min(ml, OPTIMAL_ML=18)
+ */
+ if (start2 + ml2 < mflimit)
+ ml3 = lz4hc_insertandgetwidermatch(ctx,
+ start2 + ml2 - 3, start2, matchlimit,
+ ml2, &ref3, &start3);
+ else
+ ml3 = ml2;
+
+ /* No better match : 2 sequences to encode */
+ if (ml3 == ml2) {
+ /* ip & ref are known; Now for ml */
+ if (start2 < ip+ml)
+ ml = (int)(start2 - ip);
+
+ /* Now, encode 2 sequences */
+ lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+ ip = start2;
+ lz4_encodesequence(&ip, &op, &anchor, ml2, ref2);
+ continue;
+ }
+
+ /* Not enough space for match 2 : remove it */
+ if (start3 < ip + ml + 3) {
+ /*
+ * can write Seq1 immediately ==> Seq2 is removed,
+ * so Seq3 becomes Seq1
+ */
+ if (start3 >= (ip + ml)) {
+ if (start2 < ip + ml) {
+ int correction =
+ (int)(ip + ml - start2);
+ start2 += correction;
+ ref2 += correction;
+ ml2 -= correction;
+ if (ml2 < MINMATCH) {
+ start2 = start3;
+ ref2 = ref3;
+ ml2 = ml3;
+ }
+ }
+
+ lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+ ip = start3;
+ ref = ref3;
+ ml = ml3;
+
+ start0 = start2;
+ ref0 = ref2;
+ ml0 = ml2;
+ goto _search2;
+ }
+
+ start2 = start3;
+ ref2 = ref3;
+ ml2 = ml3;
+ goto _search3;
+ }
+
+ /*
+ * OK, now we have 3 ascending matches; let's write at least
+ * the first one. ip & ref are known; now for ml:
+ */
+ if (start2 < ip + ml) {
+ if ((start2 - ip) < (int)ML_MASK) {
+ int correction;
+ if (ml > OPTIMAL_ML)
+ ml = OPTIMAL_ML;
+ if (ip + ml > start2 + ml2 - MINMATCH)
+ ml = (int)(start2 - ip) + ml2
+ - MINMATCH;
+ correction = ml - (int)(start2 - ip);
+ if (correction > 0) {
+ start2 += correction;
+ ref2 += correction;
+ ml2 -= correction;
+ }
+ } else
+ ml = (int)(start2 - ip);
+ }
+ lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+
+ ip = start2;
+ ref = ref2;
+ ml = ml2;
+
+ start2 = start3;
+ ref2 = ref3;
+ ml2 = ml3;
+
+ goto _search3;
+ }
+
+ /* Encode Last Literals */
+ lastrun = (int)(iend - anchor);
+ if (lastrun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastrun -= RUN_MASK;
+ for (; lastrun > 254 ; lastrun -= 255)
+ *op++ = 255;
+ *op++ = (u8) lastrun;
+ } else
+ *op++ = (lastrun << ML_BITS);
+ memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+ /* End */
+ return (int) (((char *)op) - dest);
+}
+
+int lz4hc_compress(const unsigned char *src, size_t src_len,
+ unsigned char *dst, size_t *dst_len, void *wrkmem)
+{
+ int ret = -1;
+ int out_len = 0;
+
+ struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem;
+ lz4hc_init(hc4, (const u8 *)src);
+ out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src,
+ (char *)dst, (int)src_len);
+
+ if (out_len < 0)
+ goto exit;
+
+ *dst_len = out_len;
+ return 0;
+
+exit:
+ return ret;
+}
+EXPORT_SYMBOL(lz4hc_compress);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("LZ4HC compressor");
diff --git a/linux/rbtree.c b/linux/rbtree.c
new file mode 100644
index 0000000..d0e3cbf
--- /dev/null
+++ b/linux/rbtree.c
@@ -0,0 +1,615 @@
+/*
+ Red Black Trees
+ (C) 1999 Andrea Arcangeli <andrea@suse.de>
+ (C) 2002 David Woodhouse <dwmw2@infradead.org>
+ (C) 2012 Michel Lespinasse <walken@google.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ linux/lib/rbtree.c
+*/
+
+#include <linux/atomic.h>
+#include <linux/rbtree_augmented.h>
+#include <linux/export.h>
+
+/*
+ * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree
+ *
+ * 1) A node is either red or black
+ * 2) The root is black
+ * 3) All leaves (NULL) are black
+ * 4) Both children of every red node are black
+ * 5) Every simple path from root to leaves contains the same number
+ * of black nodes.
+ *
+ * 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
+ * consecutive red nodes in a path and every red node is therefore followed by
+ * a black. So if B is the number of black nodes on every simple path (as per
+ * 5), then the longest possible path due to 4 is 2B.
+ *
+ * We shall indicate color with case, where black nodes are uppercase and red
+ * nodes will be lowercase. Unknown color nodes shall be drawn as red within
+ * parentheses and have some accompanying text comment.
+ */
+
+/*
+ * Notes on lockless lookups:
+ *
+ * All stores to the tree structure (rb_left and rb_right) must be done using
+ * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
+ * tree structure as seen in program order.
+ *
+ * These two requirements will allow lockless iteration of the tree -- not
+ * correct iteration mind you, tree rotations are not atomic so a lookup might
+ * miss entire subtrees.
+ *
+ * But they do guarantee that any such traversal will only see valid elements
+ * and that it will indeed complete -- does not get stuck in a loop.
+ *
+ * It also guarantees that if the lookup returns an element it is the 'correct'
+ * one. But not returning an element does _NOT_ mean it's not present.
+ *
+ * NOTE:
+ *
+ * Stores to __rb_parent_color are not important for simple lookups so those
+ * are left undone as of now. Nor did I check for loops involving parent
+ * pointers.
+ */
+
+static inline void rb_set_black(struct rb_node *rb)
+{
+ rb->__rb_parent_color |= RB_BLACK;
+}
+
+static inline struct rb_node *rb_red_parent(struct rb_node *red)
+{
+ return (struct rb_node *)red->__rb_parent_color;
+}
+
+/*
+ * Helper function for rotations:
+ * - old's parent and color get assigned to new
+ * - old gets assigned new as a parent and 'color' as a color.
+ */
+static inline void
+__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
+ struct rb_root *root, int color)
+{
+ struct rb_node *parent = rb_parent(old);
+ new->__rb_parent_color = old->__rb_parent_color;
+ rb_set_parent_color(old, new, color);
+ __rb_change_child(old, new, parent, root);
+}
+
+static __always_inline void
+__rb_insert(struct rb_node *node, struct rb_root *root,
+ void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
+{
+ struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
+
+ while (true) {
+ /*
+ * Loop invariant: node is red
+ *
+ * If there is a black parent, we are done.
+ * Otherwise, take some corrective action as we don't
+ * want a red root or two consecutive red nodes.
+ */
+ if (!parent) {
+ rb_set_parent_color(node, NULL, RB_BLACK);
+ break;
+ } else if (rb_is_black(parent))
+ break;
+
+ gparent = rb_red_parent(parent);
+
+ tmp = gparent->rb_right;
+ if (parent != tmp) { /* parent == gparent->rb_left */
+ if (tmp && rb_is_red(tmp)) {
+ /*
+ * Case 1 - color flips
+ *
+ * G g
+ * / \ / \
+ * p u --> P U
+ * / /
+ * n n
+ *
+ * However, since g's parent might be red, and
+ * 4) does not allow this, we need to recurse
+ * at g.
+ */
+ rb_set_parent_color(tmp, gparent, RB_BLACK);
+ rb_set_parent_color(parent, gparent, RB_BLACK);
+ node = gparent;
+ parent = rb_parent(node);
+ rb_set_parent_color(node, parent, RB_RED);
+ continue;
+ }
+
+ tmp = parent->rb_right;
+ if (node == tmp) {
+ /*
+ * Case 2 - left rotate at parent
+ *
+ * G G
+ * / \ / \
+ * p U --> n U
+ * \ /
+ * n p
+ *
+ * This still leaves us in violation of 4), the
+ * continuation into Case 3 will fix that.
+ */
+ tmp = node->rb_left;
+ WRITE_ONCE(parent->rb_right, tmp);
+ WRITE_ONCE(node->rb_left, parent);
+ if (tmp)
+ rb_set_parent_color(tmp, parent,
+ RB_BLACK);
+ rb_set_parent_color(parent, node, RB_RED);
+ augment_rotate(parent, node);
+ parent = node;
+ tmp = node->rb_right;
+ }
+
+ /*
+ * Case 3 - right rotate at gparent
+ *
+ * G P
+ * / \ / \
+ * p U --> n g
+ * / \
+ * n U
+ */
+ WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
+ WRITE_ONCE(parent->rb_right, gparent);
+ if (tmp)
+ rb_set_parent_color(tmp, gparent, RB_BLACK);
+ __rb_rotate_set_parents(gparent, parent, root, RB_RED);
+ augment_rotate(gparent, parent);
+ break;
+ } else {
+ tmp = gparent->rb_left;
+ if (tmp && rb_is_red(tmp)) {
+ /* Case 1 - color flips */
+ rb_set_parent_color(tmp, gparent, RB_BLACK);
+ rb_set_parent_color(parent, gparent, RB_BLACK);
+ node = gparent;
+ parent = rb_parent(node);
+ rb_set_parent_color(node, parent, RB_RED);
+ continue;
+ }
+
+ tmp = parent->rb_left;
+ if (node == tmp) {
+ /* Case 2 - right rotate at parent */
+ tmp = node->rb_right;
+ WRITE_ONCE(parent->rb_left, tmp);
+ WRITE_ONCE(node->rb_right, parent);
+ if (tmp)
+ rb_set_parent_color(tmp, parent,
+ RB_BLACK);
+ rb_set_parent_color(parent, node, RB_RED);
+ augment_rotate(parent, node);
+ parent = node;
+ tmp = node->rb_left;
+ }
+
+ /* Case 3 - left rotate at gparent */
+ WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
+ WRITE_ONCE(parent->rb_left, gparent);
+ if (tmp)
+ rb_set_parent_color(tmp, gparent, RB_BLACK);
+ __rb_rotate_set_parents(gparent, parent, root, RB_RED);
+ augment_rotate(gparent, parent);
+ break;
+ }
+ }
+}
+
+/*
+ * Inline version for rb_erase() use - we want to be able to inline
+ * and eliminate the dummy_rotate callback there
+ */
+static __always_inline void
+____rb_erase_color(struct rb_node *parent, struct rb_root *root,
+ void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
+{
+ struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
+
+ while (true) {
+ /*
+ * Loop invariants:
+ * - node is black (or NULL on first iteration)
+ * - node is not the root (parent is not NULL)
+ * - All leaf paths going through parent and node have a
+ * black node count that is 1 lower than other leaf paths.
+ */
+ sibling = parent->rb_right;
+ if (node != sibling) { /* node == parent->rb_left */
+ if (rb_is_red(sibling)) {
+ /*
+ * Case 1 - left rotate at parent
+ *
+ * P S
+ * / \ / \
+ * N s --> p Sr
+ * / \ / \
+ * Sl Sr N Sl
+ */
+ tmp1 = sibling->rb_left;
+ WRITE_ONCE(parent->rb_right, tmp1);
+ WRITE_ONCE(sibling->rb_left, parent);
+ rb_set_parent_color(tmp1, parent, RB_BLACK);
+ __rb_rotate_set_parents(parent, sibling, root,
+ RB_RED);
+ augment_rotate(parent, sibling);
+ sibling = tmp1;
+ }
+ tmp1 = sibling->rb_right;
+ if (!tmp1 || rb_is_black(tmp1)) {
+ tmp2 = sibling->rb_left;
+ if (!tmp2 || rb_is_black(tmp2)) {
+ /*
+ * Case 2 - sibling color flip
+ * (p could be either color here)
+ *
+ * (p) (p)
+ * / \ / \
+ * N S --> N s
+ * / \ / \
+ * Sl Sr Sl Sr
+ *
+ * This leaves us violating 5) which
+ * can be fixed by flipping p to black
+ * if it was red, or by recursing at p.
+ * p is red when coming from Case 1.
+ */
+ rb_set_parent_color(sibling, parent,
+ RB_RED);
+ if (rb_is_red(parent))
+ rb_set_black(parent);
+ else {
+ node = parent;
+ parent = rb_parent(node);
+ if (parent)
+ continue;
+ }
+ break;
+ }
+ /*
+ * Case 3 - right rotate at sibling
+ * (p could be either color here)
+ *
+ * (p) (p)
+ * / \ / \
+ * N S --> N Sl
+ * / \ \
+ * sl Sr s
+ * \
+ * Sr
+ */
+ tmp1 = tmp2->rb_right;
+ WRITE_ONCE(sibling->rb_left, tmp1);
+ WRITE_ONCE(tmp2->rb_right, sibling);
+ WRITE_ONCE(parent->rb_right, tmp2);
+ if (tmp1)
+ rb_set_parent_color(tmp1, sibling,
+ RB_BLACK);
+ augment_rotate(sibling, tmp2);
+ tmp1 = sibling;
+ sibling = tmp2;
+ }
+ /*
+ * Case 4 - left rotate at parent + color flips
+ * (p and sl could be either color here.
+ * After rotation, p becomes black, s acquires
+ * p's color, and sl keeps its color)
+ *
+ * (p) (s)
+ * / \ / \
+ * N S --> P Sr
+ * / \ / \
+ * (sl) sr N (sl)
+ */
+ tmp2 = sibling->rb_left;
+ WRITE_ONCE(parent->rb_right, tmp2);
+ WRITE_ONCE(sibling->rb_left, parent);
+ rb_set_parent_color(tmp1, sibling, RB_BLACK);
+ if (tmp2)
+ rb_set_parent(tmp2, parent);
+ __rb_rotate_set_parents(parent, sibling, root,
+ RB_BLACK);
+ augment_rotate(parent, sibling);
+ break;
+ } else {
+ sibling = parent->rb_left;
+ if (rb_is_red(sibling)) {
+ /* Case 1 - right rotate at parent */
+ tmp1 = sibling->rb_right;
+ WRITE_ONCE(parent->rb_left, tmp1);
+ WRITE_ONCE(sibling->rb_right, parent);
+ rb_set_parent_color(tmp1, parent, RB_BLACK);
+ __rb_rotate_set_parents(parent, sibling, root,
+ RB_RED);
+ augment_rotate(parent, sibling);
+ sibling = tmp1;
+ }
+ tmp1 = sibling->rb_left;
+ if (!tmp1 || rb_is_black(tmp1)) {
+ tmp2 = sibling->rb_right;
+ if (!tmp2 || rb_is_black(tmp2)) {
+ /* Case 2 - sibling color flip */
+ rb_set_parent_color(sibling, parent,
+ RB_RED);
+ if (rb_is_red(parent))
+ rb_set_black(parent);
+ else {
+ node = parent;
+ parent = rb_parent(node);
+ if (parent)
+ continue;
+ }
+ break;
+ }
+ /* Case 3 - right rotate at sibling */
+ tmp1 = tmp2->rb_left;
+ WRITE_ONCE(sibling->rb_right, tmp1);
+ WRITE_ONCE(tmp2->rb_left, sibling);
+ WRITE_ONCE(parent->rb_left, tmp2);
+ if (tmp1)
+ rb_set_parent_color(tmp1, sibling,
+ RB_BLACK);
+ augment_rotate(sibling, tmp2);
+ tmp1 = sibling;
+ sibling = tmp2;
+ }
+ /* Case 4 - left rotate at parent + color flips */
+ tmp2 = sibling->rb_right;
+ WRITE_ONCE(parent->rb_left, tmp2);
+ WRITE_ONCE(sibling->rb_right, parent);
+ rb_set_parent_color(tmp1, sibling, RB_BLACK);
+ if (tmp2)
+ rb_set_parent(tmp2, parent);
+ __rb_rotate_set_parents(parent, sibling, root,
+ RB_BLACK);
+ augment_rotate(parent, sibling);
+ break;
+ }
+ }
+}
+
+/* Non-inline version for rb_erase_augmented() use */
+void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
+ void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
+{
+ ____rb_erase_color(parent, root, augment_rotate);
+}
+EXPORT_SYMBOL(__rb_erase_color);
+
+/*
+ * Non-augmented rbtree manipulation functions.
+ *
+ * We use dummy augmented callbacks here, and have the compiler optimize them
+ * out of the rb_insert_color() and rb_erase() function definitions.
+ */
+
+static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
+static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
+static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
+
+static const struct rb_augment_callbacks dummy_callbacks = {
+ dummy_propagate, dummy_copy, dummy_rotate
+};
+
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
+{
+ __rb_insert(node, root, dummy_rotate);
+}
+EXPORT_SYMBOL(rb_insert_color);
+
+void rb_erase(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *rebalance;
+ rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
+ if (rebalance)
+ ____rb_erase_color(rebalance, root, dummy_rotate);
+}
+EXPORT_SYMBOL(rb_erase);
+
+/*
+ * Augmented rbtree manipulation functions.
+ *
+ * This instantiates the same __always_inline functions as in the non-augmented
+ * case, but this time with user-defined callbacks.
+ */
+
+void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+ void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
+{
+ __rb_insert(node, root, augment_rotate);
+}
+EXPORT_SYMBOL(__rb_insert_augmented);
+
+/*
+ * This function returns the first node (in sort order) of the tree.
+ */
+struct rb_node *rb_first(const struct rb_root *root)
+{
+ struct rb_node *n;
+
+ n = root->rb_node;
+ if (!n)
+ return NULL;
+ while (n->rb_left)
+ n = n->rb_left;
+ return n;
+}
+EXPORT_SYMBOL(rb_first);
+
+struct rb_node *rb_last(const struct rb_root *root)
+{
+ struct rb_node *n;
+
+ n = root->rb_node;
+ if (!n)
+ return NULL;
+ while (n->rb_right)
+ n = n->rb_right;
+ return n;
+}
+EXPORT_SYMBOL(rb_last);
+
+struct rb_node *rb_next(const struct rb_node *node)
+{
+ struct rb_node *parent;
+
+ if (RB_EMPTY_NODE(node))
+ return NULL;
+
+ /*
+ * If we have a right-hand child, go down and then left as far
+ * as we can.
+ */
+ if (node->rb_right) {
+ node = node->rb_right;
+ while (node->rb_left)
+ node = node->rb_left;
+ return (struct rb_node *)node;
+ }
+
+ /*
+ * No right-hand children. Everything down and left is smaller than us,
+ * so any 'next' node must be in the general direction of our parent.
+ * Go up the tree; any time the ancestor is a right-hand child of its
+ * parent, keep going up. First time it's a left-hand child of its
+ * parent, said parent is our 'next' node.
+ */
+ while ((parent = rb_parent(node)) && node == parent->rb_right)
+ node = parent;
+
+ return parent;
+}
+EXPORT_SYMBOL(rb_next);
+
+struct rb_node *rb_prev(const struct rb_node *node)
+{
+ struct rb_node *parent;
+
+ if (RB_EMPTY_NODE(node))
+ return NULL;
+
+ /*
+ * If we have a left-hand child, go down and then right as far
+ * as we can.
+ */
+ if (node->rb_left) {
+ node = node->rb_left;
+ while (node->rb_right)
+ node = node->rb_right;
+ return (struct rb_node *)node;
+ }
+
+ /*
+ * No left-hand children. Go up till we find an ancestor which
+ * is a right-hand child of its parent.
+ */
+ while ((parent = rb_parent(node)) && node == parent->rb_left)
+ node = parent;
+
+ return parent;
+}
+EXPORT_SYMBOL(rb_prev);
+
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root)
+{
+ struct rb_node *parent = rb_parent(victim);
+
+ /* Copy the pointers/colour from the victim to the replacement */
+ *new = *victim;
+
+ /* Set the surrounding nodes to point to the replacement */
+ if (victim->rb_left)
+ rb_set_parent(victim->rb_left, new);
+ if (victim->rb_right)
+ rb_set_parent(victim->rb_right, new);
+ __rb_change_child(victim, new, parent, root);
+}
+EXPORT_SYMBOL(rb_replace_node);
+
+void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root)
+{
+ struct rb_node *parent = rb_parent(victim);
+
+ /* Copy the pointers/colour from the victim to the replacement */
+ *new = *victim;
+
+ /* Set the surrounding nodes to point to the replacement */
+ if (victim->rb_left)
+ rb_set_parent(victim->rb_left, new);
+ if (victim->rb_right)
+ rb_set_parent(victim->rb_right, new);
+
+ /* Set the parent's pointer to the new node last after an RCU barrier
+ * so that the pointers onwards are seen to be set correctly when doing
+ * an RCU walk over the tree.
+ */
+ __rb_change_child_rcu(victim, new, parent, root);
+}
+EXPORT_SYMBOL(rb_replace_node_rcu);
+
+static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
+{
+ for (;;) {
+ if (node->rb_left)
+ node = node->rb_left;
+ else if (node->rb_right)
+ node = node->rb_right;
+ else
+ return (struct rb_node *)node;
+ }
+}
+
+struct rb_node *rb_next_postorder(const struct rb_node *node)
+{
+ const struct rb_node *parent;
+ if (!node)
+ return NULL;
+ parent = rb_parent(node);
+
+ /* If we're sitting on node, we've already seen our children */
+ if (parent && node == parent->rb_left && parent->rb_right) {
+ /* If we are the parent's left node, go to the parent's right
+ * node then all the way down to the left */
+ return rb_left_deepest_node(parent->rb_right);
+ } else
+ /* Otherwise we are the parent's right node, and the parent
+ * should be next */
+ return (struct rb_node *)parent;
+}
+EXPORT_SYMBOL(rb_next_postorder);
+
+struct rb_node *rb_first_postorder(const struct rb_root *root)
+{
+ if (!root->rb_node)
+ return NULL;
+
+ return rb_left_deepest_node(root->rb_node);
+}
+EXPORT_SYMBOL(rb_first_postorder);
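For reference, the usual insert pattern for this rbtree API (a sketch, not part of the patch): the caller does the binary search itself, links the new node with rb_link_node(), then calls rb_insert_color() above to rebalance. struct thing and its key are made up for illustration, and the helpers are assumed to come from this tree's <linux/rbtree.h> as in the kernel:

#include <linux/rbtree.h>

struct thing {
	struct rb_node	node;
	u64		key;
};

static void thing_insert(struct rb_root *root, struct thing *new)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;

	while (*p) {
		struct thing *t = rb_entry(*p, struct thing, node);

		parent = *p;
		p = new->key < t->key ? &(*p)->rb_left : &(*p)->rb_right;
	}

	rb_link_node(&new->node, parent, p);	/* link as a red leaf */
	rb_insert_color(&new->node, root);	/* rebalance (defined above) */
}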
diff --git a/linux/rhashtable.c b/linux/rhashtable.c
new file mode 100644
index 0000000..035d82a
--- /dev/null
+++ b/linux/rhashtable.c
@@ -0,0 +1,860 @@
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * Code partially derived from nft_hash
+ * Rewritten with rehash code from br_multicast plus single list
+ * pointer as suggested by Josh Triplett
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/rhashtable.h>
+#include <linux/err.h>
+#include <linux/export.h>
+
+#define HASH_DEFAULT_SIZE 64UL
+#define HASH_MIN_SIZE 4U
+#define BUCKET_LOCKS_PER_CPU 32UL
+
+static u32 head_hashfn(struct rhashtable *ht,
+ const struct bucket_table *tbl,
+ const struct rhash_head *he)
+{
+ return rht_head_hashfn(ht, tbl, he, ht->p);
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
+
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+ return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
+
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+ spinlock_t *lock = rht_bucket_lock(tbl, hash);
+
+ return (debug_locks) ? lockdep_is_held(lock) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
+#else
+#define ASSERT_RHT_MUTEX(HT)
+#endif
+
+
+static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
+ gfp_t gfp)
+{
+ unsigned int i, size;
+#if defined(CONFIG_PROVE_LOCKING)
+ unsigned int nr_pcpus = 2;
+#else
+ unsigned int nr_pcpus = num_possible_cpus();
+#endif
+
+ nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
+ size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
+
+ /* Never allocate more than 0.5 locks per bucket */
+ size = min_t(unsigned int, size, tbl->size >> 1);
+
+ if (sizeof(spinlock_t) != 0) {
+ tbl->locks = NULL;
+#ifdef CONFIG_NUMA
+ if (size * sizeof(spinlock_t) > PAGE_SIZE &&
+ gfp == GFP_KERNEL)
+ tbl->locks = vmalloc(size * sizeof(spinlock_t));
+#endif
+ if (gfp != GFP_KERNEL)
+ gfp |= __GFP_NOWARN | __GFP_NORETRY;
+
+ if (!tbl->locks)
+ tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
+ gfp);
+ if (!tbl->locks)
+ return -ENOMEM;
+ for (i = 0; i < size; i++)
+ spin_lock_init(&tbl->locks[i]);
+ }
+ tbl->locks_mask = size - 1;
+
+ return 0;
+}
+
+static void bucket_table_free(struct bucket_table *tbl)
+{
+ if (tbl)
+ kvfree(tbl->locks);
+
+ kvfree(tbl);
+}
+
+static void bucket_table_free_rcu(struct rcu_head *head)
+{
+ bucket_table_free(container_of(head, struct bucket_table, rcu));
+}
+
+static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
+ size_t nbuckets,
+ gfp_t gfp)
+{
+ struct bucket_table *tbl = NULL;
+ size_t size;
+ int i;
+
+ size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
+ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
+ gfp != GFP_KERNEL)
+ tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
+ if (tbl == NULL && gfp == GFP_KERNEL)
+ tbl = vzalloc(size);
+ if (tbl == NULL)
+ return NULL;
+
+ tbl->size = nbuckets;
+
+ if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
+ bucket_table_free(tbl);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&tbl->walkers);
+
+ get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+
+ for (i = 0; i < nbuckets; i++)
+ INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);
+
+ return tbl;
+}
+
+static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
+ struct bucket_table *tbl)
+{
+ struct bucket_table *new_tbl;
+
+ do {
+ new_tbl = tbl;
+ tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ } while (tbl);
+
+ return new_tbl;
+}
+
+static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
+{
+ struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
+ struct bucket_table *new_tbl = rhashtable_last_table(ht,
+ rht_dereference_rcu(old_tbl->future_tbl, ht));
+ struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
+ int err = -ENOENT;
+ struct rhash_head *head, *next, *entry;
+ spinlock_t *new_bucket_lock;
+ unsigned int new_hash;
+
+ rht_for_each(entry, old_tbl, old_hash) {
+ err = 0;
+ next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
+
+ if (rht_is_a_nulls(next))
+ break;
+
+ pprev = &entry->next;
+ }
+
+ if (err)
+ goto out;
+
+ new_hash = head_hashfn(ht, new_tbl, entry);
+
+ new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
+
+ spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
+ head = rht_dereference_bucket(new_tbl->buckets[new_hash],
+ new_tbl, new_hash);
+
+ RCU_INIT_POINTER(entry->next, head);
+
+ rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
+ spin_unlock(new_bucket_lock);
+
+ rcu_assign_pointer(*pprev, next);
+
+out:
+ return err;
+}
+
+static void rhashtable_rehash_chain(struct rhashtable *ht,
+ unsigned int old_hash)
+{
+ struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
+ spinlock_t *old_bucket_lock;
+
+ old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
+
+ spin_lock_bh(old_bucket_lock);
+ while (!rhashtable_rehash_one(ht, old_hash))
+ ;
+ old_tbl->rehash++;
+ spin_unlock_bh(old_bucket_lock);
+}
+
+static int rhashtable_rehash_attach(struct rhashtable *ht,
+ struct bucket_table *old_tbl,
+ struct bucket_table *new_tbl)
+{
+ /* Protect future_tbl using the first bucket lock. */
+ spin_lock_bh(old_tbl->locks);
+
+ /* Did somebody beat us to it? */
+ if (rcu_access_pointer(old_tbl->future_tbl)) {
+ spin_unlock_bh(old_tbl->locks);
+ return -EEXIST;
+ }
+
+ /* Make insertions go into the new, empty table right away. Deletions
+ * and lookups will be attempted in both tables until we synchronize.
+ */
+ rcu_assign_pointer(old_tbl->future_tbl, new_tbl);
+
+ spin_unlock_bh(old_tbl->locks);
+
+ return 0;
+}
+
+static int rhashtable_rehash_table(struct rhashtable *ht)
+{
+ struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
+ struct bucket_table *new_tbl;
+ struct rhashtable_walker *walker;
+ unsigned int old_hash;
+
+ new_tbl = rht_dereference(old_tbl->future_tbl, ht);
+ if (!new_tbl)
+ return 0;
+
+ for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
+ rhashtable_rehash_chain(ht, old_hash);
+
+ /* Publish the new table pointer. */
+ rcu_assign_pointer(ht->tbl, new_tbl);
+
+ spin_lock(&ht->lock);
+ list_for_each_entry(walker, &old_tbl->walkers, list)
+ walker->tbl = NULL;
+ spin_unlock(&ht->lock);
+
+ /* Wait for readers. All new readers will see the new
+ * table, and thus no references to the old table will
+ * remain.
+ */
+ call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+
+ return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
+}
+
+/**
+ * rhashtable_expand - Expand hash table while allowing concurrent lookups
+ * @ht: the hash table to expand
+ *
+ * A secondary bucket array is allocated and the hash entries are migrated.
+ *
+ * This function may only be called in a context where it is safe to call
+ * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
+ *
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
+ */
+static int rhashtable_expand(struct rhashtable *ht)
+{
+ struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+ int err;
+
+ ASSERT_RHT_MUTEX(ht);
+
+ old_tbl = rhashtable_last_table(ht, old_tbl);
+
+ new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
+ if (new_tbl == NULL)
+ return -ENOMEM;
+
+ err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
+ if (err)
+ bucket_table_free(new_tbl);
+
+ return err;
+}
+
+/**
+ * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
+ * @ht: the hash table to shrink
+ *
+ * This function shrinks the hash table to fit, i.e., to the smallest
+ * size that would not cause it to expand again right away.
+ *
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
+ * The caller must ensure that no concurrent table mutations take place.
+ * It is however valid to have concurrent lookups if they are RCU protected.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
+ */
+static int rhashtable_shrink(struct rhashtable *ht)
+{
+ struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+ unsigned int nelems = atomic_read(&ht->nelems);
+ unsigned int size = 0;
+ int err;
+
+ ASSERT_RHT_MUTEX(ht);
+
+ if (nelems)
+ size = roundup_pow_of_two(nelems * 3 / 2);
+ if (size < ht->p.min_size)
+ size = ht->p.min_size;
+
+ if (old_tbl->size <= size)
+ return 0;
+
+ if (rht_dereference(old_tbl->future_tbl, ht))
+ return -EEXIST;
+
+ new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
+ if (new_tbl == NULL)
+ return -ENOMEM;
+
+ err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
+ if (err)
+ bucket_table_free(new_tbl);
+
+ return err;
+}
+
+static void rht_deferred_worker(struct work_struct *work)
+{
+ struct rhashtable *ht;
+ struct bucket_table *tbl;
+ int err = 0;
+
+ ht = container_of(work, struct rhashtable, run_work);
+ mutex_lock(&ht->mutex);
+
+ tbl = rht_dereference(ht->tbl, ht);
+ tbl = rhashtable_last_table(ht, tbl);
+
+ if (rht_grow_above_75(ht, tbl))
+ rhashtable_expand(ht);
+ else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
+ rhashtable_shrink(ht);
+
+ err = rhashtable_rehash_table(ht);
+
+ mutex_unlock(&ht->mutex);
+
+ if (err)
+ schedule_work(&ht->run_work);
+}
+
+static bool rhashtable_check_elasticity(struct rhashtable *ht,
+ struct bucket_table *tbl,
+ unsigned int hash)
+{
+ unsigned int elasticity = ht->elasticity;
+ struct rhash_head *head;
+
+ rht_for_each(head, tbl, hash)
+ if (!--elasticity)
+ return true;
+
+ return false;
+}
+
+int rhashtable_insert_rehash(struct rhashtable *ht,
+ struct bucket_table *tbl)
+{
+ struct bucket_table *old_tbl;
+ struct bucket_table *new_tbl;
+ unsigned int size;
+ int err;
+
+ old_tbl = rht_dereference_rcu(ht->tbl, ht);
+
+ size = tbl->size;
+
+ err = -EBUSY;
+
+ if (rht_grow_above_75(ht, tbl))
+ size *= 2;
+ /* Do not schedule more than one rehash */
+ else if (old_tbl != tbl)
+ goto fail;
+
+ err = -ENOMEM;
+
+ new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
+ if (new_tbl == NULL)
+ goto fail;
+
+ err = rhashtable_rehash_attach(ht, tbl, new_tbl);
+ if (err) {
+ bucket_table_free(new_tbl);
+ if (err == -EEXIST)
+ err = 0;
+ } else
+ schedule_work(&ht->run_work);
+
+ return err;
+
+fail:
+ /* Do not fail the insert if someone else did a rehash. */
+ if (likely(rcu_dereference_raw(tbl->future_tbl)))
+ return 0;
+
+ /* Schedule async rehash to retry allocation in process context. */
+ if (err == -ENOMEM)
+ schedule_work(&ht->run_work);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(rhashtable_insert_rehash);
+
+struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
+ const void *key,
+ struct rhash_head *obj,
+ struct bucket_table *tbl)
+{
+ struct rhash_head *head;
+ unsigned int hash;
+ int err;
+
+ tbl = rhashtable_last_table(ht, tbl);
+ hash = head_hashfn(ht, tbl, obj);
+ spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
+
+ err = -EEXIST;
+ if (key && rhashtable_lookup_fast(ht, key, ht->p))
+ goto exit;
+
+ err = -E2BIG;
+ if (unlikely(rht_grow_above_max(ht, tbl)))
+ goto exit;
+
+ err = -EAGAIN;
+ if (rhashtable_check_elasticity(ht, tbl, hash) ||
+ rht_grow_above_100(ht, tbl))
+ goto exit;
+
+ err = 0;
+
+ head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+
+ RCU_INIT_POINTER(obj->next, head);
+
+ rcu_assign_pointer(tbl->buckets[hash], obj);
+
+ atomic_inc(&ht->nelems);
+
+exit:
+ spin_unlock(rht_bucket_lock(tbl, hash));
+
+ if (err == 0)
+ return NULL;
+ else if (err == -EAGAIN)
+ return tbl;
+ else
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
+
+/**
+ * rhashtable_walk_init - Initialise an iterator
+ * @ht: Table to walk over
+ * @iter: Hash table Iterator
+ * @gfp: GFP flags for allocations
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice. Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may sleep so you must not call it from interrupt
+ * context or with spin locks held.
+ *
+ * You must call rhashtable_walk_exit if this function returns
+ * successfully.
+ */
+int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
+ gfp_t gfp)
+{
+ iter->ht = ht;
+ iter->p = NULL;
+ iter->slot = 0;
+ iter->skip = 0;
+
+ iter->walker = kmalloc(sizeof(*iter->walker), gfp);
+ if (!iter->walker)
+ return -ENOMEM;
+
+ spin_lock(&ht->lock);
+ iter->walker->tbl =
+ rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
+ list_add(&iter->walker->list, &iter->walker->tbl->walkers);
+ spin_unlock(&ht->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_init);
+
+/**
+ * rhashtable_walk_exit - Free an iterator
+ * @iter: Hash table Iterator
+ *
+ * This function frees resources allocated by rhashtable_walk_init.
+ */
+void rhashtable_walk_exit(struct rhashtable_iter *iter)
+{
+ spin_lock(&iter->ht->lock);
+ if (iter->walker->tbl)
+ list_del(&iter->walker->list);
+ spin_unlock(&iter->ht->lock);
+ kfree(iter->walker);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
+
+/**
+ * rhashtable_walk_start - Start a hash table walk
+ * @iter: Hash table iterator
+ *
+ * Start a hash table walk. Note that we take the RCU lock in all
+ * cases including when we return an error. So you must always call
+ * rhashtable_walk_stop to clean up.
+ *
+ * Returns zero if successful.
+ *
+ * Returns -EAGAIN if a resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may use it immediately
+ * by calling rhashtable_walk_next.
+ */
+int rhashtable_walk_start(struct rhashtable_iter *iter)
+ __acquires(RCU)
+{
+ struct rhashtable *ht = iter->ht;
+
+ rcu_read_lock();
+
+ spin_lock(&ht->lock);
+ if (iter->walker->tbl)
+ list_del(&iter->walker->list);
+ spin_unlock(&ht->lock);
+
+ if (!iter->walker->tbl) {
+ iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_start);
+
+/**
+ * rhashtable_walk_next - Return the next object and advance the iterator
+ * @iter: Hash table iterator
+ *
+ * Note that you must call rhashtable_walk_stop when you are finished
+ * with the walk.
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_next(struct rhashtable_iter *iter)
+{
+ struct bucket_table *tbl = iter->walker->tbl;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+
+ if (p) {
+ p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot);
+ goto next;
+ }
+
+ for (; iter->slot < tbl->size; iter->slot++) {
+ int skip = iter->skip;
+
+ rht_for_each_rcu(p, tbl, iter->slot) {
+ if (!skip)
+ break;
+ skip--;
+ }
+
+next:
+ if (!rht_is_a_nulls(p)) {
+ iter->skip++;
+ iter->p = p;
+ return rht_obj(ht, p);
+ }
+
+ iter->skip = 0;
+ }
+
+ iter->p = NULL;
+
+ /* Ensure we see any new tables. */
+ smp_rmb();
+
+ iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (iter->walker->tbl) {
+ iter->slot = 0;
+ iter->skip = 0;
+ return ERR_PTR(-EAGAIN);
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_next);
+
+/**
+ * rhashtable_walk_stop - Finish a hash table walk
+ * @iter: Hash table iterator
+ *
+ * Finish a hash table walk.
+ */
+void rhashtable_walk_stop(struct rhashtable_iter *iter)
+ __releases(RCU)
+{
+ struct rhashtable *ht;
+ struct bucket_table *tbl = iter->walker->tbl;
+
+ if (!tbl)
+ goto out;
+
+ ht = iter->ht;
+
+ spin_lock(&ht->lock);
+ if (tbl->rehash < tbl->size)
+ list_add(&iter->walker->list, &tbl->walkers);
+ else
+ iter->walker->tbl = NULL;
+ spin_unlock(&ht->lock);
+
+ iter->p = NULL;
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
+
+static size_t rounded_hashtable_size(const struct rhashtable_params *params)
+{
+ return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
+ (unsigned long)params->min_size);
+}
+
+static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
+{
+ return jhash2(key, length, seed);
+}
+
+/**
+ * rhashtable_init - initialize a new hash table
+ * @ht: hash table to be initialized
+ * @params: configuration parameters
+ *
+ * Initializes a new hash table based on the provided configuration
+ * parameters. A table can be configured either with a variable or
+ * fixed length key:
+ *
+ * Configuration Example 1: Fixed length keys
+ * struct test_obj {
+ * int key;
+ * void * my_member;
+ * struct rhash_head node;
+ * };
+ *
+ * struct rhashtable_params params = {
+ * .head_offset = offsetof(struct test_obj, node),
+ * .key_offset = offsetof(struct test_obj, key),
+ * .key_len = sizeof(int),
+ * .hashfn = jhash,
+ * .nulls_base = (1U << RHT_BASE_SHIFT),
+ * };
+ *
+ * Configuration Example 2: Variable length keys
+ * struct test_obj {
+ * [...]
+ * struct rhash_head node;
+ * };
+ *
+ * u32 my_hash_fn(const void *data, u32 len, u32 seed)
+ * {
+ * struct test_obj *obj = data;
+ *
+ * return [... hash ...];
+ * }
+ *
+ * struct rhashtable_params params = {
+ * .head_offset = offsetof(struct test_obj, node),
+ * .hashfn = jhash,
+ * .obj_hashfn = my_hash_fn,
+ * };
+ */
+int rhashtable_init(struct rhashtable *ht,
+ const struct rhashtable_params *params)
+{
+ struct bucket_table *tbl;
+ size_t size;
+
+ size = HASH_DEFAULT_SIZE;
+
+ if ((!params->key_len && !params->obj_hashfn) ||
+ (params->obj_hashfn && !params->obj_cmpfn))
+ return -EINVAL;
+
+ if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
+ return -EINVAL;
+
+ memset(ht, 0, sizeof(*ht));
+ mutex_init(&ht->mutex);
+ spin_lock_init(&ht->lock);
+ memcpy(&ht->p, params, sizeof(*params));
+
+ if (params->min_size)
+ ht->p.min_size = roundup_pow_of_two(params->min_size);
+
+ if (params->max_size)
+ ht->p.max_size = rounddown_pow_of_two(params->max_size);
+
+ if (params->insecure_max_entries)
+ ht->p.insecure_max_entries =
+ rounddown_pow_of_two(params->insecure_max_entries);
+ else
+ ht->p.insecure_max_entries = ht->p.max_size * 2;
+
+ ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
+
+ if (params->nelem_hint)
+ size = rounded_hashtable_size(&ht->p);
+
+ /* The maximum (not average) chain length grows with the
+ * size of the hash table, at a rate of (log N)/(log log N).
+ * The value of 16 is selected so that even if the hash
+ * table grew to 2^32 you would not expect the maximum
+ * chain length to exceed it unless we are under attack
+ * (or extremely unlucky).
+ *
+ * As this limit is only to detect attacks, we don't need
+ * to set it to a lower value as you'd need the chain
+ * length to vastly exceed 16 to have any real effect
+ * on the system.
+ */
+ if (!params->insecure_elasticity)
+ ht->elasticity = 16;
+
+ if (params->locks_mul)
+ ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
+ else
+ ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
+
+ ht->key_len = ht->p.key_len;
+ if (!params->hashfn) {
+ ht->p.hashfn = jhash;
+
+ if (!(ht->key_len & (sizeof(u32) - 1))) {
+ ht->key_len /= sizeof(u32);
+ ht->p.hashfn = rhashtable_jhash2;
+ }
+ }
+
+ tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
+ if (tbl == NULL)
+ return -ENOMEM;
+
+ atomic_set(&ht->nelems, 0);
+
+ RCU_INIT_POINTER(ht->tbl, tbl);
+
+ INIT_WORK(&ht->run_work, rht_deferred_worker);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_init);
+
+/**
+ * rhashtable_free_and_destroy - free elements and destroy hash table
+ * @ht: the hash table to destroy
+ * @free_fn: callback to release resources of element
+ * @arg: pointer passed to free_fn
+ *
+ * Stops any pending async resize. If defined, invokes free_fn for each
+ * element to release its resources. Please note that RCU-protected
+ * readers may still be accessing the elements; resources must be
+ * released in a compatible manner. Then frees the bucket array.
+ *
+ * This function will sleep if needed to wait for an async resize
+ * to complete. The caller is responsible for ensuring that no further
+ * write operations occur in parallel.
+ */
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg)
+{
+ struct bucket_table *tbl;
+ unsigned int i;
+
+ cancel_work_sync(&ht->run_work);
+
+ mutex_lock(&ht->mutex);
+ tbl = rht_dereference(ht->tbl, ht);
+ if (free_fn) {
+ for (i = 0; i < tbl->size; i++) {
+ struct rhash_head *pos, *next;
+
+ for (pos = rht_dereference(tbl->buckets[i], ht),
+ next = !rht_is_a_nulls(pos) ?
+ rht_dereference(pos->next, ht) : NULL;
+ !rht_is_a_nulls(pos);
+ pos = next,
+ next = !rht_is_a_nulls(pos) ?
+ rht_dereference(pos->next, ht) : NULL)
+ free_fn(rht_obj(ht, pos), arg);
+ }
+ }
+
+ bucket_table_free(tbl);
+ mutex_unlock(&ht->mutex);
+}
+EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
+
+void rhashtable_destroy(struct rhashtable *ht)
+{
+ return rhashtable_free_and_destroy(ht, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(rhashtable_destroy);
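For reference, a usage sketch of the fast paths this table is driven through (not part of the patch; struct obj and obj_params are hypothetical, and rhashtable_lookup_fast()/rhashtable_lookup_insert_fast() are assumed to be the inline helpers from this tree's <linux/rhashtable.h>, as in the kernel). The table itself is set up with rhashtable_init(ht, &obj_params):

#include <linux/rhashtable.h>

struct obj {
	u32			key;
	struct rhash_head	node;
};

static const struct rhashtable_params obj_params = {
	.head_offset		= offsetof(struct obj, node),
	.key_offset		= offsetof(struct obj, key),
	.key_len		= sizeof(u32),
	.automatic_shrinking	= true,
};

static int obj_add(struct rhashtable *ht, struct obj *o)
{
	/* fails with -EEXIST if an object with the same key is present */
	return rhashtable_lookup_insert_fast(ht, &o->node, obj_params);
}

static struct obj *obj_find(struct rhashtable *ht, u32 key)
{
	/* caller must be in an RCU read-side section */
	return rhashtable_lookup_fast(ht, &key, obj_params);
}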
diff --git a/linux/sched.c b/linux/sched.c
new file mode 100644
index 0000000..11480f3
--- /dev/null
+++ b/linux/sched.c
@@ -0,0 +1,178 @@
+
+#include <string.h>
+
+#include <linux/math64.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+
+__thread struct task_struct *current;
+
+void __put_task_struct(struct task_struct *t)
+{
+ pthread_join(t->thread, NULL);
+ free(t);
+}
+
+/* returns true if process was woken up, false if it was already running */
+int wake_up_process(struct task_struct *p)
+{
+ int ret;
+
+ pthread_mutex_lock(&p->lock);
+ ret = p->state != TASK_RUNNING;
+ p->state = TASK_RUNNING;
+
+ pthread_cond_signal(&p->wait);
+ pthread_mutex_unlock(&p->lock);
+
+ return ret;
+}
+
+void schedule(void)
+{
+ rcu_quiescent_state();
+
+ pthread_mutex_lock(&current->lock);
+
+ while (current->state != TASK_RUNNING)
+ pthread_cond_wait(&current->wait, &current->lock);
+
+ pthread_mutex_unlock(&current->lock);
+}
+
+static void process_timeout(unsigned long __data)
+{
+ wake_up_process((struct task_struct *)__data);
+}
+
+long schedule_timeout(long timeout)
+{
+ struct timer_list timer;
+ unsigned long expire;
+
+ switch (timeout)
+ {
+ case MAX_SCHEDULE_TIMEOUT:
+ /*
+ * These two special cases are purely for the caller's
+ * convenience. We could have taken MAX_SCHEDULE_TIMEOUT
+ * from one of the negative values, but returning a valid
+ * offset (>= 0) lets the caller do whatever it wants with
+ * the retval.
+ */
+ schedule();
+ goto out;
+ default:
+ /*
+ * Another bit of PARANOID. Note that the retval will be
+ * 0 since no piece of kernel is supposed to do a check
+ * for a negative retval of schedule_timeout() (since it
+ * should never happens anyway). You just have the printk()
+ * that will tell you if something is gone wrong and where.
+ */
+ if (timeout < 0) {
+ printk(KERN_ERR "schedule_timeout: wrong timeout "
+ "value %lx\n", timeout);
+ current->state = TASK_RUNNING;
+ goto out;
+ }
+ }
+
+ expire = timeout + jiffies;
+
+ setup_timer(&timer, process_timeout, (unsigned long)current);
+ mod_timer(&timer, expire);
+ schedule();
+ del_timer_sync(&timer);
+
+ timeout = expire - jiffies;
+out:
+ return timeout < 0 ? 0 : timeout;
+}
+
+unsigned long __msecs_to_jiffies(const unsigned int m)
+{
+ /*
+ * A negative value means an infinite timeout:
+ */
+ if ((int)m < 0)
+ return MAX_JIFFY_OFFSET;
+ return _msecs_to_jiffies(m);
+}
+
+u64 nsecs_to_jiffies64(u64 n)
+{
+#if (NSEC_PER_SEC % HZ) == 0
+ /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
+ return div_u64(n, NSEC_PER_SEC / HZ);
+#elif (HZ % 512) == 0
+ /* overflow after 292 years if HZ = 1024 */
+ return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
+#else
+ /*
+ * Generic case - optimized for cases where HZ is a multiple of 3.
+ * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
+ */
+ return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
+#endif
+}
+
+unsigned long nsecs_to_jiffies(u64 n)
+{
+ return (unsigned long)nsecs_to_jiffies64(n);
+}
+
+unsigned int jiffies_to_msecs(const unsigned long j)
+{
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ return (MSEC_PER_SEC / HZ) * j;
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+ return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
+#else
+# if BITS_PER_LONG == 32
+ return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
+# else
+ return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
+# endif
+#endif
+}
+
+unsigned int jiffies_to_usecs(const unsigned long j)
+{
+ /*
+ * HZ usually doesn't go much beyond MSEC_PER_SEC.
+ * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
+ */
+ BUILD_BUG_ON(HZ > USEC_PER_SEC);
+
+#if !(USEC_PER_SEC % HZ)
+ return (USEC_PER_SEC / HZ) * j;
+#else
+# if BITS_PER_LONG == 32
+ return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
+# else
+ return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
+# endif
+#endif
+}
+
+__attribute__((constructor(101)))
+static void sched_init(void)
+{
+ struct task_struct *p = malloc(sizeof(*p));
+
+ memset(p, 0, sizeof(*p));
+
+ p->state = TASK_RUNNING;
+ pthread_mutex_init(&p->lock, NULL);
+ pthread_cond_init(&p->wait, NULL);
+ atomic_set(&p->usage, 1);
+ init_completion(&p->exited);
+
+ current = p;
+
+ rcu_init();
+ rcu_register_thread();
+}
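A sketch of the sleep/wake pattern these primitives support, roughly what the semaphore and workqueue code later in this patch does. struct foo_waiter is hypothetical; set_current_state()/__set_current_state() are assumed to come from the sched.h shim (they are used elsewhere in this patch), and memory-ordering details are glossed over. For bounded waits, schedule_timeout() above replaces the plain schedule() call.

struct foo_waiter {
	struct task_struct	*task;	/* filled in with 'current' by the waiter */
	bool			done;
};

static void foo_wait(struct foo_waiter *w)
{
	w->task = current;

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (w->done)
			break;
		schedule();		/* sleeps until wake_up_process() */
	}
	__set_current_state(TASK_RUNNING);
}

static void foo_complete(struct foo_waiter *w)
{
	w->done = true;
	wake_up_process(w->task);	/* no-op if the waiter is already running */
}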
diff --git a/linux/semaphore.c b/linux/semaphore.c
new file mode 100644
index 0000000..6561dd2
--- /dev/null
+++ b/linux/semaphore.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2008 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * Distributed under the terms of the GNU GPL, version 2
+ *
+ * This file implements counting semaphores.
+ * A counting semaphore may be acquired 'n' times before sleeping.
+ * See mutex.c for single-acquisition sleeping locks which enforce
+ * rules which allow code to be debugged more easily.
+ */
+
+/*
+ * Some notes on the implementation:
+ *
+ * The spinlock controls access to the other members of the semaphore.
+ * down_trylock() and up() can be called from interrupt context, so we
+ * have to disable interrupts when taking the lock. It turns out various
+ * parts of the kernel expect to be able to use down() on a semaphore in
+ * interrupt context when they know it will succeed, so we have to use
+ * irqsave variants for down(), down_interruptible() and down_killable()
+ * too.
+ *
+ * The ->count variable represents how many more tasks can acquire this
+ * semaphore. If it's zero, there may be tasks waiting on the wait_list.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/spinlock.h>
+
+static noinline void __down(struct semaphore *sem);
+static noinline int __down_interruptible(struct semaphore *sem);
+static noinline int __down_killable(struct semaphore *sem);
+static noinline int __down_timeout(struct semaphore *sem, long timeout);
+static noinline void __up(struct semaphore *sem);
+
+/**
+ * down - acquire the semaphore
+ * @sem: the semaphore to be acquired
+ *
+ * Acquires the semaphore. If no more tasks are allowed to acquire the
+ * semaphore, calling this function will put the task to sleep until the
+ * semaphore is released.
+ *
+ * Use of this function is deprecated, please use down_interruptible() or
+ * down_killable() instead.
+ */
+void down(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ __down(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(down);
+
+/**
+ * down_interruptible - acquire the semaphore unless interrupted
+ * @sem: the semaphore to be acquired
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the sleep is interrupted by a signal, this function will return -EINTR.
+ * If the semaphore is successfully acquired, this function returns 0.
+ */
+int down_interruptible(struct semaphore *sem)
+{
+ unsigned long flags;
+ int result = 0;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_interruptible(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_interruptible);
+
+/**
+ * down_killable - acquire the semaphore unless killed
+ * @sem: the semaphore to be acquired
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the sleep is interrupted by a fatal signal, this function will return
+ * -EINTR. If the semaphore is successfully acquired, this function returns
+ * 0.
+ */
+int down_killable(struct semaphore *sem)
+{
+ unsigned long flags;
+ int result = 0;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_killable(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_killable);
+
+/**
+ * down_trylock - try to acquire the semaphore, without waiting
+ * @sem: the semaphore to be acquired
+ *
+ * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
+ * been acquired successfully or 1 if it cannot be acquired.
+ *
+ * NOTE: This return value is inverted from both spin_trylock and
+ * mutex_trylock! Be careful about this when converting code.
+ *
+ * Unlike mutex_trylock, this function can be used from interrupt context,
+ * and the semaphore can be released by any task or interrupt.
+ */
+int down_trylock(struct semaphore *sem)
+{
+ unsigned long flags;
+ int count;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ count = sem->count - 1;
+ if (likely(count >= 0))
+ sem->count = count;
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return (count < 0);
+}
+EXPORT_SYMBOL(down_trylock);
+
+/**
+ * down_timeout - acquire the semaphore within a specified time
+ * @sem: the semaphore to be acquired
+ * @timeout: how long to wait before failing
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the semaphore is not released within the specified number of jiffies,
+ * this function returns -ETIME. It returns 0 if the semaphore was acquired.
+ */
+int down_timeout(struct semaphore *sem, long timeout)
+{
+ unsigned long flags;
+ int result = 0;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_timeout(sem, timeout);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_timeout);
+
+/**
+ * up - release the semaphore
+ * @sem: the semaphore to release
+ *
+ * Release the semaphore. Unlike mutexes, up() may be called from any
+ * context and even by tasks which have never called down().
+ */
+void up(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(list_empty(&sem->wait_list)))
+ sem->count++;
+ else
+ __up(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(up);
+
+/* Functions for the contended case */
+
+struct semaphore_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ bool up;
+};
+
+/*
+ * Because this function is inlined, the 'state' parameter will be
+ * constant, and thus optimised away by the compiler. Likewise the
+ * 'timeout' parameter for the cases without timeouts.
+ */
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ struct task_struct *task = current;
+ struct semaphore_waiter waiter;
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+ waiter.task = task;
+ waiter.up = false;
+
+ for (;;) {
+ if (unlikely(timeout <= 0))
+ goto timed_out;
+ __set_task_state(task, state);
+ raw_spin_unlock_irq(&sem->lock);
+ timeout = schedule_timeout(timeout);
+ raw_spin_lock_irq(&sem->lock);
+ if (waiter.up)
+ return 0;
+ }
+
+ timed_out:
+ list_del(&waiter.list);
+ return -1;
+}
+
+static noinline void __sched __down(struct semaphore *sem)
+{
+ __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_interruptible(struct semaphore *sem)
+{
+ return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_killable(struct semaphore *sem)
+{
+ return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
+{
+ return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
+}
+
+static noinline void __sched __up(struct semaphore *sem)
+{
+ struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
+ struct semaphore_waiter, list);
+ list_del(&waiter->list);
+ waiter->up = true;
+ wake_up_process(waiter->task);
+}
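A short usage sketch of the counting-semaphore API above: limit a hypothetical work path to four concurrent holders. sema_init() is assumed to be provided by the semaphore.h shim.

static struct semaphore foo_sem;

static void foo_sem_setup(void)
{
	sema_init(&foo_sem, 4);		/* up to four tasks at once */
}

static int foo_do_work(void)
{
	if (down_interruptible(&foo_sem))
		return -EINTR;		/* interrupted while waiting */

	/* ... at most four tasks execute this section concurrently ... */

	up(&foo_sem);
	return 0;
}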
diff --git a/linux/sha1.c b/linux/sha1.c
new file mode 100644
index 0000000..5a56dfd
--- /dev/null
+++ b/linux/sha1.c
@@ -0,0 +1,201 @@
+/*
+ * SHA1 routine optimized to do word accesses rather than byte accesses,
+ * and to avoid unnecessary copies into the context array.
+ *
+ * This was based on the git SHA1 implementation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/cryptohash.h>
+#include <asm/unaligned.h>
+
+/*
+ * If you have 32 registers or more, the compiler can (and should)
+ * try to change the array[] accesses into registers. However, on
+ * machines with less than ~25 registers, that won't really work,
+ * and at least gcc will make an unholy mess of it.
+ *
+ * So to avoid that mess which just slows things down, we force
+ * the stores to memory to actually happen (we might be better off
+ * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
+ * suggested by Artur Skawina - that will also make gcc unable to
+ * try to do the silly "optimize away loads" part because it won't
+ * see what the value will be).
+ *
+ * Ben Herrenschmidt reports that on PPC, the C version comes close
+ * to the optimized asm with this (ie on PPC you don't want that
+ * 'volatile', since there are lots of registers).
+ *
+ * On ARM we get the best code generation by forcing a full memory barrier
+ * between each SHA_ROUND, otherwise gcc happily gets wild with spilling and
+ * the stack frame size simply explodes and performance goes down the drain.
+ */
+
+#ifdef CONFIG_X86
+ #define setW(x, val) (*(volatile __u32 *)&W(x) = (val))
+#elif defined(CONFIG_ARM)
+ #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0)
+#else
+ #define setW(x, val) (W(x) = (val))
+#endif
+
+/* This "rolls" over the 512-bit array */
+#define W(x) (array[(x)&15])
+
+/*
+ * Where do we get the source from? The first 16 iterations get it from
+ * the input data, the next mix it from the 512-bit array.
+ */
+#define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t)
+#define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1)
+
+#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \
+ __u32 TEMP = input(t); setW(t, TEMP); \
+ E += TEMP + rol32(A,5) + (fn) + (constant); \
+ B = ror32(B, 2); } while (0)
+
+#define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
+#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
+#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E )
+#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E )
+#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E )
+
+/**
+ * sha_transform - single block SHA1 transform
+ *
+ * @digest: 160 bit digest to update
+ * @data: 512 bits of data to hash
+ * @array: 16 words of workspace (see note)
+ *
+ * This function generates a SHA1 digest for a single 512-bit block.
+ * Be warned, it does not handle padding or finalization; do not
+ * confuse it with the full FIPS 180-1 digest algorithm for
+ * variable-length messages.
+ *
+ * Note: If the hash is security sensitive, the caller should be sure
+ * to clear the workspace. This is left to the caller to avoid
+ * unnecessary clears between chained hashing operations.
+ */
+void sha_transform(__u32 *digest, const char *data, __u32 *array)
+{
+ __u32 A, B, C, D, E;
+
+ A = digest[0];
+ B = digest[1];
+ C = digest[2];
+ D = digest[3];
+ E = digest[4];
+
+ /* Round 1 - iterations 0-15 take their input from 'data' */
+ T_0_15( 0, A, B, C, D, E);
+ T_0_15( 1, E, A, B, C, D);
+ T_0_15( 2, D, E, A, B, C);
+ T_0_15( 3, C, D, E, A, B);
+ T_0_15( 4, B, C, D, E, A);
+ T_0_15( 5, A, B, C, D, E);
+ T_0_15( 6, E, A, B, C, D);
+ T_0_15( 7, D, E, A, B, C);
+ T_0_15( 8, C, D, E, A, B);
+ T_0_15( 9, B, C, D, E, A);
+ T_0_15(10, A, B, C, D, E);
+ T_0_15(11, E, A, B, C, D);
+ T_0_15(12, D, E, A, B, C);
+ T_0_15(13, C, D, E, A, B);
+ T_0_15(14, B, C, D, E, A);
+ T_0_15(15, A, B, C, D, E);
+
+ /* Round 1 - tail. Input from 512-bit mixing array */
+ T_16_19(16, E, A, B, C, D);
+ T_16_19(17, D, E, A, B, C);
+ T_16_19(18, C, D, E, A, B);
+ T_16_19(19, B, C, D, E, A);
+
+ /* Round 2 */
+ T_20_39(20, A, B, C, D, E);
+ T_20_39(21, E, A, B, C, D);
+ T_20_39(22, D, E, A, B, C);
+ T_20_39(23, C, D, E, A, B);
+ T_20_39(24, B, C, D, E, A);
+ T_20_39(25, A, B, C, D, E);
+ T_20_39(26, E, A, B, C, D);
+ T_20_39(27, D, E, A, B, C);
+ T_20_39(28, C, D, E, A, B);
+ T_20_39(29, B, C, D, E, A);
+ T_20_39(30, A, B, C, D, E);
+ T_20_39(31, E, A, B, C, D);
+ T_20_39(32, D, E, A, B, C);
+ T_20_39(33, C, D, E, A, B);
+ T_20_39(34, B, C, D, E, A);
+ T_20_39(35, A, B, C, D, E);
+ T_20_39(36, E, A, B, C, D);
+ T_20_39(37, D, E, A, B, C);
+ T_20_39(38, C, D, E, A, B);
+ T_20_39(39, B, C, D, E, A);
+
+ /* Round 3 */
+ T_40_59(40, A, B, C, D, E);
+ T_40_59(41, E, A, B, C, D);
+ T_40_59(42, D, E, A, B, C);
+ T_40_59(43, C, D, E, A, B);
+ T_40_59(44, B, C, D, E, A);
+ T_40_59(45, A, B, C, D, E);
+ T_40_59(46, E, A, B, C, D);
+ T_40_59(47, D, E, A, B, C);
+ T_40_59(48, C, D, E, A, B);
+ T_40_59(49, B, C, D, E, A);
+ T_40_59(50, A, B, C, D, E);
+ T_40_59(51, E, A, B, C, D);
+ T_40_59(52, D, E, A, B, C);
+ T_40_59(53, C, D, E, A, B);
+ T_40_59(54, B, C, D, E, A);
+ T_40_59(55, A, B, C, D, E);
+ T_40_59(56, E, A, B, C, D);
+ T_40_59(57, D, E, A, B, C);
+ T_40_59(58, C, D, E, A, B);
+ T_40_59(59, B, C, D, E, A);
+
+ /* Round 4 */
+ T_60_79(60, A, B, C, D, E);
+ T_60_79(61, E, A, B, C, D);
+ T_60_79(62, D, E, A, B, C);
+ T_60_79(63, C, D, E, A, B);
+ T_60_79(64, B, C, D, E, A);
+ T_60_79(65, A, B, C, D, E);
+ T_60_79(66, E, A, B, C, D);
+ T_60_79(67, D, E, A, B, C);
+ T_60_79(68, C, D, E, A, B);
+ T_60_79(69, B, C, D, E, A);
+ T_60_79(70, A, B, C, D, E);
+ T_60_79(71, E, A, B, C, D);
+ T_60_79(72, D, E, A, B, C);
+ T_60_79(73, C, D, E, A, B);
+ T_60_79(74, B, C, D, E, A);
+ T_60_79(75, A, B, C, D, E);
+ T_60_79(76, E, A, B, C, D);
+ T_60_79(77, D, E, A, B, C);
+ T_60_79(78, C, D, E, A, B);
+ T_60_79(79, B, C, D, E, A);
+
+ digest[0] += A;
+ digest[1] += B;
+ digest[2] += C;
+ digest[3] += D;
+ digest[4] += E;
+}
+EXPORT_SYMBOL(sha_transform);
+
+/**
+ * sha_init - initialize the vectors for a SHA1 digest
+ * @buf: vector to initialize
+ */
+void sha_init(__u32 *buf)
+{
+ buf[0] = 0x67452301;
+ buf[1] = 0xefcdab89;
+ buf[2] = 0x98badcfe;
+ buf[3] = 0x10325476;
+ buf[4] = 0xc3d2e1f0;
+}
+EXPORT_SYMBOL(sha_init);
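A small sketch of hashing one already-padded 512-bit block with the two helpers above. The 16-word workspace size matches SHA_WORKSPACE_WORDS in cryptohash.h; as the comment on sha_transform() warns, padding and length encoding per FIPS 180-1 are the caller's job and are not shown here.

static void foo_sha1_block(const char block[64], __u32 digest[5])
{
	__u32 workspace[16];		/* SHA_WORKSPACE_WORDS */

	sha_init(digest);
	sha_transform(digest, block, workspace);
	memzero_explicit(workspace, sizeof(workspace));	/* see linux/string.c below */
}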
diff --git a/linux/sort.c b/linux/sort.c
new file mode 100644
index 0000000..15e8d11
--- /dev/null
+++ b/linux/sort.c
@@ -0,0 +1,143 @@
+/*
+ * A fast, small, non-recursive O(n log n) sort for the Linux kernel
+ *
+ * Jan 23 2005 Matt Mackall <mpm@selenic.com>
+ */
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/sort.h>
+
+static int alignment_ok(const void *base, int align)
+{
+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+ ((unsigned long)base & (align - 1)) == 0;
+}
+
+static void u32_swap(void *a, void *b, int size)
+{
+ u32 t = *(u32 *)a;
+ *(u32 *)a = *(u32 *)b;
+ *(u32 *)b = t;
+}
+
+static void u64_swap(void *a, void *b, int size)
+{
+ u64 t = *(u64 *)a;
+ *(u64 *)a = *(u64 *)b;
+ *(u64 *)b = t;
+}
+
+static void generic_swap(void *a, void *b, int size)
+{
+ char t;
+
+ do {
+ t = *(char *)a;
+ *(char *)a++ = *(char *)b;
+ *(char *)b++ = t;
+ } while (--size > 0);
+}
+
+/**
+ * sort - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp_func: pointer to comparison function
+ * @swap_func: pointer to swap function or NULL
+ *
+ * This function does a heapsort on the given array. You may provide a
+ * swap_func function optimized to your element type.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * qsort is about 20% faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+
+void sort(void *base, size_t num, size_t size,
+ int (*cmp_func)(const void *, const void *),
+ void (*swap_func)(void *, void *, int size))
+{
+ /* pre-scale counters for performance */
+ int i = (num/2 - 1) * size, n = num * size, c, r;
+
+ if (!swap_func) {
+ if (size == 4 && alignment_ok(base, 4))
+ swap_func = u32_swap;
+ else if (size == 8 && alignment_ok(base, 8))
+ swap_func = u64_swap;
+ else
+ swap_func = generic_swap;
+ }
+
+ /* heapify */
+ for ( ; i >= 0; i -= size) {
+ for (r = i; r * 2 + size < n; r = c) {
+ c = r * 2 + size;
+ if (c < n - size &&
+ cmp_func(base + c, base + c + size) < 0)
+ c += size;
+ if (cmp_func(base + r, base + c) >= 0)
+ break;
+ swap_func(base + r, base + c, size);
+ }
+ }
+
+ /* sort */
+ for (i = n - size; i > 0; i -= size) {
+ swap_func(base, base + i, size);
+ for (r = 0; r * 2 + size < i; r = c) {
+ c = r * 2 + size;
+ if (c < i - size &&
+ cmp_func(base + c, base + c + size) < 0)
+ c += size;
+ if (cmp_func(base + r, base + c) >= 0)
+ break;
+ swap_func(base + r, base + c, size);
+ }
+ }
+}
+
+EXPORT_SYMBOL(sort);
+
+#if 0
+#include <linux/slab.h>
+/* a simple boot-time regression test */
+
+int cmpint(const void *a, const void *b)
+{
+ return *(int *)a - *(int *)b;
+}
+
+static int sort_test(void)
+{
+ int *a, i, r = 1;
+
+ a = kmalloc(1000 * sizeof(int), GFP_KERNEL);
+ BUG_ON(!a);
+
+ printk("testing sort()\n");
+
+ for (i = 0; i < 1000; i++) {
+ r = (r * 725861) % 6599;
+ a[i] = r;
+ }
+
+ sort(a, 1000, sizeof(int), cmpint, NULL);
+
+ for (i = 0; i < 999; i++)
+ if (a[i] > a[i+1]) {
+ printk("sort() failed!\n");
+ break;
+ }
+
+ kfree(a);
+
+ return 0;
+}
+
+module_init(sort_test);
+#endif
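Beyond the integer self-test above, a sketch of sorting an array of structs with the default swap; struct foo is hypothetical. The comparison avoids the classic "return a - b" overflow trap on 64-bit keys.

struct foo {
	u64	key;
	u32	val;
};

static int foo_cmp(const void *_l, const void *_r)
{
	const struct foo *l = _l, *r = _r;

	if (l->key != r->key)
		return l->key < r->key ? -1 : 1;
	return 0;
}

static void foo_sort(struct foo *foos, size_t nr)
{
	/* NULL swap_func: sort() falls back to generic_swap() for this size */
	sort(foos, nr, sizeof(foos[0]), foo_cmp, NULL);
}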
diff --git a/linux/string.c b/linux/string.c
new file mode 100644
index 0000000..0f23f07
--- /dev/null
+++ b/linux/string.c
@@ -0,0 +1,97 @@
+/*
+ * linux/lib/string.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * stupid library routines.. The optimized versions should generally be found
+ * as inline code in <asm-xx/string.h>
+ *
+ * These are buggy as well..
+ *
+ * * Fri Jun 25 1999, Ingo Oeser <ioe@informatik.tu-chemnitz.de>
+ * - Added strsep() which will replace strtok() soon (because strsep() is
+ * reentrant and should be faster). Use only strsep() in new code, please.
+ *
+ * * Sat Feb 09 2002, Jason Thomas <jason@topic.com.au>,
+ * Matthew Hawkins <matt@mh.dropbear.id.au>
+ * - Kissed strtok() goodbye
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/bug.h>
+#include <linux/errno.h>
+
+#include <string.h>
+
+/**
+ * skip_spaces - Removes leading whitespace from @str.
+ * @str: The string to be stripped.
+ *
+ * Returns a pointer to the first non-whitespace character in @str.
+ */
+char *skip_spaces(const char *str)
+{
+ while (isspace(*str))
+ ++str;
+ return (char *)str;
+}
+
+/**
+ * strim - Removes leading and trailing whitespace from @s.
+ * @s: The string to be stripped.
+ *
+ * Note that the first trailing whitespace is replaced with a %NUL-terminator
+ * in the given string @s. Returns a pointer to the first non-whitespace
+ * character in @s.
+ */
+char *strim(char *s)
+{
+ size_t size;
+ char *end;
+
+ size = strlen(s);
+ if (!size)
+ return s;
+
+ end = s + size - 1;
+ while (end >= s && isspace(*end))
+ end--;
+ *(end + 1) = '\0';
+
+ return skip_spaces(s);
+}
+
+/**
+ * strlcpy - Copy a C-string into a sized buffer
+ * @dest: Where to copy the string to
+ * @src: Where to copy the string from
+ * @size: size of destination buffer
+ *
+ * Compatible with *BSD: the result is always a valid
+ * NUL-terminated string that fits in the buffer (unless,
+ * of course, the buffer size is zero). It does not pad
+ * out the result like strncpy() does.
+ */
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+ size_t ret = strlen(src);
+
+ if (size) {
+ size_t len = (ret >= size) ? size - 1 : ret;
+ memcpy(dest, src, len);
+ dest[len] = '\0';
+ }
+ return ret;
+}
+
+void memzero_explicit(void *s, size_t count)
+{
+ memset(s, 0, count);
+ barrier_data(s);
+}
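A sketch of the usual strlcpy() truncation check: unlike strncpy(), the return value is the full source length, so comparing it against the buffer size detects truncation. struct foo and its fixed-size name field are hypothetical.

struct foo {
	char	name[16];
};

static int foo_set_name(struct foo *f, const char *name)
{
	/* strlcpy() returns strlen(name); >= sizeof() means it was truncated */
	if (strlcpy(f->name, name, sizeof(f->name)) >= sizeof(f->name))
		return -ENAMETOOLONG;
	return 0;
}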
diff --git a/linux/timer.c b/linux/timer.c
new file mode 100644
index 0000000..557af09
--- /dev/null
+++ b/linux/timer.c
@@ -0,0 +1,311 @@
+
+#include <pthread.h>
+#include <signal.h>
+#include <time.h>
+
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/timer.h>
+
+/**
+ * timespec_add_ns - Adds nanoseconds to a timespec
+ * @a: pointer to timespec to be incremented
+ * @ns: unsigned nanoseconds value to be added
+ *
+ * This must always be inlined because it's used from the x86-64 vdso,
+ * which cannot call other kernel functions.
+ */
+static struct timespec timespec_add_ns(struct timespec a, u64 ns)
+{
+ a.tv_nsec += ns;
+ a.tv_sec += a.tv_nsec / NSEC_PER_SEC;
+ a.tv_nsec %= NSEC_PER_SEC;
+ return a;
+}
+
+#define DECLARE_HEAP(type) \
+struct { \
+ size_t size, used; \
+ type *data; \
+}
+
+#define heap_init(heap, _size) \
+({ \
+ size_t _bytes; \
+ (heap)->used = 0; \
+ (heap)->size = (_size); \
+ _bytes = (heap)->size * sizeof(*(heap)->data); \
+ (heap)->data = malloc(_bytes); \
+ (heap)->data; \
+})
+
+#define heap_free(heap) \
+do { \
+ kvfree((heap)->data); \
+ (heap)->data = NULL; \
+} while (0)
+
+#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
+
+#define heap_sift(h, i, cmp) \
+do { \
+ size_t _r, _j = i; \
+ \
+ for (; _j * 2 + 1 < (h)->used; _j = _r) { \
+ _r = _j * 2 + 1; \
+ if (_r + 1 < (h)->used && \
+ cmp((h)->data[_r], (h)->data[_r + 1])) \
+ _r++; \
+ \
+ if (cmp((h)->data[_r], (h)->data[_j])) \
+ break; \
+ heap_swap(h, _r, _j); \
+ } \
+} while (0)
+
+#define heap_sift_down(h, i, cmp) \
+do { \
+ while (i) { \
+ size_t p = (i - 1) / 2; \
+ if (cmp((h)->data[i], (h)->data[p])) \
+ break; \
+ heap_swap(h, i, p); \
+ i = p; \
+ } \
+} while (0)
+
+#define heap_add(h, d, cmp) \
+({ \
+ bool _r = !heap_full(h); \
+ if (_r) { \
+ size_t _i = (h)->used++; \
+ (h)->data[_i] = d; \
+ \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_del(h, i, cmp) \
+do { \
+ size_t _i = (i); \
+ \
+ BUG_ON(_i >= (h)->used); \
+ (h)->used--; \
+ heap_swap(h, _i, (h)->used); \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+} while (0)
+
+#define heap_pop(h, d, cmp) \
+({ \
+ bool _r = (h)->used; \
+ if (_r) { \
+ (d) = (h)->data[0]; \
+ heap_del(h, 0, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_peek(h) ((h)->used ? &(h)->data[0] : NULL)
+#define heap_full(h) ((h)->used == (h)->size)
+#define heap_empty(h) ((h)->used == 0)
+
+#define heap_resort(heap, cmp) \
+do { \
+ ssize_t _i; \
+ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \
+ heap_sift(heap, _i, cmp); \
+} while (0)
+
+struct pending_timer {
+ struct timer_list *timer;
+ unsigned long expires;
+};
+
+static inline bool pending_timer_cmp(struct pending_timer a,
+ struct pending_timer b)
+{
+ return a.expires < b.expires;
+}
+
+static DECLARE_HEAP(struct pending_timer) pending_timers;
+
+static pthread_mutex_t timer_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t timer_cond = PTHREAD_COND_INITIALIZER;
+static pthread_cond_t timer_running_cond = PTHREAD_COND_INITIALIZER;
+static unsigned long timer_seq;
+
+static inline bool timer_running(void)
+{
+ return timer_seq & 1;
+}
+
+static size_t timer_idx(struct timer_list *timer)
+{
+ size_t i;
+
+ for (i = 0; i < pending_timers.size; i++)
+ if (pending_timers.data[i].timer == timer)
+ return i;
+ BUG();
+}
+
+int del_timer(struct timer_list *timer)
+{
+ int pending;
+
+ pthread_mutex_lock(&timer_lock);
+ pending = timer_pending(timer);
+ timer->pending = false;
+
+ if (pending)
+ heap_del(&pending_timers, timer_idx(timer), pending_timer_cmp);
+
+ pthread_mutex_unlock(&timer_lock);
+
+ return pending;
+}
+
+void flush_timers(void)
+{
+ unsigned long seq;
+
+ pthread_mutex_lock(&timer_lock);
+ seq = timer_seq;
+ while (timer_running() && seq == timer_seq)
+ pthread_cond_wait(&timer_running_cond, &timer_lock);
+
+ pthread_mutex_unlock(&timer_lock);
+}
+
+int del_timer_sync(struct timer_list *timer)
+{
+ unsigned long seq;
+ int pending;
+
+ pthread_mutex_lock(&timer_lock);
+ pending = timer_pending(timer);
+ timer->pending = false;
+
+ if (pending)
+ heap_del(&pending_timers, timer_idx(timer), pending_timer_cmp);
+
+ seq = timer_seq;
+ while (timer_running() && seq == timer_seq)
+ pthread_cond_wait(&timer_running_cond, &timer_lock);
+
+ pthread_mutex_unlock(&timer_lock);
+
+ return pending;
+}
+
+int mod_timer(struct timer_list *timer, unsigned long expires)
+{
+ int pending;
+ size_t i;
+
+ pthread_mutex_lock(&timer_lock);
+ pending = timer_pending(timer);
+
+ if (pending && timer->expires == expires)
+ goto out;
+
+ timer->expires = expires;
+ timer->pending = true;
+
+ if (pending) {
+ i = timer_idx(timer);
+ pending_timers.data[i].expires = expires;
+
+ heap_sift_down(&pending_timers, i, pending_timer_cmp);
+ heap_sift(&pending_timers, i, pending_timer_cmp);
+ } else {
+ if (heap_full(&pending_timers)) {
+ pending_timers.size *= 2;
+ pending_timers.data =
+ realloc(pending_timers.data,
+ pending_timers.size *
+ sizeof(struct pending_timer));
+
+ BUG_ON(!pending_timers.data);
+ }
+
+ heap_add(&pending_timers,
+ ((struct pending_timer) {
+ .timer = timer,
+ .expires = expires,
+ }),
+ pending_timer_cmp);
+ }
+
+ pthread_cond_signal(&timer_cond);
+out:
+ pthread_mutex_unlock(&timer_lock);
+
+ return pending;
+}
+
+static int timer_thread(void *arg)
+{
+ struct pending_timer *p;
+ struct timespec ts;
+ unsigned long now;
+ int ret;
+
+ pthread_mutex_lock(&timer_lock);
+
+ while (1) {
+ now = jiffies;
+ p = heap_peek(&pending_timers);
+
+ if (!p) {
+ pthread_cond_wait(&timer_cond, &timer_lock);
+ continue;
+ }
+
+ if (time_after_eq(now, p->expires)) {
+ struct timer_list *timer = p->timer;
+
+ heap_del(&pending_timers, 0, pending_timer_cmp);
+ BUG_ON(!timer_pending(timer));
+ timer->pending = false;
+
+ timer_seq++;
+ BUG_ON(!timer_running());
+
+ pthread_mutex_unlock(&timer_lock);
+ timer->function(timer->data);
+ pthread_mutex_lock(&timer_lock);
+
+ timer_seq++;
+ pthread_cond_broadcast(&timer_running_cond);
+ continue;
+ }
+
+ ret = clock_gettime(CLOCK_REALTIME, &ts);
+ BUG_ON(ret);
+
+ ts = timespec_add_ns(ts, jiffies_to_nsecs(p->expires - now));
+
+ pthread_cond_timedwait(&timer_cond, &timer_lock, &ts);
+ }
+
+ pthread_mutex_unlock(&timer_lock);
+
+ return 0;
+}
+
+__attribute__((constructor(103)))
+static void timers_init(void)
+{
+ struct task_struct *p;
+
+ heap_init(&pending_timers, 64);
+ BUG_ON(!pending_timers.data);
+
+ p = kthread_run(timer_thread, NULL, "timers");
+ BUG_ON(IS_ERR(p));
+}
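A sketch of a one-shot deadline built on the helpers above (the same setup_timer()/mod_timer()/del_timer_sync() sequence schedule_timeout() in linux/sched.c uses). struct foo, its flags word and FOO_TIMED_OUT are hypothetical; the callback runs in the timer thread, so it must stay short and must not block.

struct foo {
	unsigned long		flags;
	struct timer_list	timeout;
};

#define FOO_TIMED_OUT	0

static void foo_timed_out(unsigned long data)
{
	struct foo *f = (struct foo *) data;

	set_bit(FOO_TIMED_OUT, &f->flags);	/* checked by the main path */
}

static void foo_timeout_init(struct foo *f)
{
	setup_timer(&f->timeout, foo_timed_out, (unsigned long) f);
}

static void foo_timeout_kick(struct foo *f)
{
	mod_timer(&f->timeout, jiffies + HZ / 10);	/* roughly 100ms from now */
}

static void foo_timeout_cancel(struct foo *f)
{
	del_timer_sync(&f->timeout);
}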
diff --git a/linux/vsprintf.c b/linux/vsprintf.c
new file mode 100644
index 0000000..3642200
--- /dev/null
+++ b/linux/vsprintf.c
@@ -0,0 +1,75 @@
+#include <linux/kernel.h>
+#include "kstrtox.h"
+
+/**
+ * simple_strtoull - convert a string to an unsigned long long
+ * @cp: The start of the string
+ * @endp: A pointer to the end of the parsed string will be placed here
+ * @base: The number base to use
+ *
+ * This function is obsolete. Please use kstrtoull instead.
+ */
+unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
+{
+ unsigned long long result;
+ unsigned int rv;
+
+ cp = _parse_integer_fixup_radix(cp, &base);
+ rv = _parse_integer(cp, base, &result);
+ /* FIXME */
+ cp += (rv & ~KSTRTOX_OVERFLOW);
+
+ if (endp)
+ *endp = (char *)cp;
+
+ return result;
+}
+EXPORT_SYMBOL(simple_strtoull);
+
+/**
+ * simple_strtoul - convert a string to an unsigned long
+ * @cp: The start of the string
+ * @endp: A pointer to the end of the parsed string will be placed here
+ * @base: The number base to use
+ *
+ * This function is obsolete. Please use kstrtoul instead.
+ */
+unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base)
+{
+ return simple_strtoull(cp, endp, base);
+}
+EXPORT_SYMBOL(simple_strtoul);
+
+/**
+ * simple_strtol - convert a string to a signed long
+ * @cp: The start of the string
+ * @endp: A pointer to the end of the parsed string will be placed here
+ * @base: The number base to use
+ *
+ * This function is obsolete. Please use kstrtol instead.
+ */
+long simple_strtol(const char *cp, char **endp, unsigned int base)
+{
+ if (*cp == '-')
+ return -simple_strtoul(cp + 1, endp, base);
+
+ return simple_strtoul(cp, endp, base);
+}
+EXPORT_SYMBOL(simple_strtol);
+
+/**
+ * simple_strtoll - convert a string to a signed long long
+ * @cp: The start of the string
+ * @endp: A pointer to the end of the parsed string will be placed here
+ * @base: The number base to use
+ *
+ * This function is obsolete. Please use kstrtoll instead.
+ */
+long long simple_strtoll(const char *cp, char **endp, unsigned int base)
+{
+ if (*cp == '-')
+ return -simple_strtoull(cp + 1, endp, base);
+
+ return simple_strtoull(cp, endp, base);
+}
+EXPORT_SYMBOL(simple_strtoll);
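A quick sketch of the endp/base-detection behaviour of the (obsolete, per the comments above) simple_strtoul(): with base 0 the radix is taken from the usual 0x/0 prefix, and *endp is left pointing at the first unparsed character. New code should prefer the kstrto*() helpers.

static unsigned long foo_parse_ul(const char *str)
{
	char *end;
	unsigned long v = simple_strtoul(str, &end, 0);	/* 0: auto-detect base */

	if (end == str)
		return 0;	/* no digits were consumed */

	/* e.g. "0x1a foo" yields v == 26 with end pointing at " foo" */
	return v;
}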
diff --git a/linux/wait.c b/linux/wait.c
new file mode 100644
index 0000000..83f4e85
--- /dev/null
+++ b/linux/wait.c
@@ -0,0 +1,616 @@
+/*
+ * Generic waiting primitives.
+ *
+ * (C) 2004 Nadia Yvette Chambers, Oracle
+ */
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+#include <linux/kthread.h>
+
+void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
+{
+ spin_lock_init(&q->lock);
+ lockdep_set_class_and_name(&q->lock, key, name);
+ INIT_LIST_HEAD(&q->task_list);
+}
+
+EXPORT_SYMBOL(__init_waitqueue_head);
+
+void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue);
+
+void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue_tail(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive);
+
+void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __remove_wait_queue(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(remove_wait_queue);
+
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ wait_queue_t *curr, *next;
+
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+ unsigned flags = curr->flags;
+
+ if (curr->func(curr, mode, wake_flags, key) &&
+ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ break;
+ }
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+{
+ __wake_up_common(q, mode, nr, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked);
+
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+ __wake_up_common(q, mode, 1, 0, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+
+/**
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs in that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+ int wake_flags = 1; /* XXX WF_SYNC */
+
+ if (unlikely(!q))
+ return;
+
+ if (unlikely(nr_exclusive != 1))
+ wake_flags = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+ __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+
+/*
+ * Note: we use "set_current_state()" _after_ the wait-queue add,
+ * because we need a memory barrier there on SMP, so that any
+ * wake-function that tests for the wait-queue being active
+ * will be guaranteed to see waitqueue addition _or_ subsequent
+ * tests in this thread will see the wakeup having taken place.
+ *
+ * The spin_unlock() itself is semi-permeable and only protects
+ * one way (it only protects stuff inside the critical region and
+ * stops them from bleeding out - it would still allow subsequent
+ * loads to move into the critical region).
+ */
+void
+prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list))
+ __add_wait_queue(q, wait);
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait);
+
+void
+prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list))
+ __add_wait_queue_tail(q, wait);
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive);
+
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ wait->private = current;
+ wait->func = autoremove_wake_function;
+
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list)) {
+ if (wait->flags & WQ_FLAG_EXCLUSIVE)
+ __add_wait_queue_tail(q, wait);
+ else
+ __add_wait_queue(q, wait);
+ }
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(prepare_to_wait_event);
+
+/**
+ * finish_wait - clean up after waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ */
+void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ /*
+ * We can check for list emptiness outside the lock
+ * IFF:
+ * - we use the "careful" check that verifies both
+ * the next and prev pointers, so that there cannot
+ * be any half-pending updates in progress on other
+ * CPUs that we haven't seen yet (and that might
+ * still change the stack area),
+ * and
+ * - all other users take the lock (ie we can only
+ * have _one_ other CPU that looks at or modifies
+ * the list).
+ */
+ if (!list_empty_careful(&wait->task_list)) {
+ spin_lock_irqsave(&q->lock, flags);
+ list_del_init(&wait->task_list);
+ spin_unlock_irqrestore(&q->lock, flags);
+ }
+}
+EXPORT_SYMBOL(finish_wait);
+
+/**
+ * abort_exclusive_wait - abort exclusive waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ * @mode: runstate of the waiter to be woken
+ * @key: key to identify a wait bit queue or %NULL
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ *
+ * Wakes up the next waiter if the caller is concurrently
+ * woken up through the queue.
+ *
+ * This prevents waiter starvation where an exclusive waiter
+ * aborts and is woken up concurrently and no one wakes up
+ * the next waiter.
+ */
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
+ unsigned int mode, void *key)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ spin_lock_irqsave(&q->lock, flags);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+ else if (waitqueue_active(q))
+ __wake_up_locked_key(q, mode, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(abort_exclusive_wait);
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+ void *key)
+{
+ return wake_up_process(curr->private);
+}
+
+int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ int ret = default_wake_function(wait, mode, sync, key);
+
+ if (ret)
+ list_del_init(&wait->task_list);
+ return ret;
+}
+EXPORT_SYMBOL(autoremove_wake_function);
+
+static inline bool is_kthread_should_stop(void)
+{
+ return (current->flags & PF_KTHREAD) && kthread_should_stop();
+}
+
+/*
+ * DEFINE_WAIT_FUNC(wait, woken_wake_func);
+ *
+ * add_wait_queue(&wq, &wait);
+ * for (;;) {
+ * if (condition)
+ * break;
+ *
+ * p->state = mode; condition = true;
+ * smp_mb(); // A smp_wmb(); // C
+ * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN;
+ * schedule() try_to_wake_up();
+ * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
+ * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
+ * smp_mb() // B smp_wmb(); // C
+ * wait->flags |= WQ_FLAG_WOKEN;
+ * }
+ * remove_wait_queue(&wq, &wait);
+ *
+ */
+long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
+{
+ set_current_state(mode); /* A */
+ /*
+ * The above implies an smp_mb(), which matches with the smp_wmb() from
+ * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
+ * also observe all state before the wakeup.
+ */
+ if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ timeout = schedule_timeout(timeout);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * The below implies an smp_mb(), it too pairs with the smp_wmb() from
+ * woken_wake_function() such that we must either observe the wait
+ * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
+ * an event.
+ */
+ smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+
+ return timeout;
+}
+EXPORT_SYMBOL(wait_woken);
+
+int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ /*
+ * Although this function is called under waitqueue lock, LOCK
+ * doesn't imply a write barrier and users expect write
+ * barrier semantics on wakeup functions. The following
+ * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+ * and is paired with smp_store_mb() in wait_woken().
+ */
+ smp_wmb(); /* C */
+ wait->flags |= WQ_FLAG_WOKEN;
+
+ return default_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(woken_wake_function);
+
+int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue *wait_bit
+ = container_of(wait, struct wait_bit_queue, wait);
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ test_bit(key->bit_nr, key->flags))
+ return 0;
+ else
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(wake_bit_function);
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * waiting, the action functions passed to __wait_on_bit() and
+ * __wait_on_bit_lock() may return nonzero codes, which halt waiting
+ * and are returned to the caller.
+ */
+int __sched
+__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ wait_bit_action_f *action, unsigned mode)
+{
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq, &q->wait, mode);
+ if (test_bit(q->key.bit_nr, q->key.flags))
+ ret = (*action)(&q->key, mode);
+ } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
+ finish_wait(wq, &q->wait);
+ return ret;
+}
+EXPORT_SYMBOL(__wait_on_bit);
+
+int __sched out_of_line_wait_on_bit(void *word, int bit,
+ wait_bit_action_f *action, unsigned mode)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit);
+
+int __sched out_of_line_wait_on_bit_timeout(
+ void *word, int bit, wait_bit_action_f *action,
+ unsigned mode, unsigned long timeout)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ wait.key.timeout = jiffies + timeout;
+ return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
+
+int __sched
+__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ wait_bit_action_f *action, unsigned mode)
+{
+ do {
+ int ret;
+
+ prepare_to_wait_exclusive(wq, &q->wait, mode);
+ if (!test_bit(q->key.bit_nr, q->key.flags))
+ continue;
+ ret = action(&q->key, mode);
+ if (!ret)
+ continue;
+ abort_exclusive_wait(wq, &q->wait, mode, &q->key);
+ return ret;
+ } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
+ finish_wait(wq, &q->wait);
+ return 0;
+}
+EXPORT_SYMBOL(__wait_on_bit_lock);
+
+int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+ wait_bit_action_f *action, unsigned mode)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ return __wait_on_bit_lock(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
+
+void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
+{
+ struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+ if (waitqueue_active(wq))
+ __wake_up(wq, TASK_NORMAL, 1, &key);
+}
+EXPORT_SYMBOL(__wake_up_bit);
+
+/**
+ * wake_up_bit - wake up a waiter on a bit
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that wakes up waiters
+ * on a bit. For instance, if one were to have waiters on a bitflag,
+ * one would call wake_up_bit() after clearing the bit.
+ *
+ * In order for this to function properly, as it uses waitqueue_active()
+ * internally, some kind of memory barrier must be done prior to calling
+ * this. Typically, this will be smp_mb__after_atomic(), but in some
+ * cases where bitflags are manipulated non-atomically under a lock, one
+ * may need to use a less regular barrier, such as fs/inode.c's smp_mb(),
+ * because spin_unlock() does not guarantee a memory barrier.
+ */
+void wake_up_bit(void *word, int bit)
+{
+ __wake_up_bit(bit_waitqueue(word, bit), word, bit);
+}
+EXPORT_SYMBOL(wake_up_bit);
+
+static DECLARE_WAIT_QUEUE_HEAD(__bit_waitqueue);
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ return &__bit_waitqueue;
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
+/*
+ * Manipulate the atomic_t address to produce a better bit waitqueue table hash
+ * index (we're keying off bit -1, but that would produce a horrible hash
+ * value).
+ */
+static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
+{
+ if (BITS_PER_LONG == 64) {
+ unsigned long q = (unsigned long)p;
+ return bit_waitqueue((void *)(q & ~1), q & 1);
+ }
+ return bit_waitqueue(p, 0);
+}
+
+static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
+ void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue *wait_bit
+ = container_of(wait, struct wait_bit_queue, wait);
+ atomic_t *val = key->flags;
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ atomic_read(val) != 0)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
+ * the action function passed to __wait_on_atomic_t() may return nonzero
+ * codes, which halt waiting and are returned to the caller.
+ */
+static __sched
+int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ int (*action)(atomic_t *), unsigned mode)
+{
+ atomic_t *val;
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq, &q->wait, mode);
+ val = q->key.flags;
+ if (atomic_read(val) == 0)
+ break;
+ ret = (*action)(val);
+ } while (!ret && atomic_read(val) != 0);
+ finish_wait(wq, &q->wait);
+ return ret;
+}
+
+#define DEFINE_WAIT_ATOMIC_T(name, p) \
+ struct wait_bit_queue name = { \
+ .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
+ .wait = { \
+ .private = current, \
+ .func = wake_atomic_t_function, \
+ .task_list = \
+ LIST_HEAD_INIT((name).wait.task_list), \
+ }, \
+ }
+
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
+ unsigned mode)
+{
+ wait_queue_head_t *wq = atomic_t_waitqueue(p);
+ DEFINE_WAIT_ATOMIC_T(wait, p);
+
+ return __wait_on_atomic_t(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+
+/**
+ * wake_up_atomic_t - Wake up a waiter on an atomic_t
+ * @p: The atomic_t being waited on, a kernel virtual address
+ *
+ * Wake up anyone waiting for the atomic_t to go to zero.
+ *
+ * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
+ * check is done by the waiter's wake function, not by the waker itself).
+ */
+void wake_up_atomic_t(atomic_t *p)
+{
+ __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
+}
+EXPORT_SYMBOL(wake_up_atomic_t);
+
+__sched int bit_wait(struct wait_bit_key *word, int mode)
+{
+ schedule();
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait);
+
+__sched int bit_wait_io(struct wait_bit_key *word, int mode)
+{
+ io_schedule();
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait_io);
+
+__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
+{
+ unsigned long now = jiffies;
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ schedule_timeout(word->timeout - now);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_timeout);
+
+__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
+{
+ unsigned long now = jiffies;
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ io_schedule_timeout(word->timeout - now);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
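A sketch of the classic prepare_to_wait()/finish_wait() loop these primitives implement, roughly what the wait_event() macros expand to. struct foo, its waitqueue and FOO_READY bit are hypothetical, and DEFINE_WAIT(), wake_up() and smp_mb__after_atomic() are assumed to be available from the shim headers.

struct foo {
	wait_queue_head_t	wq;
	unsigned long		flags;
};

#define FOO_READY	0

static void foo_wait_ready(struct foo *f)
{
	DEFINE_WAIT(wait);

	while (1) {
		prepare_to_wait(&f->wq, &wait, TASK_UNINTERRUPTIBLE);
		if (test_bit(FOO_READY, &f->flags))
			break;
		schedule();
	}
	finish_wait(&f->wq, &wait);
}

static void foo_mark_ready(struct foo *f)
{
	set_bit(FOO_READY, &f->flags);
	smp_mb__after_atomic();		/* order the bit vs. waitqueue_active() */
	wake_up(&f->wq);
}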
diff --git a/linux/workqueue.c b/linux/workqueue.c
new file mode 100644
index 0000000..b25e7a4
--- /dev/null
+++ b/linux/workqueue.c
@@ -0,0 +1,318 @@
+#include <pthread.h>
+
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+static pthread_mutex_t wq_lock = PTHREAD_MUTEX_INITIALIZER;
+static LIST_HEAD(wq_list);
+
+struct workqueue_struct {
+ struct list_head list;
+
+ struct work_struct *current_work;
+ struct list_head pending_work;
+
+ pthread_cond_t work_finished;
+
+ struct task_struct *worker;
+ char name[24];
+};
+
+enum {
+ WORK_PENDING_BIT,
+};
+
+static void clear_work_pending(struct work_struct *work)
+{
+ clear_bit(WORK_PENDING_BIT, work_data_bits(work));
+}
+
+static bool set_work_pending(struct work_struct *work)
+{
+ return !test_and_set_bit(WORK_PENDING_BIT, work_data_bits(work));
+}
+
+static void __queue_work(struct workqueue_struct *wq,
+ struct work_struct *work)
+{
+ BUG_ON(!test_bit(WORK_PENDING_BIT, work_data_bits(work)));
+ BUG_ON(!list_empty(&work->entry));
+
+ list_add_tail(&work->entry, &wq->pending_work);
+ wake_up_process(wq->worker);
+}
+
+bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
+{
+ bool ret;
+
+ pthread_mutex_lock(&wq_lock);
+ if ((ret = set_work_pending(work)))
+ __queue_work(wq, work);
+ pthread_mutex_unlock(&wq_lock);
+
+ return ret;
+}
+
+void delayed_work_timer_fn(unsigned long __data)
+{
+ struct delayed_work *dwork = (struct delayed_work *) __data;
+
+ pthread_mutex_lock(&wq_lock);
+ __queue_work(dwork->wq, &dwork->work);
+ pthread_mutex_unlock(&wq_lock);
+}
+
+static void __queue_delayed_work(struct workqueue_struct *wq,
+ struct delayed_work *dwork,
+ unsigned long delay)
+{
+ struct timer_list *timer = &dwork->timer;
+ struct work_struct *work = &dwork->work;
+
+ BUG_ON(timer->function != delayed_work_timer_fn ||
+ timer->data != (unsigned long)dwork);
+ BUG_ON(timer_pending(timer));
+ BUG_ON(!list_empty(&work->entry));
+
+ if (!delay) {
+ __queue_work(wq, &dwork->work);
+ } else {
+ dwork->wq = wq;
+ timer->expires = jiffies + delay;
+ add_timer(timer);
+ }
+}
+
+bool queue_delayed_work(struct workqueue_struct *wq,
+ struct delayed_work *dwork,
+ unsigned long delay)
+{
+ struct work_struct *work = &dwork->work;
+ bool ret;
+
+ pthread_mutex_lock(&wq_lock);
+ if ((ret = set_work_pending(work)))
+ __queue_delayed_work(wq, dwork, delay);
+ pthread_mutex_unlock(&wq_lock);
+
+ return ret;
+}
+
+static bool grab_pending(struct work_struct *work, bool is_dwork)
+{
+retry:
+ if (set_work_pending(work)) {
+ BUG_ON(!list_empty(&work->entry));
+ return false;
+ }
+
+ if (is_dwork) {
+ struct delayed_work *dwork = to_delayed_work(work);
+
+ if (likely(del_timer(&dwork->timer))) {
+ BUG_ON(!list_empty(&work->entry));
+ return true;
+ }
+ }
+
+ if (!list_empty(&work->entry)) {
+ list_del_init(&work->entry);
+ return true;
+ }
+
+ BUG_ON(!is_dwork);
+
+ pthread_mutex_unlock(&wq_lock);
+ flush_timers();
+ pthread_mutex_lock(&wq_lock);
+ goto retry;
+}
+
+static bool __flush_work(struct work_struct *work)
+{
+ struct workqueue_struct *wq;
+ bool ret = false;
+retry:
+ list_for_each_entry(wq, &wq_list, list)
+ if (wq->current_work == work) {
+ pthread_cond_wait(&wq->work_finished, &wq_lock);
+ ret = true;
+ goto retry;
+ }
+
+ return ret;
+}
+
+bool cancel_work_sync(struct work_struct *work)
+{
+ bool ret;
+
+ pthread_mutex_lock(&wq_lock);
+ ret = grab_pending(work, false);
+
+ __flush_work(work);
+ clear_work_pending(work);
+ pthread_mutex_unlock(&wq_lock);
+
+ return ret;
+}
+
+bool mod_delayed_work(struct workqueue_struct *wq,
+ struct delayed_work *dwork,
+ unsigned long delay)
+{
+ struct work_struct *work = &dwork->work;
+ bool ret;
+
+ pthread_mutex_lock(&wq_lock);
+ ret = grab_pending(work, true);
+
+ __queue_delayed_work(wq, dwork, delay);
+ pthread_mutex_unlock(&wq_lock);
+
+ return ret;
+}
+
+bool cancel_delayed_work(struct delayed_work *dwork)
+{
+ struct work_struct *work = &dwork->work;
+ bool ret;
+
+ pthread_mutex_lock(&wq_lock);
+ ret = grab_pending(work, true);
+
+ clear_work_pending(&dwork->work);
+ pthread_mutex_unlock(&wq_lock);
+
+ return ret;
+}
+
+bool cancel_delayed_work_sync(struct delayed_work *dwork)
+{
+ struct work_struct *work = &dwork->work;
+ bool ret;
+
+ pthread_mutex_lock(&wq_lock);
+ ret = grab_pending(work, true);
+
+ __flush_work(work);
+ clear_work_pending(work);
+ pthread_mutex_unlock(&wq_lock);
+
+ return ret;
+}
+
+static int worker_thread(void *arg)
+{
+ struct workqueue_struct *wq = arg;
+ struct work_struct *work;
+
+ pthread_mutex_lock(&wq_lock);
+ while (1) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ work = list_first_entry_or_null(&wq->pending_work,
+ struct work_struct, entry);
+ wq->current_work = work;
+
+ if (kthread_should_stop()) {
+ BUG_ON(wq->current_work);
+ break;
+ }
+
+ if (!work) {
+ pthread_mutex_unlock(&wq_lock);
+ schedule();
+ pthread_mutex_lock(&wq_lock);
+ continue;
+ }
+
+ BUG_ON(!test_bit(WORK_PENDING_BIT, work_data_bits(work)));
+ list_del_init(&work->entry);
+ clear_work_pending(work);
+
+ pthread_mutex_unlock(&wq_lock);
+ work->func(work);
+ pthread_mutex_lock(&wq_lock);
+
+ pthread_cond_broadcast(&wq->work_finished);
+ }
+ pthread_mutex_unlock(&wq_lock);
+
+ return 0;
+}
+
+void destroy_workqueue(struct workqueue_struct *wq)
+{
+ kthread_stop(wq->worker);
+
+ pthread_mutex_lock(&wq_lock);
+ list_del(&wq->list);
+ pthread_mutex_unlock(&wq_lock);
+
+ kfree(wq);
+}
+
+struct workqueue_struct *alloc_workqueue(const char *fmt,
+ unsigned flags,
+ int max_active,
+ ...)
+{
+ va_list args;
+ struct workqueue_struct *wq;
+
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq)
+ return NULL;
+
+ INIT_LIST_HEAD(&wq->list);
+ INIT_LIST_HEAD(&wq->pending_work);
+
+ pthread_cond_init(&wq->work_finished, NULL);
+
+ va_start(args, max_active);
+ vsnprintf(wq->name, sizeof(wq->name), fmt, args);
+ va_end(args);
+
+ wq->worker = kthread_run(worker_thread, wq, "%s", wq->name);
+ if (IS_ERR(wq->worker)) {
+ kfree(wq);
+ return NULL;
+ }
+
+ pthread_mutex_lock(&wq_lock);
+ list_add(&wq->list, &wq_list);
+ pthread_mutex_unlock(&wq_lock);
+
+ return wq;
+}
+
+struct workqueue_struct *system_wq;
+struct workqueue_struct *system_highpri_wq;
+struct workqueue_struct *system_long_wq;
+struct workqueue_struct *system_unbound_wq;
+struct workqueue_struct *system_freezable_wq;
+struct workqueue_struct *system_power_efficient_wq;
+struct workqueue_struct *system_freezable_power_efficient_wq;
+
+__attribute__((constructor(102)))
+static void wq_init(void)
+{
+ system_wq = alloc_workqueue("events", 0, 0);
+ system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
+ system_long_wq = alloc_workqueue("events_long", 0, 0);
+ system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
+ WQ_UNBOUND_MAX_ACTIVE);
+ system_freezable_wq = alloc_workqueue("events_freezable",
+ WQ_FREEZABLE, 0);
+ system_power_efficient_wq = alloc_workqueue("events_power_efficient",
+ WQ_POWER_EFFICIENT, 0);
+ system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+ WQ_FREEZABLE | WQ_POWER_EFFICIENT,
+ 0);
+ BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
+ !system_unbound_wq || !system_freezable_wq ||
+ !system_power_efficient_wq ||
+ !system_freezable_power_efficient_wq);
+}
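
As an illustrative aside (not part of the patch itself): the flush path above parks the caller on the workqueue's work_finished condition variable until the worker is no longer executing the item in question. The same wait-until-idle pattern, reduced to a standalone pthreads sketch with hypothetical names, looks like this:

/* Illustrative sketch of the flush pattern used by __flush_work() above:
 * wait on a condition variable until the worker is no longer running the
 * given item.  Names and structure are simplified for illustration only.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct item { int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static struct item *current_item;	/* item the worker is executing, if any */

static void *worker(void *arg)
{
	struct item *it = arg;

	pthread_mutex_lock(&lock);
	current_item = it;
	pthread_mutex_unlock(&lock);

	usleep(100 * 1000);		/* pretend to do the work */

	pthread_mutex_lock(&lock);
	current_item = NULL;
	pthread_cond_broadcast(&done);	/* wake any flushers */
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void flush_item(struct item *it)
{
	pthread_mutex_lock(&lock);
	while (current_item == it)	/* same idea as the retry loop above */
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct item it = { .id = 1 };
	pthread_t t;

	pthread_create(&t, NULL, worker, &it);
	flush_item(&it);	/* returns once the worker is not running it */
	pthread_join(&t, NULL);
	printf("item %d flushed\n", it.id);
	return 0;
}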
diff --git a/linux/zlib_deflate/deflate.c b/linux/zlib_deflate/deflate.c
new file mode 100644
index 0000000..d20ef45
--- /dev/null
+++ b/linux/zlib_deflate/deflate.c
@@ -0,0 +1,1137 @@
+/* +++ deflate.c */
+/* deflate.c -- compress data using the deflation algorithm
+ * Copyright (C) 1995-1996 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/*
+ * ALGORITHM
+ *
+ * The "deflation" process depends on being able to identify portions
+ * of the input text which are identical to earlier input (within a
+ * sliding window trailing behind the input currently being processed).
+ *
+ * The most straightforward technique turns out to be the fastest for
+ * most input files: try all possible matches and select the longest.
+ * The key feature of this algorithm is that insertions into the string
+ * dictionary are very simple and thus fast, and deletions are avoided
+ * completely. Insertions are performed at each input character, whereas
+ * string matches are performed only when the previous match ends. So it
+ * is preferable to spend more time in matches to allow very fast string
+ * insertions and avoid deletions. The matching algorithm for small
+ * strings is inspired from that of Rabin & Karp. A brute force approach
+ * is used to find longer strings when a small match has been found.
+ * A similar algorithm is used in comic (by Jan-Mark Wams) and freeze
+ * (by Leonid Broukhis).
+ * A previous version of this file used a more sophisticated algorithm
+ * (by Fiala and Greene) which is guaranteed to run in linear amortized
+ * time, but has a larger average cost, uses more memory and is patented.
+ * However the F&G algorithm may be faster for some highly redundant
+ * files if the parameter max_chain_length (described below) is too large.
+ *
+ * ACKNOWLEDGEMENTS
+ *
+ * The idea of lazy evaluation of matches is due to Jan-Mark Wams, and
+ * I found it in 'freeze' written by Leonid Broukhis.
+ * Thanks to many people for bug reports and testing.
+ *
+ * REFERENCES
+ *
+ * Deutsch, L.P.,"DEFLATE Compressed Data Format Specification".
+ * Available in ftp://ds.internic.net/rfc/rfc1951.txt
+ *
+ * A description of the Rabin and Karp algorithm is given in the book
+ * "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
+ *
+ * Fiala,E.R., and Greene,D.H.
+ * Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-595
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/zutil.h>
+#include "defutil.h"
+
+
+/* ===========================================================================
+ * Function prototypes.
+ */
+typedef enum {
+ need_more, /* block not completed, need more input or more output */
+ block_done, /* block flush performed */
+ finish_started, /* finish started, need only more output at next deflate */
+ finish_done /* finish done, accept no more input or output */
+} block_state;
+
+typedef block_state (*compress_func) (deflate_state *s, int flush);
+/* Compression function. Returns the block state after the call. */
+
+static void fill_window (deflate_state *s);
+static block_state deflate_stored (deflate_state *s, int flush);
+static block_state deflate_fast (deflate_state *s, int flush);
+static block_state deflate_slow (deflate_state *s, int flush);
+static void lm_init (deflate_state *s);
+static void putShortMSB (deflate_state *s, uInt b);
+static void flush_pending (z_streamp strm);
+static int read_buf (z_streamp strm, Byte *buf, unsigned size);
+static uInt longest_match (deflate_state *s, IPos cur_match);
+
+#ifdef DEBUG_ZLIB
+static void check_match (deflate_state *s, IPos start, IPos match,
+ int length);
+#endif
+
+/* ===========================================================================
+ * Local data
+ */
+
+#define NIL 0
+/* Tail of hash chains */
+
+#ifndef TOO_FAR
+# define TOO_FAR 4096
+#endif
+/* Matches of length 3 are discarded if their distance exceeds TOO_FAR */
+
+#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
+/* Minimum amount of lookahead, except at the end of the input file.
+ * See deflate.c for comments about the MIN_MATCH+1.
+ */
+
+/* Values for max_lazy_match, good_match and max_chain_length, depending on
+ * the desired pack level (0..9). The values given below have been tuned to
+ * exclude worst case performance for pathological files. Better values may be
+ * found for specific files.
+ */
+typedef struct config_s {
+ ush good_length; /* reduce lazy search above this match length */
+ ush max_lazy; /* do not perform lazy search above this match length */
+ ush nice_length; /* quit search above this match length */
+ ush max_chain;
+ compress_func func;
+} config;
+
+static const config configuration_table[10] = {
+/* good lazy nice chain */
+/* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */
+/* 1 */ {4, 4, 8, 4, deflate_fast}, /* maximum speed, no lazy matches */
+/* 2 */ {4, 5, 16, 8, deflate_fast},
+/* 3 */ {4, 6, 32, 32, deflate_fast},
+
+/* 4 */ {4, 4, 16, 16, deflate_slow}, /* lazy matches */
+/* 5 */ {8, 16, 32, 32, deflate_slow},
+/* 6 */ {8, 16, 128, 128, deflate_slow},
+/* 7 */ {8, 32, 128, 256, deflate_slow},
+/* 8 */ {32, 128, 258, 1024, deflate_slow},
+/* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* maximum compression */
+
+/* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4
+ * For deflate_fast() (levels <= 3) good is ignored and lazy has a different
+ * meaning.
+ */
+
+#define EQUAL 0
+/* result of memcmp for equal strings */
+
+/* ===========================================================================
+ * Update a hash value with the given input byte
+ * IN assertion: all calls to UPDATE_HASH are made with consecutive
+ * input characters, so that a running hash key can be computed from the
+ * previous key instead of complete recalculation each time.
+ */
+#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
+
+
+/* ===========================================================================
+ * Insert string str in the dictionary and set match_head to the previous head
+ * of the hash chain (the most recent string with same hash key). Return
+ * the previous length of the hash chain.
+ * IN assertion: all calls to INSERT_STRING are made with consecutive
+ * input characters and the first MIN_MATCH bytes of str are valid
+ * (except for the last MIN_MATCH-1 bytes of the input file).
+ */
+#define INSERT_STRING(s, str, match_head) \
+ (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+ s->prev[(str) & s->w_mask] = match_head = s->head[s->ins_h], \
+ s->head[s->ins_h] = (Pos)(str))
+
+/* ===========================================================================
+ * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
+ * prev[] will be initialized on the fly.
+ */
+#define CLEAR_HASH(s) \
+ s->head[s->hash_size-1] = NIL; \
+ memset((char *)s->head, 0, (unsigned)(s->hash_size-1)*sizeof(*s->head));
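
The UPDATE_HASH/INSERT_STRING pair above is the whole string dictionary: head[] remembers the most recent position of each 3-byte hash, and prev[] threads older positions with the same hash into the chain that longest_match() later walks. A small self-contained illustration of that bookkeeping, with toy sizes and illustrative names (not part of this patch):

/* Toy model of the head[]/prev[] hash chains maintained by INSERT_STRING().
 * Sizes and names are simplified; as in zlib, position 0 doubles as NIL,
 * so a chain reaching position 0 is treated as empty.
 */
#include <stdio.h>
#include <string.h>

#define W_BITS		8			/* toy window: 256 bytes */
#define W_SIZE		(1 << W_BITS)
#define W_MASK		(W_SIZE - 1)
#define HASH_BITS	6
#define HASH_SIZE	(1 << HASH_BITS)
#define HASH_MASK	(HASH_SIZE - 1)
#define HASH_SHIFT	((HASH_BITS + 3 - 1) / 3)	/* MIN_MATCH == 3 */

static unsigned short head[HASH_SIZE];	/* most recent position per hash */
static unsigned short prev[W_SIZE];	/* older position in the same chain */

static unsigned update_hash(unsigned h, unsigned char c)
{
	return ((h << HASH_SHIFT) ^ c) & HASH_MASK;
}

int main(void)
{
	const char window[] = "abcabcab";
	unsigned n = strlen(window);
	unsigned h = 0, pos, match_head;

	/* prime the rolling hash with the first MIN_MATCH-1 bytes */
	h = update_hash(h, window[0]);
	h = update_hash(h, window[1]);

	for (pos = 0; pos + 2 < n; pos++) {
		h = update_hash(h, window[pos + 2]);	/* roll in byte pos+2 */
		match_head = head[h];		/* head of this hash's chain */
		prev[pos & W_MASK] = match_head;
		head[h] = pos;

		if (match_head)			/* 0 == NIL: empty chain */
			printf("pos %u (\"%.3s\") chains to pos %u\n",
			       pos, &window[pos], match_head);
	}
	return 0;
}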
+
+/* ========================================================================= */
+int zlib_deflateInit2(
+ z_streamp strm,
+ int level,
+ int method,
+ int windowBits,
+ int memLevel,
+ int strategy
+)
+{
+ deflate_state *s;
+ int noheader = 0;
+ deflate_workspace *mem;
+ char *next;
+
+ ush *overlay;
+ /* We overlay pending_buf and d_buf+l_buf. This works since the average
+ * output size for (length,distance) codes is <= 24 bits.
+ */
+
+ if (strm == NULL) return Z_STREAM_ERROR;
+
+ strm->msg = NULL;
+
+ if (level == Z_DEFAULT_COMPRESSION) level = 6;
+
+ mem = (deflate_workspace *) strm->workspace;
+
+ if (windowBits < 0) { /* undocumented feature: suppress zlib header */
+ noheader = 1;
+ windowBits = -windowBits;
+ }
+ if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED ||
+ windowBits < 9 || windowBits > 15 || level < 0 || level > 9 ||
+ strategy < 0 || strategy > Z_HUFFMAN_ONLY) {
+ return Z_STREAM_ERROR;
+ }
+
+ /*
+ * Direct the workspace's pointers to the chunks that were allocated
+ * along with the deflate_workspace struct.
+ */
+ next = (char *) mem;
+ next += sizeof(*mem);
+ mem->window_memory = (Byte *) next;
+ next += zlib_deflate_window_memsize(windowBits);
+ mem->prev_memory = (Pos *) next;
+ next += zlib_deflate_prev_memsize(windowBits);
+ mem->head_memory = (Pos *) next;
+ next += zlib_deflate_head_memsize(memLevel);
+ mem->overlay_memory = next;
+
+ s = (deflate_state *) &(mem->deflate_memory);
+ strm->state = (struct internal_state *)s;
+ s->strm = strm;
+
+ s->noheader = noheader;
+ s->w_bits = windowBits;
+ s->w_size = 1 << s->w_bits;
+ s->w_mask = s->w_size - 1;
+
+ s->hash_bits = memLevel + 7;
+ s->hash_size = 1 << s->hash_bits;
+ s->hash_mask = s->hash_size - 1;
+ s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
+
+ s->window = (Byte *) mem->window_memory;
+ s->prev = (Pos *) mem->prev_memory;
+ s->head = (Pos *) mem->head_memory;
+
+ s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */
+
+ overlay = (ush *) mem->overlay_memory;
+ s->pending_buf = (uch *) overlay;
+ s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L);
+
+ s->d_buf = overlay + s->lit_bufsize/sizeof(ush);
+ s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize;
+
+ s->level = level;
+ s->strategy = strategy;
+ s->method = (Byte)method;
+
+ return zlib_deflateReset(strm);
+}
+
+/* ========================================================================= */
+int zlib_deflateReset(
+ z_streamp strm
+)
+{
+ deflate_state *s;
+
+ if (strm == NULL || strm->state == NULL)
+ return Z_STREAM_ERROR;
+
+ strm->total_in = strm->total_out = 0;
+ strm->msg = NULL;
+ strm->data_type = Z_UNKNOWN;
+
+ s = (deflate_state *)strm->state;
+ s->pending = 0;
+ s->pending_out = s->pending_buf;
+
+ if (s->noheader < 0) {
+ s->noheader = 0; /* was set to -1 by deflate(..., Z_FINISH); */
+ }
+ s->status = s->noheader ? BUSY_STATE : INIT_STATE;
+ strm->adler = 1;
+ s->last_flush = Z_NO_FLUSH;
+
+ zlib_tr_init(s);
+ lm_init(s);
+
+ return Z_OK;
+}
+
+/* =========================================================================
+ * Put a short in the pending buffer. The 16-bit value is put in MSB order.
+ * IN assertion: the stream state is correct and there is enough room in
+ * pending_buf.
+ */
+static void putShortMSB(
+ deflate_state *s,
+ uInt b
+)
+{
+ put_byte(s, (Byte)(b >> 8));
+ put_byte(s, (Byte)(b & 0xff));
+}
+
+/* =========================================================================
+ * Flush as much pending output as possible. All deflate() output goes
+ * through this function so some applications may wish to modify it
+ * to avoid allocating a large strm->next_out buffer and copying into it.
+ * (See also read_buf()).
+ */
+static void flush_pending(
+ z_streamp strm
+)
+{
+ deflate_state *s = (deflate_state *) strm->state;
+ unsigned len = s->pending;
+
+ if (len > strm->avail_out) len = strm->avail_out;
+ if (len == 0) return;
+
+ if (strm->next_out != NULL) {
+ memcpy(strm->next_out, s->pending_out, len);
+ strm->next_out += len;
+ }
+ s->pending_out += len;
+ strm->total_out += len;
+ strm->avail_out -= len;
+ s->pending -= len;
+ if (s->pending == 0) {
+ s->pending_out = s->pending_buf;
+ }
+}
+
+/* ========================================================================= */
+int zlib_deflate(
+ z_streamp strm,
+ int flush
+)
+{
+ int old_flush; /* value of flush param for previous deflate call */
+ deflate_state *s;
+
+ if (strm == NULL || strm->state == NULL ||
+ flush > Z_FINISH || flush < 0) {
+ return Z_STREAM_ERROR;
+ }
+ s = (deflate_state *) strm->state;
+
+ if ((strm->next_in == NULL && strm->avail_in != 0) ||
+ (s->status == FINISH_STATE && flush != Z_FINISH)) {
+ return Z_STREAM_ERROR;
+ }
+ if (strm->avail_out == 0) return Z_BUF_ERROR;
+
+ s->strm = strm; /* just in case */
+ old_flush = s->last_flush;
+ s->last_flush = flush;
+
+ /* Write the zlib header */
+ if (s->status == INIT_STATE) {
+
+ uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8;
+ uInt level_flags = (s->level-1) >> 1;
+
+ if (level_flags > 3) level_flags = 3;
+ header |= (level_flags << 6);
+ if (s->strstart != 0) header |= PRESET_DICT;
+ header += 31 - (header % 31);
+
+ s->status = BUSY_STATE;
+ putShortMSB(s, header);
+
+ /* Save the adler32 of the preset dictionary: */
+ if (s->strstart != 0) {
+ putShortMSB(s, (uInt)(strm->adler >> 16));
+ putShortMSB(s, (uInt)(strm->adler & 0xffff));
+ }
+ strm->adler = 1L;
+ }
+
+ /* Flush as much pending output as possible */
+ if (s->pending != 0) {
+ flush_pending(strm);
+ if (strm->avail_out == 0) {
+ /* Since avail_out is 0, deflate will be called again with
+ * more output space, but possibly with both pending and
+ * avail_in equal to zero. There won't be anything to do,
+ * but this is not an error situation so make sure we
+ * return OK instead of BUF_ERROR at next call of deflate:
+ */
+ s->last_flush = -1;
+ return Z_OK;
+ }
+
+ /* Make sure there is something to do and avoid duplicate consecutive
+ * flushes. For repeated and useless calls with Z_FINISH, we keep
+	 * returning Z_STREAM_END instead of Z_BUF_ERROR.
+ */
+ } else if (strm->avail_in == 0 && flush <= old_flush &&
+ flush != Z_FINISH) {
+ return Z_BUF_ERROR;
+ }
+
+ /* User must not provide more input after the first FINISH: */
+ if (s->status == FINISH_STATE && strm->avail_in != 0) {
+ return Z_BUF_ERROR;
+ }
+
+ /* Start a new block or continue the current one.
+ */
+ if (strm->avail_in != 0 || s->lookahead != 0 ||
+ (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) {
+ block_state bstate;
+
+ bstate = (*(configuration_table[s->level].func))(s, flush);
+
+ if (bstate == finish_started || bstate == finish_done) {
+ s->status = FINISH_STATE;
+ }
+ if (bstate == need_more || bstate == finish_started) {
+ if (strm->avail_out == 0) {
+ s->last_flush = -1; /* avoid BUF_ERROR next call, see above */
+ }
+ return Z_OK;
+ /* If flush != Z_NO_FLUSH && avail_out == 0, the next call
+ * of deflate should use the same flush parameter to make sure
+ * that the flush is complete. So we don't have to output an
+ * empty block here, this will be done at next call. This also
+ * ensures that for a very small output buffer, we emit at most
+ * one empty block.
+ */
+ }
+ if (bstate == block_done) {
+ if (flush == Z_PARTIAL_FLUSH) {
+ zlib_tr_align(s);
+ } else if (flush == Z_PACKET_FLUSH) {
+ /* Output just the 3-bit `stored' block type value,
+ but not a zero length. */
+ zlib_tr_stored_type_only(s);
+ } else { /* FULL_FLUSH or SYNC_FLUSH */
+ zlib_tr_stored_block(s, (char*)0, 0L, 0);
+ /* For a full flush, this empty block will be recognized
+ * as a special marker by inflate_sync().
+ */
+ if (flush == Z_FULL_FLUSH) {
+ CLEAR_HASH(s); /* forget history */
+ }
+ }
+ flush_pending(strm);
+ if (strm->avail_out == 0) {
+ s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */
+ return Z_OK;
+ }
+ }
+ }
+ Assert(strm->avail_out > 0, "bug2");
+
+ if (flush != Z_FINISH) return Z_OK;
+ if (s->noheader) return Z_STREAM_END;
+
+ /* Write the zlib trailer (adler32) */
+ putShortMSB(s, (uInt)(strm->adler >> 16));
+ putShortMSB(s, (uInt)(strm->adler & 0xffff));
+ flush_pending(strm);
+ /* If avail_out is zero, the application will call deflate again
+ * to flush the rest.
+ */
+ s->noheader = -1; /* write the trailer only once! */
+ return s->pending != 0 ? Z_OK : Z_STREAM_END;
+}
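
For concreteness, the INIT_STATE branch above is what produces the familiar two-byte zlib header; with the defaults windowBits = 15 and level = 6 (and no preset dictionary) the arithmetic works out to 0x78 0x9c. A tiny standalone check of that computation, not part of this patch:

/* Worked example of the zlib header computed in the INIT_STATE branch
 * above for windowBits = 15, level = 6, strstart == 0.
 */
#include <stdio.h>

int main(void)
{
	unsigned w_bits = 15, level = 6;
	unsigned header = (8 /* Z_DEFLATED */ + ((w_bits - 8) << 4)) << 8;
	unsigned level_flags = (level - 1) >> 1;

	if (level_flags > 3)
		level_flags = 3;
	header |= level_flags << 6;
	header += 31 - (header % 31);	/* make the header a multiple of 31 */

	printf("zlib header: %02x %02x\n", header >> 8, header & 0xff);
	return 0;
}

Compiled and run, this prints "zlib header: 78 9c".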
+
+/* ========================================================================= */
+int zlib_deflateEnd(
+ z_streamp strm
+)
+{
+ int status;
+ deflate_state *s;
+
+ if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR;
+ s = (deflate_state *) strm->state;
+
+ status = s->status;
+ if (status != INIT_STATE && status != BUSY_STATE &&
+ status != FINISH_STATE) {
+ return Z_STREAM_ERROR;
+ }
+
+ strm->state = NULL;
+
+ return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK;
+}
+
+/* ===========================================================================
+ * Read a new buffer from the current input stream, update the adler32
+ * and total number of bytes read. All deflate() input goes through
+ * this function so some applications may wish to modify it to avoid
+ * allocating a large strm->next_in buffer and copying from it.
+ * (See also flush_pending()).
+ */
+static int read_buf(
+ z_streamp strm,
+ Byte *buf,
+ unsigned size
+)
+{
+ unsigned len = strm->avail_in;
+
+ if (len > size) len = size;
+ if (len == 0) return 0;
+
+ strm->avail_in -= len;
+
+ if (!((deflate_state *)(strm->state))->noheader) {
+ strm->adler = zlib_adler32(strm->adler, strm->next_in, len);
+ }
+ memcpy(buf, strm->next_in, len);
+ strm->next_in += len;
+ strm->total_in += len;
+
+ return (int)len;
+}
+
+/* ===========================================================================
+ * Initialize the "longest match" routines for a new zlib stream
+ */
+static void lm_init(
+ deflate_state *s
+)
+{
+ s->window_size = (ulg)2L*s->w_size;
+
+ CLEAR_HASH(s);
+
+ /* Set the default configuration parameters:
+ */
+ s->max_lazy_match = configuration_table[s->level].max_lazy;
+ s->good_match = configuration_table[s->level].good_length;
+ s->nice_match = configuration_table[s->level].nice_length;
+ s->max_chain_length = configuration_table[s->level].max_chain;
+
+ s->strstart = 0;
+ s->block_start = 0L;
+ s->lookahead = 0;
+ s->match_length = s->prev_length = MIN_MATCH-1;
+ s->match_available = 0;
+ s->ins_h = 0;
+}
+
+/* ===========================================================================
+ * Set match_start to the longest match starting at the given string and
+ * return its length. Matches shorter or equal to prev_length are discarded,
+ * in which case the result is equal to prev_length and match_start is
+ * garbage.
+ * IN assertions: cur_match is the head of the hash chain for the current
+ * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
+ * OUT assertion: the match length is not greater than s->lookahead.
+ */
+/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
+ * match.S. The code will be functionally equivalent.
+ */
+static uInt longest_match(
+ deflate_state *s,
+ IPos cur_match /* current match */
+)
+{
+ unsigned chain_length = s->max_chain_length;/* max hash chain length */
+ register Byte *scan = s->window + s->strstart; /* current string */
+ register Byte *match; /* matched string */
+ register int len; /* length of current match */
+ int best_len = s->prev_length; /* best match length so far */
+ int nice_match = s->nice_match; /* stop if match long enough */
+ IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
+ s->strstart - (IPos)MAX_DIST(s) : NIL;
+ /* Stop when cur_match becomes <= limit. To simplify the code,
+ * we prevent matches with the string of window index 0.
+ */
+ Pos *prev = s->prev;
+ uInt wmask = s->w_mask;
+
+#ifdef UNALIGNED_OK
+ /* Compare two bytes at a time. Note: this is not always beneficial.
+ * Try with and without -DUNALIGNED_OK to check.
+ */
+ register Byte *strend = s->window + s->strstart + MAX_MATCH - 1;
+ register ush scan_start = *(ush*)scan;
+ register ush scan_end = *(ush*)(scan+best_len-1);
+#else
+ register Byte *strend = s->window + s->strstart + MAX_MATCH;
+ register Byte scan_end1 = scan[best_len-1];
+ register Byte scan_end = scan[best_len];
+#endif
+
+ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
+ * It is easy to get rid of this optimization if necessary.
+ */
+ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+ /* Do not waste too much time if we already have a good match: */
+ if (s->prev_length >= s->good_match) {
+ chain_length >>= 2;
+ }
+ /* Do not look for matches beyond the end of the input. This is necessary
+ * to make deflate deterministic.
+ */
+ if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
+
+ Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
+
+ do {
+ Assert(cur_match < s->strstart, "no future");
+ match = s->window + cur_match;
+
+ /* Skip to next match if the match length cannot increase
+ * or if the match length is less than 2:
+ */
+#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
+ /* This code assumes sizeof(unsigned short) == 2. Do not use
+ * UNALIGNED_OK if your compiler uses a different size.
+ */
+ if (*(ush*)(match+best_len-1) != scan_end ||
+ *(ush*)match != scan_start) continue;
+
+ /* It is not necessary to compare scan[2] and match[2] since they are
+ * always equal when the other bytes match, given that the hash keys
+ * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
+ * strstart+3, +5, ... up to strstart+257. We check for insufficient
+ * lookahead only every 4th comparison; the 128th check will be made
+ * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
+ * necessary to put more guard bytes at the end of the window, or
+ * to check more often for insufficient lookahead.
+ */
+ Assert(scan[2] == match[2], "scan[2]?");
+ scan++, match++;
+ do {
+ } while (*(ush*)(scan+=2) == *(ush*)(match+=2) &&
+ *(ush*)(scan+=2) == *(ush*)(match+=2) &&
+ *(ush*)(scan+=2) == *(ush*)(match+=2) &&
+ *(ush*)(scan+=2) == *(ush*)(match+=2) &&
+ scan < strend);
+ /* The funny "do {}" generates better code on most compilers */
+
+ /* Here, scan <= window+strstart+257 */
+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+ if (*scan == *match) scan++;
+
+ len = (MAX_MATCH - 1) - (int)(strend-scan);
+ scan = strend - (MAX_MATCH-1);
+
+#else /* UNALIGNED_OK */
+
+ if (match[best_len] != scan_end ||
+ match[best_len-1] != scan_end1 ||
+ *match != *scan ||
+ *++match != scan[1]) continue;
+
+ /* The check at best_len-1 can be removed because it will be made
+ * again later. (This heuristic is not always a win.)
+ * It is not necessary to compare scan[2] and match[2] since they
+ * are always equal when the other bytes match, given that
+ * the hash keys are equal and that HASH_BITS >= 8.
+ */
+ scan += 2, match++;
+ Assert(*scan == *match, "match[2]?");
+
+ /* We check for insufficient lookahead only every 8th comparison;
+ * the 256th check will be made at strstart+258.
+ */
+ do {
+ } while (*++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ scan < strend);
+
+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+ len = MAX_MATCH - (int)(strend - scan);
+ scan = strend - MAX_MATCH;
+
+#endif /* UNALIGNED_OK */
+
+ if (len > best_len) {
+ s->match_start = cur_match;
+ best_len = len;
+ if (len >= nice_match) break;
+#ifdef UNALIGNED_OK
+ scan_end = *(ush*)(scan+best_len-1);
+#else
+ scan_end1 = scan[best_len-1];
+ scan_end = scan[best_len];
+#endif
+ }
+ } while ((cur_match = prev[cur_match & wmask]) > limit
+ && --chain_length != 0);
+
+ if ((uInt)best_len <= s->lookahead) return best_len;
+ return s->lookahead;
+}
+
+#ifdef DEBUG_ZLIB
+/* ===========================================================================
+ * Check that the match at match_start is indeed a match.
+ */
+static void check_match(
+ deflate_state *s,
+ IPos start,
+ IPos match,
+ int length
+)
+{
+ /* check that the match is indeed a match */
+ if (memcmp((char *)s->window + match,
+ (char *)s->window + start, length) != EQUAL) {
+ fprintf(stderr, " start %u, match %u, length %d\n",
+ start, match, length);
+ do {
+ fprintf(stderr, "%c%c", s->window[match++], s->window[start++]);
+ } while (--length != 0);
+ z_error("invalid match");
+ }
+ if (z_verbose > 1) {
+ fprintf(stderr,"\\[%d,%d]", start-match, length);
+ do { putc(s->window[start++], stderr); } while (--length != 0);
+ }
+}
+#else
+# define check_match(s, start, match, length)
+#endif
+
+/* ===========================================================================
+ * Fill the window when the lookahead becomes insufficient.
+ * Updates strstart and lookahead.
+ *
+ * IN assertion: lookahead < MIN_LOOKAHEAD
+ * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
+ * At least one byte has been read, or avail_in == 0; reads are
+ * performed for at least two bytes (required for the zip translate_eol
+ * option -- not supported here).
+ */
+static void fill_window(
+ deflate_state *s
+)
+{
+ register unsigned n, m;
+ register Pos *p;
+ unsigned more; /* Amount of free space at the end of the window. */
+ uInt wsize = s->w_size;
+
+ do {
+ more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
+
+ /* Deal with !@#$% 64K limit: */
+ if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
+ more = wsize;
+
+ } else if (more == (unsigned)(-1)) {
+ /* Very unlikely, but possible on 16 bit machine if strstart == 0
+ * and lookahead == 1 (input done one byte at time)
+ */
+ more--;
+
+ /* If the window is almost full and there is insufficient lookahead,
+ * move the upper half to the lower one to make room in the upper half.
+ */
+ } else if (s->strstart >= wsize+MAX_DIST(s)) {
+
+ memcpy((char *)s->window, (char *)s->window+wsize,
+ (unsigned)wsize);
+ s->match_start -= wsize;
+ s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
+ s->block_start -= (long) wsize;
+
+ /* Slide the hash table (could be avoided with 32 bit values
+ at the expense of memory usage). We slide even when level == 0
+ to keep the hash table consistent if we switch back to level > 0
+ later. (Using level 0 permanently is not an optimal usage of
+ zlib, so we don't care about this pathological case.)
+ */
+ n = s->hash_size;
+ p = &s->head[n];
+ do {
+ m = *--p;
+ *p = (Pos)(m >= wsize ? m-wsize : NIL);
+ } while (--n);
+
+ n = wsize;
+ p = &s->prev[n];
+ do {
+ m = *--p;
+ *p = (Pos)(m >= wsize ? m-wsize : NIL);
+ /* If n is not on any hash chain, prev[n] is garbage but
+ * its value will never be used.
+ */
+ } while (--n);
+ more += wsize;
+ }
+ if (s->strm->avail_in == 0) return;
+
+ /* If there was no sliding:
+ * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+ * more == window_size - lookahead - strstart
+ * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+ * => more >= window_size - 2*WSIZE + 2
+ * In the BIG_MEM or MMAP case (not yet supported),
+ * window_size == input_size + MIN_LOOKAHEAD &&
+ * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+ * Otherwise, window_size == 2*WSIZE so more >= 2.
+ * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
+ */
+ Assert(more >= 2, "more < 2");
+
+ n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
+ s->lookahead += n;
+
+ /* Initialize the hash value now that we have some input: */
+ if (s->lookahead >= MIN_MATCH) {
+ s->ins_h = s->window[s->strstart];
+ UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
+#if MIN_MATCH != 3
+ Call UPDATE_HASH() MIN_MATCH-3 more times
+#endif
+ }
+ /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
+ * but this is not important since only literal bytes will be emitted.
+ */
+
+ } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+}
+
+/* ===========================================================================
+ * Flush the current block, with given end-of-file flag.
+ * IN assertion: strstart is set to the end of the current match.
+ */
+#define FLUSH_BLOCK_ONLY(s, eof) { \
+ zlib_tr_flush_block(s, (s->block_start >= 0L ? \
+ (char *)&s->window[(unsigned)s->block_start] : \
+ NULL), \
+ (ulg)((long)s->strstart - s->block_start), \
+ (eof)); \
+ s->block_start = s->strstart; \
+ flush_pending(s->strm); \
+ Tracev((stderr,"[FLUSH]")); \
+}
+
+/* Same but force premature exit if necessary. */
+#define FLUSH_BLOCK(s, eof) { \
+ FLUSH_BLOCK_ONLY(s, eof); \
+ if (s->strm->avail_out == 0) return (eof) ? finish_started : need_more; \
+}
+
+/* ===========================================================================
+ * Copy without compression as much as possible from the input stream, return
+ * the current block state.
+ * This function does not insert new strings in the dictionary since
+ * uncompressible data is probably not useful. This function is used
+ * only for the level=0 compression option.
+ * NOTE: this function should be optimized to avoid extra copying from
+ * window to pending_buf.
+ */
+static block_state deflate_stored(
+ deflate_state *s,
+ int flush
+)
+{
+ /* Stored blocks are limited to 0xffff bytes, pending_buf is limited
+ * to pending_buf_size, and each stored block has a 5 byte header:
+ */
+ ulg max_block_size = 0xffff;
+ ulg max_start;
+
+ if (max_block_size > s->pending_buf_size - 5) {
+ max_block_size = s->pending_buf_size - 5;
+ }
+
+ /* Copy as much as possible from input to output: */
+ for (;;) {
+ /* Fill the window as much as possible: */
+ if (s->lookahead <= 1) {
+
+ Assert(s->strstart < s->w_size+MAX_DIST(s) ||
+ s->block_start >= (long)s->w_size, "slide too late");
+
+ fill_window(s);
+ if (s->lookahead == 0 && flush == Z_NO_FLUSH) return need_more;
+
+ if (s->lookahead == 0) break; /* flush the current block */
+ }
+ Assert(s->block_start >= 0L, "block gone");
+
+ s->strstart += s->lookahead;
+ s->lookahead = 0;
+
+ /* Emit a stored block if pending_buf will be full: */
+ max_start = s->block_start + max_block_size;
+ if (s->strstart == 0 || (ulg)s->strstart >= max_start) {
+ /* strstart == 0 is possible when wraparound on 16-bit machine */
+ s->lookahead = (uInt)(s->strstart - max_start);
+ s->strstart = (uInt)max_start;
+ FLUSH_BLOCK(s, 0);
+ }
+ /* Flush if we may have to slide, otherwise block_start may become
+ * negative and the data will be gone:
+ */
+ if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) {
+ FLUSH_BLOCK(s, 0);
+ }
+ }
+ FLUSH_BLOCK(s, flush == Z_FINISH);
+ return flush == Z_FINISH ? finish_done : block_done;
+}
+
+/* ===========================================================================
+ * Compress as much as possible from the input stream, return the current
+ * block state.
+ * This function does not perform lazy evaluation of matches and inserts
+ * new strings in the dictionary only for unmatched strings or for short
+ * matches. It is used only for the fast compression options.
+ */
+static block_state deflate_fast(
+ deflate_state *s,
+ int flush
+)
+{
+ IPos hash_head = NIL; /* head of the hash chain */
+ int bflush; /* set if current block must be flushed */
+
+ for (;;) {
+ /* Make sure that we always have enough lookahead, except
+ * at the end of the input file. We need MAX_MATCH bytes
+ * for the next match, plus MIN_MATCH bytes to insert the
+ * string following the next match.
+ */
+ if (s->lookahead < MIN_LOOKAHEAD) {
+ fill_window(s);
+ if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
+ return need_more;
+ }
+ if (s->lookahead == 0) break; /* flush the current block */
+ }
+
+ /* Insert the string window[strstart .. strstart+2] in the
+ * dictionary, and set hash_head to the head of the hash chain:
+ */
+ if (s->lookahead >= MIN_MATCH) {
+ INSERT_STRING(s, s->strstart, hash_head);
+ }
+
+ /* Find the longest match, discarding those <= prev_length.
+ * At this point we have always match_length < MIN_MATCH
+ */
+ if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) {
+ /* To simplify the code, we prevent matches with the string
+ * of window index 0 (in particular we have to avoid a match
+ * of the string with itself at the start of the input file).
+ */
+ if (s->strategy != Z_HUFFMAN_ONLY) {
+ s->match_length = longest_match (s, hash_head);
+ }
+ /* longest_match() sets match_start */
+ }
+ if (s->match_length >= MIN_MATCH) {
+ check_match(s, s->strstart, s->match_start, s->match_length);
+
+ bflush = zlib_tr_tally(s, s->strstart - s->match_start,
+ s->match_length - MIN_MATCH);
+
+ s->lookahead -= s->match_length;
+
+ /* Insert new strings in the hash table only if the match length
+ * is not too large. This saves time but degrades compression.
+ */
+ if (s->match_length <= s->max_insert_length &&
+ s->lookahead >= MIN_MATCH) {
+ s->match_length--; /* string at strstart already in hash table */
+ do {
+ s->strstart++;
+ INSERT_STRING(s, s->strstart, hash_head);
+ /* strstart never exceeds WSIZE-MAX_MATCH, so there are
+ * always MIN_MATCH bytes ahead.
+ */
+ } while (--s->match_length != 0);
+ s->strstart++;
+ } else {
+ s->strstart += s->match_length;
+ s->match_length = 0;
+ s->ins_h = s->window[s->strstart];
+ UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
+#if MIN_MATCH != 3
+ Call UPDATE_HASH() MIN_MATCH-3 more times
+#endif
+ /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not
+ * matter since it will be recomputed at next deflate call.
+ */
+ }
+ } else {
+ /* No match, output a literal byte */
+ Tracevv((stderr,"%c", s->window[s->strstart]));
+ bflush = zlib_tr_tally (s, 0, s->window[s->strstart]);
+ s->lookahead--;
+ s->strstart++;
+ }
+ if (bflush) FLUSH_BLOCK(s, 0);
+ }
+ FLUSH_BLOCK(s, flush == Z_FINISH);
+ return flush == Z_FINISH ? finish_done : block_done;
+}
+
+/* ===========================================================================
+ * Same as above, but achieves better compression. We use a lazy
+ * evaluation for matches: a match is finally adopted only if there is
+ * no better match at the next window position.
+ */
+static block_state deflate_slow(
+ deflate_state *s,
+ int flush
+)
+{
+ IPos hash_head = NIL; /* head of hash chain */
+ int bflush; /* set if current block must be flushed */
+
+ /* Process the input block. */
+ for (;;) {
+ /* Make sure that we always have enough lookahead, except
+ * at the end of the input file. We need MAX_MATCH bytes
+ * for the next match, plus MIN_MATCH bytes to insert the
+ * string following the next match.
+ */
+ if (s->lookahead < MIN_LOOKAHEAD) {
+ fill_window(s);
+ if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
+ return need_more;
+ }
+ if (s->lookahead == 0) break; /* flush the current block */
+ }
+
+ /* Insert the string window[strstart .. strstart+2] in the
+ * dictionary, and set hash_head to the head of the hash chain:
+ */
+ if (s->lookahead >= MIN_MATCH) {
+ INSERT_STRING(s, s->strstart, hash_head);
+ }
+
+ /* Find the longest match, discarding those <= prev_length.
+ */
+ s->prev_length = s->match_length, s->prev_match = s->match_start;
+ s->match_length = MIN_MATCH-1;
+
+ if (hash_head != NIL && s->prev_length < s->max_lazy_match &&
+ s->strstart - hash_head <= MAX_DIST(s)) {
+ /* To simplify the code, we prevent matches with the string
+ * of window index 0 (in particular we have to avoid a match
+ * of the string with itself at the start of the input file).
+ */
+ if (s->strategy != Z_HUFFMAN_ONLY) {
+ s->match_length = longest_match (s, hash_head);
+ }
+ /* longest_match() sets match_start */
+
+ if (s->match_length <= 5 && (s->strategy == Z_FILTERED ||
+ (s->match_length == MIN_MATCH &&
+ s->strstart - s->match_start > TOO_FAR))) {
+
+ /* If prev_match is also MIN_MATCH, match_start is garbage
+ * but we will ignore the current match anyway.
+ */
+ s->match_length = MIN_MATCH-1;
+ }
+ }
+ /* If there was a match at the previous step and the current
+ * match is not better, output the previous match:
+ */
+ if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) {
+ uInt max_insert = s->strstart + s->lookahead - MIN_MATCH;
+ /* Do not insert strings in hash table beyond this. */
+
+ check_match(s, s->strstart-1, s->prev_match, s->prev_length);
+
+ bflush = zlib_tr_tally(s, s->strstart -1 - s->prev_match,
+ s->prev_length - MIN_MATCH);
+
+ /* Insert in hash table all strings up to the end of the match.
+ * strstart-1 and strstart are already inserted. If there is not
+ * enough lookahead, the last two strings are not inserted in
+ * the hash table.
+ */
+ s->lookahead -= s->prev_length-1;
+ s->prev_length -= 2;
+ do {
+ if (++s->strstart <= max_insert) {
+ INSERT_STRING(s, s->strstart, hash_head);
+ }
+ } while (--s->prev_length != 0);
+ s->match_available = 0;
+ s->match_length = MIN_MATCH-1;
+ s->strstart++;
+
+ if (bflush) FLUSH_BLOCK(s, 0);
+
+ } else if (s->match_available) {
+ /* If there was no match at the previous position, output a
+ * single literal. If there was a match but the current match
+ * is longer, truncate the previous match to a single literal.
+ */
+ Tracevv((stderr,"%c", s->window[s->strstart-1]));
+ if (zlib_tr_tally (s, 0, s->window[s->strstart-1])) {
+ FLUSH_BLOCK_ONLY(s, 0);
+ }
+ s->strstart++;
+ s->lookahead--;
+ if (s->strm->avail_out == 0) return need_more;
+ } else {
+ /* There is no previous match to compare with, wait for
+ * the next step to decide.
+ */
+ s->match_available = 1;
+ s->strstart++;
+ s->lookahead--;
+ }
+ }
+ Assert (flush != Z_NO_FLUSH, "no flush?");
+ if (s->match_available) {
+ Tracevv((stderr,"%c", s->window[s->strstart-1]));
+ zlib_tr_tally (s, 0, s->window[s->strstart-1]);
+ s->match_available = 0;
+ }
+ FLUSH_BLOCK(s, flush == Z_FINISH);
+ return flush == Z_FINISH ? finish_done : block_done;
+}
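
The lazy-evaluation strategy described above only commits to a match once it knows that the match starting one byte later is not longer. The decision it makes can be reproduced with a brute-force toy matcher (illustrative only, not part of this patch):

/* Toy illustration of the lazy-match heuristic in deflate_slow(): the match
 * at the current byte is emitted only if the match one byte later is not
 * longer; otherwise a literal is emitted and the later match is preferred.
 */
#include <stdio.h>
#include <string.h>

/* brute-force longest match of text+pos against any earlier position */
static int longest(const char *text, int pos)
{
	int best = 0, i, len, n = strlen(text);

	for (i = 0; i < pos; i++) {
		for (len = 0; pos + len < n && text[i + len] == text[pos + len]; len++)
			;
		if (len > best)
			best = len;
	}
	return best;
}

int main(void)
{
	const char *text = "abcde bcdefgh abcdefgh";
	int pos = 14;			/* start of the second "abcdefgh" */
	int cur = longest(text, pos);	/* match for "a..." at pos */
	int nxt = longest(text, pos + 1); /* match for "b..." at pos+1 */

	printf("match at %d: %d bytes, at %d: %d bytes -> %s\n",
	       pos, cur, pos + 1, nxt,
	       nxt > cur ? "emit literal, take the later match"
			 : "take the current match");
	return 0;
}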
+
+int zlib_deflate_workspacesize(int windowBits, int memLevel)
+{
+ if (windowBits < 0) /* undocumented feature: suppress zlib header */
+ windowBits = -windowBits;
+
+ /* Since the return value is typically passed to vmalloc() unchecked... */
+ BUG_ON(memLevel < 1 || memLevel > MAX_MEM_LEVEL || windowBits < 9 ||
+ windowBits > 15);
+
+ return sizeof(deflate_workspace)
+ + zlib_deflate_window_memsize(windowBits)
+ + zlib_deflate_prev_memsize(windowBits)
+ + zlib_deflate_head_memsize(memLevel)
+ + zlib_deflate_overlay_memsize(memLevel);
+}
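
Taken together, the entry points in this file follow the kernel zlib convention that the caller sizes and supplies the workspace rather than having the library allocate it. A hypothetical caller might look like the sketch below; it assumes this tree ships a <linux/zlib.h> shim with the usual Z_* constants, and uses malloc() where the kernel would pass the workspace size to vmalloc().

/* Hypothetical caller sketch (not part of this patch) showing how the
 * workspace-based deflate API added above is driven.
 */
#include <linux/zlib.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	static char in[] = "hello hello hello hello";
	char out[256];
	z_stream strm;
	int ret;

	memset(&strm, 0, sizeof(strm));
	strm.workspace = malloc(zlib_deflate_workspacesize(15, 8));
	if (!strm.workspace)
		return 1;

	ret = zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
				15, 8, Z_DEFAULT_STRATEGY);
	if (ret != Z_OK)
		return 1;

	strm.next_in   = (unsigned char *) in;
	strm.avail_in  = sizeof(in);
	strm.next_out  = (unsigned char *) out;
	strm.avail_out = sizeof(out);

	ret = zlib_deflate(&strm, Z_FINISH);	/* single-shot compression */
	printf("deflate: ret=%d, %lu -> %lu bytes\n",
	       ret, strm.total_in, strm.total_out);

	zlib_deflateEnd(&strm);
	free(strm.workspace);
	return ret == Z_STREAM_END ? 0 : 1;
}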
diff --git a/linux/zlib_deflate/deftree.c b/linux/zlib_deflate/deftree.c
new file mode 100644
index 0000000..9b1756b
--- /dev/null
+++ b/linux/zlib_deflate/deftree.c
@@ -0,0 +1,1113 @@
+/* +++ trees.c */
+/* trees.c -- output deflated data using Huffman coding
+ * Copyright (C) 1995-1996 Jean-loup Gailly
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/*
+ * ALGORITHM
+ *
+ * The "deflation" process uses several Huffman trees. The more
+ * common source values are represented by shorter bit sequences.
+ *
+ * Each code tree is stored in a compressed form which is itself
+ * a Huffman encoding of the lengths of all the code strings (in
+ * ascending order by source values). The actual code strings are
+ * reconstructed from the lengths in the inflate process, as described
+ * in the deflate specification.
+ *
+ * REFERENCES
+ *
+ * Deutsch, L.P.,"'Deflate' Compressed Data Format Specification".
+ * Available in ftp.uu.net:/pub/archiving/zip/doc/deflate-1.1.doc
+ *
+ * Storer, James A.
+ * Data Compression: Methods and Theory, pp. 49-50.
+ * Computer Science Press, 1988. ISBN 0-7167-8156-5.
+ *
+ * Sedgewick, R.
+ * Algorithms, p290.
+ * Addison-Wesley, 1983. ISBN 0-201-06672-6.
+ */
+
+/* From: trees.c,v 1.11 1996/07/24 13:41:06 me Exp $ */
+
+/* #include "deflate.h" */
+
+#include <linux/zutil.h>
+#include <linux/bitrev.h>
+#include "defutil.h"
+
+#ifdef DEBUG_ZLIB
+# include <ctype.h>
+#endif
+
+/* ===========================================================================
+ * Constants
+ */
+
+#define MAX_BL_BITS 7
+/* Bit length codes must not exceed MAX_BL_BITS bits */
+
+#define END_BLOCK 256
+/* end of block literal code */
+
+#define REP_3_6 16
+/* repeat previous bit length 3-6 times (2 bits of repeat count) */
+
+#define REPZ_3_10 17
+/* repeat a zero length 3-10 times (3 bits of repeat count) */
+
+#define REPZ_11_138 18
+/* repeat a zero length 11-138 times (7 bits of repeat count) */
+
+static const int extra_lbits[LENGTH_CODES] /* extra bits for each length code */
+ = {0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0};
+
+static const int extra_dbits[D_CODES] /* extra bits for each distance code */
+ = {0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
+
+static const int extra_blbits[BL_CODES]/* extra bits for each bit length code */
+ = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,7};
+
+static const uch bl_order[BL_CODES]
+ = {16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15};
+/* The lengths of the bit length codes are sent in order of decreasing
+ * probability, to avoid transmitting the lengths for unused bit length codes.
+ */
+
+#define Buf_size (8 * 2*sizeof(char))
+/* Number of bits used within bi_buf. (bi_buf might be implemented on
+ * more than 16 bits on some systems.)
+ */
+
+/* ===========================================================================
+ * Local data. These are initialized only once.
+ */
+
+static ct_data static_ltree[L_CODES+2];
+/* The static literal tree. Since the bit lengths are imposed, there is no
+ * need for the L_CODES extra codes used during heap construction. However,
+ * the codes 286 and 287 are needed to build a canonical tree (see zlib_tr_init
+ * below).
+ */
+
+static ct_data static_dtree[D_CODES];
+/* The static distance tree. (Actually a trivial tree since all codes use
+ * 5 bits.)
+ */
+
+static uch dist_code[512];
+/* distance codes. The first 256 values correspond to the distances
+ * 3 .. 258, the last 256 values correspond to the top 8 bits of
+ * the 15 bit distances.
+ */
+
+static uch length_code[MAX_MATCH-MIN_MATCH+1];
+/* length code for each normalized match length (0 == MIN_MATCH) */
+
+static int base_length[LENGTH_CODES];
+/* First normalized length for each code (0 = MIN_MATCH) */
+
+static int base_dist[D_CODES];
+/* First normalized distance for each code (0 = distance of 1) */
+
+struct static_tree_desc_s {
+ const ct_data *static_tree; /* static tree or NULL */
+ const int *extra_bits; /* extra bits for each code or NULL */
+ int extra_base; /* base index for extra_bits */
+ int elems; /* max number of elements in the tree */
+ int max_length; /* max bit length for the codes */
+};
+
+static static_tree_desc static_l_desc =
+{static_ltree, extra_lbits, LITERALS+1, L_CODES, MAX_BITS};
+
+static static_tree_desc static_d_desc =
+{static_dtree, extra_dbits, 0, D_CODES, MAX_BITS};
+
+static static_tree_desc static_bl_desc =
+{(const ct_data *)0, extra_blbits, 0, BL_CODES, MAX_BL_BITS};
+
+/* ===========================================================================
+ * Local (static) routines in this file.
+ */
+
+static void tr_static_init (void);
+static void init_block (deflate_state *s);
+static void pqdownheap (deflate_state *s, ct_data *tree, int k);
+static void gen_bitlen (deflate_state *s, tree_desc *desc);
+static void gen_codes (ct_data *tree, int max_code, ush *bl_count);
+static void build_tree (deflate_state *s, tree_desc *desc);
+static void scan_tree (deflate_state *s, ct_data *tree, int max_code);
+static void send_tree (deflate_state *s, ct_data *tree, int max_code);
+static int build_bl_tree (deflate_state *s);
+static void send_all_trees (deflate_state *s, int lcodes, int dcodes,
+ int blcodes);
+static void compress_block (deflate_state *s, ct_data *ltree,
+ ct_data *dtree);
+static void set_data_type (deflate_state *s);
+static void bi_windup (deflate_state *s);
+static void bi_flush (deflate_state *s);
+static void copy_block (deflate_state *s, char *buf, unsigned len,
+ int header);
+
+#ifndef DEBUG_ZLIB
+# define send_code(s, c, tree) send_bits(s, tree[c].Code, tree[c].Len)
+ /* Send a code of the given tree. c and tree must not have side effects */
+
+#else /* DEBUG_ZLIB */
+# define send_code(s, c, tree) \
+ { if (z_verbose>2) fprintf(stderr,"\ncd %3d ",(c)); \
+ send_bits(s, tree[c].Code, tree[c].Len); }
+#endif
+
+#define d_code(dist) \
+ ((dist) < 256 ? dist_code[dist] : dist_code[256+((dist)>>7)])
+/* Mapping from a distance to a distance code. dist is the distance - 1 and
+ * must not have side effects. dist_code[256] and dist_code[257] are never
+ * used.
+ */
+
+/* ===========================================================================
+ * Send a value on a given number of bits.
+ * IN assertion: length <= 16 and value fits in length bits.
+ */
+#ifdef DEBUG_ZLIB
+static void send_bits (deflate_state *s, int value, int length);
+
+static void send_bits(
+ deflate_state *s,
+ int value, /* value to send */
+ int length /* number of bits */
+)
+{
+ Tracevv((stderr," l %2d v %4x ", length, value));
+ Assert(length > 0 && length <= 15, "invalid length");
+ s->bits_sent += (ulg)length;
+
+ /* If not enough room in bi_buf, use (valid) bits from bi_buf and
+ * (16 - bi_valid) bits from value, leaving (width - (16-bi_valid))
+ * unused bits in value.
+ */
+ if (s->bi_valid > (int)Buf_size - length) {
+ s->bi_buf |= (value << s->bi_valid);
+ put_short(s, s->bi_buf);
+ s->bi_buf = (ush)value >> (Buf_size - s->bi_valid);
+ s->bi_valid += length - Buf_size;
+ } else {
+ s->bi_buf |= value << s->bi_valid;
+ s->bi_valid += length;
+ }
+}
+#else /* !DEBUG_ZLIB */
+
+#define send_bits(s, value, length) \
+{ int len = length;\
+ if (s->bi_valid > (int)Buf_size - len) {\
+ int val = value;\
+ s->bi_buf |= (val << s->bi_valid);\
+ put_short(s, s->bi_buf);\
+ s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\
+ s->bi_valid += len - Buf_size;\
+ } else {\
+ s->bi_buf |= (value) << s->bi_valid;\
+ s->bi_valid += len;\
+ }\
+}
+#endif /* DEBUG_ZLIB */
+
+/* ===========================================================================
+ * Initialize the various 'constant' tables. In a multi-threaded environment,
+ * this function may be called by two threads concurrently, but this is
+ * harmless since both invocations do exactly the same thing.
+ */
+static void tr_static_init(void)
+{
+ static int static_init_done;
+ int n; /* iterates over tree elements */
+ int bits; /* bit counter */
+ int length; /* length value */
+ int code; /* code value */
+ int dist; /* distance index */
+ ush bl_count[MAX_BITS+1];
+ /* number of codes at each bit length for an optimal tree */
+
+ if (static_init_done) return;
+
+ /* Initialize the mapping length (0..255) -> length code (0..28) */
+ length = 0;
+ for (code = 0; code < LENGTH_CODES-1; code++) {
+ base_length[code] = length;
+ for (n = 0; n < (1<<extra_lbits[code]); n++) {
+ length_code[length++] = (uch)code;
+ }
+ }
+ Assert (length == 256, "tr_static_init: length != 256");
+ /* Note that the length 255 (match length 258) can be represented
+ * in two different ways: code 284 + 5 bits or code 285, so we
+ * overwrite length_code[255] to use the best encoding:
+ */
+ length_code[length-1] = (uch)code;
+
+ /* Initialize the mapping dist (0..32K) -> dist code (0..29) */
+ dist = 0;
+ for (code = 0 ; code < 16; code++) {
+ base_dist[code] = dist;
+ for (n = 0; n < (1<<extra_dbits[code]); n++) {
+ dist_code[dist++] = (uch)code;
+ }
+ }
+ Assert (dist == 256, "tr_static_init: dist != 256");
+ dist >>= 7; /* from now on, all distances are divided by 128 */
+ for ( ; code < D_CODES; code++) {
+ base_dist[code] = dist << 7;
+ for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) {
+ dist_code[256 + dist++] = (uch)code;
+ }
+ }
+ Assert (dist == 256, "tr_static_init: 256+dist != 512");
+
+ /* Construct the codes of the static literal tree */
+ for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0;
+ n = 0;
+ while (n <= 143) static_ltree[n++].Len = 8, bl_count[8]++;
+ while (n <= 255) static_ltree[n++].Len = 9, bl_count[9]++;
+ while (n <= 279) static_ltree[n++].Len = 7, bl_count[7]++;
+ while (n <= 287) static_ltree[n++].Len = 8, bl_count[8]++;
+ /* Codes 286 and 287 do not exist, but we must include them in the
+ * tree construction to get a canonical Huffman tree (longest code
+ * all ones)
+ */
+ gen_codes((ct_data *)static_ltree, L_CODES+1, bl_count);
+
+ /* The static distance tree is trivial: */
+ for (n = 0; n < D_CODES; n++) {
+ static_dtree[n].Len = 5;
+ static_dtree[n].Code = bitrev32((u32)n) >> (32 - 5);
+ }
+ static_init_done = 1;
+}
+
+/* ===========================================================================
+ * Initialize the tree data structures for a new zlib stream.
+ */
+void zlib_tr_init(
+ deflate_state *s
+)
+{
+ tr_static_init();
+
+ s->compressed_len = 0L;
+
+ s->l_desc.dyn_tree = s->dyn_ltree;
+ s->l_desc.stat_desc = &static_l_desc;
+
+ s->d_desc.dyn_tree = s->dyn_dtree;
+ s->d_desc.stat_desc = &static_d_desc;
+
+ s->bl_desc.dyn_tree = s->bl_tree;
+ s->bl_desc.stat_desc = &static_bl_desc;
+
+ s->bi_buf = 0;
+ s->bi_valid = 0;
+ s->last_eob_len = 8; /* enough lookahead for inflate */
+#ifdef DEBUG_ZLIB
+ s->bits_sent = 0L;
+#endif
+
+ /* Initialize the first block of the first file: */
+ init_block(s);
+}
+
+/* ===========================================================================
+ * Initialize a new block.
+ */
+static void init_block(
+ deflate_state *s
+)
+{
+ int n; /* iterates over tree elements */
+
+ /* Initialize the trees. */
+ for (n = 0; n < L_CODES; n++) s->dyn_ltree[n].Freq = 0;
+ for (n = 0; n < D_CODES; n++) s->dyn_dtree[n].Freq = 0;
+ for (n = 0; n < BL_CODES; n++) s->bl_tree[n].Freq = 0;
+
+ s->dyn_ltree[END_BLOCK].Freq = 1;
+ s->opt_len = s->static_len = 0L;
+ s->last_lit = s->matches = 0;
+}
+
+#define SMALLEST 1
+/* Index within the heap array of least frequent node in the Huffman tree */
+
+
+/* ===========================================================================
+ * Remove the smallest element from the heap and recreate the heap with
+ * one less element. Updates heap and heap_len.
+ */
+#define pqremove(s, tree, top) \
+{\
+ top = s->heap[SMALLEST]; \
+ s->heap[SMALLEST] = s->heap[s->heap_len--]; \
+ pqdownheap(s, tree, SMALLEST); \
+}
+
+/* ===========================================================================
+ * Compares two subtrees, using the tree depth as tie breaker when
+ * the subtrees have equal frequency. This minimizes the worst case length.
+ */
+#define smaller(tree, n, m, depth) \
+ (tree[n].Freq < tree[m].Freq || \
+ (tree[n].Freq == tree[m].Freq && depth[n] <= depth[m]))
+
+/* ===========================================================================
+ * Restore the heap property by moving down the tree starting at node k,
+ * exchanging a node with the smallest of its two sons if necessary, stopping
+ * when the heap property is re-established (each father smaller than its
+ * two sons).
+ */
+static void pqdownheap(
+ deflate_state *s,
+ ct_data *tree, /* the tree to restore */
+ int k /* node to move down */
+)
+{
+ int v = s->heap[k];
+ int j = k << 1; /* left son of k */
+ while (j <= s->heap_len) {
+ /* Set j to the smallest of the two sons: */
+ if (j < s->heap_len &&
+ smaller(tree, s->heap[j+1], s->heap[j], s->depth)) {
+ j++;
+ }
+ /* Exit if v is smaller than both sons */
+ if (smaller(tree, v, s->heap[j], s->depth)) break;
+
+ /* Exchange v with the smallest son */
+ s->heap[k] = s->heap[j]; k = j;
+
+ /* And continue down the tree, setting j to the left son of k */
+ j <<= 1;
+ }
+ s->heap[k] = v;
+}
+
+/* ===========================================================================
+ * Compute the optimal bit lengths for a tree and update the total bit length
+ * for the current block.
+ * IN assertion: the fields freq and dad are set, heap[heap_max] and
+ * above are the tree nodes sorted by increasing frequency.
+ * OUT assertions: the field len is set to the optimal bit length, the
+ * array bl_count contains the frequencies for each bit length.
+ * The length opt_len is updated; static_len is also updated if stree is
+ * not null.
+ */
+static void gen_bitlen(
+ deflate_state *s,
+ tree_desc *desc /* the tree descriptor */
+)
+{
+ ct_data *tree = desc->dyn_tree;
+ int max_code = desc->max_code;
+ const ct_data *stree = desc->stat_desc->static_tree;
+ const int *extra = desc->stat_desc->extra_bits;
+ int base = desc->stat_desc->extra_base;
+ int max_length = desc->stat_desc->max_length;
+ int h; /* heap index */
+ int n, m; /* iterate over the tree elements */
+ int bits; /* bit length */
+ int xbits; /* extra bits */
+ ush f; /* frequency */
+ int overflow = 0; /* number of elements with bit length too large */
+
+ for (bits = 0; bits <= MAX_BITS; bits++) s->bl_count[bits] = 0;
+
+ /* In a first pass, compute the optimal bit lengths (which may
+ * overflow in the case of the bit length tree).
+ */
+ tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */
+
+ for (h = s->heap_max+1; h < HEAP_SIZE; h++) {
+ n = s->heap[h];
+ bits = tree[tree[n].Dad].Len + 1;
+ if (bits > max_length) bits = max_length, overflow++;
+ tree[n].Len = (ush)bits;
+ /* We overwrite tree[n].Dad which is no longer needed */
+
+ if (n > max_code) continue; /* not a leaf node */
+
+ s->bl_count[bits]++;
+ xbits = 0;
+ if (n >= base) xbits = extra[n-base];
+ f = tree[n].Freq;
+ s->opt_len += (ulg)f * (bits + xbits);
+ if (stree) s->static_len += (ulg)f * (stree[n].Len + xbits);
+ }
+ if (overflow == 0) return;
+
+ Trace((stderr,"\nbit length overflow\n"));
+ /* This happens for example on obj2 and pic of the Calgary corpus */
+
+ /* Find the first bit length which could increase: */
+ do {
+ bits = max_length-1;
+ while (s->bl_count[bits] == 0) bits--;
+ s->bl_count[bits]--; /* move one leaf down the tree */
+ s->bl_count[bits+1] += 2; /* move one overflow item as its brother */
+ s->bl_count[max_length]--;
+ /* The brother of the overflow item also moves one step up,
+ * but this does not affect bl_count[max_length]
+ */
+ overflow -= 2;
+ } while (overflow > 0);
+
+ /* Now recompute all bit lengths, scanning in increasing frequency.
+ * h is still equal to HEAP_SIZE. (It is simpler to reconstruct all
+ * lengths instead of fixing only the wrong ones. This idea is taken
+ * from 'ar' written by Haruhiko Okumura.)
+ */
+ for (bits = max_length; bits != 0; bits--) {
+ n = s->bl_count[bits];
+ while (n != 0) {
+ m = s->heap[--h];
+ if (m > max_code) continue;
+ if (tree[m].Len != (unsigned) bits) {
+ Trace((stderr,"code %d bits %d->%d\n", m, tree[m].Len, bits));
+ s->opt_len += ((long)bits - (long)tree[m].Len)
+ *(long)tree[m].Freq;
+ tree[m].Len = (ush)bits;
+ }
+ n--;
+ }
+ }
+}
+
+/* ===========================================================================
+ * Generate the codes for a given tree and bit counts (which need not be
+ * optimal).
+ * IN assertion: the array bl_count contains the bit length statistics for
+ * the given tree and the field len is set for all tree elements.
+ * OUT assertion: the field code is set for all tree elements of non
+ * zero code length.
+ */
+static void gen_codes(
+ ct_data *tree, /* the tree to decorate */
+ int max_code, /* largest code with non zero frequency */
+ ush *bl_count /* number of codes at each bit length */
+)
+{
+ ush next_code[MAX_BITS+1]; /* next code value for each bit length */
+ ush code = 0; /* running code value */
+ int bits; /* bit index */
+ int n; /* code index */
+
+ /* The distribution counts are first used to generate the code values
+ * without bit reversal.
+ */
+ for (bits = 1; bits <= MAX_BITS; bits++) {
+ next_code[bits] = code = (code + bl_count[bits-1]) << 1;
+ }
+ /* Check that the bit counts in bl_count are consistent. The last code
+ * must be all ones.
+ */
+ Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1,
+ "inconsistent bit counts");
+ Tracev((stderr,"\ngen_codes: max_code %d ", max_code));
+
+ for (n = 0; n <= max_code; n++) {
+ int len = tree[n].Len;
+ if (len == 0) continue;
+ /* Now reverse the bits */
+ tree[n].Code = bitrev32((u32)(next_code[len]++)) >> (32 - len);
+
+ Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
+ n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
+ }
+}
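
gen_codes() above turns bit lengths into a canonical code: within each length the codes are consecutive, the first code of each length follows from the counts of shorter lengths, and the stored Code is bit-reversed because the bit buffer emits LSB first. The same recurrence in isolation, with toy lengths standing in for gen_bitlen() output (not part of this patch):

/* Standalone illustration of the canonical-code recurrence in gen_codes(). */
#include <stdio.h>

#define MAXBITS 15

static void print_bits(unsigned code, int len)
{
	while (len--)
		putchar((code >> len) & 1 ? '1' : '0');
}

static unsigned bit_reverse(unsigned code, int len)
{
	unsigned r = 0;

	while (len--) {
		r = (r << 1) | (code & 1);
		code >>= 1;
	}
	return r;
}

int main(void)
{
	const int len[] = { 2, 1, 3, 3 };	/* bit length per symbol */
	const int nsym = 4;
	unsigned bl_count[MAXBITS + 1] = { 0 };
	unsigned next_code[MAXBITS + 1] = { 0 };
	unsigned code = 0;
	int bits, n;

	for (n = 0; n < nsym; n++)
		bl_count[len[n]]++;

	/* first code of each bit length -- same recurrence as gen_codes() */
	for (bits = 1; bits <= MAXBITS; bits++)
		next_code[bits] = code = (code + bl_count[bits - 1]) << 1;

	for (n = 0; n < nsym; n++) {
		unsigned c = next_code[len[n]]++;

		printf("symbol %d: len %d, canonical ", n, len[n]);
		print_bits(c, len[n]);
		printf(", emitted (reversed) ");
		print_bits(bit_reverse(c, len[n]), len[n]);
		putchar('\n');
	}
	return 0;
}

Run, this assigns the prefix-free set 10, 0, 110, 111 to the four symbols, as a canonical Huffman code must.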
+
+/* ===========================================================================
+ * Construct one Huffman tree and assign the code bit strings and lengths.
+ * Update the total bit length for the current block.
+ * IN assertion: the field freq is set for all tree elements.
+ * OUT assertions: the fields len and code are set to the optimal bit length
+ * and corresponding code. The length opt_len is updated; static_len is
+ * also updated if stree is not null. The field max_code is set.
+ */
+static void build_tree(
+ deflate_state *s,
+ tree_desc *desc /* the tree descriptor */
+)
+{
+ ct_data *tree = desc->dyn_tree;
+ const ct_data *stree = desc->stat_desc->static_tree;
+ int elems = desc->stat_desc->elems;
+ int n, m; /* iterate over heap elements */
+ int max_code = -1; /* largest code with non zero frequency */
+ int node; /* new node being created */
+
+ /* Construct the initial heap, with least frequent element in
+ * heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1].
+ * heap[0] is not used.
+ */
+ s->heap_len = 0, s->heap_max = HEAP_SIZE;
+
+ for (n = 0; n < elems; n++) {
+ if (tree[n].Freq != 0) {
+ s->heap[++(s->heap_len)] = max_code = n;
+ s->depth[n] = 0;
+ } else {
+ tree[n].Len = 0;
+ }
+ }
+
+ /* The pkzip format requires that at least one distance code exists,
+ * and that at least one bit should be sent even if there is only one
+ * possible code. So to avoid special checks later on we force at least
+ * two codes of non zero frequency.
+ */
+ while (s->heap_len < 2) {
+ node = s->heap[++(s->heap_len)] = (max_code < 2 ? ++max_code : 0);
+ tree[node].Freq = 1;
+ s->depth[node] = 0;
+ s->opt_len--; if (stree) s->static_len -= stree[node].Len;
+ /* node is 0 or 1 so it does not have extra bits */
+ }
+ desc->max_code = max_code;
+
+ /* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree;
+ * establish sub-heaps of increasing lengths:
+ */
+ for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n);
+
+ /* Construct the Huffman tree by repeatedly combining the least two
+ * frequent nodes.
+ */
+ node = elems; /* next internal node of the tree */
+ do {
+ pqremove(s, tree, n); /* n = node of least frequency */
+ m = s->heap[SMALLEST]; /* m = node of next least frequency */
+
+ s->heap[--(s->heap_max)] = n; /* keep the nodes sorted by frequency */
+ s->heap[--(s->heap_max)] = m;
+
+ /* Create a new node father of n and m */
+ tree[node].Freq = tree[n].Freq + tree[m].Freq;
+ s->depth[node] = (uch) (max(s->depth[n], s->depth[m]) + 1);
+ tree[n].Dad = tree[m].Dad = (ush)node;
+#ifdef DUMP_BL_TREE
+ if (tree == s->bl_tree) {
+ fprintf(stderr,"\nnode %d(%d), sons %d(%d) %d(%d)",
+ node, tree[node].Freq, n, tree[n].Freq, m, tree[m].Freq);
+ }
+#endif
+ /* and insert the new node in the heap */
+ s->heap[SMALLEST] = node++;
+ pqdownheap(s, tree, SMALLEST);
+
+ } while (s->heap_len >= 2);
+
+ s->heap[--(s->heap_max)] = s->heap[SMALLEST];
+
+ /* At this point, the fields freq and dad are set. We can now
+ * generate the bit lengths.
+ */
+ gen_bitlen(s, (tree_desc *)desc);
+
+ /* The field len is now set, we can generate the bit codes */
+ gen_codes ((ct_data *)tree, max_code, s->bl_count);
+}
+
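+/*
+ * Worked example (the frequencies are made up): for four symbols with Freq
+ * 5, 2, 1 and 1, the loop above first combines the two rarest leaves into a
+ * node of frequency 2, then that node with the frequency-2 leaf (giving 4),
+ * and finally that with the frequency-5 leaf to form the root.  gen_bitlen()
+ * then assigns the lengths 1, 2, 3, 3 and gen_codes() the canonical codes
+ * 0, 10, 110, 111 (before bit reversal).
+ */
+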
+/* ===========================================================================
+ * Scan a literal or distance tree to determine the frequencies of the codes
+ * in the bit length tree.
+ */
+static void scan_tree(
+ deflate_state *s,
+ ct_data *tree, /* the tree to be scanned */
+ int max_code /* and its largest code of non zero frequency */
+)
+{
+ int n; /* iterates over all tree elements */
+ int prevlen = -1; /* last emitted length */
+ int curlen; /* length of current code */
+ int nextlen = tree[0].Len; /* length of next code */
+ int count = 0; /* repeat count of the current code */
+ int max_count = 7; /* max repeat count */
+ int min_count = 4; /* min repeat count */
+
+ if (nextlen == 0) max_count = 138, min_count = 3;
+ tree[max_code+1].Len = (ush)0xffff; /* guard */
+
+ for (n = 0; n <= max_code; n++) {
+ curlen = nextlen; nextlen = tree[n+1].Len;
+ if (++count < max_count && curlen == nextlen) {
+ continue;
+ } else if (count < min_count) {
+ s->bl_tree[curlen].Freq += count;
+ } else if (curlen != 0) {
+ if (curlen != prevlen) s->bl_tree[curlen].Freq++;
+ s->bl_tree[REP_3_6].Freq++;
+ } else if (count <= 10) {
+ s->bl_tree[REPZ_3_10].Freq++;
+ } else {
+ s->bl_tree[REPZ_11_138].Freq++;
+ }
+ count = 0; prevlen = curlen;
+ if (nextlen == 0) {
+ max_count = 138, min_count = 3;
+ } else if (curlen == nextlen) {
+ max_count = 6, min_count = 3;
+ } else {
+ max_count = 7, min_count = 4;
+ }
+ }
+}
+
+/* ===========================================================================
+ * Send a literal or distance tree in compressed form, using the codes in
+ * bl_tree.
+ */
+static void send_tree(
+ deflate_state *s,
+ ct_data *tree, /* the tree to be scanned */
+ int max_code /* and its largest code of non zero frequency */
+)
+{
+ int n; /* iterates over all tree elements */
+ int prevlen = -1; /* last emitted length */
+ int curlen; /* length of current code */
+ int nextlen = tree[0].Len; /* length of next code */
+ int count = 0; /* repeat count of the current code */
+ int max_count = 7; /* max repeat count */
+ int min_count = 4; /* min repeat count */
+
+ /* tree[max_code+1].Len = -1; */ /* guard already set */
+ if (nextlen == 0) max_count = 138, min_count = 3;
+
+ for (n = 0; n <= max_code; n++) {
+ curlen = nextlen; nextlen = tree[n+1].Len;
+ if (++count < max_count && curlen == nextlen) {
+ continue;
+ } else if (count < min_count) {
+ do { send_code(s, curlen, s->bl_tree); } while (--count != 0);
+
+ } else if (curlen != 0) {
+ if (curlen != prevlen) {
+ send_code(s, curlen, s->bl_tree); count--;
+ }
+ Assert(count >= 3 && count <= 6, " 3_6?");
+ send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2);
+
+ } else if (count <= 10) {
+ send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3);
+
+ } else {
+ send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7);
+ }
+ count = 0; prevlen = curlen;
+ if (nextlen == 0) {
+ max_count = 138, min_count = 3;
+ } else if (curlen == nextlen) {
+ max_count = 6, min_count = 3;
+ } else {
+ max_count = 7, min_count = 4;
+ }
+ }
+}
+
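+/*
+ * Worked example of this run-length coding (the lengths are made up).  For
+ * the code-length sequence
+ *
+ *     8 8 8 8 8 8 8  0 0 0 0 0  5 5
+ *
+ * send_tree() emits:
+ *
+ *     8                        first length of the run
+ *     REP_3_6,    extra = 3    repeat the previous length 6 more times
+ *     REPZ_3_10,  extra = 2    run of 5 zero lengths
+ *     5 5                      a run of 2 is below min_count, sent literally
+ *
+ * The 2-, 3- and 7-bit extra fields hold count-3, count-3 and count-11
+ * respectively, matching the send_bits() calls above.
+ */
+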
+/* ===========================================================================
+ * Construct the Huffman tree for the bit lengths and return the index in
+ * bl_order of the last bit length code to send.
+ */
+static int build_bl_tree(
+ deflate_state *s
+)
+{
+ int max_blindex; /* index of last bit length code of non zero freq */
+
+ /* Determine the bit length frequencies for literal and distance trees */
+ scan_tree(s, (ct_data *)s->dyn_ltree, s->l_desc.max_code);
+ scan_tree(s, (ct_data *)s->dyn_dtree, s->d_desc.max_code);
+
+ /* Build the bit length tree: */
+ build_tree(s, (tree_desc *)(&(s->bl_desc)));
+ /* opt_len now includes the length of the tree representations, except
+ * the lengths of the bit lengths codes and the 5+5+4 bits for the counts.
+ */
+
+ /* Determine the number of bit length codes to send. The pkzip format
+ * requires that at least 4 bit length codes be sent. (appnote.txt says
+ * 3 but the actual value used is 4.)
+ */
+ for (max_blindex = BL_CODES-1; max_blindex >= 3; max_blindex--) {
+ if (s->bl_tree[bl_order[max_blindex]].Len != 0) break;
+ }
+ /* Update opt_len to include the bit length tree and counts */
+ s->opt_len += 3*(max_blindex+1) + 5+5+4;
+ Tracev((stderr, "\ndyn trees: dyn %ld, stat %ld",
+ s->opt_len, s->static_len));
+
+ return max_blindex;
+}
+
+/* ===========================================================================
+ * Send the header for a block using dynamic Huffman trees: the counts, the
+ * lengths of the bit length codes, the literal tree and the distance tree.
+ * IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4.
+ */
+static void send_all_trees(
+ deflate_state *s,
+ int lcodes, /* number of literal/length codes */
+ int dcodes, /* number of distance codes */
+ int blcodes /* number of bit length codes */
+)
+{
+ int rank; /* index in bl_order */
+
+ Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes");
+ Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES,
+ "too many codes");
+ Tracev((stderr, "\nbl counts: "));
+ send_bits(s, lcodes-257, 5); /* not +255 as stated in appnote.txt */
+ send_bits(s, dcodes-1, 5);
+ send_bits(s, blcodes-4, 4); /* not -3 as stated in appnote.txt */
+ for (rank = 0; rank < blcodes; rank++) {
+ Tracev((stderr, "\nbl code %2d ", bl_order[rank]));
+ send_bits(s, s->bl_tree[bl_order[rank]].Len, 3);
+ }
+ Tracev((stderr, "\nbl tree: sent %ld", s->bits_sent));
+
+ send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */
+ Tracev((stderr, "\nlit tree: sent %ld", s->bits_sent));
+
+ send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */
+ Tracev((stderr, "\ndist tree: sent %ld", s->bits_sent));
+}
+
+/* ===========================================================================
+ * Send a stored block
+ */
+void zlib_tr_stored_block(
+ deflate_state *s,
+ char *buf, /* input block */
+ ulg stored_len, /* length of input block */
+ int eof /* true if this is the last block for a file */
+)
+{
+ send_bits(s, (STORED_BLOCK<<1)+eof, 3); /* send block type */
+ s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L;
+ s->compressed_len += (stored_len + 4) << 3;
+
+ copy_block(s, buf, (unsigned)stored_len, 1); /* with header */
+}
+
+/* Send just the `stored block' type code without any length bytes or data.
+ */
+void zlib_tr_stored_type_only(
+ deflate_state *s
+)
+{
+ send_bits(s, (STORED_BLOCK << 1), 3);
+ bi_windup(s);
+ s->compressed_len = (s->compressed_len + 3) & ~7L;
+}
+
+
+/* ===========================================================================
+ * Send one empty static block to give enough lookahead for inflate.
+ * This takes 10 bits, of which 7 may remain in the bit buffer.
+ * The current inflate code requires 9 bits of lookahead. If the
+ * last two codes for the previous block (real code plus EOB) were coded
+ * on 5 bits or less, inflate may have only 5+3 bits of lookahead to decode
+ * the last real code. In this case we send two empty static blocks instead
+ * of one. (There are no problems if the previous block is stored or fixed.)
+ * To simplify the code, we assume the worst case of last real code encoded
+ * on one bit only.
+ */
+void zlib_tr_align(
+ deflate_state *s
+)
+{
+ send_bits(s, STATIC_TREES<<1, 3);
+ send_code(s, END_BLOCK, static_ltree);
+ s->compressed_len += 10L; /* 3 for block type, 7 for EOB */
+ bi_flush(s);
+ /* Of the 10 bits for the empty block, we have already sent
+ * (10 - bi_valid) bits. The lookahead for the last real code (before
+ * the EOB of the previous block) was thus at least one plus the length
+ * of the EOB plus what we have just sent of the empty static block.
+ */
+ if (1 + s->last_eob_len + 10 - s->bi_valid < 9) {
+ send_bits(s, STATIC_TREES<<1, 3);
+ send_code(s, END_BLOCK, static_ltree);
+ s->compressed_len += 10L;
+ bi_flush(s);
+ }
+ s->last_eob_len = 7;
+}
+
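+/*
+ * Example of the check above: with last_eob_len == 4 and bi_valid == 7 after
+ * bi_flush(), 1 + 4 + 10 - 7 == 8 < 9, so a second empty static block is
+ * emitted; with last_eob_len == 7 (the static EOB) the sum is at least 11,
+ * so one empty block is always enough.
+ */
+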
+/* ===========================================================================
+ * Determine the best encoding for the current block: dynamic trees, static
+ * trees or store, and output the encoded block to the zip file. This function
+ * returns the total compressed length for the file so far.
+ */
+ulg zlib_tr_flush_block(
+ deflate_state *s,
+ char *buf, /* input block, or NULL if too old */
+ ulg stored_len, /* length of input block */
+ int eof /* true if this is the last block for a file */
+)
+{
+ ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */
+ int max_blindex = 0; /* index of last bit length code of non zero freq */
+
+ /* Build the Huffman trees unless a stored block is forced */
+ if (s->level > 0) {
+
+ /* Check if the file is ascii or binary */
+ if (s->data_type == Z_UNKNOWN) set_data_type(s);
+
+ /* Construct the literal and distance trees */
+ build_tree(s, (tree_desc *)(&(s->l_desc)));
+ Tracev((stderr, "\nlit data: dyn %ld, stat %ld", s->opt_len,
+ s->static_len));
+
+ build_tree(s, (tree_desc *)(&(s->d_desc)));
+ Tracev((stderr, "\ndist data: dyn %ld, stat %ld", s->opt_len,
+ s->static_len));
+ /* At this point, opt_len and static_len are the total bit lengths of
+ * the compressed block data, excluding the tree representations.
+ */
+
+ /* Build the bit length tree for the above two trees, and get the index
+ * in bl_order of the last bit length code to send.
+ */
+ max_blindex = build_bl_tree(s);
+
+ /* Determine the best encoding. Compute first the block length in bytes*/
+ opt_lenb = (s->opt_len+3+7)>>3;
+ static_lenb = (s->static_len+3+7)>>3;
+
+ Tracev((stderr, "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
+ opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len,
+ s->last_lit));
+
+ if (static_lenb <= opt_lenb) opt_lenb = static_lenb;
+
+ } else {
+ Assert(buf != (char*)0, "lost buf");
+ opt_lenb = static_lenb = stored_len + 5; /* force a stored block */
+ }
+
+ /* If compression failed and this is the first and last block,
+ * and if the .zip file can be seeked (to rewrite the local header),
+ * the whole file is transformed into a stored file:
+ */
+#ifdef STORED_FILE_OK
+# ifdef FORCE_STORED_FILE
+ if (eof && s->compressed_len == 0L) { /* force stored file */
+# else
+ if (stored_len <= opt_lenb && eof && s->compressed_len==0L && seekable()) {
+# endif
+ /* Since LIT_BUFSIZE <= 2*WSIZE, the input data must be there: */
+ if (buf == (char*)0) error ("block vanished");
+
+ copy_block(s, buf, (unsigned)stored_len, 0); /* without header */
+ s->compressed_len = stored_len << 3;
+ s->method = STORED;
+ } else
+#endif /* STORED_FILE_OK */
+
+#ifdef FORCE_STORED
+ if (buf != (char*)0) { /* force stored block */
+#else
+ if (stored_len+4 <= opt_lenb && buf != (char*)0) {
+ /* 4: two words for the lengths */
+#endif
+ /* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE.
+ * Otherwise we can't have processed more than WSIZE input bytes since
+ * the last block flush, because compression would have been
+ * successful. If LIT_BUFSIZE <= WSIZE, it is never too late to
+ * transform a block into a stored block.
+ */
+ zlib_tr_stored_block(s, buf, stored_len, eof);
+
+#ifdef FORCE_STATIC
+ } else if (static_lenb >= 0) { /* force static trees */
+#else
+ } else if (static_lenb == opt_lenb) {
+#endif
+ send_bits(s, (STATIC_TREES<<1)+eof, 3);
+ compress_block(s, (ct_data *)static_ltree, (ct_data *)static_dtree);
+ s->compressed_len += 3 + s->static_len;
+ } else {
+ send_bits(s, (DYN_TREES<<1)+eof, 3);
+ send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1,
+ max_blindex+1);
+ compress_block(s, (ct_data *)s->dyn_ltree, (ct_data *)s->dyn_dtree);
+ s->compressed_len += 3 + s->opt_len;
+ }
+ Assert (s->compressed_len == s->bits_sent, "bad compressed size");
+ init_block(s);
+
+ if (eof) {
+ bi_windup(s);
+ s->compressed_len += 7; /* align on byte boundary */
+ }
+ Tracev((stderr,"\ncomprlen %lu(%lu) ", s->compressed_len>>3,
+ s->compressed_len-7*eof));
+
+ return s->compressed_len >> 3;
+}
+
+/* ===========================================================================
+ * Save the match info and tally the frequency counts. Return true if
+ * the current block must be flushed.
+ */
+int zlib_tr_tally(
+ deflate_state *s,
+ unsigned dist, /* distance of matched string */
+ unsigned lc /* match length-MIN_MATCH or unmatched char (if dist==0) */
+)
+{
+ s->d_buf[s->last_lit] = (ush)dist;
+ s->l_buf[s->last_lit++] = (uch)lc;
+ if (dist == 0) {
+ /* lc is the unmatched char */
+ s->dyn_ltree[lc].Freq++;
+ } else {
+ s->matches++;
+ /* Here, lc is the match length - MIN_MATCH */
+ dist--; /* dist = match distance - 1 */
+ Assert((ush)dist < (ush)MAX_DIST(s) &&
+ (ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) &&
+ (ush)d_code(dist) < (ush)D_CODES, "zlib_tr_tally: bad match");
+
+ s->dyn_ltree[length_code[lc]+LITERALS+1].Freq++;
+ s->dyn_dtree[d_code(dist)].Freq++;
+ }
+
+ /* Try to guess if it is profitable to stop the current block here */
+ if ((s->last_lit & 0xfff) == 0 && s->level > 2) {
+ /* Compute an upper bound for the compressed length */
+ ulg out_length = (ulg)s->last_lit*8L;
+ ulg in_length = (ulg)((long)s->strstart - s->block_start);
+ int dcode;
+ for (dcode = 0; dcode < D_CODES; dcode++) {
+ out_length += (ulg)s->dyn_dtree[dcode].Freq *
+ (5L+extra_dbits[dcode]);
+ }
+ out_length >>= 3;
+ Tracev((stderr,"\nlast_lit %u, in %ld, out ~%ld(%ld%%) ",
+ s->last_lit, in_length, out_length,
+ 100L - out_length*100L/in_length));
+ if (s->matches < s->last_lit/2 && out_length < in_length/2) return 1;
+ }
+ return (s->last_lit == s->lit_bufsize-1);
+ /* We avoid equality with lit_bufsize because of wraparound at 64K
+ * on 16 bit machines and because stored blocks are restricted to
+ * 64K-1 bytes.
+ */
+}
+
+/* ===========================================================================
+ * Send the block data compressed using the given Huffman trees
+ */
+static void compress_block(
+ deflate_state *s,
+ ct_data *ltree, /* literal tree */
+ ct_data *dtree /* distance tree */
+)
+{
+ unsigned dist; /* distance of matched string */
+ int lc; /* match length or unmatched char (if dist == 0) */
+ unsigned lx = 0; /* running index in l_buf */
+ unsigned code; /* the code to send */
+ int extra; /* number of extra bits to send */
+
+ if (s->last_lit != 0) do {
+ dist = s->d_buf[lx];
+ lc = s->l_buf[lx++];
+ if (dist == 0) {
+ send_code(s, lc, ltree); /* send a literal byte */
+ Tracecv(isgraph(lc), (stderr," '%c' ", lc));
+ } else {
+ /* Here, lc is the match length - MIN_MATCH */
+ code = length_code[lc];
+ send_code(s, code+LITERALS+1, ltree); /* send the length code */
+ extra = extra_lbits[code];
+ if (extra != 0) {
+ lc -= base_length[code];
+ send_bits(s, lc, extra); /* send the extra length bits */
+ }
+ dist--; /* dist is now the match distance - 1 */
+ code = d_code(dist);
+ Assert (code < D_CODES, "bad d_code");
+
+ send_code(s, code, dtree); /* send the distance code */
+ extra = extra_dbits[code];
+ if (extra != 0) {
+ dist -= base_dist[code];
+ send_bits(s, dist, extra); /* send the extra distance bits */
+ }
+ } /* literal or match pair ? */
+
+ /* Check that the overlay between pending_buf and d_buf+l_buf is ok: */
+ Assert(s->pending < s->lit_bufsize + 2*lx, "pendingBuf overflow");
+
+ } while (lx < s->last_lit);
+
+ send_code(s, END_BLOCK, ltree);
+ s->last_eob_len = ltree[END_BLOCK].Len;
+}
+
+/* ===========================================================================
+ * Set the data type to ASCII or BINARY, using a crude approximation:
+ * binary if more than 20% of the bytes are <= 6 or >= 128, ascii otherwise.
+ * IN assertion: the fields freq of dyn_ltree are set and the total of all
+ * frequencies does not exceed 64K (to fit in an int on 16 bit machines).
+ */
+static void set_data_type(
+ deflate_state *s
+)
+{
+ int n = 0;
+ unsigned ascii_freq = 0;
+ unsigned bin_freq = 0;
+ while (n < 7) bin_freq += s->dyn_ltree[n++].Freq;
+ while (n < 128) ascii_freq += s->dyn_ltree[n++].Freq;
+ while (n < LITERALS) bin_freq += s->dyn_ltree[n++].Freq;
+ s->data_type = (Byte)(bin_freq > (ascii_freq >> 2) ? Z_BINARY : Z_ASCII);
+}
+
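+/*
+ * Note on the 20% figure above: ignoring integer truncation, the test
+ * bin_freq > (ascii_freq >> 2) is the same as
+ * bin_freq / (bin_freq + ascii_freq) > 1/5, i.e. the block is marked binary
+ * when more than 20% of the tallied bytes fall in the ranges 0..6 and
+ * 128..LITERALS-1.
+ */
+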
+/* ===========================================================================
+ * Copy a stored block, storing first the length and its
+ * one's complement if requested.
+ */
+static void copy_block(
+ deflate_state *s,
+ char *buf, /* the input data */
+ unsigned len, /* its length */
+ int header /* true if block header must be written */
+)
+{
+ bi_windup(s); /* align on byte boundary */
+ s->last_eob_len = 8; /* enough lookahead for inflate */
+
+ if (header) {
+ put_short(s, (ush)len);
+ put_short(s, (ush)~len);
+#ifdef DEBUG_ZLIB
+ s->bits_sent += 2*16;
+#endif
+ }
+#ifdef DEBUG_ZLIB
+ s->bits_sent += (ulg)len<<3;
+#endif
+ /* bundle up the put_byte(s, *buf++) calls */
+ memcpy(&s->pending_buf[s->pending], buf, len);
+ s->pending += len;
+}
+
diff --git a/linux/zlib_deflate/defutil.h b/linux/zlib_deflate/defutil.h
new file mode 100644
index 0000000..a8c3708
--- /dev/null
+++ b/linux/zlib_deflate/defutil.h
@@ -0,0 +1,327 @@
+
+
+
+#define Assert(err, str)
+#define Trace(dummy)
+#define Tracev(dummy)
+#define Tracecv(err, dummy)
+#define Tracevv(dummy)
+
+
+
+#define LENGTH_CODES 29
+/* number of length codes, not counting the special END_BLOCK code */
+
+#define LITERALS 256
+/* number of literal bytes 0..255 */
+
+#define L_CODES (LITERALS+1+LENGTH_CODES)
+/* number of Literal or Length codes, including the END_BLOCK code */
+
+#define D_CODES 30
+/* number of distance codes */
+
+#define BL_CODES 19
+/* number of codes used to transfer the bit lengths */
+
+#define HEAP_SIZE (2*L_CODES+1)
+/* maximum heap size */
+
+#define MAX_BITS 15
+/* No code may exceed MAX_BITS bits */
+
+#define INIT_STATE 42
+#define BUSY_STATE 113
+#define FINISH_STATE 666
+/* Stream status */
+
+
+/* Data structure describing a single value and its code string. */
+typedef struct ct_data_s {
+ union {
+ ush freq; /* frequency count */
+ ush code; /* bit string */
+ } fc;
+ union {
+ ush dad; /* father node in Huffman tree */
+ ush len; /* length of bit string */
+ } dl;
+} ct_data;
+
+#define Freq fc.freq
+#define Code fc.code
+#define Dad dl.dad
+#define Len dl.len
+
+typedef struct static_tree_desc_s static_tree_desc;
+
+typedef struct tree_desc_s {
+ ct_data *dyn_tree; /* the dynamic tree */
+ int max_code; /* largest code with non zero frequency */
+ static_tree_desc *stat_desc; /* the corresponding static tree */
+} tree_desc;
+
+typedef ush Pos;
+typedef unsigned IPos;
+
+/* A Pos is an index in the character window. We use short instead of int to
+ * save space in the various tables. IPos is used only for parameter passing.
+ */
+
+typedef struct deflate_state {
+ z_streamp strm; /* pointer back to this zlib stream */
+ int status; /* as the name implies */
+ Byte *pending_buf; /* output still pending */
+ ulg pending_buf_size; /* size of pending_buf */
+ Byte *pending_out; /* next pending byte to output to the stream */
+ int pending; /* nb of bytes in the pending buffer */
+ int noheader; /* suppress zlib header and adler32 */
+ Byte data_type; /* UNKNOWN, BINARY or ASCII */
+ Byte method; /* STORED (for zip only) or DEFLATED */
+ int last_flush; /* value of flush param for previous deflate call */
+
+ /* used by deflate.c: */
+
+ uInt w_size; /* LZ77 window size (32K by default) */
+ uInt w_bits; /* log2(w_size) (8..16) */
+ uInt w_mask; /* w_size - 1 */
+
+ Byte *window;
+ /* Sliding window. Input bytes are read into the second half of the window,
+ * and move to the first half later to keep a dictionary of at least wSize
+ * bytes. With this organization, matches are limited to a distance of
+ * wSize-MAX_MATCH bytes, but this ensures that IO is always
+ * performed with a length multiple of the block size. Also, it limits
+ * the window size to 64K, which is quite useful on MSDOS.
+ * To do: use the user input buffer as sliding window.
+ */
+
+ ulg window_size;
+ /* Actual size of window: 2*wSize, except when the user input buffer
+ * is directly used as sliding window.
+ */
+
+ Pos *prev;
+ /* Link to older string with same hash index. To limit the size of this
+ * array to 64K, this link is maintained only for the last 32K strings.
+ * An index in this array is thus a window index modulo 32K.
+ */
+
+ Pos *head; /* Heads of the hash chains or NIL. */
+
+ uInt ins_h; /* hash index of string to be inserted */
+ uInt hash_size; /* number of elements in hash table */
+ uInt hash_bits; /* log2(hash_size) */
+ uInt hash_mask; /* hash_size-1 */
+
+ uInt hash_shift;
+ /* Number of bits by which ins_h must be shifted at each input
+ * step. It must be such that after MIN_MATCH steps, the oldest
+ * byte no longer takes part in the hash key, that is:
+ * hash_shift * MIN_MATCH >= hash_bits
+ */
+
+ long block_start;
+ /* Window position at the beginning of the current output block. Gets
+ * negative when the window is moved backwards.
+ */
+
+ uInt match_length; /* length of best match */
+ IPos prev_match; /* previous match */
+ int match_available; /* set if previous match exists */
+ uInt strstart; /* start of string to insert */
+ uInt match_start; /* start of matching string */
+ uInt lookahead; /* number of valid bytes ahead in window */
+
+ uInt prev_length;
+ /* Length of the best match at previous step. Matches not greater than this
+ * are discarded. This is used in the lazy match evaluation.
+ */
+
+ uInt max_chain_length;
+ /* To speed up deflation, hash chains are never searched beyond this
+ * length. A higher limit improves compression ratio but degrades the
+ * speed.
+ */
+
+ uInt max_lazy_match;
+ /* Attempt to find a better match only when the current match is strictly
+ * smaller than this value. This mechanism is used only for compression
+ * levels >= 4.
+ */
+# define max_insert_length max_lazy_match
+ /* Insert new strings in the hash table only if the match length is not
+ * greater than this length. This saves time but degrades compression.
+ * max_insert_length is used only for compression levels <= 3.
+ */
+
+ int level; /* compression level (1..9) */
+ int strategy; /* favor or force Huffman coding*/
+
+ uInt good_match;
+ /* Use a faster search when the previous match is longer than this */
+
+ int nice_match; /* Stop searching when current match exceeds this */
+
+ /* used by trees.c: */
+ /* Didn't use ct_data typedef below to suppress compiler warning */
+ struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */
+ struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
+ struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */
+
+ struct tree_desc_s l_desc; /* desc. for literal tree */
+ struct tree_desc_s d_desc; /* desc. for distance tree */
+ struct tree_desc_s bl_desc; /* desc. for bit length tree */
+
+ ush bl_count[MAX_BITS+1];
+ /* number of codes at each bit length for an optimal tree */
+
+ int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */
+ int heap_len; /* number of elements in the heap */
+ int heap_max; /* element of largest frequency */
+ /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
+ * The same heap array is used to build all trees.
+ */
+
+ uch depth[2*L_CODES+1];
+ /* Depth of each subtree used as tie breaker for trees of equal frequency
+ */
+
+ uch *l_buf; /* buffer for literals or lengths */
+
+ uInt lit_bufsize;
+ /* Size of match buffer for literals/lengths. There are 4 reasons for
+ * limiting lit_bufsize to 64K:
+ * - frequencies can be kept in 16 bit counters
+ * - if compression is not successful for the first block, all input
+ * data is still in the window so we can still emit a stored block even
+ * when input comes from standard input. (This can also be done for
+ * all blocks if lit_bufsize is not greater than 32K.)
+ * - if compression is not successful for a file smaller than 64K, we can
+ * even emit a stored file instead of a stored block (saving 5 bytes).
+ * This is applicable only for zip (not gzip or zlib).
+ * - creating new Huffman trees less frequently may not provide fast
+ * adaptation to changes in the input data statistics. (Take for
+ * example a binary file with poorly compressible code followed by
+ * a highly compressible string table.) Smaller buffer sizes give
+ * fast adaptation but have of course the overhead of transmitting
+ * trees more frequently.
+ * - I can't count above 4
+ */
+
+ uInt last_lit; /* running index in l_buf */
+
+ ush *d_buf;
+ /* Buffer for distances. To simplify the code, d_buf and l_buf have
+ * the same number of elements. To use different lengths, an extra flag
+ * array would be necessary.
+ */
+
+ ulg opt_len; /* bit length of current block with optimal trees */
+ ulg static_len; /* bit length of current block with static trees */
+ ulg compressed_len; /* total bit length of compressed file */
+ uInt matches; /* number of string matches in current block */
+ int last_eob_len; /* bit length of EOB code for last block */
+
+#ifdef DEBUG_ZLIB
+ ulg bits_sent; /* bit length of the compressed data */
+#endif
+
+ ush bi_buf;
+ /* Output bit buffer. Bits are inserted starting at the bottom (least
+ * significant bits).
+ */
+ int bi_valid;
+ /* Number of valid bits in bi_buf. All bits above the last valid bit
+ * are always zero.
+ */
+
+} deflate_state;
+
+typedef struct deflate_workspace {
+ /* State memory for the deflator */
+ deflate_state deflate_memory;
+ Byte *window_memory;
+ Pos *prev_memory;
+ Pos *head_memory;
+ char *overlay_memory;
+} deflate_workspace;
+
+#define zlib_deflate_window_memsize(windowBits) \
+ (2 * (1 << (windowBits)) * sizeof(Byte))
+#define zlib_deflate_prev_memsize(windowBits) \
+ ((1 << (windowBits)) * sizeof(Pos))
+#define zlib_deflate_head_memsize(memLevel) \
+ ((1 << ((memLevel)+7)) * sizeof(Pos))
+#define zlib_deflate_overlay_memsize(memLevel) \
+ ((1 << ((memLevel)+6)) * (sizeof(ush)+2))
+
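+/*
+ * For the common windowBits == 15, memLevel == 8 configuration (and 2-byte
+ * ush/Pos, 1-byte Byte) each of the four macros above evaluates to 64KiB,
+ * so the per-stream buffers add up to roughly 256KiB on top of
+ * sizeof(deflate_state).
+ */
+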
+/* Output a byte on the stream.
+ * IN assertion: there is enough room in pending_buf.
+ */
+#define put_byte(s, c) {s->pending_buf[s->pending++] = (c);}
+
+
+#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
+/* Minimum amount of lookahead, except at the end of the input file.
+ * See deflate.c for comments about the MIN_MATCH+1.
+ */
+
+#define MAX_DIST(s) ((s)->w_size-MIN_LOOKAHEAD)
+/* In order to simplify the code, particularly on 16 bit machines, match
+ * distances are limited to MAX_DIST instead of WSIZE.
+ */
+
+ /* in trees.c */
+void zlib_tr_init (deflate_state *s);
+int zlib_tr_tally (deflate_state *s, unsigned dist, unsigned lc);
+ulg zlib_tr_flush_block (deflate_state *s, char *buf, ulg stored_len,
+ int eof);
+void zlib_tr_align (deflate_state *s);
+void zlib_tr_stored_block (deflate_state *s, char *buf, ulg stored_len,
+ int eof);
+void zlib_tr_stored_type_only (deflate_state *);
+
+
+/* ===========================================================================
+ * Output a short LSB first on the stream.
+ * IN assertion: there is enough room in pendingBuf.
+ */
+#define put_short(s, w) { \
+ put_byte(s, (uch)((w) & 0xff)); \
+ put_byte(s, (uch)((ush)(w) >> 8)); \
+}
+
+/* ===========================================================================
+ * Flush the bit buffer, keeping at most 7 bits in it.
+ */
+static inline void bi_flush(deflate_state *s)
+{
+ if (s->bi_valid == 16) {
+ put_short(s, s->bi_buf);
+ s->bi_buf = 0;
+ s->bi_valid = 0;
+ } else if (s->bi_valid >= 8) {
+ put_byte(s, (Byte)s->bi_buf);
+ s->bi_buf >>= 8;
+ s->bi_valid -= 8;
+ }
+}
+
+/* ===========================================================================
+ * Flush the bit buffer and align the output on a byte boundary
+ */
+static inline void bi_windup(deflate_state *s)
+{
+ if (s->bi_valid > 8) {
+ put_short(s, s->bi_buf);
+ } else if (s->bi_valid > 0) {
+ put_byte(s, (Byte)s->bi_buf);
+ }
+ s->bi_buf = 0;
+ s->bi_valid = 0;
+#ifdef DEBUG_ZLIB
+ s->bits_sent = (s->bits_sent+7) & ~7;
+#endif
+}
+
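+/*
+ * Minimal sketch (not part of this header) of how the LSB-first bit buffer
+ * described by bi_buf/bi_valid is fed; the real send_bits() used by trees.c
+ * does this with a 16-bit accumulator and more careful overflow handling, a
+ * wider one is used here only to keep the sketch short.  demo_put_byte() is
+ * a made-up output helper and value is assumed to fit in length bits:
+ *
+ *     static unsigned demo_bi_buf;
+ *     static int demo_bi_valid;
+ *
+ *     static void demo_send_bits(unsigned value, int length)
+ *     {
+ *         demo_bi_buf |= value << demo_bi_valid;
+ *         demo_bi_valid += length;
+ *         while (demo_bi_valid >= 8) {
+ *             demo_put_byte((unsigned char)demo_bi_buf);
+ *             demo_bi_buf >>= 8;
+ *             demo_bi_valid -= 8;
+ *         }
+ *     }
+ */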
diff --git a/linux/zlib_inflate/inffast.c b/linux/zlib_inflate/inffast.c
new file mode 100644
index 0000000..2c13ecc
--- /dev/null
+++ b/linux/zlib_inflate/inffast.c
@@ -0,0 +1,363 @@
+/* inffast.c -- fast decoding
+ * Copyright (C) 1995-2004 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <linux/zutil.h>
+#include "inftrees.h"
+#include "inflate.h"
+#include "inffast.h"
+
+#ifndef ASMINF
+
+/* Allow machine dependent optimization for post-increment or pre-increment.
+ Based on testing to date,
+ Pre-increment preferred for:
+ - PowerPC G3 (Adler)
+ - MIPS R5000 (Randers-Pehrson)
+ Post-increment preferred for:
+ - none
+ No measurable difference:
+ - Pentium III (Anderson)
+ - M68060 (Nikl)
+ */
+union uu {
+ unsigned short us;
+ unsigned char b[2];
+};
+
+/* Endian independent version */
+static inline unsigned short
+get_unaligned16(const unsigned short *p)
+{
+ union uu mm;
+ unsigned char *b = (unsigned char *)p;
+
+ mm.b[0] = b[0];
+ mm.b[1] = b[1];
+ return mm.us;
+}
+
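+/*
+ * get_unaligned16() reproduces what an aligned native load of *p would give:
+ * the two bytes are copied in memory order into the union, so the host's own
+ * byte order decides how they combine and no alignment is required.  It
+ * backs UP_UNALIGNED() below when efficient unaligned access is not
+ * available.
+ */
+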
+#ifdef POSTINC
+# define OFF 0
+# define PUP(a) *(a)++
+# define UP_UNALIGNED(a) get_unaligned16((a)++)
+#else
+# define OFF 1
+# define PUP(a) *++(a)
+# define UP_UNALIGNED(a) get_unaligned16(++(a))
+#endif
+
+/*
+ Decode literal, length, and distance codes and write out the resulting
+ literal and match bytes until either not enough input or output is
+ available, an end-of-block is encountered, or a data error is encountered.
+ When large enough input and output buffers are supplied to inflate(), for
+ example, a 16K input buffer and a 64K output buffer, more than 95% of the
+ inflate execution time is spent in this routine.
+
+ Entry assumptions:
+
+ state->mode == LEN
+ strm->avail_in >= 6
+ strm->avail_out >= 258
+ start >= strm->avail_out
+ state->bits < 8
+
+ On return, state->mode is one of:
+
+ LEN -- ran out of output space or available input
+ TYPE -- reached end of block code, inflate() to interpret next block
+ BAD -- error in block data
+
+ Notes:
+
+ - The maximum input bits used by a length/distance pair is 15 bits for the
+ length code, 5 bits for the length extra, 15 bits for the distance code,
+ and 13 bits for the distance extra. This totals 48 bits, or six bytes.
+ Therefore if strm->avail_in >= 6, then there is enough input to avoid
+ checking for available input while decoding.
+
+ - The maximum bytes that a single length/distance pair can output is 258
+ bytes, which is the maximum length that can be coded. inflate_fast()
+ requires strm->avail_out >= 258 for each loop to avoid checking for
+ output space.
+
+ - @start: inflate()'s starting value for strm->avail_out
+ */
+void inflate_fast(z_streamp strm, unsigned start)
+{
+ struct inflate_state *state;
+ const unsigned char *in; /* local strm->next_in */
+ const unsigned char *last; /* while in < last, enough input available */
+ unsigned char *out; /* local strm->next_out */
+ unsigned char *beg; /* inflate()'s initial strm->next_out */
+ unsigned char *end; /* while out < end, enough space available */
+#ifdef INFLATE_STRICT
+ unsigned dmax; /* maximum distance from zlib header */
+#endif
+ unsigned wsize; /* window size or zero if not using window */
+ unsigned whave; /* valid bytes in the window */
+ unsigned write; /* window write index */
+ unsigned char *window; /* allocated sliding window, if wsize != 0 */
+ unsigned long hold; /* local strm->hold */
+ unsigned bits; /* local strm->bits */
+ code const *lcode; /* local strm->lencode */
+ code const *dcode; /* local strm->distcode */
+ unsigned lmask; /* mask for first level of length codes */
+ unsigned dmask; /* mask for first level of distance codes */
+ code this; /* retrieved table entry */
+ unsigned op; /* code bits, operation, extra bits, or */
+ /* window position, window bytes to copy */
+ unsigned len; /* match length, unused bytes */
+ unsigned dist; /* match distance */
+ unsigned char *from; /* where to copy match from */
+
+ /* copy state to local variables */
+ state = (struct inflate_state *)strm->state;
+ in = strm->next_in - OFF;
+ last = in + (strm->avail_in - 5);
+ out = strm->next_out - OFF;
+ beg = out - (start - strm->avail_out);
+ end = out + (strm->avail_out - 257);
+#ifdef INFLATE_STRICT
+ dmax = state->dmax;
+#endif
+ wsize = state->wsize;
+ whave = state->whave;
+ write = state->write;
+ window = state->window;
+ hold = state->hold;
+ bits = state->bits;
+ lcode = state->lencode;
+ dcode = state->distcode;
+ lmask = (1U << state->lenbits) - 1;
+ dmask = (1U << state->distbits) - 1;
+
+ /* decode literals and length/distances until end-of-block or not enough
+ input data or output space */
+ do {
+ if (bits < 15) {
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ }
+ this = lcode[hold & lmask];
+ dolen:
+ op = (unsigned)(this.bits);
+ hold >>= op;
+ bits -= op;
+ op = (unsigned)(this.op);
+ if (op == 0) { /* literal */
+ PUP(out) = (unsigned char)(this.val);
+ }
+ else if (op & 16) { /* length base */
+ len = (unsigned)(this.val);
+ op &= 15; /* number of extra bits */
+ if (op) {
+ if (bits < op) {
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ }
+ len += (unsigned)hold & ((1U << op) - 1);
+ hold >>= op;
+ bits -= op;
+ }
+ if (bits < 15) {
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ }
+ this = dcode[hold & dmask];
+ dodist:
+ op = (unsigned)(this.bits);
+ hold >>= op;
+ bits -= op;
+ op = (unsigned)(this.op);
+ if (op & 16) { /* distance base */
+ dist = (unsigned)(this.val);
+ op &= 15; /* number of extra bits */
+ if (bits < op) {
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ if (bits < op) {
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ }
+ }
+ dist += (unsigned)hold & ((1U << op) - 1);
+#ifdef INFLATE_STRICT
+ if (dist > dmax) {
+ strm->msg = (char *)"invalid distance too far back";
+ state->mode = BAD;
+ break;
+ }
+#endif
+ hold >>= op;
+ bits -= op;
+ op = (unsigned)(out - beg); /* max distance in output */
+ if (dist > op) { /* see if copy from window */
+ op = dist - op; /* distance back in window */
+ if (op > whave) {
+ strm->msg = (char *)"invalid distance too far back";
+ state->mode = BAD;
+ break;
+ }
+ from = window - OFF;
+ if (write == 0) { /* very common case */
+ from += wsize - op;
+ if (op < len) { /* some from window */
+ len -= op;
+ do {
+ PUP(out) = PUP(from);
+ } while (--op);
+ from = out - dist; /* rest from output */
+ }
+ }
+ else if (write < op) { /* wrap around window */
+ from += wsize + write - op;
+ op -= write;
+ if (op < len) { /* some from end of window */
+ len -= op;
+ do {
+ PUP(out) = PUP(from);
+ } while (--op);
+ from = window - OFF;
+ if (write < len) { /* some from start of window */
+ op = write;
+ len -= op;
+ do {
+ PUP(out) = PUP(from);
+ } while (--op);
+ from = out - dist; /* rest from output */
+ }
+ }
+ }
+ else { /* contiguous in window */
+ from += write - op;
+ if (op < len) { /* some from window */
+ len -= op;
+ do {
+ PUP(out) = PUP(from);
+ } while (--op);
+ from = out - dist; /* rest from output */
+ }
+ }
+ while (len > 2) {
+ PUP(out) = PUP(from);
+ PUP(out) = PUP(from);
+ PUP(out) = PUP(from);
+ len -= 3;
+ }
+ if (len) {
+ PUP(out) = PUP(from);
+ if (len > 1)
+ PUP(out) = PUP(from);
+ }
+ }
+ else {
+ unsigned short *sout;
+ unsigned long loops;
+
+ from = out - dist; /* copy direct from output */
+ /* minimum length is three */
+ /* Align out addr */
+ if (!((long)(out - 1 + OFF) & 1)) {
+ PUP(out) = PUP(from);
+ len--;
+ }
+ sout = (unsigned short *)(out - OFF);
+ if (dist > 2) {
+ unsigned short *sfrom;
+
+ sfrom = (unsigned short *)(from - OFF);
+ loops = len >> 1;
+ do
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ PUP(sout) = PUP(sfrom);
+#else
+ PUP(sout) = UP_UNALIGNED(sfrom);
+#endif
+ while (--loops);
+ out = (unsigned char *)sout + OFF;
+ from = (unsigned char *)sfrom + OFF;
+ } else { /* dist == 1 or dist == 2 */
+ unsigned short pat16;
+
+ pat16 = *(sout-1+OFF);
+ if (dist == 1) {
+ union uu mm;
+ /* copy one char pattern to both bytes */
+ mm.us = pat16;
+ mm.b[0] = mm.b[1];
+ pat16 = mm.us;
+ }
+ loops = len >> 1;
+ do
+ PUP(sout) = pat16;
+ while (--loops);
+ out = (unsigned char *)sout + OFF;
+ }
+ if (len & 1)
+ PUP(out) = PUP(from);
+ }
+ }
+ else if ((op & 64) == 0) { /* 2nd level distance code */
+ this = dcode[this.val + (hold & ((1U << op) - 1))];
+ goto dodist;
+ }
+ else {
+ strm->msg = (char *)"invalid distance code";
+ state->mode = BAD;
+ break;
+ }
+ }
+ else if ((op & 64) == 0) { /* 2nd level length code */
+ this = lcode[this.val + (hold & ((1U << op) - 1))];
+ goto dolen;
+ }
+ else if (op & 32) { /* end-of-block */
+ state->mode = TYPE;
+ break;
+ }
+ else {
+ strm->msg = (char *)"invalid literal/length code";
+ state->mode = BAD;
+ break;
+ }
+ } while (in < last && out < end);
+
+ /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
+ len = bits >> 3;
+ in -= len;
+ bits -= len << 3;
+ hold &= (1U << bits) - 1;
+
+ /* update state and return */
+ strm->next_in = in + OFF;
+ strm->next_out = out + OFF;
+ strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
+ strm->avail_out = (unsigned)(out < end ?
+ 257 + (end - out) : 257 - (out - end));
+ state->hold = hold;
+ state->bits = bits;
+ return;
+}
+
+/*
+ inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
+ - Using bit fields for code structure
+ - Different op definition to avoid & for extra bits (do & for table bits)
+ - Three separate decoding do-loops for direct, window, and write == 0
+ - Special case for distance > 1 copies to do overlapped load and store copy
+ - Explicit branch predictions (based on measured branch probabilities)
+ - Deferring the match copy and interspersing it with decoding subsequent codes
+ - Swapping literal/length else
+ - Swapping window/direct else
+ - Larger unrolled copy loops (three is about right)
+ - Moving len -= 3 statement into middle of loop
+ */
+
+#endif /* !ASMINF */
diff --git a/linux/zlib_inflate/inffast.h b/linux/zlib_inflate/inffast.h
new file mode 100644
index 0000000..40315d9
--- /dev/null
+++ b/linux/zlib_inflate/inffast.h
@@ -0,0 +1,11 @@
+/* inffast.h -- header to use inffast.c
+ * Copyright (C) 1995-2003 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+ part of the implementation of the compression library and is
+ subject to change. Applications should only use zlib.h.
+ */
+
+void inflate_fast (z_streamp strm, unsigned start);
diff --git a/linux/zlib_inflate/inffixed.h b/linux/zlib_inflate/inffixed.h
new file mode 100644
index 0000000..75ed4b5
--- /dev/null
+++ b/linux/zlib_inflate/inffixed.h
@@ -0,0 +1,94 @@
+ /* inffixed.h -- table for decoding fixed codes
+ * Generated automatically by makefixed().
+ */
+
+ /* WARNING: this file should *not* be used by applications. It
+ is part of the implementation of the compression library and
+ is subject to change. Applications should only use zlib.h.
+ */
+
+ static const code lenfix[512] = {
+ {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48},
+ {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128},
+ {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59},
+ {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176},
+ {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20},
+ {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100},
+ {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8},
+ {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216},
+ {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76},
+ {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114},
+ {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2},
+ {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148},
+ {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42},
+ {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86},
+ {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15},
+ {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236},
+ {16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62},
+ {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142},
+ {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31},
+ {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162},
+ {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25},
+ {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105},
+ {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4},
+ {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202},
+ {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69},
+ {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125},
+ {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13},
+ {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195},
+ {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35},
+ {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91},
+ {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19},
+ {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246},
+ {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55},
+ {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135},
+ {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99},
+ {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190},
+ {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16},
+ {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96},
+ {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6},
+ {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209},
+ {17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72},
+ {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116},
+ {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4},
+ {0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153},
+ {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44},
+ {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82},
+ {0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11},
+ {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229},
+ {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58},
+ {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138},
+ {0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51},
+ {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173},
+ {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30},
+ {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110},
+ {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0},
+ {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195},
+ {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65},
+ {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121},
+ {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9},
+ {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258},
+ {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37},
+ {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93},
+ {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23},
+ {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251},
+ {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51},
+ {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131},
+ {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67},
+ {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183},
+ {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23},
+ {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103},
+ {0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9},
+ {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223},
+ {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79},
+ {0,9,255}
+ };
+
+ static const code distfix[32] = {
+ {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025},
+ {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193},
+ {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385},
+ {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577},
+ {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073},
+ {22,5,193},{64,5,0}
+ };
diff --git a/linux/zlib_inflate/inflate.c b/linux/zlib_inflate/inflate.c
new file mode 100644
index 0000000..58a733b
--- /dev/null
+++ b/linux/zlib_inflate/inflate.c
@@ -0,0 +1,786 @@
+/* inflate.c -- zlib decompression
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Based on zlib 1.2.3 but modified for the Linux Kernel by
+ * Richard Purdie <richard@openedhand.com>
+ *
+ * Changes mainly for static instead of dynamic memory allocation
+ *
+ */
+
+#include <linux/zutil.h>
+#include "inftrees.h"
+#include "inflate.h"
+#include "inffast.h"
+#include "infutil.h"
+
+int zlib_inflate_workspacesize(void)
+{
+ return sizeof(struct inflate_workspace);
+}
+
+int zlib_inflateReset(z_streamp strm)
+{
+ struct inflate_state *state;
+
+ if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state *)strm->state;
+ strm->total_in = strm->total_out = state->total = 0;
+ strm->msg = NULL;
+ strm->adler = 1; /* to support ill-conceived Java test suite */
+ state->mode = HEAD;
+ state->last = 0;
+ state->havedict = 0;
+ state->dmax = 32768U;
+ state->hold = 0;
+ state->bits = 0;
+ state->lencode = state->distcode = state->next = state->codes;
+
+ /* Initialise Window */
+ state->wsize = 1U << state->wbits;
+ state->write = 0;
+ state->whave = 0;
+
+ return Z_OK;
+}
+
+int zlib_inflateInit2(z_streamp strm, int windowBits)
+{
+ struct inflate_state *state;
+
+ if (strm == NULL) return Z_STREAM_ERROR;
+ strm->msg = NULL; /* in case we return an error */
+
+ state = &WS(strm)->inflate_state;
+ strm->state = (struct internal_state *)state;
+
+ if (windowBits < 0) {
+ state->wrap = 0;
+ windowBits = -windowBits;
+ }
+ else {
+ state->wrap = (windowBits >> 4) + 1;
+ }
+ if (windowBits < 8 || windowBits > 15) {
+ return Z_STREAM_ERROR;
+ }
+ state->wbits = (unsigned)windowBits;
+ state->window = &WS(strm)->working_window[0];
+
+ return zlib_inflateReset(strm);
+}
+
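+/*
+ * Sketch of the calling convention for this workspace-based port (simplified;
+ * error checking and the choice of allocator are left to the caller, and the
+ * buffer names here are illustrative):
+ *
+ *     z_stream strm;
+ *
+ *     memset(&strm, 0, sizeof(strm));
+ *     strm.workspace = malloc(zlib_inflate_workspacesize());
+ *     zlib_inflateInit2(&strm, MAX_WBITS);
+ *     strm.next_in  = src;   strm.avail_in  = src_len;
+ *     strm.next_out = dst;   strm.avail_out = dst_len;
+ *     err = zlib_inflate(&strm, Z_FINISH);
+ *     zlib_inflateEnd(&strm);
+ */
+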
+/*
+ Return state with length and distance decoding tables and index sizes set to
+ fixed code decoding. This returns fixed tables from inffixed.h.
+ */
+static void zlib_fixedtables(struct inflate_state *state)
+{
+# include "inffixed.h"
+ state->lencode = lenfix;
+ state->lenbits = 9;
+ state->distcode = distfix;
+ state->distbits = 5;
+}
+
+
+/*
+ Update the window with the last wsize (normally 32K) bytes written before
+ returning. This is only called when a window is already in use, or when
+ output has been written during this inflate call, but the end of the deflate
+ stream has not been reached yet. It is also called to window dictionary data
+ when a dictionary is loaded.
+
+ Providing output buffers larger than 32K to inflate() should provide a speed
+ advantage, since only the last 32K of output is copied to the sliding window
+ upon return from inflate(), and since all distances after the first 32K of
+ output will fall in the output data, making match copies simpler and faster.
+ The advantage may be dependent on the size of the processor's data caches.
+ */
+static void zlib_updatewindow(z_streamp strm, unsigned out)
+{
+ struct inflate_state *state;
+ unsigned copy, dist;
+
+ state = (struct inflate_state *)strm->state;
+
+ /* copy state->wsize or less output bytes into the circular window */
+ copy = out - strm->avail_out;
+ if (copy >= state->wsize) {
+ memcpy(state->window, strm->next_out - state->wsize, state->wsize);
+ state->write = 0;
+ state->whave = state->wsize;
+ }
+ else {
+ dist = state->wsize - state->write;
+ if (dist > copy) dist = copy;
+ memcpy(state->window + state->write, strm->next_out - copy, dist);
+ copy -= dist;
+ if (copy) {
+ memcpy(state->window, strm->next_out - copy, copy);
+ state->write = copy;
+ state->whave = state->wsize;
+ }
+ else {
+ state->write += dist;
+ if (state->write == state->wsize) state->write = 0;
+ if (state->whave < state->wsize) state->whave += dist;
+ }
+ }
+}
+
+
+/*
+ * At the end of a Deflate-compressed PPP packet, we expect to have seen
+ * a `stored' block type value but not the (zero) length bytes.
+ */
+/*
+ Returns true if inflate is currently at the end of a block generated by
+ Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP
+ implementation to provide an additional safety check. PPP uses
+ Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored
+ block. When decompressing, PPP checks that at the end of input packet,
+ inflate is waiting for these length bytes.
+ */
+static int zlib_inflateSyncPacket(z_streamp strm)
+{
+ struct inflate_state *state;
+
+ if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state *)strm->state;
+
+ if (state->mode == STORED && state->bits == 0) {
+ state->mode = TYPE;
+ return Z_OK;
+ }
+ return Z_DATA_ERROR;
+}
+
+/* Macros for inflate(): */
+
+/* check function to use adler32() for zlib or crc32() for gzip */
+#define UPDATE(check, buf, len) zlib_adler32(check, buf, len)
+
+/* Load registers with state in inflate() for speed */
+#define LOAD() \
+ do { \
+ put = strm->next_out; \
+ left = strm->avail_out; \
+ next = strm->next_in; \
+ have = strm->avail_in; \
+ hold = state->hold; \
+ bits = state->bits; \
+ } while (0)
+
+/* Restore state from registers in inflate() */
+#define RESTORE() \
+ do { \
+ strm->next_out = put; \
+ strm->avail_out = left; \
+ strm->next_in = next; \
+ strm->avail_in = have; \
+ state->hold = hold; \
+ state->bits = bits; \
+ } while (0)
+
+/* Clear the input bit accumulator */
+#define INITBITS() \
+ do { \
+ hold = 0; \
+ bits = 0; \
+ } while (0)
+
+/* Get a byte of input into the bit accumulator, or return from inflate()
+ if there is no input available. */
+#define PULLBYTE() \
+ do { \
+ if (have == 0) goto inf_leave; \
+ have--; \
+ hold += (unsigned long)(*next++) << bits; \
+ bits += 8; \
+ } while (0)
+
+/* Assure that there are at least n bits in the bit accumulator. If there is
+ not enough available input to do that, then return from inflate(). */
+#define NEEDBITS(n) \
+ do { \
+ while (bits < (unsigned)(n)) \
+ PULLBYTE(); \
+ } while (0)
+
+/* Return the low n bits of the bit accumulator (n < 16) */
+#define BITS(n) \
+ ((unsigned)hold & ((1U << (n)) - 1))
+
+/* Remove n bits from the bit accumulator */
+#define DROPBITS(n) \
+ do { \
+ hold >>= (n); \
+ bits -= (unsigned)(n); \
+ } while (0)
+
+/* Remove zero to seven bits as needed to go to a byte boundary */
+#define BYTEBITS() \
+ do { \
+ hold >>= bits & 7; \
+ bits -= bits & 7; \
+ } while (0)
+
+/* Reverse the bytes in a 32-bit value */
+#define REVERSE(q) \
+ ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
+ (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
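+/* For example REVERSE(0x12345678UL) == 0x78563412UL; it is used below to
+ * bring the big-endian DICTID word of the zlib header into host order. */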
+
+/*
+ inflate() uses a state machine to process as much input data and generate as
+ much output data as possible before returning. The state machine is
+ structured roughly as follows:
+
+ for (;;) switch (state) {
+ ...
+ case STATEn:
+ if (not enough input data or output space to make progress)
+ return;
+ ... make progress ...
+ state = STATEm;
+ break;
+ ...
+ }
+
+ so when inflate() is called again, the same case is attempted again, and
+ if the appropriate resources are provided, the machine proceeds to the
+ next state. The NEEDBITS() macro is usually the way the state evaluates
+ whether it can proceed or should return. NEEDBITS() does the return if
+ the requested bits are not available. The typical use of the BITS macros
+ is:
+
+ NEEDBITS(n);
+ ... do something with BITS(n) ...
+ DROPBITS(n);
+
+ where NEEDBITS(n) either returns from inflate() if there isn't enough
+ input left to load n bits into the accumulator, or it continues. BITS(n)
+ gives the low n bits in the accumulator. When done, DROPBITS(n) drops
+ the low n bits off the accumulator. INITBITS() clears the accumulator
+ and sets the number of available bits to zero. BYTEBITS() discards just
+ enough bits to put the accumulator on a byte boundary. After BYTEBITS()
+ and a NEEDBITS(8), then BITS(8) would return the next byte in the stream.
+
+ NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return
+ if there is no input available. The decoding of variable length codes uses
+ PULLBYTE() directly in order to pull just enough bytes to decode the next
+ code, and no more.
+
+ Some states loop until they get enough input, making sure that enough
+ state information is maintained to continue the loop where it left off
+ if NEEDBITS() returns in the loop. For example, want, need, and keep
+ would all have to actually be part of the saved state in case NEEDBITS()
+ returns:
+
+ case STATEw:
+ while (want < need) {
+ NEEDBITS(n);
+ keep[want++] = BITS(n);
+ DROPBITS(n);
+ }
+ state = STATEx;
+ case STATEx:
+
+ As shown above, if the next state is also the next case, then the break
+ is omitted.
+
+ A state may also return if there is not enough output space available to
+ complete that state. Those states are copying stored data, writing a
+ literal byte, and copying a matching string.
+
+ When returning, a "goto inf_leave" is used to update the total counters,
+ update the check value, and determine whether any progress has been made
+ during that inflate() call in order to return the proper return code.
+ Progress is defined as a change in either strm->avail_in or strm->avail_out.
+ When there is a window, goto inf_leave will update the window with the last
+ output written. If a goto inf_leave occurs in the middle of decompression
+ and there is no window currently, goto inf_leave will create one and copy
+ output to the window for the next call of inflate().
+
+ In this implementation, the flush parameter of inflate() only affects the
+ return code (per zlib.h). inflate() always writes as much as possible to
+ strm->next_out, given the space available and the provided input--the effect
+ documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers
+ the allocation of and copying into a sliding window until necessary, which
+ provides the effect documented in zlib.h for Z_FINISH when the entire input
+ stream is available. So the only thing the flush parameter actually does is:
+ when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it
+ will return Z_BUF_ERROR if it has not reached the end of the stream.
+ */
+
+int zlib_inflate(z_streamp strm, int flush)
+{
+ struct inflate_state *state;
+ const unsigned char *next; /* next input */
+ unsigned char *put; /* next output */
+ unsigned have, left; /* available input and output */
+ unsigned long hold; /* bit buffer */
+ unsigned bits; /* bits in bit buffer */
+ unsigned in, out; /* save starting available input and output */
+ unsigned copy; /* number of stored or match bytes to copy */
+ unsigned char *from; /* where to copy match bytes from */
+ code this; /* current decoding table entry */
+ code last; /* parent table entry */
+ unsigned len; /* length to copy for repeats, bits to drop */
+ int ret; /* return code */
+ static const unsigned short order[19] = /* permutation of code lengths */
+ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+ /* Do not check for strm->next_out == NULL here as ppc zImage
+ inflates to strm->next_out = 0 */
+
+ if (strm == NULL || strm->state == NULL ||
+ (strm->next_in == NULL && strm->avail_in != 0))
+ return Z_STREAM_ERROR;
+
+ state = (struct inflate_state *)strm->state;
+
+ if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */
+ LOAD();
+ in = have;
+ out = left;
+ ret = Z_OK;
+ for (;;)
+ switch (state->mode) {
+ case HEAD:
+ if (state->wrap == 0) {
+ state->mode = TYPEDO;
+ break;
+ }
+ NEEDBITS(16);
+ if (((BITS(8) << 8) + (hold >> 8)) % 31) {
+ strm->msg = (char *)"incorrect header check";
+ state->mode = BAD;
+ break;
+ }
+ if (BITS(4) != Z_DEFLATED) {
+ strm->msg = (char *)"unknown compression method";
+ state->mode = BAD;
+ break;
+ }
+ DROPBITS(4);
+ len = BITS(4) + 8;
+ if (len > state->wbits) {
+ strm->msg = (char *)"invalid window size";
+ state->mode = BAD;
+ break;
+ }
+ state->dmax = 1U << len;
+ strm->adler = state->check = zlib_adler32(0L, NULL, 0);
+ state->mode = hold & 0x200 ? DICTID : TYPE;
+ INITBITS();
+ break;
+ case DICTID:
+ NEEDBITS(32);
+ strm->adler = state->check = REVERSE(hold);
+ INITBITS();
+ state->mode = DICT;
+ case DICT:
+ if (state->havedict == 0) {
+ RESTORE();
+ return Z_NEED_DICT;
+ }
+ strm->adler = state->check = zlib_adler32(0L, NULL, 0);
+ state->mode = TYPE;
+ case TYPE:
+ if (flush == Z_BLOCK) goto inf_leave;
+ case TYPEDO:
+ if (state->last) {
+ BYTEBITS();
+ state->mode = CHECK;
+ break;
+ }
+ NEEDBITS(3);
+ state->last = BITS(1);
+ DROPBITS(1);
+ switch (BITS(2)) {
+ case 0: /* stored block */
+ state->mode = STORED;
+ break;
+ case 1: /* fixed block */
+ zlib_fixedtables(state);
+ state->mode = LEN; /* decode codes */
+ break;
+ case 2: /* dynamic block */
+ state->mode = TABLE;
+ break;
+ case 3:
+ strm->msg = (char *)"invalid block type";
+ state->mode = BAD;
+ }
+ DROPBITS(2);
+ break;
+ case STORED:
+ BYTEBITS(); /* go to byte boundary */
+ NEEDBITS(32);
+ if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) {
+ strm->msg = (char *)"invalid stored block lengths";
+ state->mode = BAD;
+ break;
+ }
+ state->length = (unsigned)hold & 0xffff;
+ INITBITS();
+ state->mode = COPY;
+ case COPY:
+ copy = state->length;
+ if (copy) {
+ if (copy > have) copy = have;
+ if (copy > left) copy = left;
+ if (copy == 0) goto inf_leave;
+ memcpy(put, next, copy);
+ have -= copy;
+ next += copy;
+ left -= copy;
+ put += copy;
+ state->length -= copy;
+ break;
+ }
+ state->mode = TYPE;
+ break;
+ case TABLE:
+ NEEDBITS(14);
+ state->nlen = BITS(5) + 257;
+ DROPBITS(5);
+ state->ndist = BITS(5) + 1;
+ DROPBITS(5);
+ state->ncode = BITS(4) + 4;
+ DROPBITS(4);
+#ifndef PKZIP_BUG_WORKAROUND
+ if (state->nlen > 286 || state->ndist > 30) {
+ strm->msg = (char *)"too many length or distance symbols";
+ state->mode = BAD;
+ break;
+ }
+#endif
+ state->have = 0;
+ state->mode = LENLENS;
+ case LENLENS:
+ while (state->have < state->ncode) {
+ NEEDBITS(3);
+ state->lens[order[state->have++]] = (unsigned short)BITS(3);
+ DROPBITS(3);
+ }
+ while (state->have < 19)
+ state->lens[order[state->have++]] = 0;
+ state->next = state->codes;
+ state->lencode = (code const *)(state->next);
+ state->lenbits = 7;
+ ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next),
+ &(state->lenbits), state->work);
+ if (ret) {
+ strm->msg = (char *)"invalid code lengths set";
+ state->mode = BAD;
+ break;
+ }
+ state->have = 0;
+ state->mode = CODELENS;
+ case CODELENS:
+ while (state->have < state->nlen + state->ndist) {
+ for (;;) {
+ this = state->lencode[BITS(state->lenbits)];
+ if ((unsigned)(this.bits) <= bits) break;
+ PULLBYTE();
+ }
+ if (this.val < 16) {
+ NEEDBITS(this.bits);
+ DROPBITS(this.bits);
+ state->lens[state->have++] = this.val;
+ }
+ else {
+ if (this.val == 16) {
+ NEEDBITS(this.bits + 2);
+ DROPBITS(this.bits);
+ if (state->have == 0) {
+ strm->msg = (char *)"invalid bit length repeat";
+ state->mode = BAD;
+ break;
+ }
+ len = state->lens[state->have - 1];
+ copy = 3 + BITS(2);
+ DROPBITS(2);
+ }
+ else if (this.val == 17) {
+ NEEDBITS(this.bits + 3);
+ DROPBITS(this.bits);
+ len = 0;
+ copy = 3 + BITS(3);
+ DROPBITS(3);
+ }
+ else {
+ NEEDBITS(this.bits + 7);
+ DROPBITS(this.bits);
+ len = 0;
+ copy = 11 + BITS(7);
+ DROPBITS(7);
+ }
+ if (state->have + copy > state->nlen + state->ndist) {
+ strm->msg = (char *)"invalid bit length repeat";
+ state->mode = BAD;
+ break;
+ }
+ while (copy--)
+ state->lens[state->have++] = (unsigned short)len;
+ }
+ }
+
+ /* handle error breaks in while */
+ if (state->mode == BAD) break;
+
+ /* build code tables */
+ state->next = state->codes;
+ state->lencode = (code const *)(state->next);
+ state->lenbits = 9;
+ ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next),
+ &(state->lenbits), state->work);
+ if (ret) {
+ strm->msg = (char *)"invalid literal/lengths set";
+ state->mode = BAD;
+ break;
+ }
+ state->distcode = (code const *)(state->next);
+ state->distbits = 6;
+ ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist,
+ &(state->next), &(state->distbits), state->work);
+ if (ret) {
+ strm->msg = (char *)"invalid distances set";
+ state->mode = BAD;
+ break;
+ }
+ state->mode = LEN;
+ case LEN:
+ if (have >= 6 && left >= 258) {
+ RESTORE();
+ inflate_fast(strm, out);
+ LOAD();
+ break;
+ }
+ for (;;) {
+ this = state->lencode[BITS(state->lenbits)];
+ if ((unsigned)(this.bits) <= bits) break;
+ PULLBYTE();
+ }
+ if (this.op && (this.op & 0xf0) == 0) {
+ last = this;
+ for (;;) {
+ this = state->lencode[last.val +
+ (BITS(last.bits + last.op) >> last.bits)];
+ if ((unsigned)(last.bits + this.bits) <= bits) break;
+ PULLBYTE();
+ }
+ DROPBITS(last.bits);
+ }
+ DROPBITS(this.bits);
+ state->length = (unsigned)this.val;
+ if ((int)(this.op) == 0) {
+ state->mode = LIT;
+ break;
+ }
+ if (this.op & 32) {
+ state->mode = TYPE;
+ break;
+ }
+ if (this.op & 64) {
+ strm->msg = (char *)"invalid literal/length code";
+ state->mode = BAD;
+ break;
+ }
+ state->extra = (unsigned)(this.op) & 15;
+ state->mode = LENEXT;
+ case LENEXT:
+ if (state->extra) {
+ NEEDBITS(state->extra);
+ state->length += BITS(state->extra);
+ DROPBITS(state->extra);
+ }
+ state->mode = DIST;
+ case DIST:
+ for (;;) {
+ this = state->distcode[BITS(state->distbits)];
+ if ((unsigned)(this.bits) <= bits) break;
+ PULLBYTE();
+ }
+ if ((this.op & 0xf0) == 0) {
+ last = this;
+ for (;;) {
+ this = state->distcode[last.val +
+ (BITS(last.bits + last.op) >> last.bits)];
+ if ((unsigned)(last.bits + this.bits) <= bits) break;
+ PULLBYTE();
+ }
+ DROPBITS(last.bits);
+ }
+ DROPBITS(this.bits);
+ if (this.op & 64) {
+ strm->msg = (char *)"invalid distance code";
+ state->mode = BAD;
+ break;
+ }
+ state->offset = (unsigned)this.val;
+ state->extra = (unsigned)(this.op) & 15;
+ state->mode = DISTEXT;
+ case DISTEXT:
+ if (state->extra) {
+ NEEDBITS(state->extra);
+ state->offset += BITS(state->extra);
+ DROPBITS(state->extra);
+ }
+#ifdef INFLATE_STRICT
+ if (state->offset > state->dmax) {
+ strm->msg = (char *)"invalid distance too far back";
+ state->mode = BAD;
+ break;
+ }
+#endif
+ if (state->offset > state->whave + out - left) {
+ strm->msg = (char *)"invalid distance too far back";
+ state->mode = BAD;
+ break;
+ }
+ state->mode = MATCH;
+ case MATCH:
+ if (left == 0) goto inf_leave;
+ copy = out - left;
+ if (state->offset > copy) { /* copy from window */
+ copy = state->offset - copy;
+ if (copy > state->write) {
+ copy -= state->write;
+ from = state->window + (state->wsize - copy);
+ }
+ else
+ from = state->window + (state->write - copy);
+ if (copy > state->length) copy = state->length;
+ }
+ else { /* copy from output */
+ from = put - state->offset;
+ copy = state->length;
+ }
+ if (copy > left) copy = left;
+ left -= copy;
+ state->length -= copy;
+ do {
+ *put++ = *from++;
+ } while (--copy);
+ if (state->length == 0) state->mode = LEN;
+ break;
+ case LIT:
+ if (left == 0) goto inf_leave;
+ *put++ = (unsigned char)(state->length);
+ left--;
+ state->mode = LEN;
+ break;
+ case CHECK:
+ if (state->wrap) {
+ NEEDBITS(32);
+ out -= left;
+ strm->total_out += out;
+ state->total += out;
+ if (out)
+ strm->adler = state->check =
+ UPDATE(state->check, put - out, out);
+ out = left;
+ if (REVERSE(hold) != state->check) {
+ strm->msg = (char *)"incorrect data check";
+ state->mode = BAD;
+ break;
+ }
+ INITBITS();
+ }
+ state->mode = DONE;
+ case DONE:
+ ret = Z_STREAM_END;
+ goto inf_leave;
+ case BAD:
+ ret = Z_DATA_ERROR;
+ goto inf_leave;
+ case MEM:
+ return Z_MEM_ERROR;
+ case SYNC:
+ default:
+ return Z_STREAM_ERROR;
+ }
+
+ /*
+ Return from inflate(), updating the total counts and the check value.
+ If there was no progress during the inflate() call, return a buffer
+ error. Call zlib_updatewindow() to create and/or update the window state.
+ */
+ inf_leave:
+ RESTORE();
+ if (state->wsize || (state->mode < CHECK && out != strm->avail_out))
+ zlib_updatewindow(strm, out);
+
+ in -= strm->avail_in;
+ out -= strm->avail_out;
+ strm->total_in += in;
+ strm->total_out += out;
+ state->total += out;
+ if (state->wrap && out)
+ strm->adler = state->check =
+ UPDATE(state->check, strm->next_out - out, out);
+
+ strm->data_type = state->bits + (state->last ? 64 : 0) +
+ (state->mode == TYPE ? 128 : 0);
+
+ if (flush == Z_PACKET_FLUSH && ret == Z_OK &&
+ strm->avail_out != 0 && strm->avail_in == 0)
+ return zlib_inflateSyncPacket(strm);
+
+ if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK)
+ ret = Z_BUF_ERROR;
+
+ return ret;
+}
+
+int zlib_inflateEnd(z_streamp strm)
+{
+ if (strm == NULL || strm->state == NULL)
+ return Z_STREAM_ERROR;
+ return Z_OK;
+}
+
+/*
+ * This subroutine adds the data at next_in/avail_in to the output history
+ * without performing any output. The output buffer must be "caught up";
+ * i.e. no pending output but this should always be the case. The state must
+ * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit,
+ * the output will also be caught up, and the checksum will have been updated
+ * if need be.
+ */
+int zlib_inflateIncomp(z_stream *z)
+{
+ struct inflate_state *state = (struct inflate_state *)z->state;
+ Byte *saved_no = z->next_out;
+ uInt saved_ao = z->avail_out;
+
+ if (state->mode != TYPE && state->mode != HEAD)
+ return Z_DATA_ERROR;
+
+ /* Set up some variables to allow misuse of zlib_updatewindow() */
+ z->avail_out = 0;
+ z->next_out = (unsigned char*)z->next_in + z->avail_in;
+
+ zlib_updatewindow(z, z->avail_in);
+
+ /* Restore saved variables */
+ z->avail_out = saved_ao;
+ z->next_out = saved_no;
+
+ z->adler = state->check =
+ UPDATE(state->check, z->next_in, z->avail_in);
+
+ z->total_out += z->avail_in;
+ z->total_in += z->avail_in;
+ z->next_in += z->avail_in;
+ state->total += z->avail_in;
+ z->avail_in = 0;
+
+ return Z_OK;
+}
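
The long comment above zlib_inflate() describes a resumable state machine: each call makes whatever progress the supplied buffers allow, returns, and the next call resumes at the same state. A minimal illustrative sketch of that calling convention, not taken from this patch (the function name, the 4K chunking and the kernel-shim allocation are assumptions; only APIs that appear in this patch or in the kernel zlib header are used):

    #include <linux/errno.h>
    #include <linux/slab.h>
    #include <linux/string.h>
    #include <linux/zlib.h>

    /* Inflate a zlib-wrapped buffer, feeding the input in 4K pieces. */
    static int inflate_in_pieces(const u8 *src, size_t srclen,
                                 u8 *dst, size_t dstlen)
    {
            struct z_stream_s strm;
            int ret;

            memset(&strm, 0, sizeof(strm));
            strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
            if (!strm.workspace)
                    return -ENOMEM;

            ret = zlib_inflateInit2(&strm, MAX_WBITS); /* positive wbits: zlib header */
            if (ret != Z_OK)
                    goto out;

            strm.next_out  = dst;
            strm.avail_out = dstlen;

            while (srclen && ret != Z_STREAM_END) {
                    size_t chunk = srclen < 4096 ? srclen : 4096;

                    strm.next_in  = src;
                    strm.avail_in = chunk;

                    /* Z_NO_FLUSH: consume what it can, return when input or output runs out */
                    ret = zlib_inflate(&strm, Z_NO_FLUSH);
                    if (ret != Z_OK && ret != Z_STREAM_END)
                            break;

                    src    += chunk - strm.avail_in;
                    srclen -= chunk - strm.avail_in;
            }

            ret = ret == Z_STREAM_END ? (int) strm.total_out : -EINVAL;
            zlib_inflateEnd(&strm);
    out:
            kfree(strm.workspace);
            return ret;
    }

With flush set to Z_FINISH instead, a call that made no progress would come back as Z_BUF_ERROR rather than Z_OK, exactly as the comment above spells out.
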
diff --git a/linux/zlib_inflate/inflate.h b/linux/zlib_inflate/inflate.h
new file mode 100644
index 0000000..3d17b3d
--- /dev/null
+++ b/linux/zlib_inflate/inflate.h
@@ -0,0 +1,111 @@
+#ifndef INFLATE_H
+#define INFLATE_H
+
+/* inflate.h -- internal inflate state definition
+ * Copyright (C) 1995-2004 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+ part of the implementation of the compression library and is
+ subject to change. Applications should only use zlib.h.
+ */
+
+/* Possible inflate modes between inflate() calls */
+typedef enum {
+ HEAD, /* i: waiting for magic header */
+ FLAGS, /* i: waiting for method and flags (gzip) */
+ TIME, /* i: waiting for modification time (gzip) */
+ OS, /* i: waiting for extra flags and operating system (gzip) */
+ EXLEN, /* i: waiting for extra length (gzip) */
+ EXTRA, /* i: waiting for extra bytes (gzip) */
+ NAME, /* i: waiting for end of file name (gzip) */
+ COMMENT, /* i: waiting for end of comment (gzip) */
+ HCRC, /* i: waiting for header crc (gzip) */
+ DICTID, /* i: waiting for dictionary check value */
+ DICT, /* waiting for inflateSetDictionary() call */
+ TYPE, /* i: waiting for type bits, including last-flag bit */
+ TYPEDO, /* i: same, but skip check to exit inflate on new block */
+ STORED, /* i: waiting for stored size (length and complement) */
+ COPY, /* i/o: waiting for input or output to copy stored block */
+ TABLE, /* i: waiting for dynamic block table lengths */
+ LENLENS, /* i: waiting for code length code lengths */
+ CODELENS, /* i: waiting for length/lit and distance code lengths */
+ LEN, /* i: waiting for length/lit code */
+ LENEXT, /* i: waiting for length extra bits */
+ DIST, /* i: waiting for distance code */
+ DISTEXT, /* i: waiting for distance extra bits */
+ MATCH, /* o: waiting for output space to copy string */
+ LIT, /* o: waiting for output space to write literal */
+ CHECK, /* i: waiting for 32-bit check value */
+ LENGTH, /* i: waiting for 32-bit length (gzip) */
+ DONE, /* finished check, done -- remain here until reset */
+ BAD, /* got a data error -- remain here until reset */
+ MEM, /* got an inflate() memory error -- remain here until reset */
+ SYNC /* looking for synchronization bytes to restart inflate() */
+} inflate_mode;
+
+/*
+ State transitions between above modes -
+
+ (most modes can go to the BAD or MEM mode -- not shown for clarity)
+
+ Process header:
+ HEAD -> (gzip) or (zlib)
+ (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME
+ NAME -> COMMENT -> HCRC -> TYPE
+ (zlib) -> DICTID or TYPE
+ DICTID -> DICT -> TYPE
+ Read deflate blocks:
+ TYPE -> STORED or TABLE or LEN or CHECK
+ STORED -> COPY -> TYPE
+ TABLE -> LENLENS -> CODELENS -> LEN
+ Read deflate codes:
+ LEN -> LENEXT or LIT or TYPE
+ LENEXT -> DIST -> DISTEXT -> MATCH -> LEN
+ LIT -> LEN
+ Process trailer:
+ CHECK -> LENGTH -> DONE
+ */
+
+/* state maintained between inflate() calls. Approximately 7K bytes. */
+struct inflate_state {
+ inflate_mode mode; /* current inflate mode */
+ int last; /* true if processing last block */
+ int wrap; /* bit 0 true for zlib, bit 1 true for gzip */
+ int havedict; /* true if dictionary provided */
+ int flags; /* gzip header method and flags (0 if zlib) */
+ unsigned dmax; /* zlib header max distance (INFLATE_STRICT) */
+ unsigned long check; /* protected copy of check value */
+ unsigned long total; /* protected copy of output count */
+ /* gz_headerp head; */ /* where to save gzip header information */
+ /* sliding window */
+ unsigned wbits; /* log base 2 of requested window size */
+ unsigned wsize; /* window size or zero if not using window */
+ unsigned whave; /* valid bytes in the window */
+ unsigned write; /* window write index */
+ unsigned char *window; /* allocated sliding window, if needed */
+ /* bit accumulator */
+ unsigned long hold; /* input bit accumulator */
+ unsigned bits; /* number of valid bits in "hold" */
+ /* for string and stored block copying */
+ unsigned length; /* literal or length of data to copy */
+ unsigned offset; /* distance back to copy string from */
+ /* for table and code decoding */
+ unsigned extra; /* extra bits needed */
+ /* fixed and dynamic code tables */
+ code const *lencode; /* starting table for length/literal codes */
+ code const *distcode; /* starting table for distance codes */
+ unsigned lenbits; /* index bits for lencode */
+ unsigned distbits; /* index bits for distcode */
+ /* dynamic table building */
+ unsigned ncode; /* number of code length code lengths */
+ unsigned nlen; /* number of length code lengths */
+ unsigned ndist; /* number of distance code lengths */
+ unsigned have; /* number of code lengths in lens[] */
+ code *next; /* next available space in codes[] */
+ unsigned short lens[320]; /* temporary storage for code lengths */
+ unsigned short work[288]; /* work area for code table building */
+ code codes[ENOUGH]; /* space for code tables */
+};
+#endif
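
The hold/bits pair above is the bit accumulator that the NEEDBITS()/BITS()/DROPBITS()/PULLBYTE() macros in inflate.c operate on. A standalone sketch of that pattern, not from this patch (the names are hypothetical, and the real macros also save/restore the stream state and return from inflate() instead of reporting an error):

    struct bitreader {
            const unsigned char *next;  /* next input byte */
            unsigned avail;             /* input bytes left */
            unsigned long hold;         /* bit accumulator, low bits first */
            unsigned bits;              /* number of valid bits in hold */
    };

    /* PULLBYTE(): append one input byte above the bits already held */
    static int pull_byte(struct bitreader *br)
    {
            if (br->avail == 0)
                    return -1;          /* inflate() would "goto inf_leave" here */
            br->avail--;
            br->hold += (unsigned long) (*br->next++) << br->bits;
            br->bits += 8;
            return 0;
    }

    /* NEEDBITS(n) / BITS(n) / DROPBITS(n) rolled into one helper, n < 32 */
    static int get_bits(struct bitreader *br, unsigned n, unsigned *out)
    {
            while (br->bits < n)
                    if (pull_byte(br))
                            return -1;  /* not enough input: caller must retry later */
            *out = (unsigned) br->hold & ((1U << n) - 1); /* BITS(n): low n bits */
            br->hold >>= n;                               /* DROPBITS(n) */
            br->bits -= n;
            return 0;
    }
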
diff --git a/linux/zlib_inflate/inftrees.c b/linux/zlib_inflate/inftrees.c
new file mode 100644
index 0000000..3fe6ce5
--- /dev/null
+++ b/linux/zlib_inflate/inftrees.c
@@ -0,0 +1,315 @@
+/* inftrees.c -- generate Huffman trees for efficient decoding
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <linux/zutil.h>
+#include "inftrees.h"
+
+#define MAXBITS 15
+
+/*
+ Build a set of tables to decode the provided canonical Huffman code.
+ The code lengths are lens[0..codes-1]. The result starts at *table,
+ whose indices are 0..2^bits-1. work is a writable array of at least
+ lens shorts, which is used as a work area. type is the type of code
+ to be generated, CODES, LENS, or DISTS. On return, zero is success,
+ -1 is an invalid code, and +1 means that ENOUGH isn't enough. table
+ on return points to the next available entry's address. bits is the
+ requested root table index bits, and on return it is the actual root
+ table index bits. It will differ if the request is greater than the
+ longest code or if it is less than the shortest code.
+ */
+int zlib_inflate_table(codetype type, unsigned short *lens, unsigned codes,
+ code **table, unsigned *bits, unsigned short *work)
+{
+ unsigned len; /* a code's length in bits */
+ unsigned sym; /* index of code symbols */
+ unsigned min, max; /* minimum and maximum code lengths */
+ unsigned root; /* number of index bits for root table */
+ unsigned curr; /* number of index bits for current table */
+ unsigned drop; /* code bits to drop for sub-table */
+ int left; /* number of prefix codes available */
+ unsigned used; /* code entries in table used */
+ unsigned huff; /* Huffman code */
+ unsigned incr; /* for incrementing code, index */
+ unsigned fill; /* index for replicating entries */
+ unsigned low; /* low bits for current root entry */
+ unsigned mask; /* mask for low root bits */
+ code this; /* table entry for duplication */
+ code *next; /* next available space in table */
+ const unsigned short *base; /* base value table to use */
+ const unsigned short *extra; /* extra bits table to use */
+ int end; /* use base and extra for symbol > end */
+ unsigned short count[MAXBITS+1]; /* number of codes of each length */
+ unsigned short offs[MAXBITS+1]; /* offsets in table for each length */
+ static const unsigned short lbase[31] = { /* Length codes 257..285 base */
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+ 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
+ static const unsigned short lext[31] = { /* Length codes 257..285 extra */
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
+ 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 201, 196};
+ static const unsigned short dbase[32] = { /* Distance codes 0..29 base */
+ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
+ 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
+ 8193, 12289, 16385, 24577, 0, 0};
+ static const unsigned short dext[32] = { /* Distance codes 0..29 extra */
+ 16, 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
+ 23, 23, 24, 24, 25, 25, 26, 26, 27, 27,
+ 28, 28, 29, 29, 64, 64};
+
+ /*
+ Process a set of code lengths to create a canonical Huffman code. The
+ code lengths are lens[0..codes-1]. Each length corresponds to the
+ symbols 0..codes-1. The Huffman code is generated by first sorting the
+ symbols by length from short to long, and retaining the symbol order
+ for codes with equal lengths. Then the code starts with all zero bits
+ for the first code of the shortest length, and the codes are integer
+ increments for the same length, and zeros are appended as the length
+ increases. For the deflate format, these bits are stored backwards
+ from their more natural integer increment ordering, and so when the
+ decoding tables are built in the large loop below, the integer codes
+ are incremented backwards.
+
+ This routine assumes, but does not check, that all of the entries in
+ lens[] are in the range 0..MAXBITS. The caller must assure this.
+ 1..MAXBITS is interpreted as that code length. Zero means that the
+ symbol does not occur in this code.
+
+ The codes are sorted by computing a count of codes for each length,
+ creating from that a table of starting indices for each length in the
+ sorted table, and then entering the symbols in order in the sorted
+ table. The sorted table is work[], with that space being provided by
+ the caller.
+
+ The length counts are used for other purposes as well, i.e. finding
+ the minimum and maximum length codes, determining if there are any
+ codes at all, checking for a valid set of lengths, and looking ahead
+ at length counts to determine sub-table sizes when building the
+ decoding tables.
+ */
+
+ /* accumulate lengths for codes (assumes lens[] all in 0..MAXBITS) */
+ for (len = 0; len <= MAXBITS; len++)
+ count[len] = 0;
+ for (sym = 0; sym < codes; sym++)
+ count[lens[sym]]++;
+
+ /* bound code lengths, force root to be within code lengths */
+ root = *bits;
+ for (max = MAXBITS; max >= 1; max--)
+ if (count[max] != 0) break;
+ if (root > max) root = max;
+ if (max == 0) { /* no symbols to code at all */
+ this.op = (unsigned char)64; /* invalid code marker */
+ this.bits = (unsigned char)1;
+ this.val = (unsigned short)0;
+ *(*table)++ = this; /* make a table to force an error */
+ *(*table)++ = this;
+ *bits = 1;
+ return 0; /* no symbols, but wait for decoding to report error */
+ }
+ for (min = 1; min <= MAXBITS; min++)
+ if (count[min] != 0) break;
+ if (root < min) root = min;
+
+ /* check for an over-subscribed or incomplete set of lengths */
+ left = 1;
+ for (len = 1; len <= MAXBITS; len++) {
+ left <<= 1;
+ left -= count[len];
+ if (left < 0) return -1; /* over-subscribed */
+ }
+ if (left > 0 && (type == CODES || max != 1))
+ return -1; /* incomplete set */
+
+ /* generate offsets into symbol table for each length for sorting */
+ offs[1] = 0;
+ for (len = 1; len < MAXBITS; len++)
+ offs[len + 1] = offs[len] + count[len];
+
+ /* sort symbols by length, by symbol order within each length */
+ for (sym = 0; sym < codes; sym++)
+ if (lens[sym] != 0) work[offs[lens[sym]]++] = (unsigned short)sym;
+
+ /*
+ Create and fill in decoding tables. In this loop, the table being
+ filled is at next and has curr index bits. The code being used is huff
+ with length len. That code is converted to an index by dropping drop
+ bits off of the bottom. For codes where len is less than drop + curr,
+ those top drop + curr - len bits are incremented through all values to
+ fill the table with replicated entries.
+
+ root is the number of index bits for the root table. When len exceeds
+ root, sub-tables are created pointed to by the root entry with an index
+ of the low root bits of huff. This is saved in low to check for when a
+ new sub-table should be started. drop is zero when the root table is
+ being filled, and drop is root when sub-tables are being filled.
+
+ When a new sub-table is needed, it is necessary to look ahead in the
+ code lengths to determine what size sub-table is needed. The length
+ counts are used for this, and so count[] is decremented as codes are
+ entered in the tables.
+
+ used keeps track of how many table entries have been allocated from the
+ provided *table space. It is checked when a LENS table is being made
+ against the space in *table, ENOUGH, minus the maximum space needed by
+ the worst case distance code, MAXD. This should never happen, but the
+ sufficiency of ENOUGH has not been proven exhaustively, hence the check.
+ This assumes that when type == LENS, bits == 9.
+
+ sym increments through all symbols, and the loop terminates when
+ all codes of length max, i.e. all codes, have been processed. This
+ routine permits incomplete codes, so another loop after this one fills
+ in the rest of the decoding tables with invalid code markers.
+ */
+
+ /* set up for code type */
+ switch (type) {
+ case CODES:
+ base = extra = work; /* dummy value--not used */
+ end = 19;
+ break;
+ case LENS:
+ base = lbase;
+ base -= 257;
+ extra = lext;
+ extra -= 257;
+ end = 256;
+ break;
+ default: /* DISTS */
+ base = dbase;
+ extra = dext;
+ end = -1;
+ }
+
+ /* initialize state for loop */
+ huff = 0; /* starting code */
+ sym = 0; /* starting code symbol */
+ len = min; /* starting code length */
+ next = *table; /* current table to fill in */
+ curr = root; /* current table index bits */
+ drop = 0; /* current bits to drop from code for index */
+ low = (unsigned)(-1); /* trigger new sub-table when len > root */
+ used = 1U << root; /* use root table entries */
+ mask = used - 1; /* mask for comparing low */
+
+ /* check available table space */
+ if (type == LENS && used >= ENOUGH - MAXD)
+ return 1;
+
+ /* process all codes and make table entries */
+ for (;;) {
+ /* create table entry */
+ this.bits = (unsigned char)(len - drop);
+ if ((int)(work[sym]) < end) {
+ this.op = (unsigned char)0;
+ this.val = work[sym];
+ }
+ else if ((int)(work[sym]) > end) {
+ this.op = (unsigned char)(extra[work[sym]]);
+ this.val = base[work[sym]];
+ }
+ else {
+ this.op = (unsigned char)(32 + 64); /* end of block */
+ this.val = 0;
+ }
+
+ /* replicate for those indices with low len bits equal to huff */
+ incr = 1U << (len - drop);
+ fill = 1U << curr;
+ min = fill; /* save offset to next table */
+ do {
+ fill -= incr;
+ next[(huff >> drop) + fill] = this;
+ } while (fill != 0);
+
+ /* backwards increment the len-bit code huff */
+ incr = 1U << (len - 1);
+ while (huff & incr)
+ incr >>= 1;
+ if (incr != 0) {
+ huff &= incr - 1;
+ huff += incr;
+ }
+ else
+ huff = 0;
+
+ /* go to next symbol, update count, len */
+ sym++;
+ if (--(count[len]) == 0) {
+ if (len == max) break;
+ len = lens[work[sym]];
+ }
+
+ /* create new sub-table if needed */
+ if (len > root && (huff & mask) != low) {
+ /* if first time, transition to sub-tables */
+ if (drop == 0)
+ drop = root;
+
+ /* increment past last table */
+ next += min; /* here min is 1 << curr */
+
+ /* determine length of next table */
+ curr = len - drop;
+ left = (int)(1 << curr);
+ while (curr + drop < max) {
+ left -= count[curr + drop];
+ if (left <= 0) break;
+ curr++;
+ left <<= 1;
+ }
+
+ /* check for enough space */
+ used += 1U << curr;
+ if (type == LENS && used >= ENOUGH - MAXD)
+ return 1;
+
+ /* point entry in root table to sub-table */
+ low = huff & mask;
+ (*table)[low].op = (unsigned char)curr;
+ (*table)[low].bits = (unsigned char)root;
+ (*table)[low].val = (unsigned short)(next - *table);
+ }
+ }
+
+ /*
+ Fill in rest of table for incomplete codes. This loop is similar to the
+ loop above in incrementing huff for table indices. It is assumed that
+ len is equal to curr + drop, so there is no loop needed to increment
+ through high index bits. When the current sub-table is filled, the loop
+ drops back to the root table to fill in any remaining entries there.
+ */
+ this.op = (unsigned char)64; /* invalid code marker */
+ this.bits = (unsigned char)(len - drop);
+ this.val = (unsigned short)0;
+ while (huff != 0) {
+ /* when done with sub-table, drop back to root table */
+ if (drop != 0 && (huff & mask) != low) {
+ drop = 0;
+ len = root;
+ next = *table;
+ this.bits = (unsigned char)len;
+ }
+
+ /* put invalid code marker in table */
+ next[huff >> drop] = this;
+
+ /* backwards increment the len-bit code huff */
+ incr = 1U << (len - 1);
+ while (huff & incr)
+ incr >>= 1;
+ if (incr != 0) {
+ huff &= incr - 1;
+ huff += incr;
+ }
+ else
+ huff = 0;
+ }
+
+ /* set return parameters */
+ *table += used;
+ *bits = root;
+ return 0;
+}
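
The comment in zlib_inflate_table() above describes how a canonical Huffman code is reconstructed from code lengths alone: count the codes of each length, derive the first code of each length, then hand out consecutive codes within a length in symbol order. A standalone sketch of just that assignment, not from this patch; it produces the conventional MSB-first code values, whereas the table builder above effectively walks them bit-reversed because deflate stores code bits LSB-first:

    #define MAXBITS 15

    /*
     * Assign canonical Huffman codes given only code lengths
     * (lens[i] == 0 means symbol i is unused).  Returns 0, or -1 if the
     * set of lengths is over-subscribed.
     */
    static int assign_canonical_codes(const unsigned short *lens, unsigned codes,
                                      unsigned *huff /* huff[codes], output */)
    {
            unsigned short count[MAXBITS + 1] = { 0 };
            unsigned next_code[MAXBITS + 1];
            unsigned len, sym, code = 0;
            int left = 1;

            for (sym = 0; sym < codes; sym++)
                    count[lens[sym]]++;

            /* over-subscription check, as in zlib_inflate_table() */
            for (len = 1; len <= MAXBITS; len++) {
                    left <<= 1;
                    left -= count[len];
                    if (left < 0)
                            return -1;
            }

            /* first code of each length: previous first code plus previous
               count, shifted left by one as the length grows */
            count[0] = 0;
            for (len = 1; len <= MAXBITS; len++) {
                    code = (code + count[len - 1]) << 1;
                    next_code[len] = code;
            }

            /* consecutive codes within each length, in symbol order */
            for (sym = 0; sym < codes; sym++)
                    if (lens[sym])
                            huff[sym] = next_code[lens[sym]]++;

            return 0;
    }
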
diff --git a/linux/zlib_inflate/inftrees.h b/linux/zlib_inflate/inftrees.h
new file mode 100644
index 0000000..b70b473
--- /dev/null
+++ b/linux/zlib_inflate/inftrees.h
@@ -0,0 +1,59 @@
+#ifndef INFTREES_H
+#define INFTREES_H
+
+/* inftrees.h -- header to use inftrees.c
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+ part of the implementation of the compression library and is
+ subject to change. Applications should only use zlib.h.
+ */
+
+/* Structure for decoding tables. Each entry provides either the
+ information needed to do the operation requested by the code that
+ indexed that table entry, or it provides a pointer to another
+ table that indexes more bits of the code. op indicates whether
+ the entry is a pointer to another table, a literal, a length or
+ distance, an end-of-block, or an invalid code. For a table
+ pointer, the low four bits of op is the number of index bits of
+ that table. For a length or distance, the low four bits of op
+ is the number of extra bits to get after the code. bits is
+ the number of bits in this code or part of the code to drop off
+ of the bit buffer. val is the actual byte to output in the case
+ of a literal, the base length or distance, or the offset from
+ the current table to the next table. Each entry is four bytes. */
+typedef struct {
+ unsigned char op; /* operation, extra bits, table bits */
+ unsigned char bits; /* bits in this part of the code */
+ unsigned short val; /* offset in table or code value */
+} code;
+
+/* op values as set by inflate_table():
+ 00000000 - literal
+ 0000tttt - table link, tttt != 0 is the number of table index bits
+ 0001eeee - length or distance, eeee is the number of extra bits
+ 01100000 - end of block
+ 01000000 - invalid code
+ */
+
+/* Maximum size of dynamic tree. The maximum found in a long but non-
+ exhaustive search was 1444 code structures (852 for length/literals
+ and 592 for distances, the latter actually the result of an
+ exhaustive search). The true maximum is not known, but the value
+ below is more than safe. */
+#define ENOUGH 2048
+#define MAXD 592
+
+/* Type of code to build for inftable() */
+typedef enum {
+ CODES,
+ LENS,
+ DISTS
+} codetype;
+
+extern int zlib_inflate_table (codetype type, unsigned short *lens,
+ unsigned codes, code **table,
+ unsigned *bits, unsigned short *work);
+#endif
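
A sketch, not from this patch, of how a consumer interprets one code entry according to the op encoding listed above (the real consumers are inflate.c and the fast-path decoder; the enum and function names are made up):

    #include "inftrees.h"

    enum entry_kind { ENTRY_LITERAL, ENTRY_LENGTH, ENTRY_LINK, ENTRY_END, ENTRY_INVALID };

    /* Classify the table entry selected by the low table_bits bits of hold. */
    static enum entry_kind classify_entry(const code *table, unsigned table_bits,
                                          unsigned long hold)
    {
            code this = table[hold & ((1U << table_bits) - 1)];

            if (this.op == 0)
                    return ENTRY_LITERAL;   /* val is the byte to output */
            if (this.op & 64)
                    return this.op & 32 ? ENTRY_END : ENTRY_INVALID;
            if (this.op & 16)
                    return ENTRY_LENGTH;    /* val is the base, op & 15 extra bits follow */
            return ENTRY_LINK;              /* sub-table at table + val, op & 15 index bits */
    }

In every case, this.bits is how many bits of the code the entry consumes, which is what inflate.c passes to DROPBITS() after acting on the entry.
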
diff --git a/linux/zlib_inflate/infutil.c b/linux/zlib_inflate/infutil.c
new file mode 100644
index 0000000..4824c2c
--- /dev/null
+++ b/linux/zlib_inflate/infutil.c
@@ -0,0 +1,49 @@
+#include <linux/zutil.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/* Utility function: initialize zlib, unpack binary blob, clean up zlib,
+ * return len or negative error code.
+ */
+int zlib_inflate_blob(void *gunzip_buf, unsigned int sz,
+ const void *buf, unsigned int len)
+{
+ const u8 *zbuf = buf;
+ struct z_stream_s *strm;
+ int rc;
+
+ rc = -ENOMEM;
+ strm = kmalloc(sizeof(*strm), GFP_KERNEL);
+ if (strm == NULL)
+ goto gunzip_nomem1;
+ strm->workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
+ if (strm->workspace == NULL)
+ goto gunzip_nomem2;
+
+ /* gzip header (1f,8b,08... 10 bytes total + possible asciz filename)
+ * expected to be stripped from input
+ */
+ strm->next_in = zbuf;
+ strm->avail_in = len;
+ strm->next_out = gunzip_buf;
+ strm->avail_out = sz;
+
+ rc = zlib_inflateInit2(strm, -MAX_WBITS);
+ if (rc == Z_OK) {
+ rc = zlib_inflate(strm, Z_FINISH);
+ /* after Z_FINISH, only Z_STREAM_END is "we unpacked it all" */
+ if (rc == Z_STREAM_END)
+ rc = sz - strm->avail_out;
+ else
+ rc = -EINVAL;
+ zlib_inflateEnd(strm);
+ } else
+ rc = -EINVAL;
+
+ kfree(strm->workspace);
+gunzip_nomem2:
+ kfree(strm);
+gunzip_nomem1:
+ return rc; /* inflated length on success, negative error code on failure */
+}
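
A hedged usage sketch for zlib_inflate_blob(), assuming its prototype is exposed through linux/zlib.h as in the kernel: the caller passes a raw-deflate payload with the gzip header already stripped and a destination buffer known to be large enough, and gets back the inflated length or a negative errno:

    #include <linux/zlib.h>

    /* Unpack an embedded, header-stripped deflate blob into a fixed buffer. */
    static int unpack_blob(const void *blob, unsigned int blob_len,
                           void *out, unsigned int out_max)
    {
            int len = zlib_inflate_blob(out, out_max, blob, blob_len);

            if (len < 0)
                    return len;     /* -ENOMEM or -EINVAL from the helper */
            /* len bytes of decompressed data are now in out */
            return len;
    }
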
diff --git a/linux/zlib_inflate/infutil.h b/linux/zlib_inflate/infutil.h
new file mode 100644
index 0000000..eb1a900
--- /dev/null
+++ b/linux/zlib_inflate/infutil.h
@@ -0,0 +1,25 @@
+/* infutil.h -- types and macros common to blocks and codes
+ * Copyright (C) 1995-1998 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+ part of the implementation of the compression library and is
+ subject to change. Applications should only use zlib.h.
+ */
+
+#ifndef _INFUTIL_H
+#define _INFUTIL_H
+
+#include <linux/zlib.h>
+
+/* memory allocation for inflation */
+
+struct inflate_workspace {
+ struct inflate_state inflate_state;
+ unsigned char working_window[1 << MAX_WBITS];
+};
+
+#define WS(z) ((struct inflate_workspace *)(z->workspace))
+
+#endif
diff --git a/tools-util.c b/tools-util.c
new file mode 100644
index 0000000..68a42a9
--- /dev/null
+++ b/tools-util.c
@@ -0,0 +1,314 @@
+#include <alloca.h>
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/fs.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <uuid/uuid.h>
+
+#include "ccan/crc/crc.h"
+
+#include "tools-util.h"
+
+/* Integer stuff: */
+
+struct units_buf pr_units(u64 v, enum units units)
+{
+ struct units_buf ret;
+
+ switch (units) {
+ case BYTES:
+ snprintf(ret.b, sizeof(ret.b), "%llu", v << 9);
+ break;
+ case SECTORS:
+ snprintf(ret.b, sizeof(ret.b), "%llu", v);
+ break;
+ case HUMAN_READABLE:
+ v <<= 9;
+
+ if (v >= 1024) {
+ int exp = log(v) / log(1024);
+ snprintf(ret.b, sizeof(ret.b), "%.1f%c",
+ v / pow(1024, exp),
+ "KMGTPE"[exp-1]);
+ } else {
+ snprintf(ret.b, sizeof(ret.b), "%llu", v);
+ }
+
+ break;
+ }
+
+ return ret;
+}
+
+/* Argument parsing stuff: */
+
+long strtoul_or_die(const char *p, size_t max, const char *msg)
+{
+ errno = 0;
+ long v = strtol(p, NULL, 10);
+ if (errno || v < 0 || v >= max)
+ die("Invalid %s %zi", msg, v);
+
+ return v;
+}
+
+u64 hatoi(const char *s)
+{
+ char *e;
+ long long i = strtoll(s, &e, 10);
+ switch (*e) {
+ case 't':
+ case 'T':
+ i *= 1024;
+ case 'g':
+ case 'G':
+ i *= 1024;
+ case 'm':
+ case 'M':
+ i *= 1024;
+ case 'k':
+ case 'K':
+ i *= 1024;
+ }
+ return i;
+}
+
+unsigned hatoi_validate(const char *s, const char *msg)
+{
+ u64 v = hatoi(s);
+
+ if (v & (v - 1))
+ die("%s must be a power of two", msg);
+
+ v /= 512;
+
+ if (v > USHRT_MAX)
+ die("%s too large\n", msg);
+
+ if (!v)
+ die("%s too small\n", msg);
+
+ return v;
+}
+
+unsigned nr_args(char * const *args)
+{
+ unsigned i;
+
+ for (i = 0; args[i]; i++)
+ ;
+
+ return i;
+}
+
+/* File parsing (i.e. sysfs) */
+
+char *read_file_str(int dirfd, const char *path)
+{
+ int fd = openat(dirfd, path, O_RDONLY);
+
+ if (fd < 0)
+ die("Unable to open %s\n", path);
+
+ struct stat statbuf;
+ if (fstat(fd, &statbuf) < 0)
+ die("fstat error\n");
+
+ char *buf = malloc(statbuf.st_size + 1);
+
+ int len = read(fd, buf, statbuf.st_size);
+ if (len < 0)
+ die("read error while reading from file %s\n", path);
+
+ buf[len] = '\0';
+ if (len && buf[len - 1] == '\n')
+ buf[len - 1] = '\0';
+
+ close(fd);
+
+ return buf;
+}
+
+u64 read_file_u64(int dirfd, const char *path)
+{
+ char *buf = read_file_str(dirfd, path);
+ u64 ret = strtoll(buf, NULL, 10);
+
+ free(buf);
+ return ret;
+}
+
+/* String list options: */
+
+ssize_t read_string_list(const char *buf, const char * const list[])
+{
+ size_t i;
+ char *s, *d = strdup(buf);
+ if (!d)
+ return -ENOMEM;
+
+ s = strim(d);
+
+ for (i = 0; list[i]; i++)
+ if (!strcmp(list[i], s))
+ break;
+
+ free(d);
+
+ if (!list[i])
+ return -EINVAL;
+
+ return i;
+}
+
+ssize_t read_string_list_or_die(const char *opt, const char * const list[],
+ const char *msg)
+{
+ ssize_t v = read_string_list(opt, list);
+ if (v < 0)
+ die("Bad %s %s", msg, opt);
+
+ return v;
+}
+
+void print_string_list(const char * const list[], size_t selected)
+{
+ size_t i;
+
+ for (i = 0; list[i]; i++) {
+ if (i)
+ putchar(' ');
+ printf(i == selected ? "[%s] ": "%s", list[i]);
+ }
+}
+
+/* Returns size of file or block device, in units of 512 byte sectors: */
+u64 get_size(const char *path, int fd)
+{
+ struct stat statbuf;
+ if (fstat(fd, &statbuf))
+ die("Error statting %s: %s", path, strerror(errno));
+
+ if (!S_ISBLK(statbuf.st_mode))
+ return statbuf.st_size >> 9;
+
+ u64 ret;
+ if (ioctl(fd, BLKGETSIZE64, &ret))
+ die("Error getting block device size on %s: %s\n",
+ path, strerror(errno));
+
+ return ret >> 9;
+}
+
+/* Returns blocksize in units of 512 byte sectors: */
+unsigned get_blocksize(const char *path, int fd)
+{
+ struct stat statbuf;
+ if (fstat(fd, &statbuf))
+ die("Error statting %s: %s", path, strerror(errno));
+
+ if (!S_ISBLK(statbuf.st_mode))
+ return statbuf.st_blksize >> 9;
+
+ unsigned ret;
+ if (ioctl(fd, BLKPBSZGET, &ret))
+ die("Error getting blocksize on %s: %s\n",
+ path, strerror(errno));
+
+ return ret >> 9;
+}
+
+/* Global control device: */
+int bcachectl_open(void)
+{
+ int fd = open("/dev/bcache-ctl", O_RDWR);
+ if (fd < 0)
+ die("Can't open bcache device: %s", strerror(errno));
+
+ return fd;
+}
+
+/* Filesystem handles (ioctl, sysfs dir): */
+
+#define SYSFS_BASE "/sys/fs/bcache/"
+
+struct bcache_handle bcache_fs_open(const char *path)
+{
+ struct bcache_handle ret;
+ uuid_t tmp;
+
+ if (!uuid_parse(path, tmp)) {
+ /* It's a UUID, look it up in sysfs: */
+
+ char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(path) + 1);
+ sprintf(sysfs, "%s%s", SYSFS_BASE, path);
+
+ ret.sysfs_fd = open(sysfs, O_RDONLY);
+ if (ret.sysfs_fd < 0)
+ die("Unable to open %s\n", path);
+
+ char *minor = read_file_str(ret.sysfs_fd, "minor");
+ char *ctl = alloca(20 + strlen(minor));
+
+ sprintf(ctl, "/dev/bcache%s-ctl", minor);
+ free(minor);
+
+ ret.ioctl_fd = open(ctl, O_RDWR);
+ if (ret.ioctl_fd < 0)
+ die("Error opening control device: %s\n",
+ strerror(errno));
+ } else {
+ /* It's a path: */
+
+ ret.ioctl_fd = open(path, O_RDONLY);
+ if (ret.ioctl_fd < 0)
+ die("Error opening %s: %s\n",
+ path, strerror(errno));
+
+ struct bch_ioctl_query_uuid uuid;
+ if (ioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid))
+ die("ioctl error (not a bcache fs?): %s\n",
+ strerror(errno));
+
+ char uuid_str[40];
+ uuid_unparse(uuid.uuid.b, uuid_str);
+
+ char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(uuid_str) + 1);
+ sprintf(sysfs, "%s%s", SYSFS_BASE, uuid_str);
+
+ ret.sysfs_fd = open(sysfs, O_RDONLY);
+ if (ret.sysfs_fd < 0)
+ die("Unable to open sysfs dir %s: %s\n",
+ sysfs, strerror(errno));
+ }
+
+ return ret;
+}
+
+bool ask_proceed(void)
+{
+ const char *short_yes = "yY";
+ char *buf = NULL;
+ size_t buflen = 0;
+ bool ret;
+
+ fputs("Proceed anyway? (y,n) ", stdout);
+
+ if (getline(&buf, &buflen, stdin) < 0)
+ die("error reading from standard input");
+
+ ret = strchr(short_yes, buf[0]);
+ free(buf);
+ return ret;
+}
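
A few usage examples for the helpers above, not from this patch; the option values and the string list are made up:

    #include <stdio.h>

    #include "tools-util.h"

    static const char * const cache_replacement_policies[] = {
            "lru", "fifo", "random", NULL
    };

    static void example(void)
    {
            /* "1M" -> 1048576 bytes -> 2048 sectors, power-of-two checked */
            unsigned bucket_sectors = hatoi_validate("1M", "bucket size");

            /* print the same quantity in each supported unit */
            printf("%s bytes, %s sectors, %s\n",
                   pr_units(bucket_sectors, BYTES).b,
                   pr_units(bucket_sectors, SECTORS).b,
                   pr_units(bucket_sectors, HUMAN_READABLE).b);

            /* map a keyword to its index, dying on an unknown value */
            ssize_t idx = read_string_list_or_die("lru",
                                    cache_replacement_policies,
                                    "cache replacement policy");
            print_string_list(cache_replacement_policies, idx);
            putchar('\n');
    }
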
diff --git a/tools-util.h b/tools-util.h
new file mode 100644
index 0000000..5c8ea13
--- /dev/null
+++ b/tools-util.h
@@ -0,0 +1,64 @@
+#ifndef _TOOLS_UTIL_H
+#define _TOOLS_UTIL_H
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <linux/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#define die(arg, ...) \
+do { \
+ fprintf(stderr, arg "\n", ##__VA_ARGS__); \
+ exit(EXIT_FAILURE); \
+} while (0)
+
+enum units {
+ BYTES,
+ SECTORS,
+ HUMAN_READABLE,
+};
+
+struct units_buf pr_units(u64, enum units);
+
+struct units_buf {
+ char b[20];
+};
+
+long strtoul_or_die(const char *, size_t, const char *);
+
+u64 hatoi(const char *);
+unsigned hatoi_validate(const char *, const char *);
+unsigned nr_args(char * const *);
+
+char *read_file_str(int, const char *);
+u64 read_file_u64(int, const char *);
+
+ssize_t read_string_list(const char *, const char * const[]);
+ssize_t read_string_list_or_die(const char *, const char * const[],
+ const char *);
+void print_string_list(const char * const[], size_t);
+
+u64 get_size(const char *, int);
+unsigned get_blocksize(const char *, int);
+
+#include "linux/bcache.h"
+#include "linux/bcache-ioctl.h"
+
+int bcachectl_open(void);
+
+struct bcache_handle {
+ int ioctl_fd;
+ int sysfs_fd;
+};
+
+struct bcache_handle bcache_fs_open(const char *);
+
+bool ask_proceed(void);
+
+#endif /* _TOOLS_UTIL_H */
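
A sketch, not from this patch, of how a subcommand might use these handles: open the filesystem by path or UUID, query its UUID through ioctl_fd, and read a sysfs attribute relative to sysfs_fd. Only calls that already appear in tools-util.c are used; the function name is made up:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <uuid/uuid.h>

    #include "tools-util.h"

    /* Print the UUID and the "minor" sysfs attribute of a bcache filesystem. */
    static void show_fs(const char *path)
    {
            struct bcache_handle fs = bcache_fs_open(path);

            struct bch_ioctl_query_uuid u;
            if (ioctl(fs.ioctl_fd, BCH_IOCTL_QUERY_UUID, &u))
                    die("BCH_IOCTL_QUERY_UUID error: %s", strerror(errno));

            char uuid_str[40];
            uuid_unparse(u.uuid.b, uuid_str);

            char *minor = read_file_str(fs.sysfs_fd, "minor");
            printf("%s: uuid %s, minor %s\n", path, uuid_str, minor);
            free(minor);
    }
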
diff --git a/util.h b/util.h
deleted file mode 100644
index b5ea071..0000000
--- a/util.h
+++ /dev/null
@@ -1,144 +0,0 @@
-#ifndef _UTIL_H
-#define _UTIL_H
-
-#include <stdbool.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-
-/* linux kernel style types: */
-
-#include <asm/types.h>
-#include <asm/byteorder.h>
-
-typedef __u8 u8;
-typedef __u16 u16;
-typedef __u32 u32;
-typedef __u64 u64;
-
-typedef __s8 s8;
-typedef __s16 s16;
-typedef __s32 s32;
-typedef __s64 s64;
-
-#define cpu_to_le16 __cpu_to_le16
-#define cpu_to_le32 __cpu_to_le32
-#define cpu_to_le64 __cpu_to_le64
-
-#define le16_to_cpu __le16_to_cpu
-#define le32_to_cpu __le32_to_cpu
-#define le64_to_cpu __le64_to_cpu
-
-static inline void le16_add_cpu(__le16 *var, u16 val)
-{
- *var = cpu_to_le16(le16_to_cpu(*var) + val);
-}
-
-static inline void le32_add_cpu(__le32 *var, u32 val)
-{
- *var = cpu_to_le32(le32_to_cpu(*var) + val);
-}
-
-static inline void le64_add_cpu(__le64 *var, u64 val)
-{
- *var = cpu_to_le64(le64_to_cpu(*var) + val);
-}
-
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-
-#define min(x, y) ({ \
- typeof(x) _min1 = (x); \
- typeof(y) _min2 = (y); \
- (void) (&_min1 == &_min2); \
- _min1 < _min2 ? _min1 : _min2; })
-
-#define max(x, y) ({ \
- typeof(x) _max1 = (x); \
- typeof(y) _max2 = (y); \
- (void) (&_max1 == &_max2); \
- _max1 > _max2 ? _max1 : _max2; })
-
-#define max_t(type, x, y) ({ \
- type __max1 = (x); \
- type __max2 = (y); \
- __max1 > __max2 ? __max1: __max2; })
-
-#define die(arg, ...) \
-do { \
- fprintf(stderr, arg "\n", ##__VA_ARGS__); \
- exit(EXIT_FAILURE); \
-} while (0)
-
-unsigned ilog2(u64);
-u64 rounddown_pow_of_two(u64);
-u64 roundup_pow_of_two(u64);
-
-char *skip_spaces(const char *str);
-char *strim(char *s);
-
-enum units {
- BYTES,
- SECTORS,
- HUMAN_READABLE,
-};
-
-struct units_buf pr_units(u64, enum units);
-
-struct units_buf {
- char b[20];
-};
-
-long strtoul_or_die(const char *, size_t, const char *);
-
-u64 hatoi(const char *);
-unsigned hatoi_validate(const char *, const char *);
-unsigned nr_args(char * const *);
-
-char *read_file_str(int, const char *);
-u64 read_file_u64(int, const char *);
-
-ssize_t read_string_list(const char *, const char * const[]);
-ssize_t read_string_list_or_die(const char *, const char * const[],
- const char *);
-void print_string_list(const char * const[], size_t);
-
-u64 get_size(const char *, int);
-unsigned get_blocksize(const char *, int);
-
-#include "bcache-ondisk.h"
-#include "bcache-ioctl.h"
-
-u64 bch_checksum(unsigned, const void *, size_t);
-
-#define __bkey_idx(_set, _offset) \
- ((_set)->_data + (_offset))
-
-#define __bset_bkey_last(_set) \
- __bkey_idx((_set), (_set)->u64s)
-
-#define __csum_set(i, u64s, type) \
-({ \
- const void *start = ((const void *) (i)) + sizeof(i->csum); \
- const void *end = __bkey_idx(i, u64s); \
- \
- bch_checksum(type, start, end - start); \
-})
-
-#define csum_set(i, type) __csum_set(i, (i)->u64s, type)
-
-int bcachectl_open(void);
-
-#include <dirent.h>
-
-struct bcache_handle {
- DIR *sysfs;
- int fd;
-};
-
-struct bcache_handle bcache_fs_open(const char *);
-
-bool ask_proceed(void);
-
-void memzero_explicit(void *, size_t);
-
-#endif /* _UTIL_H */