author		Kent Overstreet <kent.overstreet@gmail.com>	2017-03-01 01:45:15 -0900
committer	Kent Overstreet <kent.overstreet@gmail.com>	2017-03-08 02:34:33 -0900
commit		06b73dbd7ffc0296b2ecea8d3bc55bfeb72d7f2a
tree		1ba37985a18eb2d9a9616ee160c82339e23e2160
parent		171ee48e57be78f4e95954c99851553fa523bf91
-rw-r--r--  .bcache_revision  2
-rw-r--r--  Makefile  1
-rw-r--r--  bcache.c  12
-rw-r--r--  cmd_debug.c  75
-rw-r--r--  cmd_device.c  5
-rw-r--r--  cmd_format.c  193
-rw-r--r--  cmd_fsck.c  2
-rw-r--r--  cmd_key.c  37
-rw-r--r--  cmd_migrate.c  835
-rw-r--r--  cmd_run.c  5
-rw-r--r--  cmds.h  3
-rw-r--r--  crypto.c  72
-rw-r--r--  crypto.h  1
-rw-r--r--  include/linux/bcache.h  2
-rw-r--r--  include/linux/blkdev.h  2
-rw-r--r--  include/linux/generic-radix-tree.h  13
-rw-r--r--  include/linux/percpu-refcount.h  5
-rw-r--r--  libbcache.c  207
-rw-r--r--  libbcache.h  55
-rw-r--r--  libbcache/alloc.c  189
-rw-r--r--  libbcache/alloc.h  7
-rw-r--r--  libbcache/alloc_types.h  2
-rw-r--r--  libbcache/bcache.h  47
-rw-r--r--  libbcache/blockdev.c  12
-rw-r--r--  libbcache/btree_cache.c  7
-rw-r--r--  libbcache/btree_cache.h  6
-rw-r--r--  libbcache/btree_gc.c  105
-rw-r--r--  libbcache/buckets.c  103
-rw-r--r--  libbcache/buckets.h  15
-rw-r--r--  libbcache/buckets_types.h  27
-rw-r--r--  libbcache/chardev.c  2
-rw-r--r--  libbcache/checksum.c  7
-rw-r--r--  libbcache/checksum.h  2
-rw-r--r--  libbcache/compress.c  14
-rw-r--r--  libbcache/compress.h  4
-rw-r--r--  libbcache/debug.c  10
-rw-r--r--  libbcache/debug.h  8
-rw-r--r--  libbcache/error.c  6
-rw-r--r--  libbcache/extents.c  43
-rw-r--r--  libbcache/fs-gc.c  10
-rw-r--r--  libbcache/fs.c  24
-rw-r--r--  libbcache/fs.h  8
-rw-r--r--  libbcache/io.c  25
-rw-r--r--  libbcache/journal.c  318
-rw-r--r--  libbcache/journal.h  17
-rw-r--r--  libbcache/movinggc.c  27
-rw-r--r--  libbcache/movinggc.h  4
-rw-r--r--  libbcache/opts.h  10
-rw-r--r--  libbcache/super-io.c  170
-rw-r--r--  libbcache/super-io.h  46
-rw-r--r--  libbcache/super.c  709
-rw-r--r--  libbcache/super.h  32
-rw-r--r--  libbcache/super_types.h  1
-rw-r--r--  libbcache/sysfs.c  60
-rw-r--r--  libbcache/tier.c  101
-rw-r--r--  libbcache/tier.h  6
-rw-r--r--  linux/blkdev.c  6
-rw-r--r--  qcow2.c  37
-rw-r--r--  qcow2.h  19
-rw-r--r--  tools-util.c  171
-rw-r--r--  tools-util.h  120
61 files changed, 2714 insertions, 1350 deletions
diff --git a/.bcache_revision b/.bcache_revision
index 58bdf2d..b86381a 100644
--- a/.bcache_revision
+++ b/.bcache_revision
@@ -1 +1 @@
-BCACHE_REVISION=aa4471ac314a1f117957f9fc59c1bfbdf965a28c
+BCACHE_REVISION=c1f1a9e1d9b9664db9c9c03cbac455c2750335bc
diff --git a/Makefile b/Makefile
index 2defed0..682bf8e 100644
--- a/Makefile
+++ b/Makefile
@@ -56,6 +56,7 @@ OBJS=bcache.o \
cmd_fsck.o \
cmd_format.o \
cmd_key.o \
+ cmd_migrate.o \
cmd_run.o \
crypto.o \
libbcache.o \
diff --git a/bcache.c b/bcache.c
index ac9eb07..a0fa860 100644
--- a/bcache.c
+++ b/bcache.c
@@ -50,7 +50,12 @@ static void usage(void)
"\n"
"Debug:\n"
" bcache dump Dump filesystem metadata to a qcow2 image\n"
- " bcache list List filesystem metadata in textual form\n");
+ " bcache list List filesystem metadata in textual form\n"
+ "\n"
+ "Migrate:\n"
+ " bcache migrate Migrate an existing filesystem to bcachefs, in place\n"
+ " bcache migrate_superblock\n"
+ " Add default superblock, after bcache migrate\n");
}
int main(int argc, char *argv[])
@@ -104,6 +109,11 @@ int main(int argc, char *argv[])
if (!strcmp(cmd, "list"))
return cmd_list(argc, argv);
+ if (!strcmp(cmd, "migrate"))
+ return cmd_migrate(argc, argv);
+ if (!strcmp(cmd, "migrate_superblock"))
+ return cmd_migrate_superblock(argc, argv);
+
usage();
return 0;
}
diff --git a/cmd_debug.c b/cmd_debug.c
index 4f2586d..ca0f453 100644
--- a/cmd_debug.c
+++ b/cmd_debug.c
@@ -30,35 +30,35 @@ static void dump_usage(void)
static void dump_one_device(struct cache_set *c, struct cache *ca, int fd)
{
struct bch_sb *sb = ca->disk_sb.sb;
- sparse_data data;
+ ranges data;
unsigned i;
darray_init(data);
/* Superblock: */
- data_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
- sizeof(struct bch_sb_layout));
+ range_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
+ sizeof(struct bch_sb_layout));
for (i = 0; i < sb->layout.nr_superblocks; i++)
- data_add(&data,
- le64_to_cpu(sb->layout.sb_offset[i]) << 9,
- vstruct_bytes(sb));
+ range_add(&data,
+ le64_to_cpu(sb->layout.sb_offset[i]) << 9,
+ vstruct_bytes(sb));
/* Journal: */
for (i = 0; i < ca->journal.nr; i++)
if (ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) {
u64 bucket = ca->journal.buckets[i];
- data_add(&data,
- bucket_bytes(ca) * bucket,
- bucket_bytes(ca));
+ range_add(&data,
+ bucket_bytes(ca) * bucket,
+ bucket_bytes(ca));
}
/* Prios/gens: */
for (i = 0; i < prio_buckets(ca); i++)
- data_add(&data,
- bucket_bytes(ca) * ca->prio_last_buckets[i],
- bucket_bytes(ca));
+ range_add(&data,
+ bucket_bytes(ca) * ca->prio_last_buckets[i],
+ bucket_bytes(ca));
/* Btree: */
for (i = 0; i < BTREE_ID_NR; i++) {
@@ -71,9 +71,9 @@ static void dump_one_device(struct cache_set *c, struct cache *ca, int fd)
extent_for_each_ptr(e, ptr)
if (ptr->dev == ca->dev_idx)
- data_add(&data,
- ptr->offset << 9,
- b->written << 9);
+ range_add(&data,
+ ptr->offset << 9,
+ b->written << 9);
}
bch_btree_iter_unlock(&iter);
}
@@ -87,7 +87,7 @@ int cmd_dump(int argc, char *argv[])
struct bch_opts opts = bch_opts_empty();
struct cache_set *c = NULL;
const char *err;
- char *out = NULL, *buf;
+ char *out = NULL;
unsigned i, nr_devices = 0;
bool force = false;
int fd, opt;
@@ -116,9 +116,6 @@ int cmd_dump(int argc, char *argv[])
if (!out)
die("Please supply output filename");
- buf = alloca(strlen(out) + 10);
- strcpy(buf, out);
-
err = bch_fs_open(argv + optind, argc - optind, opts, &c);
if (err)
die("error opening %s: %s", argv[optind], err);
@@ -140,12 +137,11 @@ int cmd_dump(int argc, char *argv[])
if (!c->cache[i])
continue;
- if (nr_devices > 1)
- sprintf(buf, "%s.%u", out, i);
-
- fd = open(buf, mode, 0600);
- if (fd < 0)
- die("error opening %s: %s", buf, strerror(errno));
+ char *path = nr_devices > 1
+ ? mprintf("%s.%u", out, i)
+ : strdup(out);
+ fd = xopen(path, mode, 0600);
+ free(path);
dump_one_device(c, c->cache[i], fd);
close(fd);
@@ -153,7 +149,7 @@ int cmd_dump(int argc, char *argv[])
up_read(&c->gc_lock);
- bch_fs_stop_sync(c);
+ bch_fs_stop(c);
return 0;
}
@@ -213,14 +209,20 @@ static void list_keys_usage(void)
"Usage: bcache list_keys [OPTION]... <devices>\n"
"\n"
"Options:\n"
- " -b btree_id Integer btree id to list\n"
- " -s start Start pos (as inode:offset)\n"
- " -e end End pos\n"
- " -m mode Mode for listing\n"
- " -h Display this help and exit\n"
+ " -b (extents|inodes|dirents|xattrs) Btree to list from\n"
+ " -s inode:offset Start position to list from\n"
+ " -e inode:offset End position\n"
+ " -m (keys|formats) List mode\n"
+ " -h Display this help and exit\n"
"Report bugs to <linux-bcache@vger.kernel.org>");
}
+static const char * const list_modes[] = {
+ "keys",
+ "formats",
+ NULL
+};
+
int cmd_list(int argc, char *argv[])
{
struct bch_opts opts = bch_opts_empty();
@@ -229,7 +231,6 @@ int cmd_list(int argc, char *argv[])
struct bpos start = POS_MIN, end = POS_MAX;
const char *err;
int mode = 0, opt;
- u64 v;
opts.nochanges = true;
opts.norecovery = true;
@@ -239,10 +240,8 @@ int cmd_list(int argc, char *argv[])
while ((opt = getopt(argc, argv, "b:s:e:m:h")) != -1)
switch (opt) {
case 'b':
- if (kstrtoull(optarg, 10, &v) ||
- v >= BTREE_ID_NR)
- die("invalid btree id");
- btree_id = v;
+ btree_id = read_string_list_or_die(optarg,
+ bch_btree_ids, "btree id");
break;
case 's':
start = parse_pos(optarg);
@@ -251,6 +250,8 @@ int cmd_list(int argc, char *argv[])
end = parse_pos(optarg);
break;
case 'm':
+ mode = read_string_list_or_die(optarg,
+ list_modes, "list mode");
break;
case 'h':
list_keys_usage();
@@ -275,6 +276,6 @@ int cmd_list(int argc, char *argv[])
die("Invalid mode");
}
- bch_fs_stop_sync(c);
+ bch_fs_stop(c);
return 0;
}
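
The dump command now tracks the regions it needs to copy with the new ranges/range_add helpers (replacing the old sparse_data/data_add API). A minimal sketch of what a darray-backed range list could look like; the real definitions presumably live in the tools-util.h added by this commit and may differ in detail:

struct range {
        u64     start;
        u64     end;
};

typedef darray(struct range) ranges;

/* record [offset, offset + size) as data worth dumping: */
static inline void range_add(ranges *data, u64 offset, u64 size)
{
        darray_append(*data, (struct range) {
                .start  = offset,
                .end    = offset + size,
        });
}

ranges_sort_merge() and the hole iterator used later in cmd_migrate.c then operate on the sorted, merged version of such a list.
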
diff --git a/cmd_device.c b/cmd_device.c
index 1c5208a..505fedc 100644
--- a/cmd_device.c
+++ b/cmd_device.c
@@ -121,10 +121,7 @@ int cmd_device_show(int argc, char *argv[])
char *dev_name = basename(dirname(link));
- int fd = openat(dirfd(fs.sysfs), entry->d_name, O_RDONLY);
- if (fd < 0)
- die("couldn't open device %s: %s\n",
- entry->d_name, strerror(errno));
+ int fd = xopenat(dirfd(fs.sysfs), entry->d_name, O_RDONLY);
devices[nr_devices] = fill_dev(strdup(dev_name), nr, fd);
tiers[devices[nr_devices].tier]++;
diff --git a/cmd_format.c b/cmd_format.c
index 2b1453e..f222a8b 100644
--- a/cmd_format.c
+++ b/cmd_format.c
@@ -34,10 +34,8 @@ static int open_for_format(const char *dev, bool force)
blkid_probe pr;
const char *fs_type = NULL, *fs_label = NULL;
size_t fs_type_len, fs_label_len;
- int fd;
- if ((fd = open(dev, O_RDWR|O_EXCL)) == -1)
- die("Can't open dev %s: %s\n", dev, strerror(errno));
+ int fd = xopen(dev, O_RDWR|O_EXCL);
if (force)
return fd;
@@ -70,8 +68,41 @@ static int open_for_format(const char *dev, bool force)
return fd;
}
+#define OPTS \
+t("bcache format - create a new bcache filesystem on one or more devices") \
+t("Usage: bcache format [OPTION]... <devices>") \
+t("") \
+x('b', block_size, "size", NULL) \
+x(0, btree_node_size, "size", "Default 256k") \
+x(0, metadata_checksum_type, "(none|crc32c|crc64)", NULL) \
+x(0, data_checksum_type, "(none|crc32c|crc64)", NULL) \
+x(0, compression_type, "(none|lz4|gzip)", NULL) \
+x(0, encrypted, NULL, "Enable whole filesystem encryption (chacha20/poly1305)")\
+x(0, no_passphrase, NULL, "Don't encrypt master encryption key")\
+x('e', error_action, "(continue|readonly|panic)", NULL) \
+x(0, max_journal_entry_size, "size", NULL) \
+x('L', label, "label", NULL) \
+x('U', uuid, "uuid", NULL) \
+x('f', force, NULL, NULL) \
+t("") \
+t("Device specific options:") \
+x(0, fs_size, "size", "Size of filesystem on device")\
+x(0, bucket_size, "size", "Bucket size") \
+x('t', tier, "#", "Higher tier indicates slower devices")\
+x(0, discard, NULL, NULL) \
+t("Device specific options must come before corresponding devices, e.g.") \
+t(" bcache format --tier 0 /dev/sdb --tier 1 /dev/sdc") \
+t("") \
+x('h', help, NULL, "display this help and exit")
+
static void usage(void)
{
+#define t(text) puts(text "\n")
+#define x(shortopt, longopt, arg, help) do { \
+ OPTS
+#undef x
+#undef t
+
puts("bcache format - create a new bcache filesystem on one or more devices\n"
"Usage: bcache format [OPTION]... <devices>\n"
"\n"
@@ -81,7 +112,8 @@ static void usage(void)
" --metadata_checksum_type=(none|crc32c|crc64)\n"
" --data_checksum_type=(none|crc32c|crc64)\n"
" --compression_type=(none|lz4|gzip)\n"
- " --encrypted\n"
+ " --encrypted Enable whole filesystem encryption (chacha20/poly1305)\n"
+ " --no_passphrase Don't encrypt master encryption key\n"
" --error_action=(continue|readonly|panic)\n"
" Action to take on filesystem error\n"
" --max_journal_entry_size=size\n"
@@ -103,37 +135,26 @@ static void usage(void)
"Report bugs to <linux-bcache@vger.kernel.org>");
}
-#define OPTS \
- OPT('b', block_size, required_argument) \
- OPT(0, btree_node_size, required_argument) \
- OPT(0, metadata_checksum_type, required_argument) \
- OPT(0, data_checksum_type, required_argument) \
- OPT(0, compression_type, required_argument) \
- OPT(0, encrypted, no_argument) \
- OPT('e', error_action, required_argument) \
- OPT(0, max_journal_entry_size, required_argument) \
- OPT('L', label, required_argument) \
- OPT('U', uuid, required_argument) \
- OPT('f', force, no_argument) \
- OPT(0, fs_size, required_argument) \
- OPT(0, bucket_size, required_argument) \
- OPT('t', tier, required_argument) \
- OPT(0, discard, no_argument) \
- OPT('h', help, no_argument)
-
enum {
Opt_no_opt = 1,
-#define OPT(shortopt, longopt, has_arg) Opt_##longopt,
+#define t(text)
+#define x(shortopt, longopt, arg, help) Opt_##longopt,
OPTS
-#undef OPT
+#undef x
+#undef t
};
static const struct option format_opts[] = {
-#define OPT(shortopt, longopt, has_arg) { \
- #longopt, has_arg, NULL, Opt_##longopt \
- },
+#define t(text)
+#define x(shortopt, longopt, arg, help) { \
+ .name = #longopt, \
+ .has_arg = arg ? required_argument : no_argument, \
+ .flag = NULL, \
+ .val = Opt_##longopt, \
+},
OPTS
-#undef OPT
+#undef x
+#undef t
{ NULL }
};
@@ -161,29 +182,12 @@ static unsigned hatoi_validate(const char *s, const char *msg)
int cmd_format(int argc, char *argv[])
{
darray(struct dev_opts) devices;
- struct dev_opts *dev;
- unsigned block_size = 0;
- unsigned btree_node_size = 0;
- unsigned meta_csum_type = BCH_CSUM_CRC32C;
- unsigned data_csum_type = BCH_CSUM_CRC32C;
- unsigned compression_type = BCH_COMPRESSION_NONE;
- bool encrypted = false;
- unsigned on_error_action = BCH_ON_ERROR_RO;
- char *label = NULL;
- uuid_le uuid;
- bool force = false;
-
- /* Device specific options: */
- u64 filesystem_size = 0;
- unsigned bucket_size = 0;
- unsigned tier = 0;
- bool discard = false;
- unsigned max_journal_entry_size = 0;
- char *passphrase = NULL;
+ struct format_opts opts = format_opts_default();
+ struct dev_opts dev_opts = { 0 }, *dev;
+ bool force = false, no_passphrase = false;
int opt;
darray_init(devices);
- uuid_clear(uuid.b);
while ((opt = getopt_long(argc, argv,
"-b:e:L:U:ft:h",
@@ -192,45 +196,52 @@ int cmd_format(int argc, char *argv[])
switch (opt) {
case Opt_block_size:
case 'b':
- block_size = hatoi_validate(optarg,
- "block size");
+ opts.block_size =
+ hatoi_validate(optarg, "block size");
break;
case Opt_btree_node_size:
- btree_node_size = hatoi_validate(optarg,
- "btree node size");
+ opts.btree_node_size =
+ hatoi_validate(optarg, "btree node size");
break;
case Opt_metadata_checksum_type:
- meta_csum_type = read_string_list_or_die(optarg,
+ opts.meta_csum_type =
+ read_string_list_or_die(optarg,
bch_csum_types, "checksum type");
break;
case Opt_data_checksum_type:
- data_csum_type = read_string_list_or_die(optarg,
+ opts.data_csum_type =
+ read_string_list_or_die(optarg,
bch_csum_types, "checksum type");
break;
case Opt_compression_type:
- compression_type = read_string_list_or_die(optarg,
+ opts.compression_type =
+ read_string_list_or_die(optarg,
bch_compression_types,
"compression type");
break;
case Opt_encrypted:
- encrypted = true;
+ opts.encrypted = true;
+ break;
+ case Opt_no_passphrase:
+ no_passphrase = true;
break;
case Opt_error_action:
case 'e':
- on_error_action = read_string_list_or_die(optarg,
+ opts.on_error_action =
+ read_string_list_or_die(optarg,
bch_error_actions, "error action");
break;
case Opt_max_journal_entry_size:
- max_journal_entry_size = hatoi_validate(optarg,
- "journal entry size");
+ opts.max_journal_entry_size =
+ hatoi_validate(optarg, "journal entry size");
break;
case Opt_label:
case 'L':
- label = strdup(optarg);
+ opts.label = strdup(optarg);
break;
case Opt_uuid:
case 'U':
- if (uuid_parse(optarg, uuid.b))
+ if (uuid_parse(optarg, opts.uuid.b))
die("Bad uuid");
break;
case Opt_force:
@@ -238,31 +249,28 @@ int cmd_format(int argc, char *argv[])
force = true;
break;
case Opt_fs_size:
- if (bch_strtoull_h(optarg, &filesystem_size))
+ if (bch_strtoull_h(optarg, &dev_opts.size))
die("invalid filesystem size");
- filesystem_size >>= 9;
+ dev_opts.size >>= 9;
break;
case Opt_bucket_size:
- bucket_size = hatoi_validate(optarg, "bucket size");
+ dev_opts.bucket_size =
+ hatoi_validate(optarg, "bucket size");
break;
case Opt_tier:
case 't':
- if (kstrtouint(optarg, 10, &tier) ||
- tier >= BCH_TIER_MAX)
+ if (kstrtouint(optarg, 10, &dev_opts.tier) ||
+ dev_opts.tier >= BCH_TIER_MAX)
die("invalid tier");
break;
case Opt_discard:
- discard = true;
+ dev_opts.discard = true;
break;
case Opt_no_opt:
- darray_append(devices, (struct dev_opts) {
- .path = strdup(optarg),
- .size = filesystem_size,
- .bucket_size = bucket_size,
- .tier = tier,
- .discard = discard,
- });
+ dev_opts.path = strdup(optarg);
+ darray_append(devices, dev_opts);
+ dev_opts.size = 0;
break;
case Opt_help:
case 'h':
@@ -274,18 +282,16 @@ int cmd_format(int argc, char *argv[])
if (!darray_size(devices))
die("Please supply a device");
- if (uuid_is_null(uuid.b))
- uuid_generate(uuid.b);
-
- if (encrypted) {
- passphrase = read_passphrase("Enter passphrase: ");
+ if (opts.encrypted && !no_passphrase) {
+ opts.passphrase = read_passphrase("Enter passphrase: ");
if (isatty(STDIN_FILENO)) {
char *pass2 =
read_passphrase("Enter same passphrase again: ");
- if (strcmp(passphrase, pass2)) {
- memzero_explicit(passphrase, strlen(passphrase));
+ if (strcmp(opts.passphrase, pass2)) {
+ memzero_explicit(opts.passphrase,
+ strlen(opts.passphrase));
memzero_explicit(pass2, strlen(pass2));
die("Passphrases do not match");
}
@@ -298,23 +304,14 @@ int cmd_format(int argc, char *argv[])
darray_foreach(dev, devices)
dev->fd = open_for_format(dev->path, force);
- bcache_format(devices.item, darray_size(devices),
- block_size,
- btree_node_size,
- meta_csum_type,
- data_csum_type,
- compression_type,
- passphrase,
- 1,
- 1,
- on_error_action,
- max_journal_entry_size,
- label,
- uuid);
-
- if (passphrase) {
- memzero_explicit(passphrase, strlen(passphrase));
- free(passphrase);
+ struct bch_sb *sb =
+ bcache_format(opts, devices.item, darray_size(devices));
+ bcache_super_print(sb, HUMAN_READABLE);
+ free(sb);
+
+ if (opts.passphrase) {
+ memzero_explicit(opts.passphrase, strlen(opts.passphrase));
+ free(opts.passphrase);
}
return 0;
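
The option handling above is driven by a single OPTS x-macro list: the same table expands once to generate the Opt_* enum and once to build the getopt_long() table. A stripped-down, self-contained illustration of the pattern; the identifiers below are illustrative, not the ones in cmd_format.c:

#include <stdio.h>

#define OPTS                            \
        x('b', block_size)              \
        x('L', label)

enum {
        Opt_none = 1,
#define x(shortopt, longopt)    Opt_##longopt,
        OPTS
#undef x
};

int main(void)
{
        /* expand the same list a second time to print usage text: */
#define x(shortopt, longopt)    printf("  -%c, --%s\n", shortopt, #longopt);
        OPTS
#undef x
        return 0;
}

In the patch itself the arg field of each x() entry decides between required_argument and no_argument, and the help strings are kept alongside each option so usage text can be generated from the same table.
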
diff --git a/cmd_fsck.c b/cmd_fsck.c
index a8c8dc5..6af5669 100644
--- a/cmd_fsck.c
+++ b/cmd_fsck.c
@@ -56,6 +56,6 @@ int cmd_fsck(int argc, char *argv[])
if (err)
die("error opening %s: %s", argv[optind], err);
- bch_fs_stop_sync(c);
+ bch_fs_stop(c);
return 0;
}
diff --git a/cmd_key.c b/cmd_key.c
index 587ecbe..654ad77 100644
--- a/cmd_key.c
+++ b/cmd_key.c
@@ -1,6 +1,5 @@
#include <errno.h>
#include <unistd.h>
-#include <keyutils.h>
#include <uuid/uuid.h>
#include "cmds.h"
@@ -10,52 +9,18 @@
int cmd_unlock(int argc, char *argv[])
{
- struct bch_encrypted_key sb_key;
- struct bch_key passphrase_key;
struct bch_sb *sb;
- struct bch_sb_field_crypt *crypt;
char *passphrase;
- char uuid[40];
- char description[60];
if (argc != 2)
die("please supply a single device");
sb = bcache_super_read(argv[1]);
- crypt = bch_sb_get_crypt(sb);
- if (!crypt)
- die("filesystem is not encrypted");
-
- sb_key = crypt->key;
-
- if (!bch_key_is_encrypted(&sb_key))
- die("filesystem does not have encryption key");
-
passphrase = read_passphrase("Enter passphrase: ");
- derive_passphrase(crypt, &passphrase_key, passphrase);
-
- /* Check if the user supplied the correct passphrase: */
- if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
- &sb_key, sizeof(sb_key)))
- die("error encrypting key");
-
- if (bch_key_is_encrypted(&sb_key))
- die("incorrect passphrase");
-
- uuid_unparse_lower(sb->user_uuid.b, uuid);
- sprintf(description, "bcache:%s", uuid);
- if (add_key("logon", description,
- &passphrase_key, sizeof(passphrase_key),
- KEY_SPEC_USER_KEYRING) < 0 ||
- add_key("user", description,
- &passphrase_key, sizeof(passphrase_key),
- KEY_SPEC_USER_KEYRING) < 0)
- die("add_key error: %s", strerror(errno));
+ add_bcache_key(sb, passphrase);
- memzero_explicit(&sb_key, sizeof(sb_key));
- memzero_explicit(&passphrase_key, sizeof(passphrase_key));
memzero_explicit(passphrase, strlen(passphrase));
free(passphrase);
return 0;
diff --git a/cmd_migrate.c b/cmd_migrate.c
new file mode 100644
index 0000000..3109ec7
--- /dev/null
+++ b/cmd_migrate.c
@@ -0,0 +1,835 @@
+#include </usr/include/dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+#include <unistd.h>
+#include <attr/xattr.h>
+
+#include <linux/fiemap.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+
+#include <uuid/uuid.h>
+
+#include "cmds.h"
+#include "crypto.h"
+#include "libbcache.h"
+#include "linux/bcache.h"
+
+#include <linux/dcache.h>
+#include <linux/generic-radix-tree.h>
+#include <linux/xattr.h>
+#include "btree_update.h"
+#include "buckets.h"
+#include "dirent.h"
+#include "fs.h"
+#include "inode.h"
+#include "io.h"
+#include "str_hash.h"
+#include "super.h"
+#include "xattr.h"
+
+static char *dev_t_to_path(dev_t dev)
+{
+ char link[PATH_MAX], *p;
+ int ret;
+
+ char *sysfs_dev = mprintf("/sys/dev/block/%u:%u",
+ major(dev), minor(dev));
+ ret = readlink(sysfs_dev, link, sizeof(link));
+ free(sysfs_dev);
+
+ if (ret < 0 || ret >= sizeof(link))
+ die("readlink error while looking up block device: %s", strerror(errno));
+
+ link[ret] = '\0';
+
+ p = strrchr(link, '/');
+ if (!p)
+ die("error looking up device name");
+ p++;
+
+ return mprintf("/dev/%s", p);
+}
+
+static bool path_is_fs_root(char *path)
+{
+ char *line = NULL, *p, *mount;
+ size_t n = 0;
+ FILE *f;
+ bool ret = true;
+
+ f = fopen("/proc/self/mountinfo", "r");
+ if (!f)
+ die("Error getting mount information");
+
+ while (getline(&line, &n, f) != -1) {
+ p = line;
+
+ strsep(&p, " "); /* mount id */
+ strsep(&p, " "); /* parent id */
+ strsep(&p, " "); /* dev */
+ strsep(&p, " "); /* root */
+ mount = strsep(&p, " ");
+ strsep(&p, " ");
+
+ if (mount && !strcmp(path, mount))
+ goto found;
+ }
+
+ ret = false;
+found:
+ fclose(f);
+ free(line);
+ return ret;
+}
+
+static void mark_unreserved_space(struct cache_set *c, ranges extents)
+{
+ struct cache *ca = c->cache[0];
+ struct hole_iter iter;
+ struct range i;
+
+ for_each_hole(iter, extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i) {
+ struct bucket_mark new;
+ u64 b;
+
+ if (i.start == i.end)
+ return;
+
+ b = sector_to_bucket(ca, i.start >> 9);
+ do {
+ bucket_cmpxchg(&ca->buckets[b], new, new.nouse = 1);
+ b++;
+ } while (bucket_to_sector(ca, b) << 9 < i.end);
+ }
+}
+
+static void update_inode(struct cache_set *c,
+ struct bch_inode_unpacked *inode)
+{
+ struct bkey_inode_buf packed;
+ int ret;
+
+ bch_inode_pack(&packed, inode);
+ ret = bch_btree_update(c, BTREE_ID_INODES, &packed.inode.k_i, NULL);
+ if (ret)
+ die("error creating file: %s", strerror(-ret));
+}
+
+static void create_dirent(struct cache_set *c,
+ struct bch_inode_unpacked *parent,
+ const char *name, u64 inum, mode_t mode)
+{
+ struct bch_hash_info parent_hash_info = bch_hash_info_init(parent);
+ struct qstr qname = { { { .len = strlen(name), } }, .name = name };
+
+ int ret = bch_dirent_create(c, parent->inum, &parent_hash_info,
+ mode_to_type(mode), &qname,
+ inum, NULL, BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ die("error creating file: %s", strerror(-ret));
+
+ if (S_ISDIR(mode))
+ parent->i_nlink++;
+}
+
+static void create_link(struct cache_set *c,
+ struct bch_inode_unpacked *parent,
+ const char *name, u64 inum, mode_t mode)
+{
+ struct bch_inode_unpacked inode;
+ int ret = bch_inode_find_by_inum(c, inum, &inode);
+ if (ret)
+ die("error looking up hardlink: %s", strerror(-ret));
+
+ inode.i_nlink++;
+ update_inode(c, &inode);
+
+ create_dirent(c, parent, name, inum, mode);
+}
+
+static struct bch_inode_unpacked create_file(struct cache_set *c,
+ struct bch_inode_unpacked *parent,
+ const char *name,
+ uid_t uid, gid_t gid,
+ mode_t mode, dev_t rdev)
+{
+ struct bch_inode_unpacked new_inode;
+ struct bkey_inode_buf packed;
+ int ret;
+
+ bch_inode_init(c, &new_inode, uid, gid, mode, rdev);
+ bch_inode_pack(&packed, &new_inode);
+
+ ret = bch_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0,
+ &c->unused_inode_hint);
+ if (ret)
+ die("error creating file: %s", strerror(-ret));
+
+ new_inode.inum = packed.inode.k.p.inode;
+ create_dirent(c, parent, name, new_inode.inum, mode);
+
+ return new_inode;
+}
+
+#define for_each_xattr_handler(handlers, handler) \
+ if (handlers) \
+ for ((handler) = *(handlers)++; \
+ (handler) != NULL; \
+ (handler) = *(handlers)++)
+
+static const struct xattr_handler *xattr_resolve_name(const char **name)
+{
+ const struct xattr_handler **handlers = bch_xattr_handlers;
+ const struct xattr_handler *handler;
+
+ for_each_xattr_handler(handlers, handler) {
+ const char *n;
+
+ n = strcmp_prefix(*name, xattr_prefix(handler));
+ if (n) {
+ if (!handler->prefix ^ !*n) {
+ if (*n)
+ continue;
+ return ERR_PTR(-EINVAL);
+ }
+ *name = n;
+ return handler;
+ }
+ }
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static void copy_times(struct cache_set *c, struct bch_inode_unpacked *dst,
+ struct stat *src)
+{
+ dst->i_atime = timespec_to_bch_time(c, src->st_atim);
+ dst->i_mtime = timespec_to_bch_time(c, src->st_mtim);
+ dst->i_ctime = timespec_to_bch_time(c, src->st_ctim);
+}
+
+static void copy_xattrs(struct cache_set *c, struct bch_inode_unpacked *dst,
+ char *src)
+{
+ struct bch_hash_info hash_info = bch_hash_info_init(dst);
+ ssize_t size = llistxattr(src, NULL, 0);
+ if (size < 0)
+ die("listxattr error: %s", strerror(errno));
+
+ if (!size)
+ return;
+
+ char *buf = malloc(size);
+ size = llistxattr(src, buf, size);
+ if (size < 0)
+ die("listxattr error: %s", strerror(errno));
+
+ for (const char *next, *attr = buf;
+ attr <= buf + size;
+ attr = next) {
+ next = attr + strlen(attr) + 1;
+
+ /* max possible xattr val: */
+ static char val[64 << 10];
+ ssize_t val_size = lgetxattr(src, attr, val, sizeof(val));
+
+ if (val_size < 0)
+ die("error getting xattr val: %s", strerror(errno));
+
+ const struct xattr_handler *h = xattr_resolve_name(&attr);
+
+ int ret = __bch_xattr_set(c, dst->inum, &hash_info, attr,
+ val, val_size, 0, h->flags, NULL);
+ if (ret < 0)
+ die("error creating xattr: %s", strerror(-ret));
+ }
+
+ free(buf);
+}
+
+static void write_data(struct cache_set *c,
+ struct bch_inode_unpacked *dst_inode,
+ u64 dst_offset, void *buf, size_t len)
+{
+ struct disk_reservation res;
+ struct bch_write_op op;
+ struct bch_write_bio bio;
+ struct bio_vec bv;
+ struct closure cl;
+
+ BUG_ON(dst_offset & (block_bytes(c) - 1));
+ BUG_ON(len & (block_bytes(c) - 1));
+
+ closure_init_stack(&cl);
+
+ bio_init(&bio.bio);
+ bio.bio.bi_max_vecs = 1;
+ bio.bio.bi_io_vec = &bv;
+ bio.bio.bi_iter.bi_size = len;
+ bch_bio_map(&bio.bio, buf);
+
+ int ret = bch_disk_reservation_get(c, &res, len >> 9, 0);
+ if (ret)
+ die("error reserving space in new filesystem: %s", strerror(-ret));
+
+ bch_write_op_init(&op, c, &bio, res, c->write_points,
+ POS(dst_inode->inum, dst_offset >> 9), NULL, 0);
+ closure_call(&op.cl, bch_write, NULL, &cl);
+ closure_sync(&cl);
+
+ dst_inode->i_sectors += len >> 9;
+}
+
+static char buf[1 << 20] __aligned(PAGE_SIZE);
+
+static void copy_data(struct cache_set *c,
+ struct bch_inode_unpacked *dst_inode,
+ int src_fd, u64 start, u64 end)
+{
+ while (start < end) {
+ unsigned len = min_t(u64, end - start, sizeof(buf));
+
+ xpread(src_fd, buf, len, start);
+ write_data(c, dst_inode, start, buf, len);
+ start += len;
+ }
+}
+
+static void link_data(struct cache_set *c, struct bch_inode_unpacked *dst,
+ u64 logical, u64 physical, u64 length)
+{
+ struct cache *ca = c->cache[0];
+
+ BUG_ON(logical & (block_bytes(c) - 1));
+ BUG_ON(physical & (block_bytes(c) - 1));
+ BUG_ON(length & (block_bytes(c) - 1));
+
+ logical >>= 9;
+ physical >>= 9;
+ length >>= 9;
+
+ BUG_ON(physical + length > bucket_to_sector(ca, ca->mi.nbuckets));
+
+ while (length) {
+ struct bkey_i_extent *e;
+ BKEY_PADDED(k) k;
+ u64 b = sector_to_bucket(ca, physical >> 9);
+ struct disk_reservation res;
+ unsigned sectors;
+ int ret;
+
+ sectors = min(ca->mi.bucket_size -
+ (physical & (ca->mi.bucket_size - 1)),
+ length);
+
+ e = bkey_extent_init(&k.k);
+ e->k.p.inode = dst->inum;
+ e->k.p.offset = logical + sectors;
+ e->k.size = sectors;
+ extent_ptr_append(e, (struct bch_extent_ptr) {
+ .offset = physical,
+ .dev = 0,
+ .gen = ca->buckets[b].mark.gen,
+ });
+
+ ret = bch_disk_reservation_get(c, &res, sectors,
+ BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ die("error reserving space in new filesystem: %s",
+ strerror(-ret));
+
+ ret = bch_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
+ &res, NULL, NULL, 0);
+ if (ret)
+ die("btree insert error %s", strerror(-ret));
+
+ bch_disk_reservation_put(c, &res);
+
+ dst->i_sectors += sectors;
+ logical += sectors;
+ physical += sectors;
+ length -= sectors;
+ }
+}
+
+static void copy_link(struct cache_set *c, struct bch_inode_unpacked *dst,
+ char *src)
+{
+ ssize_t ret = readlink(src, buf, sizeof(buf));
+ if (ret < 0)
+ die("readlink error: %s", strerror(errno));
+
+ write_data(c, dst, 0, buf, round_up(ret, block_bytes(c)));
+}
+
+static void copy_file(struct cache_set *c, struct bch_inode_unpacked *dst,
+ int src, char *src_path, ranges *extents)
+{
+ struct fiemap_iter iter;
+ struct fiemap_extent e;
+
+ fiemap_for_each(src, iter, e)
+ if (e.fe_flags & FIEMAP_EXTENT_UNKNOWN) {
+ fsync(src);
+ break;
+ }
+
+ fiemap_for_each(src, iter, e) {
+ if ((e.fe_logical & (block_bytes(c) - 1)) ||
+ (e.fe_length & (block_bytes(c) - 1)))
+ die("Unaligned extent in %s - can't handle", src_path);
+
+ if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
+ FIEMAP_EXTENT_ENCODED|
+ FIEMAP_EXTENT_NOT_ALIGNED|
+ FIEMAP_EXTENT_DATA_INLINE)) {
+ copy_data(c, dst,
+ src,
+ round_down(e.fe_logical, block_bytes(c)),
+ round_up(e.fe_logical + e.fe_length,
+ block_bytes(c)));
+ continue;
+ }
+
+ if ((e.fe_physical & (block_bytes(c) - 1)))
+ die("Unaligned extent in %s - can't handle", src_path);
+
+ range_add(extents, e.fe_physical, e.fe_length);
+ link_data(c, dst, e.fe_logical, e.fe_physical, e.fe_length);
+ }
+}
+
+struct copy_fs_state {
+ u64 bcachefs_inum;
+ dev_t dev;
+
+ GENRADIX(u64) hardlinks;
+ ranges extents;
+};
+
+static void copy_dir(struct copy_fs_state *s,
+ struct cache_set *c,
+ struct bch_inode_unpacked *dst,
+ int src_fd, const char *src_path)
+{
+ DIR *dir = fdopendir(src_fd);
+ struct dirent *d;
+
+ while ((errno = 0), (d = readdir(dir))) {
+ struct bch_inode_unpacked inode;
+ int fd;
+
+ if (fchdir(src_fd))
+ die("chdir error: %s", strerror(errno));
+
+ struct stat stat =
+ xfstatat(src_fd, d->d_name, AT_SYMLINK_NOFOLLOW);
+
+ if (!strcmp(d->d_name, ".") ||
+ !strcmp(d->d_name, "..") ||
+ stat.st_ino == s->bcachefs_inum)
+ continue;
+
+ char *child_path = mprintf("%s/%s", src_path, d->d_name);
+
+ if (stat.st_dev != s->dev)
+ die("%s does not have correct st_dev!", child_path);
+
+ u64 *dst_inum = S_ISREG(stat.st_mode)
+ ? genradix_ptr_alloc(&s->hardlinks, stat.st_ino, GFP_KERNEL)
+ : NULL;
+
+ if (dst_inum && *dst_inum) {
+ create_link(c, dst, d->d_name, *dst_inum, S_IFREG);
+ goto next;
+ }
+
+ inode = create_file(c, dst, d->d_name,
+ stat.st_uid, stat.st_gid,
+ stat.st_mode, stat.st_rdev);
+
+ if (dst_inum)
+ *dst_inum = inode.inum;
+
+ copy_times(c, &inode, &stat);
+ copy_xattrs(c, &inode, d->d_name);
+
+ /* copy xattrs */
+
+ switch (mode_to_type(stat.st_mode)) {
+ case DT_DIR:
+ fd = xopen(d->d_name, O_RDONLY|O_NOATIME);
+ copy_dir(s, c, &inode, fd, child_path);
+ close(fd);
+ break;
+ case DT_REG:
+ inode.i_size = stat.st_size;
+
+ fd = xopen(d->d_name, O_RDONLY|O_NOATIME);
+ copy_file(c, &inode, fd, child_path, &s->extents);
+ close(fd);
+ break;
+ case DT_LNK:
+ inode.i_size = stat.st_size;
+
+ copy_link(c, &inode, d->d_name);
+ break;
+ case DT_FIFO:
+ case DT_CHR:
+ case DT_BLK:
+ case DT_SOCK:
+ case DT_WHT:
+ /* nothing else to copy for these: */
+ break;
+ default:
+ BUG();
+ }
+
+ update_inode(c, &inode);
+next:
+ free(child_path);
+ }
+
+ if (errno)
+ die("readdir error: %s", strerror(errno));
+}
+
+static ranges reserve_new_fs_space(const char *file_path, unsigned block_size,
+ u64 size, u64 *bcachefs_inum, dev_t dev)
+{
+ int fd = open(file_path, O_RDWR|O_CREAT|O_EXCL, 0600);
+ if (fd < 0)
+ die("Error creating %s for bcachefs metadata: %s",
+ file_path, strerror(errno));
+
+ struct stat statbuf = xfstat(fd);
+
+ if (statbuf.st_dev != dev)
+ die("bcachefs file has incorrect device");
+
+ *bcachefs_inum = statbuf.st_ino;
+
+ if (fallocate(fd, 0, 0, size))
+ die("Error reserving space for bcachefs metadata: %s",
+ strerror(errno));
+
+ fsync(fd);
+
+ struct fiemap_iter iter;
+ struct fiemap_extent e;
+ ranges extents = { NULL };
+
+ fiemap_for_each(fd, iter, e) {
+ if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
+ FIEMAP_EXTENT_ENCODED|
+ FIEMAP_EXTENT_NOT_ALIGNED|
+ FIEMAP_EXTENT_DATA_INLINE))
+ die("Unable to continue: metadata file not fully mapped");
+
+ if ((e.fe_physical & (block_size - 1)) ||
+ (e.fe_length & (block_size - 1)))
+ die("Unable to continue: unaligned extents in metadata file");
+
+ range_add(&extents, e.fe_physical, e.fe_length);
+ }
+ close(fd);
+
+ ranges_sort_merge(&extents);
+ return extents;
+}
+
+static void reserve_old_fs_space(struct cache_set *c,
+ struct bch_inode_unpacked *root_inode,
+ ranges *extents)
+{
+ struct cache *ca = c->cache[0];
+ struct bch_inode_unpacked dst;
+ struct hole_iter iter;
+ struct range i;
+
+ dst = create_file(c, root_inode, "old_migrated_filesystem",
+ 0, 0, S_IFREG|0400, 0);
+ dst.i_size = bucket_to_sector(ca, ca->mi.nbuckets) << 9;
+
+ ranges_sort_merge(extents);
+
+ for_each_hole(iter, *extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i)
+ link_data(c, &dst, i.start, i.start, i.end - i.start);
+
+ update_inode(c, &dst);
+}
+
+static void copy_fs(struct cache_set *c, int src_fd, const char *src_path,
+ u64 bcachefs_inum, ranges *extents)
+{
+ syncfs(src_fd);
+
+ struct bch_inode_unpacked root_inode;
+ int ret = bch_inode_find_by_inum(c, BCACHE_ROOT_INO, &root_inode);
+ if (ret)
+ die("error looking up root directory: %s", strerror(-ret));
+
+ if (fchdir(src_fd))
+ die("chdir error: %s", strerror(errno));
+
+ struct stat stat = xfstat(src_fd);
+ copy_times(c, &root_inode, &stat);
+ copy_xattrs(c, &root_inode, ".");
+
+ struct copy_fs_state s = {
+ .bcachefs_inum = bcachefs_inum,
+ .dev = stat.st_dev,
+ .extents = *extents,
+ };
+
+ /* now, copy: */
+ copy_dir(&s, c, &root_inode, src_fd, src_path);
+
+ reserve_old_fs_space(c, &root_inode, &s.extents);
+
+ update_inode(c, &root_inode);
+
+ darray_free(s.extents);
+ genradix_free(&s.hardlinks);
+}
+
+static void find_superblock_space(ranges extents, struct dev_opts *dev)
+{
+ struct range *i;
+ darray_foreach(i, extents) {
+ u64 offset = max(256ULL << 10, i->start);
+
+ if (offset + (128 << 10) <= i->end) {
+ dev->sb_offset = offset >> 9;
+ dev->sb_end = dev->sb_offset + 256;
+ return;
+ }
+ }
+
+ die("Couldn't find a valid location for superblock");
+}
+
+static void migrate_usage(void)
+{
+ puts("bcache migrate - migrate an existing filesystem to bcachefs\n"
+ "Usage: bcache migrate [OPTION]...\n"
+ "\n"
+ "Options:\n"
+ " -f fs Root of filesystem to migrate(s)\n"
+ " --encrypted Enable whole filesystem encryption (chacha20/poly1305)\n"
+ " --no_passphrase Don't encrypt master encryption key\n"
+ " -h Display this help and exit\n"
+ "Report bugs to <linux-bcache@vger.kernel.org>");
+}
+
+static const struct option migrate_opts[] = {
+ { "encrypted", no_argument, NULL, 'e' },
+ { "no_passphrase", no_argument, NULL, 'p' },
+ { NULL }
+};
+
+int cmd_migrate(int argc, char *argv[])
+{
+ struct format_opts format_opts = format_opts_default();
+ char *fs_path = NULL;
+ unsigned block_size;
+ bool no_passphrase = false;
+ int opt;
+
+ while ((opt = getopt_long(argc, argv, "f:h",
+ migrate_opts, NULL)) != -1)
+ switch (opt) {
+ case 'f':
+ fs_path = optarg;
+ break;
+ case 'e':
+ format_opts.encrypted = true;
+ break;
+ case 'p':
+ no_passphrase = true;
+ break;
+ case 'h':
+ migrate_usage();
+ exit(EXIT_SUCCESS);
+ }
+
+ if (!fs_path)
+ die("Please specify a filesytem to migrate");
+
+ if (!path_is_fs_root(fs_path))
+ die("%s is not a filysestem root", fs_path);
+
+ int fs_fd = xopen(fs_path, O_RDONLY|O_NOATIME);
+ struct stat stat = xfstat(fs_fd);
+
+ if (!S_ISDIR(stat.st_mode))
+ die("%s is not a directory", fs_path);
+
+ struct dev_opts dev = { 0 };
+
+ dev.path = dev_t_to_path(stat.st_dev);
+ dev.fd = xopen(dev.path, O_RDWR);
+
+ block_size = min_t(unsigned, stat.st_blksize,
+ get_blocksize(dev.path, dev.fd) << 9);
+
+ BUG_ON(!is_power_of_2(block_size) || block_size < 512);
+ format_opts.block_size = block_size >> 9;
+
+ u64 bcachefs_inum;
+ char *file_path = mprintf("%s/bcachefs", fs_path);
+
+ ranges extents = reserve_new_fs_space(file_path,
+ block_size, get_size(dev.path, dev.fd) / 5,
+ &bcachefs_inum, stat.st_dev);
+
+ find_superblock_space(extents, &dev);
+
+ if (format_opts.encrypted && !no_passphrase) {
+ format_opts.passphrase = read_passphrase("Enter passphrase: ");
+
+ if (isatty(STDIN_FILENO)) {
+ char *pass2 =
+ read_passphrase("Enter same passphrase again: ");
+
+ if (strcmp(format_opts.passphrase, pass2)) {
+ memzero_explicit(format_opts.passphrase,
+ strlen(format_opts.passphrase));
+ memzero_explicit(pass2, strlen(pass2));
+ die("Passphrases do not match");
+ }
+
+ memzero_explicit(pass2, strlen(pass2));
+ free(pass2);
+ }
+ }
+
+ struct bch_sb *sb = bcache_format(format_opts, &dev, 1);
+ u64 sb_offset = le64_to_cpu(sb->layout.sb_offset[0]);
+
+ if (format_opts.passphrase)
+ add_bcache_key(sb, format_opts.passphrase);
+
+ free(sb);
+
+ printf("Creating new filesystem on %s in space reserved at %s\n"
+ "To mount, run\n"
+ " mount -t bcache -o sb=%llu %s dir\n"
+ "\n"
+ "After verifying that the new filesystem is correct, to create a\n"
+ "superblock at the default offset and finish the migration run\n"
+ " bcache migrate_superblock -d %s -o %llu\n"
+ "\n"
+ "The new filesystem will have a file at /old_migrated_filestem\n"
+ "referencing all disk space that might be used by the existing\n"
+ "filesystem. That file can be deleted once the old filesystem is\n"
+ "no longer needed (and should be deleted prior to running\n"
+ "bcache migrate_superblock)\n",
+ dev.path, file_path, sb_offset, dev.path,
+ dev.path, sb_offset);
+
+ struct bch_opts opts = bch_opts_empty();
+ struct cache_set *c = NULL;
+ char *path[1] = { dev.path };
+ const char *err;
+
+ opts.sb = sb_offset;
+ opts.nostart = true;
+ opts.noexcl = true;
+
+ err = bch_fs_open(path, 1, opts, &c);
+ if (err)
+ die("Error opening new filesystem: %s", err);
+
+ mark_unreserved_space(c, extents);
+
+ err = bch_fs_start(c);
+ if (err)
+ die("Error starting new filesystem: %s", err);
+
+ copy_fs(c, fs_fd, fs_path, bcachefs_inum, &extents);
+
+ bch_fs_stop(c);
+
+ printf("Migrate complete, running fsck:\n");
+ opts.nostart = false;
+ opts.nochanges = true;
+ fsck_err_opt = FSCK_ERR_NO;
+
+ err = bch_fs_open(path, 1, opts, &c);
+ if (err)
+ die("Error opening new filesystem: %s", err);
+
+ bch_fs_stop(c);
+ printf("fsck complete\n");
+ return 0;
+}
+
+static void migrate_superblock_usage(void)
+{
+ puts("bcache migrate_superblock - create default superblock after migrating\n"
+ "Usage: bcache migrate_superblock [OPTION]...\n"
+ "\n"
+ "Options:\n"
+ " -d device Device to create superblock for\n"
+ " -o offset Offset of existing superblock\n"
+ " -h Display this help and exit\n"
+ "Report bugs to <linux-bcache@vger.kernel.org>");
+}
+
+int cmd_migrate_superblock(int argc, char *argv[])
+{
+ char *dev = NULL;
+ u64 offset = 0;
+ int opt, ret;
+
+ while ((opt = getopt(argc, argv, "d:o:h")) != -1)
+ switch (opt) {
+ case 'd':
+ dev = optarg;
+ break;
+ case 'o':
+ ret = kstrtou64(optarg, 10, &offset);
+ if (ret)
+ die("Invalid offset");
+ break;
+ case 'h':
+ migrate_superblock_usage();
+ exit(EXIT_SUCCESS);
+ }
+
+ if (!dev)
+ die("Please specify a device");
+
+ if (!offset)
+ die("Please specify offset of existing superblock");
+
+ int fd = xopen(dev, O_RDWR);
+ struct bch_sb *sb = __bcache_super_read(fd, offset);
+
+ if (sb->layout.nr_superblocks >= ARRAY_SIZE(sb->layout.sb_offset))
+ die("Can't add superblock: no space left in superblock layout");
+
+ for (unsigned i = 0; i < sb->layout.nr_superblocks; i++)
+ if (le64_to_cpu(sb->layout.sb_offset[i]) == BCH_SB_SECTOR)
+ die("Superblock layout already has default superblock");
+
+ memmove(&sb->layout.sb_offset[1],
+ &sb->layout.sb_offset[0],
+ sb->layout.nr_superblocks * sizeof(u64));
+ sb->layout.nr_superblocks++;
+
+ sb->layout.sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR);
+
+ bcache_super_write(fd, sb);
+ close(fd);
+
+ return 0;
+}
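
copy_file() above decides, extent by extent, whether a source file's data can be referenced in place (link_data) or has to be rewritten (copy_data), based on the flags and alignment reported by FIEMAP. The fiemap_for_each()/fiemap_iter helpers come from tools-util; a rough sketch of the raw ioctl they are assumed to wrap, with a single fixed-size query and no continuation handling:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>

static void print_extents(int fd)
{
        struct fiemap *fm = calloc(1, sizeof(*fm) +
                                   32 * sizeof(struct fiemap_extent));

        fm->fm_length           = ~0ULL;        /* map the whole file */
        fm->fm_extent_count     = 32;

        if (ioctl(fd, FS_IOC_FIEMAP, fm))
                perror("FS_IOC_FIEMAP");
        else
                for (unsigned i = 0; i < fm->fm_mapped_extents; i++)
                        printf("logical %llu physical %llu len %llu flags %x\n",
                               (unsigned long long) fm->fm_extents[i].fe_logical,
                               (unsigned long long) fm->fm_extents[i].fe_physical,
                               (unsigned long long) fm->fm_extents[i].fe_length,
                               (unsigned) fm->fm_extents[i].fe_flags);

        free(fm);
}

Extents flagged UNKNOWN, ENCODED, NOT_ALIGNED or DATA_INLINE cannot be referenced by physical offset, which is why copy_file() falls back to reading and rewriting those ranges.
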
diff --git a/cmd_run.c b/cmd_run.c
index 74f3248..6fb1c4f 100644
--- a/cmd_run.c
+++ b/cmd_run.c
@@ -25,9 +25,6 @@ int cmd_stop(int argc, char *argv[])
die("Please supply a filesystem");
struct bcache_handle fs = bcache_fs_open(argv[1]);
-
- if (ioctl(fs.ioctl_fd, BCH_IOCTL_STOP))
- die("BCH_IOCTL_STOP error: %s", strerror(errno));
-
+ xioctl(fs.ioctl_fd, BCH_IOCTL_STOP);
return 0;
}
diff --git a/cmds.h b/cmds.h
index 946acfd..120e83f 100644
--- a/cmds.h
+++ b/cmds.h
@@ -29,4 +29,7 @@ int cmd_fsck(int argc, char *argv[]);
int cmd_dump(int argc, char *argv[]);
int cmd_list(int argc, char *argv[]);
+int cmd_migrate(int argc, char *argv[]);
+int cmd_migrate_superblock(int argc, char *argv[]);
+
#endif /* _CMDS_H */
diff --git a/crypto.c b/crypto.c
index 86da70a..f38a359 100644
--- a/crypto.c
+++ b/crypto.c
@@ -10,8 +10,10 @@
#include <time.h>
#include <unistd.h>
+#include <keyutils.h>
#include <linux/random.h>
#include <libscrypt.h>
+#include <uuid/uuid.h>
#include "checksum.h"
#include "crypto.h"
@@ -75,29 +77,71 @@ void derive_passphrase(struct bch_sb_field_crypt *crypt,
}
}
-void bch_sb_crypt_init(struct bch_sb *sb,
- struct bch_sb_field_crypt *crypt,
- const char *passphrase)
+void add_bcache_key(struct bch_sb *sb, const char *passphrase)
{
- struct bch_key passphrase_key;
+ struct bch_sb_field_crypt *crypt = bch_sb_get_crypt(sb);
+ if (!crypt)
+ die("filesystem is not encrypted");
- SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT);
- SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N));
- SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r));
- SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p));
+ struct bch_encrypted_key sb_key = crypt->key;
+ if (!bch_key_is_encrypted(&sb_key))
+ die("filesystem does not have encryption key");
+ struct bch_key passphrase_key;
derive_passphrase(crypt, &passphrase_key, passphrase);
+ /* Check if the user supplied the correct passphrase: */
+ if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
+ &sb_key, sizeof(sb_key)))
+ die("error encrypting key");
+
+ if (bch_key_is_encrypted(&sb_key))
+ die("incorrect passphrase");
+
+ char uuid[40];
+ uuid_unparse_lower(sb->user_uuid.b, uuid);
+
+ char *description = mprintf("bcache:%s", uuid);
+
+ if (add_key("logon", description,
+ &passphrase_key, sizeof(passphrase_key),
+ KEY_SPEC_USER_KEYRING) < 0 ||
+ add_key("user", description,
+ &passphrase_key, sizeof(passphrase_key),
+ KEY_SPEC_USER_KEYRING) < 0)
+ die("add_key error: %s", strerror(errno));
+
+ memzero_explicit(description, strlen(description));
+ free(description);
+ memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+ memzero_explicit(&sb_key, sizeof(sb_key));
+}
+
+void bch_sb_crypt_init(struct bch_sb *sb,
+ struct bch_sb_field_crypt *crypt,
+ const char *passphrase)
+{
crypt->key.magic = BCH_KEY_MAGIC;
get_random_bytes(&crypt->key.key, sizeof(crypt->key.key));
- assert(!bch_key_is_encrypted(&crypt->key));
+ if (passphrase) {
+ struct bch_key passphrase_key;
- if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
- &crypt->key, sizeof(crypt->key)))
- die("error encrypting key");
+ SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT);
+ SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N));
+ SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r));
+ SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p));
- assert(bch_key_is_encrypted(&crypt->key));
+ derive_passphrase(crypt, &passphrase_key, passphrase);
- memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+ assert(!bch_key_is_encrypted(&crypt->key));
+
+ if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
+ &crypt->key, sizeof(crypt->key)))
+ die("error encrypting key");
+
+ assert(bch_key_is_encrypted(&crypt->key));
+
+ memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+ }
}
diff --git a/crypto.h b/crypto.h
index 643073e..534f9c2 100644
--- a/crypto.h
+++ b/crypto.h
@@ -7,6 +7,7 @@
char *read_passphrase(const char *);
void derive_passphrase(struct bch_sb_field_crypt *,
struct bch_key *, const char *);
+void add_bcache_key(struct bch_sb *, const char *);
void bch_sb_crypt_init(struct bch_sb *sb, struct bch_sb_field_crypt *,
const char *);
diff --git a/include/linux/bcache.h b/include/linux/bcache.h
index dbb0274..d70e2e3 100644
--- a/include/linux/bcache.h
+++ b/include/linux/bcache.h
@@ -821,7 +821,7 @@ struct bch_sb_field {
__le32 type;
};
-enum bch_sb_field_types {
+enum bch_sb_field_type {
BCH_SB_FIELD_journal = 0,
BCH_SB_FIELD_members = 1,
BCH_SB_FIELD_crypt = 2,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3c18594..217ff09 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -110,6 +110,7 @@ struct super_block {
* NOTE! These match bits 12..15 of stat.st_mode
* (ie "(i_mode >> 12) & 15").
*/
+#ifndef DT_UNKNOWN
#define DT_UNKNOWN 0
#define DT_FIFO 1
#define DT_CHR 2
@@ -119,6 +120,7 @@ struct super_block {
#define DT_LNK 10
#define DT_SOCK 12
#define DT_WHT 14
+#endif
/*
* This is the "filldir" function type, used by readdir() to let
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 1a951e9..6ea2deb 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -8,7 +8,6 @@
* interior nodes.
*/
-#include <linux/page.h>
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/log2.h>
@@ -41,20 +40,14 @@ struct __genradix {
* genradix.
*/
-#define DECLARE_GENRADIX_TYPE(_name, _type) \
-struct _name { \
- struct __genradix tree; \
- _type type[0] __aligned(1); \
-}
-
-#define DECLARE_GENRADIX(_name, _type) \
+#define GENRADIX(_type) \
struct { \
struct __genradix tree; \
_type type[0] __aligned(1); \
-} _name
+}
#define DEFINE_GENRADIX(_name, _type) \
- DECLARE_GENRADIX(_name, _type) = __GENRADIX_INITIALIZER
+ GENRADIX(_type) _name = __GENRADIX_INITIALIZER
#define genradix_init(_radix) \
do { \
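
DECLARE_GENRADIX_TYPE()/DECLARE_GENRADIX() are folded into a single GENRADIX() macro that produces an anonymous struct type, so a radix tree can be embedded directly in another struct; this is what lets cmd_migrate.c declare GENRADIX(u64) hardlinks inside struct copy_fs_state. A small usage sketch, assuming (as cmd_migrate.c does) that a zero-initialized tree is valid and that genradix_ptr_alloc() returns NULL on allocation failure:

#include <errno.h>
#include <linux/generic-radix-tree.h>

struct hardlink_map {
        GENRADIX(u64)   map;    /* source st_ino -> destination inum */
};

static int remember_hardlink(struct hardlink_map *h, u64 src_ino, u64 dst_inum)
{
        u64 *p = genradix_ptr_alloc(&h->map, src_ino, GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        *p = dst_inum;
        return 0;
}

/* a zero-initialized map is valid; release it with genradix_free(&h->map) */
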
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 5a98618..2bbd097 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -180,4 +180,9 @@ static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
return !atomic_long_read(&ref->count);
}
+static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
+{
+ return percpu_ref_is_zero(ref);
+}
+
#endif /* __TOOLS_LINUX_PERCPU_REFCOUNT_H */
diff --git a/libbcache.c b/libbcache.c
index 6908ead..0cfafbb 100644
--- a/libbcache.c
+++ b/libbcache.c
@@ -23,66 +23,82 @@
#define BCH_MIN_NR_NBUCKETS (1 << 10)
-/* first bucket should start 1 mb in, in sectors: */
-#define FIRST_BUCKET_OFFSET (1 << 11)
-
/* minimum size filesystem we can create, given a bucket size: */
static u64 min_size(unsigned bucket_size)
{
- return (DIV_ROUND_UP(FIRST_BUCKET_OFFSET, bucket_size) +
- BCH_MIN_NR_NBUCKETS) * bucket_size;
+ return BCH_MIN_NR_NBUCKETS * bucket_size;
}
-static void init_layout(struct bch_sb_layout *l)
+static void init_layout(struct bch_sb_layout *l, unsigned block_size,
+ u64 start, u64 end)
{
+ unsigned sb_size;
+ u64 backup; /* offset of 2nd sb */
+
memset(l, 0, sizeof(*l));
+ if (start != BCH_SB_SECTOR)
+ start = round_up(start, block_size);
+ end = round_down(end, block_size);
+
+ if (start >= end)
+ die("insufficient space for superblocks");
+
+ /*
+ * Create two superblocks in the allowed range: reserve a maximum of 64k
+ */
+ sb_size = min_t(u64, 128, end - start / 2);
+
+ backup = start + sb_size;
+ backup = round_up(backup, block_size);
+
+ backup = min(backup, end);
+
+ sb_size = min(end - backup, backup- start);
+ sb_size = rounddown_pow_of_two(sb_size);
+
+ if (sb_size < 8)
+ die("insufficient space for superblocks");
+
l->magic = BCACHE_MAGIC;
l->layout_type = 0;
l->nr_superblocks = 2;
- l->sb_max_size_bits = 7;
- l->sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR);
- l->sb_offset[1] = cpu_to_le64(BCH_SB_SECTOR +
- (1 << l->sb_max_size_bits));
+ l->sb_max_size_bits = ilog2(sb_size);
+ l->sb_offset[0] = cpu_to_le64(start);
+ l->sb_offset[1] = cpu_to_le64(backup);
}
-void bcache_format(struct dev_opts *devs, size_t nr_devs,
- unsigned block_size,
- unsigned btree_node_size,
- unsigned meta_csum_type,
- unsigned data_csum_type,
- unsigned compression_type,
- const char *passphrase,
- unsigned meta_replicas,
- unsigned data_replicas,
- unsigned on_error_action,
- unsigned max_journal_entry_size,
- char *label,
- uuid_le uuid)
+struct bch_sb *bcache_format(struct format_opts opts,
+ struct dev_opts *devs, size_t nr_devs)
{
struct bch_sb *sb;
struct dev_opts *i;
struct bch_sb_field_members *mi;
- unsigned u64s, j;
+ unsigned u64s;
/* calculate block size: */
- if (!block_size)
+ if (!opts.block_size)
for (i = devs; i < devs + nr_devs; i++)
- block_size = max(block_size,
- get_blocksize(i->path, i->fd));
+ opts.block_size = max(opts.block_size,
+ get_blocksize(i->path, i->fd));
/* calculate bucket sizes: */
for (i = devs; i < devs + nr_devs; i++) {
+ if (!i->sb_offset) {
+ i->sb_offset = BCH_SB_SECTOR;
+ i->sb_end = BCH_SB_SECTOR + 256;
+ }
+
if (!i->size)
i->size = get_size(i->path, i->fd) >> 9;
if (!i->bucket_size) {
- if (i->size < min_size(block_size))
+ if (i->size < min_size(opts.block_size))
die("cannot format %s, too small (%llu sectors, min %llu)",
- i->path, i->size, min_size(block_size));
+ i->path, i->size, min_size(opts.block_size));
/* Want a bucket size of at least 128k, if possible: */
- i->bucket_size = max(block_size, 256U);
+ i->bucket_size = max(opts.block_size, 256U);
if (i->size >= min_size(i->bucket_size)) {
unsigned scale = max(1,
@@ -99,34 +115,36 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
}
}
- /* first bucket: 1 mb in */
- i->first_bucket = DIV_ROUND_UP(FIRST_BUCKET_OFFSET, i->bucket_size);
i->nbuckets = i->size / i->bucket_size;
- if (i->bucket_size < block_size)
+ if (i->bucket_size < opts.block_size)
die("Bucket size cannot be smaller than block size");
- if (i->nbuckets - i->first_bucket < BCH_MIN_NR_NBUCKETS)
+ if (i->nbuckets < BCH_MIN_NR_NBUCKETS)
die("Not enough buckets: %llu, need %u (bucket size %u)",
- i->nbuckets - i->first_bucket, BCH_MIN_NR_NBUCKETS,
- i->bucket_size);
+ i->nbuckets, BCH_MIN_NR_NBUCKETS, i->bucket_size);
}
/* calculate btree node size: */
- if (!btree_node_size) {
+ if (!opts.btree_node_size) {
/* 256k default btree node size */
- btree_node_size = 512;
+ opts.btree_node_size = 512;
for (i = devs; i < devs + nr_devs; i++)
- btree_node_size = min(btree_node_size, i->bucket_size);
+ opts.btree_node_size =
+ min(opts.btree_node_size, i->bucket_size);
}
- if (!max_journal_entry_size) {
+ if (!opts.max_journal_entry_size) {
/* 2 MB default: */
- max_journal_entry_size = 4096;
+ opts.max_journal_entry_size = 4096;
}
- max_journal_entry_size = roundup_pow_of_two(max_journal_entry_size);
+ opts.max_journal_entry_size =
+ roundup_pow_of_two(opts.max_journal_entry_size);
+
+ if (uuid_is_null(opts.uuid.b))
+ uuid_generate(opts.uuid.b);
sb = calloc(1, sizeof(*sb) +
sizeof(struct bch_sb_field_members) +
@@ -135,35 +153,29 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
sb->version = cpu_to_le64(BCACHE_SB_VERSION_CDEV_V4);
sb->magic = BCACHE_MAGIC;
- sb->block_size = cpu_to_le16(block_size);
- sb->user_uuid = uuid;
+ sb->block_size = cpu_to_le16(opts.block_size);
+ sb->user_uuid = opts.uuid;
sb->nr_devices = nr_devs;
- init_layout(&sb->layout);
-
uuid_generate(sb->uuid.b);
- if (label)
- strncpy((char *) sb->label, label, sizeof(sb->label));
+ if (opts.label)
+ strncpy((char *) sb->label, opts.label, sizeof(sb->label));
- /*
- * don't have a userspace crc32c implementation handy, just always use
- * crc64
- */
- SET_BCH_SB_CSUM_TYPE(sb, BCH_CSUM_CRC64);
- SET_BCH_SB_META_CSUM_TYPE(sb, meta_csum_type);
- SET_BCH_SB_DATA_CSUM_TYPE(sb, data_csum_type);
- SET_BCH_SB_COMPRESSION_TYPE(sb, compression_type);
+ SET_BCH_SB_CSUM_TYPE(sb, opts.meta_csum_type);
+ SET_BCH_SB_META_CSUM_TYPE(sb, opts.meta_csum_type);
+ SET_BCH_SB_DATA_CSUM_TYPE(sb, opts.data_csum_type);
+ SET_BCH_SB_COMPRESSION_TYPE(sb, opts.compression_type);
- SET_BCH_SB_BTREE_NODE_SIZE(sb, btree_node_size);
+ SET_BCH_SB_BTREE_NODE_SIZE(sb, opts.btree_node_size);
SET_BCH_SB_GC_RESERVE(sb, 8);
- SET_BCH_SB_META_REPLICAS_WANT(sb, meta_replicas);
- SET_BCH_SB_META_REPLICAS_HAVE(sb, meta_replicas);
- SET_BCH_SB_DATA_REPLICAS_WANT(sb, data_replicas);
- SET_BCH_SB_DATA_REPLICAS_HAVE(sb, data_replicas);
- SET_BCH_SB_ERROR_ACTION(sb, on_error_action);
+ SET_BCH_SB_META_REPLICAS_WANT(sb, opts.meta_replicas);
+ SET_BCH_SB_META_REPLICAS_HAVE(sb, opts.meta_replicas);
+ SET_BCH_SB_DATA_REPLICAS_WANT(sb, opts.data_replicas);
+ SET_BCH_SB_DATA_REPLICAS_HAVE(sb, opts.data_replicas);
+ SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action);
SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH);
- SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(max_journal_entry_size));
+ SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(opts.max_journal_entry_size));
struct timespec now;
if (clock_gettime(CLOCK_REALTIME, &now))
@@ -172,7 +184,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
sb->time_precision = cpu_to_le32(1);
- if (passphrase) {
+ if (opts.encrypted) {
struct bch_sb_field_crypt *crypt = vstruct_end(sb);
u64s = sizeof(struct bch_sb_field_crypt) / sizeof(u64);
@@ -181,7 +193,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
crypt->field.u64s = cpu_to_le32(u64s);
crypt->field.type = BCH_SB_FIELD_crypt;
- bch_sb_crypt_init(sb, crypt, passphrase);
+ bch_sb_crypt_init(sb, crypt, opts.passphrase);
SET_BCH_SB_ENCRYPTION_TYPE(sb, 1);
}
@@ -198,7 +210,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
uuid_generate(m->uuid.b);
m->nbuckets = cpu_to_le64(i->nbuckets);
- m->first_bucket = cpu_to_le16(i->first_bucket);
+ m->first_bucket = 0;
m->bucket_size = cpu_to_le16(i->bucket_size);
SET_BCH_MEMBER_TIER(m, i->tier);
@@ -209,42 +221,49 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
for (i = devs; i < devs + nr_devs; i++) {
sb->dev_idx = i - devs;
- static const char zeroes[BCH_SB_SECTOR << 9];
- struct nonce nonce = { 0 };
+ init_layout(&sb->layout, opts.block_size,
+ i->sb_offset, i->sb_end);
- /* Zero start of disk */
- xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
+ if (i->sb_offset == BCH_SB_SECTOR) {
+ /* Zero start of disk */
+ static const char zeroes[BCH_SB_SECTOR << 9];
- xpwrite(i->fd, &sb->layout, sizeof(sb->layout),
- BCH_SB_LAYOUT_SECTOR << 9);
-
- for (j = 0; j < sb->layout.nr_superblocks; j++) {
- sb->offset = sb->layout.sb_offset[j];
-
- sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb),
- nonce, sb);
- xpwrite(i->fd, sb, vstruct_bytes(sb),
- le64_to_cpu(sb->offset) << 9);
+ xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
}
- fsync(i->fd);
+ bcache_super_write(i->fd, sb);
close(i->fd);
}
- bcache_super_print(sb, HUMAN_READABLE);
+ return sb;
+}
+
+void bcache_super_write(int fd, struct bch_sb *sb)
+{
+ struct nonce nonce = { 0 };
+
+ for (unsigned i = 0; i < sb->layout.nr_superblocks; i++) {
+ sb->offset = sb->layout.sb_offset[i];
+
+ if (sb->offset == BCH_SB_SECTOR) {
+ /* Write backup layout */
+ xpwrite(fd, &sb->layout, sizeof(sb->layout),
+ BCH_SB_LAYOUT_SECTOR << 9);
+ }
+
+ sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), nonce, sb);
+ xpwrite(fd, sb, vstruct_bytes(sb),
+ le64_to_cpu(sb->offset) << 9);
+ }
- free(sb);
+ fsync(fd);
}
-struct bch_sb *bcache_super_read(const char *path)
+struct bch_sb *__bcache_super_read(int fd, u64 sector)
{
struct bch_sb sb, *ret;
- int fd = open(path, O_RDONLY);
- if (fd < 0)
- die("couldn't open %s", path);
-
- xpread(fd, &sb, sizeof(sb), BCH_SB_SECTOR << 9);
+ xpread(fd, &sb, sizeof(sb), sector << 9);
if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)))
die("not a bcache superblock");
@@ -253,11 +272,19 @@ struct bch_sb *bcache_super_read(const char *path)
ret = malloc(bytes);
- xpread(fd, ret, bytes, BCH_SB_SECTOR << 9);
+ xpread(fd, ret, bytes, sector << 9);
return ret;
}
+struct bch_sb *bcache_super_read(const char *path)
+{
+ int fd = xopen(path, O_RDONLY);
+ struct bch_sb *sb = __bcache_super_read(fd, BCH_SB_SECTOR);
+ close(fd);
+ return sb;
+}
+
void bcache_super_print(struct bch_sb *sb, int units)
{
struct bch_sb_field_members *mi;
diff --git a/libbcache.h b/libbcache.h
index 6ec3f42..779b470 100644
--- a/libbcache.h
+++ b/libbcache.h
@@ -1,6 +1,7 @@
#ifndef _LIBBCACHE_H
#define _LIBBCACHE_H
+#include <linux/bcache.h>
#include <linux/uuid.h>
#include "tools-util.h"
#include "vstructs.h"
@@ -18,32 +19,56 @@ enum fsck_err_opts {
extern enum fsck_err_opts fsck_err_opt;
+struct format_opts {
+ char *label;
+ uuid_le uuid;
+
+ unsigned on_error_action;
+ unsigned max_journal_entry_size; /* will be removed */
+
+ unsigned block_size;
+ unsigned btree_node_size;
+
+ unsigned meta_replicas;
+ unsigned data_replicas;
+
+ unsigned meta_csum_type;
+ unsigned data_csum_type;
+ unsigned compression_type;
+
+ bool encrypted;
+ char *passphrase;
+};
+
+static inline struct format_opts format_opts_default()
+{
+ return (struct format_opts) {
+ .on_error_action = BCH_ON_ERROR_RO,
+ .meta_csum_type = BCH_CSUM_CRC32C,
+ .data_csum_type = BCH_CSUM_CRC32C,
+ .meta_replicas = 1,
+ .data_replicas = 1,
+ };
+}
+
struct dev_opts {
int fd;
- const char *path;
+ char *path;
u64 size; /* 512 byte sectors */
unsigned bucket_size;
unsigned tier;
bool discard;
- u64 first_bucket;
u64 nbuckets;
+
+ u64 sb_offset;
+ u64 sb_end;
};
-void bcache_format(struct dev_opts *devs, size_t nr_devs,
- unsigned block_size,
- unsigned btree_node_size,
- unsigned meta_csum_type,
- unsigned data_csum_type,
- unsigned compression_type,
- const char *passphrase,
- unsigned meta_replicas,
- unsigned data_replicas,
- unsigned on_error_action,
- unsigned max_journal_entry_size,
- char *label,
- uuid_le uuid);
+struct bch_sb *bcache_format(struct format_opts, struct dev_opts *, size_t);
+void bcache_super_write(int, struct bch_sb *);
+struct bch_sb *__bcache_super_read(int, u64);
struct bch_sb *bcache_super_read(const char *);
void bcache_super_print(struct bch_sb *, int);
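With the refactored interface, formatting goes through struct format_opts plus an array of struct dev_opts instead of a long argument list. A rough caller sketch follows; the device-geometry fields (size, bucket_size, nbuckets, sb_offset/sb_end) are omitted here and are normally derived by cmd_format.c, the label and device path are made up, and uuid_generate() is libuuid:

#include <fcntl.h>
#include <stdlib.h>
#include <uuid/uuid.h>
#include "libbcache.h"

int format_one(const char *path)
{
	struct format_opts opts = format_opts_default();
	struct dev_opts dev = { 0 };
	struct bch_sb *sb;

	opts.label = "scratch";			/* hypothetical label */
	uuid_generate(opts.uuid.b);

	dev.path = (char *) path;
	dev.fd   = open(path, O_RDWR);
	if (dev.fd < 0)
		return -1;
	/* dev.size, dev.bucket_size, dev.nbuckets, dev.sb_offset/sb_end are
	 * normally filled in from the device by cmd_format.c; omitted here */

	sb = bcache_format(opts, &dev, 1);	/* returns the sb it wrote */
	bcache_super_print(sb, HUMAN_READABLE);
	free(sb);
	return 0;
}
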
diff --git a/libbcache/alloc.c b/libbcache/alloc.c
index 8cb3194..93f0c2f 100644
--- a/libbcache/alloc.c
+++ b/libbcache/alloc.c
@@ -73,7 +73,6 @@
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>
-static size_t bch_bucket_alloc(struct cache *, enum alloc_reserve);
static void __bch_bucket_free(struct cache *, struct bucket *);
/* Allocation groups: */
@@ -84,12 +83,12 @@ void bch_dev_group_remove(struct cache_group *grp, struct cache *ca)
spin_lock(&grp->lock);
- for (i = 0; i < grp->nr_devices; i++)
+ for (i = 0; i < grp->nr; i++)
if (rcu_access_pointer(grp->d[i].dev) == ca) {
- grp->nr_devices--;
+ grp->nr--;
memmove(&grp->d[i],
&grp->d[i + 1],
- (grp->nr_devices - i) * sizeof(grp->d[0]));
+ (grp->nr - i) * sizeof(grp->d[0]));
break;
}
@@ -101,13 +100,13 @@ void bch_dev_group_add(struct cache_group *grp, struct cache *ca)
unsigned i;
spin_lock(&grp->lock);
- for (i = 0; i < grp->nr_devices; i++)
+ for (i = 0; i < grp->nr; i++)
if (rcu_access_pointer(grp->d[i].dev) == ca)
goto out;
- BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX);
+ BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
- rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca);
+ rcu_assign_pointer(grp->d[grp->nr++].dev, ca);
out:
spin_unlock(&grp->lock);
}
@@ -120,25 +119,32 @@ static void pd_controllers_update(struct work_struct *work)
struct cache_set,
pd_controllers_update);
struct cache *ca;
- unsigned iter;
- int i;
+ unsigned i, iter;
/* All units are in bytes */
- u64 tier_size[BCH_TIER_MAX];
- u64 tier_free[BCH_TIER_MAX];
- u64 tier_dirty[BCH_TIER_MAX];
- u64 tier0_can_free = 0;
+ u64 faster_tiers_size = 0;
+ u64 faster_tiers_dirty = 0;
- memset(tier_size, 0, sizeof(tier_size));
- memset(tier_free, 0, sizeof(tier_free));
- memset(tier_dirty, 0, sizeof(tier_dirty));
+ u64 fastest_tier_size = 0;
+ u64 fastest_tier_free = 0;
+ u64 copygc_can_free = 0;
rcu_read_lock();
- for (i = BCH_TIER_MAX - 1; i >= 0; --i)
- group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
+ for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
+ bch_pd_controller_update(&c->tiers[i].pd,
+ div_u64(faster_tiers_size *
+ c->tiering_percent, 100),
+ faster_tiers_dirty,
+ -1);
+
+ group_for_each_cache_rcu(ca, &c->tiers[i].devs, iter) {
struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
unsigned bucket_bits = ca->bucket_bits + 9;
+ u64 size = (ca->mi.nbuckets -
+ ca->mi.first_bucket) << bucket_bits;
+ u64 dirty = stats.buckets_dirty << bucket_bits;
+ u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
/*
* Bytes of internal fragmentation, which can be
* reclaimed by copy GC
@@ -149,41 +155,30 @@ static void pd_controllers_update(struct work_struct *work)
((stats.sectors_dirty +
stats.sectors_cached) << 9);
- u64 dev_size = (ca->mi.nbuckets -
- ca->mi.first_bucket) << bucket_bits;
-
- u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
-
if (fragmented < 0)
fragmented = 0;
bch_pd_controller_update(&ca->moving_gc_pd,
free, fragmented, -1);
- if (i == 0)
- tier0_can_free += fragmented;
-
- tier_size[i] += dev_size;
- tier_free[i] += free;
- tier_dirty[i] += stats.buckets_dirty << bucket_bits;
- }
- rcu_read_unlock();
-
- if (tier_size[1]) {
- u64 target = div_u64(tier_size[0] * c->tiering_percent, 100);
+ faster_tiers_size += size;
+ faster_tiers_dirty += dirty;
- tier0_can_free = max_t(s64, 0, tier_dirty[0] - target);
+ if (!c->fastest_tier ||
+ c->fastest_tier == &c->tiers[i]) {
+ fastest_tier_size += size;
+ fastest_tier_free += free;
+ }
- bch_pd_controller_update(&c->tiering_pd,
- target,
- tier_dirty[0],
- -1);
+ copygc_can_free += fragmented;
+ }
}
+ rcu_read_unlock();
+
/*
* Throttle foreground writes if tier 0 is running out of free buckets,
- * and either tiering or copygc can free up space (but don't take both
- * into account).
+ * and either tiering or copygc can free up space.
*
* Target will be small if there isn't any work to do - we don't want to
* throttle foreground writes if we currently have all the free space
@@ -192,12 +187,15 @@ static void pd_controllers_update(struct work_struct *work)
* Otherwise, if there's work to do, try to keep 20% of tier0 available
* for foreground writes.
*/
+ if (c->fastest_tier)
+ copygc_can_free = U64_MAX;
+
bch_pd_controller_update(&c->foreground_write_pd,
- min(tier0_can_free,
- div_u64(tier_size[0] *
+ min(copygc_can_free,
+ div_u64(fastest_tier_size *
c->foreground_target_percent,
100)),
- tier_free[0],
+ fastest_tier_free,
-1);
schedule_delayed_work(&c->pd_controllers_update,
@@ -301,7 +299,8 @@ static int bch_prio_write(struct cache *ca)
* it getting gc'd from under us
*/
ca->prio_buckets[i] = r;
- bch_mark_metadata_bucket(ca, ca->buckets + r, false);
+ bch_mark_metadata_bucket(ca, ca->buckets + r,
+ BUCKET_PRIOS, false);
spin_unlock(&ca->prio_buckets_lock);
SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c));
@@ -334,6 +333,9 @@ static int bch_prio_write(struct cache *ca)
do {
unsigned u64s = jset_u64s(0);
+ if (!test_bit(JOURNAL_STARTED, &c->journal.flags))
+ break;
+
ret = bch_journal_res_get(j, &res, u64s, u64s);
if (ret)
return ret;
@@ -815,8 +817,7 @@ static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca)
if (is_available_bucket(m) &&
!m.cached_sectors &&
!m.had_metadata &&
- (!m.wait_on_journal ||
- ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) {
+ !bucket_needs_journal_commit(m, last_seq_ondisk)) {
spin_lock(&ca->freelist_lock);
bch_mark_alloc_bucket(ca, g, true);
@@ -850,6 +851,8 @@ static int bch_allocator_thread(void *arg)
set_freezable();
+ bch_find_empty_buckets(c, ca);
+
while (1) {
/*
* First, we pull buckets off of the free_inc list, possibly
@@ -894,7 +897,7 @@ static int bch_allocator_thread(void *arg)
* See if we have buckets we can reuse without invalidating them
* or forcing a journal commit:
*/
- bch_find_empty_buckets(c, ca);
+ //bch_find_empty_buckets(c, ca);
if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) {
up_read(&c->gc_lock);
@@ -967,7 +970,7 @@ out:
*
* Returns index of bucket on success, 0 on failure
* */
-static size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
+size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
{
struct bucket *g;
long r;
@@ -1018,21 +1021,21 @@ static void recalc_alloc_group_weights(struct cache_set *c,
u64 available_buckets = 1; /* avoid a divide by zero... */
unsigned i;
- for (i = 0; i < devs->nr_devices; i++) {
+ for (i = 0; i < devs->nr; i++) {
ca = devs->d[i].dev;
devs->d[i].weight = buckets_free_cache(ca);
available_buckets += devs->d[i].weight;
}
- for (i = 0; i < devs->nr_devices; i++) {
+ for (i = 0; i < devs->nr; i++) {
const unsigned min_weight = U32_MAX >> 4;
const unsigned max_weight = U32_MAX;
devs->d[i].weight =
min_weight +
div64_u64(devs->d[i].weight *
- devs->nr_devices *
+ devs->nr *
(max_weight - min_weight),
available_buckets);
devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
@@ -1058,7 +1061,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
rcu_read_lock();
spin_lock(&devs->lock);
- for (i = 0; i < devs->nr_devices; i++)
+ for (i = 0; i < devs->nr; i++)
available += !test_bit(devs->d[i].dev->dev_idx,
caches_used);
@@ -1076,7 +1079,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
}
i++;
- i %= devs->nr_devices;
+ i %= devs->nr;
ret = FREELIST_EMPTY;
if (i == fail_idx)
@@ -1136,20 +1139,25 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
enum alloc_reserve reserve,
long *caches_used)
{
+ struct bch_tier *tier;
/*
* this should implement policy - for a given type of allocation, decide
* which devices to allocate from:
*
* XXX: switch off wp->type and do something more intelligent here
*/
+ if (wp->group)
+ return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
+ wp->group, caches_used);
- /* foreground writes: prefer tier 0: */
- if (wp->group == &c->cache_all)
+ /* foreground writes: prefer fastest tier: */
+ tier = READ_ONCE(c->fastest_tier);
+ if (tier)
bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
- &c->cache_tiers[0], caches_used);
+ &tier->devs, caches_used);
return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
- wp->group, caches_used);
+ &c->cache_all, caches_used);
}
static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp,
@@ -1413,7 +1421,6 @@ struct open_bucket *bch_alloc_sectors_start(struct cache_set *c,
? 0 : BTREE_NODE_RESERVE;
int ret;
- BUG_ON(!wp->group);
BUG_ON(!reserve);
BUG_ON(!nr_replicas);
retry:
@@ -1481,7 +1488,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
unsigned nr_replicas, struct open_bucket *ob,
unsigned sectors)
{
- struct bch_extent_ptr tmp, *ptr;
+ struct bch_extent_ptr tmp;
struct cache *ca;
bool has_data = false;
unsigned i;
@@ -1501,6 +1508,8 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
if (nr_replicas < ob->nr_ptrs)
has_data = true;
+ rcu_read_lock();
+
for (i = 0; i < nr_replicas; i++) {
EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
@@ -1510,10 +1519,12 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
extent_ptr_append(e, tmp);
ob->ptr_offset[i] += sectors;
+
+ if ((ca = PTR_CACHE(c, &ob->ptrs[i])))
+ this_cpu_add(*ca->sectors_written, sectors);
}
- open_bucket_for_each_online_device(c, ob, ptr, ca)
- this_cpu_add(*ca->sectors_written, sectors);
+ rcu_read_unlock();
}
/*
@@ -1586,9 +1597,9 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c,
/* Startup/shutdown (ro/rw): */
-static void bch_recalc_capacity(struct cache_set *c)
+void bch_recalc_capacity(struct cache_set *c)
{
- struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers);
+ struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier;
struct cache *ca;
u64 total_capacity, capacity = 0, reserved_sectors = 0;
unsigned long ra_pages = 0;
@@ -1604,16 +1615,29 @@ static void bch_recalc_capacity(struct cache_set *c)
c->bdi.ra_pages = ra_pages;
+ /* Find fastest, slowest tiers with devices: */
+
+ for (tier = c->tiers;
+ tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
+ if (!tier->devs.nr)
+ continue;
+ if (!fastest_tier)
+ fastest_tier = tier;
+ slowest_tier = tier;
+ }
+
+ c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
+
+ c->promote_write_point.group = &fastest_tier->devs;
+
+ if (!fastest_tier)
+ goto set_capacity;
+
/*
* Capacity of the cache set is the capacity of all the devices in the
* slowest (highest) tier - we don't include lower tier devices.
*/
- for (tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers) - 1;
- tier > c->cache_tiers && !tier->nr_devices;
- --tier)
- ;
-
- group_for_each_cache_rcu(ca, tier, i) {
+ group_for_each_cache_rcu(ca, &slowest_tier->devs, i) {
size_t reserve = 0;
/*
@@ -1649,8 +1673,8 @@ static void bch_recalc_capacity(struct cache_set *c)
ca->mi.first_bucket) <<
ca->bucket_bits;
}
+set_capacity:
rcu_read_unlock();
-
total_capacity = capacity;
capacity *= (100 - c->opts.gc_reserve_percent);
@@ -1727,7 +1751,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca)
void bch_dev_allocator_stop(struct cache *ca)
{
struct cache_set *c = ca->set;
- struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+ struct cache_group *tier = &c->tiers[ca->mi.tier].devs;
struct task_struct *p;
struct closure cl;
unsigned i;
@@ -1808,7 +1832,7 @@ void bch_dev_allocator_stop(struct cache *ca)
int bch_dev_allocator_start(struct cache *ca)
{
struct cache_set *c = ca->set;
- struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+ struct cache_group *tier = &c->tiers[ca->mi.tier].devs;
struct task_struct *k;
/*
@@ -1826,6 +1850,7 @@ int bch_dev_allocator_start(struct cache *ca)
bch_dev_group_add(tier, ca);
bch_dev_group_add(&c->cache_all, ca);
+ bch_dev_group_add(&c->journal.devs, ca);
bch_recalc_capacity(c);
@@ -1838,7 +1863,7 @@ int bch_dev_allocator_start(struct cache *ca)
return 0;
}
-void bch_open_buckets_init(struct cache_set *c)
+void bch_fs_allocator_init(struct cache_set *c)
{
unsigned i;
@@ -1860,19 +1885,11 @@ void bch_open_buckets_init(struct cache_set *c)
spin_lock_init(&c->cache_all.lock);
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
- c->write_points[i].throttle = true;
- c->write_points[i].group = &c->cache_tiers[0];
- }
-
- for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
- spin_lock_init(&c->cache_tiers[i].lock);
+ for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
+ spin_lock_init(&c->tiers[i].devs.lock);
- c->promote_write_point.group = &c->cache_tiers[0];
-
- c->migration_write_point.group = &c->cache_all;
-
- c->btree_write_point.group = &c->cache_all;
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+ c->write_points[i].throttle = true;
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
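recalc_alloc_group_weights() above turns per-device free-bucket counts into allocation weights in a fixed range, so devices with more free space get proportionally larger weights while the min_weight floor keeps every device reachable. A self-contained sketch of that rescaling with the same formula and generic types (the kernel version uses div64_u64 for the division):

#include <stdint.h>

#define MIN_W	((uint64_t) (UINT32_MAX >> 4))
#define MAX_W	((uint64_t) UINT32_MAX)

/* weight[i] starts out as the device's free-bucket count */
static void rescale_weights(uint64_t *weight, unsigned nr)
{
	uint64_t total = 1;		/* avoid a divide by zero */
	unsigned i;

	for (i = 0; i < nr; i++)
		total += weight[i];

	for (i = 0; i < nr; i++) {
		weight[i] = MIN_W +
			weight[i] * nr * (MAX_W - MIN_W) / total;
		if (weight[i] > MAX_W)
			weight[i] = MAX_W;
	}
}
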
diff --git a/libbcache/alloc.h b/libbcache/alloc.h
index 09139a5..9573dd2 100644
--- a/libbcache/alloc.h
+++ b/libbcache/alloc.h
@@ -27,6 +27,8 @@ int bch_prio_read(struct cache *);
void bch_recalc_min_prio(struct cache *, int);
+size_t bch_bucket_alloc(struct cache *, enum alloc_reserve);
+
void bch_open_bucket_put(struct cache_set *, struct open_bucket *);
struct open_bucket *bch_alloc_sectors_start(struct cache_set *,
@@ -58,7 +60,7 @@ static inline struct cache *cache_group_next_rcu(struct cache_group *devs,
{
struct cache *ret = NULL;
- while (*iter < devs->nr_devices &&
+ while (*iter < devs->nr &&
!(ret = rcu_dereference(devs->d[*iter].dev)))
(*iter)++;
@@ -103,8 +105,9 @@ static inline struct cache *cache_group_next(struct cache_group *devs,
((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\
(_ptr)++)
+void bch_recalc_capacity(struct cache_set *);
void bch_dev_allocator_stop(struct cache *);
int bch_dev_allocator_start(struct cache *);
-void bch_open_buckets_init(struct cache_set *);
+void bch_fs_allocator_init(struct cache_set *);
#endif /* _BCACHE_ALLOC_H */
diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h
index fbe8b75..f408bd9 100644
--- a/libbcache/alloc_types.h
+++ b/libbcache/alloc_types.h
@@ -51,7 +51,7 @@ static inline bool allocation_is_metadata(enum alloc_reserve id)
struct cache_group {
spinlock_t lock;
- unsigned nr_devices;
+ unsigned nr;
unsigned cur_device;
struct {
u64 weight;
diff --git a/libbcache/bcache.h b/libbcache/bcache.h
index babc08d..5b668c7 100644
--- a/libbcache/bcache.h
+++ b/libbcache/bcache.h
@@ -464,24 +464,10 @@ struct cache {
* BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
* all the backing devices first (their cached data gets invalidated, and they
* won't automatically reattach).
- *
- * BCH_FS_STOPPING always gets set first when we're closing down a cache set;
- * we'll continue to run normally for awhile with BCH_FS_STOPPING set (i.e.
- * flushing dirty data).
- *
- * BCH_FS_RUNNING means all cache devices have been registered and journal
- * replay is complete.
*/
enum {
- /* Startup: */
BCH_FS_INITIAL_GC_DONE,
- BCH_FS_RUNNING,
-
- /* Shutdown: */
BCH_FS_DETACHING,
- BCH_FS_STOPPING,
- BCH_FS_RO,
- BCH_FS_RO_COMPLETE,
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
BCH_FS_GC_STOPPING,
@@ -498,6 +484,21 @@ struct btree_debug {
struct dentry *failed;
};
+struct bch_tier {
+ unsigned idx;
+ struct task_struct *migrate;
+ struct bch_pd_controller pd;
+
+ struct cache_group devs;
+};
+
+enum bch_fs_state {
+ BCH_FS_STARTING = 0,
+ BCH_FS_STOPPING,
+ BCH_FS_RO,
+ BCH_FS_RW,
+};
+
struct cache_set {
struct closure cl;
@@ -506,7 +507,6 @@ struct cache_set {
struct kobject internal;
struct kobject opts_dir;
struct kobject time_stats;
- struct completion *stop_completion;
unsigned long flags;
int minor;
@@ -514,6 +514,10 @@ struct cache_set {
struct super_block *vfs_sb;
char name[40];
+ /* ro/rw, add/remove devices: */
+ struct mutex state_lock;
+ enum bch_fs_state state;
+
/* Counts outstanding writes, for clean transition to read-only */
struct percpu_ref writes;
struct work_struct read_only_work;
@@ -640,7 +644,9 @@ struct cache_set {
* allocate from:
*/
struct cache_group cache_all;
- struct cache_group cache_tiers[BCH_TIER_MAX];
+ struct bch_tier tiers[BCH_TIER_MAX];
+ /* NULL if we only have devices in one tier: */
+ struct bch_tier *fastest_tier;
u64 capacity; /* sectors */
@@ -753,10 +759,6 @@ struct cache_set {
unsigned writeback_pages_max;
atomic_long_t nr_inodes;
- /* TIERING */
- struct task_struct *tiering_read;
- struct bch_pd_controller tiering_pd;
-
/* NOTIFICATIONS */
struct mutex uevent_lock;
struct kobj_uevent_env uevent_env;
@@ -828,6 +830,11 @@ struct cache_set {
#undef BCH_TIME_STAT
};
+static inline bool bch_fs_running(struct cache_set *c)
+{
+ return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
+}
+
static inline unsigned bucket_pages(const struct cache *ca)
{
return ca->mi.bucket_size / PAGE_SECTORS;
diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c
index 82b07f5..ba2e9a8 100644
--- a/libbcache/blockdev.c
+++ b/libbcache/blockdev.c
@@ -375,6 +375,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
bool found;
int ret;
+ lockdep_assert_held(&c->state_lock);
+
bdevname(dc->disk_sb.bdev, buf);
if (memcmp(&dc->disk_sb.sb->set_uuid,
@@ -387,11 +389,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
return -EINVAL;
}
- if (!test_bit(BCH_FS_RUNNING, &c->flags))
- return 0;
-
- if (test_bit(BCH_FS_STOPPING, &c->flags)) {
- pr_err("Can't attach %s: shutting down", buf);
+ if (!bch_fs_running(c)) {
+ pr_err("Can't attach %s: not running", buf);
return -EINVAL;
}
@@ -497,6 +496,7 @@ void bch_attach_backing_devs(struct cache_set *c)
struct cached_dev *dc, *t;
lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
list_for_each_entry_safe(dc, t, &uncached_devices, list)
bch_cached_dev_attach(dc, c);
@@ -742,7 +742,7 @@ int bch_blockdev_volumes_start(struct cache_set *c)
struct bkey_s_c_inode_blockdev inode;
int ret = 0;
- if (test_bit(BCH_FS_STOPPING, &c->flags))
+ if (!bch_fs_running(c))
return -EINVAL;
for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c
index 4d5efdb..4d0c6d4 100644
--- a/libbcache/btree_cache.c
+++ b/libbcache/btree_cache.c
@@ -11,8 +11,9 @@
#define DEF_BTREE_ID(kwd, val, name) name,
-const char *bch_btree_id_names[BTREE_ID_NR] = {
+const char * const bch_btree_ids[] = {
DEFINE_BCH_BTREE_IDS()
+ NULL
};
#undef DEF_BTREE_ID
@@ -311,7 +312,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
return mca_can_free(c) * btree_pages(c);
}
-void bch_btree_cache_free(struct cache_set *c)
+void bch_fs_btree_exit(struct cache_set *c)
{
struct btree *b;
unsigned i;
@@ -358,7 +359,7 @@ void bch_btree_cache_free(struct cache_set *c)
rhashtable_destroy(&c->btree_cache_table);
}
-int bch_btree_cache_alloc(struct cache_set *c)
+int bch_fs_btree_init(struct cache_set *c)
{
unsigned i;
int ret;
diff --git a/libbcache/btree_cache.h b/libbcache/btree_cache.h
index c26489d..4d67704 100644
--- a/libbcache/btree_cache.h
+++ b/libbcache/btree_cache.h
@@ -6,7 +6,7 @@
struct btree_iter;
-extern const char *bch_btree_id_names[BTREE_ID_NR];
+extern const char * const bch_btree_ids[];
void bch_recalc_btree_reserve(struct cache_set *);
@@ -22,8 +22,8 @@ struct btree *mca_alloc(struct cache_set *);
struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *,
unsigned, enum six_lock_type);
-void bch_btree_cache_free(struct cache_set *);
-int bch_btree_cache_alloc(struct cache_set *);
+void bch_fs_btree_exit(struct cache_set *);
+int bch_fs_btree_init(struct cache_set *);
#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \
diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c
index 0eb7290..b90807f 100644
--- a/libbcache/btree_gc.c
+++ b/libbcache/btree_gc.c
@@ -262,30 +262,72 @@ static void bch_mark_allocator_buckets(struct cache_set *c)
}
}
+static void mark_metadata_sectors(struct cache *ca, u64 start, u64 end,
+ enum bucket_data_type type)
+{
+ u64 b = start >> ca->bucket_bits;
+
+ do {
+ bch_mark_metadata_bucket(ca, ca->buckets + b, type, true);
+ b++;
+ } while (b < end >> ca->bucket_bits);
+}
+
/*
* Mark non btree metadata - prios, journal
*/
-static void bch_mark_metadata(struct cache_set *c)
+static void bch_mark_dev_metadata(struct cache_set *c, struct cache *ca)
{
- struct cache *ca;
- unsigned i, j;
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ unsigned i;
u64 b;
- for_each_cache(ca, c, i) {
- for (j = 0; j < ca->journal.nr; j++) {
- b = ca->journal.buckets[j];
- bch_mark_metadata_bucket(ca, ca->buckets + b, true);
- }
+ /* Mark superblocks: */
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ if (layout->sb_offset[i] == BCH_SB_SECTOR)
+ mark_metadata_sectors(ca, 0, BCH_SB_SECTOR,
+ BUCKET_SB);
+
+ mark_metadata_sectors(ca,
+ layout->sb_offset[i],
+ layout->sb_offset[i] +
+ (1 << layout->sb_max_size_bits),
+ BUCKET_SB);
+ }
- spin_lock(&ca->prio_buckets_lock);
+ spin_lock(&c->journal.lock);
- for (j = 0; j < prio_buckets(ca) * 2; j++) {
- b = ca->prio_buckets[j];
- bch_mark_metadata_bucket(ca, ca->buckets + b, true);
- }
+ for (i = 0; i < ca->journal.nr; i++) {
+ b = ca->journal.buckets[i];
+ bch_mark_metadata_bucket(ca, ca->buckets + b,
+ BUCKET_JOURNAL, true);
+ }
+
+ spin_unlock(&c->journal.lock);
+
+ spin_lock(&ca->prio_buckets_lock);
- spin_unlock(&ca->prio_buckets_lock);
+ for (i = 0; i < prio_buckets(ca) * 2; i++) {
+ b = ca->prio_buckets[i];
+ if (b)
+ bch_mark_metadata_bucket(ca, ca->buckets + b,
+ BUCKET_PRIOS, true);
}
+
+ spin_unlock(&ca->prio_buckets_lock);
+}
+
+static void bch_mark_metadata(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ mutex_lock(&c->sb_lock);
+
+ for_each_cache(ca, c, i)
+ bch_mark_dev_metadata(c, ca);
+
+ mutex_unlock(&c->sb_lock);
}
/* Also see bch_pending_btree_node_free_insert_done() */
@@ -389,7 +431,7 @@ void bch_gc(struct cache_set *c)
for_each_bucket(g, ca) {
bucket_cmpxchg(g, new, ({
new.owned_by_allocator = 0;
- new.is_metadata = 0;
+ new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
}));
@@ -750,9 +792,6 @@ void bch_coalesce(struct cache_set *c)
u64 start_time;
enum btree_id id;
- if (btree_gc_coalesce_disabled(c))
- return;
-
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return;
@@ -811,7 +850,8 @@ static int bch_gc_thread(void *arg)
last_kick = atomic_read(&c->kick_gc);
bch_gc(c);
- bch_coalesce(c);
+ if (!btree_gc_coalesce_disabled(c))
+ bch_coalesce(c);
debug_check_no_locks_held();
}
@@ -823,18 +863,24 @@ void bch_gc_thread_stop(struct cache_set *c)
{
set_bit(BCH_FS_GC_STOPPING, &c->flags);
- if (!IS_ERR_OR_NULL(c->gc_thread))
+ if (c->gc_thread)
kthread_stop(c->gc_thread);
+
+ c->gc_thread = NULL;
+ clear_bit(BCH_FS_GC_STOPPING, &c->flags);
}
int bch_gc_thread_start(struct cache_set *c)
{
- clear_bit(BCH_FS_GC_STOPPING, &c->flags);
+ struct task_struct *p;
- c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
- if (IS_ERR(c->gc_thread))
- return PTR_ERR(c->gc_thread);
+ BUG_ON(c->gc_thread);
+ p = kthread_create(bch_gc_thread, c, "bcache_gc");
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ c->gc_thread = p;
wake_up_process(c->gc_thread);
return 0;
}
@@ -883,12 +929,13 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal)
{
enum btree_id id;
- if (journal) {
- for (id = 0; id < BTREE_ID_NR; id++)
- bch_initial_gc_btree(c, id);
+ bch_mark_metadata(c);
+ for (id = 0; id < BTREE_ID_NR; id++)
+ bch_initial_gc_btree(c, id);
+
+ if (journal)
bch_journal_mark(c, journal);
- }
/*
* Skip past versions that might have possibly been used (as nonces),
@@ -897,8 +944,6 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal)
if (c->sb.encryption_type)
atomic64_add(1 << 16, &c->key_version);
- bch_mark_metadata(c);
-
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
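mark_metadata_sectors() above converts a sector range into bucket indices by shifting by bucket_bits, and uses a do/while so that a range smaller than one bucket still marks the bucket it falls in. A minimal sketch of that conversion, with a callback standing in for bch_mark_metadata_bucket():

#include <stdint.h>

/* mark buckets from the one containing 'start' up to the one containing
 * 'end' (exclusive); always marks at least one bucket */
static void mark_sector_range(uint64_t start, uint64_t end,
			      unsigned bucket_bits,
			      void (*mark_bucket)(uint64_t bucket))
{
	uint64_t b = start >> bucket_bits;

	do {
		mark_bucket(b);
		b++;
	} while (b < (end >> bucket_bits));
}
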
diff --git a/libbcache/buckets.c b/libbcache/buckets.c
index 315cfbe..ec4ee54 100644
--- a/libbcache/buckets.c
+++ b/libbcache/buckets.c
@@ -66,6 +66,7 @@
#include "alloc.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "error.h"
#include <linux/preempt.h>
#include <trace/events/bcache.h>
@@ -102,6 +103,10 @@ static void bch_fs_stats_verify(struct cache_set *c) {}
#endif
+/*
+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent
+ * wraparound:
+ */
void bch_bucket_seq_cleanup(struct cache_set *c)
{
u16 last_seq_ondisk = c->journal.last_seq_ondisk;
@@ -113,12 +118,11 @@ void bch_bucket_seq_cleanup(struct cache_set *c)
for_each_cache(ca, c, i)
for_each_bucket(g, ca) {
bucket_cmpxchg(g, m, ({
- if (!m.wait_on_journal ||
- ((s16) last_seq_ondisk -
- (s16) m.journal_seq < 0))
+ if (!m.journal_seq_valid ||
+ bucket_needs_journal_commit(m, last_seq_ondisk))
break;
- m.wait_on_journal = 0;
+ m.journal_seq_valid = 0;
}));
}
}
@@ -186,17 +190,18 @@ bch_bucket_stats_read_cache_set(struct cache_set *c)
static inline int is_meta_bucket(struct bucket_mark m)
{
- return !m.owned_by_allocator && m.is_metadata;
+ return m.data_type != BUCKET_DATA;
}
static inline int is_dirty_bucket(struct bucket_mark m)
{
- return !m.owned_by_allocator && !m.is_metadata && !!m.dirty_sectors;
+ return m.data_type == BUCKET_DATA && !!m.dirty_sectors;
}
static inline int is_cached_bucket(struct bucket_mark m)
{
- return !m.owned_by_allocator && !m.dirty_sectors && !!m.cached_sectors;
+ return m.data_type == BUCKET_DATA &&
+ !m.dirty_sectors && !!m.cached_sectors;
}
void bch_fs_stats_apply(struct cache_set *c,
@@ -236,29 +241,37 @@ void bch_fs_stats_apply(struct cache_set *c,
memset(stats, 0, sizeof(*stats));
}
+static bool bucket_became_unavailable(struct cache_set *c,
+ struct bucket_mark old,
+ struct bucket_mark new)
+{
+ return is_available_bucket(old) &&
+ !is_available_bucket(new) &&
+ c->gc_pos.phase == GC_PHASE_DONE;
+}
+
static void bucket_stats_update(struct cache *ca,
struct bucket_mark old, struct bucket_mark new,
- bool may_make_unavailable,
struct bucket_stats_cache_set *bch_alloc_stats)
{
struct cache_set *c = ca->set;
struct bucket_stats_cache *cache_stats;
- BUG_ON(!may_make_unavailable &&
- is_available_bucket(old) &&
- !is_available_bucket(new) &&
- c->gc_pos.phase == GC_PHASE_DONE);
+ bch_fs_inconsistent_on(old.data_type && new.data_type &&
+ old.data_type != new.data_type, c,
+ "different types of metadata in same bucket: %u, %u",
+ old.data_type, new.data_type);
if (bch_alloc_stats) {
bch_alloc_stats->s[S_COMPRESSED][S_CACHED] +=
(int) new.cached_sectors - (int) old.cached_sectors;
bch_alloc_stats->s[S_COMPRESSED]
- [old.is_metadata ? S_META : S_DIRTY] -=
+ [is_meta_bucket(old) ? S_META : S_DIRTY] -=
old.dirty_sectors;
bch_alloc_stats->s[S_COMPRESSED]
- [new.is_metadata ? S_META : S_DIRTY] +=
+ [is_meta_bucket(new) ? S_META : S_DIRTY] +=
new.dirty_sectors;
}
@@ -268,12 +281,12 @@ static void bucket_stats_update(struct cache *ca,
cache_stats->sectors_cached +=
(int) new.cached_sectors - (int) old.cached_sectors;
- if (old.is_metadata)
+ if (is_meta_bucket(old))
cache_stats->sectors_meta -= old.dirty_sectors;
else
cache_stats->sectors_dirty -= old.dirty_sectors;
- if (new.is_metadata)
+ if (is_meta_bucket(new))
cache_stats->sectors_meta += new.dirty_sectors;
else
cache_stats->sectors_dirty += new.dirty_sectors;
@@ -290,6 +303,15 @@ static void bucket_stats_update(struct cache *ca,
bch_wake_allocator(ca);
}
+#define bucket_data_cmpxchg(ca, g, new, expr) \
+({ \
+ struct bucket_stats_cache_set _stats = { 0 }; \
+ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
+ \
+ bucket_stats_update(ca, _old, new, &_stats); \
+ _old; \
+})
+
void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
{
struct bucket_stats_cache_set stats = { 0 };
@@ -297,16 +319,17 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
old = bucket_cmpxchg(g, new, ({
new.owned_by_allocator = 1;
- new.is_metadata = 0;
+ new.had_metadata = 0;
+ new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
new.copygc = 0;
new.gen++;
}));
- BUG_ON(old.dirty_sectors);
+ bucket_stats_update(ca, old, new, &stats);
- bucket_stats_update(ca, old, new, true, &stats);
+ BUG_ON(old.dirty_sectors);
/*
* Ick:
@@ -329,45 +352,45 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
void bch_mark_free_bucket(struct cache *ca, struct bucket *g)
{
- struct bucket_stats_cache_set stats = { 0 };
struct bucket_mark old, new;
- old = bucket_cmpxchg(g, new, ({
+ old = bucket_data_cmpxchg(ca, g, new, ({
new.owned_by_allocator = 0;
- new.is_metadata = 0;
+ new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
}));
- bucket_stats_update(ca, old, new, false, &stats);
+ BUG_ON(bucket_became_unavailable(ca->set, old, new));
}
void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g,
bool owned_by_allocator)
{
- struct bucket_stats_cache_set stats = { 0 };
- struct bucket_mark old, new;
+ struct bucket_mark new;
- old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator);
-
- bucket_stats_update(ca, old, new, true, &stats);
+ bucket_data_cmpxchg(ca, g, new, ({
+ new.owned_by_allocator = owned_by_allocator;
+ }));
}
void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g,
+ enum bucket_data_type type,
bool may_make_unavailable)
{
- struct bucket_stats_cache_set stats = { 0 };
struct bucket_mark old, new;
- old = bucket_cmpxchg(g, new, ({
- new.is_metadata = 1;
+ BUG_ON(!type);
+
+ old = bucket_data_cmpxchg(ca, g, new, ({
+ new.data_type = type;
new.had_metadata = 1;
}));
BUG_ON(old.cached_sectors);
BUG_ON(old.dirty_sectors);
-
- bucket_stats_update(ca, old, new, may_make_unavailable, &stats);
+ BUG_ON(!may_make_unavailable &&
+ bucket_became_unavailable(ca->set, old, new));
}
#define saturated_add(ca, dst, src, max) \
@@ -487,22 +510,26 @@ static void bch_mark_pointer(struct cache_set *c,
if (!new.dirty_sectors &&
!new.cached_sectors) {
- new.is_metadata = false;
+ new.data_type = 0;
if (journal_seq) {
- new.wait_on_journal = true;
+ new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}
} else {
- new.is_metadata = (type == S_META);
+ new.data_type = type == S_META
+ ? BUCKET_BTREE : BUCKET_DATA;
}
- new.had_metadata |= new.is_metadata;
+ new.had_metadata |= is_meta_bucket(new);
} while ((v = cmpxchg(&g->_mark.counter,
old.counter,
new.counter)) != old.counter);
- bucket_stats_update(ca, old, new, may_make_unavailable, NULL);
+ bucket_stats_update(ca, old, new, NULL);
+
+ BUG_ON(!may_make_unavailable &&
+ bucket_became_unavailable(c, old, new));
if (saturated &&
atomic_long_add_return(saturated,
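All of the mark updates above go through bucket_cmpxchg()/bucket_data_cmpxchg(): the bucket's counters live in a bitfield that shares storage with a single machine word, so a whole new mark can be computed from the old one and swapped in atomically, retrying if another CPU got there first. A free-standing sketch of that pattern using C11 atomics (the field layout here is made up and much simpler than the real struct bucket_mark):

#include <stdatomic.h>
#include <stdint.h>

union mark {
	struct {
		uint8_t  gen;
		uint8_t  flags;		/* stands in for data_type etc. */
		uint16_t dirty_sectors;
		uint16_t cached_sectors;
		uint16_t journal_seq;
	};
	uint64_t counter;		/* the whole mark as one word */
};

/* clear a bucket's usage, returning the mark we replaced */
static union mark mark_free(_Atomic uint64_t *m)
{
	union mark old, new;

	old.counter = atomic_load(m);
	do {
		new = old;
		new.flags          = 0;
		new.dirty_sectors  = 0;
		new.cached_sectors = 0;
	} while (!atomic_compare_exchange_weak(m, &old.counter, new.counter));

	return old;	/* caller accounts stats from the old -> new transition */
}
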
diff --git a/libbcache/buckets.h b/libbcache/buckets.h
index 9c6e438..6d70103 100644
--- a/libbcache/buckets.h
+++ b/libbcache/buckets.h
@@ -235,8 +235,16 @@ static inline u64 sectors_available(struct cache_set *c)
static inline bool is_available_bucket(struct bucket_mark mark)
{
return (!mark.owned_by_allocator &&
- !mark.is_metadata &&
- !mark.dirty_sectors);
+ mark.data_type == BUCKET_DATA &&
+ !mark.dirty_sectors &&
+ !mark.nouse);
+}
+
+static inline bool bucket_needs_journal_commit(struct bucket_mark m,
+ u16 last_seq_ondisk)
+{
+ return m.journal_seq_valid &&
+ ((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
}
void bch_bucket_seq_cleanup(struct cache_set *);
@@ -244,7 +252,8 @@ void bch_bucket_seq_cleanup(struct cache_set *);
void bch_invalidate_bucket(struct cache *, struct bucket *);
void bch_mark_free_bucket(struct cache *, struct bucket *);
void bch_mark_alloc_bucket(struct cache *, struct bucket *, bool);
-void bch_mark_metadata_bucket(struct cache *, struct bucket *, bool);
+void bch_mark_metadata_bucket(struct cache *, struct bucket *,
+ enum bucket_data_type, bool);
void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool,
struct bucket_stats_cache_set *);
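bucket_needs_journal_commit() compares two 16-bit journal sequence numbers that are allowed to wrap around, which is why the difference is evaluated as a signed 16-bit value instead of using a plain '>'. A small worked example of the idiom (the patch writes it as (s16) a - (s16) b, which gives the same answer as long as the two sequence numbers are within half the 16-bit range of each other):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* true if sequence number a is "after" b, modulo 2^16 */
static bool seq_after(uint16_t a, uint16_t b)
{
	return (int16_t) (a - b) > 0;
}

int main(void)
{
	printf("%d\n", seq_after(10, 5));	/* 1: 10 comes after 5         */
	printf("%d\n", seq_after(3, 65530));	/* 1: 3 has wrapped past 65530 */
	printf("%d\n", seq_after(65530, 3));	/* 0                           */
	return 0;
}
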
diff --git a/libbcache/buckets_types.h b/libbcache/buckets_types.h
index 6bbdcd2..f42e09d 100644
--- a/libbcache/buckets_types.h
+++ b/libbcache/buckets_types.h
@@ -1,6 +1,14 @@
#ifndef _BUCKETS_TYPES_H
#define _BUCKETS_TYPES_H
+enum bucket_data_type {
+ BUCKET_DATA = 0,
+ BUCKET_BTREE,
+ BUCKET_PRIOS,
+ BUCKET_JOURNAL,
+ BUCKET_SB,
+};
+
struct bucket_mark {
union {
struct {
@@ -12,23 +20,30 @@ struct bucket_mark {
/* generation copygc is going to move this bucket into */
unsigned copygc:1;
- unsigned wait_on_journal:1;
+
+ unsigned journal_seq_valid:1;
/*
- * If this bucket ever had metadata in it, the allocator must
- * increment its gen before we reuse it:
+ * If this bucket had metadata while at the current generation
+ * number, the allocator must increment its gen before we reuse
+ * it:
*/
unsigned had_metadata:1;
unsigned owned_by_allocator:1;
- unsigned is_metadata:1;
- u16 cached_sectors;
+ unsigned data_type:3;
+
+ unsigned nouse:1;
+
u16 dirty_sectors;
+ u16 cached_sectors;
/*
* low bits of journal sequence number when this bucket was most
- * recently modified:
+ * recently modified: if journal_seq_valid is set, this bucket
+ * can't be reused until the journal sequence number written to
+ * disk is >= the bucket's journal sequence number:
*/
u16 journal_seq;
};
diff --git a/libbcache/chardev.c b/libbcache/chardev.c
index b142d7b..049aa91 100644
--- a/libbcache/chardev.c
+++ b/libbcache/chardev.c
@@ -107,7 +107,7 @@ static long bch_global_ioctl(unsigned cmd, void __user *arg)
static long bch_ioctl_stop(struct cache_set *c)
{
- bch_fs_stop(c);
+ bch_fs_stop_async(c);
return 0;
}
diff --git a/libbcache/checksum.c b/libbcache/checksum.c
index dae52d4..92036db 100644
--- a/libbcache/checksum.c
+++ b/libbcache/checksum.c
@@ -539,15 +539,12 @@ int bch_enable_encryption(struct cache_set *c, bool keyed)
if (ret)
goto err;
- crypt = container_of_or_null(bch_fs_sb_field_resize(c, NULL,
- sizeof(*crypt) / sizeof(u64)),
- struct bch_sb_field_crypt, field);
+ crypt = bch_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64));
if (!crypt) {
ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
goto err;
}
- crypt->field.type = BCH_SB_FIELD_crypt;
crypt->key = key;
/* write superblock */
@@ -560,7 +557,7 @@ err:
return ret;
}
-void bch_fs_encryption_free(struct cache_set *c)
+void bch_fs_encryption_exit(struct cache_set *c)
{
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
diff --git a/libbcache/checksum.h b/libbcache/checksum.h
index 137c915..9d4da08 100644
--- a/libbcache/checksum.h
+++ b/libbcache/checksum.h
@@ -43,7 +43,7 @@ void bch_encrypt_bio(struct cache_set *, unsigned,
int bch_disable_encryption(struct cache_set *);
int bch_enable_encryption(struct cache_set *, bool);
-void bch_fs_encryption_free(struct cache_set *);
+void bch_fs_encryption_exit(struct cache_set *);
int bch_fs_encryption_init(struct cache_set *);
static inline unsigned bch_data_checksum_type(struct cache_set *c)
diff --git a/libbcache/compress.c b/libbcache/compress.c
index f81a814..89da31e 100644
--- a/libbcache/compress.c
+++ b/libbcache/compress.c
@@ -434,10 +434,10 @@ int bch_check_set_has_compressed_data(struct cache_set *c,
break;
}
- return bch_compress_init(c);
+ return bch_fs_compress_init(c);
}
-void bch_compress_free(struct cache_set *c)
+void bch_fs_compress_exit(struct cache_set *c)
{
vfree(c->zlib_workspace);
mempool_exit(&c->lz4_workspace_pool);
@@ -450,15 +450,11 @@ void bch_compress_free(struct cache_set *c)
max_t(size_t, zlib_inflate_workspacesize(), \
zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
-int bch_compress_init(struct cache_set *c)
+int bch_fs_compress_init(struct cache_set *c)
{
unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9);
int ret, cpu;
- if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
- !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
- return 0;
-
if (!c->bio_decompress_worker) {
c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker);
if (!c->bio_decompress_worker)
@@ -474,6 +470,10 @@ int bch_compress_init(struct cache_set *c)
}
}
+ if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
+ !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
+ return 0;
+
if (!mempool_initialized(&c->compression_bounce[READ])) {
ret = mempool_init_page_pool(&c->compression_bounce[READ],
1, order);
diff --git a/libbcache/compress.h b/libbcache/compress.h
index 485acd9..4604b06 100644
--- a/libbcache/compress.h
+++ b/libbcache/compress.h
@@ -9,7 +9,7 @@ void bch_bio_compress(struct cache_set *, struct bio *, size_t *,
struct bio *, size_t *, unsigned *);
int bch_check_set_has_compressed_data(struct cache_set *, unsigned);
-void bch_compress_free(struct cache_set *);
-int bch_compress_init(struct cache_set *);
+void bch_fs_compress_exit(struct cache_set *);
+int bch_fs_compress_init(struct cache_set *);
#endif /* _BCACHE_COMPRESS_H */
diff --git a/libbcache/debug.c b/libbcache/debug.c
index d25c32a..16cc72b 100644
--- a/libbcache/debug.c
+++ b/libbcache/debug.c
@@ -409,13 +409,13 @@ static const struct file_operations bfloat_failed_debug_ops = {
.read = bch_read_bfloat_failed,
};
-void bch_debug_exit_cache_set(struct cache_set *c)
+void bch_fs_debug_exit(struct cache_set *c)
{
if (!IS_ERR_OR_NULL(c->debug))
debugfs_remove_recursive(c->debug);
}
-void bch_debug_init_cache_set(struct cache_set *c)
+void bch_fs_debug_init(struct cache_set *c)
{
struct btree_debug *bd;
char name[100];
@@ -432,18 +432,18 @@ void bch_debug_init_cache_set(struct cache_set *c)
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- bd->btree = debugfs_create_file(bch_btree_id_names[bd->id],
+ bd->btree = debugfs_create_file(bch_btree_ids[bd->id],
0400, c->debug, bd,
&btree_debug_ops);
snprintf(name, sizeof(name), "%s-formats",
- bch_btree_id_names[bd->id]);
+ bch_btree_ids[bd->id]);
bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
&btree_format_debug_ops);
snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch_btree_id_names[bd->id]);
+ bch_btree_ids[bd->id]);
bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
&bfloat_failed_debug_ops);
diff --git a/libbcache/debug.h b/libbcache/debug.h
index a3635e6..d34a95a 100644
--- a/libbcache/debug.h
+++ b/libbcache/debug.h
@@ -52,11 +52,11 @@ static inline void bch_btree_verify(struct cache_set *c, struct btree *b)
}
#ifdef CONFIG_DEBUG_FS
-void bch_debug_exit_cache_set(struct cache_set *);
-void bch_debug_init_cache_set(struct cache_set *);
+void bch_fs_debug_exit(struct cache_set *);
+void bch_fs_debug_init(struct cache_set *);
#else
-static inline void bch_debug_exit_cache_set(struct cache_set *c) {}
-static inline void bch_debug_init_cache_set(struct cache_set *c) {}
+static inline void bch_fs_debug_exit(struct cache_set *c) {}
+static inline void bch_fs_debug_init(struct cache_set *c) {}
#endif
void bch_debug_exit(void);
diff --git a/libbcache/error.c b/libbcache/error.c
index 9f39be1..f4109da 100644
--- a/libbcache/error.c
+++ b/libbcache/error.c
@@ -14,7 +14,7 @@ void bch_inconsistent_error(struct cache_set *c)
case BCH_ON_ERROR_RO:
if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
/* XXX do something better here? */
- bch_fs_stop(c);
+ bch_fs_stop_async(c);
return;
}
@@ -120,7 +120,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
} else {
bch_notify_dev_error(ca, true);
- mutex_lock(&bch_register_lock);
+ mutex_lock(&c->state_lock);
dev = bch_dev_may_remove(ca);
if (dev
? bch_dev_read_only(ca)
@@ -129,7 +129,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
"too many IO errors on %s, setting %s RO",
bdevname(ca->disk_sb.bdev, buf),
dev ? "device" : "filesystem");
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
}
}
diff --git a/libbcache/extents.c b/libbcache/extents.c
index 523f3f4..c5e0e37 100644
--- a/libbcache/extents.c
+++ b/libbcache/extents.c
@@ -547,7 +547,7 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b,
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
- !g->mark.is_metadata;
+ g->mark.data_type != BUCKET_BTREE;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
err = "inconsistent";
@@ -602,6 +602,7 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
+ struct extent_pick_ptr pick = { .ca = NULL };
struct cache *ca;
rcu_read_lock();
@@ -621,15 +622,19 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
PTR_BUCKET_NR(ca, ptr)))
continue;
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
+ if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
+ continue;
- return (struct extent_pick_ptr) { .ptr = *ptr, .ca = ca };
+ pick.ca = ca;
+ pick.ptr = *ptr;
}
+ if (pick.ca)
+ percpu_ref_get(&pick.ca->ref);
+
rcu_read_unlock();
- return (struct extent_pick_ptr) { .ca = NULL, };
+ return pick;
}
const struct bkey_ops bch_bkey_btree_ops = {
@@ -1880,7 +1885,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
if (stale)
break;
- bad = (mark.is_metadata ||
+ bad = (mark.data_type != BUCKET_DATA ||
(gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
!mark.owned_by_allocator &&
!(ptr->cached
@@ -2193,17 +2198,21 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
rcu_read_lock();
ret->ca = NULL;
- extent_for_each_online_device_crc(c, e, crc, ptr, ca)
- if (!ptr_stale(ca, ptr)) {
- *ret = (struct extent_pick_ptr) {
- .crc = crc_to_128(e.k, crc),
- .ptr = *ptr,
- .ca = ca,
- };
-
- if (ca != avoid)
- break;
- }
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+ if (ptr_stale(ca, ptr))
+ continue;
+
+ if (ret->ca &&
+ (ca == avoid ||
+ ret->ca->mi.tier < ca->mi.tier))
+ continue;
+
+ *ret = (struct extent_pick_ptr) {
+ .crc = crc_to_128(e.k, crc),
+ .ptr = *ptr,
+ .ca = ca,
+ };
+ }
if (ret->ca)
percpu_ref_get(&ret->ca->ref);
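Both bch_btree_pick_ptr() and bch_extent_pick_ptr_avoiding() now scan every replica and remember the best one seen so far, preferring lower tier numbers (faster devices) instead of returning the first usable pointer. The selection loop, reduced to a free-standing sketch with made-up types:

#include <stdbool.h>

struct replica { unsigned tier; bool stale; };

/* index of the preferred replica, or -1 if every copy is stale */
static int pick_replica(const struct replica *r, unsigned nr)
{
	int best = -1;

	for (unsigned i = 0; i < nr; i++) {
		if (r[i].stale)
			continue;

		/* keep the current pick if it's on a strictly faster tier */
		if (best >= 0 && r[best].tier < r[i].tier)
			continue;

		best = i;
	}
	return best;
}
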
diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c
index e9585fd..e2f1427 100644
--- a/libbcache/fs-gc.c
+++ b/libbcache/fs-gc.c
@@ -545,9 +545,9 @@ struct nlink {
u32 dir_count;
};
-DECLARE_GENRADIX_TYPE(nlinks, struct nlink);
+typedef GENRADIX(struct nlink) nlink_table;
-static void inc_link(struct cache_set *c, struct nlinks *links,
+static void inc_link(struct cache_set *c, nlink_table *links,
u64 range_start, u64 *range_end,
u64 inum, bool dir)
{
@@ -570,7 +570,7 @@ static void inc_link(struct cache_set *c, struct nlinks *links,
}
noinline_for_stack
-static int bch_gc_walk_dirents(struct cache_set *c, struct nlinks *links,
+static int bch_gc_walk_dirents(struct cache_set *c, nlink_table *links,
u64 range_start, u64 *range_end)
{
struct btree_iter iter;
@@ -776,7 +776,7 @@ fsck_err:
noinline_for_stack
static int bch_gc_walk_inodes(struct cache_set *c,
struct bch_inode_unpacked *lostfound_inode,
- struct nlinks *links,
+ nlink_table *links,
u64 range_start, u64 range_end)
{
struct btree_iter iter;
@@ -850,7 +850,7 @@ noinline_for_stack
static int check_inode_nlinks(struct cache_set *c,
struct bch_inode_unpacked *lostfound_inode)
{
- struct nlinks links;
+ nlink_table links;
u64 this_iter_range_start, next_iter_range_start = 0;
int ret = 0;
diff --git a/libbcache/fs.c b/libbcache/fs.c
index ab0d972..ec70a3e 100644
--- a/libbcache/fs.c
+++ b/libbcache/fs.c
@@ -1257,13 +1257,17 @@ static struct cache_set *bch_open_as_blockdevs(const char *_dev_name,
if (!c)
goto err_unlock;
- if (!test_bit(BCH_FS_RUNNING, &c->flags)) {
+ mutex_lock(&c->state_lock);
+
+ if (!bch_fs_running(c)) {
+ mutex_unlock(&c->state_lock);
err = "incomplete cache set";
c = NULL;
goto err_unlock;
}
closure_get(&c->cl);
+ mutex_unlock(&c->state_lock);
mutex_unlock(&bch_register_lock);
}
@@ -1291,22 +1295,19 @@ static int bch_remount(struct super_block *sb, int *flags, char *data)
if (ret)
return ret;
- mutex_lock(&bch_register_lock);
-
if (opts.read_only >= 0 &&
opts.read_only != c->opts.read_only) {
const char *err = NULL;
if (opts.read_only) {
- bch_fs_read_only_sync(c);
+ bch_fs_read_only(c);
sb->s_flags |= MS_RDONLY;
} else {
err = bch_fs_read_write(c);
if (err) {
bch_err(c, "error going rw: %s", err);
- ret = -EINVAL;
- goto unlock;
+ return -EINVAL;
}
sb->s_flags &= ~MS_RDONLY;
@@ -1318,9 +1319,6 @@ static int bch_remount(struct super_block *sb, int *flags, char *data)
if (opts.errors >= 0)
c->opts.errors = opts.errors;
-unlock:
- mutex_unlock(&bch_register_lock);
-
return ret;
}
@@ -1449,7 +1447,7 @@ static void bch_kill_sb(struct super_block *sb)
generic_shutdown_super(sb);
if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
- bch_fs_stop_sync(c);
+ bch_fs_stop(c);
else
closure_put(&c->cl);
}
@@ -1464,7 +1462,7 @@ static struct file_system_type bcache_fs_type = {
MODULE_ALIAS_FS("bcache");
-void bch_fs_exit(void)
+void bch_vfs_exit(void)
{
unregister_filesystem(&bcache_fs_type);
if (bch_dio_write_bioset)
@@ -1477,7 +1475,7 @@ void bch_fs_exit(void)
kmem_cache_destroy(bch_inode_cache);
}
-int __init bch_fs_init(void)
+int __init bch_vfs_init(void)
{
int ret = -ENOMEM;
@@ -1504,6 +1502,6 @@ int __init bch_fs_init(void)
return 0;
err:
- bch_fs_exit();
+ bch_vfs_exit();
return ret;
}
diff --git a/libbcache/fs.h b/libbcache/fs.h
index 933fb6d..2a29b13 100644
--- a/libbcache/fs.h
+++ b/libbcache/fs.h
@@ -52,13 +52,13 @@ int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *,
int __must_check bch_write_inode(struct cache_set *,
struct bch_inode_info *);
-void bch_fs_exit(void);
-int bch_fs_init(void);
+void bch_vfs_exit(void);
+int bch_vfs_init(void);
#else
-static inline void bch_fs_exit(void) {}
-static inline int bch_fs_init(void) { return 0; }
+static inline void bch_vfs_exit(void) {}
+static inline int bch_vfs_init(void) { return 0; }
#endif
diff --git a/libbcache/io.c b/libbcache/io.c
index be99a97..a3df379 100644
--- a/libbcache/io.c
+++ b/libbcache/io.c
@@ -722,9 +722,7 @@ void bch_wake_delayed_writes(unsigned long data)
spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
while ((op = c->write_wait_head)) {
- if (!test_bit(BCH_FS_RO, &c->flags) &&
- !test_bit(BCH_FS_STOPPING, &c->flags) &&
- time_after(op->expires, jiffies)) {
+ if (time_after(op->expires, jiffies)) {
mod_timer(&c->foreground_write_wakeup, op->expires);
break;
}
@@ -1068,9 +1066,7 @@ static void __bch_read_endio(struct cache_set *c, struct bch_read_bio *rbio)
return;
}
- if (rbio->promote &&
- !test_bit(BCH_FS_RO, &c->flags) &&
- !test_bit(BCH_FS_STOPPING, &c->flags)) {
+ if (rbio->promote) {
struct cache_promote_op *promote = rbio->promote;
struct closure *cl = &promote->cl;
@@ -1133,13 +1129,26 @@ static void bch_read_endio(struct bio *bio)
preempt_disable();
d = this_cpu_ptr(c->bio_decompress_worker);
llist_add(&rbio->list, &d->bio_list);
- queue_work(system_unbound_wq, &d->work);
+ queue_work(system_highpri_wq, &d->work);
preempt_enable();
} else {
__bch_read_endio(c, rbio);
}
}
+static bool should_promote(struct cache_set *c,
+ struct extent_pick_ptr *pick, unsigned flags)
+{
+ if (!(flags & BCH_READ_PROMOTE))
+ return false;
+
+ if (percpu_ref_is_dying(&c->writes))
+ return false;
+
+ return c->fastest_tier &&
+ c->fastest_tier < c->tiers + pick->ca->mi.tier;
+}
+
void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
struct bvec_iter iter, struct bkey_s_c k,
struct extent_pick_ptr *pick, unsigned flags)
@@ -1158,7 +1167,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
* XXX: multiple promotes can race with each other, wastefully. Keep a
* list of outstanding promotes?
*/
- if ((flags & BCH_READ_PROMOTE) && pick->ca->mi.tier) {
+ if (should_promote(c, pick, flags)) {
/*
* biovec needs to be big enough to hold decompressed data, if
* the bch_write_extent() has to decompress/recompress it:
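should_promote() above compares tiers by pointer: c->tiers is an array indexed by tier number, c->fastest_tier points at the first populated element (or is NULL when only one tier has devices), and a read is promoted only if it came from an element further along that array. The same test spelled out over an explicit index, with illustrative types:

#include <stdbool.h>

struct tier { int unused; };

struct fs {
	struct tier  tiers[4];
	struct tier *fastest_tier;	/* NULL if only one tier is populated */
};

/* promote iff a faster tier exists and the read came from a slower one */
static bool should_promote(struct fs *c, unsigned read_tier)
{
	return c->fastest_tier &&
	       c->fastest_tier < c->tiers + read_tier;
}
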
diff --git a/libbcache/journal.c b/libbcache/journal.c
index 99dd9f2..b283837 100644
--- a/libbcache/journal.c
+++ b/libbcache/journal.c
@@ -545,8 +545,7 @@ static int journal_entry_validate(struct cache_set *c,
return BCH_FSCK_UNKNOWN_VERSION;
}
- if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9 ||
- bytes > c->journal.entry_size_max, c,
+ if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c,
"journal entry too big (%zu bytes), sector %lluu",
bytes, sector)) {
/* XXX: note we might have missing journal entries */
@@ -1406,13 +1405,7 @@ void bch_journal_start(struct cache_set *c)
{
struct journal *j = &c->journal;
struct journal_seq_blacklist *bl;
- struct cache *ca;
u64 new_seq = 0;
- unsigned i;
-
- for_each_cache(ca, c, i)
- if (is_journal_device(ca))
- bch_dev_group_add(&c->journal.devs, ca);
list_for_each_entry(bl, &j->seq_blacklist, list)
new_seq = max(new_seq, bl->seq);
@@ -1534,48 +1527,111 @@ err:
return ret;
}
-static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
+static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca,
+ unsigned nr, bool write_super)
{
+ struct journal *j = &c->journal;
struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets =
- bch_sb_get_journal(ca->disk_sb.sb);
- struct bch_sb_field *f;
- u64 *p;
+ struct bch_sb_field_journal *journal_buckets;
+ struct disk_reservation disk_res = { 0, 0 };
+ struct closure cl;
+ u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+ int ret = 0;
- p = krealloc(ja->bucket_seq, nr * sizeof(u64),
- GFP_KERNEL|__GFP_ZERO);
- if (!p)
- return -ENOMEM;
+ closure_init_stack(&cl);
- ja->bucket_seq = p;
+ mutex_lock(&c->sb_lock);
- p = krealloc(ja->buckets, nr * sizeof(u64),
- GFP_KERNEL|__GFP_ZERO);
- if (!p)
- return -ENOMEM;
+ /* don't handle reducing nr of buckets yet: */
+ if (nr <= ja->nr)
+ goto err;
- ja->buckets = p;
+ /*
+ * note: journal buckets aren't really counted as _sectors_ used yet, so
+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+ * when space used goes up without a reservation - but we do need the
+ * reservation to ensure we'll actually be able to allocate:
+ */
- f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr +
- sizeof(*journal_buckets) / sizeof(u64));
- if (!f)
- return -ENOMEM;
- f->type = BCH_SB_FIELD_journal;
+ ret = -ENOSPC;
+ if (bch_disk_reservation_get(c, &disk_res,
+ (nr - ja->nr) << ca->bucket_bits, 0))
+ goto err;
- ja->nr = nr;
- return 0;
+ ret = -ENOMEM;
+ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+ if (!new_buckets || !new_bucket_seq)
+ goto err;
+
+ journal_buckets = bch_sb_resize_journal(&ca->disk_sb,
+ nr + sizeof(*journal_buckets) / sizeof(u64));
+ if (!journal_buckets)
+ goto err;
+
+ spin_lock(&j->lock);
+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
+ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
+ swap(new_buckets, ja->buckets);
+ swap(new_bucket_seq, ja->bucket_seq);
+
+ while (ja->nr < nr) {
+ /* must happen under journal lock, to avoid racing with gc: */
+ u64 b = bch_bucket_alloc(ca, RESERVE_NONE);
+ if (!b) {
+ if (!closure_wait(&c->freelist_wait, &cl)) {
+ spin_unlock(&j->lock);
+ closure_sync(&cl);
+ spin_lock(&j->lock);
+ }
+ continue;
+ }
+
+ bch_mark_metadata_bucket(ca, &ca->buckets[b],
+ BUCKET_JOURNAL, false);
+ bch_mark_alloc_bucket(ca, &ca->buckets[b], false);
+
+ memmove(ja->buckets + ja->last_idx + 1,
+ ja->buckets + ja->last_idx,
+ (ja->nr - ja->last_idx) * sizeof(u64));
+ memmove(ja->bucket_seq + ja->last_idx + 1,
+ ja->bucket_seq + ja->last_idx,
+ (ja->nr - ja->last_idx) * sizeof(u64));
+ memmove(journal_buckets->buckets + ja->last_idx + 1,
+ journal_buckets->buckets + ja->last_idx,
+ (ja->nr - ja->last_idx) * sizeof(u64));
+
+ ja->buckets[ja->last_idx] = b;
+ journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b);
+
+ if (ja->last_idx < ja->nr) {
+ if (ja->cur_idx >= ja->last_idx)
+ ja->cur_idx++;
+ ja->last_idx++;
+ }
+ ja->nr++;
+
+ }
+ spin_unlock(&j->lock);
+
+ BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi));
+
+ if (write_super)
+ bch_write_super(c);
+
+ ret = 0;
+err:
+ mutex_unlock(&c->sb_lock);
+
+ kfree(new_bucket_seq);
+ kfree(new_buckets);
+ bch_disk_reservation_put(c, &disk_res);
+
+ return ret;
}
int bch_dev_journal_alloc(struct cache *ca)
{
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
- int ret;
- unsigned i;
-
- if (ca->mi.tier != 0)
- return 0;
-
if (dynamic_fault("bcache:add:journal_alloc"))
return -ENOMEM;
@@ -1583,26 +1639,12 @@ int bch_dev_journal_alloc(struct cache *ca)
* clamp journal size to 1024 buckets or 512MB (in sectors), whichever
* is smaller:
*/
- ret = bch_set_nr_journal_buckets(ca,
+ return bch_set_nr_journal_buckets(ca->set, ca,
clamp_t(unsigned, ca->mi.nbuckets >> 8,
BCH_JOURNAL_BUCKETS_MIN,
min(1 << 10,
- (1 << 20) / ca->mi.bucket_size)));
- if (ret)
- return ret;
-
- journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
-
- for (i = 0; i < ja->nr; i++) {
- u64 bucket = ca->mi.first_bucket + i;
-
- ja->buckets[i] = bucket;
- journal_buckets->buckets[i] = cpu_to_le64(bucket);
-
- bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true);
- }
-
- return 0;
+ (1 << 20) / ca->mi.bucket_size)),
+ false);
}
/* Journalling */
@@ -1726,14 +1768,12 @@ void bch_journal_pin_add_if_older(struct journal *j,
fifo_entry_idx(&j->pin, pin->pin_list))) {
if (journal_pin_active(pin))
__journal_pin_drop(j, pin);
- __journal_pin_add(j, src_pin->pin_list,
- pin, NULL);
+ __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
}
spin_unlock_irq(&j->pin_lock);
}
-
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 seq_to_flush)
{
@@ -1766,6 +1806,29 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush)
return ret;
}
+static bool journal_has_pins(struct journal *j)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ journal_reclaim_fast(j);
+ ret = fifo_used(&j->pin) > 1 ||
+ atomic_read(&fifo_peek_front(&j->pin).count) > 1;
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+void bch_journal_flush_pins(struct journal *j)
+{
+ struct journal_entry_pin *pin;
+
+ while ((pin = journal_get_next_pin(j, U64_MAX)))
+ pin->flush(j, pin);
+
+ wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j));
+}
+
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
@@ -1895,8 +1958,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
struct cache_set *c = container_of(j, struct cache_set, journal);
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
struct bch_extent_ptr *ptr;
+ struct journal_device *ja;
struct cache *ca;
- unsigned iter, replicas, replicas_want =
+ bool swapped;
+ unsigned i, replicas, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
spin_lock(&j->lock);
@@ -1921,12 +1986,27 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
replicas = bch_extent_nr_ptrs(e.c);
+ spin_lock(&j->devs.lock);
+
+ /* Sort by tier: */
+ do {
+ swapped = false;
+
+ for (i = 0; i + 1 < j->devs.nr; i++)
+ if (j->devs.d[i + 0].dev->mi.tier >
+ j->devs.d[i + 1].dev->mi.tier) {
+ swap(j->devs.d[i], j->devs.d[i + 1]);
+ swapped = true;
+ }
+ } while (swapped);
+
/*
- * Determine location of the next journal write:
- * XXX: sort caches by free journal space
+ * Pick devices for next journal write:
+ * XXX: sort devices by free journal space?
*/
- group_for_each_cache_rcu(ca, &j->devs, iter) {
- struct journal_device *ja = &ca->journal;
+ for (i = 0; i < j->devs.nr; i++) {
+ ca = j->devs.d[i].dev;
+ ja = &ca->journal;
if (replicas >= replicas_want)
break;
@@ -1954,7 +2034,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
}
-
+ spin_unlock(&j->devs.lock);
rcu_read_unlock();
j->prev_buf_sectors = 0;
@@ -2468,50 +2548,6 @@ int bch_journal_flush(struct journal *j)
return bch_journal_flush_seq(j, seq);
}
-void bch_journal_free(struct journal *j)
-{
- unsigned order = get_order(j->entry_size_max);
-
- free_pages((unsigned long) j->buf[1].data, order);
- free_pages((unsigned long) j->buf[0].data, order);
- free_fifo(&j->pin);
-}
-
-int bch_journal_alloc(struct journal *j, unsigned entry_size_max)
-{
- static struct lock_class_key res_key;
- unsigned order = get_order(entry_size_max);
-
- spin_lock_init(&j->lock);
- spin_lock_init(&j->pin_lock);
- init_waitqueue_head(&j->wait);
- INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
- mutex_init(&j->blacklist_lock);
- INIT_LIST_HEAD(&j->seq_blacklist);
- spin_lock_init(&j->devs.lock);
- mutex_init(&j->reclaim_lock);
-
- lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
-
- j->entry_size_max = entry_size_max;
- j->write_delay_ms = 100;
- j->reclaim_delay_ms = 100;
-
- bkey_extent_init(&j->key);
-
- atomic64_set(&j->reservations.counter,
- ((union journal_res_state)
- { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
-
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
- !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
- return -ENOMEM;
-
- return 0;
-}
-
ssize_t bch_journal_print_debug(struct journal *j, char *buf)
{
union journal_res_state *s = &j->reservations;
@@ -2643,13 +2679,31 @@ int bch_journal_move(struct cache *ca)
return ret;
}
-void bch_journal_free_cache(struct cache *ca)
+void bch_fs_journal_stop(struct journal *j)
+{
+ if (!test_bit(JOURNAL_STARTED, &j->flags))
+ return;
+
+ /*
+ * Empty out the journal by first flushing everything pinning existing
+ * journal entries, then force a brand new empty journal entry to be
+ * written:
+ */
+ bch_journal_flush_pins(j);
+ bch_journal_flush_async(j, NULL);
+ bch_journal_meta(j);
+
+ cancel_delayed_work_sync(&j->write_work);
+ cancel_delayed_work_sync(&j->reclaim_work);
+}
+
+void bch_dev_journal_exit(struct cache *ca)
{
kfree(ca->journal.buckets);
kfree(ca->journal.bucket_seq);
}
-int bch_journal_init_cache(struct cache *ca)
+int bch_dev_journal_init(struct cache *ca)
{
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
@@ -2679,3 +2733,47 @@ int bch_journal_init_cache(struct cache *ca)
return 0;
}
+
+void bch_fs_journal_exit(struct journal *j)
+{
+ unsigned order = get_order(j->entry_size_max);
+
+ free_pages((unsigned long) j->buf[1].data, order);
+ free_pages((unsigned long) j->buf[0].data, order);
+ free_fifo(&j->pin);
+}
+
+int bch_fs_journal_init(struct journal *j, unsigned entry_size_max)
+{
+ static struct lock_class_key res_key;
+ unsigned order = get_order(entry_size_max);
+
+ spin_lock_init(&j->lock);
+ spin_lock_init(&j->pin_lock);
+ init_waitqueue_head(&j->wait);
+ INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+ INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
+ mutex_init(&j->blacklist_lock);
+ INIT_LIST_HEAD(&j->seq_blacklist);
+ spin_lock_init(&j->devs.lock);
+ mutex_init(&j->reclaim_lock);
+
+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+ j->entry_size_max = entry_size_max;
+ j->write_delay_ms = 100;
+ j->reclaim_delay_ms = 100;
+
+ bkey_extent_init(&j->key);
+
+ atomic64_set(&j->reservations.counter,
+ ((union journal_res_state)
+ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
+ !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
+ return -ENOMEM;
+
+ return 0;
+}
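The journal hunks above split lifetime management into filesystem-wide (bch_fs_journal_init/exit/stop) and per-device (bch_dev_journal_init/exit) halves. A minimal sketch of the setup and teardown order these names imply; the wrapper functions and error handling are illustrative only, not code from this patch:

/*
 * Sketch only: rough bring-up/teardown order implied by the renamed
 * journal hooks. The enclosing functions are hypothetical.
 */
static int example_journal_bringup(struct cache_set *c, struct cache *ca,
				   unsigned entry_size_max)
{
	int ret;

	/* filesystem-wide state: pin FIFO, write buffers, locks */
	ret = bch_fs_journal_init(&c->journal, entry_size_max);
	if (ret)
		return ret;

	/* per-device state: journal bucket arrays read from the superblock */
	ret = bch_dev_journal_init(ca);
	if (ret) {
		bch_fs_journal_exit(&c->journal);
		return ret;
	}

	return 0;
}

static void example_journal_teardown(struct cache_set *c, struct cache *ca)
{
	/* flush pins and write one final empty entry before tearing down */
	bch_fs_journal_stop(&c->journal);
	bch_dev_journal_exit(ca);
	bch_fs_journal_exit(&c->journal);
}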
diff --git a/libbcache/journal.h b/libbcache/journal.h
index 02a6e67..d3a1db0 100644
--- a/libbcache/journal.h
+++ b/libbcache/journal.h
@@ -111,7 +111,6 @@
#include <linux/hash.h>
#include "journal_types.h"
-//#include "super-io.h"
/*
* Only used for holding the journal entries we read in btree_journal_read()
@@ -136,6 +135,7 @@ void bch_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
journal_pin_flush_fn);
+void bch_journal_flush_pins(struct journal *);
struct closure;
struct cache_set;
@@ -330,11 +330,6 @@ static inline int bch_journal_error(struct journal *j)
? -EIO : 0;
}
-static inline bool is_journal_device(struct cache *ca)
-{
- return ca->mi.state == BCH_MEMBER_STATE_ACTIVE && ca->mi.tier == 0;
-}
-
static inline bool journal_flushes_device(struct cache *ca)
{
return true;
@@ -356,9 +351,6 @@ static inline void bch_journal_set_replay_done(struct journal *j)
spin_unlock(&j->lock);
}
-void bch_journal_free(struct journal *);
-int bch_journal_alloc(struct journal *, unsigned);
-
ssize_t bch_journal_print_debug(struct journal *, char *);
int bch_dev_journal_alloc(struct cache *);
@@ -372,7 +364,10 @@ static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j)
int bch_journal_move(struct cache *);
-void bch_journal_free_cache(struct cache *);
-int bch_journal_init_cache(struct cache *);
+void bch_fs_journal_stop(struct journal *);
+void bch_dev_journal_exit(struct cache *);
+int bch_dev_journal_init(struct cache *);
+void bch_fs_journal_exit(struct journal *);
+int bch_fs_journal_init(struct journal *, unsigned);
#endif /* _BCACHE_JOURNAL_H */
diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c
index e40dfbc..27f5c63 100644
--- a/libbcache/movinggc.c
+++ b/libbcache/movinggc.c
@@ -191,7 +191,7 @@ static void bch_moving_gc(struct cache *ca)
}
if (g->mark.owned_by_allocator ||
- g->mark.is_metadata)
+ g->mark.data_type != BUCKET_DATA)
continue;
sectors_used = bucket_sectors_used(g);
@@ -258,18 +258,21 @@ static int bch_moving_gc_thread(void *arg)
return 0;
}
-void bch_moving_init_cache(struct cache *ca)
+void bch_moving_gc_stop(struct cache *ca)
{
- bch_pd_controller_init(&ca->moving_gc_pd);
- ca->moving_gc_pd.d_term = 0;
+ ca->moving_gc_pd.rate.rate = UINT_MAX;
+ bch_ratelimit_reset(&ca->moving_gc_pd.rate);
+
+ if (ca->moving_gc_read)
+ kthread_stop(ca->moving_gc_read);
+ ca->moving_gc_read = NULL;
}
-int bch_moving_gc_thread_start(struct cache *ca)
+int bch_moving_gc_start(struct cache *ca)
{
struct task_struct *t;
- /* The moving gc read thread must be stopped */
- BUG_ON(ca->moving_gc_read != NULL);
+ BUG_ON(ca->moving_gc_read);
if (ca->set->opts.nochanges)
return 0;
@@ -287,12 +290,8 @@ int bch_moving_gc_thread_start(struct cache *ca)
return 0;
}
-void bch_moving_gc_stop(struct cache *ca)
+void bch_dev_moving_gc_init(struct cache *ca)
{
- ca->moving_gc_pd.rate.rate = UINT_MAX;
- bch_ratelimit_reset(&ca->moving_gc_pd.rate);
-
- if (ca->moving_gc_read)
- kthread_stop(ca->moving_gc_read);
- ca->moving_gc_read = NULL;
+ bch_pd_controller_init(&ca->moving_gc_pd);
+ ca->moving_gc_pd.d_term = 0;
}
diff --git a/libbcache/movinggc.h b/libbcache/movinggc.h
index 5f15308..e8ae95e 100644
--- a/libbcache/movinggc.h
+++ b/libbcache/movinggc.h
@@ -23,8 +23,8 @@
#define COPYGC_SECTORS_PER_ITER(ca) \
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
-void bch_moving_init_cache(struct cache *);
void bch_moving_gc_stop(struct cache *);
-int bch_moving_gc_thread_start(struct cache *);
+int bch_moving_gc_start(struct cache *);
+void bch_dev_moving_gc_init(struct cache *);
#endif
diff --git a/libbcache/opts.h b/libbcache/opts.h
index 95184db..9b10310 100644
--- a/libbcache/opts.h
+++ b/libbcache/opts.h
@@ -86,11 +86,17 @@ enum opt_type {
BCH_OPT(noreplay, 0444, NO_SB_OPT, \
s8, OPT_BOOL()) \
BCH_OPT(norecovery, 0444, NO_SB_OPT, \
- s8, OPT_BOOL())
+ s8, OPT_BOOL()) \
+ BCH_OPT(noexcl, 0444, NO_SB_OPT, \
+ s8, OPT_BOOL()) \
+ BCH_OPT(sb, 0444, NO_SB_OPT, \
+ s64, OPT_UINT(0, S64_MAX)) \
#define BCH_OPTS() \
BCH_OPT(read_only, 0444, NO_SB_OPT, \
s8, OPT_BOOL()) \
+ BCH_OPT(nostart, 0444, NO_SB_OPT, \
+ s8, OPT_BOOL()) \
BCH_VISIBLE_OPTS()
struct bch_opts {
@@ -145,6 +151,8 @@ static inline void bch_opts_apply(struct bch_opts *dst, struct bch_opts src)
#undef BCH_OPT
}
+#define opt_defined(_opt) ((_opt) >= 0)
+
void bch_opt_set(struct bch_opts *, enum bch_opt_id, u64);
struct bch_opts bch_sb_opts(struct bch_sb *);
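The new opt_defined() macro treats negative option values as "not set", which is what lets the sb and noexcl options above default sensibly. A small sketch of how a hypothetical caller might combine them with bch_read_super(); example_read_super_at() and the sector value are made up for illustration:

/*
 * Sketch only: open a device non-exclusively and read its superblock
 * from a non-default offset, relying on opt_defined() semantics.
 */
static const char *example_read_super_at(struct bcache_superblock *sb,
					 const char *path)
{
	/* bch_opts_empty() presumably leaves every option undefined (< 0) */
	struct bch_opts opts = bch_opts_empty();

	opts.sb     = 4096;	/* superblock offset in sectors (default is BCH_SB_SECTOR) */
	opts.noexcl = true;	/* skip FMODE_EXCL when opening the block device */

	return bch_read_super(sb, opts, path);
}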
diff --git a/libbcache/super-io.c b/libbcache/super-io.c
index be27d3e..f50a5ee 100644
--- a/libbcache/super-io.c
+++ b/libbcache/super-io.c
@@ -10,6 +10,7 @@
#include "vstructs.h"
#include <linux/backing-dev.h>
+#include <linux/sort.h>
static inline void __bch_sb_layout_size_assert(void)
{
@@ -17,7 +18,7 @@ static inline void __bch_sb_layout_size_assert(void)
}
struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb,
- enum bch_sb_field_types type)
+ enum bch_sb_field_type type)
{
struct bch_sb_field *f;
@@ -34,7 +35,7 @@ void bch_free_super(struct bcache_superblock *sb)
if (sb->bio)
bio_put(sb->bio);
if (!IS_ERR_OR_NULL(sb->bdev))
- blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ blkdev_put(sb->bdev, sb->mode);
free_pages((unsigned long) sb->sb, sb->page_order);
memset(sb, 0, sizeof(*sb));
@@ -74,7 +75,7 @@ static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
return 0;
}
-int bch_dev_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
+static int bch_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
{
u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
@@ -140,13 +141,29 @@ static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb,
le32_add_cpu(&sb->u64s, u64s - old_u64s);
return f;
+}
+
+struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *sb,
+ enum bch_sb_field_type type,
+ unsigned u64s)
+{
+ struct bch_sb_field *f = bch_sb_field_get(sb->sb, type);
+ ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+ ssize_t d = -old_u64s + u64s;
+ if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+ return NULL;
+
+ f = __bch_sb_field_resize(sb->sb, f, u64s);
+ f->type = type;
+ return f;
}
struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c,
- struct bch_sb_field *f,
+ enum bch_sb_field_type type,
unsigned u64s)
{
+ struct bch_sb_field *f = bch_sb_field_get(c->disk_sb, type);
ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
ssize_t d = -old_u64s + u64s;
struct cache *ca;
@@ -160,26 +177,15 @@ struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c,
for_each_cache(ca, c, i) {
struct bcache_superblock *sb = &ca->disk_sb;
- if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+ if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
percpu_ref_put(&ca->ref);
return NULL;
}
}
- return __bch_sb_field_resize(c->disk_sb, f, u64s);
-}
-
-struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *sb,
- struct bch_sb_field *f,
- unsigned u64s)
-{
- ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
- ssize_t d = -old_u64s + u64s;
-
- if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
- return NULL;
-
- return __bch_sb_field_resize(sb->sb, f, u64s);
+ f = __bch_sb_field_resize(c->disk_sb, f, u64s);
+ f->type = type;
+ return f;
}
static const char *validate_sb_layout(struct bch_sb_layout *layout)
@@ -203,9 +209,6 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
prev_offset = le64_to_cpu(layout->sb_offset[0]);
- if (prev_offset != BCH_SB_SECTOR)
- return "Invalid superblock layout: doesn't have default superblock location";
-
for (i = 1; i < layout->nr_superblocks; i++) {
offset = le64_to_cpu(layout->sb_offset[i]);
@@ -217,16 +220,70 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
return NULL;
}
+static int u64_cmp(const void *_l, const void *_r)
+{
+ u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
+
+ return l < r ? -1 : l > r ? 1 : 0;
+}
+
+const char *bch_validate_journal_layout(struct bch_sb *sb,
+ struct cache_member_cpu mi)
+{
+ struct bch_sb_field_journal *journal;
+ const char *err;
+ unsigned nr;
+ unsigned i;
+ u64 *b;
+
+ journal = bch_sb_get_journal(sb);
+ if (!journal)
+ return NULL;
+
+ nr = bch_nr_journal_buckets(journal);
+ if (!nr)
+ return NULL;
+
+ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
+ if (!b)
+ return "cannot allocate memory";
+
+ for (i = 0; i < nr; i++)
+ b[i] = le64_to_cpu(journal->buckets[i]);
+
+ sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+ err = "journal bucket at sector 0";
+ if (!b[0])
+ goto err;
+
+ err = "journal bucket before first bucket";
+ if (b[0] < mi.first_bucket)
+ goto err;
+
+ err = "journal bucket past end of device";
+ if (b[nr - 1] >= mi.nbuckets)
+ goto err;
+
+ err = "duplicate journal buckets";
+ for (i = 0; i + 1 < nr; i++)
+ if (b[i] == b[i + 1])
+ goto err;
+
+ err = NULL;
+err:
+ kfree(b);
+ return err;
+}
+
const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field *f;
struct bch_sb_field_members *sb_mi;
- struct bch_sb_field_journal *journal;
struct cache_member_cpu mi;
const char *err;
u16 block_size;
- unsigned i;
switch (le64_to_cpu(sb->version)) {
case BCACHE_SB_VERSION_CDEV_V4:
@@ -324,14 +381,6 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx);
- for (i = 0; i < sb->layout.nr_superblocks; i++) {
- u64 offset = le64_to_cpu(sb->layout.sb_offset[i]);
- u64 max_size = 1 << sb->layout.sb_max_size_bits;
-
- if (offset + max_size > mi.first_bucket * mi.bucket_size)
- return "Invalid superblock: first bucket comes before end of super";
- }
-
if (mi.nbuckets > LONG_MAX)
return "Too many buckets";
@@ -347,16 +396,9 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
mi.bucket_size * mi.nbuckets)
return "Invalid superblock: device too small";
- /* Validate journal buckets: */
- journal = bch_sb_get_journal(sb);
- if (journal) {
- for (i = 0; i < bch_nr_journal_buckets(journal); i++) {
- u64 b = le64_to_cpu(journal->buckets[i]);
-
- if (b < mi.first_bucket || b >= mi.nbuckets)
- return "bad journal bucket";
- }
- }
+ err = bch_validate_journal_layout(sb, mi);
+ if (err)
+ return err;
return NULL;
}
@@ -382,19 +424,19 @@ static bool bch_is_open_cache(struct block_device *bdev)
static bool bch_is_open(struct block_device *bdev)
{
- lockdep_assert_held(&bch_register_lock);
+ bool ret;
+
+ mutex_lock(&bch_register_lock);
+ ret = bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+ mutex_unlock(&bch_register_lock);
- return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+ return ret;
}
-static const char *bch_blkdev_open(const char *path, void *holder,
- struct bch_opts opts,
- struct block_device **ret)
+static const char *bch_blkdev_open(const char *path, fmode_t mode,
+ void *holder, struct block_device **ret)
{
struct block_device *bdev;
- fmode_t mode = opts.nochanges > 0
- ? FMODE_READ
- : FMODE_READ|FMODE_WRITE|FMODE_EXCL;
const char *err;
*ret = NULL;
@@ -548,7 +590,7 @@ int bch_sb_from_cache_set(struct cache_set *c, struct cache *ca)
unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
int ret;
- ret = bch_dev_sb_realloc(&ca->disk_sb, u64s);
+ ret = bch_sb_realloc(&ca->disk_sb, u64s);
if (ret)
return ret;
@@ -567,7 +609,7 @@ static const char *read_one_super(struct bcache_superblock *sb, u64 offset)
reread:
bio_reset(sb->bio);
sb->bio->bi_bdev = sb->bdev;
- sb->bio->bi_iter.bi_sector = BCH_SB_SECTOR;
+ sb->bio->bi_iter.bi_sector = offset;
sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
bch_bio_map(sb->bio, sb->sb);
@@ -610,15 +652,21 @@ const char *bch_read_super(struct bcache_superblock *sb,
struct bch_opts opts,
const char *path)
{
+ u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR;
struct bch_sb_layout layout;
const char *err;
unsigned i;
- lockdep_assert_held(&bch_register_lock);
-
memset(sb, 0, sizeof(*sb));
+ sb->mode = FMODE_READ;
+
+ if (!(opt_defined(opts.noexcl) && opts.noexcl))
+ sb->mode |= FMODE_EXCL;
- err = bch_blkdev_open(path, &sb, opts, &sb->bdev);
+ if (!(opt_defined(opts.nochanges) && opts.nochanges))
+ sb->mode |= FMODE_WRITE;
+
+ err = bch_blkdev_open(path, sb->mode, sb, &sb->bdev);
if (err)
return err;
@@ -630,11 +678,16 @@ const char *bch_read_super(struct bcache_superblock *sb,
if (bch_fs_init_fault("read_super"))
goto err;
- err = read_one_super(sb, BCH_SB_SECTOR);
+ err = read_one_super(sb, offset);
if (!err)
goto got_super;
- pr_err("error reading default super: %s", err);
+ if (offset != BCH_SB_SECTOR) {
+ pr_err("error reading superblock: %s", err);
+ goto err;
+ }
+
+ pr_err("error reading default superblock: %s", err);
/*
* Error reading primary superblock - read location of backup
@@ -747,6 +800,9 @@ void bch_write_super(struct cache_set *c)
lockdep_assert_held(&c->sb_lock);
+ if (c->opts.nochanges)
+ return;
+
closure_init_stack(cl);
le64_add_cpu(&c->disk_sb->seq, 1);
diff --git a/libbcache/super-io.h b/libbcache/super-io.h
index 665de81..ae1e8b9 100644
--- a/libbcache/super-io.h
+++ b/libbcache/super-io.h
@@ -6,16 +6,35 @@
#include <asm/byteorder.h>
-struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_types);
-
-#define BCH_SB_FIELD_TYPE(_name) \
-static inline struct bch_sb_field_##_name * \
-bch_sb_get_##_name(struct bch_sb *sb) \
-{ \
- struct bch_sb_field *f = \
- bch_sb_field_get(sb, BCH_SB_FIELD_##_name); \
- \
- return container_of_or_null(f, struct bch_sb_field_##_name, field);\
+struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
+struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *,
+ enum bch_sb_field_type, unsigned);
+struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *,
+ enum bch_sb_field_type, unsigned);
+
+#define field_to_type(_f, _name) \
+ container_of_or_null(_f, struct bch_sb_field_##_name, field)
+
+#define BCH_SB_FIELD_TYPE(_name) \
+static inline struct bch_sb_field_##_name * \
+bch_sb_get_##_name(struct bch_sb *sb) \
+{ \
+ return field_to_type(bch_sb_field_get(sb, \
+ BCH_SB_FIELD_##_name), _name); \
+} \
+ \
+static inline struct bch_sb_field_##_name * \
+bch_sb_resize_##_name(struct bcache_superblock *sb, unsigned u64s) \
+{ \
+ return field_to_type(bch_sb_field_resize(sb, \
+ BCH_SB_FIELD_##_name, u64s), _name); \
+} \
+ \
+static inline struct bch_sb_field_##_name * \
+bch_fs_sb_resize_##_name(struct cache_set *c, unsigned u64s) \
+{ \
+ return field_to_type(bch_fs_sb_field_resize(c, \
+ BCH_SB_FIELD_##_name, u64s), _name); \
}
BCH_SB_FIELD_TYPE(journal);
@@ -85,14 +104,11 @@ int bch_fs_mi_update(struct cache_set *, struct bch_member *, unsigned);
int bch_sb_to_cache_set(struct cache_set *, struct bch_sb *);
int bch_sb_from_cache_set(struct cache_set *, struct cache *);
-struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *,
- struct bch_sb_field *, unsigned);
-struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *,
- struct bch_sb_field *, unsigned);
-
void bch_free_super(struct bcache_superblock *);
int bch_super_realloc(struct bcache_superblock *, unsigned);
+const char *bch_validate_journal_layout(struct bch_sb *,
+ struct cache_member_cpu);
const char *bch_validate_cache_super(struct bcache_superblock *);
const char *bch_read_super(struct bcache_superblock *,
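BCH_SB_FIELD_TYPE() now emits three typed accessors per superblock field instead of one. For the members field (assuming a BCH_SB_FIELD_TYPE(members) invocation elsewhere in this header, which the super.c hunks below rely on), the expansion comes out to roughly:

static inline struct bch_sb_field_members *
bch_sb_get_members(struct bch_sb *sb)
{
	return field_to_type(bch_sb_field_get(sb, BCH_SB_FIELD_members), members);
}

static inline struct bch_sb_field_members *
bch_sb_resize_members(struct bcache_superblock *sb, unsigned u64s)
{
	return field_to_type(bch_sb_field_resize(sb, BCH_SB_FIELD_members, u64s), members);
}

static inline struct bch_sb_field_members *
bch_fs_sb_resize_members(struct cache_set *c, unsigned u64s)
{
	return field_to_type(bch_fs_sb_field_resize(c, BCH_SB_FIELD_members, u64s), members);
}

bch_dev_add() below uses the last two of these to grow the member table in both the filesystem-wide and the per-device superblock copies.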
diff --git a/libbcache/super.c b/libbcache/super.c
index fab3480..5535639 100644
--- a/libbcache/super.c
+++ b/libbcache/super.c
@@ -69,7 +69,7 @@ static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
struct workqueue_struct *bcache_io_wq;
struct crypto_shash *bch_sha256;
-static void bch_dev_stop(struct cache *);
+static void bch_dev_free(struct cache *);
static int bch_dev_online(struct cache *);
static int bch_congested_fn(void *data, int bdi_bits)
@@ -92,8 +92,11 @@ static int bch_congested_fn(void *data, int bdi_bits)
}
}
} else {
- /* Writes only go to tier 0: */
- group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
+ /* Writes prefer fastest tier: */
+ struct bch_tier *tier = READ_ONCE(c->fastest_tier);
+ struct cache_group *grp = tier ? &tier->devs : &c->cache_all;
+
+ group_for_each_cache_rcu(ca, grp, i) {
bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
if (bdi_congested(bdi, bdi_bits)) {
@@ -107,7 +110,7 @@ static int bch_congested_fn(void *data, int bdi_bits)
return ret;
}
-/* Cache set RO/RW: */
+/* Filesystem RO/RW: */
/*
* For startup/shutdown of RW stuff, the dependencies are:
@@ -129,9 +132,7 @@ static void __bch_fs_read_only(struct cache_set *c)
struct cache *ca;
unsigned i;
- c->tiering_pd.rate.rate = UINT_MAX;
- bch_ratelimit_reset(&c->tiering_pd.rate);
- bch_tiering_read_stop(c);
+ bch_tiering_stop(c);
for_each_cache(ca, c, i)
bch_moving_gc_stop(ca);
@@ -143,20 +144,7 @@ static void __bch_fs_read_only(struct cache_set *c)
for_each_cache(ca, c, i)
bch_dev_allocator_stop(ca);
- /*
- * Write a journal entry after flushing the btree, so we don't end up
- * replaying everything we just flushed:
- */
- if (test_bit(JOURNAL_STARTED, &c->journal.flags)) {
- int ret;
-
- bch_journal_flush_async(&c->journal, NULL);
- ret = bch_journal_meta(&c->journal);
- BUG_ON(ret && !bch_journal_error(&c->journal));
- }
-
- cancel_delayed_work_sync(&c->journal.write_work);
- cancel_delayed_work_sync(&c->journal.reclaim_work);
+ bch_fs_journal_stop(&c->journal);
}
static void bch_writes_disabled(struct percpu_ref *writes)
@@ -167,12 +155,27 @@ static void bch_writes_disabled(struct percpu_ref *writes)
wake_up(&bch_read_only_wait);
}
-static void bch_fs_read_only_work(struct work_struct *work)
+void bch_fs_read_only(struct cache_set *c)
{
- struct cache_set *c =
- container_of(work, struct cache_set, read_only_work);
+ mutex_lock(&c->state_lock);
+ if (c->state != BCH_FS_STARTING &&
+ c->state != BCH_FS_RW)
+ goto out;
+
+ if (test_bit(BCH_FS_ERROR, &c->flags))
+ goto out;
- percpu_ref_put(&c->writes);
+ trace_fs_read_only(c);
+
+ /*
+ * Block new foreground-end write operations from starting - any new
+ * writes will return -EROFS:
+ *
+ * (This is really blocking new _allocations_, writes to previously
+ * allocated space can still happen until stopping the allocator in
+ * bch_dev_allocator_stop()).
+ */
+ percpu_ref_kill(&c->writes);
del_timer(&c->foreground_write_wakeup);
cancel_delayed_work(&c->pd_controllers_update);
@@ -180,98 +183,77 @@ static void bch_fs_read_only_work(struct work_struct *work)
c->foreground_write_pd.rate.rate = UINT_MAX;
bch_wake_delayed_writes((unsigned long) c);
- if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
- /*
- * If we're not doing an emergency shutdown, we want to wait on
- * outstanding writes to complete so they don't see spurious
- * errors due to shutting down the allocator:
- */
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ /*
+ * If we're not doing an emergency shutdown, we want to wait on
+ * outstanding writes to complete so they don't see spurious errors due
+ * to shutting down the allocator:
+ *
+ * If we are doing an emergency shutdown outstanding writes may
+ * hang until we shutdown the allocator so we don't want to wait
+ * on outstanding writes before shutting everything down - but
+ * we do need to wait on them before returning and signalling
+ * that going RO is complete:
+ */
+ wait_event(bch_read_only_wait,
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
+ test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
- __bch_fs_read_only(c);
+ __bch_fs_read_only(c);
- if (!bch_journal_error(&c->journal) &&
- !test_bit(BCH_FS_ERROR, &c->flags)) {
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_CLEAN(c->disk_sb, true);
- bch_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
- } else {
- /*
- * If we are doing an emergency shutdown outstanding writes may
- * hang until we shutdown the allocator so we don't want to wait
- * on outstanding writes before shutting everything down - but
- * we do need to wait on them before returning and signalling
- * that going RO is complete:
- */
- __bch_fs_read_only(c);
+ wait_event(bch_read_only_wait,
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ if (!bch_journal_error(&c->journal) &&
+ !test_bit(BCH_FS_ERROR, &c->flags)) {
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_CLEAN(c->disk_sb, true);
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
}
+ c->state = BCH_FS_RO;
bch_notify_fs_read_only(c);
trace_fs_read_only_done(c);
-
- set_bit(BCH_FS_RO_COMPLETE, &c->flags);
- wake_up(&bch_read_only_wait);
+out:
+ mutex_unlock(&c->state_lock);
}
-bool bch_fs_read_only(struct cache_set *c)
+static void bch_fs_read_only_work(struct work_struct *work)
{
- if (test_and_set_bit(BCH_FS_RO, &c->flags))
- return false;
-
- trace_fs_read_only(c);
-
- percpu_ref_get(&c->writes);
+ struct cache_set *c =
+ container_of(work, struct cache_set, read_only_work);
- /*
- * Block new foreground-end write operations from starting - any new
- * writes will return -EROFS:
- *
- * (This is really blocking new _allocations_, writes to previously
- * allocated space can still happen until stopping the allocator in
- * bch_dev_allocator_stop()).
- */
- percpu_ref_kill(&c->writes);
+ bch_fs_read_only(c);
+}
- queue_work(system_freezable_wq, &c->read_only_work);
- return true;
+static void bch_fs_read_only_async(struct cache_set *c)
+{
+ queue_work(system_long_wq, &c->read_only_work);
}
bool bch_fs_emergency_read_only(struct cache_set *c)
{
bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
- bch_fs_read_only(c);
+ bch_fs_read_only_async(c);
bch_journal_halt(&c->journal);
wake_up(&bch_read_only_wait);
return ret;
}
-void bch_fs_read_only_sync(struct cache_set *c)
-{
- /* so we don't race with bch_fs_read_write() */
- lockdep_assert_held(&bch_register_lock);
-
- bch_fs_read_only(c);
-
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_RO_COMPLETE, &c->flags) &&
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
-}
-
-static const char *__bch_fs_read_write(struct cache_set *c)
+const char *bch_fs_read_write(struct cache_set *c)
{
struct cache *ca;
- const char *err;
+ const char *err = NULL;
unsigned i;
- lockdep_assert_held(&bch_register_lock);
+ mutex_lock(&c->state_lock);
+ if (c->state != BCH_FS_STARTING &&
+ c->state != BCH_FS_RO)
+ goto out;
err = "error starting allocator thread";
for_each_cache(ca, c, i)
@@ -285,67 +267,43 @@ static const char *__bch_fs_read_write(struct cache_set *c)
if (bch_gc_thread_start(c))
goto err;
- for_each_cache(ca, c, i) {
- if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
- continue;
-
- err = "error starting moving GC thread";
- if (bch_moving_gc_thread_start(ca)) {
+ err = "error starting moving GC thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
+ bch_moving_gc_start(ca)) {
percpu_ref_put(&ca->ref);
goto err;
}
- }
err = "error starting tiering thread";
- if (bch_tiering_read_start(c))
+ if (bch_tiering_start(c))
goto err;
schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
- return NULL;
+ if (c->state != BCH_FS_STARTING)
+ percpu_ref_reinit(&c->writes);
+
+ c->state = BCH_FS_RW;
+ err = NULL;
+out:
+ mutex_unlock(&c->state_lock);
+ return err;
err:
__bch_fs_read_only(c);
- return err;
-}
-
-const char *bch_fs_read_write(struct cache_set *c)
-{
- const char *err;
-
- lockdep_assert_held(&bch_register_lock);
-
- if (!test_bit(BCH_FS_RO_COMPLETE, &c->flags))
- return NULL;
-
- err = __bch_fs_read_write(c);
- if (err)
- return err;
-
- percpu_ref_reinit(&c->writes);
-
- clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- clear_bit(BCH_FS_EMERGENCY_RO, &c->flags);
- clear_bit(BCH_FS_RO_COMPLETE, &c->flags);
- clear_bit(BCH_FS_RO, &c->flags);
- return NULL;
+ goto out;
}
-/* Cache set startup/shutdown: */
+/* Filesystem startup/shutdown: */
static void bch_fs_free(struct cache_set *c)
{
- del_timer_sync(&c->foreground_write_wakeup);
- cancel_delayed_work_sync(&c->pd_controllers_update);
- cancel_work_sync(&c->read_only_work);
- cancel_work_sync(&c->bio_submit_work);
- cancel_work_sync(&c->read_retry_work);
-
- bch_fs_encryption_free(c);
- bch_btree_cache_free(c);
- bch_journal_free(&c->journal);
+ bch_fs_encryption_exit(c);
+ bch_fs_btree_exit(c);
+ bch_fs_journal_exit(&c->journal);
bch_io_clock_exit(&c->io_clock[WRITE]);
bch_io_clock_exit(&c->io_clock[READ]);
- bch_compress_free(c);
+ bch_fs_compress_exit(c);
bch_fs_blockdev_exit(c);
bdi_destroy(&c->bdi);
lg_lock_free(&c->bucket_stats_lock);
@@ -372,6 +330,52 @@ static void bch_fs_free(struct cache_set *c)
module_put(THIS_MODULE);
}
+static void bch_fs_exit(struct cache_set *c)
+{
+ unsigned i;
+
+ del_timer_sync(&c->foreground_write_wakeup);
+ cancel_delayed_work_sync(&c->pd_controllers_update);
+ cancel_work_sync(&c->read_only_work);
+ cancel_work_sync(&c->bio_submit_work);
+ cancel_work_sync(&c->read_retry_work);
+
+ for (i = 0; i < c->sb.nr_devices; i++)
+ if (c->cache[i])
+ bch_dev_free(c->cache[i]);
+
+ closure_debug_destroy(&c->cl);
+ kobject_put(&c->kobj);
+}
+
+static void bch_fs_offline(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ mutex_lock(&bch_register_lock);
+ list_del(&c->list);
+ mutex_unlock(&bch_register_lock);
+
+ if (c->kobj.state_in_sysfs)
+ kobject_del(&c->kobj);
+
+ for_each_cache(ca, c, i)
+ if (ca->kobj.state_in_sysfs)
+ kobject_del(&ca->kobj);
+
+ bch_fs_debug_exit(c);
+ bch_fs_chardev_exit(c);
+
+ bch_cache_accounting_destroy(&c->accounting);
+
+ kobject_put(&c->time_stats);
+ kobject_put(&c->opts_dir);
+ kobject_put(&c->internal);
+
+ __bch_fs_read_only(c);
+}
+
/*
* should be __bch_fs_stop4 - block devices are closed, now we can finally
* free it
@@ -379,15 +383,9 @@ static void bch_fs_free(struct cache_set *c)
void bch_fs_release(struct kobject *kobj)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
- struct completion *stop_completion = c->stop_completion;
bch_notify_fs_stopped(c);
- bch_info(c, "stopped");
-
bch_fs_free(c);
-
- if (stop_completion)
- complete(stop_completion);
}
/*
@@ -396,18 +394,8 @@ void bch_fs_release(struct kobject *kobj)
static void __bch_fs_stop3(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, cl);
- struct cache *ca;
- unsigned i;
- mutex_lock(&bch_register_lock);
- for_each_cache(ca, c, i)
- bch_dev_stop(ca);
-
- list_del(&c->list);
- mutex_unlock(&bch_register_lock);
-
- closure_debug_destroy(&c->cl);
- kobject_put(&c->kobj);
+ bch_fs_exit(c);
}
/*
@@ -418,28 +406,14 @@ static void __bch_fs_stop2(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
- bch_debug_exit_cache_set(c);
- bch_fs_chardev_exit(c);
-
- if (c->kobj.state_in_sysfs)
- kobject_del(&c->kobj);
-
- bch_cache_accounting_destroy(&c->accounting);
-
- kobject_put(&c->time_stats);
- kobject_put(&c->opts_dir);
- kobject_put(&c->internal);
-
- mutex_lock(&bch_register_lock);
- bch_fs_read_only_sync(c);
- mutex_unlock(&bch_register_lock);
+ bch_fs_offline(c);
closure_return(cl);
}
/*
- * First phase of the shutdown process that's kicked off by bch_fs_stop(); we
- * haven't waited for anything to stop yet, we're just punting to process
+ * First phase of the shutdown process that's kicked off by bch_fs_stop_async();
+ * we haven't waited for anything to stop yet, we're just punting to process
* context to shut down block devices:
*/
static void __bch_fs_stop1(struct closure *cl)
@@ -451,29 +425,42 @@ static void __bch_fs_stop1(struct closure *cl)
continue_at(cl, __bch_fs_stop2, system_wq);
}
-void bch_fs_stop(struct cache_set *c)
+void bch_fs_stop_async(struct cache_set *c)
{
- if (!test_and_set_bit(BCH_FS_STOPPING, &c->flags))
+ mutex_lock(&c->state_lock);
+ if (c->state != BCH_FS_STOPPING) {
+ c->state = BCH_FS_STOPPING;
closure_queue(&c->caching);
+ }
+ mutex_unlock(&c->state_lock);
}
-void bch_fs_stop_sync(struct cache_set *c)
+void bch_fs_stop(struct cache_set *c)
{
- DECLARE_COMPLETION_ONSTACK(complete);
+ mutex_lock(&c->state_lock);
+ BUG_ON(c->state == BCH_FS_STOPPING);
+ c->state = BCH_FS_STOPPING;
+ mutex_unlock(&c->state_lock);
+
+ bch_blockdevs_stop(c);
+
+ closure_sync(&c->caching);
+ closure_debug_destroy(&c->caching);
+
+ bch_fs_offline(c);
- c->stop_completion = &complete;
- bch_fs_stop(c);
closure_put(&c->cl);
+ closure_sync(&c->cl);
- /* Killable? */
- wait_for_completion(&complete);
+ bch_fs_exit(c);
+ kobject_put(&c->kobj);
}
/* Stop, detaching from backing devices: */
void bch_fs_detach(struct cache_set *c)
{
if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags))
- bch_fs_stop(c);
+ bch_fs_stop_async(c);
}
static unsigned bch_fs_nr_devices(struct cache_set *c)
@@ -520,6 +507,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->minor = -1;
+ mutex_init(&c->state_lock);
mutex_init(&c->sb_lock);
INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
mutex_init(&c->btree_cache_lock);
@@ -534,8 +522,8 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
BCH_TIME_STATS()
#undef BCH_TIME_STAT
- bch_open_buckets_init(c);
- bch_tiering_init_cache_set(c);
+ bch_fs_allocator_init(c);
+ bch_fs_tiering_init(c);
INIT_LIST_HEAD(&c->list);
INIT_LIST_HEAD(&c->cached_devs);
@@ -636,10 +624,10 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch_fs_blockdev_init(c) ||
bch_io_clock_init(&c->io_clock[READ]) ||
bch_io_clock_init(&c->io_clock[WRITE]) ||
- bch_journal_alloc(&c->journal, journal_entry_bytes) ||
- bch_btree_cache_alloc(c) ||
+ bch_fs_journal_init(&c->journal, journal_entry_bytes) ||
+ bch_fs_btree_init(c) ||
bch_fs_encryption_init(c) ||
- bch_compress_init(c) ||
+ bch_fs_compress_init(c) ||
bch_check_set_has_compressed_data(c, c->opts.compression))
goto err;
@@ -664,6 +652,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
closure_init(&c->caching, &c->cl);
set_closure_fn(&c->caching, __bch_fs_stop1, system_wq);
+ closure_get(&c->cl);
continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq);
return c;
err:
@@ -671,7 +660,20 @@ err:
return NULL;
}
-static int bch_fs_online(struct cache_set *c)
+static struct cache_set *bch_fs_lookup(uuid_le uuid)
+{
+ struct cache_set *c;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ list_for_each_entry(c, &bch_fs_list, list)
+ if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
+ return c;
+
+ return NULL;
+}
+
+static const char *__bch_fs_online(struct cache_set *c)
{
struct cache *ca;
unsigned i;
@@ -680,31 +682,58 @@ static int bch_fs_online(struct cache_set *c)
lockdep_assert_held(&bch_register_lock);
if (!list_empty(&c->list))
- return 0;
+ return NULL;
- list_add(&c->list, &bch_fs_list);
+ if (bch_fs_lookup(c->sb.uuid))
+ return "filesystem UUID already open";
ret = bch_fs_chardev_init(c);
if (ret)
- return ret;
+ return "error creating character device";
+
+ bch_fs_debug_init(c);
if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
kobject_add(&c->internal, &c->kobj, "internal") ||
kobject_add(&c->opts_dir, &c->kobj, "options") ||
kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
- return -1;
+ return "error creating sysfs objects";
for_each_cache(ca, c, i)
if (bch_dev_online(ca)) {
percpu_ref_put(&ca->ref);
- return -1;
+ return "error creating sysfs objects";
}
+ mutex_lock(&c->state_lock);
+
+ if (bch_blockdev_volumes_start(c)) {
+ mutex_unlock(&c->state_lock);
+ return "can't bring up blockdev volumes";
+ }
+
+ bch_attach_backing_devs(c);
+
+ mutex_unlock(&c->state_lock);
+
+ list_add(&c->list, &bch_fs_list);
+
return 0;
}
-static const char *bch_fs_start(struct cache_set *c)
+static const char *bch_fs_online(struct cache_set *c)
+{
+ const char *err;
+
+ mutex_lock(&bch_register_lock);
+ err = __bch_fs_online(c);
+ mutex_unlock(&bch_register_lock);
+
+ return err;
+}
+
+static const char *__bch_fs_start(struct cache_set *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
@@ -715,11 +744,7 @@ static const char *bch_fs_start(struct cache_set *c)
struct jset *j;
int ret = -EINVAL;
- lockdep_assert_held(&bch_register_lock);
- BUG_ON(test_bit(BCH_FS_RUNNING, &c->flags));
-
- /* We don't want bch_fatal_error() to free underneath us */
- closure_get(&c->caching);
+ BUG_ON(c->state != BCH_FS_STARTING);
/*
* Make sure that each cache object's mi is up to date before
@@ -826,6 +851,16 @@ static const char *bch_fs_start(struct cache_set *c)
bch_notice(c, "initializing new filesystem");
+ bch_initial_gc(c, NULL);
+
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
+ bch_dev_allocator_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
err = "unable to allocate journal buckets";
for_each_cache(ca, c, i)
if (bch_dev_journal_alloc(ca)) {
@@ -833,8 +868,6 @@ static const char *bch_fs_start(struct cache_set *c)
goto err;
}
- bch_initial_gc(c, NULL);
-
/*
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
@@ -842,14 +875,6 @@ static const char *bch_fs_start(struct cache_set *c)
bch_journal_start(c);
bch_journal_set_replay_done(&c->journal);
- err = "error starting allocator thread";
- for_each_cache(ca, c, i)
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
- bch_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
-
err = "cannot allocate new btree root";
for (id = 0; id < BTREE_ID_NR; id++)
if (bch_btree_root_alloc(c, id, &cl)) {
@@ -877,10 +902,14 @@ static const char *bch_fs_start(struct cache_set *c)
goto err;
}
recovery_done:
+ err = "dynamic fault";
+ if (bch_fs_init_fault("fs_start"))
+ goto err;
+
if (c->opts.read_only) {
- bch_fs_read_only_sync(c);
+ bch_fs_read_only(c);
} else {
- err = __bch_fs_read_write(c);
+ err = bch_fs_read_write(c);
if (err)
goto err;
}
@@ -901,27 +930,9 @@ recovery_done:
bch_write_super(c);
mutex_unlock(&c->sb_lock);
- err = "dynamic fault";
- if (bch_fs_init_fault("fs_start"))
- goto err;
-
- err = "error creating kobject";
- if (bch_fs_online(c))
- goto err;
-
- err = "can't bring up blockdev volumes";
- if (bch_blockdev_volumes_start(c))
- goto err;
-
- bch_debug_init_cache_set(c);
- set_bit(BCH_FS_RUNNING, &c->flags);
- bch_attach_backing_devs(c);
-
- bch_notify_fs_read_write(c);
err = NULL;
out:
bch_journal_entries_free(&journal);
- closure_put(&c->caching);
return err;
err:
switch (ret) {
@@ -955,6 +966,11 @@ err:
goto out;
}
+const char *bch_fs_start(struct cache_set *c)
+{
+ return __bch_fs_start(c) ?: bch_fs_online(c);
+}
+
static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c)
{
struct bch_sb_field_members *sb_mi;
@@ -999,7 +1015,7 @@ static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c)
return NULL;
}
-/* Cache device */
+/* Device startup/shutdown, ro/rw: */
bool bch_dev_read_only(struct cache *ca)
{
@@ -1009,14 +1025,14 @@ bool bch_dev_read_only(struct cache *ca)
bdevname(ca->disk_sb.bdev, buf);
- lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
return false;
if (!bch_dev_may_remove(ca)) {
bch_err(c, "required member %s going RO, forcing fs RO", buf);
- bch_fs_read_only_sync(c);
+ bch_fs_read_only(c);
}
trace_bcache_cache_read_only(ca);
@@ -1053,7 +1069,7 @@ bool bch_dev_read_only(struct cache *ca)
static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
{
- lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
return NULL;
@@ -1066,12 +1082,11 @@ static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
if (bch_dev_allocator_start(ca))
return "error starting allocator thread";
- if (bch_moving_gc_thread_start(ca))
+ if (bch_moving_gc_start(ca))
return "error starting moving GC thread";
- bch_dev_group_add(&c->journal.devs, ca);
-
- wake_up_process(c->tiering_read);
+ if (bch_tiering_start(c))
+ return "error starting tiering thread";
bch_notify_dev_read_write(ca);
trace_bcache_cache_read_write_done(ca);
@@ -1099,22 +1114,15 @@ const char *bch_dev_read_write(struct cache *ca)
return NULL;
}
-/*
- * bch_dev_stop has already returned, so we no longer hold the register
- * lock at the point this is called.
- */
-
void bch_dev_release(struct kobject *kobj)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
- percpu_ref_exit(&ca->ref);
kfree(ca);
}
-static void bch_dev_free_work(struct work_struct *work)
+static void bch_dev_free(struct cache *ca)
{
- struct cache *ca = container_of(work, struct cache, free_work);
struct cache_set *c = ca->set;
unsigned i;
@@ -1131,15 +1139,7 @@ static void bch_dev_free_work(struct work_struct *work)
kobject_del(&ca->kobj);
bch_free_super(&ca->disk_sb);
-
- /*
- * bch_dev_stop can be called in the middle of initialization
- * of the struct cache object.
- * As such, not all the sub-structures may be initialized.
- * However, they were zeroed when the object was allocated.
- */
-
- bch_journal_free_cache(ca);
+ bch_dev_journal_exit(ca);
free_percpu(ca->sectors_written);
bioset_exit(&ca->replica_set);
free_percpu(ca->bucket_stats_percpu);
@@ -1155,12 +1155,20 @@ static void bch_dev_free_work(struct work_struct *work)
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
+ percpu_ref_exit(&ca->ref);
kobject_put(&ca->kobj);
if (c)
kobject_put(&c->kobj);
}
+static void bch_dev_free_work(struct work_struct *work)
+{
+ struct cache *ca = container_of(work, struct cache, free_work);
+
+ bch_dev_free(ca);
+}
+
static void bch_dev_percpu_ref_release(struct percpu_ref *ref)
{
struct cache *ca = container_of(ref, struct cache, ref);
@@ -1193,12 +1201,10 @@ static void bch_dev_stop(struct cache *ca)
{
struct cache_set *c = ca->set;
- lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
- if (c) {
- BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
- rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
- }
+ BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
+ rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
call_rcu(&ca->free_rcu, bch_dev_free_rcu);
}
@@ -1281,7 +1287,8 @@ static void bch_dev_remove_work(struct work_struct *work)
*/
closure_get(&c->cl);
- mutex_lock(&bch_register_lock);
+ mutex_lock(&c->state_lock);
+
bch_dev_stop(ca);
/*
@@ -1290,8 +1297,6 @@ static void bch_dev_remove_work(struct work_struct *work)
*/
synchronize_rcu();
- lockdep_assert_held(&bch_register_lock);
-
/*
* Free this device's slot in the bch_member array - all pointers to
* this device must be gone:
@@ -1301,23 +1306,20 @@ static void bch_dev_remove_work(struct work_struct *work)
memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
bch_write_super(c);
- mutex_unlock(&c->sb_lock);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->sb_lock);
+ mutex_unlock(&c->state_lock);
closure_put(&c->cl);
}
-bool bch_dev_remove(struct cache *ca, bool force)
+static bool __bch_dev_remove(struct cache_set *c, struct cache *ca, bool force)
{
- mutex_lock(&bch_register_lock);
-
if (test_bit(BCH_DEV_REMOVING, &ca->flags))
return false;
if (!bch_dev_may_remove(ca)) {
- bch_err(ca->set, "Can't remove last device in tier %u",
- ca->mi.tier);
+ bch_err(ca->set, "Can't remove last RW device");
bch_notify_dev_remove_failed(ca);
return false;
}
@@ -1327,23 +1329,32 @@ bool bch_dev_remove(struct cache *ca, bool force)
if (force)
set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
+
set_bit(BCH_DEV_REMOVING, &ca->flags);
bch_notify_dev_removing(ca);
- mutex_unlock(&bch_register_lock);
-
/* Migrate the data and finish removal asynchronously: */
queue_work(system_long_wq, &ca->remove_work);
return true;
}
+bool bch_dev_remove(struct cache *ca, bool force)
+{
+ struct cache_set *c = ca->set;
+ bool ret;
+
+ mutex_lock(&c->state_lock);
+ ret = __bch_dev_remove(c, ca, force);
+ mutex_unlock(&c->state_lock);
+
+ return ret;
+}
+
static int bch_dev_online(struct cache *ca)
{
char buf[12];
- lockdep_assert_held(&bch_register_lock);
-
sprintf(buf, "cache%u", ca->dev_idx);
if (kobject_add(&ca->kobj,
@@ -1386,7 +1397,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
kobject_init(&ca->kobj, &bch_dev_ktype);
spin_lock_init(&ca->self.lock);
- ca->self.nr_devices = 1;
+ ca->self.nr = 1;
rcu_assign_pointer(ca->self.d[0].dev, ca);
ca->dev_idx = sb->sb->dev_idx;
@@ -1395,10 +1406,11 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
spin_lock_init(&ca->freelist_lock);
spin_lock_init(&ca->prio_buckets_lock);
mutex_init(&ca->heap_lock);
- bch_moving_init_cache(ca);
+ bch_dev_moving_gc_init(ca);
ca->disk_sb = *sb;
- ca->disk_sb.bdev->bd_holder = ca;
+ if (sb->mode & FMODE_EXCL)
+ ca->disk_sb.bdev->bd_holder = ca;
memset(sb, 0, sizeof(*sb));
INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
@@ -1444,7 +1456,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio)) ||
!(ca->sectors_written = alloc_percpu(*ca->sectors_written)) ||
- bch_journal_init_cache(ca))
+ bch_dev_journal_init(ca))
goto err;
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
@@ -1482,7 +1494,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
err = "error creating kobject";
if (c->kobj.state_in_sysfs &&
bch_dev_online(ca))
- goto err;
+ pr_warn("error creating sysfs objects");
if (ret)
*ret = ca;
@@ -1490,49 +1502,34 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
kobject_put(&ca->kobj);
return NULL;
err:
- bch_dev_stop(ca);
+ bch_dev_free(ca);
return err;
}
-static struct cache_set *bch_fs_lookup(uuid_le uuid)
-{
- struct cache_set *c;
-
- lockdep_assert_held(&bch_register_lock);
-
- list_for_each_entry(c, &bch_fs_list, list)
- if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
- return c;
-
- return NULL;
-}
-
int bch_dev_add(struct cache_set *c, const char *path)
{
struct bcache_superblock sb;
const char *err;
struct cache *ca;
- struct bch_sb_field *f;
struct bch_sb_field_members *mi, *dev_mi;
struct bch_member saved_mi;
unsigned dev_idx, nr_devices, u64s;
int ret = -EINVAL;
- mutex_lock(&bch_register_lock);
-
err = bch_read_super(&sb, c->opts, path);
if (err)
- goto err_unlock_register;
+ return -EINVAL;
err = bch_validate_cache_super(&sb);
if (err)
- goto err_unlock_register;
-
- mutex_lock(&c->sb_lock);
+ return -EINVAL;
err = bch_dev_may_add(sb.sb, c);
if (err)
- goto err_unlock;
+ return -EINVAL;
+
+ mutex_lock(&c->state_lock);
+ mutex_lock(&c->sb_lock);
/*
* Preserve the old cache member information (esp. tier)
@@ -1571,17 +1568,14 @@ have_slot:
sizeof(struct bch_member) * nr_devices) / sizeof(u64);
err = "no space in superblock for member info";
- f = bch_fs_sb_field_resize(c, &mi->field, u64s);
- if (!f)
+ mi = bch_fs_sb_resize_members(c, u64s);
+ if (!mi)
goto err_unlock;
- mi = container_of(f, struct bch_sb_field_members, field);
-
- f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s);
- if (!f)
+ dev_mi = bch_sb_resize_members(&sb, u64s);
+ if (!dev_mi)
goto err_unlock;
- dev_mi = container_of(f, struct bch_sb_field_members, field);
memcpy(dev_mi, mi, u64s * sizeof(u64));
dev_mi->members[dev_idx] = saved_mi;
@@ -1619,14 +1613,13 @@ have_slot:
kobject_put(&ca->kobj);
mutex_unlock(&c->sb_lock);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
return 0;
err_put:
bch_dev_stop(ca);
err_unlock:
mutex_unlock(&c->sb_lock);
-err_unlock_register:
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
bch_free_super(&sb);
bch_err(c, "Unable to add device: %s", err);
@@ -1639,11 +1632,8 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
const char *err;
struct cache_set *c = NULL;
struct bcache_superblock *sb;
- uuid_le uuid;
unsigned i;
- memset(&uuid, 0, sizeof(uuid_le));
-
if (!nr_devices)
return "need at least one device";
@@ -1655,60 +1645,49 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
if (!sb)
goto err;
- /*
- * bch_read_super() needs to happen under register_lock, so that the
- * exclusive open is atomic with adding the new cache set to the list of
- * cache sets:
- */
- mutex_lock(&bch_register_lock);
-
for (i = 0; i < nr_devices; i++) {
err = bch_read_super(&sb[i], opts, devices[i]);
if (err)
- goto err_unlock;
+ goto err;
err = "attempting to register backing device";
if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
- goto err_unlock;
+ goto err;
err = bch_validate_cache_super(&sb[i]);
if (err)
- goto err_unlock;
+ goto err;
}
- err = "cache set already registered";
- if (bch_fs_lookup(sb->sb->uuid))
- goto err_unlock;
-
err = "cannot allocate memory";
c = bch_fs_alloc(sb[0].sb, opts);
if (!c)
- goto err_unlock;
+ goto err;
for (i = 0; i < nr_devices; i++) {
err = bch_dev_alloc(&sb[i], c, NULL);
if (err)
- goto err_unlock;
+ goto err;
}
err = "insufficient devices";
if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c))
- goto err_unlock;
+ goto err;
- err = bch_fs_start(c);
- if (err)
- goto err_unlock;
+ if (!c->opts.nostart) {
+ err = __bch_fs_start(c);
+ if (err)
+ goto err;
+ }
- err = "error creating kobject";
- if (bch_fs_online(c))
- goto err_unlock;
+ err = bch_fs_online(c);
+ if (err)
+ goto err;
- if (ret) {
- closure_get(&c->cl);
+ if (ret)
*ret = c;
- }
-
- mutex_unlock(&bch_register_lock);
+ else
+ closure_put(&c->cl);
err = NULL;
out:
@@ -1717,20 +1696,18 @@ out:
if (err)
c = NULL;
return err;
-err_unlock:
+err:
if (c)
bch_fs_stop(c);
- mutex_unlock(&bch_register_lock);
-err:
+
for (i = 0; i < nr_devices; i++)
bch_free_super(&sb[i]);
goto out;
}
static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
- struct bch_opts opts)
+ struct bch_opts opts)
{
- char name[BDEVNAME_SIZE];
const char *err;
struct cache_set *c;
bool allocated_cache_set = false;
@@ -1739,17 +1716,19 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
if (err)
return err;
- bdevname(sb->bdev, name);
-
+ mutex_lock(&bch_register_lock);
c = bch_fs_lookup(sb->sb->uuid);
if (c) {
+ closure_get(&c->cl);
+
err = bch_dev_in_fs(sb->sb, c);
if (err)
- return err;
+ goto err;
} else {
c = bch_fs_alloc(sb->sb, opts);
+ err = "cannot allocate memory";
if (!c)
- return "cannot allocate memory";
+ goto err;
allocated_cache_set = true;
}
@@ -1758,21 +1737,29 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
if (err)
goto err;
- if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c)) {
- err = bch_fs_start(c);
+ if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c) &&
+ !c->opts.nostart) {
+ err = __bch_fs_start(c);
if (err)
goto err;
- } else {
- err = "error creating kobject";
- if (bch_fs_online(c))
- goto err;
}
- bch_info(c, "started");
+ err = __bch_fs_online(c);
+ if (err)
+ goto err;
+
+ closure_put(&c->cl);
+ mutex_unlock(&bch_register_lock);
+
return NULL;
err:
+ mutex_unlock(&bch_register_lock);
+
if (allocated_cache_set)
bch_fs_stop(c);
+ else if (c)
+ closure_put(&c->cl);
+
return err;
}
@@ -1782,20 +1769,20 @@ const char *bch_fs_open_incremental(const char *path)
struct bch_opts opts = bch_opts_empty();
const char *err;
- mutex_lock(&bch_register_lock);
-
err = bch_read_super(&sb, opts, path);
if (err)
- goto err;
+ return err;
- if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
+ if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) {
+ mutex_lock(&bch_register_lock);
err = bch_backing_dev_register(&sb);
- else
+ mutex_unlock(&bch_register_lock);
+ } else {
err = __bch_fs_open_incremental(&sb, opts);
+ }
bch_free_super(&sb);
-err:
- mutex_unlock(&bch_register_lock);
+
return err;
}
@@ -1854,10 +1841,10 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
pr_info("Setting all devices read only:");
list_for_each_entry(c, &bch_fs_list, list)
- bch_fs_read_only(c);
+ bch_fs_read_only_async(c);
list_for_each_entry(c, &bch_fs_list, list)
- bch_fs_read_only_sync(c);
+ bch_fs_read_only(c);
mutex_unlock(&bch_register_lock);
}
@@ -1882,7 +1869,7 @@ kobj_attribute_write(reboot, reboot_test);
static void bcache_exit(void)
{
bch_debug_exit();
- bch_fs_exit();
+ bch_vfs_exit();
bch_blockdev_exit();
bch_chardev_exit();
if (bcache_kset)
@@ -1917,7 +1904,7 @@ static int __init bcache_init(void)
sysfs_create_files(&bcache_kset->kobj, files) ||
bch_chardev_init() ||
bch_blockdev_init() ||
- bch_fs_init() ||
+ bch_vfs_init() ||
bch_debug_init())
goto err;
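bch_fs_start() is now a separate step from bch_fs_open(), so a caller can open with the new nostart option and start the filesystem later. A rough sketch of that flow; example_open_deferred() is hypothetical, not part of this patch:

/*
 * Sketch only: open without starting, do some work on the not-yet-started
 * filesystem, then run recovery/bring-up via bch_fs_start().
 */
static const char *example_open_deferred(char * const *devices, unsigned nr,
					 struct bch_opts opts)
{
	struct cache_set *c;
	const char *err;

	opts.nostart = true;

	/* allocates the fs and brings it online, but skips __bch_fs_start() */
	err = bch_fs_open(devices, nr, opts, &c);
	if (err)
		return err;

	/* ... whatever offline work is needed goes here ... */

	/* runs recovery/initialization and goes RW (or RO, per opts) */
	err = bch_fs_start(c);

	bch_fs_stop(c);
	return err;
}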
diff --git a/libbcache/super.h b/libbcache/super.h
index bcf7d98..bafd88e 100644
--- a/libbcache/super.h
+++ b/libbcache/super.h
@@ -57,27 +57,11 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c,
static inline bool bch_dev_may_remove(struct cache *ca)
{
struct cache_set *c = ca->set;
- struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
-
- /*
- * Right now, we can't remove the last device from a tier,
- * - For tier 0, because all metadata lives in tier 0 and because
- * there is no way to have foreground writes go directly to tier 1.
- * - For tier 1, because the code doesn't completely support an
- * empty tier 1.
- */
-
- /*
- * Turning a device read-only removes it from the cache group,
- * so there may only be one read-write device in a tier, and yet
- * the device we are removing is in the same tier, so we have
- * to check for identity.
- * Removing the last RW device from a tier requires turning the
- * whole cache set RO.
- */
-
- return tier->nr_devices != 1 ||
- rcu_access_pointer(tier->d[0].dev) != ca;
+ struct cache_group *grp = &c->cache_all;
+
+ /* Can't remove the last RW device: */
+ return grp->nr != 1 ||
+ rcu_access_pointer(grp->d[0].dev) != ca;
}
void bch_dev_release(struct kobject *);
@@ -89,15 +73,15 @@ int bch_dev_add(struct cache_set *, const char *);
void bch_fs_detach(struct cache_set *);
-bool bch_fs_read_only(struct cache_set *);
bool bch_fs_emergency_read_only(struct cache_set *);
-void bch_fs_read_only_sync(struct cache_set *);
+void bch_fs_read_only(struct cache_set *);
const char *bch_fs_read_write(struct cache_set *);
void bch_fs_release(struct kobject *);
+void bch_fs_stop_async(struct cache_set *);
void bch_fs_stop(struct cache_set *);
-void bch_fs_stop_sync(struct cache_set *);
+const char *bch_fs_start(struct cache_set *);
const char *bch_fs_open(char * const *, unsigned, struct bch_opts,
struct cache_set **);
const char *bch_fs_open_incremental(const char *path);
diff --git a/libbcache/super_types.h b/libbcache/super_types.h
index 41eaf0d..69c747d 100644
--- a/libbcache/super_types.h
+++ b/libbcache/super_types.h
@@ -6,6 +6,7 @@ struct bcache_superblock {
struct block_device *bdev;
struct bio *bio;
unsigned page_order;
+ fmode_t mode;
};
#endif /* _BCACHE_SUPER_TYPES_H */
diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c
index 9f45a6b..48f9f1f 100644
--- a/libbcache/sysfs.c
+++ b/libbcache/sysfs.c
@@ -22,6 +22,7 @@
#include "opts.h"
#include "request.h"
#include "super-io.h"
+#include "tier.h"
#include "writeback.h"
#include <linux/blkdev.h>
@@ -121,6 +122,8 @@ rw_attribute(cache_replacement_policy);
rw_attribute(foreground_write_ratelimit_enabled);
rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
+
+rw_attribute(tier);
rw_attribute(tiering_enabled);
rw_attribute(tiering_percent);
sysfs_pd_controller_attribute(tiering);
@@ -134,7 +137,6 @@ rw_attribute(foreground_target_percent);
rw_attribute(size);
read_attribute(meta_replicas_have);
read_attribute(data_replicas_have);
-read_attribute(tier);
#define BCH_DEBUG_PARAM(name, description) \
rw_attribute(name);
@@ -680,7 +682,8 @@ SHOW(bch_fs)
sysfs_printf(tiering_enabled, "%i", c->tiering_enabled);
sysfs_print(tiering_percent, c->tiering_percent);
- sysfs_pd_controller_show(tiering, &c->tiering_pd);
+
+ sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */
sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have);
sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have);
@@ -694,7 +697,7 @@ SHOW(bch_fs)
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
- if (!test_bit(BCH_FS_RUNNING, &c->flags))
+ if (!bch_fs_running(c))
return -EPERM;
if (attr == &sysfs_bset_tree_stats)
@@ -723,7 +726,7 @@ STORE(__bch_fs)
}
if (attr == &sysfs_stop) {
- bch_fs_stop(c);
+ bch_fs_stop_async(c);
return size;
}
@@ -773,25 +776,18 @@ STORE(__bch_fs)
ssize_t ret = strtoul_safe(buf, c->tiering_enabled)
?: (ssize_t) size;
- if (c->tiering_read)
- wake_up_process(c->tiering_read);
+ bch_tiering_start(c); /* issue wakeups */
return ret;
}
sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);
- if (attr == &sysfs_journal_flush) {
- bch_journal_meta_async(&c->journal, NULL);
-
- return size;
- }
-
sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
sysfs_strtoul(tiering_percent, c->tiering_percent);
- sysfs_pd_controller_store(tiering, &c->tiering_pd);
+ sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */
/* Debugging: */
@@ -799,11 +795,14 @@ STORE(__bch_fs)
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
- if (!test_bit(BCH_FS_RUNNING, &c->flags))
+ if (!bch_fs_running(c))
return -EPERM;
- if (test_bit(BCH_FS_STOPPING, &c->flags))
- return -EINTR;
+ if (attr == &sysfs_journal_flush) {
+ bch_journal_meta_async(&c->journal, NULL);
+
+ return size;
+ }
if (attr == &sysfs_blockdev_volume_create) {
u64 v = strtoi_h_or_return(buf);
@@ -836,9 +835,9 @@ STORE(bch_fs)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
- mutex_lock(&bch_register_lock);
+ mutex_lock(&c->state_lock);
size = __bch_fs_store(kobj, attr, buf, size);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
if (attr == &sysfs_add_device) {
char *path = kstrdup(buf, GFP_KERNEL);
@@ -1273,6 +1272,31 @@ STORE(__bch_dev)
mutex_unlock(&c->sb_lock);
}
+ if (attr == &sysfs_tier) {
+ unsigned prev_tier;
+ unsigned v = strtoul_restrict_or_return(buf,
+ 0, BCH_TIER_MAX - 1);
+
+ mutex_lock(&c->sb_lock);
+ prev_tier = ca->mi.tier;
+
+ if (v == ca->mi.tier) {
+ mutex_unlock(&c->sb_lock);
+ return size;
+ }
+
+ mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+ SET_BCH_MEMBER_TIER(mi, v);
+ bch_write_super(c);
+
+ bch_dev_group_remove(&c->tiers[prev_tier].devs, ca);
+ bch_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
+ mutex_unlock(&c->sb_lock);
+
+ bch_recalc_capacity(c);
+ bch_tiering_start(c);
+ }
+
if (attr == &sysfs_state_rw) {
char name[BDEVNAME_SIZE];
const char *err = NULL;
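
The sysfs changes above make the per-device tier attribute writable: writing a tier number updates that member's entry in the superblock, moves the device between the in-memory tier groups, recalculates capacity and kicks tiering. A hedged userspace sketch using the helpers added later in this patch; the "cache0" directory name and the value "1" are illustrative assumptions, not something this patch establishes:

#include <fcntl.h>
#include "tools-util.h"

/* Hypothetical example: move one member device of a filesystem to tier 1.
 * "cache0" is an assumed sysfs name for the device's kobject. */
static void set_device_tier(const char *fs_uuid_or_path)
{
	struct bcache_handle fs = bcache_fs_open(fs_uuid_or_path);
	int fd = xopenat(fs.sysfs_fd, "cache0/tier", O_WRONLY);

	xpwrite(fd, "1", 1, 0);
	close(fd);
}
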
diff --git a/libbcache/tier.c b/libbcache/tier.c
index 4686459..0ab1770 100644
--- a/libbcache/tier.c
+++ b/libbcache/tier.c
@@ -16,8 +16,7 @@
#include <trace/events/bcache.h>
struct tiering_state {
- struct cache_group *tier;
- unsigned tier_idx;
+ struct bch_tier *tier;
unsigned sectors;
unsigned stripe_size;
unsigned dev_idx;
@@ -42,7 +41,7 @@ static bool tiering_pred(struct cache_set *c,
mi = cache_member_info_get(c);
extent_for_each_ptr(e, ptr)
if (ptr->dev < mi->nr_devices &&
- mi->m[ptr->dev].tier >= s->tier_idx)
+ mi->m[ptr->dev].tier >= s->tier->idx)
replicas++;
cache_member_info_put();
@@ -69,15 +68,15 @@ static void tier_next_device(struct cache_set *c, struct tiering_state *s)
s->sectors = 0;
s->dev_idx++;
- spin_lock(&s->tier->lock);
- if (s->dev_idx >= s->tier->nr_devices)
+ spin_lock(&s->tier->devs.lock);
+ if (s->dev_idx >= s->tier->devs.nr)
s->dev_idx = 0;
- if (s->tier->nr_devices) {
- s->ca = s->tier->d[s->dev_idx].dev;
+ if (s->tier->devs.nr) {
+ s->ca = s->tier->devs.d[s->dev_idx].dev;
percpu_ref_get(&s->ca->ref);
}
- spin_unlock(&s->tier->lock);
+ spin_unlock(&s->tier->devs.lock);
}
}
@@ -103,13 +102,13 @@ static int issue_tiering_move(struct cache_set *c,
* tiering_next_cache - issue a move to write an extent to the next cache
* device in round robin order
*/
-static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
+static s64 read_tiering(struct cache_set *c, struct bch_tier *tier)
{
struct moving_context ctxt;
struct tiering_state s;
struct btree_iter iter;
struct bkey_s_c k;
- unsigned nr_devices = READ_ONCE(tier->nr_devices);
+ unsigned nr_devices = READ_ONCE(tier->devs.nr);
int ret;
if (!nr_devices)
@@ -119,10 +118,9 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
memset(&s, 0, sizeof(s));
s.tier = tier;
- s.tier_idx = tier - c->cache_tiers;
s.stripe_size = 2048; /* 1 mb for now */
- bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate,
+ bch_move_ctxt_init(&ctxt, &tier->pd.rate,
nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
@@ -164,8 +162,8 @@ next:
static int bch_tiering_thread(void *arg)
{
- struct cache_set *c = arg;
- struct cache_group *tier = &c->cache_tiers[1];
+ struct bch_tier *tier = arg;
+ struct cache_set *c = container_of(tier, struct cache_set, tiers[tier->idx]);
struct io_clock *clock = &c->io_clock[WRITE];
struct cache *ca;
u64 tier_capacity, available_sectors;
@@ -176,20 +174,20 @@ static int bch_tiering_thread(void *arg)
while (!kthread_should_stop()) {
if (kthread_wait_freezable(c->tiering_enabled &&
- tier->nr_devices))
+ tier->devs.nr))
break;
while (1) {
- struct cache_group *faster_tier;
+ struct bch_tier *faster_tier;
last = atomic_long_read(&clock->now);
tier_capacity = available_sectors = 0;
rcu_read_lock();
- for (faster_tier = c->cache_tiers;
+ for (faster_tier = c->tiers;
faster_tier != tier;
faster_tier++) {
- group_for_each_cache_rcu(ca, faster_tier, i) {
+ group_for_each_cache_rcu(ca, &faster_tier->devs, i) {
tier_capacity +=
(ca->mi.nbuckets -
ca->mi.first_bucket) << ca->bucket_bits;
@@ -216,32 +214,73 @@ static int bch_tiering_thread(void *arg)
return 0;
}
-void bch_tiering_init_cache_set(struct cache_set *c)
+static void __bch_tiering_stop(struct bch_tier *tier)
{
- bch_pd_controller_init(&c->tiering_pd);
+ tier->pd.rate.rate = UINT_MAX;
+ bch_ratelimit_reset(&tier->pd.rate);
+
+ if (tier->migrate)
+ kthread_stop(tier->migrate);
+
+ tier->migrate = NULL;
+}
+
+void bch_tiering_stop(struct cache_set *c)
+{
+ struct bch_tier *tier;
+
+ for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++)
+ __bch_tiering_stop(tier);
+}
+
+static int __bch_tiering_start(struct bch_tier *tier)
+{
+ if (!tier->migrate) {
+ struct task_struct *p =
+ kthread_create(bch_tiering_thread, tier,
+ "bch_tier[%u]", tier->idx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ tier->migrate = p;
+ }
+
+ wake_up_process(tier->migrate);
+ return 0;
}
-int bch_tiering_read_start(struct cache_set *c)
+int bch_tiering_start(struct cache_set *c)
{
- struct task_struct *t;
+ struct bch_tier *tier;
+ bool have_faster_tier = false;
if (c->opts.nochanges)
return 0;
- t = kthread_create(bch_tiering_thread, c, "bch_tier_read");
- if (IS_ERR(t))
- return PTR_ERR(t);
+ for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
+ if (!tier->devs.nr)
+ continue;
- c->tiering_read = t;
- wake_up_process(c->tiering_read);
+ if (have_faster_tier) {
+ int ret = __bch_tiering_start(tier);
+ if (ret)
+ return ret;
+ } else {
+ __bch_tiering_stop(tier);
+ }
+
+ have_faster_tier = true;
+ }
return 0;
}
-void bch_tiering_read_stop(struct cache_set *c)
+void bch_fs_tiering_init(struct cache_set *c)
{
- if (!IS_ERR_OR_NULL(c->tiering_read)) {
- kthread_stop(c->tiering_read);
- c->tiering_read = NULL;
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
+ c->tiers[i].idx = i;
+ bch_pd_controller_init(&c->tiers[i].pd);
}
}
diff --git a/libbcache/tier.h b/libbcache/tier.h
index 89c2bff..b53e83d 100644
--- a/libbcache/tier.h
+++ b/libbcache/tier.h
@@ -1,8 +1,8 @@
#ifndef _BCACHE_TIER_H
#define _BCACHE_TIER_H
-void bch_tiering_init_cache_set(struct cache_set *);
-int bch_tiering_read_start(struct cache_set *);
-void bch_tiering_read_stop(struct cache_set *);
+void bch_tiering_stop(struct cache_set *);
+int bch_tiering_start(struct cache_set *);
+void bch_fs_tiering_init(struct cache_set *);
#endif
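
The tiering rework replaces the single tiering_read thread with one thread per tier and a three-call lifecycle: bch_fs_tiering_init() seeds each tier's index and pd controller, bch_tiering_start() spawns a bch_tier[n] thread for every populated tier below the fastest populated one (and stops threads that are no longer needed), and bch_tiering_stop() shuts everything down. A hedged sketch of the implied call order, not a caller taken from this patch, assuming the libbcache headers are on the include path:

#include "bcache.h"
#include "tier.h"

/* Illustrative only: the call order implied by the renamed API. */
static int tiering_lifecycle_example(struct cache_set *c)
{
	int ret;

	bch_fs_tiering_init(c);		/* set tiers[i].idx, init pd controllers */

	ret = bch_tiering_start(c);	/* start bch_tier[n] threads as needed */
	if (ret)
		return ret;

	/* ... filesystem runs; bch_tiering_start() may be called again after
	 * device or tier changes to issue wakeups ... */

	bch_tiering_stop(c);		/* stop threads, reset rate limits */
	return 0;
}
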
diff --git a/linux/blkdev.c b/linux/blkdev.c
index 0bae9b0..f50a9c7 100644
--- a/linux/blkdev.c
+++ b/linux/blkdev.c
@@ -49,6 +49,12 @@ int submit_bio_wait(struct bio *bio)
BUG();
}
+ if (ret != bio->bi_iter.bi_size) {
+ fprintf(stderr, "IO error: %i (%s)\n",
+ ret, strerror(errno));
+ return -EIO;
+ }
+
if (bio->bi_opf & REQ_FUA)
fdatasync(bio->bi_bdev->bd_fd);
diff --git a/qcow2.c b/qcow2.c
index cbc8d4c..b7aa8c2 100644
--- a/qcow2.c
+++ b/qcow2.c
@@ -2,7 +2,6 @@
#include <errno.h>
#include <sys/types.h>
#include <unistd.h>
-#include <linux/sort.h>
#include "qcow2.h"
#include "tools-util.h"
@@ -69,18 +68,7 @@ static void add_l2(struct qcow2_image *img, u64 src_blk, u64 dst_offset)
img->l2_table[l2_index] = cpu_to_be64(dst_offset|QCOW_OFLAG_COPIED);
}
-static int range_cmp(const void *_l, const void *_r)
-{
- const struct range *l = _l, *r = _r;
-
- if (l->start < r->start)
- return -1;
- if (l->start > r->start)
- return 1;
- return 0;
-}
-
-void qcow2_write_image(int infd, int outfd, sparse_data *data,
+void qcow2_write_image(int infd, int outfd, ranges *data,
unsigned block_size)
{
u64 image_size = get_size(NULL, infd);
@@ -98,30 +86,11 @@ void qcow2_write_image(int infd, int outfd, sparse_data *data,
struct range *r;
char *buf = xmalloc(block_size);
u64 src_offset, dst_offset;
- sparse_data m;
assert(is_power_of_2(block_size));
- sort(&darray_item(*data, 0),
- darray_size(*data),
- sizeof(darray_item(*data, 0)),
- range_cmp, NULL);
-
- /* Round to blocksize, merge contiguous ranges: */
- darray_init(m);
- darray_foreach(r, *data) {
- struct range *l = m.size ? &m.item[m.size - 1] : NULL;
-
- r->start = round_down(r->start, block_size);
- r->end = round_up(r->end, block_size);
-
- if (l && l->end >= r->start)
- l->end = max(l->end, r->end);
- else
- darray_append(m, *r);
- }
- darray_free(*data);
- *data = m;
+ ranges_roundup(data, block_size);
+ ranges_sort_merge(data);
/* Write data: */
darray_foreach(r, *data)
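
qcow2_write_image() no longer carries its own sort/merge logic: the caller-visible type becomes the generic ranges darray, and normalization moves into the shared ranges_roundup()/ranges_sort_merge() helpers added to tools-util.c below. A minimal sketch of that normalization step in isolation, assuming it is linked against tools-util.c from this tree:

#include <stdio.h>
#include "tools-util.h"

int main(void)
{
	ranges extents = { NULL };
	struct range *r;

	/* Two overlapping extents: */
	range_add(&extents, 4000, 200);		/* [4000, 4200) */
	range_add(&extents, 4100, 8000);	/* [4100, 12100) */

	ranges_roundup(&extents, 4096);		/* round out to block boundaries */
	ranges_sort_merge(&extents);		/* sort by start, merge overlaps */

	darray_foreach(r, extents)		/* prints "0-12288" */
		printf("%llu-%llu\n",
		       (unsigned long long) r->start,
		       (unsigned long long) r->end);

	darray_free(extents);
	return 0;
}
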
diff --git a/qcow2.h b/qcow2.h
index c6f0b6b..0943d55 100644
--- a/qcow2.h
+++ b/qcow2.h
@@ -2,23 +2,8 @@
#define _QCOW2_H
#include <linux/types.h>
-#include "ccan/darray/darray.h"
+#include "tools-util.h"
-struct range {
- u64 start;
- u64 end;
-};
-
-typedef darray(struct range) sparse_data;
-
-static inline void data_add(sparse_data *data, u64 offset, u64 size)
-{
- darray_append(*data, (struct range) {
- .start = offset,
- .end = offset + size
- });
-}
-
-void qcow2_write_image(int, int, sparse_data *, unsigned);
+void qcow2_write_image(int, int, ranges *, unsigned);
#endif /* _QCOW2_H */
diff --git a/tools-util.c b/tools-util.c
index 0a95fbe..07fb82d 100644
--- a/tools-util.c
+++ b/tools-util.c
@@ -1,4 +1,3 @@
-#include <alloca.h>
#include <assert.h>
#include <ctype.h>
#include <errno.h>
@@ -19,6 +18,7 @@
#include "ccan/crc/crc.h"
#include "linux/bcache-ioctl.h"
+#include "linux/sort.h"
#include "tools-util.h"
#include "util.h"
@@ -59,20 +59,12 @@ struct units_buf __pr_units(u64 v, enum units units)
char *read_file_str(int dirfd, const char *path)
{
- int fd = openat(dirfd, path, O_RDONLY);
+ int fd = xopenat(dirfd, path, O_RDONLY);
+ size_t len = xfstat(fd).st_size;
- if (fd < 0)
- die("Unable to open %s\n", path);
+ char *buf = malloc(len + 1);
- struct stat statbuf;
- if (fstat(fd, &statbuf) < 0)
- die("fstat error\n");
-
- char *buf = malloc(statbuf.st_size + 1);
-
- int len = read(fd, buf, statbuf.st_size);
- if (len < 0)
- die("read error while reading from file %s\n", path);
+ xpread(fd, buf, len, 0);
buf[len] = '\0';
if (len && buf[len - 1] == '\n')
@@ -107,48 +99,33 @@ ssize_t read_string_list_or_die(const char *opt, const char * const list[],
/* Returns size of file or block device: */
u64 get_size(const char *path, int fd)
{
- struct stat statbuf;
- u64 ret;
-
- if (fstat(fd, &statbuf))
- die("Error statting %s: %s", path, strerror(errno));
+ struct stat statbuf = xfstat(fd);
if (!S_ISBLK(statbuf.st_mode))
return statbuf.st_size;
- if (ioctl(fd, BLKGETSIZE64, &ret))
- die("Error getting block device size on %s: %s\n",
- path, strerror(errno));
-
+ u64 ret;
+ xioctl(fd, BLKGETSIZE64, &ret);
return ret;
}
/* Returns blocksize in units of 512 byte sectors: */
unsigned get_blocksize(const char *path, int fd)
{
- struct stat statbuf;
- if (fstat(fd, &statbuf))
- die("Error statting %s: %s", path, strerror(errno));
+ struct stat statbuf = xfstat(fd);
if (!S_ISBLK(statbuf.st_mode))
return statbuf.st_blksize >> 9;
unsigned ret;
- if (ioctl(fd, BLKPBSZGET, &ret))
- die("Error getting blocksize on %s: %s\n",
- path, strerror(errno));
-
+ xioctl(fd, BLKPBSZGET, &ret);
return ret >> 9;
}
/* Global control device: */
int bcachectl_open(void)
{
- int fd = open("/dev/bcache-ctl", O_RDWR);
- if (fd < 0)
- die("Can't open bcache device: %s", strerror(errno));
-
- return fd;
+ return xopen("/dev/bcache-ctl", O_RDWR);
}
/* Filesystem handles (ioctl, sysfs dir): */
@@ -162,47 +139,29 @@ struct bcache_handle bcache_fs_open(const char *path)
if (!uuid_parse(path, tmp)) {
/* It's a UUID, look it up in sysfs: */
-
- char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(path) + 1);
- sprintf(sysfs, "%s%s", SYSFS_BASE, path);
-
- ret.sysfs_fd = open(sysfs, O_RDONLY);
- if (!ret.sysfs_fd)
- die("Unable to open %s\n", path);
+ char *sysfs = mprintf("%s%s", SYSFS_BASE, path);
+ ret.sysfs_fd = xopen(sysfs, O_RDONLY);
char *minor = read_file_str(ret.sysfs_fd, "minor");
- char *ctl = alloca(20 + strlen(minor));
+ char *ctl = mprintf("/dev/bcache%s-ctl", minor);
+ ret.ioctl_fd = xopen(ctl, O_RDWR);
- sprintf(ctl, "/dev/bcache%s-ctl", minor);
+ free(sysfs);
free(minor);
-
- ret.ioctl_fd = open(ctl, O_RDWR);
- if (ret.ioctl_fd < 0)
- die("Error opening control device: %s\n",
- strerror(errno));
+ free(ctl);
} else {
/* It's a path: */
-
- ret.ioctl_fd = open(path, O_RDONLY);
- if (ret.ioctl_fd < 0)
- die("Error opening %s: %s\n",
- path, strerror(errno));
+ ret.ioctl_fd = xopen(path, O_RDONLY);
struct bch_ioctl_query_uuid uuid;
- if (ioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid))
- die("ioctl error (not a bcache fs?): %s\n",
- strerror(errno));
+ xioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid);
char uuid_str[40];
uuid_unparse(uuid.uuid.b, uuid_str);
- char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(uuid_str) + 1);
- sprintf(sysfs, "%s%s", SYSFS_BASE, uuid_str);
-
- ret.sysfs_fd = open(sysfs, O_RDONLY);
- if (ret.sysfs_fd < 0)
- die("Unable to open sysfs dir %s: %s\n",
- sysfs, strerror(errno));
+ char *sysfs = mprintf("%s%s", SYSFS_BASE, uuid_str);
+ ret.sysfs_fd = xopen(sysfs, O_RDONLY);
+ free(sysfs);
}
return ret;
@@ -225,3 +184,89 @@ bool ask_yn(void)
free(buf);
return ret;
}
+
+static int range_cmp(const void *_l, const void *_r)
+{
+ const struct range *l = _l, *r = _r;
+
+ if (l->start < r->start)
+ return -1;
+ if (l->start > r->start)
+ return 1;
+ return 0;
+}
+
+void ranges_sort_merge(ranges *r)
+{
+ struct range *t, *i;
+ ranges tmp = { NULL };
+
+ sort(&darray_item(*r, 0), darray_size(*r),
+ sizeof(darray_item(*r, 0)), range_cmp, NULL);
+
+ /* Merge contiguous ranges: */
+ darray_foreach(i, *r) {
+ t = tmp.size ? &tmp.item[tmp.size - 1] : NULL;
+
+ if (t && t->end >= i->start)
+ t->end = max(t->end, i->end);
+ else
+ darray_append(tmp, *i);
+ }
+
+ darray_free(*r);
+ *r = tmp;
+}
+
+void ranges_roundup(ranges *r, unsigned block_size)
+{
+ struct range *i;
+
+ darray_foreach(i, *r) {
+ i->start = round_down(i->start, block_size);
+ i->end = round_up(i->end, block_size);
+ }
+}
+
+void ranges_rounddown(ranges *r, unsigned block_size)
+{
+ struct range *i;
+
+ darray_foreach(i, *r) {
+ i->start = round_up(i->start, block_size);
+ i->end = round_down(i->end, block_size);
+ i->end = max(i->end, i->start);
+ }
+}
+
+struct fiemap_extent fiemap_iter_next(struct fiemap_iter *iter)
+{
+ struct fiemap_extent e;
+
+ BUG_ON(iter->idx > iter->f.fm_mapped_extents);
+
+ if (iter->idx == iter->f.fm_mapped_extents) {
+ xioctl(iter->fd, FS_IOC_FIEMAP, &iter->f);
+
+ if (!iter->f.fm_mapped_extents)
+ return (struct fiemap_extent) { .fe_length = 0 };
+
+ iter->idx = 0;
+ }
+
+ e = iter->f.fm_extents[iter->idx++];
+ BUG_ON(!e.fe_length);
+
+ iter->f.fm_start = e.fe_logical + e.fe_length;
+
+ return e;
+}
+
+const char *strcmp_prefix(const char *a, const char *a_prefix)
+{
+ while (*a_prefix && *a == *a_prefix) {
+ a++;
+ a_prefix++;
+ }
+ return *a_prefix ? NULL : a;
+}
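
strcmp_prefix() returns a pointer just past the prefix when the string starts with it, and NULL otherwise, which is convenient for splitting "name=value" style strings. A small hedged example of both outcomes (the option name is made up for illustration):

#include <stdio.h>
#include "tools-util.h"

int main(void)
{
	/* Match: returns the remainder after the prefix */
	const char *rest = strcmp_prefix("data_replicas=2", "data_replicas=");
	printf("%s\n", rest);				/* prints "2" */

	/* No match: returns NULL */
	printf("%p\n", (void *) strcmp_prefix("foo", "bar"));
	return 0;
}
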
diff --git a/tools-util.h b/tools-util.h
index 09f00ef..1aac56a 100644
--- a/tools-util.h
+++ b/tools-util.h
@@ -5,21 +5,31 @@
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
+#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
+#include <linux/bug.h>
#include <linux/byteorder.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/string.h>
#include <linux/types.h>
+#include "ccan/darray/darray.h"
-#define die(arg, ...) \
-do { \
- fprintf(stderr, arg "\n", ##__VA_ARGS__); \
- exit(EXIT_FAILURE); \
+#define die(arg, ...) \
+do { \
+ fprintf(stderr, arg "\n", ##__VA_ARGS__); \
+ exit(EXIT_FAILURE); \
} while (0)
+#define mprintf(...) \
+({ \
+ char *_str; \
+ asprintf(&_str, __VA_ARGS__); \
+ _str; \
+})
+
static inline void *xcalloc(size_t count, size_t size)
{
void *p = calloc(count, size);
@@ -57,6 +67,38 @@ static inline void xpwrite(int fd, const void *buf, size_t count, off_t offset)
die("write error (ret %zi err %s)", r, strerror(errno));
}
+#define xopenat(_dirfd, _path, ...) \
+({ \
+ int _fd = openat((_dirfd), (_path), __VA_ARGS__); \
+ if (_fd < 0) \
+ die("Error opening %s: %s", (_path), strerror(errno)); \
+ _fd; \
+})
+
+#define xopen(...) xopenat(AT_FDCWD, __VA_ARGS__)
+
+static inline struct stat xfstatat(int dirfd, const char *path, int flags)
+{
+ struct stat stat;
+ if (fstatat(dirfd, path, &stat, flags))
+ die("stat error: %s", strerror(errno));
+ return stat;
+}
+
+static inline struct stat xfstat(int fd)
+{
+ struct stat stat;
+ if (fstat(fd, &stat))
+ die("stat error: %s", strerror(errno));
+ return stat;
+}
+
+#define xioctl(_fd, _nr, ...) \
+do { \
+ if (ioctl((_fd), (_nr), ##__VA_ARGS__)) \
+ die(#_nr " ioctl error: %s", strerror(errno)); \
+} while (0)
+
enum units {
BYTES,
SECTORS,
@@ -91,4 +133,74 @@ struct bcache_handle bcache_fs_open(const char *);
bool ask_yn(void);
+struct range {
+ u64 start;
+ u64 end;
+};
+
+typedef darray(struct range) ranges;
+
+static inline void range_add(ranges *data, u64 offset, u64 size)
+{
+ darray_append(*data, (struct range) {
+ .start = offset,
+ .end = offset + size
+ });
+}
+
+void ranges_sort_merge(ranges *);
+void ranges_roundup(ranges *, unsigned);
+void ranges_rounddown(ranges *, unsigned);
+
+struct hole_iter {
+ ranges r;
+ size_t idx;
+ u64 end;
+};
+
+static inline struct range hole_iter_next(struct hole_iter *iter)
+{
+ struct range r = {
+ .start = iter->idx ? iter->r.item[iter->idx - 1].end : 0,
+ .end = iter->idx < iter->r.size
+ ? iter->r.item[iter->idx].start : iter->end,
+ };
+
+ BUG_ON(r.start > r.end);
+
+ iter->idx++;
+ return r;
+}
+
+#define for_each_hole(_iter, _ranges, _end, _i) \
+ for (_iter = (struct hole_iter) { .r = _ranges, .end = _end }; \
+ (_iter.idx <= _iter.r.size && \
+ (_i = hole_iter_next(&_iter), true));)
+
+#include <linux/fiemap.h>
+
+struct fiemap_iter {
+ struct fiemap f;
+ struct fiemap_extent fe[1024];
+ unsigned idx;
+ int fd;
+};
+
+static inline void fiemap_iter_init(struct fiemap_iter *iter, int fd)
+{
+ memset(iter, 0, sizeof(*iter));
+
+ iter->f.fm_extent_count = ARRAY_SIZE(iter->fe);
+ iter->f.fm_length = FIEMAP_MAX_OFFSET;
+ iter->fd = fd;
+}
+
+struct fiemap_extent fiemap_iter_next(struct fiemap_iter *);
+
+#define fiemap_for_each(fd, iter, extent) \
+ for (fiemap_iter_init(&iter, fd); \
+ (extent = fiemap_iter_next(&iter)).fe_length;)
+
+const char *strcmp_prefix(const char *, const char *);
+
#endif /* _TOOLS_UTIL_H */
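
Taken together, the new fiemap iterator and the range/hole helpers let a tool walk a file's mapped extents and then visit the unmapped gaps between them. A minimal sketch, assuming it is compiled and linked against this tree's tools-util.c; the file name is a placeholder:

#include <fcntl.h>
#include <stdio.h>
#include "tools-util.h"

int main(void)
{
	const char *path = "some-file";		/* placeholder */
	int fd = xopen(path, O_RDONLY);
	u64 size = xfstat(fd).st_size;

	ranges extents = { NULL };
	struct fiemap_iter iter;
	struct fiemap_extent e;

	/* Record every mapped extent as a [start, end) range: */
	fiemap_for_each(fd, iter, e)
		range_add(&extents, e.fe_logical, e.fe_length);

	ranges_sort_merge(&extents);

	/* Walk the holes between extents, up to the end of the file: */
	struct hole_iter hiter;
	struct range hole;

	for_each_hole(hiter, extents, size, hole)
		printf("hole %llu-%llu\n",
		       (unsigned long long) hole.start,
		       (unsigned long long) hole.end);

	darray_free(extents);
	return 0;
}
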