author    Kent Overstreet <kent.overstreet@gmail.com>    2018-01-11 06:41:59 -0500
committer Kent Overstreet <kent.overstreet@gmail.com>    2018-01-11 07:02:08 -0500
commit    4de98a2712764bceb9e0f67b1ac2f2c7862feb77 (patch)
tree      94861af51e19af917d80952dfabeaa7a76901eaa
parent    f8cbede6d18e81c804e62fd7d576310b420dcaac (diff)
Update bcachefs sources to 02ae70070a bcachefs: Allocate new btree roots lazily
-rw-r--r--  .bcachefs_revision                   |    2
-rw-r--r--  cmd_debug.c                          |   16
-rw-r--r--  cmd_device.c                         |    8
-rw-r--r--  cmd_format.c                         |    8
-rw-r--r--  cmd_fsck.c                           |    8
-rw-r--r--  cmd_key.c                            |   26
-rw-r--r--  cmd_migrate.c                        |   18
-rw-r--r--  include/linux/generic-radix-tree.h   |   10
-rw-r--r--  libbcachefs.c                        |    5
-rw-r--r--  libbcachefs/alloc.c                  |  409
-rw-r--r--  libbcachefs/alloc.h                  |    1
-rw-r--r--  libbcachefs/bcachefs.h               |   24
-rw-r--r--  libbcachefs/bcachefs_format.h        |   64
-rw-r--r--  libbcachefs/bkey.h                   |    9
-rw-r--r--  libbcachefs/bkey_methods.c           |    2
-rw-r--r--  libbcachefs/bset.c                   |    3
-rw-r--r--  libbcachefs/btree_cache.h            |    4
-rw-r--r--  libbcachefs/btree_gc.c               |   28
-rw-r--r--  libbcachefs/btree_io.c               |  108
-rw-r--r--  libbcachefs/btree_io.h               |    8
-rw-r--r--  libbcachefs/btree_iter.c             |   25
-rw-r--r--  libbcachefs/btree_iter.h             |    4
-rw-r--r--  libbcachefs/btree_locking.h          |    3
-rw-r--r--  libbcachefs/btree_types.h            |    2
-rw-r--r--  libbcachefs/btree_update_interior.c  |   65
-rw-r--r--  libbcachefs/btree_update_interior.h  |    5
-rw-r--r--  libbcachefs/buckets.c                |   48
-rw-r--r--  libbcachefs/buckets.h                |    8
-rw-r--r--  libbcachefs/buckets_types.h          |    3
-rw-r--r--  libbcachefs/chardev.c                |   14
-rw-r--r--  libbcachefs/debug.c                  |    2
-rw-r--r--  libbcachefs/error.h                  |    3
-rw-r--r--  libbcachefs/extents.c                |    4
-rw-r--r--  libbcachefs/extents.h                |   11
-rw-r--r--  libbcachefs/fifo.h                   |    1
-rw-r--r--  libbcachefs/fs-io.c                  |  284
-rw-r--r--  libbcachefs/fs-ioctl.c               |   28
-rw-r--r--  libbcachefs/fs.c                     |  224
-rw-r--r--  libbcachefs/fs.h                     |    2
-rw-r--r--  libbcachefs/fsck.c                   |   46
-rw-r--r--  libbcachefs/io.c                     |    3
-rw-r--r--  libbcachefs/io_types.h               |    6
-rw-r--r--  libbcachefs/journal.c                |  321
-rw-r--r--  libbcachefs/journal.h                |    4
-rw-r--r--  libbcachefs/journal_types.h          |    6
-rw-r--r--  libbcachefs/migrate.c                |  197
-rw-r--r--  libbcachefs/move.c                   |   81
-rw-r--r--  libbcachefs/move.h                   |   12
-rw-r--r--  libbcachefs/movinggc.c               |    7
-rw-r--r--  libbcachefs/opts.c                   |   32
-rw-r--r--  libbcachefs/opts.h                   |   11
-rw-r--r--  libbcachefs/quota.c                  |  786
-rw-r--r--  libbcachefs/quota.h                  |   48
-rw-r--r--  libbcachefs/quota_types.h            |   36
-rw-r--r--  libbcachefs/super-io.c               |  217
-rw-r--r--  libbcachefs/super-io.h               |   15
-rw-r--r--  libbcachefs/super.c                  |  218
-rw-r--r--  libbcachefs/super.h                  |    3
-rw-r--r--  libbcachefs/tier.c                   |    6
-rw-r--r--  linux/kthread.c                      |    2
60 files changed, 2337 insertions, 1217 deletions
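
The change repeated across nearly every caller in this patch is the bch2_fs_open() calling convention: instead of filling in a struct bch_fs ** out-parameter and returning an error string, it now returns the filesystem pointer directly and encodes failure in the pointer. A minimal sketch of the new caller pattern, inferred from the cmd_fsck.c/cmd_dump.c hunks below rather than copied from the library header:

	/* New convention: errors travel inside the returned pointer via ERR_PTR(). */
	struct bch_fs *c = bch2_fs_open(argv + optind, argc - optind, opts);
	if (IS_ERR(c))
		die("error opening %s: %s", argv[optind], strerror(-PTR_ERR(c)));

	/* ... use the filesystem ... */
	bch2_fs_stop(c);

bch2_read_super() gets the same treatment: it now takes struct bch_opts * and returns an int error code instead of an error string.
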
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 699d6f22..92bf9ad4 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-0b8c5d0fb7b5de6fb99030565cd2d0411da37f2b
+02ae70070acc3bc4740d221efa5ff5425cf6fce5
diff --git a/cmd_debug.c b/cmd_debug.c
index 1a2c1dbd..6e395bab 100644
--- a/cmd_debug.c
+++ b/cmd_debug.c
@@ -80,9 +80,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
int cmd_dump(int argc, char *argv[])
{
struct bch_opts opts = bch2_opts_empty();
- struct bch_fs *c = NULL;
struct bch_dev *ca;
- const char *err;
char *out = NULL;
unsigned i, nr_devices = 0;
bool force = false;
@@ -112,9 +110,9 @@ int cmd_dump(int argc, char *argv[])
if (!out)
die("Please supply output filename");
- err = bch2_fs_open(argv + optind, argc - optind, opts, &c);
- if (err)
- die("error opening %s: %s", argv[optind], err);
+ struct bch_fs *c = bch2_fs_open(argv + optind, argc - optind, opts);
+ if (IS_ERR(c))
+ die("error opening %s: %s", argv[optind], strerror(-PTR_ERR(c)));
down_read(&c->gc_lock);
@@ -258,10 +256,8 @@ static const char * const list_modes[] = {
int cmd_list(int argc, char *argv[])
{
struct bch_opts opts = bch2_opts_empty();
- struct bch_fs *c = NULL;
enum btree_id btree_id = BTREE_ID_EXTENTS;
struct bpos start = POS_MIN, end = POS_MAX;
- const char *err;
u64 inum;
int mode = 0, opt;
@@ -307,9 +303,9 @@ int cmd_list(int argc, char *argv[])
if (optind >= argc)
die("Please supply device(s) to check");
- err = bch2_fs_open(argv + optind, argc - optind, opts, &c);
- if (err)
- die("error opening %s: %s", argv[optind], err);
+ struct bch_fs *c = bch2_fs_open(argv + optind, argc - optind, opts);
+ if (IS_ERR(c))
+ die("error opening %s: %s", argv[optind], strerror(-PTR_ERR(c)));
switch (mode) {
case 0:
diff --git a/cmd_device.c b/cmd_device.c
index 22ab016f..390c48ad 100644
--- a/cmd_device.c
+++ b/cmd_device.c
@@ -528,11 +528,9 @@ int cmd_device_resize(int argc, char *argv[])
} else {
printf("Doing offline resize of %s\n", dev);
- struct bch_fs *c = NULL;
- struct bch_opts opts = bch2_opts_empty();
- const char *err = bch2_fs_open(&dev, 1, opts, &c);
- if (err)
- die("error opening %s: %s", dev, err);
+ struct bch_fs *c = bch2_fs_open(&dev, 1, bch2_opts_empty());
+ if (IS_ERR(c))
+ die("error opening %s: %s", dev, strerror(-PTR_ERR(c)));
struct bch_dev *ca, *resize = NULL;
unsigned i;
diff --git a/cmd_format.c b/cmd_format.c
index 47617660..42e8d1a6 100644
--- a/cmd_format.c
+++ b/cmd_format.c
@@ -328,11 +328,11 @@ int cmd_show_super(int argc, char *argv[])
if (argc)
die("too many arguments");
- const char *err;
+ struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
- err = bch2_read_super(dev, bch2_opts_empty(), &sb);
- if (err)
- die("Error opening %s: %s", dev, err);
+ int ret = bch2_read_super(dev, &opts, &sb);
+ if (ret)
+ die("Error opening %s: %s", dev, strerror(-ret));
bch2_sb_print(sb.sb, print_layout, fields, HUMAN_READABLE);
bch2_free_super(&sb);
diff --git a/cmd_fsck.c b/cmd_fsck.c
index 556a4e1b..6f873b1f 100644
--- a/cmd_fsck.c
+++ b/cmd_fsck.c
@@ -23,8 +23,6 @@ static void usage(void)
int cmd_fsck(int argc, char *argv[])
{
struct bch_opts opts = bch2_opts_empty();
- struct bch_fs *c = NULL;
- const char *err;
int opt;
opt_set(opts, degraded, true);
@@ -56,9 +54,9 @@ int cmd_fsck(int argc, char *argv[])
if (optind >= argc)
die("Please supply device(s) to check");
- err = bch2_fs_open(argv + optind, argc - optind, opts, &c);
- if (err)
- die("error opening %s: %s", argv[optind], err);
+ struct bch_fs *c = bch2_fs_open(argv + optind, argc - optind, opts);
+ if (IS_ERR(c))
+ die("error opening %s: %s", argv[optind], strerror(-PTR_ERR(c)));
bch2_fs_stop(c);
return 0;
diff --git a/cmd_key.c b/cmd_key.c
index 879163f1..e670b508 100644
--- a/cmd_key.c
+++ b/cmd_key.c
@@ -9,16 +9,16 @@
int cmd_unlock(int argc, char *argv[])
{
+ struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
- const char *err;
char *passphrase;
if (argc != 2)
die("Please supply a single device");
- err = bch2_read_super(argv[1], bch2_opts_empty(), &sb);
- if (err)
- die("Error opening %s: %s", argv[1], err);
+ int ret = bch2_read_super(argv[1], &opts, &sb);
+ if (ret)
+ die("Error opening %s: %s", argv[1], strerror(-ret));
passphrase = read_passphrase("Enter passphrase: ");
@@ -32,16 +32,15 @@ int cmd_unlock(int argc, char *argv[])
int cmd_set_passphrase(int argc, char *argv[])
{
struct bch_opts opts = bch2_opts_empty();
- struct bch_fs *c = NULL;
- const char *err;
+ struct bch_fs *c;
if (argc < 2)
die("Please supply one or more devices");
opt_set(opts, nostart, true);
- err = bch2_fs_open(argv + 1, argc - 1, opts, &c);
- if (err)
- die("Error opening %s: %s", argv[1], err);
+ c = bch2_fs_open(argv + 1, argc - 1, opts);
+ if (IS_ERR(c))
+ die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c)));
struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb);
if (!crypt)
@@ -70,16 +69,15 @@ int cmd_set_passphrase(int argc, char *argv[])
int cmd_remove_passphrase(int argc, char *argv[])
{
struct bch_opts opts = bch2_opts_empty();
- struct bch_fs *c = NULL;
- const char *err;
+ struct bch_fs *c;
if (argc < 2)
die("Please supply one or more devices");
opt_set(opts, nostart, true);
- err = bch2_fs_open(argv + 1, argc - 1, opts, &c);
- if (err)
- die("Error opening %s: %s", argv[1], err);
+ c = bch2_fs_open(argv + 1, argc - 1, opts);
+ if (IS_ERR(c))
+ die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c)));
struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb);
if (!crypt)
diff --git a/cmd_migrate.c b/cmd_migrate.c
index d82fee6d..1c449554 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -334,7 +334,8 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
die("error reserving space in new filesystem: %s",
strerror(-ret));
- bch2_check_mark_super(c, extent_i_to_s_c(e), false);
+ bch2_check_mark_super(c, BCH_DATA_USER,
+ bch2_bkey_devs(extent_i_to_s_c(e).s_c));
ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
&res, NULL, NULL, 0);
@@ -734,19 +735,18 @@ int cmd_migrate(int argc, char *argv[])
struct bch_opts opts = bch2_opts_empty();
struct bch_fs *c = NULL;
char *path[1] = { dev.path };
- const char *err;
opt_set(opts, sb, sb_offset);
opt_set(opts, nostart, true);
opt_set(opts, noexcl, true);
- err = bch2_fs_open(path, 1, opts, &c);
- if (err)
- die("Error opening new filesystem: %s", err);
+ c = bch2_fs_open(path, 1, opts);
+ if (IS_ERR(c))
+ die("Error opening new filesystem: %s", strerror(-PTR_ERR(c)));
mark_unreserved_space(c, extents);
- err = bch2_fs_start(c);
+ const char *err = bch2_fs_start(c);
if (err)
die("Error starting new filesystem: %s", err);
@@ -758,9 +758,9 @@ int cmd_migrate(int argc, char *argv[])
opt_set(opts, nostart, false);
opt_set(opts, nochanges, true);
- err = bch2_fs_open(path, 1, opts, &c);
- if (err)
- die("Error opening new filesystem: %s", err);
+ c = bch2_fs_open(path, 1, opts);
+ if (IS_ERR(c))
+ die("Error opening new filesystem: %s", strerror(-PTR_ERR(c)));
bch2_fs_stop(c);
printf("fsck complete\n");
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 6ea2deb2..7f637e17 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -99,11 +99,11 @@ struct genradix_iter {
size_t pos;
};
-static inline void genradix_iter_init(struct genradix_iter *iter)
-{
- iter->offset = 0;
- iter->pos = 0;
-}
+#define genradix_iter_init(_radix, _idx) \
+ ((struct genradix_iter) { \
+ .pos = (_idx), \
+ .offset = __genradix_idx_to_offset((_radix), (_idx)),\
+ })
void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
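
genradix_iter_init() changes from a helper that always zeroed the iterator into a macro that positions an iterator at an arbitrary index. A small sketch of what the new macro produces (the radix variable here is hypothetical; only the macro itself comes from this patch):

	/* 'my_radix' is a hypothetical genradix declared elsewhere. */
	struct genradix_iter iter = genradix_iter_init(&my_radix, 42);

	/* Equivalent to:
	 *   iter.pos    = 42;
	 *   iter.offset = __genradix_idx_to_offset(&my_radix, 42);
	 * i.e. iteration can now start at index 42 instead of always at zero.
	 */
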
diff --git a/libbcachefs.c b/libbcachefs.c
index 1481ef38..3632e30d 100644
--- a/libbcachefs.c
+++ b/libbcachefs.c
@@ -454,6 +454,11 @@ static void bch2_sb_print_replicas(struct bch_sb *sb, struct bch_sb_field *f,
}
}
+static void bch2_sb_print_quota(struct bch_sb *sb, struct bch_sb_field *f,
+ enum units units)
+{
+}
+
typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
struct bch_sb_field_ops {
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index ec02adc0..f7ff8027 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -55,6 +55,8 @@
#include "bcachefs.h"
#include "alloc.h"
+#include "btree_cache.h"
+#include "btree_io.h"
#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
@@ -290,9 +292,6 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
unsigned i;
int ret;
- if (!c->btree_roots[BTREE_ID_ALLOC].b)
- return 0;
-
for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
bch2_alloc_read_key(c, k);
bch2_btree_iter_cond_resched(&iter);
@@ -401,7 +400,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
return ret;
}
-static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
+static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca)
{
struct btree_iter iter;
unsigned long bucket;
@@ -412,7 +411,7 @@ static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_s
down_read(&ca->bucket_lock);
for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) {
- ret = __bch2_alloc_write_key(c, ca, bucket, &iter, journal_seq);
+ ret = __bch2_alloc_write_key(c, ca, bucket, &iter, NULL);
if (ret)
break;
@@ -537,7 +536,8 @@ static void bch2_prio_timer_init(struct bch_fs *c, int rw)
static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
size_t bucket)
{
- if (expensive_debug_checks(c)) {
+ if (expensive_debug_checks(c) &&
+ test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) {
size_t iter;
long i;
unsigned j;
@@ -692,7 +692,7 @@ static inline int bucket_alloc_cmp(alloc_heap *h,
return (l.key > r.key) - (l.key < r.key);
}
-static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
+static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e;
@@ -740,7 +740,7 @@ static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
bch2_invalidate_one_bucket(c, ca, e.bucket);
}
-static void invalidate_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
+static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets = bucket_array(ca);
struct bucket_mark m;
@@ -762,7 +762,7 @@ static void invalidate_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
}
}
-static void invalidate_buckets_random(struct bch_fs *c, struct bch_dev *ca)
+static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets = bucket_array(ca);
struct bucket_mark m;
@@ -782,21 +782,21 @@ static void invalidate_buckets_random(struct bch_fs *c, struct bch_dev *ca)
}
}
-static void invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
+static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
{
ca->inc_gen_needs_gc = 0;
ca->inc_gen_really_needs_gc = 0;
switch (ca->mi.replacement) {
- case CACHE_REPLACEMENT_LRU:
- invalidate_buckets_lru(c, ca);
- break;
- case CACHE_REPLACEMENT_FIFO:
- invalidate_buckets_fifo(c, ca);
- break;
- case CACHE_REPLACEMENT_RANDOM:
- invalidate_buckets_random(c, ca);
- break;
+ case CACHE_REPLACEMENT_LRU:
+ find_reclaimable_buckets_lru(c, ca);
+ break;
+ case CACHE_REPLACEMENT_FIFO:
+ find_reclaimable_buckets_fifo(c, ca);
+ break;
+ case CACHE_REPLACEMENT_RANDOM:
+ find_reclaimable_buckets_random(c, ca);
+ break;
}
}
@@ -807,79 +807,119 @@ static int size_t_cmp(const void *_l, const void *_r)
return (*l > *r) - (*l < *r);
}
+static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca)
+{
+ BUG_ON(ca->free_inc.front);
+
+ spin_lock(&c->freelist_lock);
+ sort(ca->free_inc.data,
+ ca->free_inc.back,
+ sizeof(ca->free_inc.data[0]),
+ size_t_cmp, NULL);
+ spin_unlock(&c->freelist_lock);
+}
+
static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
- u64 *journal_seq)
+ u64 *journal_seq, size_t nr)
{
struct btree_iter iter;
- unsigned nr_invalidated = 0;
- size_t b, i;
int ret = 0;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
BTREE_ITER_INTENT);
- fifo_for_each_entry(b, &ca->free_inc, i) {
+ /*
+ * XXX: if ca->nr_invalidated != 0, just return if we'd block doing the
+ * btree update or journal_res_get
+ */
+ while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) {
+ size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated);
+
ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq);
if (ret)
break;
- nr_invalidated++;
+ ca->nr_invalidated++;
}
bch2_btree_iter_unlock(&iter);
- return nr_invalidated ?: ret;
+ return ret;
}
-/*
- * Given an invalidated, ready to use bucket: issue a discard to it if enabled,
- * then add it to the freelist, waiting until there's room if necessary:
- */
-static void discard_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca,
- long bucket)
+static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
{
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca, bucket),
- ca->mi.bucket_size, GFP_NOIO, 0);
+ unsigned i;
- while (1) {
- bool pushed = false;
- unsigned i;
+ /*
+ * Don't remove from free_inc until after it's added to
+ * freelist, so gc can find it:
+ */
+ spin_lock(&c->freelist_lock);
+ for (i = 0; i < RESERVE_NR; i++)
+ if (fifo_push(&ca->free[i], bucket)) {
+ fifo_pop(&ca->free_inc, bucket);
+ --ca->nr_invalidated;
+ closure_wake_up(&c->freelist_wait);
+ spin_unlock(&c->freelist_lock);
+ return true;
+ }
+ spin_unlock(&c->freelist_lock);
- set_current_state(TASK_INTERRUPTIBLE);
+ return false;
+}
- /*
- * Don't remove from free_inc until after it's added to
- * freelist, so gc can find it:
- */
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++)
- if (fifo_push(&ca->free[i], bucket)) {
- fifo_pop(&ca->free_inc, bucket);
- closure_wake_up(&c->freelist_wait);
- pushed = true;
- break;
- }
- spin_unlock(&c->freelist_lock);
+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
+{
+ int ret = 0;
- if (pushed)
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (__push_invalidated_bucket(c, ca, bucket))
break;
- if (kthread_should_stop())
+ if ((current->flags & PF_KTHREAD) &&
+ kthread_should_stop()) {
+ ret = -1;
break;
+ }
schedule();
try_to_freeze();
}
__set_current_state(TASK_RUNNING);
+ return ret;
+}
+
+/*
+ * Given an invalidated, ready to use bucket: issue a discard to it if enabled,
+ * then add it to the freelist, waiting until there's room if necessary:
+ */
+static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
+{
+ while (ca->nr_invalidated) {
+ size_t bucket = fifo_peek(&ca->free_inc);
+
+ BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated);
+
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca, bucket),
+ ca->mi.bucket_size, GFP_NOIO, 0);
+
+ if (push_invalidated_bucket(c, ca, bucket))
+ return -1;
+ }
+
+ return 0;
}
/**
* bch_allocator_thread - move buckets from free_inc to reserves
*
- * The free_inc FIFO is populated by invalidate_buckets(), and
+ * The free_inc FIFO is populated by find_reclaimable_buckets(), and
* the reserves are depleted by bucket allocation. When we run out
* of free_inc, try to invalidate some buckets and write out
* prios and gens.
@@ -889,43 +929,36 @@ static int bch2_allocator_thread(void *arg)
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
u64 journal_seq;
- size_t bucket;
int ret;
set_freezable();
while (1) {
while (1) {
- while (ca->nr_invalidated) {
- BUG_ON(fifo_empty(&ca->free_inc));
-
- bucket = fifo_peek(&ca->free_inc);
- discard_invalidated_bucket(c, ca, bucket);
- if (kthread_should_stop())
- return 0;
- --ca->nr_invalidated;
- }
+ ret = discard_invalidated_buckets(c, ca);
+ if (ret)
+ return 0;
if (fifo_empty(&ca->free_inc))
break;
journal_seq = 0;
- ret = bch2_invalidate_free_inc(c, ca, &journal_seq);
- if (ret < 0)
+ ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX);
+ if (ret)
return 0;
- ca->nr_invalidated = ret;
-
- if (ca->nr_invalidated == fifo_used(&ca->free_inc)) {
- ca->alloc_thread_started = true;
- bch2_alloc_write(c, ca, &journal_seq);
- }
-
if (ca->allocator_invalidating_data)
- bch2_journal_flush_seq(&c->journal, journal_seq);
+ ret = bch2_journal_flush_seq(&c->journal, journal_seq);
else if (ca->allocator_journal_seq_flush)
- bch2_journal_flush_seq(&c->journal,
+ ret = bch2_journal_flush_seq(&c->journal,
ca->allocator_journal_seq_flush);
+
+ /*
+ * journal error - buckets haven't actually been
+ * invalidated, can't discard them:
+ */
+ if (ret)
+ return 0;
}
/* Reset front/back so we can easily sort fifo entries later: */
@@ -947,7 +980,7 @@ static int bch2_allocator_thread(void *arg)
* another cache tier
*/
- invalidate_buckets(c, ca);
+ find_reclaimable_buckets(c, ca);
trace_alloc_batch(ca, fifo_used(&ca->free_inc),
ca->free_inc.size);
@@ -970,14 +1003,7 @@ static int bch2_allocator_thread(void *arg)
}
up_read(&c->gc_lock);
- BUG_ON(ca->free_inc.front);
-
- spin_lock(&c->freelist_lock);
- sort(ca->free_inc.data,
- ca->free_inc.back,
- sizeof(ca->free_inc.data[0]),
- size_t_cmp, NULL);
- spin_unlock(&c->freelist_lock);
+ sort_free_inc(c, ca);
/*
* free_inc is now full of newly-invalidated buckets: next,
@@ -1037,51 +1063,27 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
return ob;
}
-/*
- * XXX: allocation on startup is still sketchy. There is insufficient
- * synchronization for bch2_bucket_alloc_startup() to work correctly after
- * bch2_alloc_write() has been called, and we aren't currently doing anything
- * to guarantee that this won't happen.
- *
- * Even aside from that, it's really difficult to avoid situations where on
- * startup we write out a pointer to a freshly allocated bucket before the
- * corresponding gen - when we're still digging ourself out of the "i need to
- * allocate to write bucket gens, but i need to write bucket gens to allocate"
- * hole.
- *
- * Fortunately, bch2_btree_mark_key_initial() will detect and repair this
- * easily enough...
- */
-static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
+/* _only_ for allocating the journal and btree roots on a brand new fs: */
+int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
ssize_t b;
- if (!down_read_trylock(&c->gc_lock))
- return -1;
-
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
- up_read(&c->gc_lock);
- return -1;
- }
-
- spin_unlock(&c->freelist_lock);
-
- down_read(&ca->bucket_lock);
+ rcu_read_lock();
buckets = bucket_array(ca);
- spin_lock(&c->freelist_lock);
-
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
- if (is_startup_available_bucket(buckets->b[b].mark) &&
- bch2_mark_alloc_bucket_startup(c, ca, b)) {
+ if (is_available_bucket(buckets->b[b].mark)) {
+ bch2_mark_alloc_bucket(c, ca, b, true,
+ gc_pos_alloc(c, NULL),
+ BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+ BCH_BUCKET_MARK_GC_LOCK_HELD);
set_bit(b, ca->buckets_dirty);
goto success;
}
b = -1;
success:
- up_read(&ca->bucket_lock);
- up_read(&c->gc_lock);
+ rcu_read_unlock();
return b;
}
@@ -1150,8 +1152,7 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
break;
}
- if (unlikely(!ca->alloc_thread_started) &&
- (reserve == RESERVE_ALLOC) &&
+ if (unlikely(test_bit(BCH_FS_BRAND_NEW_FS, &c->flags)) &&
(bucket = bch2_bucket_alloc_startup(c, ca)) >= 0)
goto out;
@@ -1858,6 +1859,172 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
return 0;
}
+static int __bch2_fs_allocator_start(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ size_t bu, i, devs_have_enough = 0;
+ unsigned dev_iter;
+ u64 journal_seq = 0;
+ bool invalidating_data = false;
+ int ret = 0;
+
+ if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+ return -1;
+
+ /* Scan for buckets that are already invalidated: */
+ for_each_rw_member(ca, c, dev_iter) {
+ struct btree_iter iter;
+ struct bucket_mark m;
+ struct bkey_s_c k;
+
+ for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) {
+ if (k.k->type != BCH_ALLOC)
+ continue;
+
+ bu = k.k->p.offset;
+ m = READ_ONCE(bucket(ca, bu)->mark);
+
+ if (!is_available_bucket(m) || m.cached_sectors)
+ continue;
+
+ bch2_mark_alloc_bucket(c, ca, bu, true,
+ gc_pos_alloc(c, NULL),
+ BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+ BCH_BUCKET_MARK_GC_LOCK_HELD);
+
+ fifo_push(&ca->free_inc, bu);
+ ca->nr_invalidated++;
+
+ if (fifo_full(&ca->free_inc))
+ break;
+ }
+ bch2_btree_iter_unlock(&iter);
+ }
+
+ /* did we find enough buckets? */
+ for_each_rw_member(ca, c, dev_iter)
+ devs_have_enough += (fifo_used(&ca->free_inc) >=
+ ca->free[RESERVE_BTREE].size);
+
+ if (devs_have_enough >= c->opts.metadata_replicas)
+ return 0;
+
+ /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */
+ for_each_rw_member(ca, c, dev_iter)
+ discard_invalidated_buckets(c, ca);
+
+ for_each_rw_member(ca, c, dev_iter) {
+ BUG_ON(!fifo_empty(&ca->free_inc));
+ ca->free_inc.front = ca->free_inc.back = 0;
+
+ find_reclaimable_buckets(c, ca);
+ sort_free_inc(c, ca);
+
+ invalidating_data |= ca->allocator_invalidating_data;
+
+ fifo_for_each_entry(bu, &ca->free_inc, i)
+ if (!fifo_push(&ca->free[RESERVE_BTREE], bu))
+ break;
+ }
+
+ /*
+ * We're moving buckets to freelists _before_ they've been marked as
+ * invalidated on disk - we have to so that we can allocate new btree
+ * nodes to mark them as invalidated on disk.
+ *
+ * However, we can't _write_ to any of these buckets yet - they might
+ * have cached data in them, which is live until they're marked as
+ * invalidated on disk:
+ */
+ if (invalidating_data)
+ set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
+
+ /*
+ * XXX: it's possible for this to deadlock waiting on journal reclaim,
+ * since we're holding btree writes. What then?
+ */
+
+ for_each_rw_member(ca, c, dev_iter) {
+ ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
+ ca->free[RESERVE_BTREE].size);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ return ret;
+ }
+ }
+
+ if (invalidating_data) {
+ ret = bch2_journal_flush_seq(&c->journal, journal_seq);
+ if (ret)
+ return ret;
+ }
+
+ for_each_rw_member(ca, c, dev_iter)
+ while (ca->nr_invalidated) {
+ BUG_ON(!fifo_pop(&ca->free_inc, bu));
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca, bu),
+ ca->mi.bucket_size, GFP_NOIO, 0);
+ ca->nr_invalidated--;
+ }
+
+ set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
+
+ /* now flush dirty btree nodes: */
+ if (invalidating_data) {
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+
+ clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
+again:
+ rcu_read_lock();
+ for_each_cached_btree(b, c, tbl, i, pos)
+ if (btree_node_dirty(b) && (!b->written || b->level)) {
+ rcu_read_unlock();
+ six_lock_read(&b->lock);
+ bch2_btree_node_write(c, b, NULL, SIX_LOCK_read);
+ six_unlock_read(&b->lock);
+ goto again;
+ }
+ rcu_read_unlock();
+ }
+
+ return 0;
+}
+
+int bch2_fs_allocator_start(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ int ret;
+
+ down_read(&c->gc_lock);
+ ret = __bch2_fs_allocator_start(c);
+ up_read(&c->gc_lock);
+
+ if (ret)
+ return ret;
+
+ for_each_rw_member(ca, c, i) {
+ ret = bch2_dev_allocator_start(ca);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ return ret;
+ }
+ }
+
+ for_each_rw_member(ca, c, i) {
+ ret = bch2_alloc_write(c, ca);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
void bch2_fs_allocator_init(struct bch_fs *c)
{
struct open_bucket *ob;
diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h
index ee771ee1..1b9d960b 100644
--- a/libbcachefs/alloc.h
+++ b/libbcachefs/alloc.h
@@ -118,6 +118,7 @@ static inline void writepoint_init(struct write_point *wp,
wp->type = type;
}
+int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_init(struct bch_fs *);
extern const struct bkey_ops bch2_bkey_alloc_ops;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 02e38410..78c427fa 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -281,11 +281,9 @@ do { \
#include "clock_types.h"
#include "journal_types.h"
#include "keylist_types.h"
+#include "quota_types.h"
#include "super_types.h"
-/* 256k, in sectors */
-#define BTREE_NODE_SIZE_MAX 512
-
/*
* Number of nodes we might have to allocate in a worst case btree split
* operation - we split all the way up to the root, then allocate a new root.
@@ -380,7 +378,6 @@ struct bch_dev {
alloc_fifo free_inc;
spinlock_t freelist_lock;
unsigned nr_invalidated;
- bool alloc_thread_started;
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
@@ -423,18 +420,28 @@ struct bch_dev {
* won't automatically reattach).
*/
enum {
+ /* startup: */
+ BCH_FS_BRAND_NEW_FS,
BCH_FS_ALLOC_READ_DONE,
+ BCH_FS_ALLOCATOR_STARTED,
BCH_FS_INITIAL_GC_DONE,
+ BCH_FS_FSCK_DONE,
+
+ /* shutdown: */
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
BCH_FS_GC_STOPPING,
+
+ /* errors: */
+ BCH_FS_ERROR,
BCH_FS_GC_FAILURE,
+
+ /* misc: */
BCH_FS_BDEV_MOUNTED,
- BCH_FS_ERROR,
BCH_FS_FSCK_FIXED_ERRORS,
- BCH_FS_FSCK_DONE,
BCH_FS_FIXED_GENS,
BCH_FS_REBUILD_REPLICAS,
+ BCH_FS_HOLD_BTREE_WRITES,
};
struct btree_debug {
@@ -517,7 +524,7 @@ struct bch_fs {
struct mutex sb_lock;
/* BTREE CACHE */
- struct bio_set btree_read_bio;
+ struct bio_set btree_bio;
struct btree_root btree_roots[BTREE_ID_NR];
bool btree_roots_dirty;
@@ -665,6 +672,9 @@ struct bch_fs {
unsigned writeback_pages_max;
atomic_long_t nr_inodes;
+ /* QUOTAS */
+ struct bch_memquota_type quotas[QTYP_NR];
+
/* DEBUG JUNK */
struct dentry *debug;
struct btree_debug btree_debug[BTREE_ID_NR];
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index d65b5e66..cb9e450b 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -606,11 +606,13 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
BCH_INODE_FIELD(bi_generation, 32) \
BCH_INODE_FIELD(bi_dev, 32) \
BCH_INODE_FIELD(bi_data_checksum, 8) \
- BCH_INODE_FIELD(bi_compression, 8)
+ BCH_INODE_FIELD(bi_compression, 8) \
+ BCH_INODE_FIELD(bi_project, 32)
#define BCH_INODE_FIELDS_INHERIT() \
BCH_INODE_FIELD(bi_data_checksum) \
- BCH_INODE_FIELD(bi_compression)
+ BCH_INODE_FIELD(bi_compression) \
+ BCH_INODE_FIELD(bi_project)
enum {
/*
@@ -737,6 +739,36 @@ struct bch_alloc {
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(alloc, BCH_ALLOC);
+/* Quotas: */
+
+enum {
+ BCH_QUOTA = 128,
+};
+
+enum quota_types {
+ QTYP_USR = 0,
+ QTYP_GRP = 1,
+ QTYP_PRJ = 2,
+ QTYP_NR = 3,
+};
+
+enum quota_counters {
+ Q_SPC = 0,
+ Q_INO = 1,
+ Q_COUNTERS = 2,
+};
+
+struct bch_quota_counter {
+ __le64 hardlimit;
+ __le64 softlimit;
+};
+
+struct bch_quota {
+ struct bch_val v;
+ struct bch_quota_counter c[Q_COUNTERS];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(quota, BCH_QUOTA);
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -749,7 +781,8 @@ struct bch_sb_field {
x(journal, 0) \
x(members, 1) \
x(crypt, 2) \
- x(replicas, 3)
+ x(replicas, 3) \
+ x(quota, 4)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@@ -883,6 +916,23 @@ struct bch_sb_field_replicas {
struct bch_replicas_entry entries[0];
};
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+ __le32 timelimit;
+ __le32 warnlimit;
+};
+
+struct bch_sb_quota_type {
+ __le64 flags;
+ struct bch_sb_quota_counter c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+ struct bch_sb_field field;
+ struct bch_sb_quota_type q[QTYP_NR];
+} __attribute__((packed, aligned(8)));
+
/* Superblock: */
/*
@@ -986,6 +1036,11 @@ LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56);
LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57);
+LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58);
+LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59);
+LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60);
+
+/* 60-64 unused */
LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8);
@@ -1181,7 +1236,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
DEF_BTREE_ID(INODES, 1, "inodes") \
DEF_BTREE_ID(DIRENTS, 2, "dirents") \
DEF_BTREE_ID(XATTRS, 3, "xattrs") \
- DEF_BTREE_ID(ALLOC, 4, "alloc")
+ DEF_BTREE_ID(ALLOC, 4, "alloc") \
+ DEF_BTREE_ID(QUOTAS, 5, "quotas")
#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 89697956..f665e2e1 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -7,6 +7,10 @@
#include "util.h"
#include "vstructs.h"
+#ifdef CONFIG_X86_64
+#define HAVE_BCACHEFS_COMPILED_UNPACK 1
+#endif
+
void bch2_to_binary(char *, const u64 *, unsigned);
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
@@ -381,8 +385,7 @@ static inline u64 bkey_field_max(const struct bkey_format *f,
: U64_MAX;
}
-#ifdef CONFIG_X86_64
-#define HAVE_BCACHEFS_COMPILED_UNPACK 1
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
int bch2_compile_bkey_format(const struct bkey_format *, void *);
@@ -583,6 +586,8 @@ BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC);
+BKEY_VAL_ACCESSORS(quota, BCH_QUOTA);
+
/* byte order helpers */
#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 1736a483..3b3a09eb 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -7,6 +7,7 @@
#include "error.h"
#include "extents.h"
#include "inode.h"
+#include "quota.h"
#include "xattr.h"
const struct bkey_ops *bch2_bkey_ops[] = {
@@ -15,6 +16,7 @@ const struct bkey_ops *bch2_bkey_ops[] = {
[BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops,
[BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops,
[BKEY_TYPE_ALLOC] = &bch2_bkey_alloc_ops,
+ [BKEY_TYPE_QUOTAS] = &bch2_bkey_quota_ops,
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
};
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 10f3f3f3..02be5bb4 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -1550,9 +1550,6 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
__bch2_btree_node_iter_init(iter, is_extents);
- //if (bkey_cmp(search, b->curr_max_key) > 0)
- // return;
-
switch (bch2_bkey_pack_pos_lossy(&p, search, b)) {
case BKEY_PACK_POS_EXACT:
packed_search = &p;
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index 46d536eb..e021d6e9 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -45,8 +45,8 @@ static inline bool btree_node_hashed(struct btree *b)
}
#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
- for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \
- &(_c)->btree_cache_table), \
+ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \
+ &(_c)->btree_cache.table), \
_iter = 0; _iter < (_tbl)->size; _iter++) \
rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 7d1be86f..9f1071e5 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -148,23 +148,24 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
{
enum bch_data_type data_type = type == BKEY_TYPE_BTREE
? BCH_DATA_BTREE : BCH_DATA_USER;
+ struct bch_devs_list devs = bch2_bkey_devs(k);
int ret = 0;
+ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+ fsck_err_on(!bch2_sb_has_replicas(c, data_type, devs), c,
+ "superblock not marked as containing replicas (type %u)",
+ data_type)) {
+ ret = bch2_check_mark_super(c, data_type, devs);
+ if (ret)
+ return ret;
+ }
+
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED: {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
- if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
- "superblock not marked as containing replicas (type %u)",
- data_type)) {
- ret = bch2_check_mark_super(c, e, data_type);
- if (ret)
- return ret;
- }
-
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t b = PTR_BUCKET_NR(ca, ptr);
@@ -284,7 +285,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
mutex_lock(&c->btree_root_lock);
b = c->btree_roots[btree_id].b;
- bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
+ if (!btree_node_fake(b))
+ bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
gc_pos_set(c, gc_pos_btree_root(b->btree_id));
mutex_unlock(&c->btree_root_lock);
@@ -991,8 +993,10 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
if (!c->btree_roots[id].b)
return 0;
- ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE,
- bkey_i_to_s_c(&c->btree_roots[id].b->key));
+ b = c->btree_roots[id].b;
+ if (!btree_node_fake(b))
+ ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE,
+ bkey_i_to_s_c(&b->key));
if (ret)
return ret;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 87a8ddf9..3f87e91e 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1352,7 +1352,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
return;
}
- bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
+ bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->start_time = local_clock();
@@ -1438,9 +1438,9 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
}
static void bch2_btree_node_write_error(struct bch_fs *c,
- struct bch_write_bio *wbio)
+ struct btree_write_bio *wbio)
{
- struct btree *b = wbio->bio.bi_private;
+ struct btree *b = wbio->wbio.bio.bi_private;
struct closure *cl = wbio->cl;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
@@ -1473,7 +1473,7 @@ retry:
new_key = bkey_i_to_extent(&tmp.k);
e = extent_i_to_s(new_key);
extent_for_each_ptr_backwards(e, ptr)
- if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
+ if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev))
bch2_extent_drop_ptr(e, ptr);
if (!bch2_extent_nr_ptrs(e.c))
@@ -1486,7 +1486,7 @@ retry:
goto err;
out:
bch2_btree_iter_unlock(&iter);
- bio_put(&wbio->bio);
+ bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b);
if (cl)
closure_put(cl);
@@ -1511,17 +1511,46 @@ void bch2_btree_write_error_work(struct work_struct *work)
if (!bio)
break;
- bch2_btree_node_write_error(c, to_wbio(bio));
+ bch2_btree_node_write_error(c,
+ container_of(bio, struct btree_write_bio, wbio.bio));
}
}
+static void btree_node_write_work(struct work_struct *work)
+{
+ struct btree_write_bio *wbio =
+ container_of(work, struct btree_write_bio, work);
+ struct closure *cl = wbio->cl;
+ struct bch_fs *c = wbio->wbio.c;
+ struct btree *b = wbio->wbio.bio.bi_private;
+
+ btree_bounce_free(c,
+ wbio->wbio.order,
+ wbio->wbio.used_mempool,
+ wbio->data);
+
+ if (wbio->wbio.failed.nr) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&c->btree_write_error_lock, flags);
+ bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+
+ queue_work(c->wq, &c->btree_write_error_work);
+ return;
+ }
+
+ bio_put(&wbio->wbio.bio);
+ btree_node_write_done(c, b);
+ if (cl)
+ closure_put(cl);
+}
+
static void btree_node_write_endio(struct bio *bio)
{
- struct btree *b = bio->bi_private;
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_write_bio *orig = parent ?: wbio;
- struct closure *cl = !wbio->split ? wbio->cl : NULL;
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
unsigned long flags;
@@ -1542,27 +1571,13 @@ static void btree_node_write_endio(struct bio *bio)
if (parent) {
bio_put(bio);
bio_endio(&parent->bio);
- return;
- }
-
- btree_bounce_free(c,
- wbio->order,
- wbio->used_mempool,
- wbio->data);
-
- if (wbio->failed.nr) {
- spin_lock_irqsave(&c->btree_write_error_lock, flags);
- bio_list_add(&c->btree_write_error_list, &wbio->bio);
- spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+ } else {
+ struct btree_write_bio *wb =
+ container_of(orig, struct btree_write_bio, wbio);
- queue_work(c->wq, &c->btree_write_error_work);
- return;
+ INIT_WORK(&wb->work, btree_node_write_work);
+ schedule_work(&wb->work);
}
-
- bio_put(bio);
- btree_node_write_done(c, b);
- if (cl)
- closure_put(cl);
}
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
@@ -1586,7 +1601,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
struct closure *parent,
enum six_lock_type lock_type_held)
{
- struct bch_write_bio *wbio;
+ struct btree_write_bio *wbio;
struct bset_tree *t;
struct bset *i;
struct btree_node *bn = NULL;
@@ -1602,6 +1617,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
unsigned long old, new;
void *data;
+ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+ return;
+
/*
* We may only have a read lock on the btree node - the dirty bit is our
* "lock" against racing with other threads that may be trying to start
@@ -1631,6 +1649,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
new ^= (1 << BTREE_NODE_write_idx);
} while (cmpxchg_acquire(&b->flags, old, new) != old);
+ BUG_ON(btree_node_fake(b));
BUG_ON(!list_empty(&b->write_blocked));
BUG_ON((b->will_make_reachable != NULL) != !b->written);
@@ -1763,21 +1782,22 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
trace_btree_write(b, bytes_to_write, sectors_to_write);
- wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
- wbio->cl = parent;
- wbio->failed.nr = 0;
- wbio->order = order;
- wbio->used_mempool = used_mempool;
- wbio->data = data;
- wbio->bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
- wbio->bio.bi_iter.bi_size = sectors_to_write << 9;
- wbio->bio.bi_end_io = btree_node_write_endio;
- wbio->bio.bi_private = b;
+ wbio = container_of(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->btree_bio),
+ struct btree_write_bio, wbio.bio);
+ wbio_init(&wbio->wbio.bio);
+ wbio->data = data;
+ wbio->cl = parent;
+ wbio->wbio.order = order;
+ wbio->wbio.used_mempool = used_mempool;
+ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
+ wbio->wbio.bio.bi_iter.bi_size = sectors_to_write << 9;
+ wbio->wbio.bio.bi_end_io = btree_node_write_endio;
+ wbio->wbio.bio.bi_private = b;
if (parent)
closure_get(parent);
- bch2_bio_map(&wbio->bio, data);
+ bch2_bio_map(&wbio->wbio.bio, data);
/*
* If we're appending to a leaf node, we don't technically need FUA -
@@ -1802,7 +1822,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
b->written += sectors_to_write;
- bch2_submit_wbio_replicas(wbio, c, BCH_DATA_BTREE, &k.key);
+ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key);
return;
err:
set_btree_node_noevict(b);
@@ -1905,11 +1925,7 @@ void bch2_btree_verify_flushed(struct bch_fs *c)
unsigned i;
rcu_read_lock();
- tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
- &c->btree_cache.table);
-
- for (i = 0; i < tbl->size; i++)
- rht_for_each_entry_rcu(b, pos, tbl, i, hash)
- BUG_ON(btree_node_dirty(b));
+ for_each_cached_btree(b, c, tbl, i, pos)
+ BUG_ON(btree_node_dirty(b));
rcu_read_unlock();
}
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 61165a63..c8417ac3 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -2,6 +2,7 @@
#define _BCACHEFS_BTREE_IO_H
#include "extents.h"
+#include "io_types.h"
struct bch_fs;
struct btree_write;
@@ -17,6 +18,13 @@ struct btree_read_bio {
struct bio bio;
};
+struct btree_write_bio {
+ struct closure *cl;
+ void *data;
+ struct work_struct work;
+ struct bch_write_bio wbio;
+};
+
static inline void btree_node_io_unlock(struct btree *b)
{
EBUG_ON(!btree_node_write_in_flight(b));
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 0b505a73..ee463f36 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -202,21 +202,20 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
/* Btree iterator locking: */
-
static void btree_iter_drop_extra_locks(struct btree_iter *iter)
{
unsigned l;
while (iter->nodes_locked &&
(l = __fls(iter->nodes_locked)) > iter->locks_want) {
- if (!btree_node_locked(iter, l))
- panic("l %u nodes_locked %u\n", l, iter->nodes_locked);
-
if (l > iter->level) {
btree_node_unlock(iter, l);
- } else if (btree_node_intent_locked(iter, l)) {
- six_lock_downgrade(&iter->nodes[l]->lock);
- iter->nodes_intent_locked ^= 1 << l;
+ } else {
+ if (btree_node_intent_locked(iter, l)) {
+ six_lock_downgrade(&iter->nodes[l]->lock);
+ iter->nodes_intent_locked ^= 1 << l;
+ }
+ break;
}
}
}
@@ -861,7 +860,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
i < iter->locks_want && iter->nodes[i];
i++)
if (!bch2_btree_node_relock(iter, i)) {
- while (iter->nodes[iter->level] &&
+ while (iter->level < BTREE_MAX_DEPTH &&
+ iter->nodes[iter->level] &&
iter->level + 1 < iter->locks_want)
btree_iter_up(iter);
break;
@@ -872,7 +872,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
* If the current node isn't locked, go up until we have a locked node
* or run out of nodes:
*/
- while (iter->nodes[iter->level] &&
+ while (iter->level < BTREE_MAX_DEPTH &&
+ iter->nodes[iter->level] &&
!(is_btree_node(iter, iter->level) &&
bch2_btree_node_relock(iter, iter->level) &&
btree_iter_pos_cmp(iter->pos,
@@ -884,7 +885,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
* If we've got a btree node locked (i.e. we aren't about to relock the
* root) - advance its node iterator if necessary:
*/
- if (iter->nodes[iter->level]) {
+ if (iter->level < BTREE_MAX_DEPTH &&
+ iter->nodes[iter->level]) {
struct bkey_s_c k;
while ((k = __btree_iter_peek_all(iter)).k &&
@@ -956,7 +958,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
btree_iter_up(iter);
- if (!iter->nodes[iter->level])
+ if (iter->level == BTREE_MAX_DEPTH ||
+ !iter->nodes[iter->level])
return NULL;
/* parent node usually won't be locked: redo traversal if necessary */
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index a7fdba82..eb196a3a 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -50,10 +50,8 @@ struct btree_iter {
* always fail (but since freeing a btree node takes a write lock on the
* node, which increments the node's lock seq, that's not actually
* necessary in that example).
- *
- * One extra slot for a sentinel NULL:
*/
- struct btree *nodes[BTREE_MAX_DEPTH + 1];
+ struct btree *nodes[BTREE_MAX_DEPTH];
struct btree_node_iter node_iters[BTREE_MAX_DEPTH];
/*
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index acfe5b59..ca2992ba 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -92,6 +92,7 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
int lock_type = btree_node_locked_type(iter, level);
EBUG_ON(!level && iter->flags & BTREE_ITER_UPTODATE);
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
if (lock_type != BTREE_NODE_UNLOCKED)
six_unlock_type(&iter->nodes[level]->lock, lock_type);
@@ -106,6 +107,8 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
struct btree_iter *iter,
enum six_lock_type type)
{
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+
return likely(six_trylock_type(&b->lock, type)) ||
__bch2_btree_node_lock(b, pos, level, iter, type);
}
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index f0e6896a..fb2f7e21 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -197,6 +197,7 @@ enum btree_flags {
BTREE_NODE_write_in_flight,
BTREE_NODE_just_written,
BTREE_NODE_dying,
+ BTREE_NODE_fake,
};
BTREE_FLAG(read_in_flight);
@@ -209,6 +210,7 @@ BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
+BTREE_FLAG(fake);
static inline struct btree_write *btree_current_write(struct btree *b)
{
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 04854532..a0f37c4c 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -546,8 +546,8 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
goto err_free;
}
- ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
- BCH_DATA_BTREE);
+ ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
+ bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
if (ret)
goto err_free;
@@ -915,6 +915,10 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct bset_tree *t;
set_btree_node_dying(b);
+
+ if (btree_node_fake(b))
+ return;
+
btree_interior_update_add_node_reference(as, b);
/*
@@ -1052,7 +1056,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
gc_pos_btree_root(b->btree_id),
&stats, 0, 0);
- if (old)
+ if (old && !btree_node_fake(old))
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&old->key),
&stats);
@@ -1422,7 +1426,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
bch2_btree_node_lock_for_insert(c, b, iter);
- if (bch_keylist_u64s(keys) > bch_btree_keys_u64s_remaining(c, b)) {
+ if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
bch2_btree_node_unlock_write(b, iter);
return -1;
}
@@ -1957,7 +1961,8 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
goto err;
}
- ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
+ ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
+ bch2_extent_devs(extent_i_to_s_c(new_key)));
if (ret)
goto err_free_update;
@@ -1993,45 +1998,43 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
bch2_btree_set_root_ondisk(c, b, READ);
}
-int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
- struct closure *writes)
+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
{
- struct btree_update *as;
struct closure cl;
struct btree *b;
+ int ret;
- memset(&as, 0, sizeof(as));
closure_init_stack(&cl);
- while (1) {
- /* XXX haven't calculated capacity yet :/ */
- as = bch2_btree_update_start(c, id, 1,
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE,
- &cl);
+ do {
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
closure_sync(&cl);
+ } while (ret);
- if (!IS_ERR(as))
- break;
-
- if (PTR_ERR(as) == -ENOSPC)
- return PTR_ERR(as);
- }
+ b = bch2_btree_node_mem_alloc(c);
+ bch2_btree_cache_cannibalize_unlock(c);
- b = __btree_root_alloc(as, 0);
+ set_btree_node_fake(b);
+ b->level = 0;
+ b->btree_id = id;
- bch2_btree_node_write(c, b, writes, SIX_LOCK_intent);
- btree_update_drop_new_node(c, b);
+ bkey_extent_init(&b->key);
+ b->key.k.p = POS_MAX;
+ bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id;
- BUG_ON(btree_node_root(c, b));
+ bch2_bset_init_first(b, &b->data->keys);
+ bch2_btree_build_aux_trees(b);
- bch2_btree_set_root_inmem(as, b);
- bch2_btree_set_root_ondisk(c, b, WRITE);
+ b->data->min_key = POS_MIN;
+ b->data->max_key = POS_MAX;
+ b->data->format = bch2_btree_calc_format(b);
+ btree_node_set_format(b, b->data->format);
- bch2_btree_open_bucket_put(c, b);
- six_unlock_intent(&b->lock);
+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id);
+ BUG_ON(ret);
- bch2_btree_update_free(as);
+ __bch2_btree_set_root_inmem(c, b);
- return 0;
+ six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
}
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index e129b24e..23ee3980 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -150,7 +150,7 @@ int bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
enum btree_node_sibling);
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-int bch2_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *);
+void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
@@ -280,6 +280,9 @@ static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
struct btree *b, unsigned u64s)
{
+ if (unlikely(btree_node_fake(b)))
+ return false;
+
if (btree_node_is_extents(b)) {
/* The insert key might split an existing key
* (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case:
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 2dbe7d37..43133cbb 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -258,6 +258,11 @@ static u64 reserve_factor(u64 r)
return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}
+static u64 avail_factor(u64 r)
+{
+ return (r << RESERVE_FACTOR) / (1 << RESERVE_FACTOR) + 1;
+}
+
u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
{
struct fs_usage_sum sum = __fs_usage_sum(stats);
@@ -270,6 +275,11 @@ u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
return min(c->capacity, __bch2_fs_sectors_used(c, stats));
}
+u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
+{
+ return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats));
+}
+
static inline int is_unavailable_bucket(struct bucket_mark m)
{
return !is_available_bucket(m);
@@ -382,7 +392,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
}
new.owned_by_allocator = 1;
- new.touched_this_mount = 1;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
@@ -396,29 +405,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
return true;
}
-bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca,
- size_t b)
-{
- struct bucket *g;
- struct bucket_mark new, old;
-
- lg_local_lock(&c->usage_lock);
- g = bucket(ca, b);
-
- old = bucket_data_cmpxchg(c, ca, g, new, ({
- if (!is_startup_available_bucket(new)) {
- lg_local_unlock(&c->usage_lock);
- return false;
- }
-
- new.owned_by_allocator = 1;
- new.touched_this_mount = 1;
- }));
- lg_local_unlock(&c->usage_lock);
-
- return true;
-}
-
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
@@ -436,7 +422,6 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
}
old = bucket_data_cmpxchg(c, ca, g, new, ({
- new.touched_this_mount = 1;
new.owned_by_allocator = owned_by_allocator;
}));
lg_local_unlock(&c->usage_lock);
@@ -481,7 +466,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
saturated_add(ca, new.dirty_sectors, sectors,
GC_MAX_SECTORS_USED);
new.data_type = type;
- new.touched_this_mount = 1;
}));
lg_local_unlock(&c->usage_lock);
@@ -539,7 +523,6 @@ static void bch2_mark_pointer(struct bch_fs *c,
if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
if (journal_seq)
bucket_cmpxchg(g, new, ({
- new.touched_this_mount = 1;
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}));
@@ -588,8 +571,6 @@ static void bch2_mark_pointer(struct bch_fs *c,
new.data_type = data_type;
}
- new.touched_this_mount = 1;
-
if (flags & BCH_BUCKET_MARK_NOATOMIC) {
g->_mark = new;
break;
@@ -694,17 +675,12 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
static u64 __recalc_sectors_available(struct bch_fs *c)
{
- u64 avail;
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
- avail = c->capacity - bch2_fs_sectors_used(c, bch2_fs_usage_read(c));
-
- avail <<= RESERVE_FACTOR;
- avail /= (1 << RESERVE_FACTOR) + 1;
- return avail;
+ return bch2_fs_sectors_free(c, bch2_fs_usage_read(c));
}
/* Used by gc when it's starting: */
@@ -839,7 +815,7 @@ static void buckets_free_rcu(struct rcu_head *rcu)
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
- struct bucket_array *buckets = NULL, *old_buckets;
+ struct bucket_array *buckets = NULL, *old_buckets = NULL;
unsigned long *buckets_dirty = NULL;
u8 *oldest_gens = NULL;
alloc_fifo free[RESERVE_NR];
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 78243129..86e72829 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -184,6 +184,7 @@ void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
+u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage);
static inline bool is_available_bucket(struct bucket_mark mark)
{
@@ -192,11 +193,6 @@ static inline bool is_available_bucket(struct bucket_mark mark)
!mark.nouse);
}
-static inline bool is_startup_available_bucket(struct bucket_mark mark)
-{
- return !mark.touched_this_mount && is_available_bucket(mark);
-}
-
static inline bool bucket_needs_journal_commit(struct bucket_mark m,
u16 last_seq_ondisk)
{
@@ -208,8 +204,6 @@ void bch2_bucket_seq_cleanup(struct bch_fs *);
bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
size_t, struct bucket_mark *);
-bool bch2_mark_alloc_bucket_startup(struct bch_fs *, struct bch_dev *,
- size_t);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
size_t, bool, struct gc_pos, unsigned);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 7cd8439a..6f52a109 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -15,8 +15,7 @@ struct bucket_mark {
gen_valid:1,
owned_by_allocator:1,
nouse:1,
- journal_seq_valid:1,
- touched_this_mount:1;
+ journal_seq_valid:1;
u16 dirty_sectors;
u16 cached_sectors;
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 1618ffe7..1498832b 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -64,7 +64,7 @@ found:
static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
{
struct bch_ioctl_assemble arg;
- const char *err;
+ struct bch_fs *c;
u64 *user_devs = NULL;
char **devs = NULL;
unsigned i;
@@ -96,14 +96,10 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
}
}
- err = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty(), NULL);
- if (err) {
- pr_err("Could not open filesystem: %s", err);
- ret = -EINVAL;
- goto err;
- }
-
- ret = 0;
+ c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
+ ret = PTR_ERR_OR_ZERO(c);
+ if (!ret)
+ closure_put(&c->cl);
err:
if (devs)
for (i = 0; i < arg.nr_devs; i++)
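
The hunk above reflects the convention this change moves bch2_fs_open() to: the filesystem pointer is returned directly and errors are encoded in the pointer value, so callers use IS_ERR()/PTR_ERR_OR_ZERO() instead of a separate error string. A standalone userspace sketch of that convention (toy code, not part of this change) follows:

/*
 * Standalone model of the ERR_PTR error-return convention used above:
 * an error is encoded in the pointer itself, so one return value carries
 * both the object and the failure reason.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)     { return (void *) error; }
static inline long PTR_ERR(const void *ptr) { return (long) ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (uintptr_t) ptr >= (uintptr_t) -MAX_ERRNO;
}
static inline int PTR_ERR_OR_ZERO(const void *ptr)
{
        return IS_ERR(ptr) ? (int) PTR_ERR(ptr) : 0;
}

struct toy_fs { const char *name; };

/* toy "open": fails with -EBUSY for one name, succeeds otherwise */
static struct toy_fs *toy_fs_open(const char *name)
{
        static struct toy_fs the_fs = { "ok" };

        if (!strcmp(name, "busy"))
                return ERR_PTR(-EBUSY);
        return &the_fs;
}

int main(void)
{
        struct toy_fs *c = toy_fs_open("busy");

        if (IS_ERR(c))
                printf("open failed: %d\n", (int) PTR_ERR(c));  /* -16 */

        c = toy_fs_open("dev0");
        printf("open ok: %d\n", PTR_ERR_OR_ZERO(c));            /* 0 */
        return 0;
}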
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index ccfb0386..0f090ca5 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -58,7 +58,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
if (IS_ERR_OR_NULL(pick.ca))
return;
- bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
+ bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
bio->bi_bdev = pick.ca->disk_sb.bdev;
bio->bi_opf = REQ_OP_READ|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 28fe4fce..ac3e96d2 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -143,9 +143,6 @@ void bch2_flush_fsck_errs(struct bch_fs *);
#define __fsck_err_on(cond, c, _flags, ...) \
((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
-#define unfixable_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_IGNORE, ##__VA_ARGS__)
-
#define need_fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 2b4a2dc2..bceea486 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -666,7 +666,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
goto err;
}
- if (!bch2_sb_has_replicas(c, e, BCH_DATA_BTREE)) {
+ if (!bch2_sb_has_replicas(c, BCH_DATA_BTREE, bch2_extent_devs(e))) {
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), k);
bch2_fs_bug(c,
@@ -1803,7 +1803,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
}
if (!bkey_extent_is_cached(e.k) &&
- !bch2_sb_has_replicas(c, e, BCH_DATA_USER)) {
+ !bch2_sb_has_replicas(c, BCH_DATA_USER, bch2_extent_devs(e))) {
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), e.s_c);
bch2_fs_bug(c,
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index aeae361d..eda34381 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -426,6 +426,17 @@ static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent
return ret;
}
+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ return bch2_extent_devs(bkey_s_c_to_extent(k));
+ default:
+ return (struct bch_devs_list) { .nr = 0 };
+ }
+}
+
bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
struct bch_extent_crc_unpacked);
bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
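
The new bch2_bkey_devs() helper above returns an empty device list for key types that carry no pointers, which is what lets later hunks call the replicas check/mark on arbitrary keys without first testing bkey_extent_is_data(). A standalone toy of that shape (hypothetical names, not bcachefs code):

/*
 * Toy sketch: an empty device list for non-extent keys makes a
 * per-device marking loop degenerate to a no-op.
 */
#include <stdio.h>

enum toy_key_type { TOY_DELETED, TOY_EXTENT };

struct toy_devs_list { unsigned nr; unsigned char devs[4]; };

struct toy_key {
        enum toy_key_type       type;
        struct toy_devs_list    ptrs;   /* only meaningful for TOY_EXTENT */
};

static struct toy_devs_list toy_bkey_devs(const struct toy_key *k)
{
        switch (k->type) {
        case TOY_EXTENT:
                return k->ptrs;
        default:
                return (struct toy_devs_list) { .nr = 0 };
        }
}

static void toy_mark_replicas(struct toy_devs_list devs)
{
        for (unsigned i = 0; i < devs.nr; i++)  /* no-op when devs.nr == 0 */
                printf("marking dev %u\n", devs.devs[i]);
}

int main(void)
{
        struct toy_key extent  = { TOY_EXTENT, { 2, { 0, 3 } } };
        struct toy_key deleted = { TOY_DELETED };

        toy_mark_replicas(toy_bkey_devs(&extent));      /* marks devs 0 and 3 */
        toy_mark_replicas(toy_bkey_devs(&deleted));     /* prints nothing */
        return 0;
}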
diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h
index 98f22f6a..08739d26 100644
--- a/libbcachefs/fifo.h
+++ b/libbcachefs/fifo.h
@@ -57,6 +57,7 @@ do { \
#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
+#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask]
#define fifo_push_back_ref(f) \
(fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
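
The new fifo_idx_entry() macro above addresses the i-th oldest element of the power-of-two ring buffer by offsetting from the front counter and masking. A standalone sketch of that arithmetic (toy ring buffer, not the bcachefs fifo type):

/*
 * With free-running front/back counters and a power-of-two size,
 * element i (0 = oldest live entry) lives at (front + i) & mask.
 */
#include <assert.h>
#include <stdio.h>

#define RING_SIZE 8                     /* must be a power of two */
#define RING_MASK (RING_SIZE - 1)

struct ring {
        unsigned front, back;           /* free-running counters */
        int data[RING_SIZE];
};

#define ring_used(r)            ((r)->back - (r)->front)
#define ring_push(r, v)         ((r)->data[(r)->back++ & RING_MASK] = (v))
#define ring_idx_entry(r, i)    ((r)->data[((r)->front + (i)) & RING_MASK])

int main(void)
{
        struct ring r = { 0 };

        for (int v = 0; v < 5; v++)
                ring_push(&r, v * 10);

        /* drop the two oldest entries, as a consumer would */
        r.front += 2;

        assert(ring_used(&r) == 3);
        assert(ring_idx_entry(&r, 0) == 20);    /* oldest remaining */
        assert(ring_idx_entry(&r, 2) == 40);    /* newest */
        printf("ok\n");
        return 0;
}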
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 2c34a85c..66374a9c 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -12,6 +12,7 @@
#include "journal.h"
#include "io.h"
#include "keylist.h"
+#include "quota.h"
#include <linux/aio.h>
#include <linux/backing-dev.h>
@@ -56,14 +57,13 @@ struct bch_writepage_io {
struct dio_write {
struct closure cl;
struct kiocb *req;
- struct bch_fs *c;
- loff_t offset;
+ struct task_struct *task;
+ unsigned loop:1,
+ sync:1,
+ free_iov:1;
- struct iovec *iovec;
- struct iovec inline_vecs[UIO_FASTIOV];
struct iov_iter iter;
-
- struct task_struct *task;
+ struct iovec inline_vecs[2];
/* must be last: */
struct bchfs_write_op iop;
@@ -130,6 +130,7 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c,
static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors)
{
inode->v.i_blocks += sectors;
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, BCH_QUOTA_WARN);
}
static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors)
@@ -1286,7 +1287,8 @@ static int bch2_read_single_page(struct page *page,
int ret;
DECLARE_COMPLETION_ONSTACK(done);
- rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read));
+ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read),
+ io_opts(c, inode));
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_page_end_io;
@@ -1439,13 +1441,15 @@ static void bch2_direct_IO_read_split_endio(struct bio *bio)
bio_check_pages_dirty(bio); /* transfers ownership */
}
-static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req,
- struct file *file, struct bch_inode_info *inode,
- struct iov_iter *iter, loff_t offset)
+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
{
+ struct file *file = req->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts = io_opts(c, inode);
struct dio_read *dio;
struct bio *bio;
+ loff_t offset = req->ki_pos;
bool sync = is_sync_kiocb(req);
ssize_t ret;
@@ -1525,103 +1529,128 @@ start:
}
}
-static long __bch2_dio_write_complete(struct dio_write *dio)
+static void bch2_dio_write_loop_async(struct closure *);
+
+static long bch2_dio_write_loop(struct dio_write *dio)
{
- struct file *file = dio->req->ki_filp;
+ struct kiocb *req = dio->req;
+ struct file *file = req->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
- long ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9);
+ struct bio *bio = &dio->iop.op.wbio.bio;
+ struct bio_vec *bv;
+ bool sync;
+ long ret;
+ int i;
- bch2_disk_reservation_put(dio->c, &dio->iop.op.res);
+ if (dio->loop)
+ goto loop;
- __pagecache_block_put(&mapping->add_lock);
- inode_dio_end(&inode->v);
+ inode_dio_begin(&inode->v);
+ __pagecache_block_get(&mapping->add_lock);
- if (dio->iovec && dio->iovec != dio->inline_vecs)
- kfree(dio->iovec);
+ /* Write and invalidate pagecache range that we're writing to: */
+ ret = write_invalidate_inode_pages_range(mapping, req->ki_pos,
+ req->ki_pos + iov_iter_count(&dio->iter) - 1);
+ if (unlikely(ret))
+ goto err;
- bio_put(&dio->iop.op.wbio.bio);
- return ret;
-}
+ while (1) {
+ BUG_ON(current->pagecache_lock);
+ current->pagecache_lock = &mapping->add_lock;
+ if (current != dio->task)
+ use_mm(dio->task->mm);
-static void bch2_dio_write_complete(struct closure *cl)
-{
- struct dio_write *dio = container_of(cl, struct dio_write, cl);
- struct kiocb *req = dio->req;
+ ret = bio_iov_iter_get_pages(bio, &dio->iter);
- req->ki_complete(req, __bch2_dio_write_complete(dio), 0);
-}
+ if (current != dio->task)
+ unuse_mm(dio->task->mm);
+ current->pagecache_lock = NULL;
-static void bch2_dio_write_done(struct dio_write *dio)
-{
- struct bio_vec *bv;
- int i;
+ if (unlikely(ret < 0))
+ goto err;
- bio_for_each_segment_all(bv, &dio->iop.op.wbio.bio, i)
- put_page(bv->bv_page);
+ dio->iop.op.pos = POS(inode->v.i_ino,
+ (req->ki_pos >> 9) + dio->iop.op.written);
- if (dio->iter.count)
- bio_reset(&dio->iop.op.wbio.bio);
-}
+ task_io_account_write(bio->bi_iter.bi_size);
-static void bch2_do_direct_IO_write(struct dio_write *dio)
-{
- struct file *file = dio->req->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bio *bio = &dio->iop.op.wbio.bio;
- int ret;
+ closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl);
- ret = bio_iov_iter_get_pages(bio, &dio->iter);
- if (ret < 0) {
- dio->iop.op.error = ret;
- return;
- }
+ if (!dio->sync && !dio->loop && dio->iter.count) {
+ struct iovec *iov = dio->inline_vecs;
- dio->iop.op.pos = POS(inode->v.i_ino, (dio->offset >> 9) + dio->iop.op.written);
+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
+ GFP_KERNEL);
+ if (unlikely(!iov)) {
+ dio->iop.op.error = -ENOMEM;
+ goto err_wait_io;
+ }
- task_io_account_write(bio->bi_iter.bi_size);
+ dio->free_iov = true;
+ }
- closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl);
-}
+ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
+ dio->iter.iov = iov;
+ }
+err_wait_io:
+ dio->loop = true;
-static void bch2_dio_write_loop_async(struct closure *cl)
-{
- struct dio_write *dio =
- container_of(cl, struct dio_write, cl);
- struct address_space *mapping = dio->req->ki_filp->f_mapping;
+ if (!dio->sync) {
+ continue_at_noreturn(&dio->cl,
+ bch2_dio_write_loop_async, NULL);
+ return -EIOCBQUEUED;
+ }
- bch2_dio_write_done(dio);
+ closure_sync(&dio->cl);
+loop:
+ bio_for_each_segment_all(bv, bio, i)
+ put_page(bv->bv_page);
+ if (!dio->iter.count || dio->iop.op.error)
+ break;
+ bio_reset(bio);
+ }
+
+ ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9);
+err:
+ __pagecache_block_put(&mapping->add_lock);
+ inode_dio_end(&inode->v);
+ bch2_disk_reservation_put(dio->iop.op.c, &dio->iop.op.res);
- if (dio->iter.count && !dio->iop.op.error) {
- use_mm(dio->task->mm);
- pagecache_block_get(&mapping->add_lock);
+ if (dio->free_iov)
+ kfree(dio->iter.iov);
- bch2_do_direct_IO_write(dio);
+ closure_debug_destroy(&dio->cl);
- pagecache_block_put(&mapping->add_lock);
- unuse_mm(dio->task->mm);
+ sync = dio->sync;
+ bio_put(bio);
- continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
- } else {
-#if 0
- closure_return_with_destructor(cl, bch2_dio_write_complete);
-#else
- closure_debug_destroy(cl);
- bch2_dio_write_complete(cl);
-#endif
+ if (!sync) {
+ req->ki_complete(req, ret, 0);
+ ret = -EIOCBQUEUED;
}
+ return ret;
+}
+
+static void bch2_dio_write_loop_async(struct closure *cl)
+{
+ struct dio_write *dio = container_of(cl, struct dio_write, cl);
+
+ bch2_dio_write_loop(dio);
}
-static int bch2_direct_IO_write(struct bch_fs *c,
- struct kiocb *req, struct file *file,
- struct bch_inode_info *inode,
- struct iov_iter *iter, loff_t offset)
+static int bch2_direct_IO_write(struct kiocb *req,
+ struct iov_iter *iter,
+ bool swap)
{
- struct address_space *mapping = file->f_mapping;
+ struct file *file = req->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct dio_write *dio;
struct bio *bio;
+ loff_t offset = req->ki_pos;
ssize_t ret;
- bool sync = is_sync_kiocb(req);
lockdep_assert_held(&inode->v.i_rwsem);
@@ -1637,95 +1666,49 @@ static int bch2_direct_IO_write(struct bch_fs *c,
dio = container_of(bio, struct dio_write, iop.op.wbio.bio);
closure_init(&dio->cl, NULL);
dio->req = req;
- dio->c = c;
- dio->offset = offset;
- dio->iovec = NULL;
- dio->iter = *iter;
dio->task = current;
+ dio->loop = false;
+ dio->sync = is_sync_kiocb(req) ||
+ offset + iter->count > inode->v.i_size;
+ dio->free_iov = false;
+ dio->iter = *iter;
bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true);
dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task);
dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION;
- if ((dio->req->ki_flags & IOCB_DSYNC) &&
+ if ((req->ki_flags & IOCB_DSYNC) &&
!c->opts.journal_flush_disabled)
dio->iop.op.flags |= BCH_WRITE_FLUSH;
- if (offset + iter->count > inode->v.i_size)
- sync = true;
-
- /*
- * XXX: we shouldn't return -ENOSPC if we're overwriting existing data -
- * if getting a reservation fails we should check if we are doing an
- * overwrite.
- *
- * Have to then guard against racing with truncate (deleting data that
- * we would have been overwriting)
- */
ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, 0);
if (unlikely(ret)) {
if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
offset >> 9),
- iter->count >> 9)) {
- closure_debug_destroy(&dio->cl);
- bio_put(bio);
- return ret;
- }
+ iter->count >> 9))
+ goto err;
dio->iop.unalloc = true;
}
dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
- inode_dio_begin(&inode->v);
- __pagecache_block_get(&mapping->add_lock);
-
- if (sync) {
- do {
- bch2_do_direct_IO_write(dio);
-
- closure_sync(&dio->cl);
- bch2_dio_write_done(dio);
- } while (dio->iter.count && !dio->iop.op.error);
-
- closure_debug_destroy(&dio->cl);
- return __bch2_dio_write_complete(dio);
- } else {
- bch2_do_direct_IO_write(dio);
-
- if (dio->iter.count && !dio->iop.op.error) {
- if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
- dio->iovec = kmalloc(dio->iter.nr_segs *
- sizeof(struct iovec),
- GFP_KERNEL);
- if (!dio->iovec)
- dio->iop.op.error = -ENOMEM;
- } else {
- dio->iovec = dio->inline_vecs;
- }
-
- memcpy(dio->iovec,
- dio->iter.iov,
- dio->iter.nr_segs * sizeof(struct iovec));
- dio->iter.iov = dio->iovec;
- }
-
- continue_at_noreturn(&dio->cl, bch2_dio_write_loop_async, NULL);
- return -EIOCBQUEUED;
- }
+ return bch2_dio_write_loop(dio);
+err:
+ bch2_disk_reservation_put(c, &dio->iop.op.res);
+ closure_debug_destroy(&dio->cl);
+ bio_put(bio);
+ return ret;
}
ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter)
{
- struct file *file = req->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct blk_plug plug;
ssize_t ret;
blk_start_plug(&plug);
- ret = ((iov_iter_rw(iter) == WRITE)
- ? bch2_direct_IO_write
- : bch2_direct_IO_read)(c, req, file, inode, iter, req->ki_pos);
+ ret = iov_iter_rw(iter) == WRITE
+ ? bch2_direct_IO_write(req, iter, false)
+ : bch2_direct_IO_read(req, iter);
blk_finish_plug(&plug);
return ret;
@@ -1734,26 +1717,7 @@ ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter)
static ssize_t
bch2_direct_write(struct kiocb *iocb, struct iov_iter *iter)
{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = file->f_mapping;
- loff_t pos = iocb->ki_pos;
- ssize_t ret;
-
- pagecache_block_get(&mapping->add_lock);
-
- /* Write and invalidate pagecache range that we're writing to: */
- ret = write_invalidate_inode_pages_range(file->f_mapping, pos,
- pos + iov_iter_count(iter) - 1);
- if (unlikely(ret))
- goto err;
-
- ret = bch2_direct_IO_write(c, iocb, file, inode, iter, pos);
-err:
- pagecache_block_put(&mapping->add_lock);
-
- return ret;
+ return bch2_direct_IO_write(iocb, iter, true);
}
static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
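
The fs-io.c rework above collapses the separate sync and async DIO write paths into a single bch2_dio_write_loop() that the submitter enters once and, for async writes, the completion re-enters with dio->loop set, so setup runs exactly once and the per-iteration work is shared. A standalone toy model of that control-flow shape (hypothetical names, no kernel APIs):

/*
 * Toy model: one function entered by the submitter, re-entered by the
 * "completion" with `loop` set; async callers see a queued return.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_dio {
        bool    loop;           /* already been around once? */
        bool    sync;
        int     chunks_left;
        long    written;
};

static long toy_dio_loop(struct toy_dio *dio)
{
        if (dio->loop)
                goto loop;              /* re-entered from "completion" */

        printf("setup (runs once)\n");

        while (1) {
                printf("submit chunk, %d to go\n", dio->chunks_left);
                dio->written += 512;
                dio->loop = true;

                if (!dio->sync)
                        return -1;      /* stand-in for -EIOCBQUEUED */
loop:
                if (!--dio->chunks_left)
                        break;
        }
        printf("complete, wrote %ld\n", dio->written);
        return dio->written;
}

int main(void)
{
        struct toy_dio dio = { .sync = false, .chunks_left = 3 };
        long ret = toy_dio_loop(&dio);  /* submitter: gets "queued" */

        while (ret < 0)                 /* completion callback re-enters */
                ret = toy_dio_loop(&dio);
        return 0;
}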
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index 24228c8e..6ae67f92 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -4,6 +4,7 @@
#include "chardev.h"
#include "fs.h"
#include "fs-ioctl.h"
+#include "quota.h"
#include <linux/compat.h>
#include <linux/mount.h>
@@ -154,10 +155,32 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
struct fsxattr fa = { 0 };
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
+ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
return copy_to_user(arg, &fa, sizeof(fa));
}
+static int bch2_set_projid(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ u32 projid)
+{
+ struct bch_qid qid = inode->ei_qid;
+ int ret;
+
+ if (projid == inode->ei_qid.q[QTYP_PRJ])
+ return 0;
+
+ qid.q[QTYP_PRJ] = projid;
+
+ ret = bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid,
+ inode->v.i_blocks);
+ if (ret)
+ return ret;
+
+ inode->ei_qid.q[QTYP_PRJ] = projid;
+ return 0;
+}
+
static int bch2_ioc_fssetxattr(struct bch_fs *c,
struct file *file,
struct bch_inode_info *inode,
@@ -185,9 +208,14 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
+ ret = bch2_set_projid(c, inode, fa.fsx_projid);
+ if (ret)
+ goto err_unlock;
+
ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &flags);
if (!ret)
bch2_inode_flags_to_vfs(inode);
+err_unlock:
mutex_unlock(&inode->ei_update_lock);
err:
inode_unlock(&inode->v);
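
In bch2_set_projid() above, the new project ID is committed only after bch2_quota_transfer() has successfully moved the inode's existing block usage to the new quota bucket, so a transfer that would exceed the target's limit leaves both the ID and the accounting untouched. A standalone toy of that check-transfer-then-commit ordering (hypothetical names, not bcachefs code):

/*
 * Toy sketch: transfer usage to the new bucket first; commit the new
 * project id only if the transfer fit under the target's limit.
 */
#include <errno.h>
#include <stdio.h>

struct toy_quota { long used, limit; };

static int toy_quota_transfer(struct toy_quota *dst, struct toy_quota *src,
                              long usage)
{
        if (dst->used + usage > dst->limit)
                return -EDQUOT;         /* would exceed the new bucket */
        dst->used += usage;
        src->used -= usage;
        return 0;
}

int main(void)
{
        struct toy_quota proj[2] = { { .used = 40, .limit = 100 },
                                     { .used = 90, .limit = 100 } };
        unsigned cur_projid = 0;
        long inode_blocks = 40;

        /* move the inode to project 1: 90 + 40 > 100, so it fails */
        int ret = toy_quota_transfer(&proj[1], &proj[cur_projid], inode_blocks);
        if (!ret)
                cur_projid = 1;         /* commit only on success */

        printf("ret=%d projid=%u proj0.used=%ld proj1.used=%ld\n",
               ret, cur_projid, proj[0].used, proj[1].used);
        return 0;
}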
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 472df23a..8869ba0f 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -15,6 +15,7 @@
#include "io.h"
#include "journal.h"
#include "keylist.h"
+#include "quota.h"
#include "super.h"
#include "xattr.h"
@@ -116,6 +117,7 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
inode_u.bi_mode = inode->v.i_mode;
inode_u.bi_uid = i_uid_read(&inode->v);
inode_u.bi_gid = i_gid_read(&inode->v);
+ inode_u.bi_project = inode->ei_qid.q[QTYP_PRJ];
inode_u.bi_nlink= i_nlink - nlink_bias(inode->v.i_mode);
inode_u.bi_dev = inode->v.i_rdev;
inode_u.bi_atime= timespec_to_bch2_time(c, inode->v.i_atime);
@@ -131,8 +133,10 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
} while (ret == -EINTR);
- if (!ret)
+ if (!ret) {
inode->ei_inode = inode_u;
+ inode->ei_qid = bch_qid(&inode_u);
+ }
out:
bch2_btree_iter_unlock(&iter);
@@ -215,7 +219,7 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
ret = posix_acl_create(&dir->v, &inode->v.i_mode, &default_acl, &acl);
if (ret) {
make_bad_inode(&inode->v);
- goto err;
+ goto err_make_bad;
}
#endif
@@ -225,16 +229,20 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
inode->v.i_mode, rdev,
&dir->ei_inode);
+ inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ];
+
+ ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC);
+ if (ret) {
+ make_bad_inode(&inode->v);
+ goto err_make_bad;
+ }
+
ret = bch2_inode_create(c, &inode_u,
BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
if (unlikely(ret)) {
- /*
- * indicate to bch_evict_inode that the inode was never actually
- * created:
- */
- make_bad_inode(&inode->v);
- goto err;
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
+ goto err_make_bad;
}
bch2_vfs_inode_init(c, inode, &inode_u);
@@ -257,6 +265,12 @@ out:
posix_acl_release(default_acl);
posix_acl_release(acl);
return inode;
+err_make_bad:
+ /*
+ * indicate to bch_evict_inode that the inode was never actually
+ * created:
+ */
+ make_bad_inode(&inode->v);
err:
clear_nlink(&inode->v);
iput(&inode->v);
@@ -604,36 +618,65 @@ static int bch2_rename2(struct inode *old_vdir, struct dentry *old_dentry,
return bch2_rename(c, old_dir, old_dentry, new_dir, new_dentry);
}
-static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
+static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr)
{
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = 0;
+ struct bch_qid qid = inode->ei_qid;
+ unsigned qtypes = 0;
+ int ret;
- lockdep_assert_held(&inode->v.i_rwsem);
+ mutex_lock(&inode->ei_update_lock);
- ret = setattr_prepare(dentry, iattr);
- if (ret)
- return ret;
+ if (c->opts.usrquota &&
+ (iattr->ia_valid & ATTR_UID) &&
+ !uid_eq(iattr->ia_uid, inode->v.i_uid)) {
+ qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid),
+ qtypes |= 1 << QTYP_USR;
+ }
- if (iattr->ia_valid & ATTR_SIZE) {
- ret = bch2_truncate(inode, iattr);
- } else {
- mutex_lock(&inode->ei_update_lock);
- setattr_copy(&inode->v, iattr);
- ret = bch2_write_inode(c, inode);
- mutex_unlock(&inode->ei_update_lock);
+ if (c->opts.grpquota &&
+ (iattr->ia_valid & ATTR_GID) &&
+ !gid_eq(iattr->ia_gid, inode->v.i_gid)) {
+ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid);
+ qtypes |= 1 << QTYP_GRP;
}
- if (unlikely(ret))
- return ret;
+ if (qtypes) {
+ ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid,
+ inode->v.i_blocks);
+ if (ret)
+ goto out_unlock;
+ }
+
+ setattr_copy(&inode->v, iattr);
+
+ ret = bch2_write_inode(c, inode);
+out_unlock:
+ mutex_unlock(&inode->ei_update_lock);
- if (iattr->ia_valid & ATTR_MODE)
+ if (!ret &&
+ iattr->ia_valid & ATTR_MODE)
ret = posix_acl_chmod(&inode->v, inode->v.i_mode);
return ret;
}
+static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ int ret;
+
+ lockdep_assert_held(&inode->v.i_rwsem);
+
+ ret = setattr_prepare(dentry, iattr);
+ if (ret)
+ return ret;
+
+ return iattr->ia_valid & ATTR_SIZE
+ ? bch2_truncate(inode, iattr)
+ : bch2_setattr_nonsize(inode, iattr);
+}
+
static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
@@ -910,6 +953,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
inode->ei_journal_seq = 0;
+ inode->ei_qid = bch_qid(bi);
inode->ei_str_hash = bch2_hash_info_init(c, bi);
inode->ei_inode = *bi;
@@ -995,6 +1039,10 @@ static void bch2_evict_inode(struct inode *vinode)
clear_inode(&inode->v);
if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
+ BCH_QUOTA_WARN);
+ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
+ BCH_QUOTA_WARN);
bch2_inode_rm(c, inode->v.i_ino);
atomic_long_dec(&c->nr_inodes);
}
@@ -1009,8 +1057,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT;
- buf->f_bfree = (c->capacity -
- bch2_fs_sectors_used(c, bch2_fs_usage_read(c))) >>
+ buf->f_bfree = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >>
PAGE_SECTOR_SHIFT;
buf->f_bavail = buf->f_bfree;
buf->f_files = atomic_long_read(&c->nr_inodes);
@@ -1037,81 +1084,100 @@ static int bch2_sync_fs(struct super_block *sb, int wait)
return bch2_journal_flush(&c->journal);
}
-static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
- struct bch_opts opts)
+static struct bch_fs *bch2_path_to_fs(const char *dev)
{
- size_t nr_devs = 0, i = 0;
- char *dev_name, *s, **devs;
- struct bch_fs *c = NULL;
- const char *err = "cannot allocate memory";
+ struct bch_fs *c;
+ struct block_device *bdev = lookup_bdev(dev);
- dev_name = kstrdup(_dev_name, GFP_KERNEL);
- if (!dev_name)
- return NULL;
+ if (IS_ERR(bdev))
+ return ERR_CAST(bdev);
- for (s = dev_name; s; s = strchr(s + 1, ':'))
- nr_devs++;
+ c = bch2_bdev_to_fs(bdev);
+ bdput(bdev);
+ return c ?: ERR_PTR(-ENOENT);
+}
- devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
- if (!devs)
- goto err;
+static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs,
+ unsigned nr_devs, struct bch_opts opts)
+{
+ struct bch_fs *c, *c1, *c2;
+ size_t i;
- for (i = 0, s = dev_name;
- s;
- (s = strchr(s, ':')) && (*s++ = '\0'))
- devs[i++] = s;
+ if (!nr_devs)
+ return ERR_PTR(-EINVAL);
+
+ c = bch2_fs_open(devs, nr_devs, opts);
- err = bch2_fs_open(devs, nr_devs, opts, &c);
- if (err) {
+ if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) {
/*
* Already open?
* Look up each block device, make sure they all belong to a
* filesystem and they all belong to the _same_ filesystem
*/
- for (i = 0; i < nr_devs; i++) {
- struct block_device *bdev = lookup_bdev(devs[i]);
- struct bch_fs *c2;
-
- if (IS_ERR(bdev))
- goto err;
+ c1 = bch2_path_to_fs(devs[0]);
+ if (!c1)
+ return c;
- c2 = bch2_bdev_to_fs(bdev);
- bdput(bdev);
-
- if (!c)
- c = c2;
- else if (c2)
+ for (i = 1; i < nr_devs; i++) {
+ c2 = bch2_path_to_fs(devs[i]);
+ if (!IS_ERR(c2))
closure_put(&c2->cl);
- if (!c)
- goto err;
- if (c != c2) {
- closure_put(&c->cl);
- goto err;
+ if (c1 != c2) {
+ closure_put(&c1->cl);
+ return c;
}
}
- mutex_lock(&c->state_lock);
+ c = c1;
+ }
- if (!bch2_fs_running(c)) {
- mutex_unlock(&c->state_lock);
- closure_put(&c->cl);
- err = "incomplete filesystem";
- c = NULL;
- goto err;
- }
+ if (IS_ERR(c))
+ return c;
+
+ mutex_lock(&c->state_lock);
+ if (!bch2_fs_running(c)) {
mutex_unlock(&c->state_lock);
+ closure_put(&c->cl);
+ pr_err("err mounting %s: incomplete filesystem", dev_name);
+ return ERR_PTR(-EINVAL);
}
+ mutex_unlock(&c->state_lock);
+
set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
+ return c;
+}
+
+static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
+ struct bch_opts opts)
+{
+ char *dev_name = NULL, **devs = NULL, *s;
+ struct bch_fs *c = ERR_PTR(-ENOMEM);
+ size_t i, nr_devs = 0;
+
+ dev_name = kstrdup(_dev_name, GFP_KERNEL);
+ if (!dev_name)
+ goto err;
+
+ for (s = dev_name; s; s = strchr(s + 1, ':'))
+ nr_devs++;
+
+ devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
+ if (!devs)
+ goto err;
+
+ for (i = 0, s = dev_name;
+ s;
+ (s = strchr(s, ':')) && (*s++ = '\0'))
+ devs[i++] = s;
+
+ c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts);
err:
kfree(devs);
kfree(dev_name);
-
- if (!c)
- pr_err("bch_fs_open err %s", err);
return c;
}
@@ -1234,8 +1300,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
return ERR_PTR(ret);
c = bch2_open_as_blockdevs(dev_name, opts);
- if (!c)
- return ERR_PTR(-ENOENT);
+ if (IS_ERR(c))
+ return ERR_CAST(c);
sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c);
if (IS_ERR(sb)) {
@@ -1261,6 +1327,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &bch_super_operations;
sb->s_export_op = &bch_export_ops;
+#ifdef CONFIG_BCACHEFS_QUOTA
+ sb->s_qcop = &bch2_quotactl_operations;
+ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
+#endif
sb->s_xattr = bch2_xattr_handlers;
sb->s_magic = BCACHEFS_STATFS_MAGIC;
sb->s_time_gran = c->sb.time_precision;
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index 652105fb..dd0bd4ef 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -3,6 +3,7 @@
#include "opts.h"
#include "str_hash.h"
+#include "quota_types.h"
#include <linux/seqlock.h>
#include <linux/stat.h>
@@ -13,6 +14,7 @@ struct bch_inode_info {
struct mutex ei_update_lock;
u64 ei_journal_seq;
unsigned long ei_last_dirtied;
+ struct bch_qid ei_qid;
struct bch_hash_info ei_str_hash;
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 696926fe..ef09c131 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -266,26 +266,60 @@ static int check_extents(struct bch_fs *c)
!S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
"extent type %u for non regular file, inode %llu mode %o",
k.k->type, k.k->p.inode, w.inode.bi_mode)) {
- ret = bch2_btree_delete_at(&iter, 0);
+ bch2_btree_iter_unlock(&iter);
+
+ ret = bch2_inode_truncate(c, k.k->p.inode, 0, NULL, NULL);
if (ret)
goto err;
continue;
}
- unfixable_fsck_err_on(w.first_this_inode &&
+ if (fsck_err_on(w.first_this_inode &&
w.have_inode &&
!(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
w.inode.bi_sectors !=
(i_sectors = bch2_count_inode_sectors(c, w.cur_inum)),
c, "i_sectors wrong: got %llu, should be %llu",
- w.inode.bi_sectors, i_sectors);
+ w.inode.bi_sectors, i_sectors)) {
+ struct bkey_inode_buf p;
+
+ w.inode.bi_sectors = i_sectors;
+
+ bch2_btree_iter_unlock(&iter);
+
+ bch2_inode_pack(&p, &w.inode);
- unfixable_fsck_err_on(w.have_inode &&
+ ret = bch2_btree_insert(c, BTREE_ID_INODES,
+ &p.inode.k_i,
+ NULL,
+ NULL,
+ NULL,
+ BTREE_INSERT_NOFAIL);
+ if (ret) {
+ bch_err(c, "error in fs gc: error %i "
+ "updating inode", ret);
+ goto err;
+ }
+
+ /* revalidate iterator: */
+ k = bch2_btree_iter_peek(&iter);
+ }
+
+ if (fsck_err_on(w.have_inode &&
!(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
k.k->type != BCH_RESERVATION &&
k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c,
"extent type %u offset %llu past end of inode %llu, i_size %llu",
- k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size);
+ k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
+ bch2_btree_iter_unlock(&iter);
+
+ ret = bch2_inode_truncate(c, k.k->p.inode,
+ round_up(w.inode.bi_size, PAGE_SIZE) >> 9,
+ NULL, NULL);
+ if (ret)
+ goto err;
+ continue;
+ }
}
err:
fsck_err:
@@ -999,7 +1033,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c,
u64 nlinks_pos;
bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0), 0);
- genradix_iter_init(&nlinks_iter);
+ nlinks_iter = genradix_iter_init(links, 0);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!btree_iter_err(k)) {
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index e045eb20..6f6d42fc 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -268,7 +268,8 @@ static void bch2_write_index(struct closure *cl)
}
if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) {
- ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER);
+ ret = bch2_check_mark_super(c, BCH_DATA_USER,
+ bch2_extent_devs(e.c));
if (ret)
goto err;
}
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index ff18fdc9..32ecac24 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -67,10 +67,7 @@ struct bch_read_bio {
struct bch_write_bio {
struct bch_fs *c;
struct bch_dev *ca;
- union {
struct bch_write_bio *parent;
- struct closure *cl;
- };
struct bch_devs_list failed;
u8 order;
@@ -82,7 +79,6 @@ struct bch_write_bio {
used_mempool:1;
unsigned submit_time_us;
- void *data;
struct bio bio;
};
@@ -94,7 +90,7 @@ struct bch_write_op {
unsigned written; /* sectors */
u16 flags;
- s8 error;
+ s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
unsigned csum_type:4;
unsigned compression_type:4;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 0133a31e..811f7a5c 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -88,6 +88,9 @@ struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j,
if (!entry)
return NULL;
+ if (!entry->u64s)
+ return ERR_PTR(-EINVAL);
+
k = entry->start;
*level = entry->level;
*level = entry->level;
@@ -415,6 +418,7 @@ static struct nonce journal_nonce(const struct jset *jset)
}};
}
+/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
struct jset_entry *entry;
@@ -423,7 +427,7 @@ static void journal_entry_null_range(void *start, void *end)
memset(entry, 0, sizeof(*entry));
}
-static int journal_validate_key(struct bch_fs *c, struct jset *j,
+static int journal_validate_key(struct bch_fs *c, struct jset *jset,
struct jset_entry *entry,
struct bkey_i *k, enum bkey_type key_type,
const char *type)
@@ -458,7 +462,7 @@ static int journal_validate_key(struct bch_fs *c, struct jset *j,
return 0;
}
- if (JSET_BIG_ENDIAN(j) != CPU_BIG_ENDIAN)
+ if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN)
bch2_bkey_swab(key_type, NULL, bkey_to_packed(k));
invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k));
@@ -497,26 +501,27 @@ fsck_err:
#define journal_entry_err_on(cond, c, msg, ...) \
((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
-static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j,
+static int journal_entry_validate_entries(struct bch_fs *c, struct jset *jset,
int write)
{
struct jset_entry *entry;
int ret = 0;
- vstruct_for_each(j, entry) {
+ vstruct_for_each(jset, entry) {
+ void *next = vstruct_next(entry);
struct bkey_i *k;
if (journal_entry_err_on(vstruct_next(entry) >
- vstruct_last(j), c,
+ vstruct_last(jset), c,
"journal entry extends past end of jset")) {
- j->u64s = cpu_to_le32((u64 *) entry - j->_data);
+ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
break;
}
switch (entry->type) {
case JOURNAL_ENTRY_BTREE_KEYS:
vstruct_for_each(entry, k) {
- ret = journal_validate_key(c, j, entry, k,
+ ret = journal_validate_key(c, jset, entry, k,
bkey_type(entry->level,
entry->btree_id),
"key");
@@ -531,12 +536,17 @@ static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j,
if (journal_entry_err_on(!entry->u64s ||
le16_to_cpu(entry->u64s) != k->k.u64s, c,
"invalid btree root journal entry: wrong number of keys")) {
- journal_entry_null_range(entry,
- vstruct_next(entry));
+ /*
+ * we don't want to null out this jset_entry,
+ * just the contents, so that later we can tell
+ * we were _supposed_ to have a btree root
+ */
+ entry->u64s = 0;
+ journal_entry_null_range(vstruct_next(entry), next);
continue;
}
- ret = journal_validate_key(c, j, entry, k,
+ ret = journal_validate_key(c, jset, entry, k,
BKEY_TYPE_BTREE, "btree root");
if (ret)
goto fsck_err;
@@ -566,21 +576,21 @@ fsck_err:
}
static int journal_entry_validate(struct bch_fs *c,
- struct jset *j, u64 sector,
+ struct jset *jset, u64 sector,
unsigned bucket_sectors_left,
unsigned sectors_read,
int write)
{
- size_t bytes = vstruct_bytes(j);
+ size_t bytes = vstruct_bytes(jset);
struct bch_csum csum;
int ret = 0;
- if (le64_to_cpu(j->magic) != jset_magic(c))
+ if (le64_to_cpu(jset->magic) != jset_magic(c))
return JOURNAL_ENTRY_NONE;
- if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
+ if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) {
bch_err(c, "unknown journal entry version %u",
- le32_to_cpu(j->version));
+ le32_to_cpu(jset->version));
return BCH_FSCK_UNKNOWN_VERSION;
}
@@ -594,26 +604,26 @@ static int journal_entry_validate(struct bch_fs *c,
if (bytes > sectors_read << 9)
return JOURNAL_ENTRY_REREAD;
- if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
+ if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
"journal entry with unknown csum type %llu sector %lluu",
- JSET_CSUM_TYPE(j), sector))
+ JSET_CSUM_TYPE(jset), sector))
return JOURNAL_ENTRY_BAD;
- csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
- if (journal_entry_err_on(bch2_crc_cmp(csum, j->csum), c,
+ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
+ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
"journal checksum bad, sector %llu", sector)) {
/* XXX: retry IO, when we start retrying checksum errors */
/* XXX: note we might have missing journal entries */
return JOURNAL_ENTRY_BAD;
}
- bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
- j->encrypted_start,
- vstruct_end(j) - (void *) j->encrypted_start);
+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ jset->encrypted_start,
+ vstruct_end(jset) - (void *) jset->encrypted_start);
- if (journal_entry_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
+ if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
"invalid journal entry: last_seq > seq"))
- j->last_seq = j->seq;
+ jset->last_seq = jset->seq;
return 0;
fsck_err:
@@ -960,6 +970,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct bch_dev *ca;
u64 cur_seq, end_seq;
unsigned iter, keys = 0, entries = 0;
+ size_t nr;
int ret = 0;
closure_init_stack(&jlist.cl);
@@ -994,12 +1005,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
goto fsck_err;
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_sb_has_replicas_devlist(c, &i->devs,
- BCH_DATA_JOURNAL), c,
+ fsck_err_on(!bch2_sb_has_replicas(c, BCH_DATA_JOURNAL,
+ i->devs), c,
"superblock not marked as containing replicas (type %u)",
BCH_DATA_JOURNAL)) {
- ret = bch2_check_mark_super_devlist(c, &i->devs,
- BCH_DATA_JOURNAL);
+ ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL,
+ i->devs);
if (ret)
return ret;
}
@@ -1007,9 +1018,16 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
i = list_last_entry(list, struct journal_replay, list);
- unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
- le64_to_cpu(i->j.last_seq) + 1 > j->pin.size, c,
- "too many journal entries open for refcount fifo");
+ nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
+
+ if (nr > j->pin.size) {
+ free_fifo(&j->pin);
+ init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
+ if (!j->pin.data) {
+ bch_err(c, "error reallocating journal fifo (%zu open entries)", nr);
+ return -ENOMEM;
+ }
+ }
atomic64_set(&j->seq, le64_to_cpu(i->j.seq));
j->last_seq_ondisk = le64_to_cpu(i->j.last_seq);
@@ -1131,18 +1149,19 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
#endif
}
-static void __journal_entry_new(struct journal *j, int count)
+static void journal_pin_new_entry(struct journal *j, int count)
{
- struct journal_entry_pin_list *p = fifo_push_ref(&j->pin);
+ struct journal_entry_pin_list *p;
/*
* The fifo_push() needs to happen at the same time as j->seq is
* incremented for last_seq() to be calculated correctly
*/
+ p = fifo_push_ref(&j->pin);
atomic64_inc(&j->seq);
- BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
- &fifo_peek_back(&j->pin));
+ EBUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
+ &fifo_peek_back(&j->pin));
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
@@ -1150,13 +1169,10 @@ static void __journal_entry_new(struct journal *j, int count)
p->devs.nr = 0;
}
-static void __bch2_journal_next_entry(struct journal *j)
+static void bch2_journal_buf_init(struct journal *j)
{
- struct journal_buf *buf;
-
- __journal_entry_new(j, 1);
+ struct journal_buf *buf = journal_cur_buf(j);
- buf = journal_cur_buf(j);
memset(buf->has_inode, 0, sizeof(buf->has_inode));
memset(buf->data, 0, sizeof(*buf->data));
@@ -1208,22 +1224,24 @@ static enum {
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- journal_reclaim_fast(j);
-
clear_bit(JOURNAL_NEED_WRITE, &j->flags);
buf = &j->buf[old.idx];
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
- buf->data->last_seq = cpu_to_le64(last_seq(j));
j->prev_buf_sectors =
vstruct_blocks_plus(buf->data, c->block_bits,
journal_entry_u64s_reserve(buf)) *
c->opts.block_size;
-
BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
- __bch2_journal_next_entry(j);
+ journal_reclaim_fast(j);
+ /* XXX: why set this here, and not in journal_write()? */
+ buf->data->last_seq = cpu_to_le64(last_seq(j));
+
+ journal_pin_new_entry(j, 1);
+
+ bch2_journal_buf_init(j);
cancel_delayed_work(&j->write_work);
spin_unlock(&j->lock);
@@ -1352,12 +1370,20 @@ static int journal_entry_sectors(struct journal *j)
/*
 * should _only_ be called from journal_res_get() - when we actually want a
* journal reservation - journal entry is open means journal is dirty:
+ *
+ * returns:
+ * 1: success
+ * 0: journal currently full (must wait)
+ * -EROFS: insufficient rw devices
+ * -EIO: journal error
*/
static int journal_entry_open(struct journal *j)
{
struct journal_buf *buf = journal_cur_buf(j);
+ union journal_res_state old, new;
ssize_t u64s;
- int ret = 0, sectors;
+ int sectors;
+ u64 v;
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
@@ -1387,41 +1413,36 @@ static int journal_entry_open(struct journal *j)
BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
- if (u64s > le32_to_cpu(buf->data->u64s)) {
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- /*
- * Must be set before marking the journal entry as open:
- */
- j->cur_entry_u64s = u64s;
-
- do {
- old.v = new.v = v;
+ if (u64s <= le32_to_cpu(buf->data->u64s))
+ return 0;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return false;
+ /*
+ * Must be set before marking the journal entry as open:
+ */
+ j->cur_entry_u64s = u64s;
- /* Handle any already added entries */
- new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
- ret = 1;
+ v = atomic64_read(&j->reservations.counter);
+ do {
+ old.v = new.v = v;
- wake_up(&j->wait);
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+ return -EIO;
- if (j->res_get_blocked_start) {
- __bch2_time_stats_update(j->blocked_time,
- j->res_get_blocked_start);
- j->res_get_blocked_start = 0;
- }
+ /* Handle any already added entries */
+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
- mod_delayed_work(system_freezable_wq,
- &j->write_work,
- msecs_to_jiffies(j->write_delay_ms));
- }
+ if (j->res_get_blocked_start)
+ __bch2_time_stats_update(j->blocked_time,
+ j->res_get_blocked_start);
+ j->res_get_blocked_start = 0;
- return ret;
+ mod_delayed_work(system_freezable_wq,
+ &j->write_work,
+ msecs_to_jiffies(j->write_delay_ms));
+ wake_up(&j->wait);
+ return 1;
}
void bch2_journal_start(struct bch_fs *c)
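
The rewritten journal_entry_open() above now documents distinct return values (1, 0, -EROFS, -EIO) instead of a bare bool. A hypothetical caller sketch of how those values might be dispatched on (standalone toy, not taken from this change; the real caller is journal_res_get()):

#include <errno.h>
#include <stdio.h>

/* map the documented return values to the action a caller would take */
static const char *describe_open_result(int ret)
{
        switch (ret) {
        case 1:         return "opened: take the reservation";
        case 0:         return "journal full: wait and retry";
        case -EROFS:    return "insufficient rw devices: fail read-only";
        case -EIO:      return "journal error: fail the reservation";
        default:        return "unexpected";
        }
}

int main(void)
{
        int results[] = { 1, 0, -EROFS, -EIO };

        for (unsigned i = 0; i < sizeof(results) / sizeof(results[0]); i++)
                printf("%d -> %s\n", results[i], describe_open_result(results[i]));
        return 0;
}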
@@ -1438,14 +1459,15 @@ void bch2_journal_start(struct bch_fs *c)
set_bit(JOURNAL_STARTED, &j->flags);
while (atomic64_read(&j->seq) < new_seq)
- __journal_entry_new(j, 0);
+ journal_pin_new_entry(j, 0);
/*
* journal_buf_switch() only inits the next journal entry when it
* closes an open journal entry - the very first journal entry gets
* initialized here:
*/
- __bch2_journal_next_entry(j);
+ journal_pin_new_entry(j, 1);
+ bch2_journal_buf_init(j);
/*
* Adding entries to the next journal entry before allocating space on
@@ -1476,7 +1498,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
struct bkey_i *k, *_n;
struct jset_entry *entry;
struct journal_replay *i, *n;
- int ret = 0, did_replay = 0;
+ int ret = 0;
list_for_each_entry_safe(i, n, list, list) {
j->replay_pin_list =
@@ -1514,7 +1536,6 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
}
cond_resched();
- did_replay = true;
}
if (atomic_dec_and_test(&j->replay_pin_list->count))
@@ -1524,22 +1545,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
j->replay_pin_list = NULL;
bch2_journal_set_replay_done(j);
-
- if (did_replay) {
- bch2_journal_flush_pins(&c->journal, U64_MAX);
-
- /*
- * Write a new journal entry _before_ we start journalling new data -
- * otherwise, we could end up with btree node bsets with journal seqs
- * arbitrarily far in the future vs. the most recently written journal
- * entry on disk, if we crash before writing the next journal entry:
- */
- ret = bch2_journal_meta(j);
- if (ret) {
- bch_err(c, "journal replay: error %d flushing journal", ret);
- goto err;
- }
- }
+ ret = bch2_journal_flush_all_pins(j);
err:
bch2_journal_entries_free(list);
return ret;
@@ -1654,7 +1660,7 @@ err:
return ret;
}
-int bch2_dev_journal_alloc(struct bch_dev *ca)
+int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca)
{
unsigned nr;
@@ -1670,7 +1676,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
min(1 << 10,
(1 << 20) / ca->mi.bucket_size));
- return bch2_set_nr_journal_buckets(ca->fs, ca, nr);
+ return bch2_set_nr_journal_buckets(c, ca, nr);
}
/* Journalling */
@@ -1723,6 +1729,7 @@ static inline void __journal_pin_add(struct journal *j,
list_add(&pin->list, &pin_list->list);
else
INIT_LIST_HEAD(&pin->list);
+ wake_up(&j->wait);
}
static void journal_pin_add_entry(struct journal *j,
@@ -1730,9 +1737,9 @@ static void journal_pin_add_entry(struct journal *j,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
- spin_lock_irq(&j->pin_lock);
+ spin_lock(&j->lock);
__journal_pin_add(j, pin_list, pin, flush_fn);
- spin_unlock_irq(&j->pin_lock);
+ spin_unlock(&j->lock);
}
void bch2_journal_pin_add(struct journal *j,
@@ -1744,9 +1751,9 @@ void bch2_journal_pin_add(struct journal *j,
? journal_seq_pin(j, res->seq)
: j->replay_pin_list;
- spin_lock_irq(&j->pin_lock);
+ spin_lock(&j->lock);
__journal_pin_add(j, pin_list, pin, flush_fn);
- spin_unlock_irq(&j->pin_lock);
+ spin_unlock(&j->lock);
}
static inline bool __journal_pin_drop(struct journal *j,
@@ -1766,13 +1773,12 @@ static inline bool __journal_pin_drop(struct journal *j,
void bch2_journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
- unsigned long flags;
bool wakeup = false;
- spin_lock_irqsave(&j->pin_lock, flags);
+ spin_lock(&j->lock);
if (journal_pin_active(pin))
wakeup = __journal_pin_drop(j, pin);
- spin_unlock_irqrestore(&j->pin_lock, flags);
+ spin_unlock(&j->lock);
/*
 * Unpinning a journal entry may make journal_next_bucket() succeed, if
@@ -1789,7 +1795,7 @@ void bch2_journal_pin_add_if_older(struct journal *j,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
- spin_lock_irq(&j->pin_lock);
+ spin_lock(&j->lock);
if (journal_pin_active(src_pin) &&
(!journal_pin_active(pin) ||
@@ -1800,24 +1806,19 @@ void bch2_journal_pin_add_if_older(struct journal *j,
__journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
}
- spin_unlock_irq(&j->pin_lock);
+ spin_unlock(&j->lock);
}
static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *ret = NULL;
+ struct journal_entry_pin *ret;
unsigned iter;
- /* so we don't iterate over empty fifo entries below: */
- if (!atomic_read(&fifo_peek_front(&j->pin).count)) {
- spin_lock(&j->lock);
- journal_reclaim_fast(j);
- spin_unlock(&j->lock);
- }
+ /* no need to iterate over empty fifo entries: */
+ journal_reclaim_fast(j);
- spin_lock_irq(&j->pin_lock);
fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
if (journal_pin_seq(j, pin_list) > seq_to_flush)
break;
@@ -1828,71 +1829,82 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
/* must be list_del_init(), see bch2_journal_pin_drop() */
list_move(&ret->list, &pin_list->flushed);
*seq = journal_pin_seq(j, pin_list);
- break;
+ return ret;
}
}
- spin_unlock_irq(&j->pin_lock);
- return ret;
+ return NULL;
}
-static bool journal_flush_done(struct journal *j, u64 seq_to_flush)
+static struct journal_entry_pin *
+journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
- bool ret;
+ struct journal_entry_pin *ret;
spin_lock(&j->lock);
- journal_reclaim_fast(j);
-
- ret = (fifo_used(&j->pin) == 1 &&
- atomic_read(&fifo_peek_front(&j->pin).count) == 1) ||
- last_seq(j) > seq_to_flush;
+ ret = __journal_get_next_pin(j, seq_to_flush, seq);
spin_unlock(&j->lock);
return ret;
}
-void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+static int journal_flush_done(struct journal *j, u64 seq_to_flush,
+ struct journal_entry_pin **pin,
+ u64 *pin_seq)
{
- struct journal_entry_pin *pin;
- u64 pin_seq;
+ int ret;
- if (!test_bit(JOURNAL_STARTED, &j->flags))
- return;
+ *pin = NULL;
- while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq)))
- pin->flush(j, pin, pin_seq);
+ ret = bch2_journal_error(j);
+ if (ret)
+ return ret;
+ spin_lock(&j->lock);
/*
* If journal replay hasn't completed, the unreplayed journal entries
- * hold refs on their corresponding sequence numbers and thus this would
- * deadlock:
+ * hold refs on their corresponding sequence numbers
*/
- if (!test_bit(JOURNAL_REPLAY_DONE, &j->flags))
- return;
+ ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) ||
+ !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+ last_seq(j) > seq_to_flush ||
+ (fifo_used(&j->pin) == 1 &&
+ atomic_read(&fifo_peek_front(&j->pin).count) == 1);
+ spin_unlock(&j->lock);
- wait_event(j->wait,
- journal_flush_done(j, seq_to_flush) ||
- bch2_journal_error(j));
+ return ret;
}
-int bch2_journal_flush_all_pins(struct journal *j)
+int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin *pin;
+ u64 pin_seq;
bool flush;
if (!test_bit(JOURNAL_STARTED, &j->flags))
return 0;
-
- bch2_journal_flush_pins(j, U64_MAX);
+again:
+ wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
+ if (pin) {
+ /* flushing a journal pin might cause a new one to be added: */
+ pin->flush(j, pin, pin_seq);
+ goto again;
+ }
spin_lock(&j->lock);
flush = last_seq(j) != j->last_seq_ondisk ||
- c->btree_roots_dirty;
+ (seq_to_flush == U64_MAX && c->btree_roots_dirty);
spin_unlock(&j->lock);
return flush ? bch2_journal_meta(j) : 0;
}
+int bch2_journal_flush_all_pins(struct journal *j)
+{
+ return bch2_journal_flush_pins(j, U64_MAX);
+}
+
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
@@ -2179,14 +2191,15 @@ static void journal_write_done(struct closure *cl)
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_prev_buf(j);
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&w->key);
+ struct bch_devs_list devs =
+ bch2_extent_devs(bkey_i_to_s_c_extent(&w->key));
- if (!bch2_extent_nr_ptrs(e)) {
+ if (!devs.nr) {
bch_err(c, "unable to write journal to sufficient devices");
goto err;
}
- if (bch2_check_mark_super(c, e, BCH_DATA_JOURNAL))
+ if (bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs))
goto err;
out:
__bch2_time_stats_update(j->write_time, j->write_start_time);
@@ -2194,8 +2207,7 @@ out:
spin_lock(&j->lock);
j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
- journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs =
- bch2_extent_devs(bkey_i_to_s_c_extent(&w->key));
+ journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs = devs;
/*
* Updating last_seq_ondisk may let journal_reclaim_work() discard more
@@ -2358,7 +2370,7 @@ static void journal_write(struct closure *cl)
}
no_io:
- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr)
+ extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
ptr->offset += sectors;
continue_at(cl, journal_write_done, system_highpri_wq);
@@ -2737,7 +2749,9 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
seq = journal_pin_seq(j, p);
spin_unlock(&j->lock);
- bch2_journal_flush_pins(j, seq);
+ ret = bch2_journal_flush_pins(j, seq);
+ if (ret)
+ return ret;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
@@ -2751,7 +2765,7 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
seq++;
spin_unlock(&j->lock);
- ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);
+ ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);
@@ -2857,7 +2871,6 @@ int bch2_fs_journal_init(struct journal *j)
static struct lock_class_key res_key;
spin_lock_init(&j->lock);
- spin_lock_init(&j->pin_lock);
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
@@ -2956,7 +2969,7 @@ ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
ssize_t ret = 0;
unsigned i;
- spin_lock_irq(&j->pin_lock);
+ spin_lock(&j->lock);
fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
"%llu: count %u\n",
@@ -2977,7 +2990,7 @@ ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
"\t%p %pf\n",
pin, pin->flush);
}
- spin_unlock_irq(&j->pin_lock);
+ spin_unlock(&j->lock);
return ret;
}
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 61197e57..5abf356e 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -165,7 +165,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
journal_pin_flush_fn);
-void bch2_journal_flush_pins(struct journal *, u64);
+int bch2_journal_flush_pins(struct journal *, u64);
int bch2_journal_flush_all_pins(struct journal *);
struct closure;
@@ -390,7 +390,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);
-int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_dev_journal_alloc(struct bch_fs *, struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 66923cf4..5eea6579 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -169,12 +169,6 @@ struct journal {
DECLARE_FIFO(struct journal_entry_pin_list, pin);
struct journal_entry_pin_list *replay_pin_list;
- /*
- * Protects the pin lists - the fifo itself is still protected by
- * j->lock though:
- */
- spinlock_t pin_lock;
-
struct mutex blacklist_lock;
struct list_head seq_blacklist;
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 328316a1..2033db81 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -16,13 +16,8 @@
static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
{
struct bch_dev *ca = arg;
- const struct bch_extent_ptr *ptr;
- extent_for_each_ptr(e, ptr)
- if (ptr->dev == ca->dev_idx)
- return true;
-
- return false;
+ return bch2_extent_has_device(e, ca->dev_idx);
}
#define MAX_DATA_OFF_ITER 10
@@ -32,30 +27,17 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
{
struct btree_iter iter;
struct bkey_s_c k;
- u64 keys_moved, sectors_moved;
+ struct bch_move_stats stats;
unsigned pass = 0;
int ret = 0;
- BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
-
if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
return 0;
/*
- * In theory, only one pass should be necessary as we've
- * quiesced all writes before calling this.
- *
- * However, in practice, more than one pass may be necessary:
- * - Some move fails due to an error. We can can find this out
- * from the moving_context.
- * - Some key swap failed because some of the pointers in the
- * key in the tree changed due to caching behavior, btree gc
- * pruning stale pointers, or tiering (if the device being
- * removed is in tier 0). A smarter bkey_cmpxchg would
- * handle these cases.
- *
- * Thus this scans the tree one more time than strictly necessary,
- * but that can be viewed as a verification pass.
+ * XXX: we should be able to do this in one pass, but bch2_move_data()
+ * can spuriously fail to move an extent due to racing with other move
+ * operations
*/
do {
ret = bch2_move_data(c, NULL,
@@ -65,15 +47,14 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
0,
ca->dev_idx,
migrate_pred, ca,
- &keys_moved,
- &sectors_moved);
+ &stats);
if (ret) {
bch_err(c, "error migrating data: %i", ret);
return ret;
}
- } while (keys_moved && pass++ < MAX_DATA_OFF_ITER);
+ } while (atomic64_read(&stats.keys_moved) && pass++ < MAX_DATA_OFF_ITER);
- if (keys_moved) {
+ if (atomic64_read(&stats.keys_moved)) {
bch_err(c, "unable to migrate all data in %d iterations",
MAX_DATA_OFF_ITER);
return -1;
@@ -83,11 +64,7 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) {
- if (!bkey_extent_is_data(k.k))
- continue;
-
- ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
- BCH_DATA_USER);
+ ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k));
if (ret) {
bch_err(c, "error migrating data %i from check_mark_super()", ret);
break;
@@ -99,107 +76,34 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
return ret;
}
-static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
- enum btree_id id)
-{
- struct btree_iter iter;
- struct btree *b;
- int ret;
-
- BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
-
- for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-
- if (!bch2_extent_has_device(e, ca->dev_idx))
- continue;
-
- ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0);
- if (ret) {
- bch2_btree_iter_unlock(&iter);
- return ret;
- }
-
- bch2_btree_iter_set_locks_want(&iter, 0);
- }
- ret = bch2_btree_iter_unlock(&iter);
- if (ret)
- return ret; /* btree IO error */
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-
- BUG_ON(bch2_extent_has_device(e, ca->dev_idx));
- }
- bch2_btree_iter_unlock(&iter);
- }
-
- return 0;
-}
-
-/*
- * This moves only the meta-data off, leaving the data (if any) in place.
- * The data is moved off by bch_move_data_off_device, if desired, and
- * called first.
- *
- * Before calling this, allocation of buckets to the device must have
- * been disabled, as else we'll continue to write meta-data to the device
- * when new buckets are picked for meta-data writes.
- * In addition, the copying gc and allocator threads for the device
- * must have been stopped. The allocator thread is the only thread
- * that writes prio/gen information.
- *
- * Meta-data consists of:
- * - Btree nodes
- * - Prio/gen information
- * - Journal entries
- * - Superblock
- *
- * This has to move the btree nodes and the journal only:
- * - prio/gen information is not written once the allocator thread is stopped.
- * also, as the prio/gen information is per-device it is not moved.
- * - the superblock will be written by the caller once after everything
- * is stopped.
- *
- * Note that currently there is no way to stop btree node and journal
- * meta-data writes to a device without moving the meta-data because
- * once a bucket is open for a btree node, unless a replacement btree
- * node is allocated (and the tree updated), the bucket will continue
- * to be written with updates. Similarly for the journal (it gets
- * written until filled).
- *
- * This routine leaves the data (if any) in place. Whether the data
- * should be moved off is a decision independent of whether the meta
- * data should be moved off and stopped:
- *
- * - For device removal, both data and meta-data are moved off, in
- * that order.
- *
- * - However, for turning a device read-only without removing it, only
- * meta-data is moved off since that's the only way to prevent it
- * from being written. Data is left in the device, but no new data
- * is written.
- */
-
static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
int flags)
{
- unsigned i;
+ struct btree_iter iter;
+ struct btree *b;
int ret = 0;
+ unsigned id;
- BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
-
- if (!(bch2_dev_has_data(c, ca) &
- ((1 << BCH_DATA_JOURNAL)|
- (1 << BCH_DATA_BTREE))))
+ if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_BTREE)))
return 0;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
- for (i = 0; i < BTREE_ID_NR; i++) {
- ret = bch2_move_btree_off(c, ca, i);
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
+
+ if (!bch2_extent_has_device(e, ca->dev_idx))
+ continue;
+
+ ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0);
+ if (ret) {
+ bch2_btree_iter_unlock(&iter);
+ goto err;
+ }
+ }
+ ret = bch2_btree_iter_unlock(&iter);
if (ret)
goto err;
}
@@ -211,6 +115,9 @@ err:
int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
{
+ BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW &&
+ bch2_dev_is_online(ca));
+
return bch2_dev_usrdata_migrate(c, ca, flags) ?:
bch2_dev_metadata_migrate(c, ca, flags);
}
@@ -233,17 +140,6 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
return 0;
}
-/*
- * This doesn't actually move any data -- it marks the keys as bad
- * if they contain a pointer to a device that is forcibly removed
- * and don't have other valid pointers. If there are valid pointers,
- * the necessary pointers to the removed device are replaced with
- * bad pointers instead.
- *
- * This is only called if bch_move_data_off_device above failed, meaning
- * that we've already tried to move the data MAX_DATA_OFF_ITER times and
- * are not likely to succeed if we try again.
- */
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct bkey_s_c k;
@@ -260,11 +156,15 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = btree_iter_err(k))) {
- if (!bkey_extent_is_data(k.k))
- goto advance;
-
- if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx))
- goto advance;
+ if (!bkey_extent_is_data(k.k) ||
+ !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
+ ret = bch2_check_mark_super(c, BCH_DATA_USER,
+ bch2_bkey_devs(k));
+ if (ret)
+ break;
+ bch2_btree_iter_advance_pos(&iter);
+ continue;
+ }
bkey_reassemble(&tmp.key, k);
e = bkey_i_to_s_extent(&tmp.key);
@@ -280,8 +180,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
*/
bch2_extent_normalize(c, e.s);
- if (bkey_extent_is_data(e.k) &&
- (ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
+ ret = bch2_check_mark_super(c, BCH_DATA_USER,
+ bch2_bkey_devs(bkey_i_to_s_c(&tmp.key)));
+ if (ret)
break;
iter.pos = bkey_start_pos(&tmp.key.k);
@@ -300,16 +201,6 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
ret = 0;
if (ret)
break;
-
- continue;
-advance:
- if (bkey_extent_is_data(k.k)) {
- ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
- BCH_DATA_USER);
- if (ret)
- break;
- }
- bch2_btree_iter_advance_pos(&iter);
}
bch2_btree_iter_unlock(&iter);
@@ -346,8 +237,8 @@ retry:
dev_idx)) {
bch2_btree_iter_set_locks_want(&iter, 0);
- ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
- BCH_DATA_BTREE);
+ ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
+ bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
if (ret)
goto err;
} else {
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index a3de3b05..7c7f436c 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -31,15 +31,10 @@ struct moving_context {
/* Closure for waiting on all reads and writes to complete */
struct closure cl;
- /* Key and sector moves issued, updated from submission context */
- u64 keys_moved;
- u64 sectors_moved;
- atomic64_t sectors_raced;
+ struct bch_move_stats *stats;
struct list_head reads;
-
atomic_t sectors_in_flight;
-
wait_queue_head_t wait;
};
@@ -116,8 +111,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
bch2_extent_normalize(c, extent_i_to_s(insert).s);
bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert));
- ret = bch2_check_mark_super(c, extent_i_to_s_c(insert),
- BCH_DATA_USER);
+ ret = bch2_check_mark_super(c, BCH_DATA_USER,
+ bch2_extent_devs(extent_i_to_s_c(insert)));
if (ret)
break;
@@ -145,7 +140,7 @@ next:
nomatch:
if (m->ctxt)
atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->ctxt->sectors_raced);
+ &m->ctxt->stats->sectors_raced);
atomic_long_inc(&c->extent_migrate_raced);
trace_move_race(&new->k);
bch2_btree_iter_advance_pos(&iter);
@@ -303,8 +298,8 @@ static int bch2_move_extent(struct bch_fs *c,
io->write.op.devs = devs;
io->write.op.write_point = wp;
- ctxt->keys_moved++;
- ctxt->sectors_moved += k.k->size;
+ atomic64_inc(&ctxt->stats->keys_moved);
+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
trace_move_extent(k.k);
@@ -353,24 +348,6 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
atomic_read(&ctxt->sectors_in_flight) != sectors_pending);
}
-static void bch2_move_ctxt_exit(struct moving_context *ctxt)
-{
- move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight));
- closure_sync(&ctxt->cl);
-
- EBUG_ON(!list_empty(&ctxt->reads));
- EBUG_ON(atomic_read(&ctxt->sectors_in_flight));
-}
-
-static void bch2_move_ctxt_init(struct moving_context *ctxt)
-{
- memset(ctxt, 0, sizeof(*ctxt));
- closure_init_stack(&ctxt->cl);
-
- INIT_LIST_HEAD(&ctxt->reads);
- init_waitqueue_head(&ctxt->wait);
-}
-
int bch2_move_data(struct bch_fs *c,
struct bch_ratelimit *rate,
unsigned sectors_in_flight,
@@ -379,20 +356,21 @@ int bch2_move_data(struct bch_fs *c,
int btree_insert_flags,
int move_device,
move_pred_fn pred, void *arg,
- u64 *keys_moved,
- u64 *sectors_moved)
+ struct bch_move_stats *stats)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct moving_context ctxt;
+ struct moving_context ctxt = { .stats = stats };
struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
- struct btree_iter iter;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
u64 cur_inum = U64_MAX;
int ret = 0;
- bch2_move_ctxt_init(&ctxt);
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+ memset(stats, 0, sizeof(*stats));
+ closure_init_stack(&ctxt.cl);
+ INIT_LIST_HEAD(&ctxt.reads);
+ init_waitqueue_head(&ctxt.wait);
+ bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH);
if (rate)
@@ -400,7 +378,7 @@ int bch2_move_data(struct bch_fs *c,
while (!kthread || !(ret = kthread_should_stop())) {
if (atomic_read(&ctxt.sectors_in_flight) >= sectors_in_flight) {
- bch2_btree_iter_unlock(&iter);
+ bch2_btree_iter_unlock(&stats->iter);
move_ctxt_wait_event(&ctxt,
atomic_read(&ctxt.sectors_in_flight) <
sectors_in_flight);
@@ -408,11 +386,11 @@ int bch2_move_data(struct bch_fs *c,
if (rate &&
bch2_ratelimit_delay(rate) &&
- (bch2_btree_iter_unlock(&iter),
+ (bch2_btree_iter_unlock(&stats->iter),
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
break;
peek:
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek(&stats->iter);
if (!k.k)
break;
ret = btree_iter_err(k);
@@ -420,13 +398,13 @@ peek:
break;
if (!bkey_extent_is_data(k.k))
- goto next;
+ goto next_nondata;
if (cur_inum != k.k->p.inode) {
struct bch_inode_unpacked inode;
/* don't hold btree locks while looking up inode: */
- bch2_btree_iter_unlock(&iter);
+ bch2_btree_iter_unlock(&stats->iter);
opts = bch2_opts_to_inode_opts(c->opts);
if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
@@ -441,7 +419,7 @@ peek:
/* unlock before doing IO: */
bkey_reassemble(&tmp.k, k);
k = bkey_i_to_s_c(&tmp.k);
- bch2_btree_iter_unlock(&iter);
+ bch2_btree_iter_unlock(&stats->iter);
if (bch2_move_extent(c, &ctxt, devs, wp,
btree_insert_flags,
@@ -454,17 +432,24 @@ peek:
if (rate)
bch2_ratelimit_increment(rate, k.k->size);
next:
- bch2_btree_iter_advance_pos(&iter);
- bch2_btree_iter_cond_resched(&iter);
+ atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k),
+ &stats->sectors_seen);
+next_nondata:
+ bch2_btree_iter_advance_pos(&stats->iter);
+ bch2_btree_iter_cond_resched(&stats->iter);
}
- bch2_btree_iter_unlock(&iter);
- bch2_move_ctxt_exit(&ctxt);
+ bch2_btree_iter_unlock(&stats->iter);
+
+ move_ctxt_wait_event(&ctxt, !atomic_read(&ctxt.sectors_in_flight));
+ closure_sync(&ctxt.cl);
- trace_move_data(c, ctxt.sectors_moved, ctxt.keys_moved);
+ EBUG_ON(!list_empty(&ctxt.reads));
+ EBUG_ON(atomic_read(&ctxt.sectors_in_flight));
- *keys_moved = ctxt.keys_moved;
- *sectors_moved = ctxt.sectors_moved;
+ trace_move_data(c,
+ atomic64_read(&stats->sectors_moved),
+ atomic64_read(&stats->keys_moved));
return ret;
}
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index 2e884ce0..24d6ddfa 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -1,6 +1,7 @@
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H
+#include "btree_iter.h"
#include "buckets.h"
#include "io_types.h"
@@ -25,10 +26,19 @@ void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *);
typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent);
+struct bch_move_stats {
+ struct btree_iter iter;
+
+ atomic64_t keys_moved;
+ atomic64_t sectors_moved;
+ atomic64_t sectors_seen;
+ atomic64_t sectors_raced;
+};
+
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
unsigned, struct bch_devs_mask *,
struct write_point_specifier,
int, int, move_pred_fn, void *,
- u64 *, u64 *);
+ struct bch_move_stats *);
#endif /* _BCACHEFS_MOVE_H */
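
As a rough illustration (not part of the patch): with the new interface a caller of bch2_move_data() passes a single bch_move_stats instead of the two u64 out-parameters, and reads the counters back afterwards with atomic64_read(). The caller-side names here (c, rate, SECTORS_IN_FLIGHT, devs, writepoint, btree_insert_flags, dev_idx, pred, pred_arg) are placeholders:

	struct bch_move_stats stats;
	int ret;

	/* stats is zeroed by bch2_move_data() itself; the counters are atomic64s now */
	ret = bch2_move_data(c, rate, SECTORS_IN_FLIGHT, devs, writepoint,
			     btree_insert_flags, dev_idx, pred, pred_arg, &stats);
	if (!ret)
		pr_info("moved %llu keys / %llu sectors",
			(u64) atomic64_read(&stats.keys_moved),
			(u64) atomic64_read(&stats.sectors_moved));
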
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 90eb4ca2..d6f2968e 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -100,7 +100,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
copygc_heap *h = &ca->copygc_heap;
struct copygc_heap_entry e, *i;
struct bucket_array *buckets;
- u64 keys_moved, sectors_moved;
+ struct bch_move_stats move_stats;
u64 sectors_to_move = 0, sectors_not_moved = 0;
u64 buckets_to_move, buckets_not_moved = 0;
size_t b;
@@ -167,8 +167,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
BTREE_INSERT_USE_RESERVE,
ca->dev_idx,
copygc_pred, ca,
- &keys_moved,
- &sectors_moved);
+ &move_stats);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
@@ -189,7 +188,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
buckets_not_moved, buckets_to_move);
trace_copygc(ca,
- sectors_moved, sectors_not_moved,
+ atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
buckets_to_move, buckets_not_moved);
}
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index e6833d95..eae63cf8 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -167,6 +167,27 @@ int bch2_opt_lookup(const char *name)
return -1;
}
+struct synonym {
+ const char *s1, *s2;
+};
+
+static const struct synonym bch_opt_synonyms[] = {
+ { "quota", "usrquota" },
+};
+
+static int bch2_mount_opt_lookup(const char *name)
+{
+ const struct synonym *i;
+
+ for (i = bch_opt_synonyms;
+ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
+ i++)
+ if (!strcmp(name, i->s1))
+ name = i->s2;
+
+ return bch2_opt_lookup(name);
+}
+
int bch2_opt_parse(const struct bch_option *opt, const char *val, u64 *res)
{
ssize_t ret;
@@ -211,7 +232,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
val = opt;
if (val) {
- id = bch2_opt_lookup(name);
+ id = bch2_mount_opt_lookup(name);
if (id < 0)
goto bad_opt;
@@ -219,12 +240,12 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
if (ret < 0)
goto bad_val;
} else {
- id = bch2_opt_lookup(name);
+ id = bch2_mount_opt_lookup(name);
v = 1;
if (id < 0 &&
!strncmp("no", name, 2)) {
- id = bch2_opt_lookup(name + 2);
+ id = bch2_mount_opt_lookup(name + 2);
v = 0;
}
@@ -242,6 +263,11 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
!IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
goto bad_opt;
+ if ((id == Opt_usrquota ||
+ id == Opt_grpquota) &&
+ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
+ goto bad_opt;
+
bch2_opt_set_by_id(opts, id, v);
}
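
For illustration only (not in the patch): the synonym table means a mount option spelled "quota" is looked up as "usrquota", so mounting with something like "-o quota" enables user quota accounting; and per the new check just above, usrquota/grpquota are rejected at mount time unless the build has CONFIG_BCACHEFS_QUOTA enabled.
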
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 126056e6..5d42dd5f 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -112,6 +112,15 @@ enum opt_type {
BCH_OPT(acl, u8, OPT_MOUNT, \
OPT_BOOL(), \
BCH_SB_POSIX_ACL, true) \
+ BCH_OPT(usrquota, u8, OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_USRQUOTA, false) \
+ BCH_OPT(grpquota, u8, OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_GRPQUOTA, false) \
+ BCH_OPT(prjquota, u8, OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_PRJQUOTA, false) \
BCH_OPT(degraded, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
@@ -171,7 +180,7 @@ static const struct bch_opts bch2_opts_default = {
#define opt_defined(_opts, _name) ((_opts)._name##_defined)
#define opt_get(_opts, _name) \
- (opt_defined(_opts, _name) ? _opts._name : bch2_opts_default._name)
+ (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
#define opt_set(_opts, _name, _v) \
do { \
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
new file mode 100644
index 00000000..c550fd9e
--- /dev/null
+++ b/libbcachefs/quota.c
@@ -0,0 +1,786 @@
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "inode.h"
+#include "quota.h"
+#include "super-io.h"
+
+static const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_quota dq;
+
+ if (k.k->p.inode >= QTYP_NR)
+ return "invalid quota type";
+
+ switch (k.k->type) {
+ case BCH_QUOTA: {
+ dq = bkey_s_c_to_quota(k);
+
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
+ return "incorrect value size";
+
+ return NULL;
+ }
+ default:
+ return "invalid type";
+ }
+}
+
+static const char * const bch2_quota_counters[] = {
+ "space",
+ "inodes",
+};
+
+static void bch2_quota_to_text(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+	char *out = buf, *end = buf + size;

+ struct bkey_s_c_quota dq;
+ unsigned i;
+
+ switch (k.k->type) {
+ case BCH_QUOTA:
+ dq = bkey_s_c_to_quota(k);
+
+ for (i = 0; i < Q_COUNTERS; i++)
+ out += scnprintf(out, end - out, "%s hardlimit %llu softlimit %llu",
+ bch2_quota_counters[i],
+ le64_to_cpu(dq.v->c[i].hardlimit),
+ le64_to_cpu(dq.v->c[i].softlimit));
+ break;
+ }
+}
+
+const struct bkey_ops bch2_bkey_quota_ops = {
+ .key_invalid = bch2_quota_invalid,
+ .val_to_text = bch2_quota_to_text,
+};
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+#include <linux/cred.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+
+static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
+{
+ qtypes >>= i;
+ return qtypes ? i + __ffs(qtypes) : QTYP_NR;
+}
+
+#define for_each_set_qtype(_c, _i, _q, _qtypes) \
+ for (_i = 0; \
+ (_i = __next_qtype(_i, _qtypes), \
+ _q = &(_c)->quotas[_i], \
+ _i < QTYP_NR); \
+ _i++)
+
+static inline unsigned enabled_qtypes(struct bch_fs *c)
+{
+ return ((c->opts.usrquota << QTYP_USR)|
+ (c->opts.grpquota << QTYP_GRP)|
+ (c->opts.prjquota << QTYP_PRJ));
+}
+
+static bool ignore_hardlimit(struct bch_memquota_type *q)
+{
+ if (capable(CAP_SYS_RESOURCE))
+ return true;
+#if 0
+ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
+
+ return capable(CAP_SYS_RESOURCE) &&
+ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
+ !(info->dqi_flags & DQF_ROOT_SQUASH));
+#endif
+ return false;
+}
+
+enum quota_msg {
+ SOFTWARN, /* Softlimit reached */
+ SOFTLONGWARN, /* Grace time expired */
+ HARDWARN, /* Hardlimit reached */
+
+ HARDBELOW, /* Usage got below inode hardlimit */
+ SOFTBELOW, /* Usage got below inode softlimit */
+};
+
+static int quota_nl[][Q_COUNTERS] = {
+ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN,
+ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN,
+ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN,
+ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW,
+ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW,
+
+ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN,
+ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN,
+ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN,
+ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW,
+ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW,
+};
+
+struct quota_msgs {
+ u8 nr;
+ struct {
+ u8 qtype;
+ u8 msg;
+ } m[QTYP_NR * Q_COUNTERS];
+};
+
+static void prepare_msg(unsigned qtype,
+ enum quota_counters counter,
+ struct quota_msgs *msgs,
+ enum quota_msg msg_type)
+{
+ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
+
+ msgs->m[msgs->nr].qtype = qtype;
+ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter];
+ msgs->nr++;
+}
+
+static void prepare_warning(struct memquota_counter *qc,
+ unsigned qtype,
+ enum quota_counters counter,
+ struct quota_msgs *msgs,
+ enum quota_msg msg_type)
+{
+ if (qc->warning_issued & (1 << msg_type))
+ return;
+
+ prepare_msg(qtype, counter, msgs, msg_type);
+}
+
+static void flush_warnings(struct bch_qid qid,
+ struct super_block *sb,
+ struct quota_msgs *msgs)
+{
+ unsigned i;
+
+ for (i = 0; i < msgs->nr; i++)
+ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
+ sb->s_dev, msgs->m[i].msg);
+}
+
+static int bch2_quota_check_limit(struct bch_fs *c,
+ unsigned qtype,
+ struct bch_memquota *mq,
+ struct quota_msgs *msgs,
+ enum quota_counters counter,
+ s64 v,
+ enum quota_acct_mode mode)
+{
+ struct bch_memquota_type *q = &c->quotas[qtype];
+ struct memquota_counter *qc = &mq->c[counter];
+ u64 n = qc->v + v;
+
+ BUG_ON((s64) n < 0);
+
+ if (mode == BCH_QUOTA_NOCHECK)
+ return 0;
+
+ if (v <= 0) {
+ if (n < qc->hardlimit &&
+ (qc->warning_issued & (1 << HARDWARN))) {
+ qc->warning_issued &= ~(1 << HARDWARN);
+ prepare_msg(qtype, counter, msgs, HARDBELOW);
+ }
+
+ if (n < qc->softlimit &&
+ (qc->warning_issued & (1 << SOFTWARN))) {
+ qc->warning_issued &= ~(1 << SOFTWARN);
+ prepare_msg(qtype, counter, msgs, SOFTBELOW);
+ }
+
+ qc->warning_issued = 0;
+ return 0;
+ }
+
+ if (qc->hardlimit &&
+ qc->hardlimit < n &&
+ !ignore_hardlimit(q)) {
+ if (mode == BCH_QUOTA_PREALLOC)
+ return -EDQUOT;
+
+ prepare_warning(qc, qtype, counter, msgs, HARDWARN);
+ }
+
+ if (qc->softlimit &&
+ qc->softlimit < n &&
+ qc->timer &&
+ ktime_get_real_seconds() >= qc->timer &&
+ !ignore_hardlimit(q)) {
+ if (mode == BCH_QUOTA_PREALLOC)
+ return -EDQUOT;
+
+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
+ }
+
+ if (qc->softlimit &&
+ qc->softlimit < n &&
+ qc->timer == 0) {
+ if (mode == BCH_QUOTA_PREALLOC)
+ return -EDQUOT;
+
+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
+
+ /* XXX is this the right one? */
+ qc->timer = ktime_get_real_seconds() +
+ q->limits[counter].warnlimit;
+ }
+
+ return 0;
+}
+
+int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+ enum quota_counters counter, s64 v,
+ enum quota_acct_mode mode)
+{
+ unsigned qtypes = enabled_qtypes(c);
+ struct bch_memquota_type *q;
+ struct bch_memquota *mq[QTYP_NR];
+ struct quota_msgs msgs;
+ unsigned i;
+ int ret = 0;
+
+ memset(&msgs, 0, sizeof(msgs));
+
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_lock(&q->lock);
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS);
+ if (!mq[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
+ if (ret)
+ goto err;
+ }
+
+ for_each_set_qtype(c, i, q, qtypes)
+ mq[i]->c[counter].v += v;
+err:
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_unlock(&q->lock);
+
+ flush_warnings(qid, c->vfs_sb, &msgs);
+
+ return ret;
+}
+
+static void __bch2_quota_transfer(struct bch_memquota *src_q,
+ struct bch_memquota *dst_q,
+ enum quota_counters counter, s64 v)
+{
+ BUG_ON(v > src_q->c[counter].v);
+ BUG_ON(v + dst_q->c[counter].v < v);
+
+ src_q->c[counter].v -= v;
+ dst_q->c[counter].v += v;
+}
+
+int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+ struct bch_qid dst,
+ struct bch_qid src, u64 space)
+{
+ struct bch_memquota_type *q;
+ struct bch_memquota *src_q[3], *dst_q[3];
+ struct quota_msgs msgs;
+ unsigned i;
+ int ret = 0;
+
+ qtypes &= enabled_qtypes(c);
+
+ memset(&msgs, 0, sizeof(msgs));
+
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_lock(&q->lock);
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS);
+ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS);
+
+ if (!src_q[i] || !dst_q[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
+ dst_q[i]->c[Q_SPC].v + space,
+ BCH_QUOTA_PREALLOC);
+ if (ret)
+ goto err;
+
+ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
+ dst_q[i]->c[Q_INO].v + 1,
+ BCH_QUOTA_PREALLOC);
+ if (ret)
+ goto err;
+ }
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
+ }
+
+err:
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_unlock(&q->lock);
+
+ flush_warnings(dst, c->vfs_sb, &msgs);
+
+ return ret;
+}
+
+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_quota dq;
+ struct bch_memquota_type *q;
+ struct bch_memquota *mq;
+ unsigned i;
+
+ BUG_ON(k.k->p.inode >= QTYP_NR);
+
+ switch (k.k->type) {
+ case BCH_QUOTA:
+ dq = bkey_s_c_to_quota(k);
+ q = &c->quotas[k.k->p.inode];
+
+ mutex_lock(&q->lock);
+ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
+ if (!mq) {
+ mutex_unlock(&q->lock);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < Q_COUNTERS; i++) {
+ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
+ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
+ }
+
+ mutex_unlock(&q->lock);
+ }
+
+ return 0;
+}
+
+static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key(&iter, c, BTREE_ID_QUOTAS, POS(type, 0),
+ BTREE_ITER_PREFETCH, k) {
+ if (k.k->p.inode != type)
+ break;
+
+ ret = __bch2_quota_set(c, k);
+ if (ret)
+ break;
+ }
+
+ return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+void bch2_fs_quota_exit(struct bch_fs *c)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
+ genradix_free(&c->quotas[i].table);
+}
+
+void bch2_fs_quota_init(struct bch_fs *c)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
+ mutex_init(&c->quotas[i].lock);
+}
+
+static void bch2_sb_quota_read(struct bch_fs *c)
+{
+ struct bch_sb_field_quota *sb_quota;
+ unsigned i, j;
+
+ sb_quota = bch2_sb_get_quota(c->disk_sb);
+ if (!sb_quota)
+ return;
+
+ for (i = 0; i < QTYP_NR; i++) {
+ struct bch_memquota_type *q = &c->quotas[i];
+
+ for (j = 0; j < Q_COUNTERS; j++) {
+ q->limits[j].timelimit =
+ le32_to_cpu(sb_quota->q[i].c[j].timelimit);
+ q->limits[j].warnlimit =
+ le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
+ }
+ }
+}
+
+int bch2_fs_quota_read(struct bch_fs *c)
+{
+ unsigned i, qtypes = enabled_qtypes(c);
+ struct bch_memquota_type *q;
+ struct btree_iter iter;
+ struct bch_inode_unpacked u;
+ struct bkey_s_c k;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ bch2_sb_quota_read(c);
+ mutex_unlock(&c->sb_lock);
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ ret = bch2_quota_init_type(c, i);
+ if (ret)
+ return ret;
+ }
+
+ for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN,
+ BTREE_ITER_PREFETCH, k) {
+ switch (k.k->type) {
+ case BCH_INODE_FS:
+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
+ if (ret)
+ return ret;
+
+ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+ BCH_QUOTA_NOCHECK);
+ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+ BCH_QUOTA_NOCHECK);
+ }
+ }
+ return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+/* Enable/disable/delete quotas for an entire filesystem: */
+
+static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ /* Accounting must be enabled at mount time: */
+ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
+ return -EINVAL;
+
+ /* Can't enable enforcement without accounting: */
+ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
+ return -EINVAL;
+
+ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
+ return -EINVAL;
+
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ return -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+ if (uflags & FS_QUOTA_UDQ_ENFD)
+ SET_BCH_SB_USRQUOTA(c->disk_sb, true);
+
+ if (uflags & FS_QUOTA_GDQ_ENFD)
+ SET_BCH_SB_GRPQUOTA(c->disk_sb, true);
+#if 0
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ SET_BCH_SB_PRJQUOTA(c->disk_sb, true);
+#endif
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&c->sb_lock);
+ if (uflags & FS_QUOTA_UDQ_ENFD)
+ SET_BCH_SB_USRQUOTA(c->disk_sb, false);
+
+ if (uflags & FS_QUOTA_GDQ_ENFD)
+ SET_BCH_SB_GRPQUOTA(c->disk_sb, false);
+
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ SET_BCH_SB_PRJQUOTA(c->disk_sb, false);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ int ret;
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ if (uflags & FS_USER_QUOTA) {
+ if (c->opts.usrquota)
+ return -EINVAL;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+ POS(QTYP_USR, 0),
+ POS(QTYP_USR + 1, 0),
+ ZERO_VERSION, NULL, NULL, NULL);
+ if (ret)
+ return ret;
+ }
+
+ if (uflags & FS_GROUP_QUOTA) {
+ if (c->opts.grpquota)
+ return -EINVAL;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+ POS(QTYP_GRP, 0),
+ POS(QTYP_GRP + 1, 0),
+ ZERO_VERSION, NULL, NULL, NULL);
+ if (ret)
+ return ret;
+ }
+
+ if (uflags & FS_PROJ_QUOTA) {
+ if (c->opts.prjquota)
+ return -EINVAL;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+ POS(QTYP_PRJ, 0),
+ POS(QTYP_PRJ + 1, 0),
+ ZERO_VERSION, NULL, NULL, NULL);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Return quota status information, such as enforcements, quota file inode
+ * numbers etc.
+ */
+static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ unsigned qtypes = enabled_qtypes(c);
+ unsigned i;
+
+ memset(state, 0, sizeof(*state));
+
+ for (i = 0; i < QTYP_NR; i++) {
+ state->s_state[i].flags |= QCI_SYSFILE;
+
+ if (!(qtypes & (1 << i)))
+ continue;
+
+ state->s_state[i].flags |= QCI_ACCT_ENABLED;
+
+ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
+ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
+
+ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
+ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
+ }
+
+ return 0;
+}
+
+/*
+ * Adjust quota timers & warnings
+ */
+static int bch2_quota_set_info(struct super_block *sb, int type,
+ struct qc_info *info)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_sb_field_quota *sb_quota;
+ struct bch_memquota_type *q;
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ if (type >= QTYP_NR)
+ return -EINVAL;
+
+ if (!((1 << type) & enabled_qtypes(c)))
+ return -ESRCH;
+
+ if (info->i_fieldmask &
+ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
+ return -EINVAL;
+
+ q = &c->quotas[type];
+
+ mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_quota(c->disk_sb);
+ if (!sb_quota) {
+ sb_quota = bch2_fs_sb_resize_quota(c, sizeof(*sb_quota) / sizeof(u64));
+ if (!sb_quota)
+ return -ENOSPC;
+ }
+
+ if (info->i_fieldmask & QC_SPC_TIMER)
+ sb_quota->q[type].c[Q_SPC].timelimit =
+ cpu_to_le32(info->i_spc_timelimit);
+
+ if (info->i_fieldmask & QC_SPC_WARNS)
+ sb_quota->q[type].c[Q_SPC].warnlimit =
+ cpu_to_le32(info->i_spc_warnlimit);
+
+ if (info->i_fieldmask & QC_INO_TIMER)
+ sb_quota->q[type].c[Q_INO].timelimit =
+ cpu_to_le32(info->i_ino_timelimit);
+
+ if (info->i_fieldmask & QC_INO_WARNS)
+ sb_quota->q[type].c[Q_INO].warnlimit =
+ cpu_to_le32(info->i_ino_warnlimit);
+
+ bch2_sb_quota_read(c);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+/* Get/set individual quotas: */
+
+static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
+{
+ dst->d_space = src->c[Q_SPC].v << 9;
+ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9;
+ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9;
+ dst->d_spc_timer = src->c[Q_SPC].timer;
+ dst->d_spc_warns = src->c[Q_SPC].warns;
+
+ dst->d_ino_count = src->c[Q_INO].v;
+ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit;
+ dst->d_ino_softlimit = src->c[Q_INO].softlimit;
+ dst->d_ino_timer = src->c[Q_INO].timer;
+ dst->d_ino_warns = src->c[Q_INO].warns;
+}
+
+static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
+ struct qc_dqblk *qdq)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_memquota_type *q = &c->quotas[kqid.type];
+ qid_t qid = from_kqid(&init_user_ns, kqid);
+ struct bch_memquota *mq;
+
+ memset(qdq, 0, sizeof(*qdq));
+
+ mutex_lock(&q->lock);
+ mq = genradix_ptr(&q->table, qid);
+ if (mq)
+ __bch2_quota_get(qdq, mq);
+ mutex_unlock(&q->lock);
+
+ return 0;
+}
+
+static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
+ struct qc_dqblk *qdq)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_memquota_type *q = &c->quotas[kqid->type];
+ qid_t qid = from_kqid(&init_user_ns, *kqid);
+ struct genradix_iter iter = genradix_iter_init(&q->table, qid);
+ struct bch_memquota *mq;
+ int ret = 0;
+
+ mutex_lock(&q->lock);
+
+ while ((mq = genradix_iter_peek(&iter, &q->table))) {
+ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
+ __bch2_quota_get(qdq, mq);
+ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
+ goto found;
+ }
+
+ genradix_iter_advance(&iter, &q->table);
+ }
+
+ ret = -ENOENT;
+found:
+ mutex_unlock(&q->lock);
+ return ret;
+}
+
+static int bch2_set_quota(struct super_block *sb, struct kqid qid,
+ struct qc_dqblk *qdq)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_quota new_quota;
+ int ret;
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ bkey_quota_init(&new_quota.k_i);
+ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p,
+ BTREE_ITER_WITH_HOLES|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_with_holes(&iter);
+
+ ret = btree_iter_err(k);
+ if (unlikely(ret))
+ return ret;
+
+ switch (k.k->type) {
+ case BCH_QUOTA:
+ new_quota.v = *bkey_s_c_to_quota(k).v;
+ break;
+ }
+
+ if (qdq->d_fieldmask & QC_SPC_SOFT)
+ new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit);
+ if (qdq->d_fieldmask & QC_SPC_HARD)
+ new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit);
+
+	if (qdq->d_fieldmask & QC_INO_SOFT)
+		new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
+	if (qdq->d_fieldmask & QC_INO_HARD)
+		new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+ BTREE_INSERT_ENTRY(&iter, &new_quota.k_i));
+ bch2_btree_iter_unlock(&iter);
+
+ if (ret)
+ return ret;
+
+ ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
+
+ return ret;
+}
+
+const struct quotactl_ops bch2_quotactl_operations = {
+ .quota_enable = bch2_quota_enable,
+ .quota_disable = bch2_quota_disable,
+ .rm_xquota = bch2_quota_remove,
+
+ .get_state = bch2_quota_get_state,
+ .set_info = bch2_quota_set_info,
+
+ .get_dqblk = bch2_get_quota,
+ .get_nextdqblk = bch2_get_next_quota,
+ .set_dqblk = bch2_set_quota,
+};
+
+#endif /* CONFIG_BCACHEFS_QUOTA */
diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h
new file mode 100644
index 00000000..09d51a83
--- /dev/null
+++ b/libbcachefs/quota.h
@@ -0,0 +1,48 @@
+#ifndef _BCACHEFS_QUOTA_H
+#define _BCACHEFS_QUOTA_H
+
+#include "quota_types.h"
+
+extern const struct bkey_ops bch2_bkey_quota_ops;
+
+enum quota_acct_mode {
+ BCH_QUOTA_PREALLOC,
+ BCH_QUOTA_WARN,
+ BCH_QUOTA_NOCHECK,
+};
+
+static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
+{
+ return (struct bch_qid) {
+ .q[QTYP_USR] = u->bi_uid,
+ .q[QTYP_GRP] = u->bi_gid,
+ .q[QTYP_PRJ] = u->bi_project,
+ };
+}
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
+ s64, enum quota_acct_mode);
+
+int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
+ struct bch_qid, u64);
+
+void bch2_fs_quota_exit(struct bch_fs *);
+void bch2_fs_quota_init(struct bch_fs *);
+int bch2_fs_quota_read(struct bch_fs *);
+
+extern const struct quotactl_ops bch2_quotactl_operations;
+
+#else
+
+#define bch2_quota_acct(_c, _uid, _gid, _counter, _v) (0)
+#define bch2_quota_transfer(_c, _type, _src, _dst, _v) (0)
+
+static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
+static inline void bch2_fs_quota_init(struct bch_fs *c) {}
+static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
+
+#endif
+
+#endif /* _BCACHEFS_QUOTA_H */
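
A minimal usage sketch (illustrative only, following the pattern bch2_fs_quota_read() uses in the new quota.c): charging space against whichever quota types are enabled, with enforcement. Here u and new_sectors are placeholders for a populated bch_inode_unpacked and the sector count being charged:

	struct bch_qid qid = bch_qid(&u);
	int ret;

	ret = bch2_quota_acct(c, qid, Q_SPC, new_sectors, BCH_QUOTA_PREALLOC);
	if (ret)	/* -EDQUOT when over a hard limit or an expired soft limit */
		return ret;
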
diff --git a/libbcachefs/quota_types.h b/libbcachefs/quota_types.h
new file mode 100644
index 00000000..bcaed4ea
--- /dev/null
+++ b/libbcachefs/quota_types.h
@@ -0,0 +1,36 @@
+#ifndef _BCACHEFS_QUOTA_TYPES_H
+#define _BCACHEFS_QUOTA_TYPES_H
+
+#include <linux/generic-radix-tree.h>
+
+struct bch_qid {
+ u32 q[QTYP_NR];
+};
+
+struct memquota_counter {
+ u64 v;
+ u64 hardlimit;
+ u64 softlimit;
+ s64 timer;
+ int warns;
+ int warning_issued;
+};
+
+struct bch_memquota {
+ struct memquota_counter c[Q_COUNTERS];
+};
+
+typedef GENRADIX(struct bch_memquota) bch_memquota_table;
+
+struct quota_limit {
+ u32 timelimit;
+ u32 warnlimit;
+};
+
+struct bch_memquota_type {
+ struct quota_limit limits[Q_COUNTERS];
+ bch_memquota_table table;
+ struct mutex lock;
+};
+
+#endif /* _BCACHEFS_QUOTA_TYPES_H */
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 21720186..8dce7dc1 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -330,9 +330,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
return "Btree node size not a power of two";
- if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
- return "Btree node size too large";
-
if (BCH_SB_GC_RESERVE(sb) < 5)
return "gc reserve percentage too small";
@@ -383,27 +380,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
/* device open: */
-static const char *bch2_blkdev_open(const char *path, fmode_t mode,
- void *holder, struct block_device **ret)
-{
- struct block_device *bdev;
-
- *ret = NULL;
- bdev = blkdev_get_by_path(path, mode, holder);
- if (bdev == ERR_PTR(-EBUSY))
- return "device busy";
-
- if (IS_ERR(bdev))
- return "failed to open device";
-
- if (mode & FMODE_WRITE)
- bdev_get_queue(bdev)->backing_dev_info->capabilities
- |= BDI_CAP_STABLE_WRITES;
-
- *ret = bdev;
- return NULL;
-}
-
static void bch2_sb_update(struct bch_fs *c)
{
struct bch_sb *src = c->disk_sb;
@@ -555,44 +531,55 @@ reread:
return NULL;
}
-const char *bch2_read_super(const char *path,
- struct bch_opts opts,
- struct bch_sb_handle *ret)
+int bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
{
- u64 offset = opt_get(opts, sb);
+ u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
const char *err;
- unsigned i;
+ __le64 *i;
+ int ret;
- memset(ret, 0, sizeof(*ret));
- ret->mode = FMODE_READ;
+ memset(sb, 0, sizeof(*sb));
+ sb->mode = FMODE_READ;
- if (!opt_get(opts, noexcl))
- ret->mode |= FMODE_EXCL;
+ if (!opt_get(*opts, noexcl))
+ sb->mode |= FMODE_EXCL;
- if (!opt_get(opts, nochanges))
- ret->mode |= FMODE_WRITE;
+ if (!opt_get(*opts, nochanges))
+ sb->mode |= FMODE_WRITE;
- err = bch2_blkdev_open(path, ret->mode, ret, &ret->bdev);
- if (err)
- return err;
+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+ if (IS_ERR(sb->bdev) &&
+ PTR_ERR(sb->bdev) == -EACCES &&
+ opt_get(*opts, read_only)) {
+ sb->mode &= ~FMODE_WRITE;
+
+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+ if (!IS_ERR(sb->bdev))
+ opt_set(*opts, nochanges, true);
+ }
+
+ if (IS_ERR(sb->bdev))
+ return PTR_ERR(sb->bdev);
err = "cannot allocate memory";
- if (__bch2_super_realloc(ret, 0))
+ ret = __bch2_super_realloc(sb, 0);
+ if (ret)
goto err;
+ ret = -EFAULT;
err = "dynamic fault";
if (bch2_fs_init_fault("read_super"))
goto err;
- err = read_one_super(ret, offset);
+ ret = -EINVAL;
+ err = read_one_super(sb, offset);
if (!err)
goto got_super;
- if (offset != BCH_SB_SECTOR) {
- pr_err("error reading superblock: %s", err);
+ if (opt_defined(*opts, sb))
goto err;
- }
pr_err("error reading default superblock: %s", err);
@@ -600,53 +587,57 @@ const char *bch2_read_super(const char *path,
* Error reading primary superblock - read location of backup
* superblocks:
*/
- bio_reset(ret->bio);
- ret->bio->bi_bdev = ret->bdev;
- ret->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
- ret->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
- bio_set_op_attrs(ret->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+ bio_reset(sb->bio);
+ sb->bio->bi_bdev = sb->bdev;
+ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
+ sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
/*
* use sb buffer to read layout, since sb buffer is page aligned but
* layout won't be:
*/
- bch2_bio_map(ret->bio, ret->sb);
+ bch2_bio_map(sb->bio, sb->sb);
err = "IO error";
- if (submit_bio_wait(ret->bio))
+ if (submit_bio_wait(sb->bio))
goto err;
- memcpy(&layout, ret->sb, sizeof(layout));
+ memcpy(&layout, sb->sb, sizeof(layout));
err = validate_sb_layout(&layout);
if (err)
goto err;
- for (i = 0; i < layout.nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout.sb_offset[i]);
+ for (i = layout.sb_offset;
+ i < layout.sb_offset + layout.nr_superblocks; i++) {
+ offset = le64_to_cpu(*i);
- if (offset == BCH_SB_SECTOR)
+ if (offset == opt_get(*opts, sb))
continue;
- err = read_one_super(ret, offset);
+ err = read_one_super(sb, offset);
if (!err)
goto got_super;
}
+
+ ret = -EINVAL;
goto err;
-got_super:
- pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
- le64_to_cpu(ret->sb->version),
- le64_to_cpu(ret->sb->flags[0]),
- le64_to_cpu(ret->sb->seq),
- le32_to_cpu(ret->sb->u64s));
+got_super:
err = "Superblock block size smaller than device block size";
- if (le16_to_cpu(ret->sb->block_size) << 9 <
- bdev_logical_block_size(ret->bdev))
+ ret = -EINVAL;
+ if (le16_to_cpu(sb->sb->block_size) << 9 <
+ bdev_logical_block_size(sb->bdev))
goto err;
- return NULL;
+ if (sb->mode & FMODE_WRITE)
+ bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
+ |= BDI_CAP_STABLE_WRITES;
+
+ return 0;
err:
- bch2_free_super(ret);
- return err;
+ bch2_free_super(sb);
+ pr_err("error reading superblock: %s", err);
+ return ret;
}
/* write superblock: */
@@ -1108,13 +1099,20 @@ err:
return ret;
}
-static inline int __bch2_check_mark_super(struct bch_fs *c,
- struct bch_replicas_cpu_entry search,
- unsigned max_dev)
+int bch2_check_mark_super(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bch_devs_list devs)
{
+ struct bch_replicas_cpu_entry search;
struct bch_replicas_cpu *r, *gc_r;
+ unsigned max_dev;
bool marked;
+ if (!devs.nr)
+ return 0;
+
+ devlist_to_replicas(devs, data_type, &search, &max_dev);
+
rcu_read_lock();
r = rcu_dereference(c->replicas);
gc_r = rcu_dereference(c->replicas_gc);
@@ -1126,32 +1124,6 @@ static inline int __bch2_check_mark_super(struct bch_fs *c,
: bch2_check_mark_super_slowpath(c, search, max_dev);
}
-int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
- enum bch_data_type data_type)
-{
- struct bch_replicas_cpu_entry search;
- unsigned max_dev;
-
- if (!bkey_to_replicas(e, data_type, &search, &max_dev))
- return 0;
-
- return __bch2_check_mark_super(c, search, max_dev);
-}
-
-int bch2_check_mark_super_devlist(struct bch_fs *c,
- struct bch_devs_list *devs,
- enum bch_data_type data_type)
-{
- struct bch_replicas_cpu_entry search;
- unsigned max_dev;
-
- if (!devs->nr)
- return 0;
-
- devlist_to_replicas(*devs, data_type, &search, &max_dev);
- return __bch2_check_mark_super(c, search, max_dev);
-}
-
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
struct bch_replicas_cpu *new_r, *old_r;
@@ -1435,12 +1407,19 @@ int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t
/* Query replicas: */
-static bool __bch2_sb_has_replicas(struct bch_fs *c,
- struct bch_replicas_cpu_entry search,
- unsigned max_dev)
+bool bch2_sb_has_replicas(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bch_devs_list devs)
{
+ struct bch_replicas_cpu_entry search;
+ unsigned max_dev;
bool ret;
+ if (!devs.nr)
+ return true;
+
+ devlist_to_replicas(devs, data_type, &search, &max_dev);
+
rcu_read_lock();
ret = replicas_has_entry(rcu_dereference(c->replicas),
search, max_dev);
@@ -1449,31 +1428,6 @@ static bool __bch2_sb_has_replicas(struct bch_fs *c,
return ret;
}
-bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
- enum bch_data_type data_type)
-{
- struct bch_replicas_cpu_entry search;
- unsigned max_dev;
-
- if (!bkey_to_replicas(e, data_type, &search, &max_dev))
- return true;
-
- return __bch2_sb_has_replicas(c, search, max_dev);
-}
-
-bool bch2_sb_has_replicas_devlist(struct bch_fs *c, struct bch_devs_list *devs,
- enum bch_data_type data_type)
-{
- struct bch_replicas_cpu_entry search;
- unsigned max_dev;
-
- if (!devs->nr)
- return true;
-
- devlist_to_replicas(*devs, data_type, &search, &max_dev);
- return __bch2_sb_has_replicas(c, search, max_dev);
-}
-
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs)
{
@@ -1579,12 +1533,23 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
goto out;
for_each_cpu_replicas_entry(r, e)
- if (replicas_test_dev(e, ca->dev_idx)) {
+ if (replicas_test_dev(e, ca->dev_idx))
ret |= 1 << e->data_type;
- break;
- }
out:
rcu_read_unlock();
return ret;
}
+
+/* Quotas: */
+
+static const char *bch2_sb_validate_quota(struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_quota *q = field_to_type(f, quota);
+
+ if (vstruct_bytes(&q->field) != sizeof(*q))
+ return "invalid field quota: wrong size";
+
+ return NULL;
+}
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index e0dd26e3..59a8b816 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -94,8 +94,7 @@ int bch2_super_realloc(struct bch_sb_handle *, unsigned);
const char *bch2_sb_validate(struct bch_sb_handle *);
-const char *bch2_read_super(const char *, struct bch_opts,
- struct bch_sb_handle *);
+int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
void bch2_write_super(struct bch_fs *);
/* BCH_SB_FIELD_journal: */
@@ -139,14 +138,10 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
/* BCH_SB_FIELD_replicas: */
-bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
- enum bch_data_type);
-bool bch2_sb_has_replicas_devlist(struct bch_fs *, struct bch_devs_list *,
- enum bch_data_type);
-int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
- enum bch_data_type);
-int bch2_check_mark_super_devlist(struct bch_fs *, struct bch_devs_list *,
- enum bch_data_type);
+bool bch2_sb_has_replicas(struct bch_fs *, enum bch_data_type,
+ struct bch_devs_list);
+int bch2_check_mark_super(struct bch_fs *, enum bch_data_type,
+ struct bch_devs_list);
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 69290d27..29ffba65 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -29,6 +29,7 @@
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
+#include "quota.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
@@ -214,14 +215,15 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/
bch2_journal_flush_all_pins(&c->journal);
- if (!bch2_journal_error(&c->journal))
- bch2_btree_verify_flushed(c);
-
for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca);
bch2_fs_journal_stop(&c->journal);
+ if (!bch2_journal_error(&c->journal) &&
+ !test_bit(BCH_FS_ERROR, &c->flags))
+ bch2_btree_verify_flushed(c);
+
for_each_member_device(ca, c, i)
bch2_dev_allocator_remove(c, ca);
}
@@ -366,6 +368,7 @@ err:
static void bch2_fs_free(struct bch_fs *c)
{
+ bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_btree_cache_exit(c);
@@ -380,7 +383,7 @@ static void bch2_fs_free(struct bch_fs *c)
bioset_exit(&c->bio_write);
bioset_exit(&c->bio_read_split);
bioset_exit(&c->bio_read);
- bioset_exit(&c->btree_read_bio);
+ bioset_exit(&c->btree_bio);
mempool_exit(&c->btree_interior_update_pool);
mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter);
@@ -492,6 +495,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_allocator_init(c);
bch2_fs_tiering_init(c);
+ bch2_fs_quota_init(c);
INIT_LIST_HEAD(&c->list);
@@ -561,8 +565,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
sizeof(struct btree_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
- bioset_init(&c->btree_read_bio, 1,
- offsetof(struct btree_read_bio, bio),
+ bioset_init(&c->btree_bio, 1,
+ max(offsetof(struct btree_read_bio, bio),
+ offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS) ||
@@ -671,13 +676,10 @@ static const char *__bch2_fs_start(struct bch_fs *c)
struct bch_dev *ca;
LIST_HEAD(journal);
struct jset *j;
- struct closure cl;
time64_t now;
unsigned i;
int ret = -EINVAL;
- closure_init_stack(&cl);
-
mutex_lock(&c->state_lock);
BUG_ON(c->state != BCH_FS_STARTING);
@@ -705,14 +707,14 @@ static const char *__bch2_fs_start(struct bch_fs *c)
unsigned level;
struct bkey_i *k;
- err = "missing btree root";
k = bch2_journal_find_btree_root(c, j, i, &level);
- if (!k && i < BTREE_ID_ALLOC)
- goto err;
-
if (!k)
continue;
+ err = "invalid btree root pointer";
+ if (IS_ERR(k))
+ goto err;
+
err = "error reading btree root";
if (bch2_btree_root_read(c, i, k, level)) {
if (i != BTREE_ID_ALLOC)
@@ -722,6 +724,10 @@ static const char *__bch2_fs_start(struct bch_fs *c)
}
}
+ for (i = 0; i < BTREE_ID_NR; i++)
+ if (!c->btree_roots[i].b)
+ bch2_btree_root_alloc(c, i);
+
err = "error reading allocation information";
ret = bch2_alloc_read(c, &journal);
if (ret)
@@ -739,14 +745,6 @@ static const char *__bch2_fs_start(struct bch_fs *c)
if (c->opts.noreplay)
goto recovery_done;
- err = "cannot allocate new btree root";
- for (i = 0; i < BTREE_ID_NR; i++)
- if (!c->btree_roots[i].b &&
- bch2_btree_root_alloc(c, i, &cl))
- goto err;
-
- closure_sync(&cl);
-
/*
* bch2_journal_start() can't happen sooner, or btree_gc_finish()
* will give spurious errors about oldest_gen > bucket_gen -
@@ -754,12 +752,9 @@ static const char *__bch2_fs_start(struct bch_fs *c)
*/
bch2_journal_start(c);
- err = "error starting allocator thread";
- for_each_rw_member(ca, c, i)
- if (bch2_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
+ err = "error starting allocator";
+ if (bch2_fs_allocator_start(c))
+ goto err;
bch_verbose(c, "starting journal replay:");
err = "journal replay failed";
@@ -777,6 +772,14 @@ static const char *__bch2_fs_start(struct bch_fs *c)
if (ret)
goto err;
bch_verbose(c, "fsck done");
+
+ if (c->opts.usrquota || c->opts.grpquota) {
+ bch_verbose(c, "reading quotas:");
+ ret = bch2_fs_quota_read(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "quotas done");
+ }
} else {
struct bch_inode_unpacked inode;
struct bkey_inode_buf packed_inode;
@@ -784,6 +787,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
bch_notice(c, "initializing new filesystem");
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+ set_bit(BCH_FS_BRAND_NEW_FS, &c->flags);
ret = bch2_initial_gc(c, &journal);
if (ret)
@@ -791,15 +795,15 @@ static const char *__bch2_fs_start(struct bch_fs *c)
err = "unable to allocate journal buckets";
for_each_rw_member(ca, c, i)
- if (bch2_dev_journal_alloc(ca)) {
+ if (bch2_dev_journal_alloc(c, ca)) {
percpu_ref_put(&ca->io_ref);
goto err;
}
- err = "cannot allocate new btree root";
+ clear_bit(BCH_FS_BRAND_NEW_FS, &c->flags);
+
for (i = 0; i < BTREE_ID_NR; i++)
- if (bch2_btree_root_alloc(c, i, &cl))
- goto err;
+ bch2_btree_root_alloc(c, i);
/*
* journal_res_get() will crash if called before this has
@@ -808,15 +812,9 @@ static const char *__bch2_fs_start(struct bch_fs *c)
bch2_journal_start(c);
bch2_journal_set_replay_done(&c->journal);
- err = "error starting allocator thread";
- for_each_rw_member(ca, c, i)
- if (bch2_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
-
- /* Wait for new btree roots to be written: */
- closure_sync(&cl);
+ err = "error starting allocator";
+ if (bch2_fs_allocator_start(c))
+ goto err;
bch2_inode_init(c, &inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
@@ -830,6 +828,12 @@ static const char *__bch2_fs_start(struct bch_fs *c)
NULL, NULL, NULL, 0))
goto err;
+ if (c->opts.usrquota || c->opts.grpquota) {
+ ret = bch2_fs_quota_read(c);
+ if (ret)
+ goto err;
+ }
+
err = "error writing first journal entry";
if (bch2_journal_meta(&c->journal))
goto err;
@@ -867,8 +871,6 @@ out:
return err;
err:
fsck_err:
- closure_sync(&cl);
-
switch (ret) {
case BCH_FSCK_ERRORS_NOT_FIXED:
bch_err(c, "filesystem contains errors: please report this to the developers");
@@ -1107,6 +1109,8 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
struct bch_dev *ca;
int ret;
+ lockdep_assert_held(&c->state_lock);
+
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb->seq))
bch2_sb_to_fs(c, sb->sb);
@@ -1153,7 +1157,9 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
bdevname(ca->disk_sb.bdev, c->name);
bdevname(ca->disk_sb.bdev, ca->name);
+ mutex_lock(&c->sb_lock);
bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+ mutex_unlock(&c->sb_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
@@ -1430,17 +1436,18 @@ err:
/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
+ struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
const char *err;
struct bch_dev *ca = NULL;
struct bch_sb_field_members *mi, *dev_mi;
struct bch_member saved_mi;
unsigned dev_idx, nr_devices, u64s;
- int ret = -EINVAL;
+ int ret;
- err = bch2_read_super(path, bch2_opts_empty(), &sb);
- if (err)
- return -EINVAL;
+ ret = bch2_read_super(path, &opts, &sb);
+ if (ret)
+ return ret;
err = bch2_sb_validate(&sb);
if (err)
@@ -1479,14 +1486,14 @@ have_slot:
sizeof(struct bch_member) * nr_devices) / sizeof(u64);
err = "no space in superblock for member info";
- mi = bch2_fs_sb_resize_members(c, u64s);
- if (!mi)
- goto err_unlock;
-
dev_mi = bch2_sb_resize_members(&sb, u64s);
if (!dev_mi)
goto err_unlock;
+ mi = bch2_fs_sb_resize_members(c, u64s);
+ if (!mi)
+ goto err_unlock;
+
memcpy(dev_mi, mi, u64s * sizeof(u64));
dev_mi->members[dev_idx] = saved_mi;
@@ -1499,30 +1506,30 @@ have_slot:
c->disk_sb->nr_devices = nr_devices;
c->sb.nr_devices = nr_devices;
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
if (bch2_dev_alloc(c, dev_idx)) {
err = "cannot allocate memory";
ret = -ENOMEM;
- goto err_unlock;
+ goto err;
}
if (__bch2_dev_online(c, &sb)) {
err = "bch2_dev_online() error";
ret = -ENOMEM;
- goto err_unlock;
+ goto err;
}
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
- err = "journal alloc failed";
- if (bch2_dev_journal_alloc(ca))
- goto err;
-
err = __bch2_dev_read_write(c, ca);
if (err)
goto err;
+
+ err = "journal alloc failed";
+ if (bch2_dev_journal_alloc(c, ca))
+ goto err;
}
mutex_unlock(&c->state_lock);
@@ -1540,16 +1547,20 @@ err:
/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
+ struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb = { NULL };
struct bch_dev *ca;
unsigned dev_idx;
const char *err;
+ int ret;
mutex_lock(&c->state_lock);
- err = bch2_read_super(path, bch2_opts_empty(), &sb);
- if (err)
- goto err;
+ ret = bch2_read_super(path, &opts, &sb);
+ if (ret) {
+ mutex_unlock(&c->state_lock);
+ return ret;
+ }
dev_idx = sb.sb->dev_idx;
@@ -1557,13 +1568,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
if (err)
goto err;
- mutex_lock(&c->sb_lock);
if (__bch2_dev_online(c, &sb)) {
err = "__bch2_dev_online() error";
- mutex_unlock(&c->sb_lock);
goto err;
}
- mutex_unlock(&c->sb_lock);
ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
@@ -1585,6 +1593,12 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
{
mutex_lock(&c->state_lock);
+ if (!bch2_dev_is_online(ca)) {
+ bch_err(ca, "Already offline");
+ mutex_unlock(&c->state_lock);
+ return 0;
+ }
+
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
bch_err(ca, "Cannot offline required disk");
mutex_unlock(&c->state_lock);
@@ -1617,9 +1631,19 @@ int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
goto err;
}
+ ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
+ if (ret) {
+ bch_err(ca, "Migrate failed: error %i flushing journal", ret);
+ goto err;
+ }
+
data = bch2_dev_has_data(c, ca);
if (data) {
- bch_err(ca, "Migrate error: data still present (%x)", data);
+ char buf[100];
+
+ bch2_scnprint_flag_list(buf, sizeof(buf),
+ bch2_data_types, data);
+ bch_err(ca, "Migrate failed, still has data (%s)", buf);
ret = -EINVAL;
goto err;
}
@@ -1670,33 +1694,33 @@ err:
/* Filesystem open: */
-const char *bch2_fs_open(char * const *devices, unsigned nr_devices,
- struct bch_opts opts, struct bch_fs **ret)
+struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
+ struct bch_opts opts)
{
- const char *err;
+ struct bch_sb_handle *sb = NULL;
struct bch_fs *c = NULL;
- struct bch_sb_handle *sb;
unsigned i, best_sb = 0;
+ const char *err;
+ int ret = -ENOMEM;
if (!nr_devices)
- return "need at least one device";
+ return ERR_PTR(-EINVAL);
if (!try_module_get(THIS_MODULE))
- return "module unloading";
+ return ERR_PTR(-ENODEV);
- err = "cannot allocate memory";
sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
if (!sb)
goto err;
for (i = 0; i < nr_devices; i++) {
- err = bch2_read_super(devices[i], opts, &sb[i]);
- if (err)
+ ret = bch2_read_super(devices[i], &opts, &sb[i]);
+ if (ret)
goto err;
err = bch2_sb_validate(&sb[i]);
if (err)
- goto err;
+ goto err_print;
}
for (i = 1; i < nr_devices; i++)
@@ -1707,56 +1731,53 @@ const char *bch2_fs_open(char * const *devices, unsigned nr_devices,
for (i = 0; i < nr_devices; i++) {
err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
if (err)
- goto err;
+ goto err_print;
}
- err = "cannot allocate memory";
+ ret = -ENOMEM;
c = bch2_fs_alloc(sb[best_sb].sb, opts);
if (!c)
goto err;
err = "bch2_dev_online() error";
- mutex_lock(&c->sb_lock);
+ mutex_lock(&c->state_lock);
for (i = 0; i < nr_devices; i++)
if (__bch2_dev_online(c, &sb[i])) {
- mutex_unlock(&c->sb_lock);
- goto err;
+ mutex_unlock(&c->state_lock);
+ goto err_print;
}
- mutex_unlock(&c->sb_lock);
+ mutex_unlock(&c->state_lock);
err = "insufficient devices";
if (!bch2_fs_may_start(c))
- goto err;
+ goto err_print;
if (!c->opts.nostart) {
err = __bch2_fs_start(c);
if (err)
- goto err;
+ goto err_print;
}
err = bch2_fs_online(c);
if (err)
- goto err;
-
- if (ret)
- *ret = c;
- else
- closure_put(&c->cl);
+ goto err_print;
- err = NULL;
-out:
kfree(sb);
module_put(THIS_MODULE);
- if (err)
- c = NULL;
- return err;
+ return c;
+err_print:
+ pr_err("bch_fs_open err opening %s: %s",
+ devices[0], err);
+ ret = -EINVAL;
err:
if (c)
bch2_fs_stop(c);
for (i = 0; i < nr_devices; i++)
bch2_free_super(&sb[i]);
- goto out;
+ kfree(sb);
+ module_put(THIS_MODULE);
+ return ERR_PTR(ret);
}
static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
@@ -1827,9 +1848,8 @@ const char *bch2_fs_open_incremental(const char *path)
struct bch_opts opts = bch2_opts_empty();
const char *err;
- err = bch2_read_super(path, opts, &sb);
- if (err)
- return err;
+ if (bch2_read_super(path, &opts, &sb))
+ return "error reading superblock";
err = __bch2_fs_open_incremental(&sb, opts);
bch2_free_super(&sb);
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index 6f628830..a35ee3db 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -198,8 +198,7 @@ const char *bch2_fs_read_write(struct bch_fs *);
void bch2_fs_stop(struct bch_fs *);
const char *bch2_fs_start(struct bch_fs *);
-const char *bch2_fs_open(char * const *, unsigned, struct bch_opts,
- struct bch_fs **);
+struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
const char *bch2_fs_open_incremental(const char *path);
#endif /* _BCACHEFS_SUPER_H */
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
index f5007864..6a581097 100644
--- a/libbcachefs/tier.c
+++ b/libbcachefs/tier.c
@@ -39,7 +39,8 @@ static int bch2_tiering_thread(void *arg)
struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
struct io_clock *clock = &c->io_clock[WRITE];
struct bch_dev *ca;
- u64 tier_capacity, available_sectors, keys_moved, sectors_moved;
+ struct bch_move_stats move_stats;
+ u64 tier_capacity, available_sectors;
unsigned long last;
unsigned i, nr_devices;
@@ -91,8 +92,7 @@ static int bch2_tiering_thread(void *arg)
0,
-1,
tiering_pred, tier,
- &keys_moved,
- &sectors_moved);
+ &move_stats);
}
return 0;
diff --git a/linux/kthread.c b/linux/kthread.c
index 0f4b5715..80a9ac9a 100644
--- a/linux/kthread.c
+++ b/linux/kthread.c
@@ -64,6 +64,7 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data),
vsnprintf(p->comm, sizeof(p->comm), namefmt, args);
va_end(args);
+ p->flags |= PF_KTHREAD;
p->thread_fn = thread_fn;
p->thread_data = thread_data;
p->state = TASK_UNINTERRUPTIBLE;
@@ -73,6 +74,7 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data),
init_completion(&p->exited);
pthread_create(&p->thread, NULL, kthread_start_fn, p);
+ pthread_setname_np(p->thread, p->comm);
return p;
}
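
Worth noting in passing: the PF_KTHREAD flag set here is what bch2_move_data() checks (via current->flags) before honouring kthread_should_stop(), so with this change threads created through the userspace kthread shim (e.g. the copygc and tiering threads) can be stopped cleanly, as in the kernel; the pthread_setname_np() call additionally makes them identifiable by their kthread name.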