-rw-r--r--  fs/bcachefs/alloc_background.c   | 557
-rw-r--r--  fs/bcachefs/alloc_background.h   |  33
-rw-r--r--  fs/bcachefs/alloc_foreground.c   | 350
-rw-r--r--  fs/bcachefs/alloc_types.h        |  22
-rw-r--r--  fs/bcachefs/bcachefs.h           |  21
-rw-r--r--  fs/bcachefs/btree_gc.c           |  10
-rw-r--r--  fs/bcachefs/buckets.c            |  69
-rw-r--r--  fs/bcachefs/buckets.h            |  62
-rw-r--r--  fs/bcachefs/buckets_types.h      |   2
-rw-r--r--  fs/bcachefs/journal.c            |   2
-rw-r--r--  fs/bcachefs/journal_io.c         |   2
-rw-r--r--  fs/bcachefs/movinggc.c           |  23
-rw-r--r--  fs/bcachefs/recovery.c           |   2
-rw-r--r--  fs/bcachefs/super.c              |  82
-rw-r--r--  fs/bcachefs/sysfs.c              |  43
15 files changed, 340 insertions, 940 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 26568cd15aa3..f8f68aa31ec0 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -27,13 +27,6 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-const char * const bch2_allocator_states[] = {
-#define x(n) #n,
- ALLOC_THREAD_STATES()
-#undef x
- NULL
-};
-
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
BCH_ALLOC_FIELDS_V1()
@@ -369,7 +362,6 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
g->_mark.gen = u.gen;
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
- g->oldest_gen = !gc ? u.oldest_gen : u.gen;
g->gen_valid = 1;
if (!gc ||
@@ -647,491 +639,6 @@ out:
return ret;
}
-/* Background allocator thread: */
-
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
-
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-
-static inline u8 bucket_gc_gen(struct bucket *g)
-{
- return g->mark.gen - g->oldest_gen;
-}
-
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
- struct bucket_mark m)
-{
- u8 gc_gen;
-
- if (!is_available_bucket(m))
- return false;
-
- if (m.owned_by_allocator)
- return false;
-
- if (ca->buckets_nouse &&
- test_bit(b, ca->buckets_nouse))
- return false;
-
- if (ca->new_fs_bucket_idx) {
- /*
- * Device or filesystem is still being initialized, and we
- * haven't fully marked superblocks & journal:
- */
- if (is_superblock_bucket(ca, b))
- return false;
-
- if (b < ca->new_fs_bucket_idx)
- return false;
- }
-
- gc_gen = bucket_gc_gen(bucket(ca, b));
-
- ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
- ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX;
-
- return gc_gen < BUCKET_GC_GEN_MAX;
-}
-
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- */
-
-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
- u64 now, u64 last_seq_ondisk)
-{
- unsigned used = m.cached_sectors;
-
- if (used) {
- /*
- * Prefer to keep buckets that have been read more recently, and
- * buckets that have more data in them:
- */
- u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
- u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
-
- return -last_read_scaled;
- } else {
- /*
- * Prefer to use buckets with smaller gc_gen so that we don't
- * have to walk the btree and recalculate oldest_gen - but shift
- * off the low bits so that buckets will still have equal sort
- * keys when there's only a small difference, so that we can
- * keep sequential buckets together:
- */
- return bucket_gc_gen(g) >> 4;
- }
-}
-
-static inline int bucket_alloc_cmp(alloc_heap *h,
- struct alloc_heap_entry l,
- struct alloc_heap_entry r)
-{
- return cmp_int(l.key, r.key) ?:
- cmp_int(r.nr, l.nr) ?:
- cmp_int(l.bucket, r.bucket);
-}
-
-static inline int bucket_idx_cmp(const void *_l, const void *_r)
-{
- const struct alloc_heap_entry *l = _l, *r = _r;
-
- return cmp_int(l->bucket, r->bucket);
-}
-
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets;
- struct alloc_heap_entry e = { 0 };
- u64 now, last_seq_ondisk;
- size_t b, i, nr = 0;
-
- down_read(&ca->bucket_lock);
-
- buckets = bucket_array(ca);
- ca->alloc_heap.used = 0;
- now = atomic64_read(&c->io_clock[READ].now);
- last_seq_ondisk = c->journal.flushed_seq_ondisk;
-
- /*
- * Find buckets with lowest read priority, by building a maxheap sorted
- * by read priority and repeatedly replacing the maximum element until
- * all buckets have been visited.
- */
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket *g = &buckets->b[b];
- struct bucket_mark m = READ_ONCE(g->mark);
- unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
-
- cond_resched();
-
- if (!bch2_can_invalidate_bucket(ca, b, m))
- continue;
-
- if (!m.data_type &&
- bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- last_seq_ondisk,
- ca->dev_idx, b)) {
- ca->buckets_waiting_on_journal++;
- continue;
- }
-
- if (e.nr && e.bucket + e.nr == b && e.key == key) {
- e.nr++;
- } else {
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
-
- e = (struct alloc_heap_entry) {
- .bucket = b,
- .nr = 1,
- .key = key,
- };
- }
- }
-
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
-
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
-
- while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
- nr -= ca->alloc_heap.data[0].nr;
- heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
- }
-
- up_read(&ca->bucket_lock);
-}
-
-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- size_t i, nr = 0;
-
- ca->inc_gen_needs_gc = 0;
- ca->inc_gen_really_needs_gc = 0;
- ca->buckets_waiting_on_journal = 0;
-
- find_reclaimable_buckets_lru(c, ca);
-
- heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
-
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
-
- return nr;
-}
-
-static int bucket_invalidate_btree(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- struct bkey_alloc_unpacked *u)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- POS(ca->dev_idx, b),
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- *u = bch2_alloc_unpack(k);
- u->gen++;
- u->data_type = 0;
- u->dirty_sectors = 0;
- u->cached_sectors = 0;
- u->read_time = atomic64_read(&c->io_clock[READ].now);
- u->write_time = atomic64_read(&c->io_clock[WRITE].now);
-
- ret = bch2_alloc_write(trans, &iter, u,
- BTREE_TRIGGER_BUCKET_INVALIDATE);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
- u64 *journal_seq, unsigned flags)
-{
- struct bkey_alloc_unpacked u;
- size_t b;
- u64 commit_seq = 0;
- int ret = 0;
-
- /*
- * If the read-only path is trying to shut down, we can't be generating
- * new btree updates:
- */
- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
- return 1;
-
- BUG_ON(!ca->alloc_heap.used ||
- !ca->alloc_heap.data[0].nr);
- b = ca->alloc_heap.data[0].bucket;
-
- /* first, put on free_inc and mark as owned by allocator: */
- percpu_down_read(&c->mark_lock);
-
- bch2_mark_alloc_bucket(c, ca, b, true);
-
- spin_lock(&c->freelist_lock);
- verify_not_on_freelist(c, ca, b);
- BUG_ON(!fifo_push(&ca->free_inc, b));
- spin_unlock(&c->freelist_lock);
-
- percpu_up_read(&c->mark_lock);
-
- ret = bch2_trans_do(c, NULL, &commit_seq,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RESERVED|
- flags,
- bucket_invalidate_btree(&trans, ca, b, &u));
-
- if (!ret) {
- /* remove from alloc_heap: */
- struct alloc_heap_entry e, *top = ca->alloc_heap.data;
-
- top->bucket++;
- top->nr--;
-
- if (!top->nr)
- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
-
- /*
- * If we invalidating cached data then we need to wait on the
- * journal commit:
- */
- if (u.data_type)
- *journal_seq = max(*journal_seq, commit_seq);
-
- /*
- * We already waiting on u.alloc_seq when we filtered out
- * buckets that need journal commit:
- */
- BUG_ON(*journal_seq > u.journal_seq);
- } else {
- size_t b2;
-
- /* remove from free_inc: */
- percpu_down_read(&c->mark_lock);
- spin_lock(&c->freelist_lock);
-
- bch2_mark_alloc_bucket(c, ca, b, false);
-
- BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
- BUG_ON(b != b2);
-
- spin_unlock(&c->freelist_lock);
- percpu_up_read(&c->mark_lock);
- }
-
- return ret < 0 ? ret : 0;
-}
-
-/*
- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
- */
-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- u64 journal_seq = 0;
- int ret = 0;
-
- /* Only use nowait if we've already invalidated at least one bucket: */
- while (!ret &&
- !fifo_full(&ca->free_inc) &&
- ca->alloc_heap.used) {
- if (kthread_should_stop()) {
- ret = 1;
- break;
- }
-
- ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
- (!fifo_empty(&ca->free_inc)
- ? BTREE_INSERT_NOWAIT : 0));
- /*
- * We only want to batch up invalidates when they're going to
- * require flushing the journal:
- */
- if (!journal_seq)
- break;
- }
-
- /* If we used NOWAIT, don't return the error: */
- if (!fifo_empty(&ca->free_inc))
- ret = 0;
- if (ret < 0)
- bch_err(ca, "error invalidating buckets: %i", ret);
- if (ret)
- return ret;
-
- if (journal_seq)
- ret = bch2_journal_flush_seq(&c->journal, journal_seq);
- if (ret) {
- bch_err(ca, "journal error: %i", ret);
- return ret;
- }
-
- return 0;
-}
-
-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
-{
- if (ca->allocator_state != new_state) {
- ca->allocator_state = new_state;
- closure_wake_up(&ca->fs->freelist_wait);
- }
-}
-
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
- unsigned i;
- int ret = 0;
-
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- /*
- * Don't strand buckets on the copygc freelist until
- * after recovery is finished:
- */
- if (i == RESERVE_MOVINGGC &&
- !test_bit(BCH_FS_STARTED, &c->flags))
- continue;
-
- if (fifo_push(&ca->free[i], b)) {
- fifo_pop(&ca->free_inc, b);
- ret = 1;
- break;
- }
- }
- spin_unlock(&c->freelist_lock);
-
- ca->allocator_state = ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked_full;
- closure_wake_up(&c->freelist_wait);
- return ret;
-}
-
-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
- if (!c->opts.nochanges &&
- ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
- ca->mi.bucket_size, GFP_NOFS, 0);
-}
-
-static bool allocator_thread_running(struct bch_dev *ca)
-{
- unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
- ? ALLOCATOR_running
- : ALLOCATOR_stopped;
- alloc_thread_set_state(ca, state);
- return state == ALLOCATOR_running;
-}
-
-static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
-{
- s64 available = dev_buckets_reclaimable(ca) -
- (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
- bool ret = available > 0;
-
- alloc_thread_set_state(ca, ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked);
- return ret;
-}
-
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
-{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
- unsigned long gc_count = c->gc_count;
- size_t nr;
- int ret;
-
- set_freezable();
-
- while (1) {
- ret = kthread_wait_freezable(allocator_thread_running(ca));
- if (ret)
- goto stop;
-
- while (!ca->alloc_heap.used) {
- cond_resched();
-
- ret = kthread_wait_freezable(buckets_available(ca, gc_count));
- if (ret)
- goto stop;
-
- gc_count = c->gc_count;
- nr = find_reclaimable_buckets(c, ca);
-
- if (!nr && ca->buckets_waiting_on_journal) {
- ret = bch2_journal_flush(&c->journal);
- if (ret)
- goto stop;
- } else if (nr < (ca->mi.nbuckets >> 6) &&
- ca->buckets_waiting_on_journal >= nr / 2) {
- bch2_journal_flush_async(&c->journal, NULL);
- }
-
- if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
- ca->inc_gen_really_needs_gc) &&
- c->gc_thread) {
- atomic_inc(&c->kick_gc);
- wake_up_process(c->gc_thread);
- }
-
- trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
- ca->inc_gen_really_needs_gc);
- }
-
- ret = bch2_invalidate_buckets(c, ca);
- if (ret)
- goto stop;
-
- while (!fifo_empty(&ca->free_inc)) {
- u64 b = fifo_peek(&ca->free_inc);
-
- discard_one_bucket(c, ca, b);
-
- ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
- if (ret)
- goto stop;
- }
- }
-stop:
- alloc_thread_set_state(ca, ALLOCATOR_stopped);
- return 0;
-}
-
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity(struct bch_fs *c)
@@ -1140,7 +647,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
- unsigned i, j;
+ unsigned i;
lockdep_assert_held(&c->state_lock);
@@ -1171,8 +678,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
* allocations for foreground writes must wait -
* not -ENOSPC calculations.
*/
- for (j = 0; j < RESERVE_NONE; j++)
- dev_reserve += ca->free[j].size;
+
+ dev_reserve += ca->nr_btree_reserve * 2;
+ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
dev_reserve += 1; /* btree write point */
dev_reserve += 1; /* copygc write point */
@@ -1228,8 +736,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
- BUG_ON(ca->alloc_thread);
-
/* First, remove device from allocation groups: */
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
@@ -1303,61 +809,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
-{
- if (ca->alloc_thread)
- closure_wait_event(&c->freelist_wait,
- ca->allocator_state != ALLOCATOR_running);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- p = rcu_dereference_protected(ca->alloc_thread, 1);
- ca->alloc_thread = NULL;
-
- /*
- * We need an rcu barrier between setting ca->alloc_thread = NULL and
- * the thread shutting down to avoid bch2_wake_allocator() racing:
- *
- * XXX: it would be better to have the rcu barrier be asynchronous
- * instead of blocking us here
- */
- synchronize_rcu();
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- /*
- * allocator thread already started?
- */
- if (ca->alloc_thread)
- return 0;
-
- p = kthread_create(bch2_allocator_thread, ca,
- "bch-alloc/%s", ca->name);
- if (IS_ERR(p)) {
- bch_err(ca->fs, "error creating allocator thread: %li",
- PTR_ERR(p));
- return PTR_ERR(p);
- }
-
- get_task_struct(p);
- rcu_assign_pointer(ca->alloc_thread, p);
- wake_up_process(p);
- return 0;
-}
-
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
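With the allocator thread and its per-reserve freelists removed, bch2_recalc_capacity() no longer sums ca->free[j].size; it derives the per-device reserve from a fixed formula. Below is a minimal standalone sketch of only the terms visible in this hunk; the full function adds further write points and scales the result, and the device numbers used here are hypothetical.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the new per-device reserve calculation in bch2_recalc_capacity():
 * the old per-freelist sizes are replaced by a fixed formula.  Only the terms
 * shown in the hunk above are modeled here.
 */
static uint64_t dev_reserve_buckets(uint64_t nbuckets, uint64_t nr_btree_reserve)
{
        uint64_t dev_reserve = 0;

        dev_reserve += nr_btree_reserve * 2;    /* ca->nr_btree_reserve * 2, as in the hunk */
        dev_reserve += nbuckets >> 6;           /* copygc reserve (~1/64 of the device) */
        dev_reserve += 1;                       /* btree write point */
        dev_reserve += 1;                       /* copygc write point */

        return dev_reserve;
}

int main(void)
{
        /* hypothetical device: ~2M buckets, 512-bucket btree reserve */
        uint64_t nbuckets = 2097152, nr_btree_reserve = 512;

        printf("reserved buckets: %llu\n",
               (unsigned long long)dev_reserve_buckets(nbuckets, nr_btree_reserve));
        return 0;
}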
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index c45a702c9e7f..34e0b1da63bb 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -8,8 +8,6 @@
#include "debug.h"
#include "super.h"
-extern const char * const bch2_allocator_states[];
-
struct bkey_alloc_unpacked {
u64 journal_seq;
u64 bucket;
@@ -141,42 +139,11 @@ int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c,
struct bkey_i *, unsigned);
int bch2_fs_freespace_init(struct bch_fs *);
-static inline void bch2_wake_allocator(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- rcu_read_lock();
- p = rcu_dereference(ca->alloc_thread);
- if (p)
- wake_up_process(p);
- rcu_read_unlock();
-}
-
-static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
- size_t bucket)
-{
- if (bch2_expensive_debug_checks) {
- size_t iter;
- long i;
- unsigned j;
-
- for (j = 0; j < RESERVE_NR; j++)
- fifo_for_each_entry(i, &ca->free[j], iter)
- BUG_ON(i == bucket);
- fifo_for_each_entry(i, &ca->free_inc, iter)
- BUG_ON(i == bucket);
- }
-}
-
void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_stop(struct bch_dev *);
-int bch2_dev_allocator_start(struct bch_dev *);
-
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 9b81ed2665c8..e0dc585b50da 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -14,13 +14,18 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
+#include "error.h"
#include "io.h"
+#include "journal.h"
#include <linux/math64.h>
#include <linux/rculist.h>
@@ -78,7 +83,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
- bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
ob->valid = false;
ob->data_type = 0;
@@ -178,39 +182,28 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
}
}
-/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
- *
- * Returns index of bucket on success, 0 on failure
- * */
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
- enum alloc_reserve reserve,
- bool may_alloc_partial,
- struct closure *cl)
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ struct bkey_alloc_unpacked a,
+ size_t *need_journal_commit,
+ struct closure *cl)
{
struct open_bucket *ob;
- long b = 0;
- spin_lock(&c->freelist_lock);
+ if (unlikely(ca->buckets_nouse && test_bit(a.bucket, ca->buckets_nouse)))
+ return NULL;
- if (may_alloc_partial) {
- int i;
-
- for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
- ob = c->open_buckets + ca->open_buckets_partial[i];
-
- if (reserve <= ob->alloc_reserve) {
- array_remove_item(ca->open_buckets_partial,
- ca->open_buckets_partial_nr,
- i);
- ob->on_partial_list = false;
- ob->alloc_reserve = reserve;
- spin_unlock(&c->freelist_lock);
- return ob;
- }
- }
+ if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket))
+ return NULL;
+
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk, ca->dev_idx, a.bucket)) {
+ (*need_journal_commit)++;
+ return NULL;
}
+ spin_lock(&c->freelist_lock);
+
if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
if (cl)
closure_wait(&c->open_buckets_wait, cl);
@@ -219,36 +212,17 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
c->blocked_allocate_open_bucket = local_clock();
spin_unlock(&c->freelist_lock);
+
trace_open_bucket_alloc_fail(ca, reserve);
return ERR_PTR(-OPEN_BUCKETS_EMPTY);
}
- if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
- goto out;
-
- switch (reserve) {
- case RESERVE_BTREE_MOVINGGC:
- case RESERVE_MOVINGGC:
- if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
- goto out;
- break;
- default:
- break;
+ /* Recheck under lock: */
+ if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket)) {
+ spin_unlock(&c->freelist_lock);
+ return NULL;
}
- if (cl)
- closure_wait(&c->freelist_wait, cl);
-
- if (!c->blocked_allocate)
- c->blocked_allocate = local_clock();
-
- spin_unlock(&c->freelist_lock);
-
- trace_bucket_alloc_fail(ca, reserve);
- return ERR_PTR(-FREELIST_EMPTY);
-out:
- verify_not_on_freelist(c, ca, b);
-
ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
@@ -257,8 +231,8 @@ out:
ob->sectors_free = ca->mi.bucket_size;
ob->alloc_reserve = reserve;
ob->dev = ca->dev_idx;
- ob->gen = *bucket_gen(ca, b);
- ob->bucket = b;
+ ob->gen = a.gen;
+ ob->bucket = a.bucket;
spin_unlock(&ob->lock);
ca->nr_open_buckets++;
@@ -280,12 +254,238 @@ out:
spin_unlock(&c->freelist_lock);
- bch2_wake_allocator(ca);
-
trace_bucket_alloc(ca, reserve);
return ob;
}
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+ enum alloc_reserve reserve, u64 free_entry,
+ size_t *need_journal_commit,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob;
+ struct bkey_alloc_unpacked a;
+ u64 b = free_entry & ~(~0ULL << 56);
+ unsigned genbits = free_entry >> 56;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ goto err;
+ }
+
+ a = bch2_alloc_unpack(k);
+
+ if (bch2_fs_inconsistent_on(bucket_state(a) != BUCKET_free, c,
+ "non free bucket in freespace btree (state %s)\n"
+ " %s\n"
+ " at %llu (genbits %u)",
+ bch2_bucket_states[bucket_state(a)],
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ free_entry, genbits)) {
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ if (bch2_fs_inconsistent_on(genbits != (alloc_freespace_genbits(a) >> 56), c,
+ "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+ " %s",
+ genbits, alloc_freespace_genbits(a) >> 56,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ if (bch2_fs_inconsistent_on(b < ca->mi.first_bucket || b >= ca->mi.nbuckets, c,
+ "freespace btree has bucket outside allowed range (got %llu, valid %u-%llu)",
+ b, ca->mi.first_bucket, ca->mi.nbuckets)) {
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ ob = __try_alloc_bucket(c, ca, reserve, a, need_journal_commit, cl);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ob;
+}
+
+static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve)
+{
+ struct open_bucket *ob;
+ int i;
+
+ spin_lock(&c->freelist_lock);
+
+ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+ ob = c->open_buckets + ca->open_buckets_partial[i];
+
+ if (reserve <= ob->alloc_reserve) {
+ array_remove_item(ca->open_buckets_partial,
+ ca->open_buckets_partial_nr,
+ i);
+ ob->on_partial_list = false;
+ ob->alloc_reserve = reserve;
+ spin_unlock(&c->freelist_lock);
+ return ob;
+ }
+ }
+
+ spin_unlock(&c->freelist_lock);
+ return NULL;
+}
+
+/*
+ * This path is for before the freespace btree is initialized:
+ *
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_trans_early(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ u64 *b,
+ size_t *need_journal_commit,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ int ret;
+
+ *b = max_t(u64, *b, ca->mi.first_bucket);
+ *b = max_t(u64, *b, ca->new_fs_bucket_idx);
+
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *b),
+ BTREE_ITER_SLOTS, k, ret) {
+ struct bkey_alloc_unpacked a;
+
+ if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+ break;
+
+ if (ca->new_fs_bucket_idx &&
+ is_superblock_bucket(ca, k.k->p.offset))
+ continue;
+
+ a = bch2_alloc_unpack(k);
+
+ if (bucket_state(a) != BUCKET_free)
+ continue;
+
+ ob = __try_alloc_bucket(trans->c, ca, reserve, a,
+ need_journal_commit, cl);
+ if (ob)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ *b = iter.pos.offset;
+
+ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY);
+}
+
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ u64 *b,
+ size_t *need_journal_commit,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ int ret;
+
+ if (unlikely(!ca->mi.freespace_initialized))
+ return bch2_bucket_alloc_trans_early(trans, ca, reserve, b,
+ need_journal_commit, cl);
+
+ BUG_ON(ca->new_fs_bucket_idx);
+
+ for_each_btree_key(trans, iter, BTREE_ID_freespace,
+ POS(ca->dev_idx, *b), 0, k, ret) {
+ if (k.k->p.inode != ca->dev_idx)
+ break;
+
+ for (*b = max(*b, bkey_start_offset(k.k));
+ *b != k.k->p.offset && !ob;
+ (*b)++) {
+ if (btree_trans_too_many_iters(trans)) {
+ ob = ERR_PTR(-EINTR);
+ break;
+ }
+
+ ob = try_alloc_bucket(trans, ca, reserve, *b,
+ need_journal_commit, cl);
+ }
+ if (ob)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY);
+}
+
+/**
+ * bch_bucket_alloc - allocate a single bucket from a specific device
+ *
+ * Returns index of bucket on success, 0 on failure
+ * */
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ bool may_alloc_partial,
+ struct closure *cl)
+{
+ struct open_bucket *ob = NULL;
+ size_t need_journal_commit = 0;
+ u64 avail = dev_buckets_available(ca, reserve);
+ u64 b = 0;
+ int ret;
+
+ if (may_alloc_partial) {
+ ob = try_alloc_partial_bucket(c, ca, reserve);
+ if (ob)
+ return ob;
+ }
+again:
+ if (!avail) {
+ if (cl) {
+ closure_wait(&c->freelist_wait, cl);
+ /* recheck after putting ourself on waitlist */
+ avail = dev_buckets_available(ca, reserve);
+ if (avail) {
+ closure_wake_up(&c->freelist_wait);
+ goto again;
+ }
+ }
+
+ if (!c->blocked_allocate)
+ c->blocked_allocate = local_clock();
+
+ trace_bucket_alloc_fail(ca, reserve);
+ return ERR_PTR(-FREELIST_EMPTY);
+ }
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans,
+ ca, reserve, &b,
+ &need_journal_commit, cl)));
+
+ if (need_journal_commit * 2 > avail)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY);
+}
+
static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
unsigned l, unsigned r)
{
@@ -313,7 +513,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
struct dev_stripe_state *stripe)
{
u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = dev_buckets_available(ca);
+ u64 free_space = dev_buckets_available(ca, RESERVE_NONE);
u64 free_space_inv = free_space
? div64_u64(1ULL << 48, free_space)
: 1ULL << 48;
@@ -364,6 +564,7 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
{
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+ unsigned dev;
struct bch_dev *ca;
int ret = -INSUFFICIENT_DEVICES;
unsigned i;
@@ -373,30 +574,43 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
for (i = 0; i < devs_sorted.nr; i++) {
struct open_bucket *ob;
- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+ dev = devs_sorted.devs[i];
+
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
if (!ca)
continue;
- if (!ca->mi.durability && *have_cache)
+ if (!ca->mi.durability && *have_cache) {
+ percpu_ref_put(&ca->ref);
continue;
+ }
ob = bch2_bucket_alloc(c, ca, reserve,
flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
+ if (!IS_ERR(ob))
+ bch2_dev_stripe_increment(ca, stripe);
+ percpu_ref_put(&ca->ref);
+
if (IS_ERR(ob)) {
ret = PTR_ERR(ob);
if (cl)
- return ret;
+ break;
continue;
}
add_new_bucket(c, ptrs, devs_may_alloc,
nr_effective, have_cache, flags, ob);
- bch2_dev_stripe_increment(ca, stripe);
-
- if (*nr_effective >= nr_replicas)
- return 0;
+ if (*nr_effective >= nr_replicas) {
+ ret = 0;
+ break;
+ }
}
return ret;
@@ -564,9 +778,6 @@ static int open_bucket_add_buckets(struct bch_fs *c,
if (*nr_effective >= nr_replicas)
return 0;
- percpu_down_read(&c->mark_lock);
- rcu_read_lock();
-
retry_blocking:
/*
* Try nonblocking first, so that if one device is full we'll try from
@@ -580,9 +791,6 @@ retry_blocking:
goto retry_blocking;
}
- rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
-
return ret;
}
@@ -863,7 +1071,7 @@ err:
case -INSUFFICIENT_DEVICES:
return ERR_PTR(-EROFS);
default:
- BUG();
+ return ERR_PTR(ret);
}
}
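The replacement allocation path in this file walks the freespace btree instead of popping a per-device freelist; try_alloc_bucket() decodes each freespace entry as a 56-bit bucket number with 8 "genbits" in the top bits, and cross-checks those genbits against the bucket's alloc key. A minimal sketch of that encoding follows; the pack helper and the macro name are illustrative, not the kernel's API.

#include <stdint.h>
#include <stdio.h>

/* Low 56 bits: bucket number; top 8 bits: genbits, as in try_alloc_bucket(). */
#define FREE_ENTRY_BUCKET_BITS  56

static uint64_t pack_free_entry(uint64_t bucket, unsigned genbits)
{
        return ((uint64_t)genbits << FREE_ENTRY_BUCKET_BITS) |
               (bucket & ~(~0ULL << FREE_ENTRY_BUCKET_BITS));
}

static void unpack_free_entry(uint64_t free_entry, uint64_t *bucket, unsigned *genbits)
{
        *bucket  = free_entry & ~(~0ULL << FREE_ENTRY_BUCKET_BITS);
        *genbits = free_entry >> FREE_ENTRY_BUCKET_BITS;
}

int main(void)
{
        uint64_t bucket;
        unsigned genbits;

        unpack_free_entry(pack_free_entry(123456, 7), &bucket, &genbits);
        printf("bucket %llu genbits %u\n", (unsigned long long)bucket, genbits);
        return 0;
}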
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 409232e3d998..22e1fbda9046 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -10,18 +10,6 @@
struct ec_bucket_buf;
-#define ALLOC_THREAD_STATES() \
- x(stopped) \
- x(running) \
- x(blocked) \
- x(blocked_full)
-
-enum allocator_states {
-#define x(n) ALLOCATOR_##n,
- ALLOC_THREAD_STATES()
-#undef x
-};
-
enum alloc_reserve {
RESERVE_BTREE_MOVINGGC = -2,
RESERVE_BTREE = -1,
@@ -30,8 +18,6 @@ enum alloc_reserve {
RESERVE_NR = 2,
};
-typedef FIFO(long) alloc_fifo;
-
#define OPEN_BUCKETS_COUNT 1024
#define WRITE_POINT_HASH_NR 32
@@ -94,12 +80,4 @@ struct write_point_specifier {
unsigned long v;
};
-struct alloc_heap_entry {
- size_t bucket;
- size_t nr;
- unsigned long key;
-};
-
-typedef HEAP(struct alloc_heap_entry) alloc_heap;
-
#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 7350fb6a8355..c82a9e1aab8d 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -462,34 +462,17 @@ struct bch_dev {
/* Allocator: */
u64 new_fs_bucket_idx;
- struct task_struct __rcu *alloc_thread;
- /*
- * free: Buckets that are ready to be used
- *
- * free_inc: Incoming buckets - these are buckets that currently have
- * cached data in them, and we can't reuse them until after we write
- * their new gen to disk. After prio_write() finishes writing the new
- * gens/prios, they'll be moved to the free list (and possibly discarded
- * in the process)
- */
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
unsigned nr_open_buckets;
+ unsigned nr_btree_reserve;
open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
open_bucket_idx_t open_buckets_partial_nr;
- size_t fifo_last_bucket;
-
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
size_t buckets_waiting_on_journal;
- enum allocator_states allocator_state;
-
- alloc_heap alloc_heap;
-
atomic64_t rebalance_work;
struct journal_device journal;
@@ -511,8 +494,6 @@ struct bch_dev {
enum {
/* startup: */
BCH_FS_ALLOC_CLEAN,
- BCH_FS_ALLOCATOR_RUNNING,
- BCH_FS_ALLOCATOR_STOPPING,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_INITIAL_GC_UNFIXED,
BCH_FS_TOPOLOGY_REPAIR_DONE,
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index a8c566fd12bb..0bab695bcb41 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1673,9 +1673,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
*/
int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
{
- struct bch_dev *ca;
u64 start_time = local_clock();
- unsigned i, iter = 0;
+ unsigned iter = 0;
int ret;
lockdep_assert_held(&c->state_lock);
@@ -1777,13 +1776,6 @@ out:
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
/*
- * Wake up allocator in case it was waiting for buckets
- * because of not being able to inc gens
- */
- for_each_member_device(ca, c, i)
- bch2_wake_allocator(ca);
-
- /*
* At startup, allocations can happen directly instead of via the
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index b5178d3067a9..22d8d185a414 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -292,11 +292,6 @@ static inline int bucket_sectors_fragmented(struct bch_dev *ca,
: 0;
}
-static inline int is_stripe_data_bucket(struct bucket_mark m)
-{
- return m.stripe && m.data_type != BCH_DATA_parity;
-}
-
static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
return m.cached_sectors && !m.dirty_sectors
@@ -347,9 +342,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
preempt_enable();
-
- if (!is_available_bucket(old) && is_available_bucket(new))
- bch2_wake_allocator(ca);
}
static inline int __update_replicas(struct bch_fs *c,
@@ -484,19 +476,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
update_replicas_list(trans, &r.e, sectors);
}
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, bool owned_by_allocator)
-{
- struct bucket *g = bucket(ca, b);
- struct bucket_mark old, new;
-
- old = bucket_cmpxchg(g, new, ({
- new.owned_by_allocator = owned_by_allocator;
- }));
-
- BUG_ON(owned_by_allocator == old.owned_by_allocator);
-}
-
int bch2_mark_alloc(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
@@ -555,6 +534,10 @@ int bch2_mark_alloc(struct btree_trans *trans,
}
}
+ if (!new_u.data_type &&
+ (!new_u.journal_seq || new_u.journal_seq < c->journal.flushed_seq_ondisk))
+ closure_wake_up(&c->freelist_wait);
+
if (bucket_state(new_u) == BUCKET_need_gc_gens) {
atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread);
@@ -578,7 +561,6 @@ int bch2_mark_alloc(struct btree_trans *trans,
g->io_time[READ] = new_u.read_time;
g->io_time[WRITE] = new_u.write_time;
- g->oldest_gen = new_u.oldest_gen;
g->gen_valid = 1;
g->stripe = new_u.stripe;
g->stripe_redundancy = new_u.stripe_redundancy;
@@ -2069,24 +2051,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
struct bucket_array *buckets = NULL, *old_buckets = NULL;
struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
unsigned long *buckets_nouse = NULL;
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
- alloc_heap alloc_heap;
-
- size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
- ca->mi.bucket_size / btree_sectors(c));
- /* XXX: these should be tunable */
- size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
- size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6);
- size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
- btree_reserve * 2);
bool resize = ca->buckets[0] != NULL;
int ret = -ENOMEM;
- unsigned i;
-
- memset(&free, 0, sizeof(free));
- memset(&free_inc, 0, sizeof(free_inc));
- memset(&alloc_heap, 0, sizeof(alloc_heap));
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
nbuckets * sizeof(struct bucket),
@@ -2096,12 +2062,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
(c->opts.buckets_nouse &&
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO))) ||
- !init_fifo(&free[RESERVE_MOVINGGC],
- copygc_reserve, GFP_KERNEL) ||
- !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
- !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
- !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
+ GFP_KERNEL|__GFP_ZERO))))
goto err;
buckets->first_bucket = ca->mi.first_bucket;
@@ -2147,18 +2108,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
up_write(&c->gc_lock);
}
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- fifo_move(&free[i], &ca->free[i]);
- swap(ca->free[i], free[i]);
- }
- fifo_move(&free_inc, &ca->free_inc);
- swap(ca->free_inc, free_inc);
- spin_unlock(&c->freelist_lock);
-
- /* with gc lock held, alloc_heap can't be in use: */
- swap(ca->alloc_heap, alloc_heap);
-
nbuckets = ca->mi.nbuckets;
if (resize)
@@ -2166,10 +2115,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
ret = 0;
err:
- free_heap(&alloc_heap);
- free_fifo(&free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&free[i]);
kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
if (bucket_gens)
@@ -2184,10 +2129,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
{
unsigned i;
- free_heap(&ca->alloc_heap);
- free_fifo(&ca->free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&ca->free[i]);
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
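With no allocator thread to wake, bch2_mark_alloc() now wakes c->freelist_wait itself: a bucket becomes allocatable once it holds no data and any journal entry that touched it has already been flushed (journal_write_done() issues the matching wakeup when the flush lands). A standalone sketch of that condition; the struct here is a simplified stand-in for bkey_alloc_unpacked.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the unpacked alloc key. */
struct alloc_state {
        uint8_t  data_type;     /* 0 means the bucket holds no data */
        uint64_t journal_seq;   /* last journal seq touching the bucket, 0 if none */
};

/*
 * Mirrors the check added to bch2_mark_alloc(): wake allocators waiting on
 * c->freelist_wait once the bucket is empty and its journal entry, if any,
 * is already flushed to disk.
 */
static bool bucket_now_allocatable(struct alloc_state new, uint64_t flushed_seq_ondisk)
{
        return !new.data_type &&
               (!new.journal_seq || new.journal_seq < flushed_seq_ondisk);
}

int main(void)
{
        struct alloc_state a = { .data_type = 0, .journal_seq = 100 };

        printf("flushed 90:  %d\n", bucket_now_allocatable(a, 90));     /* 0: still waiting */
        printf("flushed 150: %d\n", bucket_now_allocatable(a, 150));    /* 1: wake waiters */
        return 0;
}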
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 07fe5cddbb41..a05d8adc8372 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -58,11 +58,6 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, true);
}
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
-{
- return __bucket(ca, b, false);
-}
-
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
{
return rcu_dereference_check(ca->bucket_gens,
@@ -143,50 +138,50 @@ static inline bool is_available_bucket(struct bucket_mark mark)
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
- struct bch_dev_usage stats)
+ struct bch_dev_usage stats,
+ enum alloc_reserve reserve)
{
- u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+ s64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+ s64 reserved = 0;
+
+ switch (reserve) {
+ case RESERVE_NONE:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case RESERVE_MOVINGGC:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case RESERVE_BTREE:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case RESERVE_BTREE_MOVINGGC:
+ break;
+ default:
+ BUG();
+ }
if (WARN_ONCE(stats.buckets_unavailable > total,
"buckets_unavailable overflow (%llu > %llu)\n",
stats.buckets_unavailable, total))
return 0;
- return total - stats.buckets_unavailable;
-}
-
-static inline u64 dev_buckets_available(struct bch_dev *ca)
-{
- return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
+ return max_t(s64, 0,
+ total -
+ stats.buckets_unavailable -
+ ca->nr_open_buckets -
+ reserved);
}
-static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
- struct bch_dev_usage stats)
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+ enum alloc_reserve reserve)
{
- struct bch_fs *c = ca->fs;
- s64 available = __dev_buckets_available(ca, stats);
- unsigned i;
-
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++)
- available -= fifo_used(&ca->free[i]);
- available -= fifo_used(&ca->free_inc);
- available -= ca->nr_open_buckets;
- spin_unlock(&c->freelist_lock);
-
- return max(available, 0LL);
-}
-
-static inline u64 dev_buckets_reclaimable(struct bch_dev *ca)
-{
- return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca));
+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve);
}
/* Filesystem usage: */
static inline unsigned fs_usage_u64s(struct bch_fs *c)
{
-
return sizeof(struct bch_fs_usage) / sizeof(u64) +
READ_ONCE(c->replicas.nr);
}
@@ -214,7 +209,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *);
-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
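Reserve enforcement moves from the freelists into __dev_buckets_available(): each less-privileged reserve level subtracts the reserves stacked above it through the fallthrough switch. The compilable model below mirrors that calculation; it omits buckets_unavailable and nr_open_buckets, which the real helper also subtracts, and the device sizes are hypothetical. The two middle enum values follow from RESERVE_NR = 2 in alloc_types.h.

#include <stdint.h>
#include <stdio.h>

enum alloc_reserve {
        RESERVE_BTREE_MOVINGGC  = -2,
        RESERVE_BTREE           = -1,
        RESERVE_MOVINGGC        = 0,
        RESERVE_NONE            = 1,
};

/*
 * Model of __dev_buckets_available(): reserves accumulate top-down, so
 * RESERVE_NONE sees the fewest buckets and RESERVE_BTREE_MOVINGGC sees
 * them all.
 */
static int64_t buckets_available(int64_t total, int64_t nr_btree_reserve,
                                 enum alloc_reserve reserve)
{
        int64_t reserved = 0;

        switch (reserve) {
        case RESERVE_NONE:
                reserved += total >> 6;         /* copygc reserve */
                /* fallthrough */
        case RESERVE_MOVINGGC:
                reserved += nr_btree_reserve;
                /* fallthrough */
        case RESERVE_BTREE:
                reserved += nr_btree_reserve;
                /* fallthrough */
        case RESERVE_BTREE_MOVINGGC:
                break;
        }

        return total - reserved > 0 ? total - reserved : 0;
}

int main(void)
{
        int64_t total = 2097152, btree_reserve = 512;   /* hypothetical device */

        printf("RESERVE_NONE:           %lld\n",
               (long long)buckets_available(total, btree_reserve, RESERVE_NONE));
        printf("RESERVE_BTREE_MOVINGGC: %lld\n",
               (long long)buckets_available(total, btree_reserve, RESERVE_BTREE_MOVINGGC));
        return 0;
}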
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 2c73dc60b838..2280aee59964 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -14,7 +14,6 @@ struct bucket_mark {
struct {
u8 gen;
u8 data_type:3,
- owned_by_allocator:1,
stripe:1;
u16 dirty_sectors;
u16 cached_sectors;
@@ -29,7 +28,6 @@ struct bucket {
};
u64 io_time[2];
- u8 oldest_gen;
unsigned gen_valid:1;
u8 stripe_redundancy;
u32 stripe;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 4e920ce12865..340f0bed7391 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -800,10 +800,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
break;
}
} else {
- rcu_read_lock();
ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_NONE,
false, cl);
- rcu_read_unlock();
if (IS_ERR(ob[nr_got])) {
ret = cl ? -EAGAIN : -ENOSPC;
break;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index fb24ca212b09..2099044c7083 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1372,6 +1372,8 @@ static void journal_write_done(struct closure *cl)
if (!JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
+
+ closure_wake_up(&c->freelist_wait);
}
} else if (!j->err_seq || seq < j->err_seq)
j->err_seq = seq;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index c82ecff3efe2..0fb60d8581a7 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -119,18 +119,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
return DATA_SKIP;
}
-static bool have_copygc_reserve(struct bch_dev *ca)
-{
- bool ret;
-
- spin_lock(&ca->fs->freelist_lock);
- ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
- ca->allocator_state != ALLOCATOR_running;
- spin_unlock(&ca->fs->freelist_lock);
-
- return ret;
-}
-
static inline int fragmentation_cmp(copygc_heap *heap,
struct copygc_heap_entry l,
struct copygc_heap_entry r)
@@ -262,11 +250,10 @@ static int bch2_copygc(struct bch_fs *c)
}
for_each_rw_member(ca, c, dev_idx) {
- closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
+ s64 avail = min(dev_buckets_available(ca, RESERVE_MOVINGGC),
+ ca->mi.nbuckets >> 6);
- spin_lock(&ca->fs->freelist_lock);
- sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
- spin_unlock(&ca->fs->freelist_lock);
+ sectors_reserved += avail * ca->mi.bucket_size;
}
ret = walk_buckets_to_copygc(c);
@@ -367,8 +354,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
for_each_rw_member(ca, c, dev_idx) {
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
- fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) *
- ca->mi.bucket_size) >> 1);
+ fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_NONE) *
+ ca->mi.bucket_size) >> 1);
fragmented = usage.d[BCH_DATA_user].fragmented;
wait = min(wait, max(0LL, fragmented_allowed - fragmented));
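Copygc no longer blocks waiting for the allocator to fill free[RESERVE_MOVINGGC]; it sizes sectors_reserved directly from the buckets available at RESERVE_MOVINGGC, capped at nbuckets/64. A small sketch of that sizing with hypothetical numbers:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the per-device term added to sectors_reserved in bch2_copygc(). */
static uint64_t copygc_sectors_reserved(uint64_t buckets_available,
                                        uint64_t nbuckets,
                                        uint64_t bucket_size_sectors)
{
        uint64_t cap   = nbuckets >> 6;
        uint64_t avail = buckets_available < cap ? buckets_available : cap;

        return avail * bucket_size_sectors;
}

int main(void)
{
        /* hypothetical: 100k buckets available, 2M-bucket device, 1024-sector buckets */
        printf("%llu sectors reserved for copygc\n",
               (unsigned long long)copygc_sectors_reserved(100000, 2097152, 1024));
        return 0;
}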
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 4d01a01ea5c5..b7e735d7774f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1373,6 +1373,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* Write out the superblock and journal buckets, now that we can do
* btree updates
*/
+ bch_verbose(c, "marking superblocks");
err = "error marking superblock and journal";
for_each_member_device(ca, c, i) {
ret = bch2_trans_mark_dev_sb(c, ca);
@@ -1384,6 +1385,7 @@ int bch2_fs_initialize(struct bch_fs *c)
ca->new_fs_bucket_idx = 0;
}
+ bch_verbose(c, "initializing freespace");
err = "error initializing freespace";
ret = bch2_fs_freespace_init(c);
if (ret)
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 019cbf32d40e..5857f057497b 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -199,17 +199,9 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/
bch2_journal_flush_all_pins(&c->journal);
- /*
- * If the allocator threads didn't all start up, the btree updates to
- * write out alloc info aren't going to work:
- */
- if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
- goto nowrote_alloc;
-
bch_verbose(c, "flushing journal and stopping allocators");
bch2_journal_flush_all_pins(&c->journal);
- set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
do {
clean_passes++;
@@ -234,17 +226,11 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch_verbose(c, "flushing journal and stopping allocators complete");
set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
-nowrote_alloc:
+
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
flush_work(&c->btree_interior_update_work);
- for_each_member_device(ca, c, i)
- bch2_dev_allocator_stop(ca);
-
- clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
- clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
-
bch2_fs_journal_stop(&c->journal);
/*
@@ -280,10 +266,6 @@ void bch2_fs_read_only(struct bch_fs *c)
/*
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
- *
- * (This is really blocking new _allocations_, writes to previously
- * allocated space can still happen until stopping the allocator in
- * bch2_dev_allocator_stop()).
*/
percpu_ref_kill(&c->writes);
@@ -412,20 +394,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- for_each_rw_member(ca, c, i) {
- ret = bch2_dev_allocator_start(ca);
- if (ret) {
- bch_err(c, "error starting allocator threads");
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
- }
-
- set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
-
- for_each_rw_member(ca, c, i)
- bch2_wake_allocator(ca);
-
if (!early) {
ret = bch2_fs_read_write_late(c);
if (ret)
@@ -941,20 +909,6 @@ int bch2_fs_start(struct bch_fs *c)
set_bit(BCH_FS_STARTED, &c->flags);
- /*
- * Allocator threads don't start filling copygc reserve until after we
- * set BCH_FS_STARTED - wake them now:
- *
- * XXX ugly hack:
- * Need to set ca->allocator_state here instead of relying on the
- * allocator threads to do it to avoid racing with the copygc threads
- * checking it and thinking they have no alloc reserve:
- */
- for_each_online_member(ca, c, i) {
- ca->allocator_state = ALLOCATOR_running;
- bch2_wake_allocator(ca);
- }
-
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
@@ -1046,8 +1000,6 @@ static void bch2_dev_release(struct kobject *kobj)
static void bch2_dev_free(struct bch_dev *ca)
{
- bch2_dev_allocator_stop(ca);
-
cancel_work_sync(&ca->io_error_work);
if (ca->kobj.state_in_sysfs &&
@@ -1162,6 +1114,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
+ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+ ca->mi.bucket_size / btree_sectors(c));
+
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
@@ -1211,12 +1166,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->fs = c;
- if (ca->mi.state == BCH_MEMBER_STATE_rw &&
- bch2_dev_allocator_start(ca)) {
- bch2_dev_free(ca);
- goto err;
- }
-
bch2_dev_attach(c, ca, dev_idx);
out:
pr_verbose_init(c->opts, "ret %i", ret);
@@ -1402,14 +1351,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
/*
* The allocator thread itself allocates btree nodes, so stop it first:
*/
- bch2_dev_allocator_stop(ca);
bch2_dev_allocator_remove(c, ca);
bch2_dev_journal_stop(&c->journal, ca);
bch2_copygc_start(c);
}
-static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
@@ -1417,8 +1365,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
-
- return bch2_dev_allocator_start(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1445,7 +1391,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
mutex_unlock(&c->sb_lock);
if (new_state == BCH_MEMBER_STATE_rw)
- ret = __bch2_dev_read_write(c, ca);
+ __bch2_dev_read_write(c, ca);
rebalance_wakeup(c);
@@ -1707,13 +1653,8 @@ have_slot:
ca->new_fs_bucket_idx = 0;
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret) {
- bch_err(c, "device add error: error going RW on new device: %i", ret);
- goto err_late;
- }
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
return 0;
@@ -1773,11 +1714,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
goto err;
}
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret)
- goto err;
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
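The btree reserve that used to be carried by the freelists is now a per-device count, ca->nr_btree_reserve, set in __bch2_dev_alloc() to enough buckets to hold BTREE_NODE_RESERVE btree nodes. A worked sketch of that arithmetic; the constant's value and the sizes below are hypothetical stand-ins.

#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        uint64_t btree_node_reserve = 512;      /* stand-in for BTREE_NODE_RESERVE */
        uint64_t bucket_size        = 1024;     /* bucket size, sectors (hypothetical) */
        uint64_t btree_node_sectors = 512;      /* btree node size, sectors (hypothetical) */

        /* nr_btree_reserve = buckets needed to hold the btree node reserve */
        uint64_t nr_btree_reserve =
                DIV_ROUND_UP(btree_node_reserve, bucket_size / btree_node_sectors);

        printf("nr_btree_reserve = %llu buckets\n",
               (unsigned long long)nr_btree_reserve);
        return 0;
}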
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 3d6ece515a88..1b5ed7adc261 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -170,7 +170,6 @@ read_attribute(congested);
read_attribute(btree_avg_write_size);
-read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
@@ -185,7 +184,6 @@ read_attribute(internal_uuid);
read_attribute(has_data);
read_attribute(alloc_debug);
-write_attribute(wake_allocator);
read_attribute(read_realloc_races);
read_attribute(extent_migrate_done);
@@ -698,24 +696,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
-static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- enum alloc_reserve i;
-
- spin_lock(&ca->fs->freelist_lock);
-
- pr_buf(out, "free_inc:\t%zu\t%zu\n",
- fifo_used(&ca->free_inc),
- ca->free_inc.size);
-
- for (i = 0; i < RESERVE_NR; i++)
- pr_buf(out, "free[%u]:\t%zu\t%zu\n", i,
- fifo_used(&ca->free[i]),
- ca->free[i].size);
-
- spin_unlock(&ca->fs->freelist_lock);
-}
-
static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
@@ -741,9 +721,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
"ec\t%16llu\n"
"available%15llu\n"
"\n"
- "free_inc\t\t%zu/%zu\n"
- "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
- "free[RESERVE_NONE]\t%zu/%zu\n"
"freelist_wait\t\t%s\n"
"open buckets allocated\t%u\n"
"open buckets this dev\t%u\n"
@@ -751,13 +728,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
"open_buckets_wait\t%s\n"
"open_buckets_btree\t%u\n"
"open_buckets_user\t%u\n"
- "btree reserve cache\t%u\n"
- "thread state:\t\t%s\n",
+ "btree reserve cache\t%u\n",
stats.buckets_ec,
- __dev_buckets_available(ca, stats),
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
- fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
+ __dev_buckets_available(ca, stats, RESERVE_NONE),
c->freelist_wait.list.first ? "waiting" : "empty",
OPEN_BUCKETS_COUNT - c->open_buckets_nr_free,
ca->nr_open_buckets,
@@ -765,8 +738,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
c->open_buckets_wait.list.first ? "waiting" : "empty",
nr[BCH_DATA_btree],
nr[BCH_DATA_user],
- c->btree_reserve_cache_nr,
- bch2_allocator_states[ca->allocator_state]);
+ c->btree_reserve_cache_nr);
}
static const char * const bch2_rw[] = {
@@ -841,9 +813,6 @@ SHOW(bch2_dev)
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
- if (attr == &sysfs_reserve_stats)
- reserve_stats_to_text(out, ca);
-
if (attr == &sysfs_alloc_debug)
dev_alloc_debug_to_text(out, ca);
@@ -883,9 +852,6 @@ STORE(bch2_dev)
return ret;
}
- if (attr == &sysfs_wake_allocator)
- bch2_wake_allocator(ca);
-
return size;
}
SYSFS_OPS(bch2_dev);
@@ -911,11 +877,8 @@ struct attribute *bch2_dev_files[] = {
&sysfs_io_latency_stats_write,
&sysfs_congested,
- &sysfs_reserve_stats,
-
/* debug: */
&sysfs_alloc_debug,
- &sysfs_wake_allocator,
NULL
};