author     Kent Overstreet <kent.overstreet@gmail.com>   2018-05-02 15:40:05 -0400
committer  Kent Overstreet <kent.overstreet@gmail.com>   2018-05-02 15:54:22 -0400
commit     2df3d2e77e81e403211b056ecc8908af259a3d92 (patch)
tree       3af2f74d2ffb853963490ceb8b29cef00aabcc3b
parent     92fb4f056560452c3b6302b33481e8fe8c2638cf (diff)
bcachefs: Split out journal_reclaim.c
-rw-r--r--  fs/bcachefs/Makefile                    1
-rw-r--r--  fs/bcachefs/btree_io.c                  1
-rw-r--r--  fs/bcachefs/btree_update_interior.c     1
-rw-r--r--  fs/bcachefs/btree_update_leaf.c         1
-rw-r--r--  fs/bcachefs/journal.c                 416
-rw-r--r--  fs/bcachefs/journal.h                  34
-rw-r--r--  fs/bcachefs/journal_reclaim.c         411
-rw-r--r--  fs/bcachefs/journal_reclaim.h          36
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c     1
-rw-r--r--  fs/bcachefs/move.c                      5
-rw-r--r--  fs/bcachefs/super.c                     3
11 files changed, 470 insertions, 440 deletions
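
This patch mechanically moves the journal pinning and reclaim machinery out of journal.c into new journal_reclaim.c / journal_reclaim.h files, renaming the entry points that other files now need (journal_reclaim_fast becomes bch2_journal_reclaim_fast, journal_reclaim_work becomes bch2_journal_reclaim_work, and bch2_journal_flush_device becomes bch2_journal_flush_device_pins). For orientation, here is a hedged sketch of how a client of the moved pinning interface (declared in journal_reclaim.h below) looks; the example_* names are invented for illustration and are not in the tree, and the flush callback signature is inferred from the pin->flush(j, pin, seq) calls visible in this diff:

/*
 * Illustrative sketch, not part of the patch: a subsystem embeds a
 * journal_entry_pin, registers a flush callback when it dirties a
 * journal sequence number, and drops the pin once its data is safely
 * written, which lets reclaim advance past that entry.
 */
#include "journal_reclaim.h"

struct example_update {
	struct journal_entry_pin	journal_pin;
};

/* invoked by reclaim (or an explicit flush) when this entry must go out */
static void example_flush(struct journal *j,
			  struct journal_entry_pin *pin, u64 seq)
{
	struct example_update *u =
		container_of(pin, struct example_update, journal_pin);

	/* ... write u's data out, then release the pin: ... */
	bch2_journal_pin_drop(j, pin);
}

static void example_mark_dirty(struct journal *j, u64 seq,
			       struct example_update *u)
{
	/* hold journal entry @seq dirty until example_flush() runs */
	bch2_journal_pin_add(j, seq, &u->journal_pin, example_flush);
}
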
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index fea5f7b0a24d..f4a395b699a6 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -30,6 +30,7 @@ bcachefs-y := \
inode.o \
io.o \
journal.o \
+ journal_reclaim.o \
journal_seq_blacklist.o \
keylist.o \
lz4_decompress.o \
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 27b86dcd636f..b407c5db5498 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -13,6 +13,7 @@
#include "error.h"
#include "extents.h"
#include "io.h"
+#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 63696920ba9a..adba309204ed 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -12,6 +12,7 @@
#include "buckets.h"
#include "extents.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include "replicas.h"
#include "super-io.h"
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 7935226f55fb..92fb5f61ff14 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -8,6 +8,7 @@
#include "debug.h"
#include "extents.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include <linux/sort.h>
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 837228d9d59c..7474f6ab291d 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -17,6 +17,7 @@
#include "io.h"
#include "keylist.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "super-io.h"
@@ -25,31 +26,6 @@
#include <trace/events/bcachefs.h>
static void journal_write(struct closure *);
-static void journal_reclaim_fast(struct journal *);
-
-static inline void journal_wake(struct journal *j)
-{
- wake_up(&j->wait);
- closure_wake_up(&j->async_wait);
-}
-
-static inline u64 journal_pin_seq(struct journal *j,
- struct journal_entry_pin_list *pin_list)
-{
- return fifo_entry_idx_abs(&j->pin, pin_list);
-}
-
-u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
-{
- u64 ret = 0;
-
- spin_lock(&j->lock);
- if (journal_pin_active(pin))
- ret = journal_pin_seq(j, pin->pin_list);
- spin_unlock(&j->lock);
-
- return ret;
-}
static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type,
enum btree_id id)
@@ -989,7 +965,7 @@ static enum {
c->opts.block_size;
BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
- journal_reclaim_fast(j);
+ bch2_journal_reclaim_fast(j);
/* XXX: why set this here, and not in journal_write()? */
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
@@ -1462,341 +1438,6 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
/* Journalling */
/**
- * journal_reclaim_fast - do the fast part of journal reclaim
- *
- * Called from IO submission context, does not block. Cleans up after btree
- * write completions by advancing the journal pin and each cache's last_idx,
- * kicking off discards and background reclaim as necessary.
- */
-static void journal_reclaim_fast(struct journal *j)
-{
- struct journal_entry_pin_list temp;
- bool popped = false;
-
- lockdep_assert_held(&j->lock);
-
- /*
- * Unpin journal entries whose reference counts reached zero, meaning
- * all btree nodes got written out
- */
- while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
- BUG_ON(!fifo_pop(&j->pin, temp));
- popped = true;
- }
-
- if (popped)
- journal_wake(j);
-}
-
-/*
- * Journal entry pinning - machinery for holding a reference on a given journal
- * entry, marking it as dirty:
- */
-
-static inline void __journal_pin_add(struct journal *j,
- struct journal_entry_pin_list *pin_list,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- BUG_ON(journal_pin_active(pin));
- BUG_ON(!atomic_read(&pin_list->count));
-
- atomic_inc(&pin_list->count);
- pin->pin_list = pin_list;
- pin->flush = flush_fn;
-
- if (flush_fn)
- list_add(&pin->list, &pin_list->list);
- else
- INIT_LIST_HEAD(&pin->list);
-
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- journal_wake(j);
-}
-
-void bch2_journal_pin_add(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
- __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
- spin_unlock(&j->lock);
-}
-
-static inline void __journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- struct journal_entry_pin_list *pin_list = pin->pin_list;
-
- if (!journal_pin_active(pin))
- return;
-
- pin->pin_list = NULL;
- list_del_init(&pin->list);
-
- /*
- * Unpinning a journal entry may make journal_next_bucket() succeed, if
- * writing a new last_seq will now make another bucket available:
- */
- if (atomic_dec_and_test(&pin_list->count) &&
- pin_list == &fifo_peek_front(&j->pin))
- journal_reclaim_fast(j);
-}
-
-void bch2_journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- spin_lock(&j->lock);
- __journal_pin_drop(j, pin);
- spin_unlock(&j->lock);
-}
-
-void bch2_journal_pin_add_if_older(struct journal *j,
- struct journal_entry_pin *src_pin,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
-
- if (journal_pin_active(src_pin) &&
- (!journal_pin_active(pin) ||
- journal_pin_seq(j, src_pin->pin_list) <
- journal_pin_seq(j, pin->pin_list))) {
- __journal_pin_drop(j, pin);
- __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
- }
-
- spin_unlock(&j->lock);
-}
-
-static struct journal_entry_pin *
-__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *ret;
- u64 iter;
-
- /* no need to iterate over empty fifo entries: */
- journal_reclaim_fast(j);
-
- fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
- if (iter > seq_to_flush)
- break;
-
- ret = list_first_entry_or_null(&pin_list->list,
- struct journal_entry_pin, list);
- if (ret) {
- /* must be list_del_init(), see bch2_journal_pin_drop() */
- list_move(&ret->list, &pin_list->flushed);
- *seq = iter;
- return ret;
- }
- }
-
- return NULL;
-}
-
-static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
-{
- struct journal_entry_pin *ret;
-
- spin_lock(&j->lock);
- ret = __journal_get_next_pin(j, seq_to_flush, seq);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-static int journal_flush_done(struct journal *j, u64 seq_to_flush,
- struct journal_entry_pin **pin,
- u64 *pin_seq)
-{
- int ret;
-
- *pin = NULL;
-
- ret = bch2_journal_error(j);
- if (ret)
- return ret;
-
- spin_lock(&j->lock);
- /*
- * If journal replay hasn't completed, the unreplayed journal entries
- * hold refs on their corresponding sequence numbers
- */
- ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
- !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
- journal_last_seq(j) > seq_to_flush ||
- (fifo_used(&j->pin) == 1 &&
- atomic_read(&fifo_peek_front(&j->pin).count) == 1);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_entry_pin *pin;
- u64 pin_seq;
- bool flush;
-
- if (!test_bit(JOURNAL_STARTED, &j->flags))
- return 0;
-again:
- wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
- if (pin) {
- /* flushing a journal pin might cause a new one to be added: */
- pin->flush(j, pin, pin_seq);
- goto again;
- }
-
- spin_lock(&j->lock);
- flush = journal_last_seq(j) != j->last_seq_ondisk ||
- (seq_to_flush == U64_MAX && c->btree_roots_dirty);
- spin_unlock(&j->lock);
-
- return flush ? bch2_journal_meta(j) : 0;
-}
-
-int bch2_journal_flush_all_pins(struct journal *j)
-{
- return bch2_journal_flush_pins(j, U64_MAX);
-}
-
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
- bool ret;
-
- spin_lock(&j->lock);
- ret = ja->nr &&
- (ja->last_idx != ja->cur_idx &&
- ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/**
- * journal_reclaim_work - free up journal buckets
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-static void journal_reclaim_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(to_delayed_work(work),
- struct bch_fs, journal.reclaim_work);
- struct journal *j = &c->journal;
- struct bch_dev *ca;
- struct journal_entry_pin *pin;
- u64 seq, seq_to_flush = 0;
- unsigned iter, bucket_to_flush;
- unsigned long next_flush;
- bool reclaim_lock_held = false, need_flush;
-
- /*
- * Advance last_idx to point to the oldest journal entry containing
- * btree node updates that have not yet been written out
- */
- for_each_rw_member(ca, c, iter) {
- struct journal_device *ja = &ca->journal;
-
- if (!ja->nr)
- continue;
-
- while (should_discard_bucket(j, ja)) {
- if (!reclaim_lock_held) {
- /*
- * ugh:
- * might be called from __journal_res_get()
- * under wait_event() - have to go back to
- * TASK_RUNNING before doing something that
- * would block, but only if we're doing work:
- */
- __set_current_state(TASK_RUNNING);
-
- mutex_lock(&j->reclaim_lock);
- reclaim_lock_held = true;
- /* recheck under reclaim_lock: */
- continue;
- }
-
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca,
- ja->buckets[ja->last_idx]),
- ca->mi.bucket_size, GFP_NOIO, 0);
-
- spin_lock(&j->lock);
- ja->last_idx = (ja->last_idx + 1) % ja->nr;
- spin_unlock(&j->lock);
-
- journal_wake(j);
- }
-
- /*
- * Write out enough btree nodes to free up 50% journal
- * buckets
- */
- spin_lock(&j->lock);
- bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
- seq_to_flush = max_t(u64, seq_to_flush,
- ja->bucket_seq[bucket_to_flush]);
- spin_unlock(&j->lock);
- }
-
- if (reclaim_lock_held)
- mutex_unlock(&j->reclaim_lock);
-
- /* Also flush if the pin fifo is more than half full */
- spin_lock(&j->lock);
- seq_to_flush = max_t(s64, seq_to_flush,
- (s64) journal_cur_seq(j) -
- (j->pin.size >> 1));
- spin_unlock(&j->lock);
-
- /*
- * If it's been longer than j->reclaim_delay_ms since we last flushed,
- * make sure to flush at least one journal pin:
- */
- next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
- need_flush = time_after(jiffies, next_flush);
-
- while ((pin = journal_get_next_pin(j, need_flush
- ? U64_MAX
- : seq_to_flush, &seq))) {
- __set_current_state(TASK_RUNNING);
- pin->flush(j, pin, seq);
- need_flush = false;
-
- j->last_flushed = jiffies;
- }
-
- if (!test_bit(BCH_FS_RO, &c->flags))
- queue_delayed_work(system_freezable_wq, &j->reclaim_work,
- msecs_to_jiffies(j->reclaim_delay_ms));
-}
-
-/**
* journal_next_bucket - move on to the next journal bucket if possible
*/
static int journal_write_alloc(struct journal *j, struct journal_buf *w,
@@ -1980,8 +1621,8 @@ out:
journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs = devs;
/*
- * Updating last_seq_ondisk may let journal_reclaim_work() discard more
- * buckets:
+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+ * more buckets:
*
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
@@ -2271,7 +1912,7 @@ retry:
* Direct reclaim - can't rely on reclaim from work item
* due to freezing..
*/
- journal_reclaim_work(&j->reclaim_work.work);
+ bch2_journal_reclaim_work(&j->reclaim_work.work);
trace_journal_full(c);
blocked:
@@ -2333,7 +1974,7 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *pare
spin_unlock(&j->lock);
if (!ret)
- journal_reclaim_work(&j->reclaim_work.work);
+ bch2_journal_reclaim_work(&j->reclaim_work.work);
return ret;
}
@@ -2556,49 +2197,6 @@ int bch2_journal_flush(struct journal *j)
return bch2_journal_flush_seq(j, seq);
}
-int bch2_journal_flush_device(struct journal *j, int dev_idx)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_entry_pin_list *p;
- struct bch_devs_list devs;
- u64 iter, seq = 0;
- int ret = 0;
-
- spin_lock(&j->lock);
- fifo_for_each_entry_ptr(p, &j->pin, iter)
- if (dev_idx >= 0
- ? bch2_dev_list_has_dev(p->devs, dev_idx)
- : p->devs.nr < c->opts.metadata_replicas)
- seq = iter;
- spin_unlock(&j->lock);
-
- ret = bch2_journal_flush_pins(j, seq);
- if (ret)
- return ret;
-
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
-
- seq = 0;
-
- spin_lock(&j->lock);
- while (!ret && seq < j->pin.back) {
- seq = max(seq, journal_last_seq(j));
- devs = journal_seq_pin(j, seq)->devs;
- seq++;
-
- spin_unlock(&j->lock);
- ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
- spin_lock(&j->lock);
- }
- spin_unlock(&j->lock);
-
- bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
-
- return ret;
-}
-
/* startup/shutdown: */
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
@@ -2693,7 +2291,7 @@ int bch2_fs_journal_init(struct journal *j)
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
+ INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
mutex_init(&j->blacklist_lock);
INIT_LIST_HEAD(&j->seq_blacklist);
mutex_init(&j->reclaim_lock);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 26bb787937fb..0e511a064ec3 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -114,6 +114,12 @@
struct bch_fs;
+static inline void journal_wake(struct journal *j)
+{
+ wake_up(&j->wait);
+ closure_wake_up(&j->async_wait);
+}
+
static inline struct journal_buf *journal_cur_buf(struct journal *j)
{
return j->buf + j->reservations.idx;
@@ -171,33 +177,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
vstruct_for_each_safe(entry, k, _n)
-#define JOURNAL_PIN (32 * 1024)
-
-static inline bool journal_pin_active(struct journal_entry_pin *pin)
-{
- return pin->pin_list != NULL;
-}
-
-static inline struct journal_entry_pin_list *
-journal_seq_pin(struct journal *j, u64 seq)
-{
- BUG_ON(seq < j->pin.front || seq >= j->pin.back);
-
- return &j->pin.data[seq & j->pin.mask];
-}
-
-u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
-
-void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
- journal_pin_flush_fn);
-void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-void bch2_journal_pin_add_if_older(struct journal *,
- struct journal_entry_pin *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
-int bch2_journal_flush_pins(struct journal *, u64);
-int bch2_journal_flush_all_pins(struct journal *);
-
struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
enum btree_id, unsigned *);
@@ -402,7 +381,6 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *);
-int bch2_journal_flush_device(struct journal *, int);
void bch2_journal_halt(struct journal *);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
new file mode 100644
index 000000000000..3d6d751cfaff
--- /dev/null
+++ b/fs/bcachefs/journal_reclaim.c
@@ -0,0 +1,411 @@
+
+#include "bcachefs.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "super.h"
+
+/*
+ * Journal entry pinning - machinery for holding a reference on a given journal
+ * entry, marking it as dirty:
+ */
+
+static inline u64 journal_pin_seq(struct journal *j,
+ struct journal_entry_pin_list *pin_list)
+{
+ return fifo_entry_idx_abs(&j->pin, pin_list);
+}
+
+u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
+{
+ u64 ret = 0;
+
+ spin_lock(&j->lock);
+ if (journal_pin_active(pin))
+ ret = journal_pin_seq(j, pin->pin_list);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+static inline void __journal_pin_add(struct journal *j,
+ struct journal_entry_pin_list *pin_list,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ BUG_ON(journal_pin_active(pin));
+ BUG_ON(!atomic_read(&pin_list->count));
+
+ atomic_inc(&pin_list->count);
+ pin->pin_list = pin_list;
+ pin->flush = flush_fn;
+
+ if (flush_fn)
+ list_add(&pin->list, &pin_list->list);
+ else
+ INIT_LIST_HEAD(&pin->list);
+
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ journal_wake(j);
+}
+
+void bch2_journal_pin_add(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock(&j->lock);
+ __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
+ spin_unlock(&j->lock);
+}
+
+static inline void __journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ struct journal_entry_pin_list *pin_list = pin->pin_list;
+
+ if (!journal_pin_active(pin))
+ return;
+
+ pin->pin_list = NULL;
+ list_del_init(&pin->list);
+
+ /*
+ * Unpinning a journal entry may make journal_next_bucket() succeed, if
+ * writing a new last_seq will now make another bucket available:
+ */
+ if (atomic_dec_and_test(&pin_list->count) &&
+ pin_list == &fifo_peek_front(&j->pin))
+ bch2_journal_reclaim_fast(j);
+}
+
+void bch2_journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ spin_lock(&j->lock);
+ __journal_pin_drop(j, pin);
+ spin_unlock(&j->lock);
+}
+
+void bch2_journal_pin_add_if_older(struct journal *j,
+ struct journal_entry_pin *src_pin,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock(&j->lock);
+
+ if (journal_pin_active(src_pin) &&
+ (!journal_pin_active(pin) ||
+ journal_pin_seq(j, src_pin->pin_list) <
+ journal_pin_seq(j, pin->pin_list))) {
+ __journal_pin_drop(j, pin);
+ __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
+ }
+
+ spin_unlock(&j->lock);
+}
+
+/*
+ * Journal reclaim: flush references to open journal entries to reclaim space in
+ * the journal
+ *
+ * May be done by the journal code in the background as needed to free up space
+ * for more journal entries, or as part of doing a clean shutdown, or to migrate
+ * data off of a specific device:
+ */
+
+/**
+ * bch2_journal_reclaim_fast - do the fast part of journal reclaim
+ *
+ * Called from IO submission context, does not block. Cleans up after btree
+ * write completions by advancing the journal pin and each cache's last_idx,
+ * kicking off discards and background reclaim as necessary.
+ */
+void bch2_journal_reclaim_fast(struct journal *j)
+{
+ struct journal_entry_pin_list temp;
+ bool popped = false;
+
+ lockdep_assert_held(&j->lock);
+
+ /*
+ * Unpin journal entries whose reference counts reached zero, meaning
+ * all btree nodes got written out
+ */
+ while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+ BUG_ON(!fifo_pop(&j->pin, temp));
+ popped = true;
+ }
+
+ if (popped)
+ journal_wake(j);
+}
+
+static struct journal_entry_pin *
+__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *ret;
+ u64 iter;
+
+ /* no need to iterate over empty fifo entries: */
+ bch2_journal_reclaim_fast(j);
+
+ fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
+ if (iter > seq_to_flush)
+ break;
+
+ ret = list_first_entry_or_null(&pin_list->list,
+ struct journal_entry_pin, list);
+ if (ret) {
+ /* must be list_del_init(), see bch2_journal_pin_drop() */
+ list_move(&ret->list, &pin_list->flushed);
+ *seq = iter;
+ return ret;
+ }
+ }
+
+ return NULL;
+}
+
+static struct journal_entry_pin *
+journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+{
+ struct journal_entry_pin *ret;
+
+ spin_lock(&j->lock);
+ ret = __journal_get_next_pin(j, seq_to_flush, seq);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = ja->nr &&
+ (ja->last_idx != ja->cur_idx &&
+ ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/**
+ * bch2_journal_reclaim_work - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+void bch2_journal_reclaim_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(to_delayed_work(work),
+ struct bch_fs, journal.reclaim_work);
+ struct journal *j = &c->journal;
+ struct bch_dev *ca;
+ struct journal_entry_pin *pin;
+ u64 seq, seq_to_flush = 0;
+ unsigned iter, bucket_to_flush;
+ unsigned long next_flush;
+ bool reclaim_lock_held = false, need_flush;
+
+ /*
+ * Advance last_idx to point to the oldest journal entry containing
+ * btree node updates that have not yet been written out
+ */
+ for_each_rw_member(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+
+ if (!ja->nr)
+ continue;
+
+ while (should_discard_bucket(j, ja)) {
+ if (!reclaim_lock_held) {
+ /*
+ * ugh:
+ * might be called from __journal_res_get()
+ * under wait_event() - have to go back to
+ * TASK_RUNNING before doing something that
+ * would block, but only if we're doing work:
+ */
+ __set_current_state(TASK_RUNNING);
+
+ mutex_lock(&j->reclaim_lock);
+ reclaim_lock_held = true;
+ /* recheck under reclaim_lock: */
+ continue;
+ }
+
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca,
+ ja->buckets[ja->last_idx]),
+ ca->mi.bucket_size, GFP_NOIO, 0);
+
+ spin_lock(&j->lock);
+ ja->last_idx = (ja->last_idx + 1) % ja->nr;
+ spin_unlock(&j->lock);
+
+ journal_wake(j);
+ }
+
+ /*
+ * Write out enough btree nodes to free up 50% journal
+ * buckets
+ */
+ spin_lock(&j->lock);
+ bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
+ seq_to_flush = max_t(u64, seq_to_flush,
+ ja->bucket_seq[bucket_to_flush]);
+ spin_unlock(&j->lock);
+ }
+
+ if (reclaim_lock_held)
+ mutex_unlock(&j->reclaim_lock);
+
+ /* Also flush if the pin fifo is more than half full */
+ spin_lock(&j->lock);
+ seq_to_flush = max_t(s64, seq_to_flush,
+ (s64) journal_cur_seq(j) -
+ (j->pin.size >> 1));
+ spin_unlock(&j->lock);
+
+ /*
+ * If it's been longer than j->reclaim_delay_ms since we last flushed,
+ * make sure to flush at least one journal pin:
+ */
+ next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
+ need_flush = time_after(jiffies, next_flush);
+
+ while ((pin = journal_get_next_pin(j, need_flush
+ ? U64_MAX
+ : seq_to_flush, &seq))) {
+ __set_current_state(TASK_RUNNING);
+ pin->flush(j, pin, seq);
+ need_flush = false;
+
+ j->last_flushed = jiffies;
+ }
+
+ if (!test_bit(BCH_FS_RO, &c->flags))
+ queue_delayed_work(system_freezable_wq, &j->reclaim_work,
+ msecs_to_jiffies(j->reclaim_delay_ms));
+}
+
+static int journal_flush_done(struct journal *j, u64 seq_to_flush,
+ struct journal_entry_pin **pin,
+ u64 *pin_seq)
+{
+ int ret;
+
+ *pin = NULL;
+
+ ret = bch2_journal_error(j);
+ if (ret)
+ return ret;
+
+ spin_lock(&j->lock);
+ /*
+ * If journal replay hasn't completed, the unreplayed journal entries
+ * hold refs on their corresponding sequence numbers
+ */
+ ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
+ !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+ journal_last_seq(j) > seq_to_flush ||
+ (fifo_used(&j->pin) == 1 &&
+ atomic_read(&fifo_peek_front(&j->pin).count) == 1);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin *pin;
+ u64 pin_seq;
+ bool flush;
+
+ if (!test_bit(JOURNAL_STARTED, &j->flags))
+ return 0;
+again:
+ wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
+ if (pin) {
+ /* flushing a journal pin might cause a new one to be added: */
+ pin->flush(j, pin, pin_seq);
+ goto again;
+ }
+
+ spin_lock(&j->lock);
+ flush = journal_last_seq(j) != j->last_seq_ondisk ||
+ (seq_to_flush == U64_MAX && c->btree_roots_dirty);
+ spin_unlock(&j->lock);
+
+ return flush ? bch2_journal_meta(j) : 0;
+}
+
+int bch2_journal_flush_all_pins(struct journal *j)
+{
+ return bch2_journal_flush_pins(j, U64_MAX);
+}
+
+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin_list *p;
+ struct bch_devs_list devs;
+ u64 iter, seq = 0;
+ int ret = 0;
+
+ spin_lock(&j->lock);
+ fifo_for_each_entry_ptr(p, &j->pin, iter)
+ if (dev_idx >= 0
+ ? bch2_dev_list_has_dev(p->devs, dev_idx)
+ : p->devs.nr < c->opts.metadata_replicas)
+ seq = iter;
+ spin_unlock(&j->lock);
+
+ ret = bch2_journal_flush_pins(j, seq);
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+
+ seq = 0;
+
+ spin_lock(&j->lock);
+ while (!ret && seq < j->pin.back) {
+ seq = max(seq, journal_last_seq(j));
+ devs = journal_seq_pin(j, seq)->devs;
+ seq++;
+
+ spin_unlock(&j->lock);
+ ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
+ spin_lock(&j->lock);
+ }
+ spin_unlock(&j->lock);
+
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+}
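
The dev_idx convention in bch2_journal_flush_device_pins() above: a non-negative index flushes every pinned journal entry with a copy on that device, while -1 flushes entries that have fewer than the configured metadata_replicas copies. A minimal sketch of both call patterns, mirroring the move.c and super.c hunks further down; the example_* wrappers are illustrative only:

/* Illustrative only; mirrors the callers updated later in this patch. */
static int example_drain_journal_from_dev(struct bch_fs *c, struct bch_dev *ca)
{
	/* flush pins referencing this device so it can be removed: */
	return bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
}

static int example_rereplicate_journal(struct bch_fs *c)
{
	/*
	 * dev_idx < 0: flush any entry with fewer than the configured
	 * number of metadata replicas:
	 */
	return bch2_journal_flush_device_pins(&c->journal, -1);
}
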
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
new file mode 100644
index 000000000000..7d460c35cfae
--- /dev/null
+++ b/fs/bcachefs/journal_reclaim.h
@@ -0,0 +1,36 @@
+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
+#define _BCACHEFS_JOURNAL_RECLAIM_H
+
+#define JOURNAL_PIN (32 * 1024)
+
+static inline bool journal_pin_active(struct journal_entry_pin *pin)
+{
+ return pin->pin_list != NULL;
+}
+
+static inline struct journal_entry_pin_list *
+journal_seq_pin(struct journal *j, u64 seq)
+{
+ BUG_ON(seq < j->pin.front || seq >= j->pin.back);
+
+ return &j->pin.data[seq & j->pin.mask];
+}
+
+u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
+
+void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
+ journal_pin_flush_fn);
+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
+void bch2_journal_pin_add_if_older(struct journal *,
+ struct journal_entry_pin *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
+
+void bch2_journal_reclaim_fast(struct journal *);
+void bch2_journal_reclaim_work(struct work_struct *);
+
+int bch2_journal_flush_pins(struct journal *, u64);
+int bch2_journal_flush_all_pins(struct journal *);
+int bch2_journal_flush_device_pins(struct journal *, int);
+
+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
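
For context, the reclaim entry point declared here is invoked two ways in this patch: journal.c schedules bch2_journal_reclaim_work() as a periodic delayed work item while the filesystem is read-write, and also calls it directly when the journal is full and cannot rely on the work item running (e.g. under freezing). A hedged sketch of that wiring, condensed from the journal.c hunks above:

/* Condensed from journal.c in this patch; illustrative, not complete. */
static void example_wire_up_reclaim(struct journal *j)
{
	/* background path: periodic reclaim while the fs is read-write */
	INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
	queue_delayed_work(system_freezable_wq, &j->reclaim_work,
			   msecs_to_jiffies(j->reclaim_delay_ms));
}

static void example_direct_reclaim(struct journal *j)
{
	/* direct path: can't rely on the work item due to freezing */
	bch2_journal_reclaim_work(&j->reclaim_work.work);
}
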
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index ba697763daff..8441520e3662 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -4,6 +4,7 @@
#include "btree_update_interior.h"
#include "error.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
/*
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index efab1f8408e4..0431fb81078f 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -5,6 +5,7 @@
#include "buckets.h"
#include "inode.h"
#include "io.h"
+#include "journal_reclaim.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
@@ -724,7 +725,7 @@ int bch2_data_job(struct bch_fs *c,
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
stats->data_type = BCH_DATA_JOURNAL;
- ret = bch2_journal_flush_device(&c->journal, -1);
+ ret = bch2_journal_flush_device_pins(&c->journal, -1);
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;
@@ -741,7 +742,7 @@ int bch2_data_job(struct bch_fs *c,
return -EINVAL;
stats->data_type = BCH_DATA_JOURNAL;
- ret = bch2_journal_flush_device(&c->journal, op.migrate.dev);
+ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 05910c404aec..01ca2ba95954 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -26,6 +26,7 @@
#include "inode.h"
#include "io.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "migrate.h"
@@ -1421,7 +1422,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
goto err;
}
- ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
if (ret) {
bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;