Diffstat (limited to 'libbcachefs/journal.c')
-rw-r--r--  libbcachefs/journal.c | 246
1 file changed, 144 insertions(+), 102 deletions(-)
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 0fc680b4..9e290618 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -138,7 +138,7 @@ static inline void bch2_journal_add_prios(struct journal *j,
}
static void journal_seq_blacklist_flush(struct journal *j,
- struct journal_entry_pin *pin)
+ struct journal_entry_pin *pin, u64 seq)
{
struct bch_fs *c =
container_of(j, struct bch_fs, journal);
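The hunk above changes a pin-flush callback to take the seq of the journal entry being flushed; presumably the journal_pin_flush_fn typedef (declared in the journal header, not shown in this diff) changes to match. A sketch:

	/* assumed new callback type -- every flush callback gains a seq arg: */
	typedef void (*journal_pin_flush_fn)(struct journal *j,
					     struct journal_entry_pin *pin, u64 seq);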
@@ -406,7 +406,8 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
break;
list_del(&i->list);
- kfree(i);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
}
list_for_each_entry_reverse(i, jlist->head, list) {
@@ -429,7 +430,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
where = jlist->head;
add:
- i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i) {
ret = -ENOMEM;
goto out;
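kvpmalloc()/kvpfree() are bcachefs helpers defined outside this file; the caller passes the allocation size back at free time, so no size needs to be stored in the allocation itself. A minimal sketch of the assumed semantics -- kmalloc for sub-page sizes, page allocator with a vmalloc fallback above that:

	/* sketch only -- the real helpers live in the utility headers: */
	static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
	{
		return size < PAGE_SIZE
			? kmalloc(size, gfp_mask)
			: (void *) __get_free_pages(gfp_mask, get_order(size))
			?: __vmalloc(size, gfp_mask, PAGE_KERNEL);
	}

	static inline void kvpfree(void *p, size_t size)
	{
		if (size < PAGE_SIZE)
			kfree(p);
		else if (is_vmalloc_addr(p))
			vfree(p);
		else
			free_pages((unsigned long) p, get_order(size));
	}

This is why the kfree()/kvfree() calls elsewhere in this file become kvpfree() with an explicit size.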
@@ -646,12 +647,16 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
{
void *n;
+ /* the bios are sized for this many pages, max: */
+ if (new_size > JOURNAL_ENTRY_SIZE_MAX)
+ return -ENOMEM;
+
new_size = roundup_pow_of_two(new_size);
- n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size));
+ n = kvpmalloc(new_size, GFP_KERNEL);
if (!n)
return -ENOMEM;
- free_pages((unsigned long) b->data, get_order(b->size));
+ kvpfree(b->data, b->size);
b->data = n;
b->size = new_size;
return 0;
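The new size check means an entry larger than JOURNAL_ENTRY_SIZE_MAX now fails the read rather than overrunning the fixed-size bios. A hypothetical sketch of how the read loop is expected to use this helper (the actual caller is elsewhere in this file):

	/* hypothetical caller: grow the buffer and reread when an entry
	 * turns out to be bigger than what we allocated so far */
	struct jset *jset = buf.data;

	if (vstruct_bytes(jset) > buf.size) {
		ret = journal_read_buf_realloc(&buf, vstruct_bytes(jset));
		if (ret)
			return ret;	/* -ENOMEM: over JOURNAL_ENTRY_SIZE_MAX */
		goto reread;
	}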
@@ -894,7 +899,7 @@ search_done:
!read_bucket(i))
break;
out:
- free_pages((unsigned long) buf.data, get_order(buf.size));
+ kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
err:
@@ -912,7 +917,8 @@ void bch2_journal_entries_free(struct list_head *list)
struct journal_replay *i =
list_first_entry(list, struct journal_replay, list);
list_del(&i->list);
- kvfree(i);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
}
}
@@ -958,14 +964,14 @@ static inline bool journal_has_keys(struct list_head *list)
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
+ struct journal *j = &c->journal;
struct jset_entry *prio_ptrs;
struct journal_list jlist;
struct journal_replay *i;
- struct jset *j;
struct journal_entry_pin_list *p;
struct bch_dev *ca;
u64 cur_seq, end_seq;
- unsigned iter;
+ unsigned iter, keys = 0, entries = 0;
int ret = 0;
closure_init_stack(&jlist.cl);
@@ -994,63 +1000,59 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
fsck_err_on(c->sb.clean && journal_has_keys(list), c,
"filesystem marked clean but journal has keys to replay");
- j = &list_entry(list->prev, struct journal_replay, list)->j;
+ i = list_last_entry(list, struct journal_replay, list);
- unfixable_fsck_err_on(le64_to_cpu(j->seq) -
- le64_to_cpu(j->last_seq) + 1 >
- c->journal.pin.size, c,
+ unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
+ le64_to_cpu(i->j.last_seq) + 1 > j->pin.size, c,
"too many journal entries open for refcount fifo");
- c->journal.pin.back = le64_to_cpu(j->seq) -
- le64_to_cpu(j->last_seq) + 1;
+ atomic64_set(&j->seq, le64_to_cpu(i->j.seq));
+ j->last_seq_ondisk = le64_to_cpu(i->j.last_seq);
- atomic64_set(&c->journal.seq, le64_to_cpu(j->seq));
- c->journal.last_seq_ondisk = le64_to_cpu(j->last_seq);
+ j->pin.front = le64_to_cpu(i->j.last_seq);
+ j->pin.back = le64_to_cpu(i->j.seq) + 1;
- BUG_ON(last_seq(&c->journal) != le64_to_cpu(j->last_seq));
-
- i = list_first_entry(list, struct journal_replay, list);
-
- mutex_lock(&c->journal.blacklist_lock);
-
- fifo_for_each_entry_ptr(p, &c->journal.pin, iter) {
- u64 seq = journal_pin_seq(&c->journal, p);
+ BUG_ON(last_seq(j) != le64_to_cpu(i->j.last_seq));
+ BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
+ &fifo_peek_back(&j->pin));
+ fifo_for_each_entry_ptr(p, &j->pin, iter) {
INIT_LIST_HEAD(&p->list);
+ atomic_set(&p->count, 0);
+ }
- if (i && le64_to_cpu(i->j.seq) == seq) {
- atomic_set(&p->count, 1);
+ mutex_lock(&j->blacklist_lock);
- if (journal_seq_blacklist_read(&c->journal, i, p)) {
- mutex_unlock(&c->journal.blacklist_lock);
- return -ENOMEM;
- }
+ list_for_each_entry(i, list, list) {
+ p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
- i = list_is_last(&i->list, list)
- ? NULL
- : list_next_entry(i, list);
- } else {
- atomic_set(&p->count, 0);
+ atomic_set(&p->count, 1);
+
+ if (journal_seq_blacklist_read(j, i, p)) {
+ mutex_unlock(&j->blacklist_lock);
+ return -ENOMEM;
}
}
- mutex_unlock(&c->journal.blacklist_lock);
+ mutex_unlock(&j->blacklist_lock);
- cur_seq = last_seq(&c->journal);
+ cur_seq = last_seq(j);
end_seq = le64_to_cpu(list_last_entry(list,
struct journal_replay, list)->j.seq);
list_for_each_entry(i, list, list) {
+ struct jset_entry *entry;
+ struct bkey_i *k, *_n;
bool blacklisted;
- mutex_lock(&c->journal.blacklist_lock);
+ mutex_lock(&j->blacklist_lock);
while (cur_seq < le64_to_cpu(i->j.seq) &&
- journal_seq_blacklist_find(&c->journal, cur_seq))
+ journal_seq_blacklist_find(j, cur_seq))
cur_seq++;
- blacklisted = journal_seq_blacklist_find(&c->journal,
+ blacklisted = journal_seq_blacklist_find(j,
le64_to_cpu(i->j.seq));
- mutex_unlock(&c->journal.blacklist_lock);
+ mutex_unlock(&j->blacklist_lock);
fsck_err_on(blacklisted, c,
"found blacklisted journal entry %llu",
@@ -1059,17 +1061,25 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
"journal entries %llu-%llu missing! (replaying %llu-%llu)",
cur_seq, le64_to_cpu(i->j.seq) - 1,
- last_seq(&c->journal), end_seq);
+ last_seq(j), end_seq);
cur_seq = le64_to_cpu(i->j.seq) + 1;
+
+ for_each_jset_key(k, _n, entry, &i->j)
+ keys++;
+ entries++;
}
- prio_ptrs = bch2_journal_find_entry(j, JOURNAL_ENTRY_PRIO_PTRS, 0);
+ bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
+ keys, entries, (u64) atomic64_read(&j->seq));
+
+ i = list_last_entry(list, struct journal_replay, list);
+ prio_ptrs = bch2_journal_find_entry(&i->j, JOURNAL_ENTRY_PRIO_PTRS, 0);
if (prio_ptrs) {
- memcpy_u64s(c->journal.prio_buckets,
+ memcpy_u64s(j->prio_buckets,
prio_ptrs->_data,
le16_to_cpu(prio_ptrs->u64s));
- c->journal.nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
+ j->nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
}
fsck_err:
return ret;
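Note how the read path above now sets j->pin.front/back straight from last_seq and seq + 1: the pin FIFO is indexed by sequence number directly, and journal_seq_pin() is presumably the trivial lookup. A sketch, assuming the usual power-of-two FIFO mask:

	/* assumed definition -- consistent with the BUG_ON()s above: */
	static inline struct journal_entry_pin_list *
	journal_seq_pin(struct journal *j, u64 seq)
	{
		return &j->pin.data[seq & j->pin.mask];
	}

This removes the old dance of walking the pin FIFO and the replay list in lockstep.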
@@ -1105,6 +1115,9 @@ static bool journal_entry_is_open(struct journal *j)
void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *w = journal_prev_buf(j);
+
+ atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
@@ -1120,8 +1133,7 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
#endif
}
-static struct journal_entry_pin_list *
-__journal_entry_new(struct journal *j, int count)
+static void __journal_entry_new(struct journal *j, int count)
{
struct journal_entry_pin_list *p = fifo_push_ref(&j->pin);
@@ -1131,25 +1143,18 @@ __journal_entry_new(struct journal *j, int count)
*/
atomic64_inc(&j->seq);
- BUG_ON(journal_pin_seq(j, p) != atomic64_read(&j->seq));
+ BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
+ &fifo_peek_back(&j->pin));
INIT_LIST_HEAD(&p->list);
atomic_set(&p->count, count);
-
- return p;
}
static void __bch2_journal_next_entry(struct journal *j)
{
- struct journal_entry_pin_list *p;
struct journal_buf *buf;
- p = __journal_entry_new(j, 1);
-
- if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) {
- smp_wmb();
- j->cur_pin_list = p;
- }
+ __journal_entry_new(j, 1);
buf = journal_cur_buf(j);
memset(buf->has_inode, 0, sizeof(buf->has_inode));
@@ -1181,6 +1186,8 @@ static enum {
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
+ lockdep_assert_held(&j->lock);
+
do {
old.v = new.v = v;
if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
@@ -1221,7 +1228,6 @@ static enum {
BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
- atomic_dec_bug(&fifo_peek_back(&j->pin).count);
__bch2_journal_next_entry(j);
cancel_delayed_work(&j->write_work);
@@ -1295,7 +1301,7 @@ static int journal_entry_sectors(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
- unsigned sectors_available = j->entry_size_max >> 9;
+ unsigned sectors_available = UINT_MAX;
unsigned i, nr_online = 0, nr_devs = 0;
lockdep_assert_held(&j->lock);
@@ -1363,6 +1369,10 @@ static int journal_entry_open(struct journal *j)
if (sectors <= 0)
return sectors;
+ buf->disk_sectors = sectors;
+
+ sectors = min_t(unsigned, sectors, buf->size >> 9);
+
j->cur_buf_sectors = sectors;
buf->nr_prio_buckets = j->nr_prio_buckets;
@@ -1464,18 +1474,15 @@ void bch2_journal_start(struct bch_fs *c)
int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
{
- int ret = 0, keys = 0, entries = 0;
struct journal *j = &c->journal;
struct bkey_i *k, *_n;
struct jset_entry *entry;
struct journal_replay *i, *n;
+ int ret = 0, did_replay = 0;
list_for_each_entry_safe(i, n, list, list) {
- j->cur_pin_list =
- &j->pin.data[((j->pin.back - 1 -
- (atomic64_read(&j->seq) -
- le64_to_cpu(i->j.seq))) &
- j->pin.mask)];
+ j->replay_pin_list =
+ journal_seq_pin(j, le64_to_cpu(i->j.seq));
for_each_jset_key(k, _n, entry, &i->j) {
struct disk_reservation disk_res;
@@ -1499,16 +1506,16 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
}
cond_resched();
- keys++;
+ did_replay = true;
}
- if (atomic_dec_and_test(&j->cur_pin_list->count))
+ if (atomic_dec_and_test(&j->replay_pin_list->count))
wake_up(&j->wait);
-
- entries++;
}
- if (keys) {
+ j->replay_pin_list = NULL;
+
+ if (did_replay) {
bch2_btree_flush(c);
/*
@@ -1517,17 +1524,14 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
* arbitrarily far in the future vs. the most recently written journal
* entry on disk, if we crash before writing the next journal entry:
*/
- ret = bch2_journal_meta(&c->journal);
+ ret = bch2_journal_meta(j);
if (ret) {
bch_err(c, "journal replay: error %d flushing journal", ret);
goto err;
}
}
- bch_info(c, "journal replay done, %i keys in %i entries, seq %llu",
- keys, entries, (u64) atomic64_read(&j->seq));
-
- bch2_journal_set_replay_done(&c->journal);
+ bch2_journal_set_replay_done(j);
err:
bch2_journal_entries_free(list);
return ret;
@@ -1763,11 +1767,16 @@ static void journal_pin_add_entry(struct journal *j,
}
void bch2_journal_pin_add(struct journal *j,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+ struct journal_res *res,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
{
+ struct journal_entry_pin_list *pin_list = res->ref
+ ? journal_seq_pin(j, res->seq)
+ : j->replay_pin_list;
+
spin_lock_irq(&j->pin_lock);
- __journal_pin_add(j, j->cur_pin_list, pin, flush_fn);
+ __journal_pin_add(j, pin_list, pin, flush_fn);
spin_unlock_irq(&j->pin_lock);
}
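Callers now hand bch2_journal_pin_add() the journal_res from their reservation; during replay there is no reservation (res->ref is zero), so the pin lands on j->replay_pin_list instead. An illustrative caller, with hypothetical pin and callback names:

	/* illustrative only -- 'my_pin' and 'my_flush' are hypothetical: */
	static void my_flush(struct journal *j,
			     struct journal_entry_pin *pin, u64 seq)
	{
		/* flush whatever the pin protects, then release it */
		bch2_journal_pin_drop(j, pin);
	}

	...
	bch2_journal_pin_add(j, &res, &my_pin, my_flush);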
@@ -1828,7 +1837,7 @@ void bch2_journal_pin_add_if_older(struct journal *j,
}
static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush)
+journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
@@ -1851,6 +1860,7 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush)
if (ret) {
/* must be list_del_init(), see bch2_journal_pin_drop() */
list_del_init(&ret->list);
+ *seq = journal_pin_seq(j, pin_list);
break;
}
}
@@ -1875,9 +1885,10 @@ static bool journal_has_pins(struct journal *j)
void bch2_journal_flush_pins(struct journal *j)
{
struct journal_entry_pin *pin;
+ u64 seq;
- while ((pin = journal_get_next_pin(j, U64_MAX)))
- pin->flush(j, pin);
+ while ((pin = journal_get_next_pin(j, U64_MAX, &seq)))
+ pin->flush(j, pin, seq);
wait_event(j->wait, !journal_has_pins(j) || bch2_journal_error(j));
}
@@ -1920,7 +1931,7 @@ static void journal_reclaim_work(struct work_struct *work)
struct journal *j = &c->journal;
struct bch_dev *ca;
struct journal_entry_pin *pin;
- u64 seq_to_flush = 0;
+ u64 seq, seq_to_flush = 0;
unsigned iter, bucket_to_flush;
unsigned long next_flush;
bool reclaim_lock_held = false, need_flush;
@@ -1994,9 +2005,9 @@ static void journal_reclaim_work(struct work_struct *work)
while ((pin = journal_get_next_pin(j, need_flush
? U64_MAX
- : seq_to_flush))) {
+ : seq_to_flush, &seq))) {
__set_current_state(TASK_RUNNING);
- pin->flush(j, pin);
+ pin->flush(j, pin, seq);
need_flush = false;
j->last_flushed = jiffies;
@@ -2196,17 +2207,39 @@ static void journal_write_done(struct closure *cl)
mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
}
+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
+{
+ /* we aren't holding j->lock: */
+ unsigned new_size = READ_ONCE(j->buf_size_want);
+ void *new_buf;
+
+ if (buf->size >= new_size)
+ return;
+
+ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
+ if (!new_buf)
+ return;
+
+ memcpy(new_buf, buf->data, buf->size);
+ kvpfree(buf->data, buf->size);
+ buf->data = new_buf;
+ buf->size = new_size;
+}
+
static void journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_prev_buf(j);
- struct jset *jset = w->data;
+ struct jset *jset;
struct bio *bio;
struct bch_extent_ptr *ptr;
unsigned i, sectors, bytes;
+ journal_buf_realloc(j, w);
+ jset = w->data;
+
j->write_start_time = local_clock();
bch2_journal_add_prios(j, w);
@@ -2346,6 +2379,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *buf;
int ret;
retry:
ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
@@ -2365,7 +2399,18 @@ retry:
}
/*
- * Ok, no more room in the current journal entry - try to start a new
+ * If we couldn't get a reservation because the current buf filled up,
+ * and we had room for a bigger entry on disk, signal that we want to
+ * realloc the journal bufs:
+ */
+ buf = journal_cur_buf(j);
+ if (journal_entry_is_open(j) &&
+ buf->size >> 9 < buf->disk_sectors &&
+ buf->size < JOURNAL_ENTRY_SIZE_MAX)
+ j->buf_size_want = max(j->buf_size_want, buf->size << 1);
+
+ /*
+ * Close the current journal entry if necessary, then try to start a new
* one:
*/
switch (journal_buf_switch(j, false)) {
@@ -2765,11 +2810,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
bch2_sb_get_journal(sb);
- unsigned i, journal_entry_pages;
-
- journal_entry_pages =
- DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb),
- PAGE_SECTORS);
+ unsigned i;
ja->nr = bch2_nr_journal_buckets(journal_buckets);
@@ -2777,7 +2818,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (!ja->bucket_seq)
return -ENOMEM;
- ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages);
+ ca->journal.bio = bio_kmalloc(GFP_KERNEL,
+ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE));
if (!ca->journal.bio)
return -ENOMEM;
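JOURNAL_ENTRY_SIZE_MIN/JOURNAL_ENTRY_SIZE_MAX replace the old per-superblock entry_size_max, so every device's journal bio can be sized once for the global upper bound. Assumed definitions, for illustration only (the real values live in the journal headers):

	#define JOURNAL_ENTRY_SIZE_MIN		(64U << 10)	/* assumed: 64 KiB */
	#define JOURNAL_ENTRY_SIZE_MAX		(4U << 20)	/* assumed: 4 MiB */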
@@ -2793,17 +2835,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
- unsigned order = get_order(j->entry_size_max);
-
- free_pages((unsigned long) j->buf[1].data, order);
- free_pages((unsigned long) j->buf[0].data, order);
+ kvpfree(j->buf[1].data, j->buf[1].size);
+ kvpfree(j->buf[0].data, j->buf[0].size);
free_fifo(&j->pin);
}
-int bch2_fs_journal_init(struct journal *j, unsigned entry_size_max)
+int bch2_fs_journal_init(struct journal *j)
{
static struct lock_class_key res_key;
- unsigned order = get_order(entry_size_max);
spin_lock_init(&j->lock);
spin_lock_init(&j->pin_lock);
@@ -2817,7 +2856,8 @@ int bch2_fs_journal_init(struct journal *j, unsigned entry_size_max)
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
- j->entry_size_max = entry_size_max;
+ j->buf[0].size = JOURNAL_ENTRY_SIZE_MIN;
+ j->buf[1].size = JOURNAL_ENTRY_SIZE_MIN;
j->write_delay_ms = 100;
j->reclaim_delay_ms = 100;
@@ -2828,9 +2868,11 @@ int bch2_fs_journal_init(struct journal *j, unsigned entry_size_max)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
- !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
+ !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
+ !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL)))
return -ENOMEM;
+ j->pin.front = j->pin.back = 1;
+
return 0;
}