Diffstat (limited to 'libbcache/journal.c')
-rw-r--r--  libbcache/journal.c  318
1 file changed, 208 insertions(+), 110 deletions(-)
diff --git a/libbcache/journal.c b/libbcache/journal.c
index 99dd9f2..b283837 100644
--- a/libbcache/journal.c
+++ b/libbcache/journal.c
@@ -545,8 +545,7 @@ static int journal_entry_validate(struct cache_set *c,
return BCH_FSCK_UNKNOWN_VERSION;
}
- if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9 ||
- bytes > c->journal.entry_size_max, c,
+ if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c,
"journal entry too big (%zu bytes), sector %lluu",
bytes, sector)) {
/* XXX: note we might have missing journal entries */
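The surviving check bounds a journal entry only by the space left in its bucket; the separate entry_size_max cap is dropped. As a minimal standalone sketch of that check (plain C, not the kernel code), with the sector-to-byte shift made explicit:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* A journal entry fits if it is no larger than the bytes left in its
 * bucket; bucket_sectors_left is in 512-byte sectors, so << 9 converts
 * sectors to bytes. */
static bool journal_entry_fits(size_t bytes, uint64_t bucket_sectors_left)
{
        return bytes <= (bucket_sectors_left << 9);
}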
@@ -1406,13 +1405,7 @@ void bch_journal_start(struct cache_set *c)
{
struct journal *j = &c->journal;
struct journal_seq_blacklist *bl;
- struct cache *ca;
u64 new_seq = 0;
- unsigned i;
-
- for_each_cache(ca, c, i)
- if (is_journal_device(ca))
- bch_dev_group_add(&c->journal.devs, ca);
list_for_each_entry(bl, &j->seq_blacklist, list)
new_seq = max(new_seq, bl->seq);
@@ -1534,48 +1527,111 @@ err:
return ret;
}
-static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
+static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca,
+ unsigned nr, bool write_super)
{
+ struct journal *j = &c->journal;
struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets =
- bch_sb_get_journal(ca->disk_sb.sb);
- struct bch_sb_field *f;
- u64 *p;
+ struct bch_sb_field_journal *journal_buckets;
+ struct disk_reservation disk_res = { 0, 0 };
+ struct closure cl;
+ u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+ int ret = 0;
- p = krealloc(ja->bucket_seq, nr * sizeof(u64),
- GFP_KERNEL|__GFP_ZERO);
- if (!p)
- return -ENOMEM;
+ closure_init_stack(&cl);
- ja->bucket_seq = p;
+ mutex_lock(&c->sb_lock);
- p = krealloc(ja->buckets, nr * sizeof(u64),
- GFP_KERNEL|__GFP_ZERO);
- if (!p)
- return -ENOMEM;
+ /* don't handle reducing nr of buckets yet: */
+ if (nr <= ja->nr)
+ goto err;
- ja->buckets = p;
+ /*
+ * note: journal buckets aren't really counted as _sectors_ used yet, so
+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+ * when space used goes up without a reservation - but we do need the
+ * reservation to ensure we'll actually be able to allocate:
+ */
- f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr +
- sizeof(*journal_buckets) / sizeof(u64));
- if (!f)
- return -ENOMEM;
- f->type = BCH_SB_FIELD_journal;
+ ret = -ENOSPC;
+ if (bch_disk_reservation_get(c, &disk_res,
+ (nr - ja->nr) << ca->bucket_bits, 0))
+ goto err;
- ja->nr = nr;
- return 0;
+ ret = -ENOMEM;
+ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+ if (!new_buckets || !new_bucket_seq)
+ goto err;
+
+ journal_buckets = bch_sb_resize_journal(&ca->disk_sb,
+ nr + sizeof(*journal_buckets) / sizeof(u64));
+ if (!journal_buckets)
+ goto err;
+
+ spin_lock(&j->lock);
+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
+ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
+ swap(new_buckets, ja->buckets);
+ swap(new_bucket_seq, ja->bucket_seq);
+
+ while (ja->nr < nr) {
+ /* must happen under journal lock, to avoid racing with gc: */
+ u64 b = bch_bucket_alloc(ca, RESERVE_NONE);
+ if (!b) {
+ if (!closure_wait(&c->freelist_wait, &cl)) {
+ spin_unlock(&j->lock);
+ closure_sync(&cl);
+ spin_lock(&j->lock);
+ }
+ continue;
+ }
+
+ bch_mark_metadata_bucket(ca, &ca->buckets[b],
+ BUCKET_JOURNAL, false);
+ bch_mark_alloc_bucket(ca, &ca->buckets[b], false);
+
+ memmove(ja->buckets + ja->last_idx + 1,
+ ja->buckets + ja->last_idx,
+ (ja->nr - ja->last_idx) * sizeof(u64));
+ memmove(ja->bucket_seq + ja->last_idx + 1,
+ ja->bucket_seq + ja->last_idx,
+ (ja->nr - ja->last_idx) * sizeof(u64));
+ memmove(journal_buckets->buckets + ja->last_idx + 1,
+ journal_buckets->buckets + ja->last_idx,
+ (ja->nr - ja->last_idx) * sizeof(u64));
+
+ ja->buckets[ja->last_idx] = b;
+ journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b);
+
+ if (ja->last_idx < ja->nr) {
+ if (ja->cur_idx >= ja->last_idx)
+ ja->cur_idx++;
+ ja->last_idx++;
+ }
+ ja->nr++;
+
+ }
+ spin_unlock(&j->lock);
+
+ BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi));
+
+ if (write_super)
+ bch_write_super(c);
+
+ ret = 0;
+err:
+ mutex_unlock(&c->sb_lock);
+
+ kfree(new_bucket_seq);
+ kfree(new_buckets);
+ bch_disk_reservation_put(c, &disk_res);
+
+ return ret;
}
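The allocation loop above may not sleep while holding j->lock, so on failure it parks itself on c->freelist_wait, drops the lock, waits, and retries with the lock retaken. A minimal userspace analogue of that drop-lock-wait-retry pattern, assuming a toy freelist and using a pthreads condition variable where the kernel code uses a closure:

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  freelist_wait = PTHREAD_COND_INITIALIZER;
static uint64_t freelist[8];    /* toy stand-in for the allocator */
static unsigned freelist_nr;

static uint64_t alloc_bucket_blocking(void)
{
        uint64_t b;

        pthread_mutex_lock(&lock);
        while (!freelist_nr) {
                /* pthread_cond_wait() drops the lock while sleeping and
                 * retakes it before returning -- the same shape as the
                 * spin_unlock()/closure_sync()/spin_lock() dance above */
                pthread_cond_wait(&freelist_wait, &lock);
        }
        b = freelist[--freelist_nr];
        pthread_mutex_unlock(&lock);
        return b;
}

static void free_bucket(uint64_t b)
{
        pthread_mutex_lock(&lock);
        freelist[freelist_nr++] = b;
        pthread_cond_broadcast(&freelist_wait);  /* like closure_wake_up() */
        pthread_mutex_unlock(&lock);
}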
int bch_dev_journal_alloc(struct cache *ca)
{
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
- int ret;
- unsigned i;
-
- if (ca->mi.tier != 0)
- return 0;
-
if (dynamic_fault("bcache:add:journal_alloc"))
return -ENOMEM;
@@ -1583,26 +1639,12 @@ int bch_dev_journal_alloc(struct cache *ca)
* clamp journal size to 1024 buckets or 512MB (in sectors), whichever
* is smaller:
*/
- ret = bch_set_nr_journal_buckets(ca,
+ return bch_set_nr_journal_buckets(ca->set, ca,
clamp_t(unsigned, ca->mi.nbuckets >> 8,
BCH_JOURNAL_BUCKETS_MIN,
min(1 << 10,
- (1 << 20) / ca->mi.bucket_size)));
- if (ret)
- return ret;
-
- journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
-
- for (i = 0; i < ja->nr; i++) {
- u64 bucket = ca->mi.first_bucket + i;
-
- ja->buckets[i] = bucket;
- journal_buckets->buckets[i] = cpu_to_le64(bucket);
-
- bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true);
- }
-
- return 0;
+ (1 << 20) / ca->mi.bucket_size)),
+ false);
}
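The sizing rule reads: aim for nbuckets >> 8 journal buckets (1/256 of the device), but at least BCH_JOURNAL_BUCKETS_MIN and at most min(1024 buckets, 1 << 20 sectors = 512 MiB worth). A worked standalone version (the value of BCH_JOURNAL_BUCKETS_MIN is assumed here; the real one comes from the headers):

#include <stdio.h>
#include <stdint.h>

#define BCH_JOURNAL_BUCKETS_MIN 8       /* assumed value, for illustration */

static unsigned journal_buckets_wanted(uint64_t nbuckets,
                                       unsigned bucket_size /* in sectors */)
{
        unsigned want = (unsigned) (nbuckets >> 8); /* ~1/256 of the device */
        unsigned max  = (1 << 20) / bucket_size;    /* 1M sectors = 512 MiB */

        if (max > 1 << 10)
                max = 1 << 10;                      /* at most 1024 buckets */
        if (want > max)
                want = max;
        if (want < BCH_JOURNAL_BUCKETS_MIN)
                want = BCH_JOURNAL_BUCKETS_MIN;
        return want;
}

int main(void)
{
        /* 1M buckets of 256 sectors (128 KiB): clamp(4096, 8, 1024) = 1024 */
        printf("%u\n", journal_buckets_wanted(1ULL << 20, 256));
        return 0;
}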
/* Journalling */
@@ -1726,14 +1768,12 @@ void bch_journal_pin_add_if_older(struct journal *j,
fifo_entry_idx(&j->pin, pin->pin_list))) {
if (journal_pin_active(pin))
__journal_pin_drop(j, pin);
- __journal_pin_add(j, src_pin->pin_list,
- pin, NULL);
+ __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
}
spin_unlock_irq(&j->pin_lock);
}
-
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 seq_to_flush)
{
@@ -1766,6 +1806,29 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush)
return ret;
}
+static bool journal_has_pins(struct journal *j)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ journal_reclaim_fast(j);
+ ret = fifo_used(&j->pin) > 1 ||
+ atomic_read(&fifo_peek_front(&j->pin).count) > 1;
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+void bch_journal_flush_pins(struct journal *j)
+{
+ struct journal_entry_pin *pin;
+
+ while ((pin = journal_get_next_pin(j, U64_MAX)))
+ pin->flush(j, pin);
+
+ wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j));
+}
+
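The new helpers implement a drain-then-wait: journal_get_next_pin() pops the next pinned entry, bch_journal_flush_pins() runs each entry's flush callback, and the final wait_event() blocks until only the single active entry remains (or the journal errors out), since flush callbacks may complete asynchronously. A stripped-down, single-threaded sketch of the pop-and-invoke half, with a toy pin list in place of the journal's pin FIFO:

#include <stddef.h>

struct pin {
        struct pin *next;
        void (*flush)(struct pin *);    /* releases whatever held the entry */
};

static struct pin *pin_list;            /* toy stand-in for j->pin */

static struct pin *get_next_pin(void)
{
        struct pin *p = pin_list;

        if (p)
                pin_list = p->next;
        return p;
}

static void flush_all_pins(void)
{
        struct pin *p;

        while ((p = get_next_pin()))
                p->flush(p);
}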
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
@@ -1895,8 +1958,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
struct cache_set *c = container_of(j, struct cache_set, journal);
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
struct bch_extent_ptr *ptr;
+ struct journal_device *ja;
struct cache *ca;
- unsigned iter, replicas, replicas_want =
+ bool swapped;
+ unsigned i, replicas, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
spin_lock(&j->lock);
@@ -1921,12 +1986,27 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
replicas = bch_extent_nr_ptrs(e.c);
+ spin_lock(&j->devs.lock);
+
+ /* Sort by tier: */
+ do {
+ swapped = false;
+
+ for (i = 0; i + 1 < j->devs.nr; i++)
+ if (j->devs.d[i + 0].dev->mi.tier >
+ j->devs.d[i + 1].dev->mi.tier) {
+ swap(j->devs.d[i], j->devs.d[i + 1]);
+ swapped = true;
+ }
+ } while (swapped);
+
/*
- * Determine location of the next journal write:
- * XXX: sort caches by free journal space
+ * Pick devices for next journal write:
+ * XXX: sort devices by free journal space?
*/
- group_for_each_cache_rcu(ca, &j->devs, iter) {
- struct journal_device *ja = &ca->journal;
+ for (i = 0; i < j->devs.nr; i++) {
+ ca = j->devs.d[i].dev;
+ ja = &ca->journal;
if (replicas >= replicas_want)
break;
@@ -1954,7 +2034,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
}
-
+ spin_unlock(&j->devs.lock);
rcu_read_unlock();
j->prev_buf_sectors = 0;
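Sorting j->devs by tier with a bubble sort is deliberate: there are at most a handful of journal devices, so O(n^2) is irrelevant, and an in-place swap loop allocates nothing, which matters while holding j->devs.lock. The same idiom as standalone C:

#include <stdbool.h>
#include <stdio.h>

struct dev { unsigned tier; };

static void sort_by_tier(struct dev *d, unsigned nr)
{
        bool swapped;

        do {
                swapped = false;
                for (unsigned i = 0; i + 1 < nr; i++)
                        if (d[i].tier > d[i + 1].tier) {
                                struct dev tmp = d[i];

                                d[i] = d[i + 1];
                                d[i + 1] = tmp;
                                swapped = true;
                        }
        } while (swapped);
}

int main(void)
{
        struct dev d[] = { { 1 }, { 0 }, { 2 }, { 0 } };

        sort_by_tier(d, 4);
        for (unsigned i = 0; i < 4; i++)
                printf("%u ", d[i].tier);   /* prints: 0 0 1 2 */
        printf("\n");
        return 0;
}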
@@ -2468,50 +2548,6 @@ int bch_journal_flush(struct journal *j)
return bch_journal_flush_seq(j, seq);
}
-void bch_journal_free(struct journal *j)
-{
- unsigned order = get_order(j->entry_size_max);
-
- free_pages((unsigned long) j->buf[1].data, order);
- free_pages((unsigned long) j->buf[0].data, order);
- free_fifo(&j->pin);
-}
-
-int bch_journal_alloc(struct journal *j, unsigned entry_size_max)
-{
- static struct lock_class_key res_key;
- unsigned order = get_order(entry_size_max);
-
- spin_lock_init(&j->lock);
- spin_lock_init(&j->pin_lock);
- init_waitqueue_head(&j->wait);
- INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
- mutex_init(&j->blacklist_lock);
- INIT_LIST_HEAD(&j->seq_blacklist);
- spin_lock_init(&j->devs.lock);
- mutex_init(&j->reclaim_lock);
-
- lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
-
- j->entry_size_max = entry_size_max;
- j->write_delay_ms = 100;
- j->reclaim_delay_ms = 100;
-
- bkey_extent_init(&j->key);
-
- atomic64_set(&j->reservations.counter,
- ((union journal_res_state)
- { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
-
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
- !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
- return -ENOMEM;
-
- return 0;
-}
-
ssize_t bch_journal_print_debug(struct journal *j, char *buf)
{
union journal_res_state *s = &j->reservations;
@@ -2643,13 +2679,31 @@ int bch_journal_move(struct cache *ca)
return ret;
}
-void bch_journal_free_cache(struct cache *ca)
+void bch_fs_journal_stop(struct journal *j)
+{
+ if (!test_bit(JOURNAL_STARTED, &j->flags))
+ return;
+
+ /*
+ * Empty out the journal by first flushing everything pinning existing
+ * journal entries, then force a brand new empty journal entry to be
+ * written:
+ */
+ bch_journal_flush_pins(j);
+ bch_journal_flush_async(j, NULL);
+ bch_journal_meta(j);
+
+ cancel_delayed_work_sync(&j->write_work);
+ cancel_delayed_work_sync(&j->reclaim_work);
+}
+
+void bch_dev_journal_exit(struct cache *ca)
{
kfree(ca->journal.buckets);
kfree(ca->journal.bucket_seq);
}
-int bch_journal_init_cache(struct cache *ca)
+int bch_dev_journal_init(struct cache *ca)
{
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
@@ -2679,3 +2733,47 @@ int bch_journal_init_cache(struct cache *ca)
return 0;
}
+
+void bch_fs_journal_exit(struct journal *j)
+{
+ unsigned order = get_order(j->entry_size_max);
+
+ free_pages((unsigned long) j->buf[1].data, order);
+ free_pages((unsigned long) j->buf[0].data, order);
+ free_fifo(&j->pin);
+}
+
+int bch_fs_journal_init(struct journal *j, unsigned entry_size_max)
+{
+ static struct lock_class_key res_key;
+ unsigned order = get_order(entry_size_max);
+
+ spin_lock_init(&j->lock);
+ spin_lock_init(&j->pin_lock);
+ init_waitqueue_head(&j->wait);
+ INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+ INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
+ mutex_init(&j->blacklist_lock);
+ INIT_LIST_HEAD(&j->seq_blacklist);
+ spin_lock_init(&j->devs.lock);
+ mutex_init(&j->reclaim_lock);
+
+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+ j->entry_size_max = entry_size_max;
+ j->write_delay_ms = 100;
+ j->reclaim_delay_ms = 100;
+
+ bkey_extent_init(&j->key);
+
+ atomic64_set(&j->reservations.counter,
+ ((union journal_res_state)
+ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
+ !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
+ return -ENOMEM;
+
+ return 0;
+}
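bch_fs_journal_init() seeds j->reservations by building a journal_res_state on the stack and storing its raw 64-bit image, so that all reservation state fits in one word and can be read or updated with a single atomic operation. A sketch of that union-over-u64 idiom with C11 atomics (field layout here is illustrative, not the real journal_res_state):

#include <stdatomic.h>
#include <stdint.h>

union res_state {
        struct {
                uint32_t cur_entry_offset;  /* illustrative fields only */
                uint32_t idx_and_count;
        };
        uint64_t v;                         /* whole state as one word */
};

static _Atomic uint64_t reservations;

static void res_state_init(uint32_t closed_val)
{
        union res_state s = { .cur_entry_offset = closed_val };

        /* one atomic store publishes the entire state */
        atomic_store(&reservations, s.v);
}

int main(void)
{
        res_state_init(0xfffffu);   /* a CLOSED-style sentinel value */
        return 0;
}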