summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2015-06-05 05:33:21 -0700
committerKent Overstreet <kent.overstreet@gmail.com>2016-10-07 12:34:24 -0800
commit66a7eff674bb945487d93162a2479bf216ebf0d8 (patch)
tree7042969abdd2a3fb46dc3068ce095b9deee9adc4
parent9485f4233fddb6ac6b2de028554f2c2e448b119b (diff)
bcache: journal error handling (WIP)
-rw-r--r--drivers/md/bcache/alloc.c5
-rw-r--r--drivers/md/bcache/btree.c70
-rw-r--r--drivers/md/bcache/btree.h6
-rw-r--r--drivers/md/bcache/gc.c12
-rw-r--r--drivers/md/bcache/journal.c119
-rw-r--r--drivers/md/bcache/journal.h4
-rw-r--r--drivers/md/bcache/super.c12
-rw-r--r--drivers/md/bcache/super.h1
8 files changed, 167 insertions, 62 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 0e001cdae0e4..4b6930452935 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -284,9 +284,10 @@ static int bch_prio_write(struct cache *ca)
if (bch_meta_write_fault("prio"))
ret = -EIO;
if (ret) {
- bch_cache_error(ca,
+ __bch_cache_error(ca,
"IO error %d writing prios to bucket %lu",
- ret, r);
+ ret, r);
+ bch_cache_set_io_error(c);
return ret;
}
}
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 1cc4635a1e1e..02be79a7d547 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -541,7 +541,7 @@ out:
mempool_free(iter, &c->fill_iter);
return;
err:
- set_btree_node_io_error(b);
+ set_btree_node_read_error(b);
btree_node_error(b, ca, ptr, "%s", err);
goto out;
}
@@ -564,7 +564,7 @@ static void bch_btree_node_read(struct cache_set *c, struct btree *b)
pick = bch_btree_pick_ptr(c, b);
if (!pick.ca) {
- set_btree_node_io_error(b);
+ set_btree_node_read_error(b);
goto missing;
}
@@ -585,11 +585,11 @@ static void bch_btree_node_read(struct cache_set *c, struct btree *b)
if (bio->bi_error ||
bch_meta_read_fault("btree"))
- set_btree_node_io_error(b);
+ set_btree_node_read_error(b);
bio_put(bio);
- if (btree_node_io_error(b))
+ if (btree_node_read_error(b))
goto err;
bch_btree_node_read_done(c, b, pick.ca, &pick.ptr);
@@ -630,7 +630,9 @@ static void btree_node_write_done(struct closure *cl)
struct btree_write *w = btree_prev_write(b);
struct cache_set *c = b->c;
- btree_complete_write(c, b, w);
+ /* XXX: pin btree node in memory somehow */
+ if (!btree_node_write_error(b))
+ btree_complete_write(c, b, w);
if (btree_node_dirty(b) && c->btree_flush_delay)
schedule_delayed_work(&b->work, c->btree_flush_delay * HZ);
@@ -644,8 +646,13 @@ static void btree_node_write_endio(struct bio *bio)
struct btree *b = container_of(cl, struct btree, io);
struct bch_write_bio *wbio = to_wbio(bio);
- if (bio->bi_error || bch_meta_write_fault("btree"))
- set_btree_node_io_error(b);
+ if (bio->bi_error || bch_meta_write_fault("btree")) {
+ set_btree_node_write_error(b);
+
+ __bch_cache_error(wbio->bio.ca, "IO error %d writing btree",
+ bio->bi_error);
+ bch_cache_set_io_error(wbio->bio.ca->set);
+ }
if (wbio->orig)
bio_endio(wbio->orig);
@@ -1596,7 +1603,7 @@ retry:
prefetch(b->keys.set[i].data);
}
- if (btree_node_io_error(b)) {
+ if (btree_node_read_error(b)) {
__btree_node_unlock(iter, level, b);
return ERR_PTR(-EIO);
}
@@ -2020,7 +2027,7 @@ int bch_btree_root_read(struct cache_set *c, enum btree_id id,
bch_btree_node_read(c, b);
six_unlock_write(&b->lock);
- if (btree_node_io_error(b)) {
+ if (btree_node_read_error(b)) {
six_unlock_intent(&b->lock);
return -EIO;
}
@@ -2067,7 +2074,8 @@ int bch_btree_node_rewrite(struct btree *b, struct btree_iter *iter, bool wait)
bch_btree_node_write(n, &cl, NULL);
closure_sync(&cl);
- if (bch_journal_error(&c->journal)) {
+ if (bch_journal_error(&c->journal) ||
+ btree_node_write_error(n)) {
bch_btree_node_free_never_inserted(c, n);
six_unlock_intent(&n->lock);
return -EIO;
@@ -2220,9 +2228,9 @@ static bool btree_insert_key(struct cache_set *c, struct btree *b,
}
enum btree_insert_status {
- BTREE_INSERT_NO_INSERT,
- BTREE_INSERT_INSERTED,
+ BTREE_INSERT_OK,
BTREE_INSERT_NEED_SPLIT,
+ BTREE_INSERT_ERROR,
};
static bool have_enough_space(struct btree *b, struct keylist *insert_keys)
@@ -2297,9 +2305,11 @@ bch_btree_insert_keys(struct btree *b,
jset_u64s(n_max));
if (!b->level &&
- test_bit(JOURNAL_REPLAY_DONE, &iter->c->journal.flags))
- bch_journal_res_get(&iter->c->journal, &res,
- actual_min, actual_max);
+ test_bit(JOURNAL_REPLAY_DONE, &iter->c->journal.flags)) {
+ if (bch_journal_res_get(&iter->c->journal, &res,
+ actual_min, actual_max))
+ return BTREE_INSERT_ERROR;
+ }
/* just wrote a set? */
if (btree_node_need_init_next(b))
@@ -2381,8 +2391,7 @@ do_init_next: bch_btree_init_next(iter->c, b, iter);
BUG_ON(!bch_keylist_empty(insert_keys) && inserted && b->level);
- return need_split ? BTREE_INSERT_NEED_SPLIT :
- inserted ? BTREE_INSERT_INSERTED : BTREE_INSERT_NO_INSERT;
+ return need_split ? BTREE_INSERT_NEED_SPLIT : BTREE_INSERT_OK;
}
struct btree_split_state {
@@ -2588,7 +2597,10 @@ static int btree_split(struct btree *b, struct btree_iter *iter,
closure_sync(&state->stack_cl);
/* Check for journal error after waiting on the journal flush: */
- if (bch_journal_error(&c->journal))
+ if (bch_journal_error(&c->journal) ||
+ (n3 && btree_node_write_error(n3)) ||
+ (n2 && btree_node_write_error(n2)) ||
+ btree_node_write_error(n1))
goto err;
/* New nodes all written, now make them visible: */
@@ -2669,6 +2681,8 @@ static int __bch_btree_insert_node(struct btree *b,
u64 *journal_seq, unsigned flags,
struct btree_split_state *state)
{
+ int ret;
+
BUG_ON(iter->nodes[b->level] != b);
BUG_ON(!btree_node_intent_locked(iter, b->level));
BUG_ON(b->level &&
@@ -2677,8 +2691,12 @@ static int __bch_btree_insert_node(struct btree *b,
BUG_ON(b->level && !state->reserve);
BUG_ON(!b->written);
- if (bch_btree_insert_keys(b, iter, insert_keys, replace, journal_seq,
- flags) == BTREE_INSERT_NEED_SPLIT) {
+ switch (bch_btree_insert_keys(b, iter, insert_keys, replace,
+ journal_seq, flags)) {
+ case BTREE_INSERT_OK:
+ return 0;
+
+ case BTREE_INSERT_NEED_SPLIT:
if (!b->level) {
struct btree_reserve *res;
@@ -2701,15 +2719,21 @@ static int __bch_btree_insert_node(struct btree *b,
state->reserve = res;
}
- btree_split(b, iter, insert_keys, flags, state);
+ ret = btree_split(b, iter, insert_keys, flags, state);
if (!b->level) {
bch_btree_reserve_put(iter->c, state->reserve);
state->reserve = NULL;
}
- }
- return 0;
+ return ret;
+
+ case BTREE_INSERT_ERROR:
+ /* Journal error, so we couldn't get a journal reservation: */
+ return -EIO;
+ default:
+ BUG();
+ }
}
/**
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index f4aef01f97e5..52b52587a7ba 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -144,13 +144,15 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \
{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
enum btree_flags {
- BTREE_NODE_io_error,
+ BTREE_NODE_read_error,
+ BTREE_NODE_write_error,
BTREE_NODE_dirty,
BTREE_NODE_write_idx,
BTREE_NODE_need_init_next,
};
-BTREE_FLAG(io_error);
+BTREE_FLAG(read_error);
+BTREE_FLAG(write_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
BTREE_FLAG(need_init_next);
diff --git a/drivers/md/bcache/gc.c b/drivers/md/bcache/gc.c
index d0b3370b11b0..c08dd74a1015 100644
--- a/drivers/md/bcache/gc.c
+++ b/drivers/md/bcache/gc.c
@@ -515,7 +515,10 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
/* Insert the newly coalesced nodes */
ret = bch_btree_insert_node(parent, iter, &keylist, NULL, NULL,
BTREE_INSERT_NOFAIL, res);
- BUG_ON(ret || !bch_keylist_empty(&keylist));
+ if (ret)
+ goto err;
+
+ BUG_ON(!bch_keylist_empty(&keylist));
iter->pos = saved_pos;
@@ -535,6 +538,13 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
out:
bch_keylist_free(&keylist);
bch_btree_reserve_put(c, res);
+ return;
+err:
+ for (i = 0; i < nr_new_nodes; i++) {
+ bch_btree_node_free_never_inserted(c, new_nodes[i]);
+ six_unlock_intent(&new_nodes[i]->lock);
+ }
+ goto out;
}
static int bch_coalesce_btree(struct cache_set *c, enum btree_id btree_id)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index df373ee7a0b9..3cfa378ecd6f 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -776,33 +776,47 @@ static union journal_res_state journal_res_state(unsigned count,
};
}
+#define JOURNAL_ENTRY_CLOSED ((u32) S32_MAX)
+
+/*
+ * Journal error - we also set this in the res state so that we can avoid
+ * journal_entry_open() opening another entry after the journal has errored:
+ */
+#define JOURNAL_ENTRY_ERROR ((u32) S32_MAX + 1)
+
static bool journal_entry_is_open(struct journal *j)
{
- return j->reservations.cur_entry_offset < S32_MAX;
+ return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED;
}
-/*
- * Closes the current journal entry so that new reservations cannot be take on
- * it - returns true if the count of outstanding reservations is 0.
- */
-static bool journal_entry_close(struct journal *j)
+static bool __journal_entry_close(struct journal *j, u32 val)
{
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
- if (old.cur_entry_offset == S32_MAX)
- return old.count == 0;
+ if (old.cur_entry_offset == val)
+ break;
- new.cur_entry_offset = S32_MAX;
+ new.cur_entry_offset = val;
} while ((v = cmpxchg(&j->reservations.v, old.v, new.v)) != old.v);
- journal_cur_write(j)->data->u64s = old.cur_entry_offset;
+ if (old.cur_entry_offset < JOURNAL_ENTRY_CLOSED)
+ journal_cur_write(j)->data->u64s = old.cur_entry_offset;
return old.count == 0;
}
+/*
+ * Closes the current journal entry so that new reservations cannot be taken on
+ * it - returns true if the count of outstanding reservations is 0.
+ */
+static bool journal_entry_close(struct journal *j)
+{
+ return __journal_entry_close(j, JOURNAL_ENTRY_CLOSED);
+}
+
/* Number of u64s we can write to the current journal bucket */
static void journal_entry_open(struct journal *j)
{
@@ -829,11 +843,29 @@ static void journal_entry_open(struct journal *j)
u64s = max_t(ssize_t, 0L, u64s);
if (u64s > w->data->u64s) {
- j->cur_entry_u64s = max_t(ssize_t, 0L, u64s);
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ /*
+ * Must be set before marking the journal entry as open:
+ *
+ * XXX: does this cause any problems if we bail out because of
+ * JOURNAL_ENTRY_ERROR?
+ */
+ j->cur_entry_u64s = u64s;
+
+ do {
+ old.v = new.v = v;
+
+ BUG_ON(old.count);
+
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR)
+ break;
+
+ /* Handle any already added entries */
+ new.cur_entry_offset = w->data->u64s;
+ } while ((v = cmpxchg(&j->reservations.v, old.v, new.v)) != old.v);
- /* Handle any already added entries */
- atomic64_set(&j->reservations.counter,
- journal_res_state(0, w->data->u64s).v);
wake_up(&j->wait);
}
}
@@ -1319,11 +1351,18 @@ static void journal_write_endio(struct bio *bio)
{
struct cache *ca = container_of(bio, struct cache, journal.bio);
struct journal_write *w = bio->bi_private;
+ struct journal *j = w->j;
- if (bio->bi_error || bch_meta_write_fault("journal"))
- set_bit(JOURNAL_ERROR, &ca->set->journal.flags);
+ if (bio->bi_error || bch_meta_write_fault("journal")) {
+ set_bit(JOURNAL_ERROR, &j->flags);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR);
- closure_put(&w->j->io);
+ __bch_cache_error(ca, "IO error %d writing journal",
+ bio->bi_error);
+ bch_cache_set_io_error(ca->set);
+ }
+
+ closure_put(&j->io);
percpu_ref_put(&ca->ref);
}
@@ -1474,11 +1513,20 @@ static bool __journal_write(struct journal *j)
__releases(j->lock)
{
struct cache_set *c = container_of(j, struct cache_set, journal);
+ unsigned long flags;
+
+ /*
+ * so we don't see IO_IN_FLIGHT cleared before JOURNAL_ERROR is set - as
+ * long as we read the flags all together, they're set in the correct
+ * order so we should be good
+ */
+ flags = READ_ONCE(j->flags);
EBUG_ON(!j->reservations.count &&
- !test_bit(JOURNAL_DIRTY, &j->flags));
+ !test_bit(JOURNAL_DIRTY, &flags));
- if (test_bit(JOURNAL_IO_IN_FLIGHT, &j->flags) ||
+ if (test_bit(JOURNAL_IO_IN_FLIGHT, &flags) ||
+ test_bit(JOURNAL_ERROR, &flags) ||
!journal_entry_close(j))
goto nowrite;
@@ -1616,7 +1664,7 @@ static inline bool journal_res_get_fast(struct journal *j,
return true;
}
-static bool __journal_res_get(struct journal *j, struct journal_res *res,
+static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max,
u64 *start_time)
{
@@ -1624,7 +1672,7 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res,
while (1) {
if (journal_res_get_fast(j, res, u64s_min, u64s_max))
- return true;
+ return 1;
spin_lock(&j->lock);
@@ -1635,7 +1683,12 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res,
*/
if (journal_res_get_fast(j, res, u64s_min, u64s_max)) {
spin_unlock(&j->lock);
- return true;
+ return 1;
+ }
+
+ if (bch_journal_error(j)) {
+ spin_unlock(&j->lock);
+ return -EIO;
}
/* local_clock() can of course be 0 but we don't care */
@@ -1644,7 +1697,7 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res,
if (!journal_entry_close(j)) {
spin_unlock(&j->lock);
- return false;
+ return 0;
}
if (test_bit(JOURNAL_DIRTY, &j->flags)) {
@@ -1656,7 +1709,7 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res,
if (!journal_try_write(j)) {
trace_bcache_journal_entry_full(c);
- return false;
+ return 0;
}
} else {
/* Try to get a new journal bucket */
@@ -1666,7 +1719,7 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res,
/* Still no room, we have to wait */
spin_unlock(&j->lock);
trace_bcache_journal_full(c);
- return false;
+ return 0;
}
spin_unlock(&j->lock);
@@ -1684,21 +1737,27 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res,
* To ensure forward progress, the current task must not be holding any
* btree node write locks.
*/
-void bch_journal_res_get(struct journal *j, struct journal_res *res,
+int bch_journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
{
u64 start_time = 0;
+ int ret;
BUG_ON(res->ref);
BUG_ON(u64s_max < u64s_min);
wait_event(j->wait,
- __journal_res_get(j, res, u64s_min, u64s_max, &start_time));
-
- BUG_ON(!res->ref);
+ (ret = __journal_res_get(j, res, u64s_min,
+ u64s_max, &start_time)));
if (start_time)
bch_time_stats_update(&j->full_time, start_time);
+
+ if (ret < 0)
+ return ret;
+
+ BUG_ON(!res->ref);
+ return 0;
}
void bch_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent)
@@ -1823,7 +1882,7 @@ int bch_journal_alloc(struct journal *j)
bkey_extent_init(&j->key);
atomic64_set(&j->reservations.counter,
- journal_res_state(0, S32_MAX).v);
+ journal_res_state(0, JOURNAL_ENTRY_CLOSED).v);
j->w[0].j = j;
j->w[1].j = j;
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index d7f1bb208921..8aded34245ee 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -198,8 +198,8 @@ void bch_journal_add_keys(struct journal *, struct journal_res *,
unsigned);
void bch_journal_res_put(struct journal *, struct journal_res *);
-void bch_journal_res_get(struct journal *, struct journal_res *,
- unsigned, unsigned);
+int bch_journal_res_get(struct journal *, struct journal_res *,
+ unsigned, unsigned);
void bch_journal_flush_seq_async(struct journal *, u64, struct closure *);
void bch_journal_flush_async(struct journal *, struct closure *);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 65eb3a258098..20b79dbdbc7e 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -801,7 +801,11 @@ static void bch_cache_set_read_only_work(struct work_struct *work)
mutex_unlock(&bch_register_lock);
}
-/* Cache set startup/shutdown: */
+void bch_cache_set_io_error(struct cache_set *c)
+{
+ pr_err("%pU going read only", c->sb.set_uuid.b);
+ schedule_work(&c->read_only_work);
+}
void bch_cache_set_fail(struct cache_set *c)
{
@@ -819,6 +823,8 @@ void bch_cache_set_fail(struct cache_set *c)
}
}
+/* Cache set startup/shutdown: */
+
void bch_cache_set_release(struct kobject *kobj)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
@@ -1228,7 +1234,9 @@ static const char *run_cache_set(struct cache_set *c)
goto err;
}
- bch_journal_replay(c, &journal);
+ err = "journal replay failed";
+ if (bch_journal_replay(c, &journal))
+ goto err;
err = "error gcing inode nlinks";
if (bch_gc_inode_nlinks(c))
diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h
index f3e0bd52d44d..258152a42014 100644
--- a/drivers/md/bcache/super.h
+++ b/drivers/md/bcache/super.h
@@ -152,6 +152,7 @@ const char *validate_super(struct bcache_superblock *, struct cache_sb *);
void bch_cache_member_info_update(struct cache *);
+void bch_cache_set_io_error(struct cache_set *);
void bch_cache_set_fail(struct cache_set *);
void bch_cache_set_release(struct kobject *);