diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2015-06-05 05:33:21 -0700 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-07 12:34:24 -0800 |
commit | 66a7eff674bb945487d93162a2479bf216ebf0d8 (patch) | |
tree | 7042969abdd2a3fb46dc3068ce095b9deee9adc4 | |
parent | 9485f4233fddb6ac6b2de028554f2c2e448b119b (diff) |
bcache: journal error handling (WIP)
-rw-r--r-- | drivers/md/bcache/alloc.c | 5 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 70 | ||||
-rw-r--r-- | drivers/md/bcache/btree.h | 6 | ||||
-rw-r--r-- | drivers/md/bcache/gc.c | 12 | ||||
-rw-r--r-- | drivers/md/bcache/journal.c | 119 | ||||
-rw-r--r-- | drivers/md/bcache/journal.h | 4 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 12 | ||||
-rw-r--r-- | drivers/md/bcache/super.h | 1 |
8 files changed, 167 insertions, 62 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 0e001cdae0e4..4b6930452935 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -284,9 +284,10 @@ static int bch_prio_write(struct cache *ca) if (bch_meta_write_fault("prio")) ret = -EIO; if (ret) { - bch_cache_error(ca, + __bch_cache_error(ca, "IO error %d writing prios to bucket %lu", - ret, r); + ret, r); + bch_cache_set_io_error(c); return ret; } } diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 1cc4635a1e1e..02be79a7d547 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -541,7 +541,7 @@ out: mempool_free(iter, &c->fill_iter); return; err: - set_btree_node_io_error(b); + set_btree_node_read_error(b); btree_node_error(b, ca, ptr, "%s", err); goto out; } @@ -564,7 +564,7 @@ static void bch_btree_node_read(struct cache_set *c, struct btree *b) pick = bch_btree_pick_ptr(c, b); if (!pick.ca) { - set_btree_node_io_error(b); + set_btree_node_read_error(b); goto missing; } @@ -585,11 +585,11 @@ static void bch_btree_node_read(struct cache_set *c, struct btree *b) if (bio->bi_error || bch_meta_read_fault("btree")) - set_btree_node_io_error(b); + set_btree_node_read_error(b); bio_put(bio); - if (btree_node_io_error(b)) + if (btree_node_read_error(b)) goto err; bch_btree_node_read_done(c, b, pick.ca, &pick.ptr); @@ -630,7 +630,9 @@ static void btree_node_write_done(struct closure *cl) struct btree_write *w = btree_prev_write(b); struct cache_set *c = b->c; - btree_complete_write(c, b, w); + /* XXX: pin btree node in memory somehow */ + if (!btree_node_write_error(b)) + btree_complete_write(c, b, w); if (btree_node_dirty(b) && c->btree_flush_delay) schedule_delayed_work(&b->work, c->btree_flush_delay * HZ); @@ -644,8 +646,13 @@ static void btree_node_write_endio(struct bio *bio) struct btree *b = container_of(cl, struct btree, io); struct bch_write_bio *wbio = to_wbio(bio); - if (bio->bi_error || bch_meta_write_fault("btree")) - set_btree_node_io_error(b); + if (bio->bi_error || bch_meta_write_fault("btree")) { + set_btree_node_write_error(b); + + __bch_cache_error(wbio->bio.ca, "IO error %d writing btree", + bio->bi_error); + bch_cache_set_io_error(wbio->bio.ca->set); + } if (wbio->orig) bio_endio(wbio->orig); @@ -1596,7 +1603,7 @@ retry: prefetch(b->keys.set[i].data); } - if (btree_node_io_error(b)) { + if (btree_node_read_error(b)) { __btree_node_unlock(iter, level, b); return ERR_PTR(-EIO); } @@ -2020,7 +2027,7 @@ int bch_btree_root_read(struct cache_set *c, enum btree_id id, bch_btree_node_read(c, b); six_unlock_write(&b->lock); - if (btree_node_io_error(b)) { + if (btree_node_read_error(b)) { six_unlock_intent(&b->lock); return -EIO; } @@ -2067,7 +2074,8 @@ int bch_btree_node_rewrite(struct btree *b, struct btree_iter *iter, bool wait) bch_btree_node_write(n, &cl, NULL); closure_sync(&cl); - if (bch_journal_error(&c->journal)) { + if (bch_journal_error(&c->journal) || + btree_node_write_error(n)) { bch_btree_node_free_never_inserted(c, n); six_unlock_intent(&n->lock); return -EIO; @@ -2220,9 +2228,9 @@ static bool btree_insert_key(struct cache_set *c, struct btree *b, } enum btree_insert_status { - BTREE_INSERT_NO_INSERT, - BTREE_INSERT_INSERTED, + BTREE_INSERT_OK, BTREE_INSERT_NEED_SPLIT, + BTREE_INSERT_ERROR, }; static bool have_enough_space(struct btree *b, struct keylist *insert_keys) @@ -2297,9 +2305,11 @@ bch_btree_insert_keys(struct btree *b, jset_u64s(n_max)); if (!b->level && - test_bit(JOURNAL_REPLAY_DONE, &iter->c->journal.flags)) - bch_journal_res_get(&iter->c->journal, &res, - actual_min, actual_max); + test_bit(JOURNAL_REPLAY_DONE, &iter->c->journal.flags)) { + if (bch_journal_res_get(&iter->c->journal, &res, + actual_min, actual_max)) + return BTREE_INSERT_ERROR; + } /* just wrote a set? */ if (btree_node_need_init_next(b)) @@ -2381,8 +2391,7 @@ do_init_next: bch_btree_init_next(iter->c, b, iter); BUG_ON(!bch_keylist_empty(insert_keys) && inserted && b->level); - return need_split ? BTREE_INSERT_NEED_SPLIT : - inserted ? BTREE_INSERT_INSERTED : BTREE_INSERT_NO_INSERT; + return need_split ? BTREE_INSERT_NEED_SPLIT : BTREE_INSERT_OK; } struct btree_split_state { @@ -2588,7 +2597,10 @@ static int btree_split(struct btree *b, struct btree_iter *iter, closure_sync(&state->stack_cl); /* Check for journal error after waiting on the journal flush: */ - if (bch_journal_error(&c->journal)) + if (bch_journal_error(&c->journal) || + (n3 && btree_node_write_error(n3)) || + (n2 && btree_node_write_error(n2)) || + btree_node_write_error(n1)) goto err; /* New nodes all written, now make them visible: */ @@ -2669,6 +2681,8 @@ static int __bch_btree_insert_node(struct btree *b, u64 *journal_seq, unsigned flags, struct btree_split_state *state) { + int ret; + BUG_ON(iter->nodes[b->level] != b); BUG_ON(!btree_node_intent_locked(iter, b->level)); BUG_ON(b->level && @@ -2677,8 +2691,12 @@ static int __bch_btree_insert_node(struct btree *b, BUG_ON(b->level && !state->reserve); BUG_ON(!b->written); - if (bch_btree_insert_keys(b, iter, insert_keys, replace, journal_seq, - flags) == BTREE_INSERT_NEED_SPLIT) { + switch (bch_btree_insert_keys(b, iter, insert_keys, replace, + journal_seq, flags)) { + case BTREE_INSERT_OK: + return 0; + + case BTREE_INSERT_NEED_SPLIT: if (!b->level) { struct btree_reserve *res; @@ -2701,15 +2719,21 @@ static int __bch_btree_insert_node(struct btree *b, state->reserve = res; } - btree_split(b, iter, insert_keys, flags, state); + ret = btree_split(b, iter, insert_keys, flags, state); if (!b->level) { bch_btree_reserve_put(iter->c, state->reserve); state->reserve = NULL; } - } - return 0; + return ret; + + case BTREE_INSERT_ERROR: + /* Journal error, so we couldn't get a journal reservation: */ + return -EIO; + default: + BUG(); + } } /** diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index f4aef01f97e5..52b52587a7ba 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -144,13 +144,15 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \ { clear_bit(BTREE_NODE_ ## flag, &b->flags); } enum btree_flags { - BTREE_NODE_io_error, + BTREE_NODE_read_error, + BTREE_NODE_write_error, BTREE_NODE_dirty, BTREE_NODE_write_idx, BTREE_NODE_need_init_next, }; -BTREE_FLAG(io_error); +BTREE_FLAG(read_error); +BTREE_FLAG(write_error); BTREE_FLAG(dirty); BTREE_FLAG(write_idx); BTREE_FLAG(need_init_next); diff --git a/drivers/md/bcache/gc.c b/drivers/md/bcache/gc.c index d0b3370b11b0..c08dd74a1015 100644 --- a/drivers/md/bcache/gc.c +++ b/drivers/md/bcache/gc.c @@ -515,7 +515,10 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], /* Insert the newly coalesced nodes */ ret = bch_btree_insert_node(parent, iter, &keylist, NULL, NULL, BTREE_INSERT_NOFAIL, res); - BUG_ON(ret || !bch_keylist_empty(&keylist)); + if (ret) + goto err; + + BUG_ON(!bch_keylist_empty(&keylist)); iter->pos = saved_pos; @@ -535,6 +538,13 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], out: bch_keylist_free(&keylist); bch_btree_reserve_put(c, res); + return; +err: + for (i = 0; i < nr_new_nodes; i++) { + bch_btree_node_free_never_inserted(c, new_nodes[i]); + six_unlock_intent(&new_nodes[i]->lock); + } + goto out; } static int bch_coalesce_btree(struct cache_set *c, enum btree_id btree_id) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index df373ee7a0b9..3cfa378ecd6f 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -776,33 +776,47 @@ static union journal_res_state journal_res_state(unsigned count, }; } +#define JOURNAL_ENTRY_CLOSED ((u32) S32_MAX) + +/* + * Journal error - we also set this in the res state so that we can avoid + * journal_entry_open() opening another entry after the journal has errored: + */ +#define JOURNAL_ENTRY_ERROR ((u32) S32_MAX + 1) + static bool journal_entry_is_open(struct journal *j) { - return j->reservations.cur_entry_offset < S32_MAX; + return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED; } -/* - * Closes the current journal entry so that new reservations cannot be take on - * it - returns true if the count of outstanding reservations is 0. - */ -static bool journal_entry_close(struct journal *j) +static bool __journal_entry_close(struct journal *j, u32 val) { union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); do { old.v = new.v = v; - if (old.cur_entry_offset == S32_MAX) - return old.count == 0; + if (old.cur_entry_offset == val) + break; - new.cur_entry_offset = S32_MAX; + new.cur_entry_offset = val; } while ((v = cmpxchg(&j->reservations.v, old.v, new.v)) != old.v); - journal_cur_write(j)->data->u64s = old.cur_entry_offset; + if (old.cur_entry_offset < JOURNAL_ENTRY_CLOSED) + journal_cur_write(j)->data->u64s = old.cur_entry_offset; return old.count == 0; } +/* + * Closes the current journal entry so that new reservations cannot be take on + * it - returns true if the count of outstanding reservations is 0. + */ +static bool journal_entry_close(struct journal *j) +{ + return __journal_entry_close(j, JOURNAL_ENTRY_CLOSED); +} + /* Number of u64s we can write to the current journal bucket */ static void journal_entry_open(struct journal *j) { @@ -829,11 +843,29 @@ static void journal_entry_open(struct journal *j) u64s = max_t(ssize_t, 0L, u64s); if (u64s > w->data->u64s) { - j->cur_entry_u64s = max_t(ssize_t, 0L, u64s); + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + /* + * Must be set before marking the journal entry as open: + * + * XXX: does this cause any problems if we bail out because of + * JOURNAL_ENTRY_ERROR? + */ + j->cur_entry_u64s = u64s; + + do { + old.v = new.v = v; + + BUG_ON(old.count); + + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR) + break; + + /* Handle any already added entries */ + new.cur_entry_offset = w->data->u64s; + } while ((v = cmpxchg(&j->reservations.v, old.v, new.v)) != old.v); - /* Handle any already added entries */ - atomic64_set(&j->reservations.counter, - journal_res_state(0, w->data->u64s).v); wake_up(&j->wait); } } @@ -1319,11 +1351,18 @@ static void journal_write_endio(struct bio *bio) { struct cache *ca = container_of(bio, struct cache, journal.bio); struct journal_write *w = bio->bi_private; + struct journal *j = w->j; - if (bio->bi_error || bch_meta_write_fault("journal")) - set_bit(JOURNAL_ERROR, &ca->set->journal.flags); + if (bio->bi_error || bch_meta_write_fault("journal")) { + set_bit(JOURNAL_ERROR, &j->flags); + __journal_entry_close(j, JOURNAL_ENTRY_ERROR); - closure_put(&w->j->io); + __bch_cache_error(ca, "IO error %d writing journal", + bio->bi_error); + bch_cache_set_io_error(ca->set); + } + + closure_put(&j->io); percpu_ref_put(&ca->ref); } @@ -1474,11 +1513,20 @@ static bool __journal_write(struct journal *j) __releases(j->lock) { struct cache_set *c = container_of(j, struct cache_set, journal); + unsigned long flags; + + /* + * so we don't see IO_IN_FLIGHT cleared before JOURNAL_ERROR is set - as + * long as we read the flags all together, they're set in the correct + * order so we should be good + */ + flags = READ_ONCE(j->flags); EBUG_ON(!j->reservations.count && - !test_bit(JOURNAL_DIRTY, &j->flags)); + !test_bit(JOURNAL_DIRTY, &flags)); - if (test_bit(JOURNAL_IO_IN_FLIGHT, &j->flags) || + if (test_bit(JOURNAL_IO_IN_FLIGHT, &flags) || + test_bit(JOURNAL_ERROR, &flags) || !journal_entry_close(j)) goto nowrite; @@ -1616,7 +1664,7 @@ static inline bool journal_res_get_fast(struct journal *j, return true; } -static bool __journal_res_get(struct journal *j, struct journal_res *res, +static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned u64s_min, unsigned u64s_max, u64 *start_time) { @@ -1624,7 +1672,7 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res, while (1) { if (journal_res_get_fast(j, res, u64s_min, u64s_max)) - return true; + return 1; spin_lock(&j->lock); @@ -1635,7 +1683,12 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res, */ if (journal_res_get_fast(j, res, u64s_min, u64s_max)) { spin_unlock(&j->lock); - return true; + return 1; + } + + if (bch_journal_error(j)) { + spin_unlock(&j->lock); + return -EIO; } /* local_clock() can of course be 0 but we don't care */ @@ -1644,7 +1697,7 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res, if (!journal_entry_close(j)) { spin_unlock(&j->lock); - return false; + return 0; } if (test_bit(JOURNAL_DIRTY, &j->flags)) { @@ -1656,7 +1709,7 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res, if (!journal_try_write(j)) { trace_bcache_journal_entry_full(c); - return false; + return 0; } } else { /* Try to get a new journal bucket */ @@ -1666,7 +1719,7 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res, /* Still no room, we have to wait */ spin_unlock(&j->lock); trace_bcache_journal_full(c); - return false; + return 0; } spin_unlock(&j->lock); @@ -1684,21 +1737,27 @@ static bool __journal_res_get(struct journal *j, struct journal_res *res, * To ensure forward progress, the current task must not be holding any * btree node write locks. */ -void bch_journal_res_get(struct journal *j, struct journal_res *res, +int bch_journal_res_get(struct journal *j, struct journal_res *res, unsigned u64s_min, unsigned u64s_max) { u64 start_time = 0; + int ret; BUG_ON(res->ref); BUG_ON(u64s_max < u64s_min); wait_event(j->wait, - __journal_res_get(j, res, u64s_min, u64s_max, &start_time)); - - BUG_ON(!res->ref); + (ret = __journal_res_get(j, res, u64s_min, + u64s_max, &start_time))); if (start_time) bch_time_stats_update(&j->full_time, start_time); + + if (ret < 0) + return ret; + + BUG_ON(!res->ref); + return 0; } void bch_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) @@ -1823,7 +1882,7 @@ int bch_journal_alloc(struct journal *j) bkey_extent_init(&j->key); atomic64_set(&j->reservations.counter, - journal_res_state(0, S32_MAX).v); + journal_res_state(0, JOURNAL_ENTRY_CLOSED).v); j->w[0].j = j; j->w[1].j = j; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index d7f1bb208921..8aded34245ee 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -198,8 +198,8 @@ void bch_journal_add_keys(struct journal *, struct journal_res *, unsigned); void bch_journal_res_put(struct journal *, struct journal_res *); -void bch_journal_res_get(struct journal *, struct journal_res *, - unsigned, unsigned); +int bch_journal_res_get(struct journal *, struct journal_res *, + unsigned, unsigned); void bch_journal_flush_seq_async(struct journal *, u64, struct closure *); void bch_journal_flush_async(struct journal *, struct closure *); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 65eb3a258098..20b79dbdbc7e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -801,7 +801,11 @@ static void bch_cache_set_read_only_work(struct work_struct *work) mutex_unlock(&bch_register_lock); } -/* Cache set startup/shutdown: */ +void bch_cache_set_io_error(struct cache_set *c) +{ + pr_err("%pU going read only", c->sb.set_uuid.b); + schedule_work(&c->read_only_work); +} void bch_cache_set_fail(struct cache_set *c) { @@ -819,6 +823,8 @@ void bch_cache_set_fail(struct cache_set *c) } } +/* Cache set startup/shutdown: */ + void bch_cache_set_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); @@ -1228,7 +1234,9 @@ static const char *run_cache_set(struct cache_set *c) goto err; } - bch_journal_replay(c, &journal); + err = "journal replay failed"; + if (bch_journal_replay(c, &journal)) + goto err; err = "error gcing inode nlinks"; if (bch_gc_inode_nlinks(c)) diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h index f3e0bd52d44d..258152a42014 100644 --- a/drivers/md/bcache/super.h +++ b/drivers/md/bcache/super.h @@ -152,6 +152,7 @@ const char *validate_super(struct bcache_superblock *, struct cache_sb *); void bch_cache_member_info_update(struct cache *); +void bch_cache_set_io_error(struct cache_set *); void bch_cache_set_fail(struct cache_set *); void bch_cache_set_release(struct kobject *); |