diff options
-rw-r--r-- | drivers/md/bcache/Makefile | 6 | ||||
-rw-r--r-- | drivers/md/bcache/alloc.c | 40 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 72 | ||||
-rw-r--r-- | drivers/md/bcache/bkey_methods.c | 49 | ||||
-rw-r--r-- | drivers/md/bcache/bkey_methods.h | 8 | ||||
-rw-r--r-- | drivers/md/bcache/blockdev.c | 7 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 72 | ||||
-rw-r--r-- | drivers/md/bcache/debug.c | 35 | ||||
-rw-r--r-- | drivers/md/bcache/dirent.c | 17 | ||||
-rw-r--r-- | drivers/md/bcache/error.c | 134 | ||||
-rw-r--r-- | drivers/md/bcache/error.h | 202 | ||||
-rw-r--r-- | drivers/md/bcache/extents.c | 59 | ||||
-rw-r--r-- | drivers/md/bcache/fs-gc.c | 9 | ||||
-rw-r--r-- | drivers/md/bcache/fs.c | 15 | ||||
-rw-r--r-- | drivers/md/bcache/gc.c | 16 | ||||
-rw-r--r-- | drivers/md/bcache/inode.c | 19 | ||||
-rw-r--r-- | drivers/md/bcache/io.c | 117 | ||||
-rw-r--r-- | drivers/md/bcache/io.h | 5 | ||||
-rw-r--r-- | drivers/md/bcache/journal.c | 61 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 1 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 130 | ||||
-rw-r--r-- | drivers/md/bcache/super.h | 6 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 7 | ||||
-rw-r--r-- | drivers/md/bcache/xattr.c | 17 |
24 files changed, 631 insertions, 473 deletions
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 650d6542eb5a..95142f1dc2dd 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -3,8 +3,8 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := acl.o alloc.o bkey.o bkey_methods.o blockdev.o\ bset.o btree.o buckets.o chardev.o clock.o closure.o debug.o dirent.o\ - extents.o fs.o fs-gc.o gc.o inode.o io.o journal.o keybuf.o keylist.o\ - migrate.o move.o movinggc.o notify.o request.o siphash.o six.o stats.o\ - super.o sysfs.o tier.o trace.o util.o writeback.o xattr.o + error.o extents.o fs.o fs-gc.o gc.o inode.o io.o journal.o keybuf.o\ + keylist.o migrate.o move.o movinggc.o notify.o request.o siphash.o\ + six.o stats.o super.o sysfs.o tier.o trace.o util.o writeback.o xattr.o ccflags-y := -Werror diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 4b6930452935..d86133c79420 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -58,6 +58,7 @@ #include "btree.h" #include "buckets.h" #include "clock.h" +#include "error.h" #include "extents.h" #include "io.h" #include "journal.h" @@ -281,15 +282,10 @@ static int bch_prio_write(struct cache *ca) spin_unlock(&ca->prio_buckets_lock); ret = prio_io(ca, r, REQ_OP_WRITE); - if (bch_meta_write_fault("prio")) - ret = -EIO; - if (ret) { - __bch_cache_error(ca, - "IO error %d writing prios to bucket %lu", - ret, r); - bch_cache_set_io_error(c); + if (cache_fatal_io_err_on(ret, ca, + "prio write to bucket %lu", r) || + bch_meta_write_fault("prio")) return ret; - } } spin_lock(&c->journal.lock); @@ -300,11 +296,8 @@ static int bch_prio_write(struct cache *ca) spin_unlock(&c->journal.lock); ret = bch_journal_meta(&c->journal); - if (ret) { - __bch_cache_set_error(c, - "IO error %d journalling new prios", ret); + if (cache_set_fatal_err_on(ret, c, "journalling new prios")) return ret; - } /* * Don't want the old priorities to get garbage collected until after we @@ -347,7 +340,7 @@ int 
bch_prio_read(struct cache *ca) if ((bucket < ca->mi.first_bucket && bucket >= ca->mi.nbuckets) || bch_meta_read_fault("prio")) { - bch_cache_error(ca, "bad prio bucket %llu", bucket); + cache_inconsistent(ca, "bad prio bucket %llu", bucket); return -EIO; } @@ -361,32 +354,27 @@ int bch_prio_read(struct cache *ca) bucket_nr++; ret = prio_io(ca, bucket, REQ_OP_READ); - if (ret || bch_meta_read_fault("prio")) { - bch_cache_error(ca, - "IO error %d reading prios from bucket %llu", - ret, bucket); + if (cache_fatal_io_err_on(ret, ca, + "prior read from bucket %llu", + bucket) || + bch_meta_read_fault("prio")) return -EIO; - } got = p->magic; expect = pset_magic(&c->sb); - if (got != expect) { - bch_cache_error(ca, + if (cache_inconsistent_on(got != expect, ca, "bad magic (got %llu expect %llu) while reading prios from bucket %llu", - got, expect, bucket); + got, expect, bucket)) return -EIO; - } got = p->csum; expect = bch_checksum(PSET_CSUM_TYPE(p), &p->magic, bucket_bytes(ca) - 8); - if (got != expect) { - bch_cache_error(ca, + if (cache_inconsistent_on(got != expect, ca, "bad checksum (got %llu expect %llu) while reading prios from bucket %llu", - got, expect, bucket); + got, expect, bucket)) return -EIO; - } bucket = p->next_bucket; d = p->data; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 224858445b5e..d1b0b61de590 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -237,7 +237,6 @@ struct cache { struct percpu_ref ref; struct rcu_head free_rcu; struct work_struct free_work; - struct work_struct read_only_work; struct work_struct remove_work; unsigned long flags; @@ -376,6 +375,7 @@ enum { CACHE_SET_STOPPING, CACHE_SET_RUNNING, CACHE_SET_RO, + CACHE_SET_RO_COMPLETE, CACHE_SET_GC_STOPPING, CACHE_SET_GC_FAILURE, CACHE_SET_BDEV_MOUNTED, @@ -404,6 +404,8 @@ struct cache_set { int minor; struct device *chardev; + struct super_block *vfs_sb; + char uuid[40]; /* Counts outstanding writes, for clean transition to 
read-only */ struct percpu_ref writes; @@ -687,74 +689,6 @@ static inline unsigned bucket_bytes(const struct cache *ca) #define prio_buckets(ca) \ DIV_ROUND_UP((size_t) (ca)->mi.nbuckets, prios_per_bucket(ca)) -/* Error handling macros */ - -/* The underscore versions merely log an error, they don't fail the cache set */ -#define __bch_cache_set_error(c, fmt, ...) \ - printk(KERN_ERR "bcache: error on %pU: " fmt "\n", \ - (c)->sb.set_uuid.b, ##__VA_ARGS__) - -#define __bch_cache_error(ca, fmt, ...) \ -do { \ - char _buf[BDEVNAME_SIZE]; \ - __bch_cache_set_error((ca)->set, "%s: " fmt, \ - bdevname((ca)->disk_sb.bdev, _buf), \ - ##__VA_ARGS__); \ -} while (0) - -/* These do fail the cache set */ -#define bch_cache_set_error(c, ...) \ -do { \ - __bch_cache_set_error(c, __VA_ARGS__); \ - bch_cache_set_fail(c); \ -} while (0) - -#define bch_cache_error(ca, ...) \ -do { \ - __bch_cache_error(ca, __VA_ARGS__); \ - bch_cache_set_fail((ca)->set); \ -} while (0) - -#define btree_bug(b, ...) \ -do { \ - __bch_cache_set_error((b)->c, __VA_ARGS__); \ - BUG(); \ -} while (0) - -#define cache_set_bug(c, ...) \ -do { \ - __bch_cache_set_error(c, __VA_ARGS__); \ - BUG(); \ -} while (0) - -#define btree_bug_on(cond, b, ...) \ -do { \ - if (cond) \ - btree_bug(b, __VA_ARGS__); \ -} while (0) - -#define cache_set_bug_on(cond, c, ...) \ -do { \ - if (cond) \ - cache_set_bug(c, __VA_ARGS__); \ -} while (0) - -#define cache_set_err_on(cond, c, ...) \ -do { \ - if (cond) \ - bch_cache_set_error(c, __VA_ARGS__); \ -} while (0) - -#define __bcache_io_error(c, fmt, ...) \ - printk_ratelimited(KERN_ERR "bcache: IO error on %pU: " fmt "\n",\ - (c)->sb.set_uuid.b, ##__VA_ARGS__) - -#define bcache_io_error(c, bio, fmt, ...) 
\ -do { \ - __bcache_io_error(c, fmt, ##__VA_ARGS__); \ - (bio)->bi_error = -EIO; \ -} while (0) - /* Forward declarations */ long bch_chardev_ioctl(struct file *, unsigned, unsigned long); diff --git a/drivers/md/bcache/bkey_methods.c b/drivers/md/bcache/bkey_methods.c index fc0ca2631921..9544f8696d26 100644 --- a/drivers/md/bcache/bkey_methods.c +++ b/drivers/md/bcache/bkey_methods.c @@ -3,6 +3,7 @@ #include "bkey_methods.h" #include "btree.h" #include "dirent.h" +#include "error.h" #include "extents.h" #include "inode.h" #include "xattr.h" @@ -15,58 +16,68 @@ static const struct bkey_ops *bch_bkey_ops[] = { [BKEY_TYPE_BTREE] = &bch_bkey_btree_ops, }; -bool bkey_invalid(struct cache_set *c, - enum bkey_type type, - struct bkey_s_c k) +/* Returns string indicating reason for being invalid, or NULL if valid: */ +const char *bkey_invalid(struct cache_set *c, enum bkey_type type, + struct bkey_s_c k) { const struct bkey_ops *ops = bch_bkey_ops[type]; if (k.k->u64s < BKEY_U64s) - return true; + return "u64s too small"; if (k.k->size && (bkey_deleted(k.k) || !ops->is_extents)) - return true; + return "nonzero size field"; switch (k.k->type) { case KEY_TYPE_DELETED: - return false; + return NULL; case KEY_TYPE_DISCARD: case KEY_TYPE_ERROR: - return bkey_val_bytes(k.k) != 0; + return bkey_val_bytes(k.k) != 0 + ? "value size should be zero" + : NULL; case KEY_TYPE_COOKIE: - return (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)); + return bkey_val_bytes(k.k) != sizeof(struct bch_cookie) + ? 
"incorrect value size" + : NULL; default: if (k.k->type < KEY_TYPE_GENERIC_NR) - return true; + return "invalid type"; return ops->key_invalid(c, k); } } +const char *btree_bkey_invalid(struct cache_set *c, struct btree *b, + struct bkey_s_c k) +{ + if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0) + return "key before start of btree node"; + + if (bkey_cmp(k.k->p, b->data->max_key) > 0) + return "key past end of btree node"; + + return bkey_invalid(c, btree_node_type(b), k); +} + void bkey_debugcheck(struct cache_set *c, struct btree *b, struct bkey_s_c k) { enum bkey_type type = btree_node_type(b); const struct bkey_ops *ops = bch_bkey_ops[type]; + const char *invalid; BUG_ON(!k.k->u64s); - cache_set_bug_on(bkey_cmp(bkey_start_pos(k.k), - b->data->min_key) < 0, - c, "key before start of btree node"); - - cache_set_bug_on(bkey_cmp(k.k->p, - b->data->max_key) > 0, - c, "key past end of btree node"); - - if (bkey_invalid(c, type, k)) { + invalid = btree_bkey_invalid(c, b, k); + if (invalid) { char buf[160]; bch_bkey_val_to_text(c, type, buf, sizeof(buf), k); - cache_set_bug(c, "invalid bkey %s", buf); + cache_set_bug(c, "invalid bkey %s: %s", buf, invalid); return; } diff --git a/drivers/md/bcache/bkey_methods.h b/drivers/md/bcache/bkey_methods.h index a8a5e802160e..a21325b87208 100644 --- a/drivers/md/bcache/bkey_methods.h +++ b/drivers/md/bcache/bkey_methods.h @@ -15,7 +15,8 @@ struct btree; struct bkey; struct bkey_ops { - bool (*key_invalid)(const struct cache_set *, + /* Returns reason for being invalid if invalid, else NULL: */ + const char * (*key_invalid)(const struct cache_set *, struct bkey_s_c); void (*key_debugcheck)(struct cache_set *, struct btree *, struct bkey_s_c); @@ -25,7 +26,10 @@ struct bkey_ops { bool is_extents; }; -bool bkey_invalid(struct cache_set *, enum bkey_type, struct bkey_s_c); +const char *bkey_invalid(struct cache_set *, enum bkey_type, struct bkey_s_c); +const char *btree_bkey_invalid(struct cache_set *, struct btree *, + 
struct bkey_s_c); + void bkey_debugcheck(struct cache_set *, struct btree *, struct bkey_s_c); void bch_bkey_val_to_text(struct cache_set *, enum bkey_type, char *, size_t, struct bkey_s_c); diff --git a/drivers/md/bcache/blockdev.c b/drivers/md/bcache/blockdev.c index 1a788f2fd7c5..20cc8dd39a4f 100644 --- a/drivers/md/bcache/blockdev.c +++ b/drivers/md/bcache/blockdev.c @@ -2,6 +2,7 @@ #include "bcache.h" #include "blockdev.h" #include "btree.h" +#include "error.h" #include "inode.h" #include "request.h" #include "super.h" @@ -719,12 +720,8 @@ int bch_blockdev_volumes_start(struct cache_set *c) inode = bkey_s_c_to_inode_blockdev(k); ret = blockdev_volume_run(c, inode); - if (ret) { - bch_cache_set_error(c, - "can't bring up blockdev volumes: %i", - ret); + if (ret) break; - } } bch_btree_iter_unlock(&iter); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 6edbb06d46f8..3bda6f09e8e1 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -24,6 +24,7 @@ #include "alloc.h" #include "btree.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "gc.h" #include "io.h" @@ -352,8 +353,8 @@ static void bch_btree_init_next(struct cache_set *c, struct btree *b, _end - _data) ^ 0xffffffffffffffffULL; \ }) -#define btree_node_error(b, ca, ptr, fmt, ...) \ - bch_cache_error(ca, \ +#define btree_node_error(b, c, ptr, fmt, ...) \ + cache_set_inconsistent(c, \ "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\ (b)->btree_id, (b)->level, btree_node_root(b) \ ? 
btree_node_root(b)->level : -1, \ @@ -375,14 +376,16 @@ static const char *validate_bset(struct cache_set *c, struct btree *b, return "bset past end of btree node"; if (i != &b->data->keys && !i->u64s) - btree_node_error(b, ca, ptr, "empty set"); + btree_node_error(b, c, ptr, "empty set"); for (k = i->start; k != bset_bkey_last(i);) { struct bkey_tup tup; + struct bkey_s_c u; + const char *invalid; if (!k->u64s) { - btree_node_error(b, ca, ptr, + btree_node_error(b, c, ptr, "KEY_U64s 0: %zu bytes of metadata lost", (void *) bset_bkey_last(i) - (void *) k); @@ -391,7 +394,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b, } if (bkey_next(k) > bset_bkey_last(i)) { - btree_node_error(b, ca, ptr, + btree_node_error(b, c, ptr, "key extends past end of bset"); i->u64s = (u64 *) k - i->_data; @@ -399,16 +402,15 @@ static const char *validate_bset(struct cache_set *c, struct btree *b, } bkey_disassemble(&tup, f, k); + u = bkey_tup_to_s_c(&tup); - if (bkey_invalid(c, btree_node_type(b), - bkey_tup_to_s_c(&tup))) { + invalid = btree_bkey_invalid(c, b, u); + if (invalid) { char buf[160]; - bkey_disassemble(&tup, f, k); bch_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), - bkey_tup_to_s_c(&tup)); - btree_node_error(b, ca, ptr, + buf, sizeof(buf), u); + btree_node_error(b, c, ptr, "invalid bkey %s", buf); i->u64s -= k->u64s; @@ -474,6 +476,8 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b, if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR) goto err; + /* XXX: retry checksum errors */ + err = "bad checksum"; if (b->data->csum != btree_csum_set(b, b->data)) goto err; @@ -542,13 +546,13 @@ out: return; err: set_btree_node_read_error(b); - btree_node_error(b, ca, ptr, "%s", err); + btree_node_error(b, c, ptr, "%s", err); goto out; } static void btree_node_read_endio(struct bio *bio) { - bch_bbio_endio(to_bbio(bio), bio->bi_error, "reading btree"); + bch_bbio_endio(to_bbio(bio)); } static void bch_btree_node_read(struct cache_set *c, 
struct btree *b) @@ -563,9 +567,10 @@ static void bch_btree_node_read(struct cache_set *c, struct btree *b) closure_init_stack(&cl); pick = bch_btree_pick_ptr(c, b); - if (!pick.ca) { + if (cache_set_fatal_err_on(!pick.ca, c, + "no cache device for btree node")) { set_btree_node_read_error(b); - goto missing; + return; } percpu_ref_get(&pick.ca->ref); @@ -583,29 +588,18 @@ static void bch_btree_node_read(struct cache_set *c, struct btree *b) closure_sync(&cl); - if (bio->bi_error || - bch_meta_read_fault("btree")) + if (cache_fatal_io_err_on(bio->bi_error, + pick.ca, "IO error reading bucket %zu", + PTR_BUCKET_NR(pick.ca, &pick.ptr)) || + bch_meta_read_fault("btree")) { set_btree_node_read_error(b); - - bio_put(bio); - - if (btree_node_read_error(b)) - goto err; + goto out; + } bch_btree_node_read_done(c, b, pick.ca, &pick.ptr); bch_time_stats_update(&c->btree_read_time, start_time); - - percpu_ref_put(&pick.ca->ref); - return; - -missing: - bch_cache_set_error(c, "no cache device for btree node"); - percpu_ref_put(&pick.ca->ref); - return; - -err: - bch_cache_error(pick.ca, "IO error reading bucket %zu", - PTR_BUCKET_NR(pick.ca, &pick.ptr)); +out: + bio_put(bio); percpu_ref_put(&pick.ca->ref); } @@ -646,20 +640,16 @@ static void btree_node_write_endio(struct bio *bio) struct btree *b = container_of(cl, struct btree, io); struct bch_write_bio *wbio = to_wbio(bio); - if (bio->bi_error || bch_meta_write_fault("btree")) { + if (cache_fatal_io_err_on(bio->bi_error, wbio->bio.ca, "btree write") || + bch_meta_write_fault("btree")) set_btree_node_write_error(b); - __bch_cache_error(wbio->bio.ca, "IO error %d writing btree", - bio->bi_error); - bch_cache_set_io_error(wbio->bio.ca->set); - } - if (wbio->orig) bio_endio(wbio->orig); else if (wbio->bounce) bch_bio_free_pages(bio); - bch_bbio_endio(to_bbio(bio), bio->bi_error, "writing btree"); + bch_bbio_endio(to_bbio(bio)); } static void do_btree_node_write(struct closure *cl) diff --git a/drivers/md/bcache/debug.c 
b/drivers/md/bcache/debug.c index 99d4657c4f4a..c0fb3cb19086 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -9,6 +9,7 @@ #include "btree.h" #include "buckets.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "inode.h" #include "io.h" @@ -166,13 +167,12 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) void *p1 = kmap_atomic(bv.bv_page); void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page); - cache_set_err_on(memcmp(p1 + bv.bv_offset, - p2 + bv.bv_offset, - bv.bv_len), - dc->disk.c, - "verify failed at dev %s sector %llu", - bdevname(dc->disk_sb.bdev, name), - (uint64_t) bio->bi_iter.bi_sector); + if (memcmp(p1 + bv.bv_offset, + p2 + bv.bv_offset, + bv.bv_len)) + panic("verify failed at dev %s sector %llu\n", + bdevname(dc->disk_sb.bdev, name), + (uint64_t) bio->bi_iter.bi_sector); kunmap_atomic(p1); } @@ -199,7 +199,7 @@ void bch_verify_inode_refs(struct cache_set *c) bch_inode_find_by_inum(c, k.k->p.inode, &inode)) { bch_bkey_val_to_text(c, BTREE_ID_EXTENTS, buf, sizeof(buf), k); - bch_cache_set_error(c, + cache_set_inconsistent(c, "extent for missing inode %llu\n%s", k.k->p.inode, buf); bch_btree_iter_unlock(&iter); @@ -210,7 +210,7 @@ void bch_verify_inode_refs(struct cache_set *c) if (!S_ISREG(inode.v.i_mode) && !S_ISLNK(inode.v.i_mode)) - bch_cache_set_error(c, + cache_set_inconsistent(c, "extent for non regular file, inode %llu mode %u", k.k->p.inode, inode.v.i_mode); @@ -219,7 +219,7 @@ void bch_verify_inode_refs(struct cache_set *c) if (k.k->p.offset > round_up(inode.v.i_size, PAGE_SIZE) >> 9) { bch_bkey_val_to_text(c, BTREE_ID_EXTENTS, buf, sizeof(buf), k); - bch_cache_set_error(c, + cache_set_inconsistent(c, "extent past end of inode %llu: i_size %llu extent\n%s", k.k->p.inode, inode.v.i_size, buf); } @@ -234,7 +234,7 @@ void bch_verify_inode_refs(struct cache_set *c) if (k.k->p.inode != cur_inum && bch_inode_find_by_inum(c, k.k->p.inode, &inode)) { - bch_cache_set_error(c, "dirent 
for missing inode %llu", + cache_set_inconsistent(c, "dirent for missing inode %llu", k.k->p.inode); bch_btree_iter_unlock(&iter); return; @@ -243,7 +243,7 @@ void bch_verify_inode_refs(struct cache_set *c) cur_inum = k.k->p.inode; if (!S_ISDIR(inode.v.i_mode)) - bch_cache_set_error(c, + cache_set_inconsistent(c, "dirent for non directory, inode %llu mode %u", k.k->p.inode, inode.v.i_mode); } @@ -253,7 +253,7 @@ void bch_verify_inode_refs(struct cache_set *c) POS(BCACHE_ROOT_INO, 0), k) { if (k.k->p.inode != cur_inum && bch_inode_find_by_inum(c, k.k->p.inode, &inode)) { - bch_cache_set_error(c, + cache_set_inconsistent(c, "xattr for missing inode %llu", k.k->p.inode); bch_btree_iter_unlock(&iter); @@ -262,11 +262,10 @@ void bch_verify_inode_refs(struct cache_set *c) cur_inum = k.k->p.inode; - if (!S_ISREG(inode.v.i_mode) && - !S_ISDIR(inode.v.i_mode)) - bch_cache_set_error(c, - "xattr for non file/directory, inode %llu mode %u", - k.k->p.inode, inode.v.i_mode); + cache_set_inconsistent_on(!S_ISREG(inode.v.i_mode) && + !S_ISDIR(inode.v.i_mode), c, + "xattr for non file/directory, inode %llu mode %u", + k.k->p.inode, inode.v.i_mode); } bch_btree_iter_unlock(&iter); } diff --git a/drivers/md/bcache/dirent.c b/drivers/md/bcache/dirent.c index c007b7b0d39d..b7554e87ac55 100644 --- a/drivers/md/bcache/dirent.c +++ b/drivers/md/bcache/dirent.c @@ -78,21 +78,22 @@ static int dirent_cmp(struct bkey_s_c_dirent d, return len - q->len ?: memcmp(d.v->d_name, q->name, len); } -static bool bch_dirent_invalid(const struct cache_set *c, struct bkey_s_c k) +static const char *bch_dirent_invalid(const struct cache_set *c, + struct bkey_s_c k) { switch (k.k->type) { case BCH_DIRENT: - if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) - return true; + return bkey_val_bytes(k.k) < sizeof(struct bch_dirent) + ? "value too small" + : NULL; - return false; case BCH_DIRENT_WHITEOUT: - if (bkey_val_bytes(k.k)) - return true; + return bkey_val_bytes(k.k) != 0 + ?
"value size should be zero" + : NULL; - return false; default: - return true; + return "invalid type"; } } diff --git a/drivers/md/bcache/error.c b/drivers/md/bcache/error.c new file mode 100644 index 000000000000..a0ca5ecbba7b --- /dev/null +++ b/drivers/md/bcache/error.c @@ -0,0 +1,134 @@ +#include "bcache.h" +#include "error.h" +#include "io.h" +#include "notify.h" +#include "super.h" + +void bch_inconsistent_error(struct cache_set *c) +{ + switch (c->opts.on_error_action) { + case BCH_ON_ERROR_CONTINUE: + break; + case BCH_ON_ERROR_RO: + if (!test_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags)) { + /* XXX do something better here? */ + bch_cache_set_stop(c); + return; + } + + if (bch_cache_set_read_only(c)) + __bch_cache_set_error(c, "emergency read only"); + break; + case BCH_ON_ERROR_PANIC: + panic("bcache: (%s) panic after error\n", + c->vfs_sb ? c->vfs_sb->s_id : c->uuid); + break; + } +} + +void bch_fatal_error(struct cache_set *c) +{ + if (bch_cache_set_read_only(c)) + printk(KERN_ERR "bcache: %pU emergency read only\n", + c->sb.set_uuid.b); +} + +/* Nonfatal IO errors, IO error/latency accounting: */ + +/* Just does IO error accounting: */ +void bch_account_io_completion(struct cache *ca) +{ + /* + * The halflife of an error is: + * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh + */ + + if (ca->set->error_decay) { + unsigned count = atomic_inc_return(&ca->io_count); + + while (count > ca->set->error_decay) { + unsigned errors; + unsigned old = count; + unsigned new = count - ca->set->error_decay; + + /* + * First we subtract refresh from count; each time we + * successfully do so, we rescale the errors once: + */ + + count = atomic_cmpxchg(&ca->io_count, old, new); + + if (count == old) { + count = new; + + errors = atomic_read(&ca->io_errors); + do { + old = errors; + new = ((uint64_t) errors * 127) / 128; + errors = atomic_cmpxchg(&ca->io_errors, + old, new); + } while (old != errors); + } + } + } +} + +/* IO error accounting and latency accounting: */
+void bch_account_bbio_completion(struct bbio *bio) +{ + struct cache_set *c; + unsigned threshold; + + if (!bio->ca) + return; + + c = bio->ca->set; + threshold = op_is_write(bio_op(&bio->bio)) + ? c->congested_write_threshold_us + : c->congested_read_threshold_us; + + if (threshold && bio->submit_time_us) { + unsigned t = local_clock_us(); + + int us = t - bio->submit_time_us; + int congested = atomic_read(&c->congested); + + if (us > (int) threshold) { + int ms = us / 1024; + c->congested_last_us = t; + + ms = min(ms, CONGESTED_MAX + congested); + atomic_sub(ms, &c->congested); + } else if (congested < 0) + atomic_inc(&c->congested); + } + + bch_account_io_completion(bio->ca); +} + +void bch_nonfatal_io_error_work(struct work_struct *work) +{ + struct cache *ca = container_of(work, struct cache, io_error_work); + unsigned errors = atomic_read(&ca->io_errors); + char buf[BDEVNAME_SIZE]; + + if (errors < ca->set->error_limit) { + bch_notify_cache_error(ca, false); + } else { + bch_notify_cache_error(ca, true); + + mutex_lock(&bch_register_lock); + if (CACHE_STATE(&ca->mi) == CACHE_ACTIVE) { + printk(KERN_ERR "bcache: too many IO errors on %s, going RO\n", + bdevname(ca->disk_sb.bdev, buf)); + bch_cache_read_only(ca); + } + mutex_unlock(&bch_register_lock); + } +} + +void bch_nonfatal_io_error(struct cache *ca) +{ + atomic_add(1 << IO_ERROR_SHIFT, &ca->io_errors); + queue_work(system_long_wq, &ca->io_error_work); +} diff --git a/drivers/md/bcache/error.h b/drivers/md/bcache/error.h new file mode 100644 index 000000000000..ea67bb92aaec --- /dev/null +++ b/drivers/md/bcache/error.h @@ -0,0 +1,202 @@ +#ifndef _BCACHE_ERROR_H +#define _BCACHE_ERROR_H + +#include <linux/printk.h> + +struct cache; +struct cache_set; +struct bbio; + +/* + * XXX: separate out errors that indicate on disk data is inconsistent, and flag + * superblock as such + */ + +/* Error messages: */ + +/* should clean this up */ + +#define __bch_err_fmt(_c, fmt, ...) 
\ + KERN_ERR "bcache (%s): " fmt "\n", \ + ((_c)->vfs_sb ? (_c)->vfs_sb->s_id : (_c)->uuid), ##__VA_ARGS__ + +#define __bch_cache_set_error(c, fmt, ...) \ + printk(__bch_err_fmt(c, fmt, ##__VA_ARGS__)) + +#define __bch_cache_error(ca, fmt, ...) \ +do { \ + char _buf[BDEVNAME_SIZE]; \ + __bch_cache_set_error((ca)->set, "%s: " fmt, \ + bdevname((ca)->disk_sb.bdev, _buf), \ + ##__VA_ARGS__); \ +} while (0) + +/* + * Very fatal logic/inconsistency errors: these indicate that we've majorly + * screwed up at runtime, i.e. it's not likely that it was just caused by the + * data on disk being inconsistent. These BUG(): + * + * XXX: audit and convert to inconsistent() checks + */ + +#define cache_set_bug(c, ...) \ +do { \ + __bch_cache_set_error(c, __VA_ARGS__); \ + BUG(); \ +} while (0) + +#define cache_set_bug_on(cond, c, ...) \ +do { \ + if (cond) \ + cache_set_bug(c, __VA_ARGS__); \ +} while (0) + +/* + * Inconsistency errors: The on disk data is inconsistent. If these occur during + * initial recovery, they don't indicate a bug in the running code - we walk all + * the metadata before modifying anything. If they occur at runtime, they + * indicate either a bug in the running code or (less likely) data is being + * silently corrupted under us. + * + * XXX: audit all inconsistent errors and make sure they're all recoverable, in + * BCH_ON_ERROR_CONTINUE mode + */ + +void bch_inconsistent_error(struct cache_set *); + +#define cache_set_inconsistent(c, ...) \ +do { \ + __bch_cache_set_error(c, __VA_ARGS__); \ + bch_inconsistent_error(c); \ +} while (0) + +#define cache_set_inconsistent_on(cond, c, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + cache_set_inconsistent(c, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Later we might want to mark only the particular device inconsistent, not the + * entire cache set: + */ + +#define cache_inconsistent(ca, ...) 
\ +do { \ + __bch_cache_error(ca, __VA_ARGS__); \ + bch_inconsistent_error((ca)->set); \ +} while (0) + +#define cache_inconsistent_on(cond, ca, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + cache_inconsistent(ca, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Fatal errors: these don't indicate a bug, but we can't continue running in RW + * mode - pretty much just due to metadata IO errors: + */ + +void bch_fatal_error(struct cache_set *); + +#define cache_set_fatal_error(c, ...) \ +do { \ + __bch_cache_set_error(c, __VA_ARGS__); \ + bch_fatal_error(c); \ +} while (0) + +#define cache_set_fatal_err_on(cond, c, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + cache_set_fatal_error(c, __VA_ARGS__); \ + _ret; \ +}) + +#define cache_fatal_error(ca, ...) \ +do { \ + __bch_cache_error(ca, __VA_ARGS__); \ + bch_fatal_error(c); \ +} while (0) + +#define cache_fatal_io_error(ca, fmt, ...) \ +do { \ + char _buf[BDEVNAME_SIZE]; \ + \ + printk_ratelimited(__bch_err_fmt((ca)->set, "fatal IO error on %s for " fmt,\ + bdevname((ca)->disk_sb.bdev, _buf),\ + ##__VA_ARGS__)); \ + bch_fatal_error((ca)->set); \ +} while (0) + +#define cache_fatal_io_err_on(cond, ca, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + cache_fatal_io_error(ca, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Nonfatal IO errors: either recoverable metadata IO (because we have + * replicas), or data IO - we need to log it and print out a message, but we + * don't (necessarily) want to shut down the fs: + */ + +void bch_account_io_completion(struct cache *); +void bch_account_bbio_completion(struct bbio *); + +void bch_nonfatal_io_error_work(struct work_struct *); + +/* Does the error handling without logging a message */ +void bch_nonfatal_io_error(struct cache *); + +#if 0 +#define cache_set_nonfatal_io_error(c, ...) 
\ +do { \ + __bch_cache_set_error(c, __VA_ARGS__); \ + bch_nonfatal_io_error(c); \ +} while (0) +#endif + +/* Logs message and handles the error: */ +#define cache_nonfatal_io_error(ca, fmt, ...) \ +do { \ + char _buf[BDEVNAME_SIZE]; \ + \ + printk_ratelimited(__bch_err_fmt((ca)->set, "IO error on %s for " fmt,\ + bdevname((ca)->disk_sb.bdev, _buf),\ + ##__VA_ARGS__)); \ + bch_nonfatal_io_error(ca); \ +} while (0) + +#define cache_nonfatal_io_err_on(cond, ca, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + cache_nonfatal_io_error(ca, __VA_ARGS__); \ + _ret; \ +}) + +/* kill? */ + +#define __bcache_io_error(c, fmt, ...) \ + printk_ratelimited(__bch_err_fmt(c, "IO error: " fmt, ##__VA_ARGS__)) + +#define bcache_io_error(c, bio, fmt, ...) \ +do { \ + __bcache_io_error(c, fmt, ##__VA_ARGS__); \ + (bio)->bi_error = -EIO; \ +} while (0) + +#endif /* _BCACHE_ERROR_H */ diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 4d5889d6d107..25a63aefb6a8 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -9,6 +9,7 @@ #include "btree.h" #include "debug.h" #include "dirent.h" +#include "error.h" #include "extents.h" #include "gc.h" #include "inode.h" @@ -340,8 +341,8 @@ static size_t extent_print_ptrs(struct cache_set *c, char *buf, /* Btree ptrs */ -static const char *bch_btree_ptr_invalid_reason(const struct cache_set *c, - struct bkey_s_c k) +static const char *bch_btree_ptr_invalid(const struct cache_set *c, + struct bkey_s_c k) { if (bkey_extent_is_cached(k.k)) return "cached"; @@ -383,11 +384,6 @@ static const char *bch_btree_ptr_invalid_reason(const struct cache_set *c, } } -static bool bch_btree_ptr_invalid(const struct cache_set *c, struct bkey_s_c k) -{ - return bch_btree_ptr_invalid_reason(c, k); -} - static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b, struct bkey_s_c k) { @@ -401,11 +397,6 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b, unsigned replicas = 0; bool 
bad; - if (bkey_extent_is_cached(k.k)) { - btree_bug(b, "btree ptr marked as cached"); - return; - } - rcu_read_lock(); extent_for_each_online_device(c, e, ptr, ca) { @@ -444,11 +435,11 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b, return; err: bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); - btree_bug(b, "%s btree pointer %s: bucket %zi prio %i " - "gen %i last_gc %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - g->read_prio, PTR_BUCKET_GEN(ca, ptr), - g->oldest_gen, g->mark.counter); + cache_set_bug(c, "%s btree pointer %s: bucket %zi prio %i " + "gen %i last_gc %i mark %08x", + err, buf, PTR_BUCKET_NR(ca, ptr), + g->read_prio, PTR_BUCKET_GEN(ca, ptr), + g->oldest_gen, g->mark.counter); rcu_read_unlock(); } @@ -463,7 +454,7 @@ static void bch_btree_ptr_to_text(struct cache_set *c, char *buf, if (bkey_extent_is_data(k.k)) out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); - invalid = bch_btree_ptr_invalid_reason(c, k); + invalid = bch_btree_ptr_invalid(c, k); if (invalid) p(" invalid: %s", invalid); #undef p @@ -480,23 +471,19 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b) rcu_read_lock(); extent_for_each_online_device_crc(c, e, crc, ptr, ca) { - if (crc) { - bch_cache_error(ca, + if (cache_set_inconsistent_on(crc, c, "btree node pointer with crc at btree %u level %u/%u bucket %zu", b->btree_id, b->level, btree_node_root(b) ? btree_node_root(b)->level : -1, - PTR_BUCKET_NR(ca, ptr)); + PTR_BUCKET_NR(ca, ptr))) break; - } - if (ptr_stale(ca, ptr)) { - bch_cache_error(ca, + if (cache_inconsistent_on(ptr_stale(ca, ptr), ca, "stale btree node pointer at btree %u level %u/%u bucket %zu", b->btree_id, b->level, btree_node_root(b) ? 
btree_node_root(b)->level : -1, - PTR_BUCKET_NR(ca, ptr)); + PTR_BUCKET_NR(ca, ptr))) continue; - } percpu_ref_get(&ca->ref); rcu_read_unlock(); @@ -1306,8 +1293,8 @@ out: return inserted; } -static const char *bch_extent_invalid_reason(const struct cache_set *c, - struct bkey_s_c k) +static const char *bch_extent_invalid(const struct cache_set *c, + struct bkey_s_c k) { if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) return "value too big"; @@ -1360,11 +1347,6 @@ invalid: } } -static bool bch_extent_invalid(const struct cache_set *c, struct bkey_s_c k) -{ - return bch_extent_invalid_reason(c, k); -} - static void bch_extent_debugcheck(struct cache_set *c, struct btree *b, struct bkey_s_c k) { @@ -1379,6 +1361,15 @@ static void bch_extent_debugcheck(struct cache_set *c, struct btree *b, unsigned ptrs_per_tier[CACHE_TIERS]; unsigned i, tier, replicas = 0; + /* + * XXX: we should be doing most/all of these checks at startup time, + * where we check bkey_invalid() in btree_node_read_done() + * + * But note that we can't check for stale pointers or incorrect gc marks + * until after journal replay is done (it might be an extent that's + * going to get overwritten during replay) + */ + memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier)); mi = cache_member_info_get(c); @@ -1491,7 +1482,7 @@ static void bch_extent_to_text(struct cache_set *c, char *buf, if (bkey_extent_is_data(k.k)) out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); - invalid = bch_extent_invalid_reason(c, k); + invalid = bch_extent_invalid(c, k); if (invalid) p(" invalid: %s", invalid); #undef p diff --git a/drivers/md/bcache/fs-gc.c b/drivers/md/bcache/fs-gc.c index 3245e1063898..54bdf550ac93 100644 --- a/drivers/md/bcache/fs-gc.c +++ b/drivers/md/bcache/fs-gc.c @@ -2,6 +2,7 @@ #include "bcache.h" #include "btree.h" #include "dirent.h" +#include "error.h" #include "fs.h" #include "inode.h" #include "keylist.h" @@ -74,13 +75,13 @@ static int bch_gc_do_inode(struct cache_set *c, struct 
btree_iter *iter, struct bkey_i_inode update; int ret; - cache_set_err_on(inode.v->i_nlink < link.count, c, + cache_set_inconsistent_on(inode.v->i_nlink < link.count, c, "i_link too small (%u < %u, type %i)", inode.v->i_nlink, link.count + link.dir_count, mode_to_type(inode.v->i_mode)); if (!link.count) { - cache_set_err_on(S_ISDIR(inode.v->i_mode) && + cache_set_inconsistent_on(S_ISDIR(inode.v->i_mode) && bch_empty_dir(c, inode.k->p.inode), c, "non empty directory with link count 0,inode nlink %u, dir links found %u", inode.v->i_nlink, link.dir_count); @@ -139,7 +140,7 @@ static int bch_gc_walk_inodes(struct cache_set *c, u64 pos, struct nlink *links) break; while (i < k.k->p.inode - pos) { - cache_set_err_on(links[i].count, c, + cache_set_inconsistent_on(links[i].count, c, "missing inode %llu", pos + i); i++; @@ -157,7 +158,7 @@ static int bch_gc_walk_inodes(struct cache_set *c, u64 pos, struct nlink *links) break; default: - cache_set_err_on(links[i].count, c, + cache_set_inconsistent_on(links[i].count, c, "missing inode %llu", pos + i); break; diff --git a/drivers/md/bcache/fs.c b/drivers/md/bcache/fs.c index e6c689974de1..bccc2b176533 100644 --- a/drivers/md/bcache/fs.c +++ b/drivers/md/bcache/fs.c @@ -2057,7 +2057,7 @@ static int bch_remount(struct super_block *sb, int *flags, char *data) const char *err = NULL; if (opts.read_only) { - bch_cache_set_read_only(c); + bch_cache_set_read_only_sync(c); sb->s_flags |= MS_RDONLY; } else { @@ -2103,9 +2103,11 @@ static struct dentry *bch_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct cache_set *c; + struct cache *ca; struct super_block *sb; struct inode *inode; struct cache_set_opts opts; + unsigned i; int ret; if (!parse_options(&opts, flags, data)) @@ -2130,6 +2132,17 @@ static struct dentry *bch_mount(struct file_system_type *fs_type, sb->s_magic = BCACHE_STATFS_MAGIC; sb->s_time_gran = 1; sb->s_fs_info = c; + c->vfs_sb = sb; + + rcu_read_lock(); + 
for_each_cache_rcu(ca, c, i) { + char b[BDEVNAME_SIZE]; + + strlcpy(sb->s_id, bdevname(ca->disk_sb.bdev, b), + sizeof(sb->s_id)); + break; + } + rcu_read_unlock(); if (opts.posix_acl < 0) sb->s_flags |= MS_POSIXACL; diff --git a/drivers/md/bcache/gc.c b/drivers/md/bcache/gc.c index c08dd74a1015..74ac042c82b5 100644 --- a/drivers/md/bcache/gc.c +++ b/drivers/md/bcache/gc.c @@ -8,6 +8,7 @@ #include "btree.h" #include "buckets.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "gc.h" #include "journal.h" @@ -79,13 +80,6 @@ static inline bool btree_node_has_ptrs(struct btree *b) bool btree_gc_mark_node(struct cache_set *c, struct btree *b) { struct bkey_format *f = &b->keys.format; - struct bset_tree *t; - - for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) - btree_bug_on(t->size && - bset_written(&b->keys, t) && - bkey_cmp_packed(f, &b->key.k, &t->end) < 0, - b, "found short btree key in gc"); if (btree_node_has_ptrs(b)) { struct btree_node_iter iter; @@ -128,8 +122,12 @@ static int bch_gc_btree(struct cache_set *c, enum btree_id btree_id) for_each_btree_node(&iter, c, btree_id, POS_MIN, b) { if (!b->level) { - cache_set_bug_on(bkey_cmp(b->data->min_key, next_min), - c, + /* + * XXX: this check should be elsewhere - also, we should + * be checking all nodes, not just leaf nodes + */ + cache_set_inconsistent_on(bkey_cmp(b->data->min_key, + next_min), c, "btree node has incorrect min key: %llu:%llu != %llu:%llu", b->data->min_key.inode, b->data->min_key.offset, diff --git a/drivers/md/bcache/inode.c b/drivers/md/bcache/inode.c index 53b699920e1d..38189277ace3 100644 --- a/drivers/md/bcache/inode.c +++ b/drivers/md/bcache/inode.c @@ -51,30 +51,31 @@ ssize_t bch_inode_status(char *buf, size_t len, const struct bkey *k) } } -static bool bch_inode_invalid(const struct cache_set *c, struct bkey_s_c k) +static const char *bch_inode_invalid(const struct cache_set *c, + struct bkey_s_c k) { if (k.k->p.offset) - return true; + return "nonzero 
offset"; switch (k.k->type) { case BCH_INODE_FS: if (bkey_val_bytes(k.k) != sizeof(struct bch_inode)) - return true; + return "incorrect value size"; if (k.k->p.inode < BLOCKDEV_INODE_MAX) - return true; + return "fs inode in blockdev range"; - return false; + return NULL; case BCH_INODE_BLOCKDEV: if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev)) - return true; + return "incorrect value size"; if (k.k->p.inode >= BLOCKDEV_INODE_MAX) - return true; + return "blockdev inode in fs range"; - return false; + return NULL; default: - return true; + return "invalid type"; } } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 1a4736111b44..0d15ec0ed78e 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -12,6 +12,7 @@ #include "buckets.h" #include "clock.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "gc.h" #include "io.h" @@ -195,106 +196,13 @@ static void bch_bbio_reset(struct bbio *b) /* IO errors */ -void bch_count_io_errors(struct cache *ca, int error, const char *m) -{ - /* - * The halflife of an error is: - * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh - */ - - if (ca->set->error_decay) { - unsigned count = atomic_inc_return(&ca->io_count); - - while (count > ca->set->error_decay) { - unsigned errors; - unsigned old = count; - unsigned new = count - ca->set->error_decay; - - /* - * First we subtract refresh from count; each time we - * succesfully do so, we rescale the errors once: - */ - - count = atomic_cmpxchg(&ca->io_count, old, new); - - if (count == old) { - count = new; - - errors = atomic_read(&ca->io_errors); - do { - old = errors; - new = ((uint64_t) errors * 127) / 128; - errors = atomic_cmpxchg(&ca->io_errors, - old, new); - } while (old != errors); - } - } - } - - if (error) { - char buf[BDEVNAME_SIZE]; - - atomic_add(1 << IO_ERROR_SHIFT, &ca->io_errors); - queue_work(system_long_wq, &ca->io_error_work); - printk_ratelimited(KERN_ERR "%s: IO error on %s", - 
bdevname(ca->disk_sb.bdev, buf), m); - } -} - -void bch_cache_io_error_work(struct work_struct *work) -{ - struct cache *ca = container_of(work, struct cache, io_error_work); - unsigned errors = atomic_read(&ca->io_errors); - char buf[BDEVNAME_SIZE]; - - if (errors < ca->set->error_limit) { - bch_notify_cache_error(ca, false); - } else { - bch_notify_cache_error(ca, true); - printk_ratelimited(KERN_ERR "%s: too many IO errors, going RO", - bdevname(ca->disk_sb.bdev, buf)); - queue_work(system_long_wq, &ca->read_only_work); - } -} - -void bch_bbio_count_io_errors(struct bbio *bio, int error, const char *m) -{ - struct cache_set *c; - unsigned threshold; - - if (!bio->ca) - return; - - c = bio->ca->set; - threshold = op_is_write(bio_op(&bio->bio)) - ? c->congested_write_threshold_us - : c->congested_read_threshold_us; - - if (threshold && bio->submit_time_us) { - unsigned t = local_clock_us(); - - int us = t - bio->submit_time_us; - int congested = atomic_read(&c->congested); - - if (us > (int) threshold) { - int ms = us / 1024; - c->congested_last_us = t; - - ms = min(ms, CONGESTED_MAX + congested); - atomic_sub(ms, &c->congested); - } else if (congested < 0) - atomic_inc(&c->congested); - } - - bch_count_io_errors(bio->ca, error, m); -} - -void bch_bbio_endio(struct bbio *bio, int error, const char *m) +void bch_bbio_endio(struct bbio *bio) { struct closure *cl = bio->bio.bi_private; struct cache *ca = bio->ca; - bch_bbio_count_io_errors(bio, error, m); + bch_account_bbio_completion(bio); + bio_put(&bio->bio); if (ca) percpu_ref_put(&ca->ref); @@ -622,10 +530,10 @@ static void bch_write_endio(struct bio *bio) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_write_bio *wbio = to_wbio(bio); - if (bio->bi_error) { + if (cache_nonfatal_io_err_on(bio->bi_error, wbio->bio.ca, + "data write")) { /* TODO: We could try to recover from this. 
*/ if (!bkey_extent_is_cached(&op->insert_key.k)) { - __bcache_io_error(op->c, "IO error writing data"); op->error = bio->bi_error; } else if (!op->replace) set_closure_fn(cl, bch_write_error, op->c->wq); @@ -638,7 +546,7 @@ static void bch_write_endio(struct bio *bio) else if (wbio->bounce) bch_bio_free_pages_pool(op->c, bio); - bch_bbio_endio(&wbio->bio, bio->bi_error, "writing data to cache"); + bch_bbio_endio(&wbio->bio); } static const unsigned bch_crc_size[] = { @@ -1445,11 +1353,7 @@ static int bio_checksum_uncompress(struct bch_read_bio *rbio) if (rbio->csum_type != BCH_CSUM_NONE && rbio->csum != checksum_bio(bio, rbio->csum_type)) { - /* - * XXX: bch_bbio_count_io_errors() isn't counting checksum - * errors - */ - __bcache_io_error(rbio->c, "checksum error"); + cache_nonfatal_io_error(rbio->bio.ca, "checksum error"); return -EIO; } @@ -1537,7 +1441,10 @@ static void bch_read_endio(struct bio *bio) ptr_stale(rbio->bio.ca, &rbio->bio.ptr); int error = bio->bi_error; - bch_bbio_count_io_errors(&rbio->bio, error, "reading from cache"); + bch_account_bbio_completion(&rbio->bio); + + cache_nonfatal_io_err_on(error, rbio->bio.ca, "data read"); + percpu_ref_put(&rbio->bio.ca->ref); if (error) diff --git a/drivers/md/bcache/io.h b/drivers/md/bcache/io.h index 5c819c77a6ec..d424ddcf9bfd 100644 --- a/drivers/md/bcache/io.h +++ b/drivers/md/bcache/io.h @@ -140,10 +140,7 @@ enum bch_read_flags { int bch_read(struct cache_set *, struct bio *, u64); -void bch_cache_io_error_work(struct work_struct *); -void bch_count_io_errors(struct cache *, int, const char *); -void bch_bbio_count_io_errors(struct bbio *, int, const char *); -void bch_bbio_endio(struct bbio *, int, const char *); +void bch_bbio_endio(struct bbio *); void bch_generic_make_request(struct bio *, struct cache_set *); void bch_bio_submit_work(struct work_struct *); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 3cfa378ecd6f..2625e8d4b550 100644 --- a/drivers/md/bcache/journal.c 
+++ b/drivers/md/bcache/journal.c @@ -8,6 +8,7 @@ #include "buckets.h" #include "btree.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "gc.h" #include "io.h" @@ -98,11 +99,11 @@ struct bkey_i *bch_journal_find_btree_root(struct cache_set *c, struct jset *j, k = jkeys->start; *level = jkeys->level; - if (!jkeys->u64s || jkeys->u64s != k->k.u64s || - bkey_invalid(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(k))) { - bch_cache_set_error(c, "invalid btree root in journal"); + if (cache_set_inconsistent_on(!jkeys->u64s || + jkeys->u64s != k->k.u64s || + bkey_invalid(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(k)), + c, "invalid btree root in journal")) return NULL; - } *level = jkeys->level; return k; @@ -217,7 +218,7 @@ static int __bch_journal_seq_blacklisted(struct cache_set *c, u64 seq, if (seq <= j->seq) return 0; - cache_set_err_on(seq > j->seq + 1, c, + cache_set_inconsistent_on(seq > j->seq + 1, c, "bset journal seq too far in the future: %llu > %llu", seq, j->seq); @@ -357,38 +358,33 @@ static enum { got = j->version; expect = BCACHE_JSET_VERSION; - if (got != expect) { - __bch_cache_error(ca, + + if (cache_inconsistent_on(got != expect, ca, "bad journal version (got %llu expect %llu) sector %lluu", - got, expect, sector); + got, expect, sector)) return JOURNAL_ENTRY_BAD; - } - if (bytes > bucket_sectors_left << 9 || - bytes > PAGE_SIZE << JSET_BITS) { - __bch_cache_error(ca, + if (cache_inconsistent_on(bytes > bucket_sectors_left << 9 || + bytes > PAGE_SIZE << JSET_BITS, ca, "journal entry too big (%zu bytes), sector %lluu", - bytes, sector); + bytes, sector)) return JOURNAL_ENTRY_BAD; - } if (bytes > sectors_read << 9) return JOURNAL_ENTRY_REREAD; + /* XXX: retry on checksum error */ + got = j->csum; expect = csum_set(j, JSET_CSUM_TYPE(j)); - if (got != expect) { - __bch_cache_error(ca, + if (cache_inconsistent_on(got != expect, ca, "journal checksum bad (got %llu expect %llu), sector %lluu", - got, expect, sector); + got, expect, sector)) return 
JOURNAL_ENTRY_BAD; - } - if (j->last_seq > j->seq) { - __bch_cache_error(ca, - "invalid journal entry: last_seq > seq"); + if (cache_inconsistent_on(j->last_seq > j->seq, ca, + "invalid journal entry: last_seq > seq")) return JOURNAL_ENTRY_BAD; - } return JOURNAL_ENTRY_OK; } @@ -427,12 +423,12 @@ reread: bch_bio_map(bio, data); ret = submit_bio_wait(bio); - if (bch_meta_read_fault("journal")) + + if (cache_fatal_io_err_on(ret, ca, + "journal read from sector %llu", + sector + bucket_offset) || + bch_meta_read_fault("journal")) { ret = -EIO; - if (ret) { - __bch_cache_error(ca, - "IO error %d reading journal from bucket_offset %llu", - ret, sector + bucket_offset); goto err; } @@ -965,13 +961,13 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list) journal_seq_blacklist_find(j, cur_seq)) cur_seq++; - cache_set_err_on(journal_seq_blacklist_find(j, i->j.seq), c, + cache_set_inconsistent_on(journal_seq_blacklist_find(j, i->j.seq), c, "found blacklisted journal entry %llu", i->j.seq); mutex_unlock(&j->blacklist_lock); - cache_set_err_on(i->j.seq != cur_seq, c, + cache_set_inconsistent_on(i->j.seq != cur_seq, c, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", cur_seq, i->j.seq - 1, last_seq(j), end_seq); @@ -1353,13 +1349,10 @@ static void journal_write_endio(struct bio *bio) struct journal_write *w = bio->bi_private; struct journal *j = w->j; - if (bio->bi_error || bch_meta_write_fault("journal")) { + if (cache_fatal_io_err_on(bio->bi_error, ca, "journal write") || + bch_meta_write_fault("journal")) { set_bit(JOURNAL_ERROR, &j->flags); __journal_entry_close(j, JOURNAL_ENTRY_ERROR); - - __bch_cache_error(ca, "IO error %d writing journal", - bio->bi_error); - bch_cache_set_io_error(ca->set); } closure_put(&j->io); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 13e08ed237ce..2397fdc47500 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -28,6 +28,7 @@ #include "btree.h" #include "clock.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "io.h" #include "journal.h" diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 20b79dbdbc7e..efa03f795bab 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -12,6 +12,7 @@ #include "btree.h" #include "clock.h" #include "debug.h" +#include "error.h" #include "fs-gc.h" #include "gc.h" #include "inode.h" @@ -59,6 +60,7 @@ static int bch_chardev_major; static struct class *bch_chardev_class; static struct device *bch_chardev; static DEFINE_IDR(bch_chardev_minor); +static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); struct workqueue_struct *bcache_io_wq; @@ -491,7 +493,12 @@ static void write_super_endio(struct bio *bio) { struct cache *ca = bio->bi_private; - bch_count_io_errors(ca, bio->bi_error, "writing superblock"); + /* XXX: return errors directly */ + + cache_fatal_io_err_on(bio->bi_error, ca, "superblock write"); + + bch_account_io_completion(ca); + closure_put(&ca->set->sb_write); percpu_ref_put(&ca->ref); } @@ -690,15 +697,41 @@ static void bch_writes_disabled(struct percpu_ref *writes) complete(&c->write_disable_complete); } -void 
bch_cache_set_read_only(struct cache_set *c) +static void bch_cache_set_read_only_work(struct work_struct *work) { - lockdep_assert_held(&bch_register_lock); + struct cache_set *c = + container_of(work, struct cache_set, read_only_work); + + init_completion(&c->write_disable_complete); + percpu_ref_put(&c->writes); + + del_timer_sync(&c->foreground_write_wakeup); + cancel_delayed_work_sync(&c->pd_controllers_update); + + c->foreground_write_pd.rate.rate = UINT_MAX; + bch_wake_delayed_writes((unsigned long) c); + + /* Wait for outstanding writes to complete: */ + wait_for_completion(&c->write_disable_complete); + + __bch_cache_set_read_only(c); + bch_notify_cache_set_read_only(c); + trace_bcache_cache_set_read_only_done(c); + + set_bit(CACHE_SET_RO_COMPLETE, &c->flags); + wake_up(&bch_read_only_wait); +} + +bool bch_cache_set_read_only(struct cache_set *c) +{ if (test_and_set_bit(CACHE_SET_RO, &c->flags)) - return; + return false; trace_bcache_cache_set_read_only(c); + percpu_ref_get(&c->writes); + /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: @@ -707,20 +740,18 @@ void bch_cache_set_read_only(struct cache_set *c) * allocated space can still happen until stopping the allocator in * bch_cache_allocator_stop()). 
*/ - init_completion(&c->write_disable_complete); percpu_ref_kill(&c->writes); - bch_wake_delayed_writes((unsigned long) c); - del_timer_sync(&c->foreground_write_wakeup); - cancel_delayed_work_sync(&c->pd_controllers_update); - - /* Wait for outstanding writes to complete: */ - wait_for_completion(&c->write_disable_complete); + queue_work(system_unbound_wq, &c->read_only_work); + return true; +} - __bch_cache_set_read_only(c); +void bch_cache_set_read_only_sync(struct cache_set *c) +{ + bch_cache_set_read_only(c); - bch_notify_cache_set_read_only(c); - trace_bcache_cache_set_read_only_done(c); + wait_event(bch_read_only_wait, + test_bit(CACHE_SET_RO_COMPLETE, &c->flags)); } static const char *__bch_cache_set_read_write(struct cache_set *c) @@ -768,7 +799,7 @@ const char *bch_cache_set_read_write(struct cache_set *c) lockdep_assert_held(&bch_register_lock); - if (!test_bit(CACHE_SET_RO, &c->flags)) + if (!test_bit(CACHE_SET_RO_COMPLETE, &c->flags)) return NULL; for_each_cache(ca, c, i) @@ -783,6 +814,8 @@ const char *bch_cache_set_read_write(struct cache_set *c) return err; percpu_ref_reinit(&c->writes); + + clear_bit(CACHE_SET_RO_COMPLETE, &c->flags); clear_bit(CACHE_SET_RO, &c->flags); return NULL; @@ -791,38 +824,6 @@ err: return err; } -static void bch_cache_set_read_only_work(struct work_struct *work) -{ - struct cache_set *c = - container_of(work, struct cache_set, read_only_work); - - mutex_lock(&bch_register_lock); - bch_cache_set_read_only(c); - mutex_unlock(&bch_register_lock); -} - -void bch_cache_set_io_error(struct cache_set *c) -{ - pr_err("%pU going read only", c->sb.set_uuid.b); - schedule_work(&c->read_only_work); -} - -void bch_cache_set_fail(struct cache_set *c) -{ - switch (c->opts.on_error_action) { - case BCH_ON_ERROR_CONTINUE: - break; - case BCH_ON_ERROR_RO: - pr_err("%pU going read only", c->sb.set_uuid.b); - schedule_work(&c->read_only_work); - break; - case BCH_ON_ERROR_PANIC: - panic("bcache: %pU panic after error\n", - 
c->sb.set_uuid.b); - break; - } -} - /* Cache set startup/shutdown: */ void bch_cache_set_release(struct kobject *kobj) @@ -893,7 +894,7 @@ static void cache_set_flush(struct closure *cl) device_unregister(c->chardev); mutex_lock(&bch_register_lock); - bch_cache_set_read_only(c); + bch_cache_set_read_only_sync(c); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); @@ -986,6 +987,8 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, if (cache_sb_to_cache_set(c, sb)) goto err; + scnprintf(c->uuid, sizeof(c->uuid), "%pU", &c->sb.user_uuid); + c->opts = (struct cache_set_opts) { .read_only = 0, .on_error_action = CACHE_ERROR_ACTION(&c->sb), @@ -1145,7 +1148,7 @@ static const char *run_cache_set(struct cache_set *c) lockdep_assert_held(&bch_register_lock); BUG_ON(test_bit(CACHE_SET_RUNNING, &c->flags)); - /* We don't want bch_cache_set_error() to free underneath us */ + /* We don't want bch_fatal_error() to free underneath us */ closure_get(&c->caching); /* @@ -1307,7 +1310,7 @@ static const char *run_cache_set(struct cache_set *c) bch_prio_timer_start(c, WRITE); if (c->opts.read_only) { - bch_cache_set_read_only(c); + bch_cache_set_read_only_sync(c); } else { err = __bch_cache_set_read_write(c); if (err) @@ -1322,7 +1325,9 @@ static const char *run_cache_set(struct cache_set *c) bcache_write_super(c); - bch_blockdev_volumes_start(c); + err = "can't bring up blockdev volumes"; + if (bch_blockdev_volumes_start(c)) + goto err; bch_debug_init_cache_set(c); @@ -1467,9 +1472,9 @@ void bch_cache_read_only(struct cache *ca) return; if (!cache_may_remove(ca)) { - pr_warning("Required member %s for %pU going RO, cache set going RO", - buf, &c->sb.set_uuid); - bch_cache_set_read_only(c); + printk(__bch_err_fmt(c, "required member %s going RO, forcing fs RO", + buf)); + bch_cache_set_read_only_sync(c); } /* @@ -1484,17 +1489,6 @@ void bch_cache_read_only(struct cache *ca) bch_cache_member_info_update(ca); } -static void bch_cache_read_only_work(struct work_struct 
*work) -{ - struct cache *ca = container_of(work, struct cache, read_only_work); - - /* Going RO because of an error: */ - - mutex_lock(&bch_register_lock); - bch_cache_read_only(ca); - mutex_unlock(&bch_register_lock); -} - static const char *__bch_cache_read_write(struct cache *ca) { const char *err; @@ -1824,7 +1818,6 @@ static const char *cache_alloc(struct bcache_superblock *sb, ca->self.devices[0] = ca; INIT_WORK(&ca->free_work, bch_cache_free_work); - INIT_WORK(&ca->read_only_work, bch_cache_read_only_work); INIT_WORK(&ca->remove_work, bch_cache_remove_work); bio_init(&ca->journal.bio); ca->journal.bio.bi_max_vecs = 8; @@ -1837,7 +1830,7 @@ static const char *cache_alloc(struct bcache_superblock *sb, ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); - INIT_WORK(&ca->io_error_work, bch_cache_io_error_work); + INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work); err = "dynamic fault"; if (cache_set_init_fault("cache_alloc")) @@ -2287,6 +2280,9 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) list_for_each_entry(c, &bch_cache_sets, list) bch_cache_set_read_only(c); + list_for_each_entry(c, &bch_cache_sets, list) + bch_cache_set_read_only_sync(c); + mutex_unlock(&bch_register_lock); } diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h index 258152a42014..15b03c6e54a2 100644 --- a/drivers/md/bcache/super.h +++ b/drivers/md/bcache/super.h @@ -152,9 +152,6 @@ const char *validate_super(struct bcache_superblock *, struct cache_sb *); void bch_cache_member_info_update(struct cache *); -void bch_cache_set_io_error(struct cache_set *); -void bch_cache_set_fail(struct cache_set *); - void bch_cache_set_release(struct kobject *); void bch_cache_release(struct kobject *); @@ -174,7 +171,8 @@ const char *bch_register_cache_set(char * const *, unsigned, struct cache_set_opts, struct cache_set **); -void bch_cache_set_read_only(struct cache_set *); +bool bch_cache_set_read_only(struct cache_set *); +void 
bch_cache_set_read_only_sync(struct cache_set *); const char *bch_cache_set_read_write(struct cache_set *); void bch_cache_read_only(struct cache *); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 34bec1106f0e..892306f1b2e9 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -10,6 +10,7 @@ #include "btree.h" #include "clock.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "io.h" #include "keybuf.h" @@ -168,9 +169,9 @@ static void read_dirty_endio(struct bio *bio) { struct dirty_io *io = container_of(bio, struct dirty_io, bio); - bch_count_io_errors(io->ca, bio->bi_error, - "reading dirty data from cache"); - percpu_ref_put(&io->ca->ref); + cache_nonfatal_io_err_on(bio->bi_error, io->ca, "writeback read"); + + bch_account_io_completion(io->ca); if (ptr_stale(io->ca, &io->ptr)) bio->bi_error = -EINTR; diff --git a/drivers/md/bcache/xattr.c b/drivers/md/bcache/xattr.c index 22e728c18121..2f004c3e69a7 100644 --- a/drivers/md/bcache/xattr.c +++ b/drivers/md/bcache/xattr.c @@ -75,21 +75,22 @@ static int xattr_cmp(const struct bch_xattr *xattr, memcmp(xattr->x_name, q->name, q->len); } -static bool bch_xattr_invalid(const struct cache_set *c, struct bkey_s_c k) +static const char *bch_xattr_invalid(const struct cache_set *c, + struct bkey_s_c k) { switch (k.k->type) { case BCH_XATTR: - if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) - return true; + return bkey_val_bytes(k.k) < sizeof(struct bch_xattr) + ? "value too small" + : NULL; - return false; case BCH_XATTR_WHITEOUT: - if (bkey_val_bytes(k.k)) - return true; + return bkey_val_bytes(k.k) != 0 + ? "value size should be zero" + : NULL; - return false; default: - return true; + return "invalid type"; } } |