diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2016-11-21 12:50:21 -0900 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-01-18 21:40:57 -0900 |
commit | 618ce13a2f1c0dc57fdd516199bcff196320a63d (patch) | |
tree | 6cc85c9b99937db9d61de13c94e8beb7da7fc7a6 | |
parent | be7f94b9e8779db37bdd411747e77eb33690a55f (diff) |
bcache: fix a copygc deadlock
only allocations that actually need it should be using the btree reserve
-rw-r--r-- | drivers/md/bcache/alloc.c | 31 | ||||
-rw-r--r-- | drivers/md/bcache/alloc.h | 5 | ||||
-rw-r--r-- | drivers/md/bcache/alloc_types.h | 5 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/btree_gc.c | 5 | ||||
-rw-r--r-- | drivers/md/bcache/btree_update.c | 50 | ||||
-rw-r--r-- | drivers/md/bcache/btree_update.h | 7 | ||||
-rw-r--r-- | drivers/md/bcache/io.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/io_types.h | 1 | ||||
-rw-r--r-- | drivers/md/bcache/move.c | 13 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 3 |
11 files changed, 77 insertions, 47 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index cc773059ff27..7f2fa47da286 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -1052,6 +1052,7 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp, struct open_bucket *ob, unsigned nr_replicas, + enum alloc_reserve reserve, long *caches_used) { /* @@ -1063,22 +1064,23 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, /* foreground writes: prefer tier 0: */ if (wp->group == &c->cache_all) - bch_bucket_alloc_group(c, ob, wp->reserve, nr_replicas, + bch_bucket_alloc_group(c, ob, reserve, nr_replicas, &c->cache_tiers[0], caches_used); - return bch_bucket_alloc_group(c, ob, wp->reserve, nr_replicas, + return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, wp->group, caches_used); } static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp, struct open_bucket *ob, unsigned nr_replicas, - long *caches_used, struct closure *cl) + enum alloc_reserve reserve, long *caches_used, + struct closure *cl) { bool waiting = false; while (1) { switch (__bch_bucket_alloc_set(c, wp, ob, nr_replicas, - caches_used)) { + reserve, caches_used)) { case ALLOC_SUCCESS: if (waiting) closure_wake_up(&c->freelist_wait); @@ -1093,7 +1095,7 @@ static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp, case FREELIST_EMPTY: if (!cl || waiting) trace_bcache_freelist_empty_fail(c, - wp->reserve, cl); + reserve, cl); if (!cl) return -ENOSPC; @@ -1279,6 +1281,7 @@ static int open_bucket_add_buckets(struct cache_set *c, struct open_bucket *ob, struct bkey_i_extent *e, unsigned nr_replicas, + enum alloc_reserve reserve, struct closure *cl) { const struct bch_extent_ptr *ptr; @@ -1316,7 +1319,8 @@ static int open_bucket_add_buckets(struct cache_set *c, nr_replicas++; } - return bch_bucket_alloc_set(c, wp, ob, nr_replicas, caches_used, cl); + return bch_bucket_alloc_set(c, wp, ob, nr_replicas, + reserve, caches_used, cl); } /* @@ -1326,16 +1330,17 @@ struct open_bucket *bch_alloc_sectors_start(struct cache_set *c, struct write_point *wp, struct bkey_i_extent *e, unsigned nr_replicas, + enum alloc_reserve reserve, struct closure *cl) { struct open_bucket *ob; - unsigned open_buckets_reserved = allocation_is_metadata(wp->reserve) + unsigned open_buckets_reserved = allocation_is_metadata(reserve) ? 0 : BTREE_NODE_RESERVE; int ret; BUG_ON(!wp->group); - BUG_ON(!wp->reserve); + BUG_ON(!reserve); BUG_ON(!nr_replicas); retry: ob = lock_writepoint(c, wp); @@ -1379,7 +1384,8 @@ retry: ob = new_ob; } - ret = open_bucket_add_buckets(c, wp, ob, e, nr_replicas, cl); + ret = open_bucket_add_buckets(c, wp, ob, e, nr_replicas, + reserve, cl); if (ret) { mutex_unlock(&ob->lock); return ERR_PTR(ret); @@ -1470,11 +1476,12 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, struct write_point *wp, struct bkey_i_extent *e, unsigned nr_replicas, + enum alloc_reserve reserve, struct closure *cl) { struct open_bucket *ob; - ob = bch_alloc_sectors_start(c, wp, e, nr_replicas, cl); + ob = bch_alloc_sectors_start(c, wp, e, nr_replicas, reserve, cl); if (IS_ERR_OR_NULL(ob)) return ob; @@ -1814,7 +1821,6 @@ void bch_open_buckets_init(struct cache_set *c) for (i = 0; i < ARRAY_SIZE(c->write_points); i++) { c->write_points[i].throttle = true; - c->write_points[i].reserve = RESERVE_NONE; c->write_points[i].group = &c->cache_tiers[0]; } @@ -1822,13 +1828,10 @@ void bch_open_buckets_init(struct cache_set *c) spin_lock_init(&c->cache_tiers[i].lock); c->promote_write_point.group = &c->cache_tiers[0]; - c->promote_write_point.reserve = RESERVE_NONE; c->migration_write_point.group = &c->cache_all; - c->migration_write_point.reserve = RESERVE_NONE; c->btree_write_point.group = &c->cache_all; - c->btree_write_point.reserve = RESERVE_BTREE; c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h index b02a307b0265..d3d82fd994be 100644 --- a/drivers/md/bcache/alloc.h +++ b/drivers/md/bcache/alloc.h @@ -18,14 +18,15 @@ void bch_open_bucket_put(struct cache_set *, struct open_bucket *); struct open_bucket *bch_alloc_sectors_start(struct cache_set *, struct write_point *, struct bkey_i_extent *, - unsigned, struct closure *); + unsigned, enum alloc_reserve, + struct closure *); void bch_alloc_sectors_done(struct cache_set *, struct write_point *, struct bkey_i_extent *, unsigned, struct open_bucket *, unsigned); struct open_bucket *bch_alloc_sectors(struct cache_set *, struct write_point *, struct bkey_i_extent *, unsigned, - struct closure *); + enum alloc_reserve, struct closure *); static inline void bch_wake_allocator(struct cache *ca) { diff --git a/drivers/md/bcache/alloc_types.h b/drivers/md/bcache/alloc_types.h index aa3c81268817..337b6e46517a 100644 --- a/drivers/md/bcache/alloc_types.h +++ b/drivers/md/bcache/alloc_types.h @@ -88,11 +88,6 @@ struct write_point { bool throttle; /* - * Bucket reserve to allocate from. - */ - enum alloc_reserve reserve; - - /* * If not NULL, cache group for tiering, promotion and moving GC - * always allocates a single replica */ diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 8ee369bcbf13..599896689bc1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -597,7 +597,7 @@ struct cache_set { struct btree_alloc { struct open_bucket *ob; BKEY_PADDED(k); - } btree_reserve_cache[BTREE_NODE_RESERVE]; + } btree_reserve_cache[BTREE_NODE_RESERVE * 2]; unsigned btree_reserve_cache_nr; struct mutex btree_reserve_cache_lock; diff --git a/drivers/md/bcache/btree_gc.c b/drivers/md/bcache/btree_gc.c index ae59348cf3b4..1735ea39d397 100644 --- a/drivers/md/bcache/btree_gc.c +++ b/drivers/md/bcache/btree_gc.c @@ -474,7 +474,10 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], block_bytes(c)) > blocks) return; - res = bch_btree_reserve_get(c, parent, nr_old_nodes, false, NULL); + res = bch_btree_reserve_get(c, parent, nr_old_nodes, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + NULL); if (IS_ERR(res)) { trace_bcache_btree_gc_coalesce_fail(c, BTREE_GC_COALESCE_FAIL_RESERVE_GET); diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c index 3a812b88fbd0..e352bb2f42e5 100644 --- a/drivers/md/bcache/btree_update.c +++ b/drivers/md/bcache/btree_update.c @@ -228,15 +228,17 @@ void btree_open_bucket_put(struct cache_set *c, struct btree *b) } static struct btree *__bch_btree_node_alloc(struct cache_set *c, + bool use_reserve, struct disk_reservation *res, struct closure *cl) { BKEY_PADDED(k) tmp; struct open_bucket *ob; struct btree *b; + unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE; mutex_lock(&c->btree_reserve_cache_lock); - if (c->btree_reserve_cache_nr) { + if (c->btree_reserve_cache_nr > reserve) { struct btree_alloc *a = &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; @@ -254,7 +256,9 @@ retry: ob = bch_alloc_sectors(c, &c->btree_write_point, bkey_i_to_extent(&tmp.k), - res->nr_replicas, cl); + res->nr_replicas, + use_reserve ? RESERVE_BTREE : RESERVE_NONE, + cl); if (IS_ERR(ob)) return ERR_CAST(ob); @@ -503,19 +507,19 @@ void bch_btree_reserve_put(struct cache_set *c, struct btree_reserve *reserve) } static struct btree_reserve *__bch_btree_reserve_get(struct cache_set *c, - bool check_enospc, - unsigned nr_nodes, - struct closure *cl) + unsigned nr_nodes, + unsigned flags, + struct closure *cl) { struct btree_reserve *reserve; struct btree *b; struct disk_reservation disk_res = { 0, 0 }; unsigned sectors = nr_nodes * c->sb.btree_node_size; - int ret, flags = BCH_DISK_RESERVATION_GC_LOCK_HELD| + int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD| BCH_DISK_RESERVATION_METADATA; - if (!check_enospc) - flags |= BCH_DISK_RESERVATION_NOFAIL; + if (flags & BTREE_INSERT_NOFAIL) + disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; /* * This check isn't necessary for correctness - it's just to potentially @@ -525,7 +529,7 @@ static struct btree_reserve *__bch_btree_reserve_get(struct cache_set *c, if (ret) return ERR_PTR(ret); - if (bch_disk_reservation_get(c, &disk_res, sectors, flags)) + if (bch_disk_reservation_get(c, &disk_res, sectors, disk_res_flags)) return ERR_PTR(-ENOSPC); BUG_ON(nr_nodes > BTREE_RESERVE_MAX); @@ -546,7 +550,8 @@ static struct btree_reserve *__bch_btree_reserve_get(struct cache_set *c, reserve->nr = 0; while (reserve->nr < nr_nodes) { - b = __bch_btree_node_alloc(c, &disk_res, cl); + b = __bch_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE, + &disk_res, cl); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err_free; @@ -567,14 +572,13 @@ err_free: struct btree_reserve *bch_btree_reserve_get(struct cache_set *c, struct btree *b, unsigned extra_nodes, - bool check_enospc, + unsigned flags, struct closure *cl) { unsigned depth = btree_node_root(b)->level - b->level; unsigned nr_nodes = btree_reserve_required_nodes(depth) + extra_nodes; - return __bch_btree_reserve_get(c, check_enospc, - nr_nodes, cl); + return __bch_btree_reserve_get(c, nr_nodes, flags, cl); } @@ -589,7 +593,7 @@ int bch_btree_root_alloc(struct cache_set *c, enum btree_id id, while (1) { /* XXX haven't calculated capacity yet :/ */ - reserve = __bch_btree_reserve_get(c, false, 1, &cl); + reserve = __bch_btree_reserve_get(c, 1, 0, &cl); if (!IS_ERR(reserve)) break; @@ -1470,8 +1474,7 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags) goto out; } - reserve = bch_btree_reserve_get(c, b, 0, - !(flags & BTREE_INSERT_NOFAIL), &cl); + reserve = bch_btree_reserve_get(c, b, 0, flags, &cl); if (IS_ERR(reserve)) { ret = PTR_ERR(reserve); if (ret == -EAGAIN) { @@ -1636,7 +1639,10 @@ retry: goto out_unlock; } - reserve = bch_btree_reserve_get(c, b, 0, false, &cl); + reserve = bch_btree_reserve_get(c, b, 0, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + &cl); if (IS_ERR(reserve)) { ret = PTR_ERR(reserve); goto out_unlock; @@ -2167,11 +2173,19 @@ int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b, struct btree *n, *parent = iter->nodes[b->level + 1]; struct btree_reserve *reserve; struct btree_interior_update *as; + unsigned flags = BTREE_INSERT_NOFAIL; + + /* + * if caller is going to wait if allocating reserve fails, then this is + * a rewrite that must succeed: + */ + if (cl) + flags |= BTREE_INSERT_USE_RESERVE; if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) return -EINTR; - reserve = bch_btree_reserve_get(c, b, 0, false, cl); + reserve = bch_btree_reserve_get(c, b, 0, flags, cl); if (IS_ERR(reserve)) { trace_bcache_btree_gc_rewrite_node_fail(b); return PTR_ERR(reserve); diff --git a/drivers/md/bcache/btree_update.h b/drivers/md/bcache/btree_update.h index 22027678a41f..277a99476d07 100644 --- a/drivers/md/bcache/btree_update.h +++ b/drivers/md/bcache/btree_update.h @@ -160,7 +160,7 @@ void bch_btree_set_root_initial(struct cache_set *, struct btree *, void bch_btree_reserve_put(struct cache_set *, struct btree_reserve *); struct btree_reserve *bch_btree_reserve_get(struct cache_set *, struct btree *, unsigned, - bool, struct closure *); + unsigned, struct closure *); int bch_btree_root_alloc(struct cache_set *, enum btree_id, struct closure *); @@ -284,11 +284,14 @@ int __bch_btree_insert_at(struct btree_insert *, u64 *); /* Don't check for -ENOSPC: */ #define BTREE_INSERT_NOFAIL (1 << 1) +/* for copygc, or when merging btree nodes */ +#define BTREE_INSERT_USE_RESERVE (1 << 2) + /* * Insert is for journal replay: don't get journal reservations, or mark extents * (bch_mark_key) */ -#define BTREE_INSERT_JOURNAL_REPLAY (1 << 2) +#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3) int bch_btree_insert_list_at(struct btree_iter *, struct keylist *, struct disk_reservation *, diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index b3e2a762d365..55e0b540c33d 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -575,6 +575,7 @@ static void __bch_write(struct closure *cl) b = bch_alloc_sectors_start(op->c, op->wp, bkey_i_to_extent(k), op->nr_replicas, + op->alloc_reserve, (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); EBUG_ON(!b); @@ -793,6 +794,7 @@ void bch_write_op_init(struct bch_write_op *op, struct cache_set *c, op->flags = flags; op->compression_type = c->opts.compression; op->nr_replicas = res.nr_replicas; + op->alloc_reserve = RESERVE_NONE; op->pos = pos; op->version = 0; op->res = res; diff --git a/drivers/md/bcache/io_types.h b/drivers/md/bcache/io_types.h index 927106f1020f..28365dc40c24 100644 --- a/drivers/md/bcache/io_types.h +++ b/drivers/md/bcache/io_types.h @@ -98,6 +98,7 @@ struct bch_write_op { u16 flags; unsigned compression_type:4; unsigned nr_replicas:4; + unsigned alloc_reserve:4; struct bpos pos; unsigned version; diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c index 9afc1d3271fb..2cf4a4bd2eec 100644 --- a/drivers/md/bcache/move.c +++ b/drivers/md/bcache/move.c @@ -88,6 +88,14 @@ static int bch_migrate_index_update(struct bch_write_op *op) ptr = bch_migrate_matching_ptr(m, e); if (ptr) { + unsigned insert_flags = + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL; + + /* copygc uses btree node reserve: */ + if (m->move) + insert_flags |= BTREE_INSERT_USE_RESERVE; + if (m->move) __bch_extent_drop_ptr(e, ptr); @@ -102,7 +110,7 @@ static int bch_migrate_index_update(struct bch_write_op *op) ret = bch_btree_insert_at(c, &op->res, NULL, op_journal_seq(op), - BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC, + insert_flags, BTREE_INSERT_ENTRY(&iter, &new.k)); if (ret && ret != -EINTR) break; @@ -147,6 +155,9 @@ void bch_migrate_write_init(struct cache_set *c, bkey_start_pos(k.k), NULL, flags); + if (m->move) + m->op.alloc_reserve = RESERVE_MOVINGGC; + m->op.nr_replicas = 1; m->op.index_update_fn = bch_migrate_index_update; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e790503b9a92..e6935106e980 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1988,10 +1988,7 @@ static const char *cache_alloc(struct bcache_superblock *sb, total_reserve += ca->free[i].size; pr_debug("%zu buckets reserved", total_reserve); - ca->copygc_write_point.reserve = RESERVE_MOVINGGC; ca->copygc_write_point.group = &ca->self; - - ca->tiering_write_point.reserve = RESERVE_NONE; ca->tiering_write_point.group = &ca->self; kobject_get(&c->kobj); |