diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2016-08-27 02:26:15 -0800 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2016-08-27 05:24:18 -0800 |
commit | 09e9374b5eef9da7466f7f19bb65c5d1b6a0c0aa (patch) | |
tree | fe5af06ca8fed3caa8d0b6b02d2a8e48581e1f1f | |
parent | 0a9ad028970ba45acb365a559bfbd81c86918ec8 (diff) |
bcache: fix some issues with journal seq blacklisting
- the journalling change that increased pipelining meant it was possible for a
particular btree node to have bsets with two different blacklisted sequence
numbers - existing code didn't handle that at all
- the blacklisted seq flush code wasn't correctly waiting for nodes to be
rewritten
- the way the code kept track of which btree nodes had bsets with a blacklisted
sequence number and needed to be rewritten meant that they were pinned in
memory - asking for deadlocks
all fixed
-rw-r--r-- | drivers/md/bcache/btree_cache.c | 6 | ||||
-rw-r--r-- | drivers/md/bcache/btree_types.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/btree_update.c | 12 | ||||
-rw-r--r-- | drivers/md/bcache/journal.c | 137 | ||||
-rw-r--r-- | drivers/md/bcache/journal_types.h | 11 |
5 files changed, 116 insertions, 52 deletions
diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c index f8f4b7e1b48a..2cda8776e903 100644 --- a/drivers/md/bcache/btree_cache.c +++ b/drivers/md/bcache/btree_cache.c @@ -89,7 +89,6 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, gfp_t gfp) INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; sema_init(&b->io_mutex, 1); - INIT_LIST_HEAD(&b->journal_seq_blacklisted); b->writes[1].index = 1; INIT_LIST_HEAD(&b->write_blocked); @@ -104,7 +103,6 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, gfp_t gfp) void mca_hash_remove(struct cache_set *c, struct btree *b) { BUG_ON(btree_node_dirty(b)); - BUG_ON(!list_empty_careful(&b->journal_seq_blacklisted)); b->keys.nsets = 0; b->keys.set[0].data = NULL; @@ -170,10 +168,6 @@ static int mca_reap_notrace(struct cache_set *c, struct btree *b, bool flush) if (!list_empty(&b->write_blocked)) goto out_unlock; - /* XXX: we need a better solution for this, this will cause deadlocks */ - if (!list_empty_careful(&b->journal_seq_blacklisted)) - goto out_unlock; - if (!flush) { if (btree_node_dirty(b)) goto out_unlock; diff --git a/drivers/md/bcache/btree_types.h b/drivers/md/bcache/btree_types.h index 5bce26ba3f95..f294aa2965c9 100644 --- a/drivers/md/bcache/btree_types.h +++ b/drivers/md/bcache/btree_types.h @@ -77,8 +77,6 @@ struct btree { struct delayed_work work; struct btree_write writes[2]; - - struct list_head journal_seq_blacklisted; }; #define BTREE_FLAG(flag) \ diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c index 3fc2fcb6bd65..32a3915fd118 100644 --- a/drivers/md/bcache/btree_update.c +++ b/drivers/md/bcache/btree_update.c @@ -180,12 +180,6 @@ static void __btree_node_free(struct cache_set *c, struct btree *b, clear_btree_node_dirty(b); cancel_delayed_work(&b->work); - if (!list_empty_careful(&b->journal_seq_blacklisted)) { - mutex_lock(&c->journal.blacklist_lock); - list_del_init(&b->journal_seq_blacklisted); - mutex_unlock(&c->journal.blacklist_lock); - } - mca_hash_remove(c, b); mutex_lock(&c->btree_cache_lock); @@ -874,8 +868,6 @@ static void btree_interior_update_pointers_written(struct closure *cl) struct cache_set *c = as->c; unsigned i; - closure_wake_up(&as->wait); - journal_pin_drop(&c->journal, &as->journal); mutex_lock(&c->btree_interior_update_lock); @@ -890,6 +882,8 @@ static void btree_interior_update_pointers_written(struct closure *cl) list_del(&as->list); mutex_unlock(&c->btree_interior_update_lock); + closure_wake_up(&as->wait); + closure_return_with_destructor(cl, btree_interior_update_free); } @@ -1950,7 +1944,7 @@ int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b, if (!bch_btree_iter_upgrade(iter)) return -EINTR; - reserve = bch_btree_reserve_get(c, b, 1, true, cl); + reserve = bch_btree_reserve_get(c, b, 1, false, cl); if (IS_ERR(reserve)) { trace_bcache_btree_gc_rewrite_node_fail(b); return PTR_ERR(reserve); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 50f3e781ed6e..30e1eb13dc77 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -169,41 +169,88 @@ static void journal_seq_blacklist_flush(struct journal *j, container_of(j, struct cache_set, journal); struct journal_seq_blacklist *bl = container_of(pin, struct journal_seq_blacklist, pin); - struct btree *b; - struct btree_iter iter; + struct blacklisted_node n; struct closure cl; + unsigned i; + int ret; closure_init_stack(&cl); - while (1) { + for (i = 0;; i++) { + struct btree_iter iter; + struct btree *b; + mutex_lock(&j->blacklist_lock); - if (list_empty(&bl->nodes)) + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); break; - - b = list_first_entry(&bl->nodes, struct btree, - journal_seq_blacklisted); + } + n = bl->entries[i]; mutex_unlock(&j->blacklist_lock); - /* - * b might be changing underneath us, but it won't be _freed_ - * underneath us - and if the fields we're reading out of it to - * traverse to it are garbage because we raced, that's ok - */ - - bch_btree_iter_init(&iter, c, b->btree_id, b->key.k.p); + bch_btree_iter_init(&iter, c, n.btree_id, n.pos); iter.is_extents = false; - +redo_peek: b = bch_btree_iter_peek_node(&iter); - if (!list_empty_careful(&b->journal_seq_blacklisted)) - bch_btree_node_rewrite(&iter, b, &cl); + /* The node might have already been rewritten: */ + + if (b->data->keys.seq == n.seq && + !bkey_cmp(b->key.k.p, n.pos)) { + ret = bch_btree_node_rewrite(&iter, b, &cl); + if (ret == -EAGAIN) { + bch_btree_iter_unlock(&iter); + closure_sync(&cl); + ret = -EINTR; + } + if (ret == -EINTR) + goto redo_peek; + if (ret) + BUG(); + } bch_btree_iter_unlock(&iter); - closure_sync(&cl); } + closure_sync(&cl); + + mutex_lock(&c->btree_interior_update_lock); + + for (i = 0;; i++) { + struct btree_interior_update *as; + struct pending_btree_node_free *d; + + mutex_lock(&j->blacklist_lock); + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); + break; + } + n = bl->entries[i]; + mutex_unlock(&j->blacklist_lock); + + /* + * Is the node on the list of pending interior node updates - + * being freed? If so, wait for that to finish: + */ + for_each_pending_btree_node_free(c, as, d) + if (n.seq == d->seq && + n.btree_id == d->btree_id && + !d->level && + !bkey_cmp(n.pos, d->key.k.p)) { + closure_wait(&as->wait, &cl); + mutex_unlock(&c->btree_interior_update_lock); + closure_sync(&cl); + break; + } + } + + mutex_unlock(&c->btree_interior_update_lock); + + mutex_lock(&j->blacklist_lock); + journal_pin_drop(j, &bl->pin); list_del(&bl->list); + kfree(bl->entries); kfree(bl); mutex_unlock(&j->blacklist_lock); @@ -235,8 +282,7 @@ bch_journal_seq_blacklisted_new(struct cache_set *c, u64 seq) if (!bl) return NULL; - bl->seq = seq; - INIT_LIST_HEAD(&bl->nodes); + bl->seq = seq; list_add_tail(&bl->list, &j->seq_blacklist); return bl; } @@ -251,30 +297,33 @@ int bch_journal_seq_should_ignore(struct cache_set *c, u64 seq, struct btree *b) { struct journal *j = &c->journal; struct journal_seq_blacklist *bl; + struct blacklisted_node *n; int ret = 0; + /* Interier updates aren't journalled: */ + if (b->level) + return 0; + /* * After initial GC has finished, we've scanned the entire btree and * found all the blacklisted sequence numbers - we won't be creating any * new ones after startup: */ - if (test_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags) && - list_empty_careful(&j->seq_blacklist)) - return 0; + if (test_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags)) { + if (list_empty_careful(&j->seq_blacklist)) + return 0; - mutex_lock(&j->blacklist_lock); + mutex_lock(&j->blacklist_lock); + ret = journal_seq_blacklist_find(j, seq) != NULL; + mutex_unlock(&j->blacklist_lock); + return ret; + } + mutex_lock(&j->blacklist_lock); bl = journal_seq_blacklist_find(j, seq); if (bl) goto found; - /* - * Don't look at j->seq after journal has started, since we aren't - * holding appropriate lock: - */ - if (test_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags)) - goto out; - if (seq <= j->seq) goto out; @@ -291,8 +340,30 @@ int bch_journal_seq_should_ignore(struct cache_set *c, u64 seq, struct btree *b) goto out; } found: - if (list_empty(&b->journal_seq_blacklisted)) - list_add(&b->journal_seq_blacklisted, &bl->nodes); + for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) + if (b->data->keys.seq == n->seq && + b->btree_id == n->btree_id && + !bkey_cmp(b->key.k.p, n->pos)) + goto found_entry; + + if (!bl->nr_entries || + is_power_of_2(bl->nr_entries)) { + n = krealloc(bl->entries, + max(bl->nr_entries * 2, 8UL) * sizeof(*n), + GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + bl->entries = n; + } + + bl->entries[bl->nr_entries++] = (struct blacklisted_node) { + .seq = b->data->keys.seq, + .btree_id = b->btree_id, + .pos = b->key.k.p, + }; +found_entry: ret = 1; out: mutex_unlock(&j->blacklist_lock); diff --git a/drivers/md/bcache/journal_types.h b/drivers/md/bcache/journal_types.h index 5011dd28fb9f..5c3ec3cb08d0 100644 --- a/drivers/md/bcache/journal_types.h +++ b/drivers/md/bcache/journal_types.h @@ -41,14 +41,21 @@ struct journal_entry_pin { struct journal_entry_pin_list *pin_list; }; +/* corresponds to a btree node with a blacklisted bset: */ +struct blacklisted_node { + __le64 seq; + enum btree_id btree_id; + struct bpos pos; +}; + struct journal_seq_blacklist { struct list_head list; u64 seq; bool written; struct journal_entry_pin pin; - /* Btree nodes to be flushed: */ - struct list_head nodes; + struct blacklisted_node *entries; + size_t nr_entries; }; struct journal_res { |