From b8ab956c544e51a8d19ea3f38b54355d6aa92c33 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 4 Nov 2014 12:52:41 +0100
Subject: block: Expand a bit documentation about elevator_allow_merge_fn

Explain that two requests can be merged without elevator_allow_merge_fn()
being called.

Signed-off-by: Jan Kara
Signed-off-by: Jens Axboe
---
 Documentation/block/biodoc.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 2101e718670d..f1323c6b7ed2 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -946,7 +946,11 @@ elevator_allow_merge_fn         called whenever the block layer determines
                                 request safely. The io scheduler may still
                                 want to stop a merge at this point if it
                                 results in some sort of conflict internally,
-                                this hook allows it to do that.
+                                this hook allows it to do that. Note however
+                                that two *requests* can still be merged at later
+                                time. Currently the io scheduler has no way to
+                                prevent that. It can only learn about the fact
+                                from elevator_merge_req_fn callback.
 
 elevator_dispatch_fn*           fills the dispatch queue with ready requests.
                                 I/O schedulers are free to postpone requests by
-- 
cgit v1.2.3
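For readers unfamiliar with the two elevator hooks mentioned in the hunk above, the following sketch shows roughly how an io scheduler would use them. It is illustrative only and not part of the patch: the "foo" scheduler and its helpers (foo_group_of_rq(), foo_group_of_bio(), foo_forget_request()) are hypothetical; only the hook signatures follow the elevator_ops interface of this era.

        /* Hypothetical io scheduler "foo"; only the hook signatures are real. */
        static int foo_allow_merge(struct request_queue *q, struct request *rq,
                                   struct bio *bio)
        {
                /* Veto a bio-into-request merge that would cross a foo group. */
                if (foo_group_of_rq(rq) != foo_group_of_bio(bio))
                        return 0;       /* refuse the merge */

                return 1;               /* merge is acceptable to foo */
        }

        static void foo_merged_requests(struct request_queue *q, struct request *rq,
                                        struct request *next)
        {
                /*
                 * Request-into-request merges cannot be vetoed; this hook is
                 * only a notification that 'next' has been merged into 'rq',
                 * which is the point the documentation update makes.
                 */
                foo_forget_request(next);
        }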
From 9c6ac78eb3521c5937b2dd8a7d1b300f41092f45 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 24 Oct 2014 15:38:21 -0400
Subject: writeback: fix a subtle race condition in I_DIRTY clearing

After invoking ->dirty_inode(), __mark_inode_dirty() does smp_mb() and
tests inode->i_state locklessly to see whether it already has all the
necessary I_DIRTY bits set.  The comment above the barrier doesn't
contain any useful information - memory barriers can't ensure "changes
are seen by all cpus" by themselves.

And it sure enough was broken.  Please consider the following
scenario.

 CPU 0                                    CPU 1
 -----------------------------------------------------------------------------
                                          enters __writeback_single_inode()
                                          grabs inode->i_lock
                                          tests PAGECACHE_TAG_DIRTY which is clear
 enters __set_page_dirty()
 grabs mapping->tree_lock
 sets PAGECACHE_TAG_DIRTY
 releases mapping->tree_lock
 leaves __set_page_dirty()
 enters __mark_inode_dirty()
 smp_mb()
 sees I_DIRTY_PAGES set
 leaves __mark_inode_dirty()
                                          clears I_DIRTY_PAGES
                                          releases inode->i_lock

Now @inode has dirty pages w/ I_DIRTY_PAGES clear.  This doesn't seem
to lead to an immediately critical problem because requeue_inode()
later checks PAGECACHE_TAG_DIRTY instead of I_DIRTY_PAGES when
deciding whether the inode needs to be requeued for IO, and there are
enough unintentional memory barriers in between, so while the inode
ends up with an inconsistent I_DIRTY_PAGES flag, it doesn't fall off
the IO list.

The lack of an explicit barrier may also theoretically affect the
other I_DIRTY bits which deal with metadata dirtiness.  There is no
guarantee that a strong enough barrier exists between
I_DIRTY_[DATA]SYNC clearing and write_inode() writing out the dirtied
inode.  The filesystem inode writeout path likely has enough stuff
which can behave as a full barrier, but it's theoretically possible
that the writeout may not see all the updates from ->dirty_inode().

Fix it by adding an explicit smp_mb() after I_DIRTY clearing.  Note
that I_DIRTY_PAGES needs special treatment as it always needs to be
cleared to be interlocked with the lockless test on the
__mark_inode_dirty() side.  It's cleared unconditionally and
reinstated after smp_mb() if the mapping still has dirty pages.

Also add comments explaining how and why the barriers are paired.

Lightly tested.

Signed-off-by: Tejun Heo
Cc: Jan Kara
Cc: Mikulas Patocka
Cc: Jens Axboe
Cc: Al Viro
Cc: stable@vger.kernel.org
Reviewed-by: Jan Kara
Signed-off-by: Jens Axboe
---
 fs/fs-writeback.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ef9bef118342..2d609a5fbfea 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -479,12 +479,28 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
          * write_inode()
          */
         spin_lock(&inode->i_lock);
-        /* Clear I_DIRTY_PAGES if we've written out all dirty pages */
-        if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-                inode->i_state &= ~I_DIRTY_PAGES;
+
         dirty = inode->i_state & I_DIRTY;
-        inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+        inode->i_state &= ~I_DIRTY;
+
+        /*
+         * Paired with smp_mb() in __mark_inode_dirty().  This allows
+         * __mark_inode_dirty() to test i_state without grabbing i_lock -
+         * either they see the I_DIRTY bits cleared or we see the dirtied
+         * inode.
+         *
+         * I_DIRTY_PAGES is always cleared together above even if @mapping
+         * still has dirty pages.  The flag is reinstated after smp_mb() if
+         * necessary.  This guarantees that either __mark_inode_dirty()
+         * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
+         */
+        smp_mb();
+
+        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+                inode->i_state |= I_DIRTY_PAGES;
+
         spin_unlock(&inode->i_lock);
+
         /* Don't write the inode if only I_DIRTY_PAGES was set */
         if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                 int err = write_inode(inode, wbc);
@@ -1148,12 +1164,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
         }
 
         /*
-         * make sure that changes are seen by all cpus before we test i_state
-         * -- mikulas
+         * Paired with smp_mb() in __writeback_single_inode() for the
+         * following lockless i_state test.  See there for details.
          */
         smp_mb();
 
-        /* avoid the locking if we can */
         if ((inode->i_state & flags) == flags)
                 return;
 
-- 
cgit v1.2.3
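The fix above is an instance of the classic store-buffering pattern: each side stores its flag, issues a full barrier, then reads the other side's flag, and the pairing guarantees that at least one side observes the other's store. The following is a self-contained userspace analogue using C11 atomics rather than kernel primitives; the variable and function names are invented for illustration and only stand in for the kernel state involved.

        /*
         * Userspace analogue of the smp_mb() pairing: with both fences in
         * place, it is impossible for both threads to miss the other's store.
         */
        #include <pthread.h>
        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_int dirty_cleared;   /* stands in for "I_DIRTY bits cleared" */
        static atomic_int page_tag_set;    /* stands in for PAGECACHE_TAG_DIRTY */
        static int writeback_saw_tag, dirtier_saw_clear;

        static void *writeback_side(void *arg)
        {
                (void)arg;
                atomic_store_explicit(&dirty_cleared, 1, memory_order_relaxed);
                atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
                writeback_saw_tag = atomic_load_explicit(&page_tag_set,
                                                         memory_order_relaxed);
                return NULL;
        }

        static void *dirtying_side(void *arg)
        {
                (void)arg;
                atomic_store_explicit(&page_tag_set, 1, memory_order_relaxed);
                atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
                dirtier_saw_clear = atomic_load_explicit(&dirty_cleared,
                                                         memory_order_relaxed);
                return NULL;
        }

        int main(void)
        {
                pthread_t a, b;

                pthread_create(&a, NULL, writeback_side, NULL);
                pthread_create(&b, NULL, dirtying_side, NULL);
                pthread_join(a, NULL);
                pthread_join(b, NULL);

                /* With both fences present this branch can never be taken. */
                if (!writeback_saw_tag && !dirtier_saw_clear)
                        printf("impossible: both sides missed the other's store\n");
                return 0;
        }

Without the barrier on the writeback side, exactly the interleaving from the commit message becomes possible: the dirtying side sees the old (dirty) i_state while the writeback side misses the freshly set page tag.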
From 398205b8391b208f0034a392242867b28ad8af3d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 7 Nov 2014 23:03:59 +0100
Subject: blk_mq: call preempt_disable/enable in blk_mq_run_hw_queue, and only
 if needed

preempt_disable/enable surrounds every call to blk_mq_run_hw_queue,
except the one in blk-flush.c.  In fact that one is always asynchronous,
and it does not need smp_processor_id().

We can do the same for all other calls, avoiding preempt_disable when
async is true.  This avoids peppering blk-mq.c with preemption-disabled
regions.

Cc: Jens Axboe
Cc: Thomas Gleixner
Reported-by: Clark Williams
Tested-by: Clark Williams
Signed-off-by: Paolo Bonzini
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b355b5957cd7..8b309e81ed0f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -801,9 +801,18 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                 return;
 
-        if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
-                __blk_mq_run_hw_queue(hctx);
-        else if (hctx->queue->nr_hw_queues == 1)
+        if (!async) {
+                preempt_disable();
+                if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
+                        __blk_mq_run_hw_queue(hctx);
+                        preempt_enable();
+                        return;
+                }
+
+                preempt_enable();
+        }
+
+        if (hctx->queue->nr_hw_queues == 1)
                 kblockd_schedule_delayed_work(&hctx->run_work, 0);
         else {
                 unsigned int cpu;
@@ -824,9 +833,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
                     test_bit(BLK_MQ_S_STOPPED, &hctx->state))
                         continue;
 
-                preempt_disable();
                 blk_mq_run_hw_queue(hctx, async);
-                preempt_enable();
         }
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
@@ -853,9 +860,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 
-        preempt_disable();
         blk_mq_run_hw_queue(hctx, false);
-        preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
@@ -880,9 +885,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
                         continue;
 
                 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-                preempt_disable();
                 blk_mq_run_hw_queue(hctx, async);
-                preempt_enable();
         }
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
-- 
cgit v1.2.3
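The next patch in the series replaces the open-coded preempt_disable()/smp_processor_id() pair consolidated above with get_cpu()/put_cpu(). The two idioms pin the task to the current CPU in the same way -- get_cpu() is preempt_disable() plus smp_processor_id() -- but the latter documents why preemption is disabled. A kernel-style sketch of both forms; the running_in_mask* helpers are invented for illustration and are not part of the patches.

        #include <linux/cpumask.h>
        #include <linux/preempt.h>
        #include <linux/smp.h>

        /* Open-coded form, as used by blk_mq_run_hw_queue() above. */
        static bool running_in_mask_open_coded(const struct cpumask *mask)
        {
                bool ret;

                preempt_disable();
                /* safe: the task cannot migrate between these two lines */
                ret = cpumask_test_cpu(smp_processor_id(), mask);
                preempt_enable();
                return ret;
        }

        /* Equivalent, self-documenting form the next patch switches to. */
        static bool running_in_mask(const struct cpumask *mask)
        {
                bool ret;
                int cpu = get_cpu();    /* preempt_disable() + smp_processor_id() */

                ret = cpumask_test_cpu(cpu, mask);
                put_cpu();              /* preempt_enable() */
                return ret;
        }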
From 2a90d4aae5509e9cf1ba848c5d0b3458201160a0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 7 Nov 2014 23:04:00 +0100
Subject: blk-mq: use get_cpu/put_cpu instead of preempt_disable/preempt_enable

blk-mq is using preempt_disable/enable in order to ensure that the
queue runners are placed on the right CPU.  This does not work with
the RT patches, because __blk_mq_run_hw_queue takes a non-raw spinlock
within the preemption-disabled region.  If there is contention on the
lock, this violates the rules for preemption-disabled regions.

While this should be easily fixable within the RT patches just by doing
migrate_disable/enable, we can do better and document _why_ this
particular region runs with disabled preemption.  After the previous
patch, it is trivial to switch it to get/put_cpu; the RT patches then
can change it to get_cpu_light, which lets virtio-blk run under RT
kernels.

Cc: Jens Axboe
Cc: Thomas Gleixner
Reported-by: Clark Williams
Tested-by: Clark Williams
Signed-off-by: Paolo Bonzini
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8b309e81ed0f..06ab0683a1f1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -802,14 +802,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
                 return;
 
         if (!async) {
-                preempt_disable();
-                if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
+                int cpu = get_cpu();
+                if (cpumask_test_cpu(cpu, hctx->cpumask)) {
                         __blk_mq_run_hw_queue(hctx);
-                        preempt_enable();
+                        put_cpu();
                         return;
                 }
 
-                preempt_enable();
+                put_cpu();
         }
 
         if (hctx->queue->nr_hw_queues == 1)
-- 
cgit v1.2.3


From 1a3b595a281a44be4074fe33b317a0a4854b4197 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 17 Nov 2014 10:40:48 -0700
Subject: blk-mq: export blk_mq_free_request()

Drivers that know they are blk-mq should just use this function
instead of calling through blk_put_request().

Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 06ab0683a1f1..fdf12152946e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -280,6 +280,7 @@ void blk_mq_free_request(struct request *rq)
         hctx = q->mq_ops->map_queue(q, ctx->cpu);
         __blk_mq_free_request(hctx, ctx, rq);
 }
+EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, int error)
 {
-- 
cgit v1.2.3
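With the export in place, a blk-mq-aware driver can pair blk_mq_alloc_request() -- whose prototype is visible in the blk-mq.h hunk of the next patch -- with blk_mq_free_request() for driver-internal commands, instead of calling through blk_put_request(). A hypothetical sketch: mydrv_submit_internal() is an assumed driver helper, and allocation-failure handling is elided since its exact convention depends on the kernel version.

        #include <linux/blkdev.h>
        #include <linux/blk-mq.h>

        /* Hypothetical driver snippet, not part of the patches above. */
        static int mydrv_send_internal_cmd(struct request_queue *q)
        {
                struct request *rq;
                int ret;

                rq = blk_mq_alloc_request(q, WRITE, GFP_KERNEL, false);
                /* (allocation-failure handling elided in this sketch) */

                ret = mydrv_submit_internal(rq);        /* assumed helper */

                blk_mq_free_request(rq);        /* instead of blk_put_request(rq) */
                return ret;
        }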
From 7c7f2f2bc9a63f9605a16eabac59fc655dfe7c9a Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 17 Nov 2014 10:41:57 -0700
Subject: blk-mq: add blk_mq_free_hctx_request()

It's silly to use blk_mq_free_request() which in turn maps the
request to the hardware queue, for places where we already know
what the hardware queue is.  This saves us an extra mapping of a
hardware queue on request completion, if the caller knows this
information already.

Signed-off-by: Jens Axboe
---
 block/blk-mq.c         | 17 ++++++++++++-----
 include/linux/blk-mq.h |  1 +
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index fdf12152946e..4347aa2be6ae 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -269,16 +269,23 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
         blk_mq_queue_exit(q);
 }
 
-void blk_mq_free_request(struct request *rq)
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
         struct blk_mq_ctx *ctx = rq->mq_ctx;
-        struct blk_mq_hw_ctx *hctx;
-        struct request_queue *q = rq->q;
 
         ctx->rq_completed[rq_is_sync(rq)]++;
-
-        hctx = q->mq_ops->map_queue(q, ctx->cpu);
         __blk_mq_free_request(hctx, ctx, rq);
+
+}
+EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
+
+void blk_mq_free_request(struct request *rq)
+{
+        struct blk_mq_hw_ctx *hctx;
+        struct request_queue *q = rq->q;
+
+        hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+        blk_mq_free_hctx_request(hctx, rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c3b64ec5321e..fb0a4fb3dc2b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -169,6 +169,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
                 gfp_t gfp, bool reserved);
-- 
cgit v1.2.3
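A hypothetical example of where the new helper pays off: a driver freeing a driver-internal request from a path that already knows the hardware queue (for instance, cancelling outstanding internal commands during per-hctx teardown), skipping the ctx-to-hctx mapping that blk_mq_free_request() would redo. mydrv_cancel_cmd() is an assumed driver-specific helper and not part of the patches.

        #include <linux/blk-mq.h>

        /* Hypothetical driver snippet, not part of the patches above. */
        static void mydrv_cancel_one(struct blk_mq_hw_ctx *hctx, struct request *rq)
        {
                mydrv_cancel_cmd(rq);                   /* assumed helper */
                blk_mq_free_hctx_request(hctx, rq);     /* hctx already known */
        }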