From b8ab956c544e51a8d19ea3f38b54355d6aa92c33 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 4 Nov 2014 12:52:41 +0100
Subject: block: Expand a bit documentation about elevator_allow_merge_fn

Explain that two requests can be merged without elevator_allow_merge_fn()
being called.

Signed-off-by: Jan Kara
Signed-off-by: Jens Axboe
---
 Documentation/block/biodoc.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 2101e718670d..f1323c6b7ed2 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -946,7 +946,11 @@ elevator_allow_merge_fn         called whenever the block layer determines
                                 request safely. The io scheduler may still
                                 want to stop a merge at this point if it
                                 results in some sort of conflict internally,
-                                this hook allows it to do that.
+                                this hook allows it to do that. Note however
+                                that two *requests* can still be merged at later
+                                time. Currently the io scheduler has no way to
+                                prevent that. It can only learn about the fact
+                                from elevator_merge_req_fn callback.
 
 elevator_dispatch_fn*           fills the dispatch queue with ready requests.
                                 I/O schedulers are free to postpone requests by
-- 
cgit v1.2.3
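For readers unfamiliar with the two elevator hooks mentioned in the hunk above, the following sketch shows roughly how an io scheduler would use them. It is illustrative only and not part of the patch: the "foo" scheduler and its helpers (foo_group_of_rq(), foo_group_of_bio(), foo_forget_request()) are hypothetical; only the hook signatures follow the elevator_ops interface of this era.

        /* Hypothetical io scheduler "foo"; only the hook signatures are real. */
        static int foo_allow_merge(struct request_queue *q, struct request *rq,
                                   struct bio *bio)
        {
                /* Veto a bio-into-request merge that would cross a foo group. */
                if (foo_group_of_rq(rq) != foo_group_of_bio(bio))
                        return 0;       /* refuse the merge */

                return 1;               /* merge is acceptable to foo */
        }

        static void foo_merged_requests(struct request_queue *q, struct request *rq,
                                        struct request *next)
        {
                /*
                 * Request-into-request merges cannot be vetoed; this hook is
                 * only a notification that 'next' has been merged into 'rq',
                 * which is the point the documentation update makes.
                 */
                foo_forget_request(next);
        }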
From 9c6ac78eb3521c5937b2dd8a7d1b300f41092f45 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 24 Oct 2014 15:38:21 -0400
Subject: writeback: fix a subtle race condition in I_DIRTY clearing

After invoking ->dirty_inode(), __mark_inode_dirty() does smp_mb() and
tests inode->i_state locklessly to see whether it already has all the
necessary I_DIRTY bits set.  The comment above the barrier doesn't
contain any useful information - memory barriers can't ensure "changes
are seen by all cpus" by themselves.

And it sure enough was broken.  Please consider the following
scenario.

 CPU 0                                    CPU 1
 -----------------------------------------------------------------------------
                                          enters __writeback_single_inode()
                                          grabs inode->i_lock
                                          tests PAGECACHE_TAG_DIRTY which is clear
 enters __set_page_dirty()
 grabs mapping->tree_lock
 sets PAGECACHE_TAG_DIRTY
 releases mapping->tree_lock
 leaves __set_page_dirty()
 enters __mark_inode_dirty()
 smp_mb()
 sees I_DIRTY_PAGES set
 leaves __mark_inode_dirty()
                                          clears I_DIRTY_PAGES
                                          releases inode->i_lock

Now @inode has dirty pages w/ I_DIRTY_PAGES clear.  This doesn't seem
to lead to an immediately critical problem because requeue_inode()
later checks PAGECACHE_TAG_DIRTY instead of I_DIRTY_PAGES when
deciding whether the inode needs to be requeued for IO, and there are
enough unintentional memory barriers in between, so while the inode
ends up with an inconsistent I_DIRTY_PAGES flag, it doesn't fall off
the IO list.

The lack of an explicit barrier may also theoretically affect the
other I_DIRTY bits which deal with metadata dirtiness.  There is no
guarantee that a strong enough barrier exists between
I_DIRTY_[DATA]SYNC clearing and write_inode() writing out the dirtied
inode.  The filesystem inode writeout path likely has enough stuff
which can behave as a full barrier, but it's theoretically possible
that the writeout may not see all the updates from ->dirty_inode().

Fix it by adding an explicit smp_mb() after I_DIRTY clearing.  Note
that I_DIRTY_PAGES needs special treatment as it always needs to be
cleared to be interlocked with the lockless test on the
__mark_inode_dirty() side.  It's cleared unconditionally and
reinstated after smp_mb() if the mapping still has dirty pages.

Also add comments explaining how and why the barriers are paired.

Lightly tested.

Signed-off-by: Tejun Heo
Cc: Jan Kara
Cc: Mikulas Patocka
Cc: Jens Axboe
Cc: Al Viro
Cc: stable@vger.kernel.org
Reviewed-by: Jan Kara
Signed-off-by: Jens Axboe
---
 fs/fs-writeback.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ef9bef118342..2d609a5fbfea 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -479,12 +479,28 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
          * write_inode()
          */
         spin_lock(&inode->i_lock);
-        /* Clear I_DIRTY_PAGES if we've written out all dirty pages */
-        if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-                inode->i_state &= ~I_DIRTY_PAGES;
+
         dirty = inode->i_state & I_DIRTY;
-        inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+        inode->i_state &= ~I_DIRTY;
+
+        /*
+         * Paired with smp_mb() in __mark_inode_dirty().  This allows
+         * __mark_inode_dirty() to test i_state without grabbing i_lock -
+         * either they see the I_DIRTY bits cleared or we see the dirtied
+         * inode.
+         *
+         * I_DIRTY_PAGES is always cleared together above even if @mapping
+         * still has dirty pages.  The flag is reinstated after smp_mb() if
+         * necessary.  This guarantees that either __mark_inode_dirty()
+         * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
+         */
+        smp_mb();
+
+        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+                inode->i_state |= I_DIRTY_PAGES;
+
         spin_unlock(&inode->i_lock);
+
         /* Don't write the inode if only I_DIRTY_PAGES was set */
         if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                 int err = write_inode(inode, wbc);
@@ -1148,12 +1164,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
         }
 
         /*
-         * make sure that changes are seen by all cpus before we test i_state
-         * -- mikulas
+         * Paired with smp_mb() in __writeback_single_inode() for the
+         * following lockless i_state test.  See there for details.
          */
         smp_mb();
 
-        /* avoid the locking if we can */
         if ((inode->i_state & flags) == flags)
                 return;
 
-- 
cgit v1.2.3
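The fix above is an instance of the classic store-buffering pattern: each side stores its flag, issues a full barrier, then reads the other side's flag, and the pairing guarantees that at least one side observes the other's store. The following is a self-contained userspace analogue using C11 atomics rather than kernel primitives; the variable and function names are invented for illustration and only stand in for the kernel state involved.

        /*
         * Userspace analogue of the smp_mb() pairing: with both fences in
         * place, it is impossible for both threads to miss the other's store.
         */
        #include <pthread.h>
        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_int dirty_cleared;   /* stands in for "I_DIRTY bits cleared" */
        static atomic_int page_tag_set;    /* stands in for PAGECACHE_TAG_DIRTY */
        static int writeback_saw_tag, dirtier_saw_clear;

        static void *writeback_side(void *arg)
        {
                (void)arg;
                atomic_store_explicit(&dirty_cleared, 1, memory_order_relaxed);
                atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
                writeback_saw_tag = atomic_load_explicit(&page_tag_set,
                                                         memory_order_relaxed);
                return NULL;
        }

        static void *dirtying_side(void *arg)
        {
                (void)arg;
                atomic_store_explicit(&page_tag_set, 1, memory_order_relaxed);
                atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
                dirtier_saw_clear = atomic_load_explicit(&dirty_cleared,
                                                         memory_order_relaxed);
                return NULL;
        }

        int main(void)
        {
                pthread_t a, b;

                pthread_create(&a, NULL, writeback_side, NULL);
                pthread_create(&b, NULL, dirtying_side, NULL);
                pthread_join(a, NULL);
                pthread_join(b, NULL);

                /* With both fences present this branch can never be taken. */
                if (!writeback_saw_tag && !dirtier_saw_clear)
                        printf("impossible: both sides missed the other's store\n");
                return 0;
        }

Without the barrier on the writeback side, exactly the interleaving from the commit message becomes possible: the dirtying side sees the old (dirty) i_state while the writeback side misses the freshly set page tag.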
From 398205b8391b208f0034a392242867b28ad8af3d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 7 Nov 2014 23:03:59 +0100
Subject: blk_mq: call preempt_disable/enable in blk_mq_run_hw_queue, and only
 if needed

preempt_disable/enable surrounds every call to blk_mq_run_hw_queue,
except the one in blk-flush.c.  In fact that one is always asynchronous,
and it does not need smp_processor_id().

We can do the same for all other calls, avoiding preempt_disable when
async is true.  This avoids peppering blk-mq.c with preemption-disabled
regions.

Cc: Jens Axboe
Cc: Thomas Gleixner
Reported-by: Clark Williams
Tested-by: Clark Williams
Signed-off-by: Paolo Bonzini
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b355b5957cd7..8b309e81ed0f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -801,9 +801,18 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                 return;
 
-        if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
-                __blk_mq_run_hw_queue(hctx);
-        else if (hctx->queue->nr_hw_queues == 1)
+        if (!async) {
+                preempt_disable();
+                if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
+                        __blk_mq_run_hw_queue(hctx);
+                        preempt_enable();
+                        return;
+                }
+
+                preempt_enable();
+        }
+
+        if (hctx->queue->nr_hw_queues == 1)
                 kblockd_schedule_delayed_work(&hctx->run_work, 0);
         else {
                 unsigned int cpu;
@@ -824,9 +833,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
                     test_bit(BLK_MQ_S_STOPPED, &hctx->state))
                         continue;
 
-                preempt_disable();
                 blk_mq_run_hw_queue(hctx, async);
-                preempt_enable();
         }
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
@@ -853,9 +860,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 
-        preempt_disable();
         blk_mq_run_hw_queue(hctx, false);
-        preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
@@ -880,9 +885,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
                         continue;
 
                 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-                preempt_disable();
                 blk_mq_run_hw_queue(hctx, async);
-                preempt_enable();
         }
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
-- 
cgit v1.2.3
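The next patch in the series replaces the open-coded preempt_disable()/smp_processor_id() pair consolidated above with get_cpu()/put_cpu(). The two idioms pin the task to the current CPU in the same way -- get_cpu() is preempt_disable() plus smp_processor_id() -- but the latter documents why preemption is disabled. A kernel-style sketch of both forms; the running_in_mask* helpers are invented for illustration and are not part of the patches.

        #include <linux/cpumask.h>
        #include <linux/preempt.h>
        #include <linux/smp.h>

        /* Open-coded form, as used by blk_mq_run_hw_queue() above. */
        static bool running_in_mask_open_coded(const struct cpumask *mask)
        {
                bool ret;

                preempt_disable();
                /* safe: the task cannot migrate between these two lines */
                ret = cpumask_test_cpu(smp_processor_id(), mask);
                preempt_enable();
                return ret;
        }

        /* Equivalent, self-documenting form the next patch switches to. */
        static bool running_in_mask(const struct cpumask *mask)
        {
                bool ret;
                int cpu = get_cpu();    /* preempt_disable() + smp_processor_id() */

                ret = cpumask_test_cpu(cpu, mask);
                put_cpu();              /* preempt_enable() */
                return ret;
        }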
From 2a90d4aae5509e9cf1ba848c5d0b3458201160a0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 7 Nov 2014 23:04:00 +0100
Subject: blk-mq: use get_cpu/put_cpu instead of preempt_disable/preempt_enable

blk-mq is using preempt_disable/enable in order to ensure that the
queue runners are placed on the right CPU.  This does not work with
the RT patches, because __blk_mq_run_hw_queue takes a non-raw spinlock
within the preemption-disabled region.  If there is contention on the
lock, this violates the rules for preemption-disabled regions.

While this should be easily fixable within the RT patches just by doing
migrate_disable/enable, we can do better and document _why_ this
particular region runs with disabled preemption.  After the previous
patch, it is trivial to switch it to get/put_cpu; the RT patches then
can change it to get_cpu_light, which lets virtio-blk run under RT
kernels.

Cc: Jens Axboe
Cc: Thomas Gleixner
Reported-by: Clark Williams
Tested-by: Clark Williams
Signed-off-by: Paolo Bonzini
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8b309e81ed0f..06ab0683a1f1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -802,14 +802,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
                 return;
 
         if (!async) {
-                preempt_disable();
-                if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
+                int cpu = get_cpu();
+                if (cpumask_test_cpu(cpu, hctx->cpumask)) {
                         __blk_mq_run_hw_queue(hctx);
-                        preempt_enable();
+                        put_cpu();
                         return;
                 }
 
-                preempt_enable();
+                put_cpu();
         }
 
         if (hctx->queue->nr_hw_queues == 1)
-- 
cgit v1.2.3


From 1a3b595a281a44be4074fe33b317a0a4854b4197 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 17 Nov 2014 10:40:48 -0700
Subject: blk-mq: export blk_mq_free_request()

Drivers that know they are blk-mq should just use this function
instead of calling through blk_put_request().

Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 06ab0683a1f1..fdf12152946e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -280,6 +280,7 @@ void blk_mq_free_request(struct request *rq)
         hctx = q->mq_ops->map_queue(q, ctx->cpu);
         __blk_mq_free_request(hctx, ctx, rq);
 }
+EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, int error)
 {
-- 
cgit v1.2.3
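With the export in place, a blk-mq-aware driver can pair blk_mq_alloc_request() -- whose prototype is visible in the blk-mq.h hunk of the next patch -- with blk_mq_free_request() for driver-internal commands, instead of calling through blk_put_request(). A hypothetical sketch: mydrv_submit_internal() is an assumed driver helper, and allocation-failure handling is elided since its exact convention depends on the kernel version.

        #include <linux/blkdev.h>
        #include <linux/blk-mq.h>

        /* Hypothetical driver snippet, not part of the patches above. */
        static int mydrv_send_internal_cmd(struct request_queue *q)
        {
                struct request *rq;
                int ret;

                rq = blk_mq_alloc_request(q, WRITE, GFP_KERNEL, false);
                /* (allocation-failure handling elided in this sketch) */

                ret = mydrv_submit_internal(rq);        /* assumed helper */

                blk_mq_free_request(rq);        /* instead of blk_put_request(rq) */
                return ret;
        }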
From 7c7f2f2bc9a63f9605a16eabac59fc655dfe7c9a Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 17 Nov 2014 10:41:57 -0700
Subject: blk-mq: add blk_mq_free_hctx_request()

It's silly to use blk_mq_free_request() which in turn maps the
request to the hardware queue, for places where we already know
what the hardware queue is.  This saves us an extra mapping of a
hardware queue on request completion, if the caller knows this
information already.

Signed-off-by: Jens Axboe
---
 block/blk-mq.c         | 17 ++++++++++++-----
 include/linux/blk-mq.h |  1 +
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index fdf12152946e..4347aa2be6ae 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -269,16 +269,23 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
         blk_mq_queue_exit(q);
 }
 
-void blk_mq_free_request(struct request *rq)
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
         struct blk_mq_ctx *ctx = rq->mq_ctx;
-        struct blk_mq_hw_ctx *hctx;
-        struct request_queue *q = rq->q;
 
         ctx->rq_completed[rq_is_sync(rq)]++;
-
-        hctx = q->mq_ops->map_queue(q, ctx->cpu);
         __blk_mq_free_request(hctx, ctx, rq);
+
+}
+EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
+
+void blk_mq_free_request(struct request *rq)
+{
+        struct blk_mq_hw_ctx *hctx;
+        struct request_queue *q = rq->q;
+
+        hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+        blk_mq_free_hctx_request(hctx, rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c3b64ec5321e..fb0a4fb3dc2b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -169,6 +169,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
                 gfp_t gfp, bool reserved);
-- 
cgit v1.2.3
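A hypothetical example of where the new helper pays off: a driver freeing a driver-internal request from a path that already knows the hardware queue (for instance, cancelling outstanding internal commands during per-hctx teardown), skipping the ctx-to-hctx mapping that blk_mq_free_request() would redo. mydrv_cancel_cmd() is an assumed driver-specific helper and not part of the patches.

        #include <linux/blk-mq.h>

        /* Hypothetical driver snippet, not part of the patches above. */
        static void mydrv_cancel_one(struct blk_mq_hw_ctx *hctx, struct request *rq)
        {
                mydrv_cancel_cmd(rq);                   /* assumed helper */
                blk_mq_free_hctx_request(hctx, rq);     /* hctx already known */
        }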