summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2011-06-24 15:07:15 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2011-06-24 15:07:15 +1000
commitd5e519b32321f32a71497705e2f66e467a498c0f (patch)
treef63f8fa65d8a5bce67980cf84ab6e5cc8f600a54
parent770fa387b792ab4b4c93708e2174a5442b2b035a (diff)
parente84d0a4f8e39a73003a6ec9a11b07702745f4c1f (diff)
Merge remote-tracking branch 'writeback/next'
-rw-r--r--fs/block_dev.c16
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/fs-writeback.c219
-rw-r--r--fs/inode.c5
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c2
-rw-r--r--include/linux/backing-dev.h2
-rw-r--r--include/linux/writeback.h10
-rw-r--r--include/trace/events/btrfs.h6
-rw-r--r--include/trace/events/ext4.h6
-rw-r--r--include/trace/events/writeback.h100
-rw-r--r--mm/backing-dev.c21
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/page-writeback.c25
-rw-r--r--mm/rmap.c4
15 files changed, 270 insertions, 159 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1a2421f908f0..3c9a03e51b62 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode)
{
return &BDEV_I(inode)->bdev;
}
-
EXPORT_SYMBOL(I_BDEV);
/*
- * move the inode from it's current bdi to the a new bdi. if the inode is dirty
- * we need to move it onto the dirty list of @dst so that the inode is always
- * on the right list.
+ * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+ * need to move it onto the dirty list of @dst so that the inode is always on
+ * the right list.
*/
static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
- spin_lock(&inode_wb_list_lock);
+ struct backing_dev_info *old = inode->i_data.backing_dev_info;
+
+ if (unlikely(dst == old)) /* deadlock avoidance */
+ return;
+ bdi_lock_two(&old->wb, &dst->wb);
spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst;
if (inode->i_state & I_DIRTY)
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&old->wb.list_lock);
+ spin_unlock(&dst->wb.list_lock);
}
static sector_t max_block(struct block_device *bdev)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e3126c051006..62f86e71f291 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2741,7 +2741,7 @@ static int write_cache_pages_da(struct address_space *mapping,
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
@@ -2973,7 +2973,7 @@ static int ext4_da_writepages(struct address_space *mapping,
}
retry:
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
while (!ret && wbc->nr_to_write > 0) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0f015a0468de..6caa98247a5b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -36,6 +36,7 @@ struct wb_writeback_work {
long nr_pages;
struct super_block *sb;
enum writeback_sync_modes sync_mode;
+ unsigned int tagged_writepages:1;
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
@@ -180,12 +181,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
*/
void inode_wb_list_del(struct inode *inode)
{
- spin_lock(&inode_wb_list_lock);
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+ spin_lock(&bdi->wb.list_lock);
list_del_init(&inode->i_wb_list);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&bdi->wb.list_lock);
}
-
/*
* Redirty an inode: set its when-it-was dirtied timestamp and move it to the
* furthest end of its superblock's dirty-inode list.
@@ -195,11 +197,9 @@ void inode_wb_list_del(struct inode *inode)
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
-static void redirty_tail(struct inode *inode)
+static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
- struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-
- assert_spin_locked(&inode_wb_list_lock);
+ assert_spin_locked(&wb->list_lock);
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -213,11 +213,9 @@ static void redirty_tail(struct inode *inode)
/*
* requeue inode for re-scanning after bdi->b_io list is exhausted.
*/
-static void requeue_io(struct inode *inode)
+static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
- struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-
- assert_spin_locked(&inode_wb_list_lock);
+ assert_spin_locked(&wb->list_lock);
list_move(&inode->i_wb_list, &wb->b_more_io);
}
@@ -225,7 +223,7 @@ static void inode_sync_complete(struct inode *inode)
{
/*
* Prevent speculative execution through
- * spin_unlock(&inode_wb_list_lock);
+ * spin_unlock(&wb->list_lock);
*/
smp_mb();
@@ -250,15 +248,16 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
/*
* Move expired dirty inodes from @delaying_queue to @dispatch_queue.
*/
-static void move_expired_inodes(struct list_head *delaying_queue,
+static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
- unsigned long *older_than_this)
+ unsigned long *older_than_this)
{
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
struct inode *inode;
int do_sb_sort = 0;
+ int moved = 0;
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
@@ -269,12 +268,13 @@ static void move_expired_inodes(struct list_head *delaying_queue,
do_sb_sort = 1;
sb = inode->i_sb;
list_move(&inode->i_wb_list, &tmp);
+ moved++;
}
/* just one sb in list, splice to dispatch_queue and we're done */
if (!do_sb_sort) {
list_splice(&tmp, dispatch_queue);
- return;
+ goto out;
}
/* Move inodes from one superblock together */
@@ -286,6 +286,8 @@ static void move_expired_inodes(struct list_head *delaying_queue,
list_move(&inode->i_wb_list, dispatch_queue);
}
}
+out:
+ return moved;
}
/*
@@ -301,9 +303,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
*/
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
- assert_spin_locked(&inode_wb_list_lock);
+ int moved;
+ assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
- move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+ moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+ trace_writeback_queue_io(wb, older_than_this, moved);
}
static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -316,7 +320,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
/*
* Wait for writeback on an inode to complete.
*/
-static void inode_wait_for_writeback(struct inode *inode)
+static void inode_wait_for_writeback(struct inode *inode,
+ struct bdi_writeback *wb)
{
DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
wait_queue_head_t *wqh;
@@ -324,15 +329,15 @@ static void inode_wait_for_writeback(struct inode *inode)
wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
while (inode->i_state & I_SYNC) {
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
}
}
/*
- * Write out an inode's dirty pages. Called under inode_wb_list_lock and
+ * Write out an inode's dirty pages. Called under wb->list_lock and
* inode->i_lock. Either the caller has an active reference on the inode or
* the inode has I_WILL_FREE set.
*
@@ -343,13 +348,15 @@ static void inode_wait_for_writeback(struct inode *inode)
* livelocks, etc.
*/
static int
-writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+ struct writeback_control *wbc)
{
struct address_space *mapping = inode->i_mapping;
+ long nr_to_write = wbc->nr_to_write;
unsigned dirty;
int ret;
- assert_spin_locked(&inode_wb_list_lock);
+ assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
if (!atomic_read(&inode->i_count))
@@ -367,14 +374,16 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* completed a full scan of b_io.
*/
if (wbc->sync_mode != WB_SYNC_ALL) {
- requeue_io(inode);
+ requeue_io(inode, wb);
+ trace_writeback_single_inode_requeue(inode, wbc,
+ nr_to_write);
return 0;
}
/*
* It's a data-integrity sync. We must wait.
*/
- inode_wait_for_writeback(inode);
+ inode_wait_for_writeback(inode, wb);
}
BUG_ON(inode->i_state & I_SYNC);
@@ -383,7 +392,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_state |= I_SYNC;
inode->i_state &= ~I_DIRTY_PAGES;
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
ret = do_writepages(mapping, wbc);
@@ -414,10 +423,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
ret = err;
}
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
inode->i_state &= ~I_SYNC;
if (!(inode->i_state & I_FREEING)) {
+ /*
+ * Sync livelock prevention. Each inode is tagged and synced in
+ * one shot. If still dirty, it will be redirty_tail()'ed below.
+ * Update the dirty time to prevent enqueue and sync it again.
+ */
+ if ((inode->i_state & I_DIRTY) &&
+ (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+ inode->dirtied_when = jiffies;
+
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
/*
* We didn't write back all the pages. nfs_writepages()
@@ -428,7 +446,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/*
* slice used up: queue for next turn
*/
- requeue_io(inode);
+ requeue_io(inode, wb);
} else {
/*
* Writeback blocked by something other than
@@ -437,7 +455,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* retrying writeback of the dirty page/inode
* that cannot be performed immediately.
*/
- redirty_tail(inode);
+ redirty_tail(inode, wb);
}
} else if (inode->i_state & I_DIRTY) {
/*
@@ -446,7 +464,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* submission or metadata updates after data IO
* completion.
*/
- redirty_tail(inode);
+ redirty_tail(inode, wb);
} else {
/*
* The inode is clean. At this point we either have
@@ -454,9 +472,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* No need to add it back to the LRU.
*/
list_del_init(&inode->i_wb_list);
+ wbc->inodes_written++;
}
}
inode_sync_complete(inode);
+ trace_writeback_single_inode(inode, wbc, nr_to_write);
return ret;
}
@@ -510,7 +530,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
* superblock, move all inodes not belonging
* to it back onto the dirty list.
*/
- redirty_tail(inode);
+ redirty_tail(inode, wb);
continue;
}
@@ -530,63 +550,44 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
- requeue_io(inode);
+ requeue_io(inode, wb);
continue;
}
- /*
- * Was this inode dirtied after sync_sb_inodes was called?
- * This keeps sync from extra jobs and livelock.
- */
- if (inode_dirtied_after(inode, wbc->wb_start)) {
- spin_unlock(&inode->i_lock);
- return 1;
- }
-
__iget(inode);
pages_skipped = wbc->pages_skipped;
- writeback_single_inode(inode, wbc);
+ writeback_single_inode(inode, wb, wbc);
if (wbc->pages_skipped != pages_skipped) {
/*
* writeback is not making progress due to locked
* buffers. Skip this inode for now.
*/
- redirty_tail(inode);
+ redirty_tail(inode, wb);
}
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
iput(inode);
cond_resched();
- spin_lock(&inode_wb_list_lock);
- if (wbc->nr_to_write <= 0) {
- wbc->more_io = 1;
+ spin_lock(&wb->list_lock);
+ if (wbc->nr_to_write <= 0)
return 1;
- }
- if (!list_empty(&wb->b_more_io))
- wbc->more_io = 1;
}
/* b_io is empty */
return 1;
}
-void writeback_inodes_wb(struct bdi_writeback *wb,
- struct writeback_control *wbc)
+static void __writeback_inodes_wb(struct bdi_writeback *wb,
+ struct writeback_control *wbc)
{
int ret = 0;
- if (!wbc->wb_start)
- wbc->wb_start = jiffies; /* livelock avoidance */
- spin_lock(&inode_wb_list_lock);
- if (!wbc->for_kupdate || list_empty(&wb->b_io))
- queue_io(wb, wbc->older_than_this);
-
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct super_block *sb = inode->i_sb;
if (!pin_sb_for_writeback(sb)) {
- requeue_io(inode);
+ requeue_io(inode, wb);
continue;
}
ret = writeback_sb_inodes(sb, wb, wbc, false);
@@ -595,20 +596,17 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
if (ret)
break;
}
- spin_unlock(&inode_wb_list_lock);
/* Leave any unwritten inodes on b_io */
}
-static void __writeback_inodes_sb(struct super_block *sb,
- struct bdi_writeback *wb, struct writeback_control *wbc)
+void writeback_inodes_wb(struct bdi_writeback *wb,
+ struct writeback_control *wbc)
{
- WARN_ON(!rwsem_is_locked(&sb->s_umount));
-
- spin_lock(&inode_wb_list_lock);
- if (!wbc->for_kupdate || list_empty(&wb->b_io))
+ spin_lock(&wb->list_lock);
+ if (list_empty(&wb->b_io))
queue_io(wb, wbc->older_than_this);
- writeback_sb_inodes(sb, wb, wbc, true);
- spin_unlock(&inode_wb_list_lock);
+ __writeback_inodes_wb(wb, wbc);
+ spin_unlock(&wb->list_lock);
}
/*
@@ -650,6 +648,7 @@ static long wb_writeback(struct bdi_writeback *wb,
{
struct writeback_control wbc = {
.sync_mode = work->sync_mode,
+ .tagged_writepages = work->tagged_writepages,
.older_than_this = NULL,
.for_kupdate = work->for_kupdate,
.for_background = work->for_background,
@@ -657,14 +656,9 @@ static long wb_writeback(struct bdi_writeback *wb,
};
unsigned long oldest_jif;
long wrote = 0;
- long write_chunk;
+ long write_chunk = MAX_WRITEBACK_PAGES;
struct inode *inode;
- if (wbc.for_kupdate) {
- wbc.older_than_this = &oldest_jif;
- oldest_jif = jiffies -
- msecs_to_jiffies(dirty_expire_interval * 10);
- }
if (!wbc.range_cyclic) {
wbc.range_start = 0;
wbc.range_end = LLONG_MAX;
@@ -678,17 +672,18 @@ static long wb_writeback(struct bdi_writeback *wb,
* The intended call sequence for WB_SYNC_ALL writeback is:
*
* wb_writeback()
- * __writeback_inodes_sb() <== called only once
+ * writeback_sb_inodes() <== called only once
* write_cache_pages() <== called once for each inode
* (quickly) tag currently dirty pages
* (maybe slowly) sync all tagged pages
*/
- if (wbc.sync_mode == WB_SYNC_NONE)
- write_chunk = MAX_WRITEBACK_PAGES;
- else
+ if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages)
write_chunk = LONG_MAX;
- wbc.wb_start = jiffies; /* livelock avoidance */
+ oldest_jif = jiffies;
+ wbc.older_than_this = &oldest_jif;
+
+ spin_lock(&wb->list_lock);
for (;;) {
/*
* Stop writeback when nr_pages has been consumed
@@ -713,50 +708,59 @@ static long wb_writeback(struct bdi_writeback *wb,
if (work->for_background && !over_bground_thresh())
break;
- wbc.more_io = 0;
+ if (work->for_kupdate) {
+ oldest_jif = jiffies -
+ msecs_to_jiffies(dirty_expire_interval * 10);
+ wbc.older_than_this = &oldest_jif;
+ }
+
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
+ wbc.inodes_written = 0;
trace_wbc_writeback_start(&wbc, wb->bdi);
+ if (list_empty(&wb->b_io))
+ queue_io(wb, wbc.older_than_this);
if (work->sb)
- __writeback_inodes_sb(work->sb, wb, &wbc);
+ writeback_sb_inodes(work->sb, wb, &wbc, true);
else
- writeback_inodes_wb(wb, &wbc);
+ __writeback_inodes_wb(wb, &wbc);
trace_wbc_writeback_written(&wbc, wb->bdi);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote += write_chunk - wbc.nr_to_write;
/*
- * If we consumed everything, see if we have more
+ * Did we write something? Try for more
+ *
+ * Dirty inodes are moved to b_io for writeback in batches.
+ * The completion of the current batch does not necessarily
+ * mean the overall work is done. So we keep looping as long
+ * as made some progress on cleaning pages or inodes.
*/
- if (wbc.nr_to_write <= 0)
+ if (wbc.nr_to_write < write_chunk)
+ continue;
+ if (wbc.inodes_written)
continue;
/*
- * Didn't write everything and we don't have more IO, bail
+ * No more inodes for IO, bail
*/
- if (!wbc.more_io)
+ if (list_empty(&wb->b_more_io))
break;
/*
- * Did we write something? Try for more
- */
- if (wbc.nr_to_write < write_chunk)
- continue;
- /*
* Nothing written. Wait for some inode to
* become available for writeback. Otherwise
* we'll just busyloop.
*/
- spin_lock(&inode_wb_list_lock);
if (!list_empty(&wb->b_more_io)) {
inode = wb_inode(wb->b_more_io.prev);
trace_wbc_writeback_wait(&wbc, wb->bdi);
spin_lock(&inode->i_lock);
- inode_wait_for_writeback(inode);
+ inode_wait_for_writeback(inode, wb);
spin_unlock(&inode->i_lock);
}
- spin_unlock(&inode_wb_list_lock);
}
+ spin_unlock(&wb->list_lock);
return wrote;
}
@@ -1089,10 +1093,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}
spin_unlock(&inode->i_lock);
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&bdi->wb.list_lock);
inode->dirtied_when = jiffies;
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&bdi->wb.list_lock);
if (wakeup_bdi)
bdi_wakeup_thread_delayed(bdi);
@@ -1188,10 +1192,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
{
DECLARE_COMPLETION_ONSTACK(done);
struct wb_writeback_work work = {
- .sb = sb,
- .sync_mode = WB_SYNC_NONE,
- .done = &done,
- .nr_pages = nr,
+ .sb = sb,
+ .sync_mode = WB_SYNC_NONE,
+ .tagged_writepages = 1,
+ .done = &done,
+ .nr_pages = nr,
};
WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1293,6 +1298,7 @@ EXPORT_SYMBOL(sync_inodes_sb);
*/
int write_inode_now(struct inode *inode, int sync)
{
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
int ret;
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
@@ -1305,11 +1311,11 @@ int write_inode_now(struct inode *inode, int sync)
wbc.nr_to_write = 0;
might_sleep();
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
- ret = writeback_single_inode(inode, &wbc);
+ ret = writeback_single_inode(inode, wb, &wbc);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
if (sync)
inode_sync_wait(inode);
return ret;
@@ -1329,13 +1335,14 @@ EXPORT_SYMBOL(write_inode_now);
*/
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
int ret;
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
- ret = writeback_single_inode(inode, wbc);
+ ret = writeback_single_inode(inode, wb, wbc);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
return ret;
}
EXPORT_SYMBOL(sync_inode);
diff --git a/fs/inode.c b/fs/inode.c
index e195ff89fb58..a1c36fc442e0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -37,7 +37,7 @@
* inode_lru, inode->i_lru
* inode_sb_list_lock protects:
* sb->s_inodes, inode->i_sb_list
- * inode_wb_list_lock protects:
+ * bdi->wb.list_lock protects:
* bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
* inode_hash_lock protects:
* inode_hashtable, inode->i_hash
@@ -48,7 +48,7 @@
* inode->i_lock
* inode_lru_lock
*
- * inode_wb_list_lock
+ * bdi->wb.list_lock
* inode->i_lock
*
* inode_hash_lock
@@ -68,7 +68,6 @@ static LIST_HEAD(inode_lru);
static DEFINE_SPINLOCK(inode_lru_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
/*
* iprune_sem provides exclusion between the icache shrinking and the
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 574ec0e86fbd..44a9a979788b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1582,8 +1582,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
int status;
bool sync = true;
- if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking ||
- wbc->for_background)
+ if (wbc->sync_mode == WB_SYNC_NONE)
sync = false;
status = pnfs_layoutcommit_inode(inode, sync);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 79ce38be15a1..7559861481aa 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -970,7 +970,7 @@ xfs_vm_writepage(
offset = page_offset(page);
type = IO_OVERWRITE;
- if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
+ if (wbc->sync_mode == WB_SYNC_NONE)
nonblocking = 1;
do {
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 96f4094b706d..47feb2c4706a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -57,6 +57,7 @@ struct bdi_writeback {
struct list_head b_dirty; /* dirty inodes */
struct list_head b_io; /* parked for writeback */
struct list_head b_more_io; /* parked for more writeback */
+ spinlock_t list_lock; /* protects the b_* lists */
};
struct backing_dev_info {
@@ -106,6 +107,7 @@ int bdi_writeback_thread(void *data);
int bdi_has_dirty_io(struct backing_dev_info *bdi);
void bdi_arm_supers_timer(void);
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 17e7ccc322a5..2f1b512bd6e0 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -9,8 +9,6 @@
struct backing_dev_info;
-extern spinlock_t inode_wb_list_lock;
-
/*
* fs/fs-writeback.c
*/
@@ -28,12 +26,10 @@ struct writeback_control {
enum writeback_sync_modes sync_mode;
unsigned long *older_than_this; /* If !NULL, only write back inodes
older than this */
- unsigned long wb_start; /* Time writeback_inodes_wb was
- called. This is needed to avoid
- extra jobs and livelock */
long nr_to_write; /* Write this many pages, and decrement
this for each page written */
long pages_skipped; /* Pages which were not written */
+ long inodes_written; /* # of inodes written (at least) */
/*
* For a_ops->writepages(): is start or end are non-zero then this is
@@ -43,13 +39,11 @@ struct writeback_control {
loff_t range_start;
loff_t range_end;
- unsigned nonblocking:1; /* Don't get stuck on request queues */
- unsigned encountered_congestion:1; /* An output: a queue is full */
unsigned for_kupdate:1; /* A kupdate writeback */
unsigned for_background:1; /* A background writeback */
+ unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */
unsigned for_reclaim:1; /* Invoked from the page allocator */
unsigned range_cyclic:1; /* range_start is cyclic */
- unsigned more_io:1; /* more io to be dispatched */
};
/*
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 4114129f0794..b31702ac15be 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -284,7 +284,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
__field( long, pages_skipped )
__field( loff_t, range_start )
__field( loff_t, range_end )
- __field( char, nonblocking )
__field( char, for_kupdate )
__field( char, for_reclaim )
__field( char, range_cyclic )
@@ -299,7 +298,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
__entry->pages_skipped = wbc->pages_skipped;
__entry->range_start = wbc->range_start;
__entry->range_end = wbc->range_end;
- __entry->nonblocking = wbc->nonblocking;
__entry->for_kupdate = wbc->for_kupdate;
__entry->for_reclaim = wbc->for_reclaim;
__entry->range_cyclic = wbc->range_cyclic;
@@ -310,13 +308,13 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, "
"nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, "
- "range_end = %llu, nonblocking = %d, for_kupdate = %d, "
+ "range_end = %llu, for_kupdate = %d, "
"for_reclaim = %d, range_cyclic = %d, writeback_index = %lu",
show_root_type(__entry->root_objectid),
(unsigned long)__entry->ino, __entry->index,
__entry->nr_to_write, __entry->pages_skipped,
__entry->range_start, __entry->range_end,
- __entry->nonblocking, __entry->for_kupdate,
+ __entry->for_kupdate,
__entry->for_reclaim, __entry->range_cyclic,
(unsigned long)__entry->writeback_index)
);
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 5ce2b2f5f524..6363193a3418 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -380,7 +380,6 @@ TRACE_EVENT(ext4_da_writepages_result,
__field( int, pages_written )
__field( long, pages_skipped )
__field( int, sync_mode )
- __field( char, more_io )
__field( pgoff_t, writeback_index )
),
@@ -391,16 +390,15 @@ TRACE_EVENT(ext4_da_writepages_result,
__entry->pages_written = pages_written;
__entry->pages_skipped = wbc->pages_skipped;
__entry->sync_mode = wbc->sync_mode;
- __entry->more_io = wbc->more_io;
__entry->writeback_index = inode->i_mapping->writeback_index;
),
TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
- " more_io %d sync_mode %d writeback_index %lu",
+ "sync_mode %d writeback_index %lu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino, __entry->ret,
__entry->pages_written, __entry->pages_skipped,
- __entry->more_io, __entry->sync_mode,
+ __entry->sync_mode,
(unsigned long) __entry->writeback_index)
);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 4e249b927eaa..205d14919ef2 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -8,6 +8,19 @@
#include <linux/device.h>
#include <linux/writeback.h>
+#define show_inode_state(state) \
+ __print_flags(state, "|", \
+ {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \
+ {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \
+ {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \
+ {I_NEW, "I_NEW"}, \
+ {I_WILL_FREE, "I_WILL_FREE"}, \
+ {I_FREEING, "I_FREEING"}, \
+ {I_CLEAR, "I_CLEAR"}, \
+ {I_SYNC, "I_SYNC"}, \
+ {I_REFERENCED, "I_REFERENCED"} \
+ )
+
struct wb_writeback_work;
DECLARE_EVENT_CLASS(writeback_work_class,
@@ -101,7 +114,6 @@ DECLARE_EVENT_CLASS(wbc_class,
__field(int, for_background)
__field(int, for_reclaim)
__field(int, range_cyclic)
- __field(int, more_io)
__field(unsigned long, older_than_this)
__field(long, range_start)
__field(long, range_end)
@@ -116,7 +128,6 @@ DECLARE_EVENT_CLASS(wbc_class,
__entry->for_background = wbc->for_background;
__entry->for_reclaim = wbc->for_reclaim;
__entry->range_cyclic = wbc->range_cyclic;
- __entry->more_io = wbc->more_io;
__entry->older_than_this = wbc->older_than_this ?
*wbc->older_than_this : 0;
__entry->range_start = (long)wbc->range_start;
@@ -124,7 +135,7 @@ DECLARE_EVENT_CLASS(wbc_class,
),
TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
- "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
+ "bgrd=%d reclm=%d cyclic=%d older=0x%lx "
"start=0x%lx end=0x%lx",
__entry->name,
__entry->nr_to_write,
@@ -134,7 +145,6 @@ DECLARE_EVENT_CLASS(wbc_class,
__entry->for_background,
__entry->for_reclaim,
__entry->range_cyclic,
- __entry->more_io,
__entry->older_than_this,
__entry->range_start,
__entry->range_end)
@@ -152,6 +162,31 @@ DEFINE_WBC_EVENT(wbc_balance_dirty_written);
DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
DEFINE_WBC_EVENT(wbc_writepage);
+TRACE_EVENT(writeback_queue_io,
+ TP_PROTO(struct bdi_writeback *wb,
+ unsigned long *older_than_this,
+ int moved),
+ TP_ARGS(wb, older_than_this, moved),
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(unsigned long, older)
+ __field(long, age)
+ __field(int, moved)
+ ),
+ TP_fast_assign(
+ strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+ __entry->older = older_than_this ? *older_than_this : 0;
+ __entry->age = older_than_this ?
+ (jiffies - *older_than_this) * 1000 / HZ : -1;
+ __entry->moved = moved;
+ ),
+ TP_printk("bdi %s: older=%lu age=%ld enqueue=%d",
+ __entry->name,
+ __entry->older, /* older_than_this in jiffies */
+ __entry->age, /* older_than_this in relative milliseconds */
+ __entry->moved)
+);
+
DECLARE_EVENT_CLASS(writeback_congest_waited_template,
TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
@@ -187,6 +222,63 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
TP_ARGS(usec_timeout, usec_delayed)
);
+DECLARE_EVENT_CLASS(writeback_single_inode_template,
+
+ TP_PROTO(struct inode *inode,
+ struct writeback_control *wbc,
+ unsigned long nr_to_write
+ ),
+
+ TP_ARGS(inode, wbc, nr_to_write),
+
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(unsigned long, ino)
+ __field(unsigned long, state)
+ __field(unsigned long, age)
+ __field(unsigned long, writeback_index)
+ __field(long, nr_to_write)
+ __field(unsigned long, wrote)
+ ),
+
+ TP_fast_assign(
+ strncpy(__entry->name,
+ dev_name(inode->i_mapping->backing_dev_info->dev), 32);
+ __entry->ino = inode->i_ino;
+ __entry->state = inode->i_state;
+ __entry->age = (jiffies - inode->dirtied_when) *
+ 1000 / HZ;
+ __entry->writeback_index = inode->i_mapping->writeback_index;
+ __entry->nr_to_write = nr_to_write;
+ __entry->wrote = nr_to_write - wbc->nr_to_write;
+ ),
+
+ TP_printk("bdi %s: ino=%lu state=%s age=%lu "
+ "index=%lu to_write=%ld wrote=%lu",
+ __entry->name,
+ __entry->ino,
+ show_inode_state(__entry->state),
+ __entry->age,
+ __entry->writeback_index,
+ __entry->nr_to_write,
+ __entry->wrote
+ )
+);
+
+DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue,
+ TP_PROTO(struct inode *inode,
+ struct writeback_control *wbc,
+ unsigned long nr_to_write),
+ TP_ARGS(inode, wbc, nr_to_write)
+);
+
+DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
+ TP_PROTO(struct inode *inode,
+ struct writeback_control *wbc,
+ unsigned long nr_to_write),
+ TP_ARGS(inode, wbc, nr_to_write)
+);
+
#endif /* _TRACE_WRITEBACK_H */
/* This part must be outside protection */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f032e6e1e09a..5f6553ef1ba7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+{
+ if (wb1 < wb2) {
+ spin_lock(&wb1->list_lock);
+ spin_lock_nested(&wb2->list_lock, 1);
+ } else {
+ spin_lock(&wb2->list_lock);
+ spin_lock_nested(&wb1->list_lock, 1);
+ }
+}
+
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -67,14 +78,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
struct inode *inode;
nr_dirty = nr_io = nr_more_io = 0;
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -628,6 +639,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
+ spin_lock_init(&wb->list_lock);
setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}
@@ -676,11 +688,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb;
- spin_lock(&inode_wb_list_lock);
+ bdi_lock_two(&bdi->wb, dst);
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&bdi->wb.list_lock);
+ spin_unlock(&dst->list_lock);
}
bdi_unregister(bdi);
diff --git a/mm/filemap.c b/mm/filemap.c
index cfac89d95ec1..2a2f5e8880ae 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -81,7 +81,7 @@
* ->i_mutex
* ->i_alloc_sem (various)
*
- * inode_wb_list_lock
+ * bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
@@ -99,9 +99,9 @@
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
- * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
+ * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * inode_wb_list_lock (zap_pte_range->set_page_dirty)
+ * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 31f698862420..b2529f8f8be0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -244,13 +244,8 @@ void task_dirty_inc(struct task_struct *tsk)
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
long *numerator, long *denominator)
{
- if (bdi_cap_writeback_dirty(bdi)) {
- prop_fraction_percpu(&vm_completions, &bdi->completions,
+ prop_fraction_percpu(&vm_completions, &bdi->completions,
numerator, denominator);
- } else {
- *numerator = 0;
- *denominator = 1;
- }
}
static inline void task_dirties_fraction(struct task_struct *tsk,
@@ -437,10 +432,17 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
*pdirty = dirty;
}
-/*
+/**
* bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ * @bdi: the backing_dev_info to query
+ * @dirty: global dirty limit in pages
*
- * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+ * And the "limit" in the name is not seriously taken as hard limit in
+ * balance_dirty_pages().
+ *
+ * It allocates high/low dirty limits to fast/slow devices, in order to prevent
* - starving fast devices
* - piling up dirty pages (that will take long time to sync) on slow devices
*
@@ -488,6 +490,9 @@ static void balance_dirty_pages(struct address_space *mapping,
bool dirty_exceeded = false;
struct backing_dev_info *bdi = mapping->backing_dev_info;
+ if (!bdi_cap_account_dirty(bdi))
+ return;
+
for (;;) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
@@ -892,12 +897,12 @@ int write_cache_pages(struct address_space *mapping,
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
}
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
retry:
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && (index <= end)) {
diff --git a/mm/rmap.c b/mm/rmap.c
index 23295f65ae43..67528a2661b8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,11 +32,11 @@
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
- * within inode_wb_list_lock in __sync_single_inode)
+ * within bdi.wb->list_lock in __sync_single_inode)
*
* anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
* ->tasklist_lock