15 files changed, 509 insertions, 239 deletions
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 754f38f8a692..638dae048b4f 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/workqueue.h>
+#include <linux/delay.h>
 #include <scsi/scsi_dh.h>
 #include <linux/atomic.h>
 
@@ -61,11 +62,11 @@ struct multipath {
 	struct list_head list;
 	struct dm_target *ti;
 
-	spinlock_t lock;
-
 	const char *hw_handler_name;
 	char *hw_handler_params;
 
+	spinlock_t lock;
+
 	unsigned nr_priority_groups;
 	struct list_head priority_groups;
 
@@ -81,16 +82,17 @@ struct multipath {
 	struct priority_group *next_pg;	/* Switch to this PG if set */
 	unsigned repeat_count;		/* I/Os left before calling PS again */
 
-	unsigned queue_io;		/* Must we queue all I/O? */
-	unsigned queue_if_no_path;	/* Queue I/O if last path fails? */
-	unsigned saved_queue_if_no_path;/* Saved state during suspension */
+	unsigned queue_io:1;		/* Must we queue all I/O? */
+	unsigned queue_if_no_path:1;	/* Queue I/O if last path fails? */
+	unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
+
 	unsigned pg_init_retries;	/* Number of times to retry pg_init */
 	unsigned pg_init_count;		/* Number of times pg_init called */
 	unsigned pg_init_delay_msecs;	/* Number of msecs before pg_init retry */
 
+	unsigned queue_size;
 	struct work_struct process_queued_ios;
 	struct list_head queued_ios;
-	unsigned queue_size;
 
 	struct work_struct trigger_event;
 
@@ -328,14 +330,18 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 	/*
 	 * Loop through priority groups until we find a valid path.
 	 * First time we skip PGs marked 'bypassed'.
-	 * Second time we only try the ones we skipped.
+	 * Second time we only try the ones we skipped, but set
+	 * pg_init_delay_retry so we do not hammer controllers.
 	 */
 	do {
 		list_for_each_entry(pg, &m->priority_groups, list) {
 			if (pg->bypassed == bypassed)
 				continue;
-			if (!__choose_path_in_pg(m, pg, nr_bytes))
+			if (!__choose_path_in_pg(m, pg, nr_bytes)) {
+				if (!bypassed)
+					m->pg_init_delay_retry = 1;
 				return;
+			}
 		}
 	} while (bypassed--);
 
@@ -481,9 +487,6 @@ static void process_queued_ios(struct work_struct *work)
 
 	spin_lock_irqsave(&m->lock, flags);
 
-	if (!m->queue_size)
-		goto out;
-
 	if (!m->current_pgpath)
 		__choose_pgpath(m, 0);
 
@@ -496,7 +499,6 @@ static void process_queued_ios(struct work_struct *work)
 	if (m->pg_init_required && !m->pg_init_in_progress && pgpath)
 		__pg_init_all_paths(m);
 
-out:
 	spin_unlock_irqrestore(&m->lock, flags);
 	if (!must_queue)
 		dispatch_queued_ios(m);
@@ -1517,11 +1519,16 @@ out:
 static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 			   unsigned long arg)
 {
-	struct multipath *m = (struct multipath *) ti->private;
-	struct block_device *bdev = NULL;
-	fmode_t mode = 0;
+	struct multipath *m = ti->private;
+	struct block_device *bdev;
+	fmode_t mode;
 	unsigned long flags;
-	int r = 0;
+	int r;
+
+again:
+	bdev = NULL;
+	mode = 0;
+	r = 0;
 
 	spin_lock_irqsave(&m->lock, flags);
 
@@ -1546,6 +1553,12 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 	if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
 		r = scsi_verify_blk_ioctl(NULL, cmd);
 
+	if (r == -EAGAIN && !fatal_signal_pending(current)) {
+		queue_work(kmultipathd, &m->process_queued_ios);
+		msleep(10);
+		goto again;
+	}
+
 	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 }
 
@@ -1643,7 +1656,7 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 3, 0},
+	.version = {1, 4, 0},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d039de8322f0..b58b7a33914a 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1084,6 +1084,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->split_io = dm_rh_get_region_size(ms->rh);
 	ti->num_flush_requests = 1;
 	ti->num_discard_requests = 1;
+	ti->discard_zeroes_data_unsupported = 1;
 
 	ms->kmirrord_wq = alloc_workqueue("kmirrord",
 					  WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
@@ -1214,7 +1215,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		if (!(bio->bi_rw & REQ_FLUSH))
+		if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)))
 			dm_rh_dec(ms->rh, map_context->ll);
 		return error;
 	}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 7771ed212182..69732e03eb34 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -404,6 +404,9 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
 		return;
 	}
 
+	if (bio->bi_rw & REQ_DISCARD)
+		return;
+
 	/* We must inform the log that the sync count has changed. */
 	log->type->set_region_sync(log, region, 0);
 
@@ -524,7 +527,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
 	struct bio *bio;
 
 	for (bio = bios->head; bio; bio = bio->bi_next) {
-		if (bio->bi_rw & REQ_FLUSH)
+		if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))
 			continue;
 		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
 	}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 737d38865b69..3e2907f0bc46 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1082,12 +1082,89 @@ int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
 	return 0;
 }
 
-static int __get_held_metadata_root(struct dm_pool_metadata *pmd,
-				    dm_block_t *result)
+static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
+{
+	int r, inc;
+	struct thin_disk_superblock *disk_super;
+	struct dm_block *copy, *sblock;
+	dm_block_t held_root;
+
+	/*
+	 * Copy the superblock.
+	 */
+	dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
+	r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
+			       &sb_validator, &copy, &inc);
+	if (r)
+		return r;
+
+	BUG_ON(!inc);
+
+	held_root = dm_block_location(copy);
+	disk_super = dm_block_data(copy);
+
+	if (le64_to_cpu(disk_super->held_root)) {
+		DMWARN("Pool metadata snapshot already exists: release this before taking another.");
+
+		dm_tm_dec(pmd->tm, held_root);
+		dm_tm_unlock(pmd->tm, copy);
+		pmd->need_commit = 1;
+
+		return -EBUSY;
+	}
+
+	/*
+	 * Wipe the spacemap since we're not publishing this.
+	 */
+	memset(&disk_super->data_space_map_root, 0,
+	       sizeof(disk_super->data_space_map_root));
+	memset(&disk_super->metadata_space_map_root, 0,
+	       sizeof(disk_super->metadata_space_map_root));
+
+	/*
+	 * Increment the data structures that need to be preserved.
+	 */
+	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
+	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
+	dm_tm_unlock(pmd->tm, copy);
+
+	/*
+	 * Write the held root into the superblock.
+	 */
+	r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+			     &sb_validator, &sblock);
+	if (r) {
+		dm_tm_dec(pmd->tm, held_root);
+		pmd->need_commit = 1;
+		return r;
+	}
+
+	disk_super = dm_block_data(sblock);
+	disk_super->held_root = cpu_to_le64(held_root);
+	dm_bm_unlock(sblock);
+
+	pmd->need_commit = 1;
+
+	return 0;
+}
+
+int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
+{
+	int r;
+
+	down_write(&pmd->root_lock);
+	r = __reserve_metadata_snap(pmd);
+	up_write(&pmd->root_lock);
+
+	return r;
+}
+
+static int __release_metadata_snap(struct dm_pool_metadata *pmd)
 {
 	int r;
 	struct thin_disk_superblock *disk_super;
-	struct dm_block *sblock;
+	struct dm_block *sblock, *copy;
+	dm_block_t held_root;
 
 	r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 			     &sb_validator, &sblock);
@@ -1095,18 +1172,65 @@ static int __get_held_metadata_root(struct dm_pool_metadata *pmd,
 		return r;
 
 	disk_super = dm_block_data(sblock);
+	held_root = le64_to_cpu(disk_super->held_root);
+	disk_super->held_root = cpu_to_le64(0);
+	pmd->need_commit = 1;
+
+	dm_bm_unlock(sblock);
+
+	if (!held_root) {
+		DMWARN("No pool metadata snapshot found: nothing to release.");
+		return -EINVAL;
+	}
+
+	r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(copy);
+	dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root));
+	dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root));
+	dm_sm_dec_block(pmd->metadata_sm, held_root);
+
+	return dm_tm_unlock(pmd->tm, copy);
+}
+
+int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
+{
+	int r;
+
+	down_write(&pmd->root_lock);
+	r = __release_metadata_snap(pmd);
+	up_write(&pmd->root_lock);
+
+	return r;
+}
+
+static int __get_metadata_snap(struct dm_pool_metadata *pmd,
+			       dm_block_t *result)
+{
+	int r;
+	struct thin_disk_superblock *disk_super;
+	struct dm_block *sblock;
+
+	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+			    &sb_validator, &sblock);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(sblock);
 	*result = le64_to_cpu(disk_super->held_root);
 
 	return dm_bm_unlock(sblock);
 }
 
-int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd,
-				   dm_block_t *result)
+int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
+			      dm_block_t *result)
 {
 	int r;
 
 	down_read(&pmd->root_lock);
-	r = __get_held_metadata_root(pmd, result);
+	r = __get_metadata_snap(pmd, result);
 	up_read(&pmd->root_lock);
 
 	return r;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index ed4725e67c96..b88918ccdaf6 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -90,11 +90,18 @@ int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
 
 /*
  * Hold/get root for userspace transaction.
+ *
+ * The metadata snapshot is a copy of the current superblock (minus the
+ * space maps).  Userland can access the data structures for READ
+ * operations only.  A small performance hit is incurred by providing this
+ * copy of the metadata to userland due to extra copy-on-write operations
+ * on the metadata nodes.  Release this as soon as you finish with it.
  */
-int dm_pool_hold_metadata_root(struct dm_pool_metadata *pmd);
+int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd);
+int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd);
 
-int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd,
-				   dm_block_t *result);
+int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
+			      dm_block_t *result);
 
 /*
  * Actions on a single virtual device.
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index eb3d138ff55a..68694da0d21d 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -111,7 +111,7 @@ struct cell_key {
 	dm_block_t block;
 };
 
-struct cell {
+struct dm_bio_prison_cell {
 	struct hlist_node list;
 	struct bio_prison *prison;
 	struct cell_key key;
@@ -141,6 +141,8 @@ static uint32_t calc_nr_buckets(unsigned nr_cells)
 	return n;
 }
 
+static struct kmem_cache *_cell_cache;
+
 /*
  * @nr_cells should be the number of cells you want in use _concurrently_.
  * Don't confuse it with the number of distinct keys.
@@ -157,8 +159,7 @@ static struct bio_prison *prison_create(unsigned nr_cells)
 		return NULL;
 
 	spin_lock_init(&prison->lock);
-	prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
-							sizeof(struct cell));
+	prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
 	if (!prison->cell_pool) {
 		kfree(prison);
 		return NULL;
@@ -194,10 +195,10 @@ static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
 		       (lhs->block == rhs->block);
 }
 
-static struct cell *__search_bucket(struct hlist_head *bucket,
-				    struct cell_key *key)
+static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
+						  struct cell_key *key)
 {
-	struct cell *cell;
+	struct dm_bio_prison_cell *cell;
 	struct hlist_node *tmp;
 
 	hlist_for_each_entry(cell, tmp, bucket, list)
@@ -214,12 +215,12 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
  * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
  */
 static int bio_detain(struct bio_prison *prison, struct cell_key *key,
-		      struct bio *inmate, struct cell **ref)
+		      struct bio *inmate, struct dm_bio_prison_cell **ref)
 {
 	int r = 1;
 	unsigned long flags;
 	uint32_t hash = hash_key(prison, key);
-	struct cell *cell, *cell2;
+	struct dm_bio_prison_cell *cell, *cell2;
 
 	BUG_ON(hash > prison->nr_buckets);
 
@@ -273,7 +274,7 @@ out:
 /*
  * @inmates must have been initialised prior to this call
  */
-static void __cell_release(struct cell *cell, struct bio_list *inmates)
+static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
 {
 	struct bio_prison *prison = cell->prison;
 
@@ -287,7 +288,7 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
 	mempool_free(cell, prison->cell_pool);
 }
 
-static void cell_release(struct cell *cell, struct bio_list *bios)
+static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
 {
 	unsigned long flags;
 	struct bio_prison *prison = cell->prison;
@@ -303,7 +304,7 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
  * bio may be in the cell.  This function releases the cell, and also does
  * a sanity check.
  */
-static void __cell_release_singleton(struct cell *cell, struct bio *bio)
+static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
 {
 	BUG_ON(cell->holder != bio);
 	BUG_ON(!bio_list_empty(&cell->bios));
@@ -311,7 +312,7 @@ static void __cell_release_singleton(struct cell *cell, struct bio *bio)
 	__cell_release(cell, NULL);
 }
 
-static void cell_release_singleton(struct cell *cell, struct bio *bio)
+static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
 {
 	unsigned long flags;
 	struct bio_prison *prison = cell->prison;
@@ -324,7 +325,8 @@ static void cell_release_singleton(struct cell *cell, struct bio *bio)
 /*
  * Sometimes we don't want the holder, just the additional bios.
  */
-static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
+				     struct bio_list *inmates)
 {
 	struct bio_prison *prison = cell->prison;
 
@@ -334,7 +336,8 @@ static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates
 	mempool_free(cell, prison->cell_pool);
 }
 
-static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+static void cell_release_no_holder(struct dm_bio_prison_cell *cell,
+				   struct bio_list *inmates)
 {
 	unsigned long flags;
 	struct bio_prison *prison = cell->prison;
@@ -344,7 +347,7 @@ static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
 	spin_unlock_irqrestore(&prison->lock, flags);
 }
 
-static void cell_error(struct cell *cell)
+static void cell_error(struct dm_bio_prison_cell *cell)
 {
 	struct bio_prison *prison = cell->prison;
 	struct bio_list bios;
@@ -491,7 +494,7 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
  * also provides the interface for creating and destroying internal
  * devices.
  */
-struct new_mapping;
+struct dm_thin_new_mapping;
 
 struct pool_features {
 	unsigned zero_new_blocks:1;
@@ -537,7 +540,7 @@ struct pool {
 	struct deferred_set shared_read_ds;
 	struct deferred_set all_io_ds;
 
-	struct new_mapping *next_mapping;
+	struct dm_thin_new_mapping *next_mapping;
 	mempool_t *mapping_pool;
 	mempool_t *endio_hook_pool;
 };
@@ -630,11 +633,11 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
 
 /*----------------------------------------------------------------*/
 
-struct endio_hook {
+struct dm_thin_endio_hook {
 	struct thin_c *tc;
 	struct deferred_entry *shared_read_entry;
 	struct deferred_entry *all_io_entry;
-	struct new_mapping *overwrite_mapping;
+	struct dm_thin_new_mapping *overwrite_mapping;
 };
 
 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
@@ -647,7 +650,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 	bio_list_init(master);
 
 	while ((bio = bio_list_pop(&bios))) {
-		struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+
 		if (h->tc == tc)
 			bio_endio(bio, DM_ENDIO_REQUEUE);
 		else
@@ -736,7 +740,7 @@ static void wake_worker(struct pool *pool)
 /*
  * Bio endio functions.
  */
-struct new_mapping {
+struct dm_thin_new_mapping {
 	struct list_head list;
 
 	unsigned quiesced:1;
@@ -746,7 +750,7 @@ struct new_mapping {
 	struct thin_c *tc;
 	dm_block_t virt_block;
 	dm_block_t data_block;
-	struct cell *cell, *cell2;
+	struct dm_bio_prison_cell *cell, *cell2;
 	int err;
 
 	/*
@@ -759,7 +763,7 @@ struct new_mapping {
 	bio_end_io_t *saved_bi_end_io;
 };
 
-static void __maybe_add_mapping(struct new_mapping *m)
+static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
 {
 	struct pool *pool = m->tc->pool;
 
@@ -772,7 +776,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
 static void copy_complete(int read_err, unsigned long write_err, void *context)
 {
 	unsigned long flags;
-	struct new_mapping *m = context;
+	struct dm_thin_new_mapping *m = context;
 	struct pool *pool = m->tc->pool;
 
 	m->err = read_err || write_err ? -EIO : 0;
@@ -786,8 +790,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
 static void overwrite_endio(struct bio *bio, int err)
 {
 	unsigned long flags;
-	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
-	struct new_mapping *m = h->overwrite_mapping;
+	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+	struct dm_thin_new_mapping *m = h->overwrite_mapping;
 	struct pool *pool = m->tc->pool;
 
 	m->err = err;
@@ -811,7 +815,7 @@ static void overwrite_endio(struct bio *bio, int err)
 /*
  * This sends the bios in the cell back to the deferred_bios list.
  */
-static void cell_defer(struct thin_c *tc, struct cell *cell,
+static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
 		       dm_block_t data_block)
 {
 	struct pool *pool = tc->pool;
@@ -828,7 +832,7 @@ static void cell_defer(struct thin_c *tc, struct cell *cell,
  * Same as cell_defer above, except it omits one particular detainee,
  * a write bio that covers the block and has already been processed.
  */
-static void cell_defer_except(struct thin_c *tc, struct cell *cell)
+static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 {
 	struct bio_list bios;
 	struct pool *pool = tc->pool;
@@ -843,7 +847,7 @@ static void cell_defer_except(struct thin_c *tc, struct cell *cell)
 	wake_worker(pool);
 }
 
-static void process_prepared_mapping(struct new_mapping *m)
+static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
 	struct thin_c *tc = m->tc;
 	struct bio *bio;
@@ -886,7 +890,7 @@ static void process_prepared_mapping(struct new_mapping *m)
 	mempool_free(m, tc->pool->mapping_pool);
 }
 
-static void process_prepared_discard(struct new_mapping *m)
+static void process_prepared_discard(struct dm_thin_new_mapping *m)
 {
 	int r;
 	struct thin_c *tc = m->tc;
@@ -909,11 +913,11 @@ static void process_prepared_discard(struct new_mapping *m)
 }
 
 static void process_prepared(struct pool *pool, struct list_head *head,
-			     void (*fn)(struct new_mapping *))
+			     void (*fn)(struct dm_thin_new_mapping *))
 {
 	unsigned long flags;
 	struct list_head maps;
-	struct new_mapping *m, *tmp;
+	struct dm_thin_new_mapping *m, *tmp;
 
 	INIT_LIST_HEAD(&maps);
 	spin_lock_irqsave(&pool->lock, flags);
@@ -957,9 +961,9 @@ static int ensure_next_mapping(struct pool *pool)
 	return pool->next_mapping ? 0 : -ENOMEM;
 }
 
-static struct new_mapping *get_next_mapping(struct pool *pool)
+static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
 {
-	struct new_mapping *r = pool->next_mapping;
+	struct dm_thin_new_mapping *r = pool->next_mapping;
 
 	BUG_ON(!pool->next_mapping);
 
@@ -971,11 +975,11 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 			  struct dm_dev *origin, dm_block_t data_origin,
 			  dm_block_t data_dest,
-			  struct cell *cell, struct bio *bio)
+			  struct dm_bio_prison_cell *cell, struct bio *bio)
 {
 	int r;
 	struct pool *pool = tc->pool;
-	struct new_mapping *m = get_next_mapping(pool);
+	struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
 	INIT_LIST_HEAD(&m->list);
 	m->quiesced = 0;
@@ -997,7 +1001,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 	 * bio immediately. Otherwise we use kcopyd to clone the data first.
 	 */
 	if (io_overwrites_block(pool, bio)) {
-		struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+
 		h->overwrite_mapping = m;
 		m->bio = bio;
 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
@@ -1025,7 +1030,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 
 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
 				   dm_block_t data_origin, dm_block_t data_dest,
-				   struct cell *cell, struct bio *bio)
+				   struct dm_bio_prison_cell *cell, struct bio *bio)
 {
 	schedule_copy(tc, virt_block, tc->pool_dev,
 		      data_origin, data_dest, cell, bio);
@@ -1033,18 +1038,18 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
 
 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
 				   dm_block_t data_dest,
-				   struct cell *cell, struct bio *bio)
+				   struct dm_bio_prison_cell *cell, struct bio *bio)
 {
 	schedule_copy(tc, virt_block, tc->origin_dev,
 		      virt_block, data_dest, cell, bio);
 }
 
 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
-			  dm_block_t data_block, struct cell *cell,
+			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
 			  struct bio *bio)
 {
 	struct pool *pool = tc->pool;
-	struct new_mapping *m = get_next_mapping(pool);
+	struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
 	INIT_LIST_HEAD(&m->list);
 	m->quiesced = 1;
@@ -1065,12 +1070,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 		process_prepared_mapping(m);
 
 	else if (io_overwrites_block(pool, bio)) {
-		struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+
 		h->overwrite_mapping = m;
 		m->bio = bio;
 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 		remap_and_issue(tc, bio, data_block);
-
 	} else {
 		int r;
 		struct dm_io_region to;
@@ -1155,7 +1160,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
  */
 static void retry_on_resume(struct bio *bio)
 {
-	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
 	struct thin_c *tc = h->tc;
 	struct pool *pool = tc->pool;
 	unsigned long flags;
@@ -1165,7 +1170,7 @@ static void retry_on_resume(struct bio *bio)
 	spin_unlock_irqrestore(&pool->lock, flags);
 }
 
-static void no_space(struct cell *cell)
+static void no_space(struct dm_bio_prison_cell *cell)
 {
 	struct bio *bio;
 	struct bio_list bios;
@@ -1182,11 +1187,11 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 	int r;
 	unsigned long flags;
 	struct pool *pool = tc->pool;
-	struct cell *cell, *cell2;
+	struct dm_bio_prison_cell *cell, *cell2;
 	struct cell_key key, key2;
 	dm_block_t block = get_bio_block(tc, bio);
 	struct dm_thin_lookup_result lookup_result;
-	struct new_mapping *m;
+	struct dm_thin_new_mapping *m;
 
 	build_virtual_key(tc->td, block, &key);
 	if (bio_detain(tc->pool->prison, &key, bio, &cell))
@@ -1240,7 +1245,10 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 
 			cell_release_singleton(cell, bio);
 			cell_release_singleton(cell2, bio);
-			remap_and_issue(tc, bio, lookup_result.block);
+			if ((!lookup_result.shared) && pool->pf.discard_passdown)
+				remap_and_issue(tc, bio, lookup_result.block);
+			else
+				bio_endio(bio, 0);
 		}
 		break;
 
@@ -1263,7 +1271,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
 			  struct cell_key *key,
 			  struct dm_thin_lookup_result *lookup_result,
-			  struct cell *cell)
+			  struct dm_bio_prison_cell *cell)
 {
 	int r;
 	dm_block_t data_block;
@@ -1290,7 +1298,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
 			       dm_block_t block,
 			       struct dm_thin_lookup_result *lookup_result)
 {
-	struct cell *cell;
+	struct dm_bio_prison_cell *cell;
 	struct pool *pool = tc->pool;
 	struct cell_key key;
 
@@ -1305,7 +1313,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
 	if (bio_data_dir(bio) == WRITE)
 		break_sharing(tc, bio, block, &key, lookup_result, cell);
 	else {
-		struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
 
 		h->shared_read_entry = ds_inc(&pool->shared_read_ds);
 
@@ -1315,7 +1323,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
 }
 
 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
-			    struct cell *cell)
+			    struct dm_bio_prison_cell *cell)
 {
 	int r;
 	dm_block_t data_block;
@@ -1363,7 +1371,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 {
 	int r;
 	dm_block_t block = get_bio_block(tc, bio);
-	struct cell *cell;
+	struct dm_bio_prison_cell *cell;
 	struct cell_key key;
 	struct dm_thin_lookup_result lookup_result;
 
@@ -1432,7 +1440,7 @@ static void process_deferred_bios(struct pool *pool)
 	spin_unlock_irqrestore(&pool->lock, flags);
 
 	while ((bio = bio_list_pop(&bios))) {
-		struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
 		struct thin_c *tc = h->tc;
 
 		/*
@@ -1522,10 +1530,10 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
 	wake_worker(pool);
 }
 
-static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
+static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
 {
 	struct pool *pool = tc->pool;
-	struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+	struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
 
 	h->tc = tc;
 	h->shared_read_entry = NULL;
@@ -1687,6 +1695,9 @@ static void __pool_destroy(struct pool *pool)
 	kfree(pool);
 }
 
+static struct kmem_cache *_new_mapping_cache;
+static struct kmem_cache *_endio_hook_cache;
+
 static struct pool *pool_create(struct mapped_device *pool_md,
 				struct block_device *metadata_dev,
 				unsigned long block_size, char **error)
@@ -1755,16 +1766,16 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 	ds_init(&pool->all_io_ds);
 
 	pool->next_mapping = NULL;
-	pool->mapping_pool =
-		mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
+	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
+						      _new_mapping_cache);
 	if (!pool->mapping_pool) {
 		*error = "Error creating pool's mapping mempool";
 		err_p = ERR_PTR(-ENOMEM);
 		goto bad_mapping_pool;
 	}
 
-	pool->endio_hook_pool =
-		mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
+	pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
+							 _endio_hook_cache);
 	if (!pool->endio_hook_pool) {
 		*error = "Error creating pool's endio_hook mempool";
 		err_p = ERR_PTR(-ENOMEM);
@@ -2276,6 +2287,43 @@ static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct po
 	return 0;
 }
 
+static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+	int r;
+
+	r = check_arg_count(argc, 1);
+	if (r)
+		return r;
+
+	r = dm_pool_commit_metadata(pool->pmd);
+	if (r) {
+		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
+		      __func__, r);
+		return r;
+	}
+
+	r = dm_pool_reserve_metadata_snap(pool->pmd);
+	if (r)
+		DMWARN("reserve_metadata_snap message failed.");
+
+	return r;
+}
+
+static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+	int r;
+
+	r = check_arg_count(argc, 1);
+	if (r)
+		return r;
+
+	r = dm_pool_release_metadata_snap(pool->pmd);
+	if (r)
+		DMWARN("release_metadata_snap message failed.");
+
+	return r;
+}
+
 /*
  * Messages supported:
  *   create_thin	<dev_id>
@@ -2283,6 +2331,8 @@ static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct po
  *   delete		<dev_id>
  *   trim		<dev_id> <new_size_in_sectors>
  *   set_transaction_id <current_trans_id> <new_trans_id>
+ *   reserve_metadata_snap
+ *   release_metadata_snap
  */
 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
 {
@@ -2302,6 +2352,12 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
 	else if (!strcasecmp(argv[0], "set_transaction_id"))
 		r = process_set_transaction_id_mesg(argc, argv, pool);
 
+	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
+		r = process_reserve_metadata_snap_mesg(argc, argv, pool);
+
+	else if (!strcasecmp(argv[0], "release_metadata_snap"))
+		r = process_release_metadata_snap_mesg(argc, argv, pool);
+
 	else
 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
 
@@ -2361,7 +2417,7 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 		if (r)
 			return r;
 
-		r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
+		r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
 		if (r)
 			return r;
 
@@ -2457,7 +2513,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 1, 0},
+	.version = {1, 2, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -2575,6 +2631,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = 1;
 		ti->num_discard_requests = 1;
+		ti->discard_zeroes_data_unsupported = 1;
 	}
 
 	dm_put(pool_md);
@@ -2613,9 +2670,9 @@ static int thin_endio(struct dm_target *ti,
 		      union map_info *map_context)
 {
 	unsigned long flags;
-	struct endio_hook *h = map_context->ptr;
+	struct dm_thin_endio_hook *h = map_context->ptr;
 	struct list_head work;
-	struct new_mapping *m, *tmp;
+	struct dm_thin_new_mapping *m, *tmp;
 	struct pool *pool = h->tc->pool;
 
 	if (h->shared_read_entry) {
@@ -2755,7 +2812,32 @@ static int __init dm_thin_init(void)
 
 	r = dm_register_target(&pool_target);
 	if (r)
-		dm_unregister_target(&thin_target);
+		goto bad_pool_target;
+
+	r = -ENOMEM;
+
+	_cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
+	if (!_cell_cache)
+		goto bad_cell_cache;
+
+	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
+	if (!_new_mapping_cache)
+		goto bad_new_mapping_cache;
+
+	_endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
+	if (!_endio_hook_cache)
+		goto bad_endio_hook_cache;
+
+	return 0;
+
+bad_endio_hook_cache:
+	kmem_cache_destroy(_new_mapping_cache);
+bad_new_mapping_cache:
+	kmem_cache_destroy(_cell_cache);
+bad_cell_cache:
+	dm_unregister_target(&pool_target);
+bad_pool_target:
+	dm_unregister_target(&thin_target);
 
 	return r;
 }
@@ -2764,6 +2846,10 @@ static void dm_thin_exit(void)
 {
 	dm_unregister_target(&thin_target);
 	dm_unregister_target(&pool_target);
+
+	kmem_cache_destroy(_cell_cache);
+	kmem_cache_destroy(_new_mapping_cache);
+	kmem_cache_destroy(_endio_hook_cache);
 }
 
 module_init(dm_thin_init);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1c2f9048e1ae..db02d2efb76f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -498,61 +498,13 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
 }
 EXPORT_SYMBOL(md_flush_request);
 
-/* Support for plugging.
- * This mirrors the plugging support in request_queue, but does not
- * require having a whole queue or request structures.
- * We allocate an md_plug_cb for each md device and each thread it gets
- * plugged on.  This links tot the private plug_handle structure in the
- * personality data where we keep a count of the number of outstanding
- * plugs so other code can see if a plug is active.
- */
-struct md_plug_cb {
-	struct blk_plug_cb cb;
-	struct mddev *mddev;
-};
-
-static void plugger_unplug(struct blk_plug_cb *cb)
+void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
 {
-	struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
-	if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
-		md_wakeup_thread(mdcb->mddev->thread);
-	kfree(mdcb);
-}
-
-/* Check that an unplug wakeup will come shortly.
- * If not, wakeup the md thread immediately
- */
-int mddev_check_plugged(struct mddev *mddev)
-{
-	struct blk_plug *plug = current->plug;
-	struct md_plug_cb *mdcb;
-
-	if (!plug)
-		return 0;
-
-	list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
-		if (mdcb->cb.callback == plugger_unplug &&
-		    mdcb->mddev == mddev) {
-			/* Already on the list, move to top */
-			if (mdcb != list_first_entry(&plug->cb_list,
-						    struct md_plug_cb,
-						    cb.list))
-				list_move(&mdcb->cb.list, &plug->cb_list);
-			return 1;
-		}
-	}
-	/* Not currently on the callback list */
-	mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
-	if (!mdcb)
-		return 0;
-
-	mdcb->mddev = mddev;
-	mdcb->cb.callback = plugger_unplug;
-	atomic_inc(&mddev->plug_cnt);
-	list_add(&mdcb->cb.list, &plug->cb_list);
-	return 1;
+	struct mddev *mddev = cb->data;
+	md_wakeup_thread(mddev->thread);
+	kfree(cb);
 }
-EXPORT_SYMBOL_GPL(mddev_check_plugged);
+EXPORT_SYMBOL(md_unplug);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
 {
@@ -602,7 +554,6 @@ void mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->active, 1);
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
-	atomic_set(&mddev->plug_cnt, 0);
 	spin_lock_init(&mddev->write_lock);
 	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
@@ -2931,6 +2882,7 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len)
 		 * can be sane */
 		return -EBUSY;
 	rdev->data_offset = offset;
+	rdev->new_data_offset = offset;
 	return len;
 }
 
@@ -3926,8 +3878,8 @@ array_state_show(struct mddev *mddev, char *page)
 	return sprintf(page, "%s\n", array_states[st]);
 }
 
-static int do_md_stop(struct mddev * mddev, int ro, int is_open);
-static int md_set_readonly(struct mddev * mddev, int is_open);
+static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev);
+static int md_set_readonly(struct mddev * mddev, struct block_device *bdev);
 static int do_md_run(struct mddev * mddev);
 static int restart_array(struct mddev *mddev);
 
@@ -3943,14 +3895,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 		/* stopping an active array */
 		if (atomic_read(&mddev->openers) > 0)
 			return -EBUSY;
-		err = do_md_stop(mddev, 0, 0);
+		err = do_md_stop(mddev, 0, NULL);
 		break;
 	case inactive:
 		/* stopping an active array */
 		if (mddev->pers) {
 			if (atomic_read(&mddev->openers) > 0)
 				return -EBUSY;
-			err = do_md_stop(mddev, 2, 0);
+			err = do_md_stop(mddev, 2, NULL);
 		} else
 			err = 0; /* already inactive */
 		break;
@@ -3958,7 +3910,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 		break; /* not supported yet */
 	case readonly:
 		if (mddev->pers)
-			err = md_set_readonly(mddev, 0);
+			err = md_set_readonly(mddev, NULL);
 		else {
 			mddev->ro = 1;
 			set_disk_ro(mddev->gendisk, 1);
@@ -3968,7 +3920,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 	case read_auto:
 		if (mddev->pers) {
 			if (mddev->ro == 0)
-				err = md_set_readonly(mddev, 0);
+				err = md_set_readonly(mddev, NULL);
 			else if (mddev->ro == 1)
 				err = restart_array(mddev);
 			if (err == 0) {
@@ -5351,15 +5303,17 @@ void md_stop(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(md_stop);
 
-static int md_set_readonly(struct mddev *mddev, int is_open)
+static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 {
 	int err = 0;
 	mutex_lock(&mddev->open_mutex);
-	if (atomic_read(&mddev->openers) > is_open) {
+	if (atomic_read(&mddev->openers) > !!bdev) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		err = -EBUSY;
 		goto out;
 	}
+	if (bdev)
+		sync_blockdev(bdev);
 	if (mddev->pers) {
 		__md_stop_writes(mddev);
 
@@ -5381,18 +5335,26 @@ out:
  *   0 - completely stop and dis-assemble array
  *   2 - stop but do not disassemble array
  */
-static int do_md_stop(struct mddev * mddev, int mode, int is_open)
+static int do_md_stop(struct mddev * mddev, int mode,
+		      struct block_device *bdev)
 {
 	struct gendisk *disk = mddev->gendisk;
 	struct md_rdev *rdev;
 
 	mutex_lock(&mddev->open_mutex);
-	if (atomic_read(&mddev->openers) > is_open ||
+	if (atomic_read(&mddev->openers) > !!bdev ||
 	    mddev->sysfs_active) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		mutex_unlock(&mddev->open_mutex);
 		return -EBUSY;
 	}
+	if (bdev)
+		/* It is possible IO was issued on some other
+		 * open file which was closed before we took ->open_mutex.
+		 * As that was not the last close __blkdev_put will not
+		 * have called sync_blockdev, so we must.
+		 */
+		sync_blockdev(bdev);
 
 	if (mddev->pers) {
 		if (mddev->ro)
@@ -5466,7 +5428,7 @@ static void autorun_array(struct mddev *mddev)
 	err = do_md_run(mddev);
 	if (err) {
 		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
-		do_md_stop(mddev, 0, 0);
+		do_md_stop(mddev, 0, NULL);
 	}
 }
 
@@ -5784,8 +5746,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
 			super_types[mddev->major_version].
 				validate_super(mddev, rdev);
 		if ((info->state & (1<<MD_DISK_SYNC)) &&
-		    (!test_bit(In_sync, &rdev->flags) ||
-		     rdev->raid_disk != info->raid_disk)) {
+		     rdev->raid_disk != info->raid_disk) {
 			/* This was a hot-add request, but events doesn't
 			 * match, so reject it.
 			 */
@@ -6482,11 +6443,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			goto done_unlock;
 
 		case STOP_ARRAY:
-			err = do_md_stop(mddev, 0, 1);
+			err = do_md_stop(mddev, 0, bdev);
 			goto done_unlock;
 
 		case STOP_ARRAY_RO:
-			err = md_set_readonly(mddev, 1);
+			err = md_set_readonly(mddev, bdev);
 			goto done_unlock;
 
 		case BLKROSET:
@@ -6751,7 +6712,7 @@ struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev
 	thread->tsk = kthread_run(md_thread, thread,
 				  "%s_%s",
 				  mdname(thread->mddev),
-				  name ?: mddev->pers->name);
+				  name);
 	if (IS_ERR(thread->tsk)) {
 		kfree(thread);
 		return NULL;
@@ -7298,6 +7259,7 @@ void md_do_sync(struct mddev *mddev)
 	int skipped = 0;
 	struct md_rdev *rdev;
 	char *desc;
+	struct blk_plug plug;
 
 	/* just incase thread restarts... */
 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -7447,6 +7409,7 @@ void md_do_sync(struct mddev *mddev)
 	}
 	mddev->curr_resync_completed = j;
 
+	blk_start_plug(&plug);
 	while (j < max_sectors) {
 		sector_t sectors;
 
@@ -7552,6 +7515,7 @@ void md_do_sync(struct mddev *mddev)
 	 * this also signals 'finished resyncing' to md_stop
 	 */
  out:
+	blk_finish_plug(&plug);
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
 	/* tell personality that we are finished */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7b4a3c318cae..f385b038589d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -266,9 +266,6 @@ struct mddev {
 	int				new_chunk_sectors;
 	int				reshape_backwards;
 
-	atomic_t			plug_cnt;	/* If device is expecting
-							 * more bios soon.
-							 */
 	struct md_thread		*thread;	/* management thread */
 	struct md_thread		*sync_thread;	/* doing resync or reconstruct */
 	sector_t			curr_resync;	/* last block scheduled */
@@ -630,6 +627,12 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 				   struct mddev *mddev);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   struct mddev *mddev);
-extern int mddev_check_plugged(struct mddev *mddev);
 extern void md_trim_bio(struct bio *bio, int offset, int size);
+
+extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
+static inline int mddev_check_plugged(struct mddev *mddev)
+{
+	return !!blk_check_plugged(md_unplug, mddev,
+				   sizeof(struct blk_plug_cb));
+}
 #endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 9339e67fcc79..61a1833ebaf3 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -474,7 +474,8 @@ static int multipath_run (struct mddev *mddev)
 	}
 
 	{
-		mddev->thread = md_register_thread(multipathd, mddev, NULL);
+		mddev->thread = md_register_thread(multipathd, mddev,
+						   "multipath");
 		if (!mddev->thread) {
 			printk(KERN_ERR "multipath: couldn't allocate thread"
 				" for %s\n", mdname(mddev));
diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c
index 50ed53bf4aa2..fc90c11620ad 100644
--- a/drivers/md/persistent-data/dm-space-map-checker.c
+++ b/drivers/md/persistent-data/dm-space-map-checker.c
@@ -8,6 +8,7 @@
 
 #include <linux/device-mapper.h>
 #include <linux/export.h>
+#include <linux/vmalloc.h>
 
 #ifdef CONFIG_DM_DEBUG_SPACE_MAPS
 
@@ -89,13 +90,23 @@ static int ca_create(struct count_array *ca, struct dm_space_map *sm)
 
 	ca->nr = nr_blocks;
 	ca->nr_free = nr_blocks;
-	ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL);
-	if (!ca->counts)
-		return -ENOMEM;
+
+	if (!nr_blocks)
+		ca->counts = NULL;
+	else {
+		ca->counts = vzalloc(sizeof(*ca->counts) * nr_blocks);
+		if (!ca->counts)
+			return -ENOMEM;
+	}
 
 	return 0;
 }
 
+static void ca_destroy(struct count_array *ca)
+{
+	vfree(ca->counts);
+}
+
 static int ca_load(struct count_array *ca, struct dm_space_map *sm)
 {
 	int r;
@@ -126,12 +137,14 @@ static int ca_load(struct count_array *ca, struct dm_space_map *sm)
 static int ca_extend(struct count_array *ca, dm_block_t extra_blocks)
 {
 	dm_block_t nr_blocks = ca->nr + extra_blocks;
-	uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL);
+	uint32_t *counts = vzalloc(sizeof(*counts) * nr_blocks);
 	if (!counts)
 		return -ENOMEM;
 
-	memcpy(counts, ca->counts, sizeof(*counts) * ca->nr);
-	kfree(ca->counts);
+	if (ca->counts) {
+		memcpy(counts, ca->counts, sizeof(*counts) * ca->nr);
+		ca_destroy(ca);
+	}
 	ca->nr = nr_blocks;
 	ca->nr_free += extra_blocks;
 	ca->counts = counts;
@@ -151,11 +164,6 @@ static int ca_commit(struct count_array *old, struct count_array *new)
 	return 0;
 }
 
-static void ca_destroy(struct count_array *ca)
-{
-	kfree(ca->counts);
-}
-
 /*----------------------------------------------------------------*/
 
 struct sm_checker {
@@ -343,25 +351,25 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
 	int r;
 	struct sm_checker *smc;
 
-	if (!sm)
-		return NULL;
+	if (IS_ERR_OR_NULL(sm))
+		return ERR_PTR(-EINVAL);
 
 	smc = kmalloc(sizeof(*smc), GFP_KERNEL);
 	if (!smc)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	memcpy(&smc->sm, &ops_, sizeof(smc->sm));
 	r = ca_create(&smc->old_counts, sm);
 	if (r) {
 		kfree(smc);
-		return NULL;
+		return ERR_PTR(r);
 	}
 
 	r = ca_create(&smc->counts, sm);
 	if (r) {
 		ca_destroy(&smc->old_counts);
 		kfree(smc);
-		return NULL;
+		return ERR_PTR(r);
 	}
 
 	smc->real_sm = sm;
@@ -371,7 +379,7 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
 		ca_destroy(&smc->counts);
 		ca_destroy(&smc->old_counts);
 		kfree(smc);
-		return NULL;
+		return ERR_PTR(r);
 	}
 
 	r = ca_commit(&smc->old_counts, &smc->counts);
@@ -379,7 +387,7 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
 		ca_destroy(&smc->counts);
 		ca_destroy(&smc->old_counts);
 		kfree(smc);
-		return NULL;
+		return ERR_PTR(r);
 	}
 
 	return &smc->sm;
@@ -391,25 +399,25 @@ struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
 	int r;
 	struct sm_checker *smc;
 
-	if (!sm)
-		return NULL;
+	if (IS_ERR_OR_NULL(sm))
+		return ERR_PTR(-EINVAL);
 
 	smc = kmalloc(sizeof(*smc), GFP_KERNEL);
 	if (!smc)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	memcpy(&smc->sm, &ops_, sizeof(smc->sm));
 	r = ca_create(&smc->old_counts, sm);
 	if (r) {
 		kfree(smc);
-		return NULL;
+		return ERR_PTR(r);
 	}
 
 	r = ca_create(&smc->counts, sm);
 	if (r) {
 		ca_destroy(&smc->old_counts);
 		kfree(smc);
-		return NULL;
+		return ERR_PTR(r);
 	}
 
 	smc->real_sm = sm;
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index fc469ba9f627..3d0ed5332883 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -290,7 +290,16 @@ struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
 				       dm_block_t nr_blocks)
 {
 	struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks);
-	return dm_sm_checker_create_fresh(sm);
+	struct dm_space_map *smc;
+
+	if (IS_ERR_OR_NULL(sm))
+		return sm;
+
+	smc = dm_sm_checker_create_fresh(sm);
+	if (IS_ERR(smc))
+		dm_sm_destroy(sm);
+
+	return smc;
 }
 EXPORT_SYMBOL_GPL(dm_sm_disk_create);
 
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 6f8d38747d7f..e5604b32d91f 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -138,6 +138,9 @@ EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone);
 
 void dm_tm_destroy(struct dm_transaction_manager *tm)
 {
+	if (!tm->is_clone)
+		wipe_shadow_table(tm);
+
 	kfree(tm);
 }
 EXPORT_SYMBOL_GPL(dm_tm_destroy);
@@ -249,6 +252,7 @@ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
 
 	return r;
 }
+EXPORT_SYMBOL_GPL(dm_tm_shadow_block);
 
 int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
 		    struct dm_block_validator *v,
@@ -259,6 +263,7 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
 
 	return dm_bm_read_lock(tm->bm, b, v, blk);
 }
+EXPORT_SYMBOL_GPL(dm_tm_read_lock);
 
 int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
 {
@@ -342,8 +347,10 @@ static int dm_tm_create_internal(struct dm_block_manager *bm,
 		}
 
 		*sm = dm_sm_checker_create(inner);
-		if (!*sm)
+		if (IS_ERR(*sm)) {
+			r = PTR_ERR(*sm);
 			goto bad2;
+		}
 
 	} else {
 		r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location,
@@ -362,8 +369,10 @@ static int dm_tm_create_internal(struct dm_block_manager *bm,
 		}
 
 		*sm = dm_sm_checker_create(inner);
-		if (!*sm)
+		if (IS_ERR(*sm)) {
+			r = PTR_ERR(*sm);
 			goto bad2;
+		}
 	}
 
 	return 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 835de7168cd3..36a8fc059ac3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -517,8 +517,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		int bad_sectors;
 
 		int disk = start_disk + i;
-		if (disk >= conf->raid_disks)
-			disk -= conf->raid_disks;
+		if (disk >= conf->raid_disks * 2)
+			disk -= conf->raid_disks * 2;
 
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
@@ -883,7 +883,6 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	struct md_rdev *blocked_rdev;
-	int plugged;
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
@@ -1034,7 +1033,6 @@ read_again:
 	 * the bad blocks.  Each set of writes gets it's own r1bio
 	 * with a set of bios attached.
 	 */
-	plugged = mddev_check_plugged(mddev);
 
 	disks = conf->raid_disks * 2;
  retry_write:
@@ -1191,6 +1189,8 @@ read_again:
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		if (!mddev_check_plugged(mddev))
+			md_wakeup_thread(mddev->thread);
 	}
 	/* Mustn't call r1_bio_write_done before this next test,
 	 * as it could result in the bio being freed.
@@ -1213,9 +1213,6 @@ read_again:
 
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
-
-	if (do_sync || !bitmap || !plugged)
-		md_wakeup_thread(mddev->thread);
 }
 
 static void status(struct seq_file *seq, struct mddev *mddev)
@@ -1821,8 +1818,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
 		/* if we're here, all write(s) have completed, so clean up */
-		md_done_sync(mddev, r1_bio->sectors, 1);
-		put_buf(r1_bio);
+		int s = r1_bio->sectors;
+		if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+		    test_bit(R1BIO_WriteError, &r1_bio->state))
+			reschedule_retry(r1_bio);
+		else {
+			put_buf(r1_bio);
+			md_done_sync(mddev, s, 1);
+		}
 	}
 }
 
@@ -2170,8 +2173,7 @@ static void raid1d(struct mddev *mddev)
 	blk_start_plug(&plug);
 	for (;;) {
 
-		if (atomic_read(&mddev->plug_cnt) == 0)
-			flush_pending_writes(conf);
+		flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -2488,9 +2490,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 	 */
 	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		atomic_set(&r1_bio->remaining, read_targets);
-		for (i = 0; i < conf->raid_disks * 2; i++) {
+		for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io == end_sync_read) {
+				read_targets--;
 				md_sync_acct(bio->bi_bdev, nr_sectors);
 				generic_make_request(bio);
 			}
@@ -2550,6 +2553,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	err = -EINVAL;
 	spin_lock_init(&conf->device_lock);
 	rdev_for_each(rdev, mddev) {
+		struct request_queue *q;
 		int disk_idx = rdev->raid_disk;
 		if (disk_idx >= mddev->raid_disks
 		    || disk_idx < 0)
@@ -2562,6 +2566,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		if (disk->rdev)
 			goto abort;
 		disk->rdev = rdev;
+		q = bdev_get_queue(rdev->bdev);
+		if (q->merge_bvec_fn)
+			mddev->merge_check_needed = 1;
 
 		disk->head_position = 0;
 	}
@@ -2617,7 +2624,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		goto abort;
 	}
 	err = -ENOMEM;
-	conf->thread = md_register_thread(raid1d, mddev, NULL);
+	conf->thread = md_register_thread(raid1d, mddev, "raid1");
 	if (!conf->thread) {
 		printk(KERN_ERR
 		       "md/raid1:%s: couldn't allocate thread\n",
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 987db37cb875..5d33603a497d 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1039,7 +1039,6 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
-	int plugged;
 	int sectors_handled;
 	int max_sectors;
 	int sectors;
@@ -1239,7 +1238,6 @@ read_again:
 	 * of r10_bios is recored in bio->bi_phys_segments just as with
 	 * the read case.
 	 */
-	plugged = mddev_check_plugged(mddev);
 
 	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
 	raid10_find_phys(conf, r10_bio);
@@ -1396,6 +1394,8 @@ retry_write:
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		if (!mddev_check_plugged(mddev))
+			md_wakeup_thread(mddev->thread);
 
 		if (!r10_bio->devs[i].repl_bio)
 			continue;
@@ -1423,6 +1423,8 @@ retry_write:
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		if (!mddev_check_plugged(mddev))
+			md_wakeup_thread(mddev->thread);
 	}
 
 	/* Don't remove the bias on 'remaining' (one_write_done) until
@@ -1448,9 +1450,6 @@ retry_write:
 
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
-
-	if (do_sync || !mddev->bitmap || !plugged)
-		md_wakeup_thread(mddev->thread);
 }
 
 static void status(struct seq_file *seq, struct mddev *mddev)
@@ -2310,7 +2309,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			if (r10_sync_page_io(rdev,
 					     r10_bio->devs[sl].addr +
 					     sect,
-					     s<<9, conf->tmppage, WRITE)
+					     s, conf->tmppage, WRITE)
 			    == 0) {
 				/* Well, this device is dead */
 				printk(KERN_NOTICE
@@ -2349,7 +2348,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			switch (r10_sync_page_io(rdev,
 					     r10_bio->devs[sl].addr +
 					     sect,
-					     s<<9, conf->tmppage,
+					     s, conf->tmppage,
 						 READ)) {
 			case 0:
 				/* Well, this device is dead */
@@ -2512,7 +2511,7 @@ read_more:
 	slot = r10_bio->read_slot;
 	printk_ratelimited(
 		KERN_ERR
-		"md/raid10:%s: %s: redirecting"
+		"md/raid10:%s: %s: redirecting "
 		"sector %llu to another mirror\n",
 		mdname(mddev),
 		bdevname(rdev->bdev, b),
@@ -2890,6 +2889,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			/* want to reconstruct this device */
 			rb2 = r10_bio;
 			sect = raid10_find_virt(conf, sector_nr, i);
+			if (sect >= mddev->resync_max_sectors) {
+				/* last stripe is not complete - don't
+				 * try to recover this sector.
+				 */
+				continue;
+			}
 			/* Unless we are doing a full sync, or a replacement
 			 * we only need to recover the block if it is set in
 			 * the bitmap
@@ -3421,7 +3426,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	spin_lock_init(&conf->resync_lock);
 	init_waitqueue_head(&conf->wait_barrier);
 
-	conf->thread = md_register_thread(raid10d, mddev, NULL);
+	conf->thread = md_register_thread(raid10d, mddev, "raid10");
 	if (!conf->thread)
 		goto out;
 
@@ -3475,6 +3480,7 @@ static int run(struct mddev *mddev)
 
 	rdev_for_each(rdev, mddev) {
 		long long diff;
+		struct request_queue *q;
 
 		disk_idx = rdev->raid_disk;
 		if (disk_idx < 0)
@@ -3493,6 +3499,9 @@ static int run(struct mddev *mddev)
 				goto out_free_conf;
 			disk->rdev = rdev;
 		}
+		q = bdev_get_queue(rdev->bdev);
+		if (q->merge_bvec_fn)
+			mddev->merge_check_needed = 1;
 		diff = (rdev->new_data_offset - rdev->data_offset);
 		if (!mddev->reshape_backwards)
 			diff = -diff;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d26767246d26..bde9da2baa39 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -196,12 +196,14 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
 		BUG_ON(!list_empty(&sh->lru));
 		BUG_ON(atomic_read(&conf->active_stripes)==0);
 		if (test_bit(STRIPE_HANDLE, &sh->state)) {
-			if (test_bit(STRIPE_DELAYED, &sh->state))
+			if (test_bit(STRIPE_DELAYED, &sh->state) &&
+			    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				list_add_tail(&sh->lru, &conf->delayed_list);
 			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 				   sh->bm_seq - conf->seq_write > 0)
 				list_add_tail(&sh->lru, &conf->bitmap_list);
 			else {
+				clear_bit(STRIPE_DELAYED, &sh->state);
 				clear_bit(STRIPE_BIT_DELAY, &sh->state);
 				list_add_tail(&sh->lru, &conf->handle_list);
 			}
@@ -606,6 +608,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 					 * a chance*/
 					md_check_recovery(conf->mddev);
 				}
+				/*
+				 * Because md_wait_for_blocked_rdev
+				 * will dec nr_pending, we must
+				 * increment it first.
+				 */
+				atomic_inc(&rdev->nr_pending);
 				md_wait_for_blocked_rdev(rdev, conf->mddev);
 			} else {
 				/* Acknowledged bad block - skip the write */
@@ -1737,6 +1745,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 	} else {
 		const char *bdn = bdevname(rdev->bdev, b);
 		int retry = 0;
+		int set_bad = 0;
 
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 		atomic_inc(&rdev->read_errors);
@@ -1748,7 +1757,8 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				mdname(conf->mddev),
 				(unsigned long long)s,
 				bdn);
-		else if (conf->mddev->degraded >= conf->max_degraded)
+		else if (conf->mddev->degraded >= conf->max_degraded) {
+			set_bad = 1;
 			printk_ratelimited(
 				KERN_WARNING
 				"md/raid:%s: read error not correctable "
@@ -1756,8 +1766,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				mdname(conf->mddev),
 				(unsigned long long)s,
 				bdn);
-		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
+		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
 			/* Oh, no!!! */
+			set_bad = 1;
 			printk_ratelimited(
 				KERN_WARNING
 				"md/raid:%s: read error NOT corrected!! "
@@ -1765,7 +1776,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				mdname(conf->mddev),
 				(unsigned long long)s,
 				bdn);
-		else if (atomic_read(&rdev->read_errors)
+		} else if (atomic_read(&rdev->read_errors)
 			 > conf->max_nr_stripes)
 			printk(KERN_WARNING
 			       "md/raid:%s: Too many read errors, failing device %s.\n",
@@ -1777,7 +1788,11 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		else {
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
-			md_error(conf->mddev, rdev);
+			if (!(set_bad
+			      && test_bit(In_sync, &rdev->flags)
+			      && rdev_set_badblocks(
+				      rdev, sh->sector, STRIPE_SECTORS, 0)))
+				md_error(conf->mddev, rdev);
 		}
 	}
 	rdev_dec_pending(rdev, conf->mddev);
@@ -3582,8 +3597,18 @@ static void handle_stripe(struct stripe_head *sh)
 
 finish:
 	/* wait for this device to become unblocked */
-	if (conf->mddev->external && unlikely(s.blocked_rdev))
-		md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
+	if (unlikely(s.blocked_rdev)) {
+		if (conf->mddev->external)
+			md_wait_for_blocked_rdev(s.blocked_rdev,
+						 conf->mddev);
+		else
+			/* Internal metadata will immediately
+			 * be written by raid5d, so we don't
+			 * need to wait here.
+			 */
+			rdev_dec_pending(s.blocked_rdev,
+					 conf->mddev);
+	}
 
 	if (s.handle_bad_blocks)
 		for (i = disks; i--; ) {
@@ -3881,8 +3906,6 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 		raid_bio->bi_next = (void*)rdev;
 		align_bi->bi_bdev =  rdev->bdev;
 		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
-		/* No reshape active, so we can trust rdev->data_offset */
-		align_bi->bi_sector += rdev->data_offset;
 
 		if (!bio_fits_rdev(align_bi) ||
 		    is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
@@ -3893,6 +3916,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 			return 0;
 		}
 
+		/* No reshape active, so we can trust rdev->data_offset */
+		align_bi->bi_sector += rdev->data_offset;
+
 		spin_lock_irq(&conf->device_lock);
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    conf->quiesce == 0,
@@ -3971,7 +3997,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
 	int remaining;
-	int plugged;
 
 	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bi);
@@ -3990,7 +4015,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
 
-	plugged = mddev_check_plugged(mddev);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 		int previous;
@@ -4092,6 +4116,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			if ((bi->bi_rw & REQ_SYNC) &&
 			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
+			mddev_check_plugged(mddev);
 			release_stripe(sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
@@ -4099,10 +4124,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			finish_wait(&conf->wait_for_overlap, &w);
 			break;
 		}
-			
 	}
-	if (!plugged)
-		md_wakeup_thread(mddev->thread);
 
 	spin_lock_irq(&conf->device_lock);
 	remaining = raid5_dec_bi_phys_segments(bi);
@@ -4521,7 +4543,7 @@ static void raid5d(struct mddev *mddev)
 	while (1) {
 		struct bio *bio;
 
-		if (atomic_read(&mddev->plug_cnt) == 0 &&
+		if (
 		    !list_empty(&conf->bitmap_list)) {
 			/* Now is a good time to flush some bitmap updates */
 			conf->seq_flush++;
@@ -4531,8 +4553,7 @@ static void raid5d(struct mddev *mddev)
 			conf->seq_write = conf->seq_flush;
 			activate_bit_delay(conf);
 		}
-		if (atomic_read(&mddev->plug_cnt) == 0)
-			raid5_activate_delayed(conf);
+		raid5_activate_delayed(conf);
 
 		while ((bio = remove_bio_from_retry(conf))) {
 			int ok;
@@ -4823,6 +4844,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	int raid_disk, memory, max_disks;
 	struct md_rdev *rdev;
 	struct disk_info *disk;
+	char pers_name[6];
 
 	if (mddev->new_level != 5
 	    && mddev->new_level != 4
@@ -4946,7 +4968,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
 		       mdname(mddev), memory);
 
-	conf->thread = md_register_thread(raid5d, mddev, NULL);
+	sprintf(pers_name, "raid%d", mddev->new_level);
+	conf->thread = md_register_thread(raid5d, mddev, pers_name);
 	if (!conf->thread) {
 		printk(KERN_ERR
 		       "md/raid:%s: couldn't allocate thread.\n",
@@ -5465,10 +5488,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev->saved_raid_disk >= 0 &&
 	    rdev->saved_raid_disk >= first &&
 	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
-		disk = rdev->saved_raid_disk;
-	else
-		disk = first;
-	for ( ; disk <= last ; disk++) {
+		first = rdev->saved_raid_disk;
+
+	for (disk = first; disk <= last; disk++) {
 		p = conf->disks + disk;
 		if (p->rdev == NULL) {
 			clear_bit(In_sync, &rdev->flags);
@@ -5477,8 +5499,11 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			if (rdev->saved_raid_disk != disk)
 				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
-			break;
+			goto out;
 		}
+	}
+	for (disk = first; disk <= last; disk++) {
+		p = conf->disks + disk;
 		if (test_bit(WantReplacement, &p->rdev->flags) &&
 		    p->replacement == NULL) {
 			clear_bit(In_sync, &rdev->flags);
@@ -5490,6 +5515,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			break;
 		}
 	}
+out:
 	print_raid5_conf(conf);
 	return err;
 }