diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm-mpath.c | 47 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-region-hash.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 136 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.h | 13 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 216 | ||||
-rw-r--r-- | drivers/md/md.c | 104 | ||||
-rw-r--r-- | drivers/md/md.h | 11 | ||||
-rw-r--r-- | drivers/md/multipath.c | 3 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-checker.c | 54 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-disk.c | 11 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-transaction-manager.c | 13 | ||||
-rw-r--r-- | drivers/md/raid1.c | 33 | ||||
-rw-r--r-- | drivers/md/raid10.c | 27 | ||||
-rw-r--r-- | drivers/md/raid5.c | 72 |
15 files changed, 509 insertions, 239 deletions
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 754f38f8a692..638dae048b4f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/workqueue.h> +#include <linux/delay.h> #include <scsi/scsi_dh.h> #include <linux/atomic.h> @@ -61,11 +62,11 @@ struct multipath { struct list_head list; struct dm_target *ti; - spinlock_t lock; - const char *hw_handler_name; char *hw_handler_params; + spinlock_t lock; + unsigned nr_priority_groups; struct list_head priority_groups; @@ -81,16 +82,17 @@ struct multipath { struct priority_group *next_pg; /* Switch to this PG if set */ unsigned repeat_count; /* I/Os left before calling PS again */ - unsigned queue_io; /* Must we queue all I/O? */ - unsigned queue_if_no_path; /* Queue I/O if last path fails? */ - unsigned saved_queue_if_no_path;/* Saved state during suspension */ + unsigned queue_io:1; /* Must we queue all I/O? */ + unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ + unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ + unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ + unsigned queue_size; struct work_struct process_queued_ios; struct list_head queued_ios; - unsigned queue_size; struct work_struct trigger_event; @@ -328,14 +330,18 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) /* * Loop through priority groups until we find a valid path. * First time we skip PGs marked 'bypassed'. - * Second time we only try the ones we skipped. + * Second time we only try the ones we skipped, but set + * pg_init_delay_retry so we do not hammer controllers. */ do { list_for_each_entry(pg, &m->priority_groups, list) { if (pg->bypassed == bypassed) continue; - if (!__choose_path_in_pg(m, pg, nr_bytes)) + if (!__choose_path_in_pg(m, pg, nr_bytes)) { + if (!bypassed) + m->pg_init_delay_retry = 1; return; + } } } while (bypassed--); @@ -481,9 +487,6 @@ static void process_queued_ios(struct work_struct *work) spin_lock_irqsave(&m->lock, flags); - if (!m->queue_size) - goto out; - if (!m->current_pgpath) __choose_pgpath(m, 0); @@ -496,7 +499,6 @@ static void process_queued_ios(struct work_struct *work) if (m->pg_init_required && !m->pg_init_in_progress && pgpath) __pg_init_all_paths(m); -out: spin_unlock_irqrestore(&m->lock, flags); if (!must_queue) dispatch_queued_ios(m); @@ -1517,11 +1519,16 @@ out: static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) { - struct multipath *m = (struct multipath *) ti->private; - struct block_device *bdev = NULL; - fmode_t mode = 0; + struct multipath *m = ti->private; + struct block_device *bdev; + fmode_t mode; unsigned long flags; - int r = 0; + int r; + +again: + bdev = NULL; + mode = 0; + r = 0; spin_lock_irqsave(&m->lock, flags); @@ -1546,6 +1553,12 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) r = scsi_verify_blk_ioctl(NULL, cmd); + if (r == -EAGAIN && !fatal_signal_pending(current)) { + queue_work(kmultipathd, &m->process_queued_ios); + msleep(10); + goto again; + } + return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); } @@ -1643,7 +1656,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index d039de8322f0..b58b7a33914a 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1084,6 +1084,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->split_io = dm_rh_get_region_size(ms->rh); ti->num_flush_requests = 1; ti->num_discard_requests = 1; + ti->discard_zeroes_data_unsupported = 1; ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); @@ -1214,7 +1215,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, * We need to dec pending if this was a write. */ if (rw == WRITE) { - if (!(bio->bi_rw & REQ_FLUSH)) + if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) dm_rh_dec(ms->rh, map_context->ll); return error; } diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 7771ed212182..69732e03eb34 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -404,6 +404,9 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) return; } + if (bio->bi_rw & REQ_DISCARD) + return; + /* We must inform the log that the sync count has changed. */ log->type->set_region_sync(log, region, 0); @@ -524,7 +527,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) { - if (bio->bi_rw & REQ_FLUSH) + if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)) continue; rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 737d38865b69..3e2907f0bc46 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1082,12 +1082,89 @@ int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, return 0; } -static int __get_held_metadata_root(struct dm_pool_metadata *pmd, - dm_block_t *result) +static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r, inc; + struct thin_disk_superblock *disk_super; + struct dm_block *copy, *sblock; + dm_block_t held_root; + + /* + * Copy the superblock. + */ + dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION); + r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, ©, &inc); + if (r) + return r; + + BUG_ON(!inc); + + held_root = dm_block_location(copy); + disk_super = dm_block_data(copy); + + if (le64_to_cpu(disk_super->held_root)) { + DMWARN("Pool metadata snapshot already exists: release this before taking another."); + + dm_tm_dec(pmd->tm, held_root); + dm_tm_unlock(pmd->tm, copy); + pmd->need_commit = 1; + + return -EBUSY; + } + + /* + * Wipe the spacemap since we're not publishing this. + */ + memset(&disk_super->data_space_map_root, 0, + sizeof(disk_super->data_space_map_root)); + memset(&disk_super->metadata_space_map_root, 0, + sizeof(disk_super->metadata_space_map_root)); + + /* + * Increment the data structures that need to be preserved. + */ + dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root)); + dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root)); + dm_tm_unlock(pmd->tm, copy); + + /* + * Write the held root into the superblock. + */ + r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) { + dm_tm_dec(pmd->tm, held_root); + pmd->need_commit = 1; + return r; + } + + disk_super = dm_block_data(sblock); + disk_super->held_root = cpu_to_le64(held_root); + dm_bm_unlock(sblock); + + pmd->need_commit = 1; + + return 0; +} + +int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r; + + down_write(&pmd->root_lock); + r = __reserve_metadata_snap(pmd); + up_write(&pmd->root_lock); + + return r; +} + +static int __release_metadata_snap(struct dm_pool_metadata *pmd) { int r; struct thin_disk_superblock *disk_super; - struct dm_block *sblock; + struct dm_block *sblock, *copy; + dm_block_t held_root; r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, &sb_validator, &sblock); @@ -1095,18 +1172,65 @@ static int __get_held_metadata_root(struct dm_pool_metadata *pmd, return r; disk_super = dm_block_data(sblock); + held_root = le64_to_cpu(disk_super->held_root); + disk_super->held_root = cpu_to_le64(0); + pmd->need_commit = 1; + + dm_bm_unlock(sblock); + + if (!held_root) { + DMWARN("No pool metadata snapshot found: nothing to release."); + return -EINVAL; + } + + r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, ©); + if (r) + return r; + + disk_super = dm_block_data(copy); + dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root)); + dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root)); + dm_sm_dec_block(pmd->metadata_sm, held_root); + + return dm_tm_unlock(pmd->tm, copy); +} + +int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r; + + down_write(&pmd->root_lock); + r = __release_metadata_snap(pmd); + up_write(&pmd->root_lock); + + return r; +} + +static int __get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock; + + r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); *result = le64_to_cpu(disk_super->held_root); return dm_bm_unlock(sblock); } -int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, - dm_block_t *result) +int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result) { int r; down_read(&pmd->root_lock); - r = __get_held_metadata_root(pmd, result); + r = __get_metadata_snap(pmd, result); up_read(&pmd->root_lock); return r; diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index ed4725e67c96..b88918ccdaf6 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -90,11 +90,18 @@ int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, /* * Hold/get root for userspace transaction. + * + * The metadata snapshot is a copy of the current superblock (minus the + * space maps). Userland can access the data structures for READ + * operations only. A small performance hit is incurred by providing this + * copy of the metadata to userland due to extra copy-on-write operations + * on the metadata nodes. Release this as soon as you finish with it. */ -int dm_pool_hold_metadata_root(struct dm_pool_metadata *pmd); +int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd); +int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd); -int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, - dm_block_t *result); +int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result); /* * Actions on a single virtual device. diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index eb3d138ff55a..68694da0d21d 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -111,7 +111,7 @@ struct cell_key { dm_block_t block; }; -struct cell { +struct dm_bio_prison_cell { struct hlist_node list; struct bio_prison *prison; struct cell_key key; @@ -141,6 +141,8 @@ static uint32_t calc_nr_buckets(unsigned nr_cells) return n; } +static struct kmem_cache *_cell_cache; + /* * @nr_cells should be the number of cells you want in use _concurrently_. * Don't confuse it with the number of distinct keys. @@ -157,8 +159,7 @@ static struct bio_prison *prison_create(unsigned nr_cells) return NULL; spin_lock_init(&prison->lock); - prison->cell_pool = mempool_create_kmalloc_pool(nr_cells, - sizeof(struct cell)); + prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); if (!prison->cell_pool) { kfree(prison); return NULL; @@ -194,10 +195,10 @@ static int keys_equal(struct cell_key *lhs, struct cell_key *rhs) (lhs->block == rhs->block); } -static struct cell *__search_bucket(struct hlist_head *bucket, - struct cell_key *key) +static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket, + struct cell_key *key) { - struct cell *cell; + struct dm_bio_prison_cell *cell; struct hlist_node *tmp; hlist_for_each_entry(cell, tmp, bucket, list) @@ -214,12 +215,12 @@ static struct cell *__search_bucket(struct hlist_head *bucket, * Returns 1 if the cell was already held, 0 if @inmate is the new holder. */ static int bio_detain(struct bio_prison *prison, struct cell_key *key, - struct bio *inmate, struct cell **ref) + struct bio *inmate, struct dm_bio_prison_cell **ref) { int r = 1; unsigned long flags; uint32_t hash = hash_key(prison, key); - struct cell *cell, *cell2; + struct dm_bio_prison_cell *cell, *cell2; BUG_ON(hash > prison->nr_buckets); @@ -273,7 +274,7 @@ out: /* * @inmates must have been initialised prior to this call */ -static void __cell_release(struct cell *cell, struct bio_list *inmates) +static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates) { struct bio_prison *prison = cell->prison; @@ -287,7 +288,7 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates) mempool_free(cell, prison->cell_pool); } -static void cell_release(struct cell *cell, struct bio_list *bios) +static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios) { unsigned long flags; struct bio_prison *prison = cell->prison; @@ -303,7 +304,7 @@ static void cell_release(struct cell *cell, struct bio_list *bios) * bio may be in the cell. This function releases the cell, and also does * a sanity check. */ -static void __cell_release_singleton(struct cell *cell, struct bio *bio) +static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) { BUG_ON(cell->holder != bio); BUG_ON(!bio_list_empty(&cell->bios)); @@ -311,7 +312,7 @@ static void __cell_release_singleton(struct cell *cell, struct bio *bio) __cell_release(cell, NULL); } -static void cell_release_singleton(struct cell *cell, struct bio *bio) +static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) { unsigned long flags; struct bio_prison *prison = cell->prison; @@ -324,7 +325,8 @@ static void cell_release_singleton(struct cell *cell, struct bio *bio) /* * Sometimes we don't want the holder, just the additional bios. */ -static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates) +static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, + struct bio_list *inmates) { struct bio_prison *prison = cell->prison; @@ -334,7 +336,8 @@ static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates mempool_free(cell, prison->cell_pool); } -static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) +static void cell_release_no_holder(struct dm_bio_prison_cell *cell, + struct bio_list *inmates) { unsigned long flags; struct bio_prison *prison = cell->prison; @@ -344,7 +347,7 @@ static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) spin_unlock_irqrestore(&prison->lock, flags); } -static void cell_error(struct cell *cell) +static void cell_error(struct dm_bio_prison_cell *cell) { struct bio_prison *prison = cell->prison; struct bio_list bios; @@ -491,7 +494,7 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, * also provides the interface for creating and destroying internal * devices. */ -struct new_mapping; +struct dm_thin_new_mapping; struct pool_features { unsigned zero_new_blocks:1; @@ -537,7 +540,7 @@ struct pool { struct deferred_set shared_read_ds; struct deferred_set all_io_ds; - struct new_mapping *next_mapping; + struct dm_thin_new_mapping *next_mapping; mempool_t *mapping_pool; mempool_t *endio_hook_pool; }; @@ -630,11 +633,11 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev /*----------------------------------------------------------------*/ -struct endio_hook { +struct dm_thin_endio_hook { struct thin_c *tc; struct deferred_entry *shared_read_entry; struct deferred_entry *all_io_entry; - struct new_mapping *overwrite_mapping; + struct dm_thin_new_mapping *overwrite_mapping; }; static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) @@ -647,7 +650,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) bio_list_init(master); while ((bio = bio_list_pop(&bios))) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; + if (h->tc == tc) bio_endio(bio, DM_ENDIO_REQUEUE); else @@ -736,7 +740,7 @@ static void wake_worker(struct pool *pool) /* * Bio endio functions. */ -struct new_mapping { +struct dm_thin_new_mapping { struct list_head list; unsigned quiesced:1; @@ -746,7 +750,7 @@ struct new_mapping { struct thin_c *tc; dm_block_t virt_block; dm_block_t data_block; - struct cell *cell, *cell2; + struct dm_bio_prison_cell *cell, *cell2; int err; /* @@ -759,7 +763,7 @@ struct new_mapping { bio_end_io_t *saved_bi_end_io; }; -static void __maybe_add_mapping(struct new_mapping *m) +static void __maybe_add_mapping(struct dm_thin_new_mapping *m) { struct pool *pool = m->tc->pool; @@ -772,7 +776,7 @@ static void __maybe_add_mapping(struct new_mapping *m) static void copy_complete(int read_err, unsigned long write_err, void *context) { unsigned long flags; - struct new_mapping *m = context; + struct dm_thin_new_mapping *m = context; struct pool *pool = m->tc->pool; m->err = read_err || write_err ? -EIO : 0; @@ -786,8 +790,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) static void overwrite_endio(struct bio *bio, int err) { unsigned long flags; - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - struct new_mapping *m = h->overwrite_mapping; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_new_mapping *m = h->overwrite_mapping; struct pool *pool = m->tc->pool; m->err = err; @@ -811,7 +815,7 @@ static void overwrite_endio(struct bio *bio, int err) /* * This sends the bios in the cell back to the deferred_bios list. */ -static void cell_defer(struct thin_c *tc, struct cell *cell, +static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell, dm_block_t data_block) { struct pool *pool = tc->pool; @@ -828,7 +832,7 @@ static void cell_defer(struct thin_c *tc, struct cell *cell, * Same as cell_defer above, except it omits one particular detainee, * a write bio that covers the block and has already been processed. */ -static void cell_defer_except(struct thin_c *tc, struct cell *cell) +static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell) { struct bio_list bios; struct pool *pool = tc->pool; @@ -843,7 +847,7 @@ static void cell_defer_except(struct thin_c *tc, struct cell *cell) wake_worker(pool); } -static void process_prepared_mapping(struct new_mapping *m) +static void process_prepared_mapping(struct dm_thin_new_mapping *m) { struct thin_c *tc = m->tc; struct bio *bio; @@ -886,7 +890,7 @@ static void process_prepared_mapping(struct new_mapping *m) mempool_free(m, tc->pool->mapping_pool); } -static void process_prepared_discard(struct new_mapping *m) +static void process_prepared_discard(struct dm_thin_new_mapping *m) { int r; struct thin_c *tc = m->tc; @@ -909,11 +913,11 @@ static void process_prepared_discard(struct new_mapping *m) } static void process_prepared(struct pool *pool, struct list_head *head, - void (*fn)(struct new_mapping *)) + void (*fn)(struct dm_thin_new_mapping *)) { unsigned long flags; struct list_head maps; - struct new_mapping *m, *tmp; + struct dm_thin_new_mapping *m, *tmp; INIT_LIST_HEAD(&maps); spin_lock_irqsave(&pool->lock, flags); @@ -957,9 +961,9 @@ static int ensure_next_mapping(struct pool *pool) return pool->next_mapping ? 0 : -ENOMEM; } -static struct new_mapping *get_next_mapping(struct pool *pool) +static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) { - struct new_mapping *r = pool->next_mapping; + struct dm_thin_new_mapping *r = pool->next_mapping; BUG_ON(!pool->next_mapping); @@ -971,11 +975,11 @@ static struct new_mapping *get_next_mapping(struct pool *pool) static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct dm_dev *origin, dm_block_t data_origin, dm_block_t data_dest, - struct cell *cell, struct bio *bio) + struct dm_bio_prison_cell *cell, struct bio *bio) { int r; struct pool *pool = tc->pool; - struct new_mapping *m = get_next_mapping(pool); + struct dm_thin_new_mapping *m = get_next_mapping(pool); INIT_LIST_HEAD(&m->list); m->quiesced = 0; @@ -997,7 +1001,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, * bio immediately. Otherwise we use kcopyd to clone the data first. */ if (io_overwrites_block(pool, bio)) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; + h->overwrite_mapping = m; m->bio = bio; save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); @@ -1025,7 +1030,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, dm_block_t data_origin, dm_block_t data_dest, - struct cell *cell, struct bio *bio) + struct dm_bio_prison_cell *cell, struct bio *bio) { schedule_copy(tc, virt_block, tc->pool_dev, data_origin, data_dest, cell, bio); @@ -1033,18 +1038,18 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, dm_block_t data_dest, - struct cell *cell, struct bio *bio) + struct dm_bio_prison_cell *cell, struct bio *bio) { schedule_copy(tc, virt_block, tc->origin_dev, virt_block, data_dest, cell, bio); } static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, - dm_block_t data_block, struct cell *cell, + dm_block_t data_block, struct dm_bio_prison_cell *cell, struct bio *bio) { struct pool *pool = tc->pool; - struct new_mapping *m = get_next_mapping(pool); + struct dm_thin_new_mapping *m = get_next_mapping(pool); INIT_LIST_HEAD(&m->list); m->quiesced = 1; @@ -1065,12 +1070,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, process_prepared_mapping(m); else if (io_overwrites_block(pool, bio)) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; + h->overwrite_mapping = m; m->bio = bio; save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); remap_and_issue(tc, bio, data_block); - } else { int r; struct dm_io_region to; @@ -1155,7 +1160,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) */ static void retry_on_resume(struct bio *bio) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; struct thin_c *tc = h->tc; struct pool *pool = tc->pool; unsigned long flags; @@ -1165,7 +1170,7 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&pool->lock, flags); } -static void no_space(struct cell *cell) +static void no_space(struct dm_bio_prison_cell *cell) { struct bio *bio; struct bio_list bios; @@ -1182,11 +1187,11 @@ static void process_discard(struct thin_c *tc, struct bio *bio) int r; unsigned long flags; struct pool *pool = tc->pool; - struct cell *cell, *cell2; + struct dm_bio_prison_cell *cell, *cell2; struct cell_key key, key2; dm_block_t block = get_bio_block(tc, bio); struct dm_thin_lookup_result lookup_result; - struct new_mapping *m; + struct dm_thin_new_mapping *m; build_virtual_key(tc->td, block, &key); if (bio_detain(tc->pool->prison, &key, bio, &cell)) @@ -1240,7 +1245,10 @@ static void process_discard(struct thin_c *tc, struct bio *bio) cell_release_singleton(cell, bio); cell_release_singleton(cell2, bio); - remap_and_issue(tc, bio, lookup_result.block); + if ((!lookup_result.shared) && pool->pf.discard_passdown) + remap_and_issue(tc, bio, lookup_result.block); + else + bio_endio(bio, 0); } break; @@ -1263,7 +1271,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio) static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, struct cell_key *key, struct dm_thin_lookup_result *lookup_result, - struct cell *cell) + struct dm_bio_prison_cell *cell) { int r; dm_block_t data_block; @@ -1290,7 +1298,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, dm_block_t block, struct dm_thin_lookup_result *lookup_result) { - struct cell *cell; + struct dm_bio_prison_cell *cell; struct pool *pool = tc->pool; struct cell_key key; @@ -1305,7 +1313,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, if (bio_data_dir(bio) == WRITE) break_sharing(tc, bio, block, &key, lookup_result, cell); else { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; h->shared_read_entry = ds_inc(&pool->shared_read_ds); @@ -1315,7 +1323,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, } static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, - struct cell *cell) + struct dm_bio_prison_cell *cell) { int r; dm_block_t data_block; @@ -1363,7 +1371,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio) { int r; dm_block_t block = get_bio_block(tc, bio); - struct cell *cell; + struct dm_bio_prison_cell *cell; struct cell_key key; struct dm_thin_lookup_result lookup_result; @@ -1432,7 +1440,7 @@ static void process_deferred_bios(struct pool *pool) spin_unlock_irqrestore(&pool->lock, flags); while ((bio = bio_list_pop(&bios))) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; struct thin_c *tc = h->tc; /* @@ -1522,10 +1530,10 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) wake_worker(pool); } -static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) +static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); + struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); h->tc = tc; h->shared_read_entry = NULL; @@ -1687,6 +1695,9 @@ static void __pool_destroy(struct pool *pool) kfree(pool); } +static struct kmem_cache *_new_mapping_cache; +static struct kmem_cache *_endio_hook_cache; + static struct pool *pool_create(struct mapped_device *pool_md, struct block_device *metadata_dev, unsigned long block_size, char **error) @@ -1755,16 +1766,16 @@ static struct pool *pool_create(struct mapped_device *pool_md, ds_init(&pool->all_io_ds); pool->next_mapping = NULL; - pool->mapping_pool = - mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping)); + pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE, + _new_mapping_cache); if (!pool->mapping_pool) { *error = "Error creating pool's mapping mempool"; err_p = ERR_PTR(-ENOMEM); goto bad_mapping_pool; } - pool->endio_hook_pool = - mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook)); + pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE, + _endio_hook_cache); if (!pool->endio_hook_pool) { *error = "Error creating pool's endio_hook mempool"; err_p = ERR_PTR(-ENOMEM); @@ -2276,6 +2287,43 @@ static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct po return 0; } +static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) +{ + int r; + + r = check_arg_count(argc, 1); + if (r) + return r; + + r = dm_pool_commit_metadata(pool->pmd); + if (r) { + DMERR("%s: dm_pool_commit_metadata() failed, error = %d", + __func__, r); + return r; + } + + r = dm_pool_reserve_metadata_snap(pool->pmd); + if (r) + DMWARN("reserve_metadata_snap message failed."); + + return r; +} + +static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) +{ + int r; + + r = check_arg_count(argc, 1); + if (r) + return r; + + r = dm_pool_release_metadata_snap(pool->pmd); + if (r) + DMWARN("release_metadata_snap message failed."); + + return r; +} + /* * Messages supported: * create_thin <dev_id> @@ -2283,6 +2331,8 @@ static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct po * delete <dev_id> * trim <dev_id> <new_size_in_sectors> * set_transaction_id <current_trans_id> <new_trans_id> + * reserve_metadata_snap + * release_metadata_snap */ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) { @@ -2302,6 +2352,12 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) else if (!strcasecmp(argv[0], "set_transaction_id")) r = process_set_transaction_id_mesg(argc, argv, pool); + else if (!strcasecmp(argv[0], "reserve_metadata_snap")) + r = process_reserve_metadata_snap_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "release_metadata_snap")) + r = process_release_metadata_snap_mesg(argc, argv, pool); + else DMWARN("Unrecognised thin pool target message received: %s", argv[0]); @@ -2361,7 +2417,7 @@ static int pool_status(struct dm_target *ti, status_type_t type, if (r) return r; - r = dm_pool_get_held_metadata_root(pool->pmd, &held_root); + r = dm_pool_get_metadata_snap(pool->pmd, &held_root); if (r) return r; @@ -2457,7 +2513,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -2575,6 +2631,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (tc->pool->pf.discard_enabled) { ti->discards_supported = 1; ti->num_discard_requests = 1; + ti->discard_zeroes_data_unsupported = 1; } dm_put(pool_md); @@ -2613,9 +2670,9 @@ static int thin_endio(struct dm_target *ti, union map_info *map_context) { unsigned long flags; - struct endio_hook *h = map_context->ptr; + struct dm_thin_endio_hook *h = map_context->ptr; struct list_head work; - struct new_mapping *m, *tmp; + struct dm_thin_new_mapping *m, *tmp; struct pool *pool = h->tc->pool; if (h->shared_read_entry) { @@ -2755,7 +2812,32 @@ static int __init dm_thin_init(void) r = dm_register_target(&pool_target); if (r) - dm_unregister_target(&thin_target); + goto bad_pool_target; + + r = -ENOMEM; + + _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); + if (!_cell_cache) + goto bad_cell_cache; + + _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); + if (!_new_mapping_cache) + goto bad_new_mapping_cache; + + _endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0); + if (!_endio_hook_cache) + goto bad_endio_hook_cache; + + return 0; + +bad_endio_hook_cache: + kmem_cache_destroy(_new_mapping_cache); +bad_new_mapping_cache: + kmem_cache_destroy(_cell_cache); +bad_cell_cache: + dm_unregister_target(&pool_target); +bad_pool_target: + dm_unregister_target(&thin_target); return r; } @@ -2764,6 +2846,10 @@ static void dm_thin_exit(void) { dm_unregister_target(&thin_target); dm_unregister_target(&pool_target); + + kmem_cache_destroy(_cell_cache); + kmem_cache_destroy(_new_mapping_cache); + kmem_cache_destroy(_endio_hook_cache); } module_init(dm_thin_init); diff --git a/drivers/md/md.c b/drivers/md/md.c index 1c2f9048e1ae..db02d2efb76f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -498,61 +498,13 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -/* Support for plugging. - * This mirrors the plugging support in request_queue, but does not - * require having a whole queue or request structures. - * We allocate an md_plug_cb for each md device and each thread it gets - * plugged on. This links tot the private plug_handle structure in the - * personality data where we keep a count of the number of outstanding - * plugs so other code can see if a plug is active. - */ -struct md_plug_cb { - struct blk_plug_cb cb; - struct mddev *mddev; -}; - -static void plugger_unplug(struct blk_plug_cb *cb) +void md_unplug(struct blk_plug_cb *cb, bool from_schedule) { - struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); - if (atomic_dec_and_test(&mdcb->mddev->plug_cnt)) - md_wakeup_thread(mdcb->mddev->thread); - kfree(mdcb); -} - -/* Check that an unplug wakeup will come shortly. - * If not, wakeup the md thread immediately - */ -int mddev_check_plugged(struct mddev *mddev) -{ - struct blk_plug *plug = current->plug; - struct md_plug_cb *mdcb; - - if (!plug) - return 0; - - list_for_each_entry(mdcb, &plug->cb_list, cb.list) { - if (mdcb->cb.callback == plugger_unplug && - mdcb->mddev == mddev) { - /* Already on the list, move to top */ - if (mdcb != list_first_entry(&plug->cb_list, - struct md_plug_cb, - cb.list)) - list_move(&mdcb->cb.list, &plug->cb_list); - return 1; - } - } - /* Not currently on the callback list */ - mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC); - if (!mdcb) - return 0; - - mdcb->mddev = mddev; - mdcb->cb.callback = plugger_unplug; - atomic_inc(&mddev->plug_cnt); - list_add(&mdcb->cb.list, &plug->cb_list); - return 1; + struct mddev *mddev = cb->data; + md_wakeup_thread(mddev->thread); + kfree(cb); } -EXPORT_SYMBOL_GPL(mddev_check_plugged); +EXPORT_SYMBOL(md_unplug); static inline struct mddev *mddev_get(struct mddev *mddev) { @@ -602,7 +554,6 @@ void mddev_init(struct mddev *mddev) atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); - atomic_set(&mddev->plug_cnt, 0); spin_lock_init(&mddev->write_lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); @@ -2931,6 +2882,7 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len) * can be sane */ return -EBUSY; rdev->data_offset = offset; + rdev->new_data_offset = offset; return len; } @@ -3926,8 +3878,8 @@ array_state_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", array_states[st]); } -static int do_md_stop(struct mddev * mddev, int ro, int is_open); -static int md_set_readonly(struct mddev * mddev, int is_open); +static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); +static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); static int do_md_run(struct mddev * mddev); static int restart_array(struct mddev *mddev); @@ -3943,14 +3895,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) /* stopping an active array */ if (atomic_read(&mddev->openers) > 0) return -EBUSY; - err = do_md_stop(mddev, 0, 0); + err = do_md_stop(mddev, 0, NULL); break; case inactive: /* stopping an active array */ if (mddev->pers) { if (atomic_read(&mddev->openers) > 0) return -EBUSY; - err = do_md_stop(mddev, 2, 0); + err = do_md_stop(mddev, 2, NULL); } else err = 0; /* already inactive */ break; @@ -3958,7 +3910,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; /* not supported yet */ case readonly: if (mddev->pers) - err = md_set_readonly(mddev, 0); + err = md_set_readonly(mddev, NULL); else { mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); @@ -3968,7 +3920,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) case read_auto: if (mddev->pers) { if (mddev->ro == 0) - err = md_set_readonly(mddev, 0); + err = md_set_readonly(mddev, NULL); else if (mddev->ro == 1) err = restart_array(mddev); if (err == 0) { @@ -5351,15 +5303,17 @@ void md_stop(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_stop); -static int md_set_readonly(struct mddev *mddev, int is_open) +static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) { int err = 0; mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > is_open) { + if (atomic_read(&mddev->openers) > !!bdev) { printk("md: %s still in use.\n",mdname(mddev)); err = -EBUSY; goto out; } + if (bdev) + sync_blockdev(bdev); if (mddev->pers) { __md_stop_writes(mddev); @@ -5381,18 +5335,26 @@ out: * 0 - completely stop and dis-assemble array * 2 - stop but do not disassemble array */ -static int do_md_stop(struct mddev * mddev, int mode, int is_open) +static int do_md_stop(struct mddev * mddev, int mode, + struct block_device *bdev) { struct gendisk *disk = mddev->gendisk; struct md_rdev *rdev; mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > is_open || + if (atomic_read(&mddev->openers) > !!bdev || mddev->sysfs_active) { printk("md: %s still in use.\n",mdname(mddev)); mutex_unlock(&mddev->open_mutex); return -EBUSY; } + if (bdev) + /* It is possible IO was issued on some other + * open file which was closed before we took ->open_mutex. + * As that was not the last close __blkdev_put will not + * have called sync_blockdev, so we must. + */ + sync_blockdev(bdev); if (mddev->pers) { if (mddev->ro) @@ -5466,7 +5428,7 @@ static void autorun_array(struct mddev *mddev) err = do_md_run(mddev); if (err) { printk(KERN_WARNING "md: do_md_run() returned %d\n", err); - do_md_stop(mddev, 0, 0); + do_md_stop(mddev, 0, NULL); } } @@ -5784,8 +5746,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) super_types[mddev->major_version]. validate_super(mddev, rdev); if ((info->state & (1<<MD_DISK_SYNC)) && - (!test_bit(In_sync, &rdev->flags) || - rdev->raid_disk != info->raid_disk)) { + rdev->raid_disk != info->raid_disk) { /* This was a hot-add request, but events doesn't * match, so reject it. */ @@ -6482,11 +6443,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto done_unlock; case STOP_ARRAY: - err = do_md_stop(mddev, 0, 1); + err = do_md_stop(mddev, 0, bdev); goto done_unlock; case STOP_ARRAY_RO: - err = md_set_readonly(mddev, 1); + err = md_set_readonly(mddev, bdev); goto done_unlock; case BLKROSET: @@ -6751,7 +6712,7 @@ struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev thread->tsk = kthread_run(md_thread, thread, "%s_%s", mdname(thread->mddev), - name ?: mddev->pers->name); + name); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; @@ -7298,6 +7259,7 @@ void md_do_sync(struct mddev *mddev) int skipped = 0; struct md_rdev *rdev; char *desc; + struct blk_plug plug; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7447,6 +7409,7 @@ void md_do_sync(struct mddev *mddev) } mddev->curr_resync_completed = j; + blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -7552,6 +7515,7 @@ void md_do_sync(struct mddev *mddev) * this also signals 'finished resyncing' to md_stop */ out: + blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); /* tell personality that we are finished */ diff --git a/drivers/md/md.h b/drivers/md/md.h index 7b4a3c318cae..f385b038589d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -266,9 +266,6 @@ struct mddev { int new_chunk_sectors; int reshape_backwards; - atomic_t plug_cnt; /* If device is expecting - * more bios soon. - */ struct md_thread *thread; /* management thread */ struct md_thread *sync_thread; /* doing resync or reconstruct */ sector_t curr_resync; /* last block scheduled */ @@ -630,6 +627,12 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); -extern int mddev_check_plugged(struct mddev *mddev); extern void md_trim_bio(struct bio *bio, int offset, int size); + +extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); +static inline int mddev_check_plugged(struct mddev *mddev) +{ + return !!blk_check_plugged(md_unplug, mddev, + sizeof(struct blk_plug_cb)); +} #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 9339e67fcc79..61a1833ebaf3 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -474,7 +474,8 @@ static int multipath_run (struct mddev *mddev) } { - mddev->thread = md_register_thread(multipathd, mddev, NULL); + mddev->thread = md_register_thread(multipathd, mddev, + "multipath"); if (!mddev->thread) { printk(KERN_ERR "multipath: couldn't allocate thread" " for %s\n", mdname(mddev)); diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c index 50ed53bf4aa2..fc90c11620ad 100644 --- a/drivers/md/persistent-data/dm-space-map-checker.c +++ b/drivers/md/persistent-data/dm-space-map-checker.c @@ -8,6 +8,7 @@ #include <linux/device-mapper.h> #include <linux/export.h> +#include <linux/vmalloc.h> #ifdef CONFIG_DM_DEBUG_SPACE_MAPS @@ -89,13 +90,23 @@ static int ca_create(struct count_array *ca, struct dm_space_map *sm) ca->nr = nr_blocks; ca->nr_free = nr_blocks; - ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL); - if (!ca->counts) - return -ENOMEM; + + if (!nr_blocks) + ca->counts = NULL; + else { + ca->counts = vzalloc(sizeof(*ca->counts) * nr_blocks); + if (!ca->counts) + return -ENOMEM; + } return 0; } +static void ca_destroy(struct count_array *ca) +{ + vfree(ca->counts); +} + static int ca_load(struct count_array *ca, struct dm_space_map *sm) { int r; @@ -126,12 +137,14 @@ static int ca_load(struct count_array *ca, struct dm_space_map *sm) static int ca_extend(struct count_array *ca, dm_block_t extra_blocks) { dm_block_t nr_blocks = ca->nr + extra_blocks; - uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL); + uint32_t *counts = vzalloc(sizeof(*counts) * nr_blocks); if (!counts) return -ENOMEM; - memcpy(counts, ca->counts, sizeof(*counts) * ca->nr); - kfree(ca->counts); + if (ca->counts) { + memcpy(counts, ca->counts, sizeof(*counts) * ca->nr); + ca_destroy(ca); + } ca->nr = nr_blocks; ca->nr_free += extra_blocks; ca->counts = counts; @@ -151,11 +164,6 @@ static int ca_commit(struct count_array *old, struct count_array *new) return 0; } -static void ca_destroy(struct count_array *ca) -{ - kfree(ca->counts); -} - /*----------------------------------------------------------------*/ struct sm_checker { @@ -343,25 +351,25 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) int r; struct sm_checker *smc; - if (!sm) - return NULL; + if (IS_ERR_OR_NULL(sm)) + return ERR_PTR(-EINVAL); smc = kmalloc(sizeof(*smc), GFP_KERNEL); if (!smc) - return NULL; + return ERR_PTR(-ENOMEM); memcpy(&smc->sm, &ops_, sizeof(smc->sm)); r = ca_create(&smc->old_counts, sm); if (r) { kfree(smc); - return NULL; + return ERR_PTR(r); } r = ca_create(&smc->counts, sm); if (r) { ca_destroy(&smc->old_counts); kfree(smc); - return NULL; + return ERR_PTR(r); } smc->real_sm = sm; @@ -371,7 +379,7 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) ca_destroy(&smc->counts); ca_destroy(&smc->old_counts); kfree(smc); - return NULL; + return ERR_PTR(r); } r = ca_commit(&smc->old_counts, &smc->counts); @@ -379,7 +387,7 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) ca_destroy(&smc->counts); ca_destroy(&smc->old_counts); kfree(smc); - return NULL; + return ERR_PTR(r); } return &smc->sm; @@ -391,25 +399,25 @@ struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) int r; struct sm_checker *smc; - if (!sm) - return NULL; + if (IS_ERR_OR_NULL(sm)) + return ERR_PTR(-EINVAL); smc = kmalloc(sizeof(*smc), GFP_KERNEL); if (!smc) - return NULL; + return ERR_PTR(-ENOMEM); memcpy(&smc->sm, &ops_, sizeof(smc->sm)); r = ca_create(&smc->old_counts, sm); if (r) { kfree(smc); - return NULL; + return ERR_PTR(r); } r = ca_create(&smc->counts, sm); if (r) { ca_destroy(&smc->old_counts); kfree(smc); - return NULL; + return ERR_PTR(r); } smc->real_sm = sm; diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index fc469ba9f627..3d0ed5332883 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -290,7 +290,16 @@ struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, dm_block_t nr_blocks) { struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks); - return dm_sm_checker_create_fresh(sm); + struct dm_space_map *smc; + + if (IS_ERR_OR_NULL(sm)) + return sm; + + smc = dm_sm_checker_create_fresh(sm); + if (IS_ERR(smc)) + dm_sm_destroy(sm); + + return smc; } EXPORT_SYMBOL_GPL(dm_sm_disk_create); diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 6f8d38747d7f..e5604b32d91f 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -138,6 +138,9 @@ EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone); void dm_tm_destroy(struct dm_transaction_manager *tm) { + if (!tm->is_clone) + wipe_shadow_table(tm); + kfree(tm); } EXPORT_SYMBOL_GPL(dm_tm_destroy); @@ -249,6 +252,7 @@ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, return r; } +EXPORT_SYMBOL_GPL(dm_tm_shadow_block); int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, struct dm_block_validator *v, @@ -259,6 +263,7 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, return dm_bm_read_lock(tm->bm, b, v, blk); } +EXPORT_SYMBOL_GPL(dm_tm_read_lock); int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b) { @@ -342,8 +347,10 @@ static int dm_tm_create_internal(struct dm_block_manager *bm, } *sm = dm_sm_checker_create(inner); - if (!*sm) + if (IS_ERR(*sm)) { + r = PTR_ERR(*sm); goto bad2; + } } else { r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location, @@ -362,8 +369,10 @@ static int dm_tm_create_internal(struct dm_block_manager *bm, } *sm = dm_sm_checker_create(inner); - if (!*sm) + if (IS_ERR(*sm)) { + r = PTR_ERR(*sm); goto bad2; + } } return 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 835de7168cd3..36a8fc059ac3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -517,8 +517,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect int bad_sectors; int disk = start_disk + i; - if (disk >= conf->raid_disks) - disk -= conf->raid_disks; + if (disk >= conf->raid_disks * 2) + disk -= conf->raid_disks * 2; rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED @@ -883,7 +883,6 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; - int plugged; int first_clone; int sectors_handled; int max_sectors; @@ -1034,7 +1033,6 @@ read_again: * the bad blocks. Each set of writes gets it's own r1bio * with a set of bios attached. */ - plugged = mddev_check_plugged(mddev); disks = conf->raid_disks * 2; retry_write: @@ -1191,6 +1189,8 @@ read_again: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } /* Mustn't call r1_bio_write_done before this next test, * as it could result in the bio being freed. @@ -1213,9 +1213,6 @@ read_again: /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - - if (do_sync || !bitmap || !plugged) - md_wakeup_thread(mddev->thread); } static void status(struct seq_file *seq, struct mddev *mddev) @@ -1821,8 +1818,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) if (atomic_dec_and_test(&r1_bio->remaining)) { /* if we're here, all write(s) have completed, so clean up */ - md_done_sync(mddev, r1_bio->sectors, 1); - put_buf(r1_bio); + int s = r1_bio->sectors; + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + put_buf(r1_bio); + md_done_sync(mddev, s, 1); + } } } @@ -2170,8 +2173,7 @@ static void raid1d(struct mddev *mddev) blk_start_plug(&plug); for (;;) { - if (atomic_read(&mddev->plug_cnt) == 0) - flush_pending_writes(conf); + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { @@ -2488,9 +2490,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { atomic_set(&r1_bio->remaining, read_targets); - for (i = 0; i < conf->raid_disks * 2; i++) { + for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { + read_targets--; md_sync_acct(bio->bi_bdev, nr_sectors); generic_make_request(bio); } @@ -2550,6 +2553,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); rdev_for_each(rdev, mddev) { + struct request_queue *q; int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -2562,6 +2566,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (disk->rdev) goto abort; disk->rdev = rdev; + q = bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) + mddev->merge_check_needed = 1; disk->head_position = 0; } @@ -2617,7 +2624,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) goto abort; } err = -ENOMEM; - conf->thread = md_register_thread(raid1d, mddev, NULL); + conf->thread = md_register_thread(raid1d, mddev, "raid1"); if (!conf->thread) { printk(KERN_ERR "md/raid1:%s: couldn't allocate thread\n", diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 987db37cb875..5d33603a497d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1039,7 +1039,6 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_fua = (bio->bi_rw & REQ_FUA); unsigned long flags; struct md_rdev *blocked_rdev; - int plugged; int sectors_handled; int max_sectors; int sectors; @@ -1239,7 +1238,6 @@ read_again: * of r10_bios is recored in bio->bi_phys_segments just as with * the read case. */ - plugged = mddev_check_plugged(mddev); r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); @@ -1396,6 +1394,8 @@ retry_write: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); if (!r10_bio->devs[i].repl_bio) continue; @@ -1423,6 +1423,8 @@ retry_write: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } /* Don't remove the bias on 'remaining' (one_write_done) until @@ -1448,9 +1450,6 @@ retry_write: /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - - if (do_sync || !mddev->bitmap || !plugged) - md_wakeup_thread(mddev->thread); } static void status(struct seq_file *seq, struct mddev *mddev) @@ -2310,7 +2309,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 if (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, - s<<9, conf->tmppage, WRITE) + s, conf->tmppage, WRITE) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE @@ -2349,7 +2348,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 switch (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, - s<<9, conf->tmppage, + s, conf->tmppage, READ)) { case 0: /* Well, this device is dead */ @@ -2512,7 +2511,7 @@ read_more: slot = r10_bio->read_slot; printk_ratelimited( KERN_ERR - "md/raid10:%s: %s: redirecting" + "md/raid10:%s: %s: redirecting " "sector %llu to another mirror\n", mdname(mddev), bdevname(rdev->bdev, b), @@ -2890,6 +2889,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* want to reconstruct this device */ rb2 = r10_bio; sect = raid10_find_virt(conf, sector_nr, i); + if (sect >= mddev->resync_max_sectors) { + /* last stripe is not complete - don't + * try to recover this sector. + */ + continue; + } /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap @@ -3421,7 +3426,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); - conf->thread = md_register_thread(raid10d, mddev, NULL); + conf->thread = md_register_thread(raid10d, mddev, "raid10"); if (!conf->thread) goto out; @@ -3475,6 +3480,7 @@ static int run(struct mddev *mddev) rdev_for_each(rdev, mddev) { long long diff; + struct request_queue *q; disk_idx = rdev->raid_disk; if (disk_idx < 0) @@ -3493,6 +3499,9 @@ static int run(struct mddev *mddev) goto out_free_conf; disk->rdev = rdev; } + q = bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) + mddev->merge_check_needed = 1; diff = (rdev->new_data_offset - rdev->data_offset); if (!mddev->reshape_backwards) diff = -diff; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d26767246d26..bde9da2baa39 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -196,12 +196,14 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { - if (test_bit(STRIPE_DELAYED, &sh->state)) + if (test_bit(STRIPE_DELAYED, &sh->state) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { + clear_bit(STRIPE_DELAYED, &sh->state); clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); } @@ -606,6 +608,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) * a chance*/ md_check_recovery(conf->mddev); } + /* + * Because md_wait_for_blocked_rdev + * will dec nr_pending, we must + * increment it first. + */ + atomic_inc(&rdev->nr_pending); md_wait_for_blocked_rdev(rdev, conf->mddev); } else { /* Acknowledged bad block - skip the write */ @@ -1737,6 +1745,7 @@ static void raid5_end_read_request(struct bio * bi, int error) } else { const char *bdn = bdevname(rdev->bdev, b); int retry = 0; + int set_bad = 0; clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); @@ -1748,7 +1757,8 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (conf->mddev->degraded >= conf->max_degraded) + else if (conf->mddev->degraded >= conf->max_degraded) { + set_bad = 1; printk_ratelimited( KERN_WARNING "md/raid:%s: read error not correctable " @@ -1756,8 +1766,9 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) + } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { /* Oh, no!!! */ + set_bad = 1; printk_ratelimited( KERN_WARNING "md/raid:%s: read error NOT corrected!! " @@ -1765,7 +1776,7 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (atomic_read(&rdev->read_errors) + } else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING "md/raid:%s: Too many read errors, failing device %s.\n", @@ -1777,7 +1788,11 @@ static void raid5_end_read_request(struct bio * bi, int error) else { clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); - md_error(conf->mddev, rdev); + if (!(set_bad + && test_bit(In_sync, &rdev->flags) + && rdev_set_badblocks( + rdev, sh->sector, STRIPE_SECTORS, 0))) + md_error(conf->mddev, rdev); } } rdev_dec_pending(rdev, conf->mddev); @@ -3582,8 +3597,18 @@ static void handle_stripe(struct stripe_head *sh) finish: /* wait for this device to become unblocked */ - if (conf->mddev->external && unlikely(s.blocked_rdev)) - md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); + if (unlikely(s.blocked_rdev)) { + if (conf->mddev->external) + md_wait_for_blocked_rdev(s.blocked_rdev, + conf->mddev); + else + /* Internal metadata will immediately + * be written by raid5d, so we don't + * need to wait here. + */ + rdev_dec_pending(s.blocked_rdev, + conf->mddev); + } if (s.handle_bad_blocks) for (i = disks; i--; ) { @@ -3881,8 +3906,6 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) raid_bio->bi_next = (void*)rdev; align_bi->bi_bdev = rdev->bdev; align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); - /* No reshape active, so we can trust rdev->data_offset */ - align_bi->bi_sector += rdev->data_offset; if (!bio_fits_rdev(align_bi) || is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, @@ -3893,6 +3916,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) return 0; } + /* No reshape active, so we can trust rdev->data_offset */ + align_bi->bi_sector += rdev->data_offset; + spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, @@ -3971,7 +3997,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) struct stripe_head *sh; const int rw = bio_data_dir(bi); int remaining; - int plugged; if (unlikely(bi->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bi); @@ -3990,7 +4015,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ - plugged = mddev_check_plugged(mddev); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int previous; @@ -4092,6 +4116,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) if ((bi->bi_rw & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); + mddev_check_plugged(mddev); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -4099,10 +4124,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) finish_wait(&conf->wait_for_overlap, &w); break; } - } - if (!plugged) - md_wakeup_thread(mddev->thread); spin_lock_irq(&conf->device_lock); remaining = raid5_dec_bi_phys_segments(bi); @@ -4521,7 +4543,7 @@ static void raid5d(struct mddev *mddev) while (1) { struct bio *bio; - if (atomic_read(&mddev->plug_cnt) == 0 && + if ( !list_empty(&conf->bitmap_list)) { /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; @@ -4531,8 +4553,7 @@ static void raid5d(struct mddev *mddev) conf->seq_write = conf->seq_flush; activate_bit_delay(conf); } - if (atomic_read(&mddev->plug_cnt) == 0) - raid5_activate_delayed(conf); + raid5_activate_delayed(conf); while ((bio = remove_bio_from_retry(conf))) { int ok; @@ -4823,6 +4844,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) int raid_disk, memory, max_disks; struct md_rdev *rdev; struct disk_info *disk; + char pers_name[6]; if (mddev->new_level != 5 && mddev->new_level != 4 @@ -4946,7 +4968,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) printk(KERN_INFO "md/raid:%s: allocated %dkB\n", mdname(mddev), memory); - conf->thread = md_register_thread(raid5d, mddev, NULL); + sprintf(pers_name, "raid%d", mddev->new_level); + conf->thread = md_register_thread(raid5d, mddev, pers_name); if (!conf->thread) { printk(KERN_ERR "md/raid:%s: couldn't allocate thread.\n", @@ -5465,10 +5488,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk >= 0 && rdev->saved_raid_disk >= first && conf->disks[rdev->saved_raid_disk].rdev == NULL) - disk = rdev->saved_raid_disk; - else - disk = first; - for ( ; disk <= last ; disk++) { + first = rdev->saved_raid_disk; + + for (disk = first; disk <= last; disk++) { p = conf->disks + disk; if (p->rdev == NULL) { clear_bit(In_sync, &rdev->flags); @@ -5477,8 +5499,11 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); - break; + goto out; } + } + for (disk = first; disk <= last; disk++) { + p = conf->disks + disk; if (test_bit(WantReplacement, &p->rdev->flags) && p->replacement == NULL) { clear_bit(In_sync, &rdev->flags); @@ -5490,6 +5515,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } +out: print_raid5_conf(conf); return err; } |