diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm.c | 8 | ||||
-rw-r--r-- | drivers/md/faulty.c | 5 | ||||
-rw-r--r-- | drivers/md/md.c | 285 | ||||
-rw-r--r-- | drivers/md/md.h | 2 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-block-manager.c | 4 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree.h | 2 | ||||
-rw-r--r-- | drivers/md/raid1.c | 4 | ||||
-rw-r--r-- | drivers/md/raid10.c | 148 | ||||
-rw-r--r-- | drivers/md/raid5.c | 124 |
9 files changed, 331 insertions, 251 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 02db9183ca01..77e6eff41cae 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -740,8 +740,14 @@ static void rq_completed(struct mapped_device *md, int rw, int run_queue) if (!md_in_flight(md)) wake_up(&md->wait); + /* + * Run this off this callpath, as drivers could invoke end_io while + * inside their request_fn (and holding the queue lock). Calling + * back into ->request_fn() could deadlock attempting to grab the + * queue lock again. + */ if (run_queue) - blk_run_queue(md->queue); + blk_run_queue_async(md->queue); /* * dm_put() must be at the end of this function. See the comment above diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 45135f69509c..5e7dc772f5de 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -315,8 +315,11 @@ static int run(struct mddev *mddev) } conf->nfaults = 0; - rdev_for_each(rdev, mddev) + rdev_for_each(rdev, mddev) { conf->rdev = rdev; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + } md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); mddev->private = conf; diff --git a/drivers/md/md.c b/drivers/md/md.c index 7e513a38cec7..3db3d1b271f7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1414,12 +1414,11 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) unsigned long long newcsum; int size = 256 + le32_to_cpu(sb->max_dev)*2; __le32 *isuper = (__le32*)sb; - int i; disk_csum = sb->sb_csum; sb->sb_csum = 0; newcsum = 0; - for (i=0; size>=4; size -= 4 ) + for (; size >= 4; size -= 4) newcsum += le32_to_cpu(*isuper++); if (size == 2) @@ -1817,10 +1816,10 @@ retry: memset(bbp, 0xff, PAGE_SIZE); for (i = 0 ; i < bb->count ; i++) { - u64 internal_bb = *p++; + u64 internal_bb = p[i]; u64 store_bb = ((BB_OFFSET(internal_bb) << 10) | BB_LEN(internal_bb)); - *bbp++ = cpu_to_le64(store_bb); + bbp[i] = cpu_to_le64(store_bb); } bb->changed = 0; if (read_seqretry(&bb->lock, seq)) @@ -4124,7 +4123,7 @@ static struct md_sysfs_entry md_size = __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); -/* Metdata version. +/* Metadata version. * This is one of * 'none' for arrays with no metadata (good luck...) * 'external' for arrays with externally managed metadata, @@ -4753,6 +4752,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, } mddev_get(mddev); spin_unlock(&all_mddevs_lock); + if (entry->store == new_dev_store) + flush_workqueue(md_misc_wq); rv = mddev_lock(mddev); if (!rv) { rv = entry->store(mddev, page, length); @@ -5294,7 +5295,7 @@ void md_stop_writes(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_stop_writes); -void md_stop(struct mddev *mddev) +static void __md_stop(struct mddev *mddev) { mddev->ready = 0; mddev->pers->stop(mddev); @@ -5304,6 +5305,18 @@ void md_stop(struct mddev *mddev) mddev->pers = NULL; clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } + +void md_stop(struct mddev *mddev) +{ + /* stop the array and free an attached data structures. + * This is called from dm-raid + */ + __md_stop(mddev); + bitmap_destroy(mddev); + if (mddev->bio_set) + bioset_free(mddev->bio_set); +} + EXPORT_SYMBOL_GPL(md_stop); static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) @@ -5364,7 +5377,7 @@ static int do_md_stop(struct mddev * mddev, int mode, set_disk_ro(disk, 0); __md_stop_writes(mddev); - md_stop(mddev); + __md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; @@ -6334,24 +6347,23 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, * Commands dealing with the RAID driver but not any * particular array: */ - switch (cmd) - { - case RAID_VERSION: - err = get_version(argp); - goto done; + switch (cmd) { + case RAID_VERSION: + err = get_version(argp); + goto done; - case PRINT_RAID_DEBUG: - err = 0; - md_print_devices(); - goto done; + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done; #ifndef MODULE - case RAID_AUTORUN: - err = 0; - autostart_arrays(arg); - goto done; + case RAID_AUTORUN: + err = 0; + autostart_arrays(arg); + goto done; #endif - default:; + default:; } /* @@ -6386,6 +6398,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto abort; } + if (cmd == ADD_NEW_DISK) + /* need to ensure md_delayed_delete() has completed */ + flush_workqueue(md_misc_wq); + err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -6394,50 +6410,44 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto abort; } - switch (cmd) - { - case SET_ARRAY_INFO: - { - mdu_array_info_t info; - if (!arg) - memset(&info, 0, sizeof(info)); - else if (copy_from_user(&info, argp, sizeof(info))) { - err = -EFAULT; - goto abort_unlock; - } - if (mddev->pers) { - err = update_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't update" - " array info. %d\n", err); - goto abort_unlock; - } - goto done_unlock; - } - if (!list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: array %s already has disks!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - if (mddev->raid_disks) { - printk(KERN_WARNING - "md: array %s already initialised!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - err = set_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't set" - " array info. %d\n", err); - goto abort_unlock; - } + if (cmd == SET_ARRAY_INFO) { + mdu_array_info_t info; + if (!arg) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, argp, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + if (mddev->pers) { + err = update_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't update" + " array info. %d\n", err); + goto abort_unlock; } goto done_unlock; - - default:; + } + if (!list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: array %s already has disks!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (mddev->raid_disks) { + printk(KERN_WARNING + "md: array %s already initialised!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't set" + " array info. %d\n", err); + goto abort_unlock; + } + goto done_unlock; } /* @@ -6456,52 +6466,51 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, /* * Commands even a read-only array can execute: */ - switch (cmd) - { - case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto done_unlock; + switch (cmd) { + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, argp); + goto done_unlock; - case RESTART_ARRAY_RW: - err = restart_array(mddev); - goto done_unlock; + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; - case STOP_ARRAY: - err = do_md_stop(mddev, 0, bdev); - goto done_unlock; + case STOP_ARRAY: + err = do_md_stop(mddev, 0, bdev); + goto done_unlock; - case STOP_ARRAY_RO: - err = md_set_readonly(mddev, bdev); - goto done_unlock; + case STOP_ARRAY_RO: + err = md_set_readonly(mddev, bdev); + goto done_unlock; - case BLKROSET: - if (get_user(ro, (int __user *)(arg))) { - err = -EFAULT; - goto done_unlock; - } - err = -EINVAL; + case BLKROSET: + if (get_user(ro, (int __user *)(arg))) { + err = -EFAULT; + goto done_unlock; + } + err = -EINVAL; - /* if the bdev is going readonly the value of mddev->ro - * does not matter, no writes are coming - */ - if (ro) - goto done_unlock; + /* if the bdev is going readonly the value of mddev->ro + * does not matter, no writes are coming + */ + if (ro) + goto done_unlock; - /* are we are already prepared for writes? */ - if (mddev->ro != 1) - goto done_unlock; + /* are we are already prepared for writes? */ + if (mddev->ro != 1) + goto done_unlock; - /* transitioning to readauto need only happen for - * arrays that call md_write_start - */ - if (mddev->pers) { - err = restart_array(mddev); - if (err == 0) { - mddev->ro = 2; - set_disk_ro(mddev->gendisk, 0); - } + /* transitioning to readauto need only happen for + * arrays that call md_write_start + */ + if (mddev->pers) { + err = restart_array(mddev); + if (err == 0) { + mddev->ro = 2; + set_disk_ro(mddev->gendisk, 0); } - goto done_unlock; + } + goto done_unlock; } /* @@ -6523,37 +6532,36 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, } } - switch (cmd) + switch (cmd) { + case ADD_NEW_DISK: { - case ADD_NEW_DISK: - { - mdu_disk_info_t info; - if (copy_from_user(&info, argp, sizeof(info))) - err = -EFAULT; - else - err = add_new_disk(mddev, &info); - goto done_unlock; - } + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, new_decode_dev(arg)); + goto done_unlock; - case HOT_ADD_DISK: - err = hot_add_disk(mddev, new_decode_dev(arg)); - goto done_unlock; + case HOT_ADD_DISK: + err = hot_add_disk(mddev, new_decode_dev(arg)); + goto done_unlock; - case RUN_ARRAY: - err = do_md_run(mddev); - goto done_unlock; + case RUN_ARRAY: + err = do_md_run(mddev); + goto done_unlock; - case SET_BITMAP_FILE: - err = set_bitmap_file(mddev, (int)arg); - goto done_unlock; + case SET_BITMAP_FILE: + err = set_bitmap_file(mddev, (int)arg); + goto done_unlock; - default: - err = -EINVAL; - goto abort_unlock; + default: + err = -EINVAL; + goto abort_unlock; } done_unlock: @@ -7172,6 +7180,7 @@ void md_done_sync(struct mddev *mddev, int blocks, int ok) wake_up(&mddev->recovery_wait); if (!ok) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(MD_RECOVERY_ERROR, &mddev->recovery); md_wakeup_thread(mddev->thread); // stop recovery, signal do_sync .... } @@ -7269,6 +7278,7 @@ EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) +#define UPDATE_FREQUENCY (5*60*HZ) void md_do_sync(struct md_thread *thread) { struct mddev *mddev = thread->mddev; @@ -7277,6 +7287,7 @@ void md_do_sync(struct md_thread *thread) window; sector_t max_sectors,j, io_sectors; unsigned long mark[SYNC_MARKS]; + unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; struct list_head *tmp; @@ -7436,6 +7447,7 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = j; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); md_new_event(mddev); + update_time = jiffies; blk_start_plug(&plug); while (j < max_sectors) { @@ -7447,6 +7459,7 @@ void md_do_sync(struct md_thread *thread) ((mddev->curr_resync > mddev->curr_resync_completed && (mddev->curr_resync - mddev->curr_resync_completed) > (max_sectors >> 4)) || + time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || (j - mddev->curr_resync_completed)*2 >= mddev->resync_max - mddev->curr_resync_completed )) { @@ -7454,6 +7467,10 @@ void md_do_sync(struct md_thread *thread) wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + j > mddev->recovery_cp) + mddev->recovery_cp = j; + update_time = jiffies; set_bit(MD_CHANGE_CLEAN, &mddev->flags); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } @@ -7558,8 +7575,13 @@ void md_do_sync(struct md_thread *thread) printk(KERN_INFO "md: checkpointing %s of %s.\n", desc, mdname(mddev)); - mddev->recovery_cp = - mddev->curr_resync_completed; + if (test_bit(MD_RECOVERY_ERROR, + &mddev->recovery)) + mddev->recovery_cp = + mddev->curr_resync_completed; + else + mddev->recovery_cp = + mddev->curr_resync; } } else mddev->recovery_cp = MaxSector; @@ -7936,9 +7958,9 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { int hi; - int lo = 0; + int lo; u64 *p = bb->page; - int rv = 0; + int rv; sector_t target = s + sectors; unsigned seq; @@ -7953,7 +7975,8 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, retry: seq = read_seqbegin(&bb->lock); - + lo = 0; + rv = 0; hi = bb->count; /* Binary search between lo and hi for 'target' diff --git a/drivers/md/md.h b/drivers/md/md.h index 1e2fc3d9c74c..eca59c3074ef 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -307,6 +307,7 @@ struct mddev { * REQUEST: user-space has requested a sync (used with SYNC) * CHECK: user-space request for check-only, no repair * RESHAPE: A reshape is happening + * ERROR: sync-action interrupted because io-error * * If neither SYNC or RESHAPE are set, then it is a recovery. */ @@ -320,6 +321,7 @@ struct mddev { #define MD_RECOVERY_CHECK 7 #define MD_RECOVERY_RESHAPE 8 #define MD_RECOVERY_FROZEN 9 +#define MD_RECOVERY_ERROR 10 unsigned long recovery; /* If a RAID personality determines that recovery (of a particular diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 5ba277768d99..a3ae09124a67 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -25,7 +25,7 @@ * may be held at once. This is just an implementation detail. * * ii) Recursive locking attempts are detected and return EINVAL. A stack - * trace is also emitted for the previous lock aquisition. + * trace is also emitted for the previous lock acquisition. * * iii) Priority is given to write locks. */ @@ -109,7 +109,7 @@ static int __check_holder(struct block_lock *lock) DMERR("previously held here:"); print_stack_trace(lock->traces + i, 4); - DMERR("subsequent aquisition attempted here:"); + DMERR("subsequent acquisition attempted here:"); t.nr_entries = 0; t.max_entries = MAX_STACK; t.entries = entries; diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index ae02c84410ff..a2cd50441ca1 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -35,7 +35,7 @@ struct dm_transaction_manager; */ /* - * Infomation about the values stored within the btree. + * Information about the values stored within the btree. */ struct dm_btree_value_type { void *context; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 534dd74a2da0..d5bddfc4010e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -962,7 +962,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) struct r1conf *conf = mddev->private; struct bio *bio; - if (from_schedule) { + if (from_schedule || current->bio_list) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); conf->pending_count += plug->pending_cnt; @@ -2709,7 +2709,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) || disk_idx < 0) continue; if (test_bit(Replacement, &rdev->flags)) - disk = conf->mirrors + conf->raid_disks + disk_idx; + disk = conf->mirrors + mddev->raid_disks + disk_idx; else disk = conf->mirrors + disk_idx; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 9a08f621b27d..64d48249c03b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -499,7 +499,7 @@ static void raid10_end_write_request(struct bio *bio, int error) */ one_write_done(r10_bio); if (dec_rdev) - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + rdev_dec_pending(rdev, conf->mddev); } /* @@ -1068,7 +1068,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) struct r10conf *conf = mddev->private; struct bio *bio; - if (from_schedule) { + if (from_schedule || current->bio_list) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); conf->pending_count += plug->pending_cnt; @@ -1333,18 +1333,21 @@ retry_write: blocked_rdev = rrdev; break; } + if (rdev && (test_bit(Faulty, &rdev->flags) + || test_bit(Unmerged, &rdev->flags))) + rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags) || test_bit(Unmerged, &rrdev->flags))) rrdev = NULL; r10_bio->devs[i].bio = NULL; r10_bio->devs[i].repl_bio = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags) || - test_bit(Unmerged, &rdev->flags)) { + + if (!rdev && !rrdev) { set_bit(R10BIO_Degraded, &r10_bio->state); continue; } - if (test_bit(WriteErrorSeen, &rdev->flags)) { + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; int bad_sectors; @@ -1386,8 +1389,10 @@ retry_write: max_sectors = good_sectors; } } - r10_bio->devs[i].bio = bio; - atomic_inc(&rdev->nr_pending); + if (rdev) { + r10_bio->devs[i].bio = bio; + atomic_inc(&rdev->nr_pending); + } if (rrdev) { r10_bio->devs[i].repl_bio = bio; atomic_inc(&rrdev->nr_pending); @@ -1443,69 +1448,71 @@ retry_write: for (i = 0; i < conf->copies; i++) { struct bio *mbio; int d = r10_bio->devs[i].devnum; - if (!r10_bio->devs[i].bio) - continue; + if (r10_bio->devs[i].bio) { + struct md_rdev *rdev = conf->mirrors[d].rdev; + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].bio = mbio; + + mbio->bi_sector = (r10_bio->devs[i].addr+ + choose_data_offset(r10_bio, + rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_private = r10_bio; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].bio = mbio; + atomic_inc(&r10_bio->remaining); - mbio->bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, - conf->mirrors[d].rdev)); - mbio->bi_bdev = conf->mirrors[d].rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; - mbio->bi_private = r10_bio; + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; + spin_lock_irqsave(&conf->device_lock, flags); + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!plug) + md_wakeup_thread(mddev->thread); + } - atomic_inc(&r10_bio->remaining); + if (r10_bio->devs[i].repl_bio) { + struct md_rdev *rdev = conf->mirrors[d].replacement; + if (rdev == NULL) { + /* Replacement just got moved to main 'rdev' */ + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].repl_bio = mbio; + + mbio->bi_sector = (r10_bio->devs[i].addr + + choose_data_offset( + r10_bio, rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_private = r10_bio; - cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid10_plug_cb, cb); - else - plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); - if (plug) { - bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; - } else { + atomic_inc(&r10_bio->remaining); + spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) - md_wakeup_thread(mddev->thread); - - if (!r10_bio->devs[i].repl_bio) - continue; - - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].repl_bio = mbio; - - /* We are actively writing to the original device - * so it cannot disappear, so the replacement cannot - * become NULL here - */ - mbio->bi_sector = (r10_bio->devs[i].addr + - choose_data_offset( - r10_bio, - conf->mirrors[d].replacement)); - mbio->bi_bdev = conf->mirrors[d].replacement->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; - mbio->bi_private = r10_bio; - - atomic_inc(&r10_bio->remaining); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) - md_wakeup_thread(mddev->thread); } /* Don't remove the bias on 'remaining' (one_write_done) until @@ -1782,7 +1789,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); - if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); @@ -3612,11 +3619,14 @@ static int run(struct mddev *mddev) discard_supported = true; } - if (discard_supported) - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - else - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - + if (mddev->queue) { + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + } /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2bf617d6f4fd..19d77a026639 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -53,6 +53,8 @@ #include <linux/cpu.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <trace/events/block.h> + #include "md.h" #include "raid5.h" #include "raid0.h" @@ -182,6 +184,8 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); bi = return_bi; } @@ -670,6 +674,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_next = NULL; if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); + trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), + bi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); generic_make_request(bi); } if (rrdev) { @@ -697,6 +704,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; rbi->bi_next = NULL; + trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), + rbi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); generic_make_request(rbi); } if (!rdev && !rrdev) { @@ -1575,7 +1585,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) * This happens in stages: * 1/ create a new kmem_cache and allocate the required number of * stripe_heads. - * 2/ gather all the old stripe_heads and tranfer the pages across + * 2/ gather all the old stripe_heads and transfer the pages across * to the new stripe_heads. This will have the side effect of * freezing the array as once all stripe_heads have been collected, * no IO will be possible. Old stripe heads are freed once their @@ -2772,10 +2782,12 @@ static void handle_stripe_clean_event(struct r5conf *conf, dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && (test_bit(R5_UPTODATE, &dev->flags) || - test_and_clear_bit(R5_Discard, &dev->flags))) { + test_bit(R5_Discard, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; pr_debug("Return write for disc %d\n", i); + if (test_and_clear_bit(R5_Discard, &dev->flags)) + clear_bit(R5_UPTODATE, &dev->flags); wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_sector < @@ -2793,7 +2805,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, !test_bit(STRIPE_DEGRADED, &sh->state), 0); } - } + } else if (test_bit(R5_Discard, &sh->dev[i].flags)) + clear_bit(R5_Discard, &sh->dev[i].flags); if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) @@ -2850,8 +2863,10 @@ static void handle_stripe_dirtying(struct r5conf *conf, pr_debug("for sector %llu, rmw=%d rcw=%d\n", (unsigned long long)sh->sector, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); - if (rmw < rcw && rmw > 0) + if (rmw < rcw && rmw > 0) { /* prefer read-modify-write, but need to get some data */ + blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d", + (unsigned long long)sh->sector, rmw); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && @@ -2862,7 +2877,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, if ( test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { pr_debug("Read_old block " - "%d for r-m-w\n", i); + "%d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; @@ -2872,8 +2887,10 @@ static void handle_stripe_dirtying(struct r5conf *conf, } } } + } if (rcw <= rmw && rcw > 0) { /* want reconstruct write, but need to get some data */ + int qread =0; rcw = 0; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -2892,12 +2909,17 @@ static void handle_stripe_dirtying(struct r5conf *conf, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; + qread++; } else { set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } } } + if (rcw) + blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", + (unsigned long long)sh->sector, + rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); } /* now if nothing is locked, and if we have enough data, * we can start a write request @@ -3219,10 +3241,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) } /* done submitting copies, wait for them to complete */ - if (tx) { - async_tx_ack(tx); - dma_wait_for_async_tx(tx); - } + async_tx_quiesce(&tx); } /* @@ -3488,40 +3507,6 @@ static void handle_stripe(struct stripe_head *sh) handle_failed_sync(conf, sh, &s); } - /* - * might be able to return some write requests if the parity blocks - * are safe, or on a failed drive - */ - pdev = &sh->dev[sh->pd_idx]; - s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); - qdev = &sh->dev[sh->qd_idx]; - s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) - || conf->level < 6; - - if (s.written && - (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) - && !test_bit(R5_LOCKED, &pdev->flags) - && (test_bit(R5_UPTODATE, &pdev->flags) || - test_bit(R5_Discard, &pdev->flags))))) && - (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) - && !test_bit(R5_LOCKED, &qdev->flags) - && (test_bit(R5_UPTODATE, &qdev->flags) || - test_bit(R5_Discard, &qdev->flags)))))) - handle_stripe_clean_event(conf, sh, disks, &s.return_bi); - - /* Now we might consider reading some blocks, either to check/generate - * parity, or to satisfy requests - * or to load a block that is being partially written. - */ - if (s.to_read || s.non_overwrite - || (conf->level == 6 && s.to_write && s.failed) - || (s.syncing && (s.uptodate + s.compute < disks)) - || s.replacing - || s.expanding) - handle_stripe_fill(sh, &s, disks); - /* Now we check to see if any write operations have recently * completed */ @@ -3559,6 +3544,40 @@ static void handle_stripe(struct stripe_head *sh) s.dec_preread_active = 1; } + /* + * might be able to return some write requests if the parity blocks + * are safe, or on a failed drive + */ + pdev = &sh->dev[sh->pd_idx]; + s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); + qdev = &sh->dev[sh->qd_idx]; + s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) + || conf->level < 6; + + if (s.written && + (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) + && !test_bit(R5_LOCKED, &pdev->flags) + && (test_bit(R5_UPTODATE, &pdev->flags) || + test_bit(R5_Discard, &pdev->flags))))) && + (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) + && !test_bit(R5_LOCKED, &qdev->flags) + && (test_bit(R5_UPTODATE, &qdev->flags) || + test_bit(R5_Discard, &qdev->flags)))))) + handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + + /* Now we might consider reading some blocks, either to check/generate + * parity, or to satisfy requests + * or to load a block that is being partially written. + */ + if (s.to_read || s.non_overwrite + || (conf->level == 6 && s.to_write && s.failed) + || (s.syncing && (s.uptodate + s.compute < disks)) + || s.replacing + || s.expanding) + handle_stripe_fill(sh, &s, disks); + /* Now to consider new write requests and what else, if anything * should be read. We do not handle new writes when: * 1/ A 'write' operation (copy+xor) is already in flight. @@ -3898,6 +3917,8 @@ static void raid5_align_endio(struct bio *bi, int error) rdev_dec_pending(rdev, conf->mddev); if (!error && uptodate) { + trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), + raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); @@ -4002,6 +4023,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) atomic_inc(&conf->active_aligned_reads); spin_unlock_irq(&conf->device_lock); + trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), + align_bi, disk_devt(mddev->gendisk), + raid_bio->bi_sector); generic_make_request(align_bi); return 1; } else { @@ -4076,6 +4100,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) struct stripe_head *sh; struct mddev *mddev = cb->cb.data; struct r5conf *conf = mddev->private; + int cnt = 0; if (cb->list.next && !list_empty(&cb->list)) { spin_lock_irq(&conf->device_lock); @@ -4090,9 +4115,11 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) smp_mb__before_clear_bit(); clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); __release_stripe(conf, sh); + cnt++; } spin_unlock_irq(&conf->device_lock); } + trace_block_unplug(mddev->queue, cnt, !from_schedule); kfree(cb); } @@ -4350,6 +4377,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); } } @@ -4726,8 +4755,11 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) handled++; } remaining = raid5_dec_bi_active_stripes(raid_bio); - if (remaining == 0) + if (remaining == 0) { + trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), + raid_bio, 0); bio_endio(raid_bio, 0); + } if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; @@ -5527,6 +5559,10 @@ static int run(struct mddev *mddev) * discard data disk but write parity disk */ stripe = stripe * PAGE_SIZE; + /* Round up to power of 2, as discard handling + * currently assumes that */ + while ((stripe-1) & stripe) + stripe = (stripe | (stripe-1)) + 1; mddev->queue->limits.discard_alignment = stripe; mddev->queue->limits.discard_granularity = stripe; /* |