diff options
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 1531 |
1 files changed, 833 insertions, 698 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index e575e74aabf5..0fde115e921f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -69,15 +69,19 @@ #include "md-bitmap.h" #include "md-cluster.h" -/* pers_list is a list of registered personalities protected by pers_lock. */ -static LIST_HEAD(pers_list); -static DEFINE_SPINLOCK(pers_lock); +static const char *action_name[NR_SYNC_ACTIONS] = { + [ACTION_RESYNC] = "resync", + [ACTION_RECOVER] = "recover", + [ACTION_CHECK] = "check", + [ACTION_REPAIR] = "repair", + [ACTION_RESHAPE] = "reshape", + [ACTION_FROZEN] = "frozen", + [ACTION_IDLE] = "idle", +}; -static const struct kobj_type md_ktype; +static DEFINE_XARRAY(md_submodule); -struct md_cluster_operations *md_cluster_ops; -EXPORT_SYMBOL(md_cluster_ops); -static struct module *md_cluster_mod; +static const struct kobj_type md_ktype; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; @@ -107,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* Default safemode delay: 200 msec */ #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' - * is 1000 KB/sec, so the extra system load does not show up that much. - * Increase it if you want to have more _guaranteed_ speed. Note that - * the RAID driver will use the maximum available bandwidth if the IO - * subsystem is idle. There is also an 'absolute maximum' reconstruction - * speed limit - in case reconstruction slows down your system despite - * idle IO detection. + * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit' + * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load + * does not show up that much. Increase it if you want to have more guaranteed + * speed. Note that the RAID driver will use the maximum bandwidth + * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle. + * + * Background sync IO speed control: + * + * - below speed min: + * no limit; + * - above speed min and below speed max: + * a) if mddev is idle, then no limit; + * b) if mddev is busy handling normal IO, then limit inflight sync IO + * to sync_io_depth; + * - above speed max: + * sync IO can't be issued; * - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. - * or /sys/block/mdX/md/sync_speed_{min,max} + * Following configurations can be changed via /proc/sys/dev/raid/ for system + * or /sys/block/mdX/md/ for one array. */ - static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; -static inline int speed_min(struct mddev *mddev) +static int sysctl_sync_io_depth = 32; + +static int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } -static inline int speed_max(struct mddev *mddev) +static int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? mddev->sync_speed_max : sysctl_speed_limit_max; } +static int sync_io_depth(struct mddev *mddev) +{ + return mddev->sync_io_depth ? + mddev->sync_io_depth : sysctl_sync_io_depth; +} + static void rdev_uninit_serial(struct md_rdev *rdev) { if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) @@ -284,19 +304,26 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) static struct ctl_table_header *raid_table_header; -static struct ctl_table raid_table[] = { +static const struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, + .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_io_depth", + .data = &sysctl_sync_io_depth, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_dointvec, }, }; @@ -479,7 +506,6 @@ int mddev_suspend(struct mddev *mddev, bool interruptible) */ WRITE_ONCE(mddev->suspended, mddev->suspended + 1); - del_timer_sync(&mddev->safemode_timer); /* restrict memory reclaim I/O during raid array is suspend */ mddev->noio_flag = memalloc_noio_save(); @@ -538,140 +564,57 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n } /* - * Generic flush handling for md + * The only difference from bio_chain_endio() is that the current + * bi_status of bio does not affect the bi_status of parent. */ - static void md_end_flush(struct bio *bio) { - struct md_rdev *rdev = bio->bi_private; - struct mddev *mddev = rdev->mddev; - - bio_put(bio); - - rdev_dec_pending(rdev, mddev); + struct bio *parent = bio->bi_private; - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pair is percpu_ref_get() from md_flush_request() */ - percpu_ref_put(&mddev->active_io); + /* + * If any flush io error before the power failure, + * disk data may be lost. + */ + if (bio->bi_status) + pr_err("md: %pg flush io error %d\n", bio->bi_bdev, + blk_status_to_errno(bio->bi_status)); - /* The pre-request flush has finished */ - queue_work(md_wq, &mddev->flush_work); - } + bio_put(bio); + bio_endio(parent); } -static void md_submit_flush_data(struct work_struct *ws); - -static void submit_flushes(struct work_struct *ws) +bool md_flush_request(struct mddev *mddev, struct bio *bio) { - struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct md_rdev *rdev; - - mddev->start_flush = ktime_get_boottime(); - INIT_WORK(&mddev->flush_work, md_submit_flush_data); - atomic_set(&mddev->flush_pending, 1); - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Faulty, &rdev->flags)) { - struct bio *bi; - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - bi = bio_alloc_bioset(rdev->bdev, 0, - REQ_OP_WRITE | REQ_PREFLUSH, - GFP_NOIO, &mddev->bio_set); - bi->bi_end_io = md_end_flush; - bi->bi_private = rdev; - atomic_inc(&mddev->flush_pending); - submit_bio(bi); - rcu_read_lock(); - } - rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pair is percpu_ref_get() from md_flush_request() */ - percpu_ref_put(&mddev->active_io); - - queue_work(md_wq, &mddev->flush_work); - } -} - -static void md_submit_flush_data(struct work_struct *ws) -{ - struct mddev *mddev = container_of(ws, struct mddev, flush_work); - struct bio *bio = mddev->flush_bio; + struct bio *new; /* - * must reset flush_bio before calling into md_handle_request to avoid a - * deadlock, because other bios passed md_handle_request suspend check - * could wait for this and below md_handle_request could wait for those - * bios because of suspend check + * md_flush_reqeust() should be called under md_handle_request() and + * 'active_io' is already grabbed. Hence it's safe to get rdev directly + * without rcu protection. */ - spin_lock_irq(&mddev->lock); - mddev->prev_flush_start = mddev->start_flush; - mddev->flush_bio = NULL; - spin_unlock_irq(&mddev->lock); - wake_up(&mddev->sb_wait); + WARN_ON(percpu_ref_is_zero(&mddev->active_io)); - if (bio->bi_iter.bi_size == 0) { - /* an empty barrier - all done */ - bio_endio(bio); - } else { - bio->bi_opf &= ~REQ_PREFLUSH; - md_handle_request(mddev, bio); - } -} + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; -/* - * Manages consolidation of flushes and submitting any flushes needed for - * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is - * being finished in another context. Returns false if the flushing is - * complete but still needs the I/O portion of the bio to be processed. - */ -bool md_flush_request(struct mddev *mddev, struct bio *bio) -{ - ktime_t req_start = ktime_get_boottime(); - spin_lock_irq(&mddev->lock); - /* flush requests wait until ongoing flush completes, - * hence coalescing all the pending requests. - */ - wait_event_lock_irq(mddev->sb_wait, - !mddev->flush_bio || - ktime_before(req_start, mddev->prev_flush_start), - mddev->lock); - /* new request after previous flush is completed */ - if (ktime_after(req_start, mddev->prev_flush_start)) { - WARN_ON(mddev->flush_bio); - /* - * Grab a reference to make sure mddev_suspend() will wait for - * this flush to be done. - * - * md_flush_reqeust() is called under md_handle_request() and - * 'active_io' is already grabbed, hence percpu_ref_is_zero() - * won't pass, percpu_ref_tryget_live() can't be used because - * percpu_ref_kill() can be called by mddev_suspend() - * concurrently. - */ - WARN_ON(percpu_ref_is_zero(&mddev->active_io)); - percpu_ref_get(&mddev->active_io); - mddev->flush_bio = bio; - bio = NULL; + new = bio_alloc_bioset(rdev->bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, + &mddev->bio_set); + new->bi_private = bio; + new->bi_end_io = md_end_flush; + bio_inc_remaining(bio); + submit_bio(new); } - spin_unlock_irq(&mddev->lock); - if (!bio) { - INIT_WORK(&mddev->flush_work, submit_flushes); - queue_work(md_wq, &mddev->flush_work); - } else { - /* flush was performed for some other bio while we waited. */ - if (bio->bi_iter.bi_size == 0) - /* an empty barrier - all done */ - bio_endio(bio); - else { - bio->bi_opf &= ~REQ_PREFLUSH; - return false; - } + if (bio_sectors(bio) == 0) { + bio_endio(bio); + return true; } - return true; + + bio->bi_opf &= ~REQ_PREFLUSH; + return false; } EXPORT_SYMBOL(md_flush_request); @@ -703,6 +646,12 @@ static void __mddev_put(struct mddev *mddev) queue_work(md_misc_wq, &mddev->del_work); } +static void mddev_put_locked(struct mddev *mddev) +{ + if (atomic_dec_and_test(&mddev->active)) + __mddev_put(mddev); +} + void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) @@ -742,7 +691,6 @@ int mddev_init(struct mddev *mddev) mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); - mutex_init(&mddev->sync_mutex); mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); @@ -753,15 +701,15 @@ int mddev_init(struct mddev *mddev) atomic_set(&mddev->openers, 0); atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); - atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; mddev->reshape_backwards = 0; - mddev->last_sync_action = "none"; + mddev->last_sync_action = ACTION_IDLE; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; + mddev_set_bitmap_ops(mddev); INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); @@ -963,16 +911,40 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) } EXPORT_SYMBOL_GPL(md_find_rdev_rcu); -static struct md_personality *find_pers(int level, char *clevel) +static struct md_personality *get_pers(int level, char *clevel) { - struct md_personality *pers; - list_for_each_entry(pers, &pers_list, list) { - if (level != LEVEL_NONE && pers->level == level) - return pers; - if (strcmp(pers->name, clevel)==0) - return pers; + struct md_personality *ret = NULL; + struct md_submodule_head *head; + unsigned long i; + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_PERSONALITY) + continue; + if ((level != LEVEL_NONE && head->id == level) || + !strcmp(head->name, clevel)) { + if (try_module_get(head->owner)) + ret = (void *)head; + break; + } } - return NULL; + xa_unlock(&md_submodule); + + if (!ret) { + if (level != LEVEL_NONE) + pr_warn("md: personality for level %d is not loaded!\n", + level); + else + pr_warn("md: personality for level %s is not loaded!\n", + clevel); + } + + return ret; +} + +static void put_pers(struct md_personality *pers) +{ + module_put(pers->head.owner); } /* return the offset of the super block in 512byte sectors */ @@ -1255,7 +1227,7 @@ int md_check_no_bitmap(struct mddev *mddev) if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; pr_warn("%s: bitmaps are not supported for %s\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); return 1; } EXPORT_SYMBOL(md_check_no_bitmap); @@ -1362,6 +1334,18 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor return ret; } +static u64 md_bitmap_events_cleared(struct mddev *mddev) +{ + struct md_bitmap_stats stats; + int err; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return 0; + + return stats.events_cleared; +} + /* * validate_super for 0.90.0 * note: we are not using "freshest" for 0.9 superblock @@ -1454,7 +1438,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. */ - if (ev1 < mddev->bitmap->events_cleared) + if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); @@ -1811,7 +1795,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (badblocks_set(&rdev->badblocks, sector, count, 1)) + if (!badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -1981,7 +1965,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc /* If adding to array with a bitmap, then we can accept an * older device, but not too old. */ - if (ev1 < mddev->bitmap->events_cleared) + if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); @@ -2313,7 +2297,6 @@ super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { /* All necessary checks on new >= old have been done */ - struct bitmap *bitmap; if (new_offset >= rdev->data_offset) return 1; @@ -2330,11 +2313,18 @@ super_1_allow_new_offset(struct md_rdev *rdev, */ if (rdev->sb_start + (32+4)*2 > new_offset) return 0; - bitmap = rdev->mddev->bitmap; - if (bitmap && !rdev->mddev->bitmap_info.file && - rdev->sb_start + rdev->mddev->bitmap_info.offset + - bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) - return 0; + + if (!rdev->mddev->bitmap_info.file) { + struct mddev *mddev = rdev->mddev; + struct md_bitmap_stats stats; + int err; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (!err && rdev->sb_start + mddev->bitmap_info.offset + + stats.file_pages * (PAGE_SIZE >> 9) > new_offset) + return 0; + } + if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) return 0; @@ -2410,81 +2400,16 @@ static LIST_HEAD(pending_raid_disks); */ int md_integrity_register(struct mddev *mddev) { - struct md_rdev *rdev, *reference = NULL; - if (list_empty(&mddev->disks)) return 0; /* nothing to do */ - if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk)) - return 0; /* shouldn't register, or already is */ - rdev_for_each(rdev, mddev) { - /* skip spares and non-functional disks */ - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->raid_disk < 0) - continue; - if (!reference) { - /* Use the first rdev as the reference */ - reference = rdev; - continue; - } - /* does this rdev's profile match the reference profile? */ - if (blk_integrity_compare(reference->bdev->bd_disk, - rdev->bdev->bd_disk) < 0) - return -EINVAL; - } - if (!reference || !bdev_get_integrity(reference->bdev)) - return 0; - /* - * All component devices are integrity capable and have matching - * profiles, register the common profile for the md device. - */ - blk_integrity_register(mddev->gendisk, - bdev_get_integrity(reference->bdev)); + if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) + return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || - (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { - /* - * No need to handle the failure of bioset_integrity_create, - * because the function is called by md_run() -> pers->run(), - * md_run calls bioset_exit -> bioset_integrity_free in case - * of failure case. - */ - pr_err("md: failed to create integrity pool for %s\n", - mdname(mddev)); - return -EINVAL; - } return 0; } EXPORT_SYMBOL(md_integrity_register); -/* - * Attempt to add an rdev, but only if it is consistent with the current - * integrity profile - */ -int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) -{ - struct blk_integrity *bi_mddev; - - if (mddev_is_dm(mddev)) - return 0; - - bi_mddev = blk_get_integrity(mddev->gendisk); - - if (!bi_mddev) /* nothing to do */ - return 0; - - if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { - pr_err("%s: incompatible integrity profile for %pg\n", - mdname(mddev), rdev->bdev); - return -ENXIO; - } - - return 0; -} -EXPORT_SYMBOL(md_integrity_add_rdev); - static bool rdev_read_only(struct md_rdev *rdev) { return bdev_read_only(rdev->bdev) || @@ -2748,11 +2673,11 @@ repeat: force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) nospares = 1; - ret = md_cluster_ops->metadata_update_start(mddev); + ret = mddev->cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) - md_cluster_ops->metadata_update_cancel(mddev); + mddev->cluster_ops->metadata_update_cancel(mddev); bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)); @@ -2862,7 +2787,7 @@ repeat: mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: - md_bitmap_update_sb(mddev->bitmap); + mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ @@ -2892,7 +2817,7 @@ rewrite: /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_finish(mddev); + mddev->cluster_ops->metadata_update_finish(mddev); if (mddev->in_sync != sync_req || !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), @@ -3051,7 +2976,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) else { err = 0; if (mddev_is_clustered(mddev)) - err = md_cluster_ops->remove_disk(mddev, rdev); + err = mddev->cluster_ops->remove_disk(mddev, rdev); if (err == 0) { md_kick_rdev_from_array(rdev); @@ -3161,7 +3086,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * by this node eventually */ if (!mddev_is_clustered(rdev->mddev) || - (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { + (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { clear_bit(Faulty, &rdev->flags); err = add_bound_rdev(rdev); } @@ -3969,7 +3894,7 @@ level_show(struct mddev *mddev, char *page) spin_lock(&mddev->lock); p = mddev->pers; if (p) - ret = sprintf(page, "%s\n", p->name); + ret = sprintf(page, "%s\n", p->head.name); else if (mddev->clevel[0]) ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) @@ -4026,7 +3951,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) rv = -EINVAL; if (!mddev->pers->quiesce) { pr_warn("md: %s: %s does not support online personality change\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); goto out_unlock; } @@ -4040,24 +3965,20 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (request_module("md-%s", clevel) != 0) request_module("md-level-%s", clevel); - spin_lock(&pers_lock); - pers = find_pers(level, clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - pr_warn("md: personality %s not loaded\n", clevel); + pers = get_pers(level, clevel); + if (!pers) { rv = -EINVAL; goto out_unlock; } - spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! */ - module_put(pers->owner); + put_pers(pers); rv = len; goto out_unlock; } if (!pers->takeover) { - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); rv = -EINVAL; @@ -4078,7 +3999,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; mddev->reshape_backwards = 0; - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s would not accept array\n", mdname(mddev), clevel); rv = PTR_ERR(priv); @@ -4093,7 +4014,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) oldpriv = mddev->private; mddev->pers = pers; mddev->private = priv; - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; @@ -4135,7 +4056,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->to_remove = &md_redundancy_group; } - module_put(oldpers->owner); + put_pers(oldpers); rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) @@ -4166,7 +4087,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) * it must always be in_sync */ mddev->in_sync = 1; - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); } pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -4184,6 +4105,34 @@ static struct md_sysfs_entry md_level = __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); static ssize_t +new_level_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->new_level); +} + +static ssize_t +new_level_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int n; + int err; + + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; + err = mddev_lock(mddev); + if (err) + return err; + + mddev->new_level = n; + md_update_sb(mddev, 1); + + mddev_unlock(mddev); + return len; +} +static struct md_sysfs_entry md_new_level = +__ATTR(new_level, 0664, new_level_show, new_level_store); + +static ssize_t layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ @@ -4722,17 +4671,23 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ while (*buf) { chunk = end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; + if (buf == end) + break; + if (*end == '-') { /* range */ buf = end + 1; end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; + if (buf == end) + break; } - if (*end && !isspace(*end)) break; - md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); + + if (*end && !isspace(*end)) + break; + + mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); buf = skip_spaces(end); } - md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ + mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ out: mddev_unlock(mddev); return len; @@ -4867,30 +4822,81 @@ out_unlock: static struct md_sysfs_entry md_metadata = __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); -static ssize_t -action_show(struct mddev *mddev, char *page) +enum sync_action md_sync_action(struct mddev *mddev) { - char *type = "idle"; unsigned long recovery = mddev->recovery; + + /* + * frozen has the highest priority, means running sync_thread will be + * stopped immediately, and no new sync_thread can start. + */ if (test_bit(MD_RECOVERY_FROZEN, &recovery)) - type = "frozen"; - else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) - type = "reshape"; - else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) - type = "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &recovery)) - type = "check"; - else - type = "repair"; - } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) - type = "recover"; - else if (mddev->reshape_position != MaxSector) - type = "reshape"; + return ACTION_FROZEN; + + /* + * read-only array can't register sync_thread, and it can only + * add/remove spares. + */ + if (!md_is_rdwr(mddev)) + return ACTION_IDLE; + + /* + * idle means no sync_thread is running, and no new sync_thread is + * requested. + */ + if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && + !test_bit(MD_RECOVERY_NEEDED, &recovery)) + return ACTION_IDLE; + + if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || + mddev->reshape_position != MaxSector) + return ACTION_RESHAPE; + + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + return ACTION_RECOVER; + + if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + /* + * MD_RECOVERY_CHECK must be paired with + * MD_RECOVERY_REQUESTED. + */ + if (test_bit(MD_RECOVERY_CHECK, &recovery)) + return ACTION_CHECK; + if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) + return ACTION_REPAIR; + return ACTION_RESYNC; + } + + /* + * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no + * sync_action is specified. + */ + return ACTION_IDLE; +} + +enum sync_action md_sync_action_by_name(const char *page) +{ + enum sync_action action; + + for (action = 0; action < NR_SYNC_ACTIONS; ++action) { + if (cmd_match(page, action_name[action])) + return action; } - return sprintf(page, "%s\n", type); + + return NR_SYNC_ACTIONS; +} + +const char *md_sync_action_name(enum sync_action action) +{ + return action_name[action]; +} + +static ssize_t +action_show(struct mddev *mddev, char *page) +{ + enum sync_action action = md_sync_action(mddev); + + return sprintf(page, "%s\n", md_sync_action_name(action)); } /** @@ -4899,15 +4905,10 @@ action_show(struct mddev *mddev, char *page) * @locked: if set, reconfig_mutex will still be held after this function * return; if not set, reconfig_mutex will be released after this * function return. - * @check_seq: if set, only wait for curent running sync_thread to stop, noted - * that new sync_thread can still start. */ -static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) +static void stop_sync_thread(struct mddev *mddev, bool locked) { - int sync_seq; - - if (check_seq) - sync_seq = atomic_read(&mddev->sync_seq); + int sync_seq = atomic_read(&mddev->sync_seq); if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { if (!locked) @@ -4928,7 +4929,8 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (check_seq && sync_seq != atomic_read(&mddev->sync_seq))); + (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && + sync_seq != atomic_read(&mddev->sync_seq))); if (locked) mddev_lock_nointr(mddev); @@ -4939,7 +4941,7 @@ void md_idle_sync_thread(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, true); + stop_sync_thread(mddev, true); } EXPORT_SYMBOL_GPL(md_idle_sync_thread); @@ -4948,7 +4950,7 @@ void md_frozen_sync_thread(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); } EXPORT_SYMBOL_GPL(md_frozen_sync_thread); @@ -4963,100 +4965,127 @@ void md_unfrozen_sync_thread(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); -static void idle_sync_thread(struct mddev *mddev) +static int mddev_start_reshape(struct mddev *mddev) { - mutex_lock(&mddev->sync_mutex); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - - if (mddev_lock(mddev)) { - mutex_unlock(&mddev->sync_mutex); - return; - } - - stop_sync_thread(mddev, false, true); - mutex_unlock(&mddev->sync_mutex); -} + int ret; -static void frozen_sync_thread(struct mddev *mddev) -{ - mutex_lock(&mddev->sync_mutex); - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (mddev->pers->start_reshape == NULL) + return -EINVAL; - if (mddev_lock(mddev)) { - mutex_unlock(&mddev->sync_mutex); - return; + if (mddev->reshape_position == MaxSector || + mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev)) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = mddev->pers->start_reshape(mddev); + if (ret) + return ret; + } else { + /* + * If reshape is still in progress, and md_check_recovery() can + * continue to reshape, don't restart reshape because data can + * be corrupted for raid456. + */ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - stop_sync_thread(mddev, false, false); - mutex_unlock(&mddev->sync_mutex); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + return 0; } static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { + int ret; + enum sync_action action; + if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; +retry: + if (work_busy(&mddev->sync_work)) + flush_work(&mddev->sync_work); - if (cmd_match(page, "idle")) - idle_sync_thread(mddev); - else if (cmd_match(page, "frozen")) - frozen_sync_thread(mddev); - else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; - else if (cmd_match(page, "resync")) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else if (cmd_match(page, "recover")) { - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (cmd_match(page, "reshape")) { - int err; - if (mddev->pers->start_reshape == NULL) - return -EINVAL; - err = mddev_lock(mddev); - if (!err) { - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - err = -EBUSY; - } else if (mddev->reshape_position == MaxSector || - mddev->pers->check_reshape == NULL || - mddev->pers->check_reshape(mddev)) { - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - err = mddev->pers->start_reshape(mddev); - } else { - /* - * If reshape is still in progress, and - * md_check_recovery() can continue to reshape, - * don't restart reshape because data can be - * corrupted for raid456. - */ - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } - mddev_unlock(mddev); + ret = mddev_lock(mddev); + if (ret) + return ret; + + if (work_busy(&mddev->sync_work)) { + mddev_unlock(mddev); + goto retry; + } + + action = md_sync_action_by_name(page); + + /* TODO: mdadm rely on "idle" to start sync_thread. */ + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + switch (action) { + case ACTION_FROZEN: + md_frozen_sync_thread(mddev); + ret = len; + goto out; + case ACTION_IDLE: + md_idle_sync_thread(mddev); + break; + case ACTION_RESHAPE: + case ACTION_RECOVER: + case ACTION_CHECK: + case ACTION_REPAIR: + case ACTION_RESYNC: + ret = -EBUSY; + goto out; + default: + ret = -EINVAL; + goto out; } - if (err) - return err; - sysfs_notify_dirent_safe(mddev->sysfs_degraded); } else { - if (cmd_match(page, "check")) + switch (action) { + case ACTION_FROZEN: + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = len; + goto out; + case ACTION_RESHAPE: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = mddev_start_reshape(mddev); + if (ret) + goto out; + break; + case ACTION_RECOVER: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + break; + case ACTION_CHECK: set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (!cmd_match(page, "repair")) - return -EINVAL; - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + fallthrough; + case ACTION_REPAIR: + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + fallthrough; + case ACTION_RESYNC: + case ACTION_IDLE: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + break; + default: + ret = -EINVAL; + goto out; + } } + if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ - flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); - return len; + ret = len; + +out: + mddev_unlock(mddev); + return ret; } static struct md_sysfs_entry md_scan_mode = @@ -5065,7 +5094,8 @@ __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); static ssize_t last_sync_action_show(struct mddev *mddev, char *page) { - return sprintf(page, "%s\n", mddev->last_sync_action); + return sprintf(page, "%s\n", + md_sync_action_name(mddev->last_sync_action)); } static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); @@ -5084,7 +5114,7 @@ static ssize_t sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), - mddev->sync_speed_min ? "local": "system"); + mddev->sync_speed_min ? "local" : "system"); } static ssize_t @@ -5093,7 +5123,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len) unsigned int min; int rv; - if (strncmp(buf, "system", 6)==0) { + if (strncmp(buf, "system", 6) == 0) { min = 0; } else { rv = kstrtouint(buf, 10, &min); @@ -5113,7 +5143,7 @@ static ssize_t sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), - mddev->sync_speed_max ? "local": "system"); + mddev->sync_speed_max ? "local" : "system"); } static ssize_t @@ -5122,7 +5152,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len) unsigned int max; int rv; - if (strncmp(buf, "system", 6)==0) { + if (strncmp(buf, "system", 6) == 0) { max = 0; } else { rv = kstrtouint(buf, 10, &max); @@ -5139,6 +5169,35 @@ static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t +sync_io_depth_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), + mddev->sync_io_depth ? "local" : "system"); +} + +static ssize_t +sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int max; + int rv; + + if (strncmp(buf, "system", 6) == 0) { + max = 0; + } else { + rv = kstrtouint(buf, 10, &max); + if (rv < 0) + return rv; + if (max == 0) + return -EINVAL; + } + mddev->sync_io_depth = max; + return len; +} + +static struct md_sysfs_entry md_sync_io_depth = +__ATTR_RW(sync_io_depth); + +static ssize_t degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); @@ -5584,7 +5643,7 @@ __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, static ssize_t serialize_policy_show(struct mddev *mddev, char *page) { - if (mddev->pers == NULL || (mddev->pers->level != 1)) + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) return sprintf(page, "n/a\n"); else return sprintf(page, "%d\n", mddev->serialize_policy); @@ -5610,7 +5669,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) err = mddev_suspend_and_lock(mddev); if (err) return err; - if (mddev->pers == NULL || (mddev->pers->level != 1)) { + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { pr_err("md: serialize_policy is only effective for raid1\n"); err = -EINVAL; goto unlock; @@ -5633,6 +5692,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, static struct attribute *md_default_attrs[] = { &md_level.attr, + &md_new_level.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, @@ -5663,6 +5723,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, + &md_sync_io_depth.attr, &md_sync_speed.attr, &md_sync_force_parallel.attr, &md_sync_completed.attr, @@ -5755,14 +5816,20 @@ static const struct kobj_type md_ktype = { int mdp_major = 0; /* stack the limit for all rdevs into lim */ -void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim) +int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, + unsigned int flags) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) { queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); + if ((flags & MDDEV_STACK_INTEGRITY) && + !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) + return -EINVAL; } + + return 0; } EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); @@ -5777,6 +5844,14 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) lim = queue_limits_start_update(mddev->gendisk->queue); queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); + + if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { + pr_err("%s: incompatible integrity profile for %pg\n", + mdname(mddev), rdev->bdev); + queue_limits_cancel_update(mddev->gendisk->queue); + return -ENXIO; + } + return queue_limits_commit_update(mddev->gendisk->queue, &lim); } EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); @@ -5806,6 +5881,14 @@ static void mddev_delayed_delete(struct work_struct *ws) kobject_put(&mddev->kobj); } +void md_init_stacking_limits(struct queue_limits *lim) +{ + blk_set_stacking_limits(lim); + lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | + BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; +} +EXPORT_SYMBOL_GPL(md_init_stacking_limits); + struct mddev *md_alloc(dev_t dev, char *name) { /* @@ -5823,7 +5906,7 @@ struct mddev *md_alloc(dev_t dev, char *name) int partitioned; int shift; int unit; - int error ; + int error; /* * Wait for any previous instance of this device to be completely @@ -5881,7 +5964,6 @@ struct mddev *md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; - blk_queue_write_cache(disk->queue, true, true); disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; error = add_disk(disk); @@ -6074,30 +6156,21 @@ int md_run(struct mddev *mddev) goto exit_sync_set; } - spin_lock(&pers_lock); - pers = find_pers(mddev->level, mddev->clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - if (mddev->level != LEVEL_NONE) - pr_warn("md: personality for level %d is not loaded!\n", - mddev->level); - else - pr_warn("md: personality for level %s is not loaded!\n", - mddev->clevel); + pers = get_pers(mddev->level, mddev->clevel); + if (!pers) { err = -EINVAL; goto abort; } - spin_unlock(&pers_lock); - if (mddev->level != pers->level) { - mddev->level = pers->level; - mddev->new_level = pers->level; + if (mddev->level != pers->head.id) { + mddev->level = pers->head.id; + mddev->new_level = pers->head.id; } - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ - module_put(pers->owner); + put_pers(pers); err = -EINVAL; goto abort; } @@ -6152,16 +6225,10 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - struct bitmap *bitmap; - - bitmap = md_bitmap_create(mddev, -1); - if (IS_ERR(bitmap)) { - err = PTR_ERR(bitmap); + err = mddev->bitmap_ops->create(mddev, -1); + if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); - } else - mddev->bitmap = bitmap; - } if (err) goto bitmap_abort; @@ -6185,28 +6252,6 @@ int md_run(struct mddev *mddev) } } - if (!mddev_is_dm(mddev)) { - struct request_queue *q = mddev->gendisk->queue; - bool nonrot = true; - - rdev_for_each(rdev, mddev) { - if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { - nonrot = false; - break; - } - } - if (mddev->degraded) - nonrot = false; - if (nonrot) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q); - - /* Set the NOWAIT flags if all underlying devices support it */ - if (nowait) - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); - } if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) @@ -6252,8 +6297,8 @@ bitmap_abort: if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; - module_put(pers->owner); - md_bitmap_destroy(mddev); + put_pers(pers); + mddev->bitmap_ops->destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); exit_sync_set: @@ -6272,9 +6317,10 @@ int do_md_run(struct mddev *mddev) err = md_run(mddev); if (err) goto out; - err = md_bitmap_load(mddev); + + err = mddev->bitmap_ops->load(mddev); if (err) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); goto out; } @@ -6412,13 +6458,14 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - md_bitmap_flush(mddev); + + mddev->bitmap_ops->flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || @@ -6437,7 +6484,7 @@ void md_stop_writes(struct mddev *mddev) { mddev_lock_nointr(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); __md_stop_writes(mddev); mddev_unlock(mddev); } @@ -6445,7 +6492,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - md_bitmap_wait_behind_writes(mddev); + mddev->bitmap_ops->wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -6460,7 +6507,8 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - md_bitmap_destroy(mddev); + + mddev->bitmap_ops->destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; @@ -6470,7 +6518,7 @@ static void __md_stop(struct mddev *mddev) mddev->private = NULL; if (pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; - module_put(pers->owner); + put_pers(pers); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); bioset_exit(&mddev->bio_set); @@ -6505,7 +6553,7 @@ static int md_set_readonly(struct mddev *mddev) set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - stop_sync_thread(mddev, false, false); + stop_sync_thread(mddev, false); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); @@ -6551,7 +6599,7 @@ static int do_md_stop(struct mddev *mddev, int mode) set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); if (mddev->sysfs_active || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { @@ -6986,7 +7034,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) set_bit(Candidate, &rdev->flags); else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk(mddev, rdev); + err = mddev->cluster_ops->add_new_disk(mddev, rdev); if (err) { export_rdev(rdev, mddev); return err; @@ -7003,14 +7051,14 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { if (!err) { - err = md_cluster_ops->new_disk_ack(mddev, - err == 0); + err = mddev->cluster_ops->new_disk_ack( + mddev, err == 0); if (err) md_kick_rdev_from_array(rdev); } } else { if (err) - md_cluster_ops->add_new_disk_cancel(mddev); + mddev->cluster_ops->add_new_disk_cancel(mddev); else err = add_bound_rdev(rdev); } @@ -7090,10 +7138,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) { - if (md_cluster_ops->remove_disk(mddev, rdev)) - goto busy; - } + if (mddev_is_clustered(mddev) && + mddev->cluster_ops->remove_disk(mddev, rdev)) + goto busy; md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -7166,15 +7213,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) if (!mddev->thread) md_update_sb(mddev, 1); /* - * If the new disk does not support REQ_NOWAIT, - * disable on the whole MD. - */ - if (!bdev_nowait(rdev->bdev)) { - pr_info("%s: Disabling nowait because %pg does not support nowait\n", - mdname(mddev), rdev->bdev); - blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue); - } - /* * Kick recovery, maybe this spare has to be added to the * array immediately. */ @@ -7247,22 +7285,19 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - struct bitmap *bitmap; + err = mddev->bitmap_ops->create(mddev, -1); + if (!err) + err = mddev->bitmap_ops->load(mddev); - bitmap = md_bitmap_create(mddev, -1); - if (!IS_ERR(bitmap)) { - mddev->bitmap = bitmap; - err = md_bitmap_load(mddev); - } else - err = PTR_ERR(bitmap); if (err) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); fd = -1; } } else if (fd < 0) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); } } + if (fd < 0) { struct file *f = mddev->bitmap_info.file; if (f) { @@ -7408,7 +7443,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) rv = mddev->pers->resize(mddev, num_sectors); if (!rv) { if (mddev_is_clustered(mddev)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); else if (!mddev_is_dm(mddev)) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); @@ -7456,6 +7491,28 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return rv; } +static int get_cluster_ops(struct mddev *mddev) +{ + xa_lock(&md_submodule); + mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); + if (mddev->cluster_ops && + !try_module_get(mddev->cluster_ops->head.owner)) + mddev->cluster_ops = NULL; + xa_unlock(&md_submodule); + + return mddev->cluster_ops == NULL ? -ENOENT : 0; +} + +static void put_cluster_ops(struct mddev *mddev) +{ + if (!mddev->cluster_ops) + return; + + mddev->cluster_ops->leave(mddev); + module_put(mddev->cluster_ops->head.owner); + mddev->cluster_ops = NULL; +} + /* * update_array_info is used to change the configuration of an * on-line array. @@ -7531,7 +7588,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) goto err; } if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { - struct bitmap *bitmap; /* add the bitmap */ if (mddev->bitmap) { rv = -EEXIST; @@ -7545,39 +7601,38 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - bitmap = md_bitmap_create(mddev, -1); - if (!IS_ERR(bitmap)) { - mddev->bitmap = bitmap; - rv = md_bitmap_load(mddev); - } else - rv = PTR_ERR(bitmap); + rv = mddev->bitmap_ops->create(mddev, -1); + if (!rv) + rv = mddev->bitmap_ops->load(mddev); + if (rv) - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); } else { - /* remove the bitmap */ - if (!mddev->bitmap) { - rv = -ENOENT; + struct md_bitmap_stats stats; + + rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (rv) goto err; - } - if (mddev->bitmap->storage.file) { + + if (stats.file) { rv = -EINVAL; goto err; } + if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ - if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { + if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); rv = -EPERM; - md_cluster_ops->unlock_all_bitmaps(mddev); + mddev->cluster_ops->unlock_all_bitmaps(mddev); goto err; } mddev->bitmap_info.nodes = 0; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); mddev->bitmap_info.offset = 0; } } @@ -7742,12 +7797,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, return get_bitmap_file(mddev, argp); } - if (cmd == HOT_REMOVE_DISK) - /* need to ensure recovery thread has run */ - wait_event_interruptible_timeout(mddev->sb_wait, - !test_bit(MD_RECOVERY_NEEDED, - &mddev->recovery), - msecs_to_jiffies(5000)); if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { /* Need to flush page cache, and ensure no-one else opens * and writes @@ -7864,7 +7913,7 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, case CLUSTERED_DISK_NACK: if (mddev_is_clustered(mddev)) - md_cluster_ops->new_disk_ack(mddev, false); + mddev->cluster_ops->new_disk_ack(mddev, false); else err = -EINVAL; goto unlock; @@ -8087,7 +8136,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread) if (t) { pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); set_bit(THREAD_WAKEUP, &t->flags); - wake_up(&t->wqueue); + if (wq_has_sleeper(&t->wqueue)) + wake_up(&t->wqueue); } rcu_read_unlock(); } @@ -8145,7 +8195,8 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) return; mddev->pers->error_handler(mddev, rdev); - if (mddev->pers->level == 0) + if (mddev->pers->head.id == ID_RAID0 || + mddev->pers->head.id == ID_LINEAR) return; if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) @@ -8183,14 +8234,17 @@ static void status_unused(struct seq_file *seq) static void status_personalities(struct seq_file *seq) { - struct md_personality *pers; + struct md_submodule_head *head; + unsigned long i; seq_puts(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - spin_unlock(&pers_lock); + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) + if (head->type == MD_PERSONALITY) + seq_printf(seq, "[%s] ", head->name); + xa_unlock(&md_submodule); + seq_puts(seq, "\n"); } @@ -8352,6 +8406,33 @@ static void md_seq_stop(struct seq_file *seq, void *v) spin_unlock(&all_mddevs_lock); } +static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) +{ + struct md_bitmap_stats stats; + unsigned long used_pages; + unsigned long chunk_kb; + int err; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return; + + chunk_kb = mddev->bitmap_info.chunksize >> 10; + used_pages = stats.pages - stats.missing_pages; + + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", + used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, + chunk_kb ? "KB" : "B"); + + if (stats.file) { + seq_puts(seq, ", file: "); + seq_file_path(seq, stats.file, " \t\n"); + } + + seq_putc(seq, '\n'); +} + static int md_seq_show(struct seq_file *seq, void *v) { struct mddev *mddev; @@ -8370,16 +8451,25 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; spin_unlock(&all_mddevs_lock); + + /* prevent bitmap to be freed after checking */ + mutex_lock(&mddev->bitmap_info.mutex); + spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { - seq_printf(seq, "%s : %sactive", mdname(mddev), - mddev->pers ? "" : "in"); + seq_printf(seq, "%s : ", mdname(mddev)); if (mddev->pers) { + if (test_bit(MD_BROKEN, &mddev->flags)) + seq_printf(seq, "broken"); + else + seq_printf(seq, "active"); if (mddev->ro == MD_RDONLY) seq_printf(seq, " (read-only)"); if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); - seq_printf(seq, " %s", mddev->pers->name); + seq_printf(seq, " %s", mddev->pers->head.name); + } else { + seq_printf(seq, "inactive"); } sectors = 0; @@ -8435,19 +8525,18 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - md_bitmap_status(seq, mddev->bitmap); + md_bitmap_status(seq, mddev); seq_printf(seq, "\n"); } spin_unlock(&mddev->lock); + mutex_unlock(&mddev->bitmap_info.mutex); spin_lock(&all_mddevs_lock); if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) status_unused(seq); - if (atomic_dec_and_test(&mddev->active)) - __mddev_put(mddev); - + mddev_put_locked(mddev); return 0; } @@ -8498,67 +8587,34 @@ static const struct proc_ops mdstat_proc_ops = { .proc_poll = mdstat_poll, }; -int register_md_personality(struct md_personality *p) +int register_md_submodule(struct md_submodule_head *msh) { - pr_debug("md: %s personality registered for level %d\n", - p->name, p->level); - spin_lock(&pers_lock); - list_add_tail(&p->list, &pers_list); - spin_unlock(&pers_lock); - return 0; + return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL); } -EXPORT_SYMBOL(register_md_personality); +EXPORT_SYMBOL_GPL(register_md_submodule); -int unregister_md_personality(struct md_personality *p) +void unregister_md_submodule(struct md_submodule_head *msh) { - pr_debug("md: %s personality unregistered\n", p->name); - spin_lock(&pers_lock); - list_del_init(&p->list); - spin_unlock(&pers_lock); - return 0; + xa_erase(&md_submodule, msh->id); } -EXPORT_SYMBOL(unregister_md_personality); - -int register_md_cluster_operations(struct md_cluster_operations *ops, - struct module *module) -{ - int ret = 0; - spin_lock(&pers_lock); - if (md_cluster_ops != NULL) - ret = -EALREADY; - else { - md_cluster_ops = ops; - md_cluster_mod = module; - } - spin_unlock(&pers_lock); - return ret; -} -EXPORT_SYMBOL(register_md_cluster_operations); - -int unregister_md_cluster_operations(void) -{ - spin_lock(&pers_lock); - md_cluster_ops = NULL; - spin_unlock(&pers_lock); - return 0; -} -EXPORT_SYMBOL(unregister_md_cluster_operations); +EXPORT_SYMBOL_GPL(unregister_md_submodule); int md_setup_cluster(struct mddev *mddev, int nodes) { - int ret; - if (!md_cluster_ops) + int ret = get_cluster_ops(mddev); + + if (ret) { request_module("md-cluster"); - spin_lock(&pers_lock); + ret = get_cluster_ops(mddev); + } + /* ensure module won't be unloaded */ - if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { + if (ret) { pr_warn("can't find md-cluster module or get its reference.\n"); - spin_unlock(&pers_lock); - return -ENOENT; + return ret; } - spin_unlock(&pers_lock); - ret = md_cluster_ops->join(mddev, nodes); + ret = mddev->cluster_ops->join(mddev, nodes); if (!ret) mddev->safemode_delay = 0; return ret; @@ -8566,52 +8622,58 @@ int md_setup_cluster(struct mddev *mddev, int nodes) void md_cluster_stop(struct mddev *mddev) { - if (!md_cluster_ops) - return; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); } -static int is_mddev_idle(struct mddev *mddev, int init) +static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init) { + unsigned long last_events = rdev->last_events; + + if (!bdev_is_partition(rdev->bdev)) + return true; + + /* + * If rdev is partition, and user doesn't issue IO to the array, the + * array is still not idle if user issues IO to other partitions. + */ + rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0, + sectors) - + part_stat_read_accum(rdev->bdev, sectors); + + return init || rdev->last_events <= last_events; +} + +/* + * mddev is idle if following conditions are matched since last check: + * 1) mddev doesn't have normal IO completed; + * 2) mddev doesn't have inflight normal IO; + * 3) if any member disk is partition, and other partitions don't have IO + * completed; + * + * Noted this checking rely on IO accounting is enabled. + */ +static bool is_mddev_idle(struct mddev *mddev, int init) +{ + unsigned long last_events = mddev->normal_io_events; + struct gendisk *disk; struct md_rdev *rdev; - int idle; - int curr_events; + bool idle = true; + + disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk; + if (!disk) + return true; + + mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); + if (!init && (mddev->normal_io_events > last_events || + bdev_count_inflight(disk->part0))) + idle = false; - idle = 1; rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) { - struct gendisk *disk = rdev->bdev->bd_disk; - curr_events = (int)part_stat_read_accum(disk->part0, sectors) - - atomic_read(&disk->sync_io); - /* sync IO will cause sync_io to increase before the disk_stats - * as sync_io is counted when a request starts, and - * disk_stats is counted when it completes. - * So resync activity will cause curr_events to be smaller than - * when there was no such activity. - * non-sync IO will cause disk_stat to increase without - * increasing sync_io so curr_events will (eventually) - * be larger than it was before. Once it becomes - * substantially larger, the test below will cause - * the array to appear non-idle, and resync will slow - * down. - * If there is a lot of outstanding resync activity when - * we set last_event to curr_events, then all that activity - * completing might cause the array to appear non-idle - * and resync will be slowed down even though there might - * not have been non-resync activity. This will only - * happen once though. 'last_events' will soon reflect - * the state where there is little or no outstanding - * resync requests, and further resync activity will - * always make curr_events less than last_events. - * - */ - if (init || curr_events - rdev->last_events > 64) { - rdev->last_events = curr_events; - idle = 0; - } - } + rdev_for_each_rcu(rdev, mddev) + if (!is_rdev_holder_idle(rdev, init)) + idle = false; rcu_read_unlock(); + return idle; } @@ -8636,17 +8698,16 @@ EXPORT_SYMBOL(md_done_sync); * A return value of 'false' means that the write wasn't recorded * and cannot proceed as the array is being suspend. */ -bool md_write_start(struct mddev *mddev, struct bio *bi) +void md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) - return true; + return; BUG_ON(mddev->ro == MD_RDONLY); if (mddev->ro == MD_AUTO_READ) { /* need to switch to read/write */ - flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -8674,15 +8735,9 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); if (!mddev->has_superblocks) - return true; + return; wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || - is_md_suspended(mddev)); - if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - percpu_ref_put(&mddev->writes_pending); - return false; - } - return true; + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } EXPORT_SYMBOL(md_write_start); @@ -8737,12 +8792,32 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); +static void md_bitmap_start(struct mddev *mddev, + struct md_io_clone *md_io_clone) +{ + if (mddev->pers->bitmap_sector) + mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, + &md_io_clone->sectors); + + mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset, + md_io_clone->sectors); +} + +static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) +{ + mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset, + md_io_clone->sectors); +} + static void md_end_clone_io(struct bio *bio) { struct md_io_clone *md_io_clone = bio->bi_private; struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; + if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) + md_bitmap_end(mddev, md_io_clone); + if (bio->bi_status && !orig_bio->bi_status) orig_bio->bi_status = bio->bi_status; @@ -8767,6 +8842,12 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio) if (blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone->start_time = bio_start_io_acct(*bio); + if (bio_data_dir(*bio) == WRITE && mddev->bitmap) { + md_io_clone->offset = (*bio)->bi_iter.bi_sector; + md_io_clone->sectors = bio_sectors(*bio); + md_bitmap_start(mddev, md_io_clone); + } + clone->bi_end_io = md_end_clone_io; clone->bi_private = md_io_clone; *bio = clone; @@ -8785,6 +8866,9 @@ void md_free_cloned_bio(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; + if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) + md_bitmap_end(mddev, md_io_clone); + if (bio->bi_status && !orig_bio->bi_status) orig_bio->bi_status = bio->bi_status; @@ -8830,6 +8914,94 @@ void md_allow_write(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_allow_write); +static sector_t md_sync_max_sectors(struct mddev *mddev, + enum sync_action action) +{ + switch (action) { + case ACTION_RESYNC: + case ACTION_CHECK: + case ACTION_REPAIR: + atomic64_set(&mddev->resync_mismatches, 0); + fallthrough; + case ACTION_RESHAPE: + return mddev->resync_max_sectors; + case ACTION_RECOVER: + return mddev->dev_sectors; + default: + return 0; + } +} + +static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) +{ + sector_t start = 0; + struct md_rdev *rdev; + + switch (action) { + case ACTION_CHECK: + case ACTION_REPAIR: + return mddev->resync_min; + case ACTION_RESYNC: + if (!mddev->bitmap) + return mddev->recovery_cp; + return 0; + case ACTION_RESHAPE: + /* + * If the original node aborts reshaping then we continue the + * reshaping, so set again to avoid restart reshape from the + * first beginning + */ + if (mddev_is_clustered(mddev) && + mddev->reshape_position != MaxSector) + return mddev->reshape_position; + return 0; + case ACTION_RECOVER: + start = MaxSector; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < start) + start = rdev->recovery_offset; + rcu_read_unlock(); + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. + */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return start; + default: + return MaxSector; + } +} + +static bool sync_io_within_limit(struct mddev *mddev) +{ + int io_sectors; + + /* + * For raid456, sync IO is stripe(4k) per IO, for other levels, it's + * RESYNC_PAGES(64k) per IO. + */ + if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6) + io_sectors = 8; + else + io_sectors = 128; + + return atomic_read(&mddev->recovery_active) < + io_sectors * sync_io_depth(mddev); +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) @@ -8846,7 +9018,8 @@ void md_do_sync(struct md_thread *thread) sector_t last_check; int skipped = 0; struct md_rdev *rdev; - char *desc, *action = NULL; + enum sync_action action; + const char *desc; struct blk_plug plug; int ret; @@ -8864,7 +9037,7 @@ void md_do_sync(struct md_thread *thread) } if (mddev_is_clustered(mddev)) { - ret = md_cluster_ops->resync_start(mddev); + ret = mddev->cluster_ops->resync_start(mddev); if (ret) goto skip; @@ -8877,21 +9050,9 @@ void md_do_sync(struct md_thread *thread) goto skip; } - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { - desc = "data-check"; - action = "check"; - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - desc = "requested-resync"; - action = "repair"; - } else - desc = "resync"; - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - desc = "reshape"; - else - desc = "recovery"; - - mddev->last_sync_action = action ?: desc; + action = md_sync_action(mddev); + desc = md_sync_action_name(action); + mddev->last_sync_action = action; /* * Before starting a resync we must have set curr_resync to @@ -8902,7 +9063,8 @@ void md_do_sync(struct md_thread *thread) * This will mean we have to start checking from the beginning again. * */ - + if (mddev_is_clustered(mddev)) + mddev->cluster_ops->resync_start_notify(mddev); do { int mddev2_minor = -1; mddev->curr_resync = MD_RESYNC_DELAYED; @@ -8959,56 +9121,8 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&all_mddevs_lock); } while (mddev->curr_resync < MD_RESYNC_DELAYED); - j = 0; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - /* resync follows the size requested by the personality, - * which defaults to physical size, but can be virtual size - */ - max_sectors = mddev->resync_max_sectors; - atomic64_set(&mddev->resync_mismatches, 0); - /* we don't use the checkpoint if there's a bitmap */ - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - j = mddev->resync_min; - else if (!mddev->bitmap) - j = mddev->recovery_cp; - - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { - max_sectors = mddev->resync_max_sectors; - /* - * If the original node aborts reshaping then we continue the - * reshaping, so set j again to avoid restart reshape from the - * first beginning - */ - if (mddev_is_clustered(mddev) && - mddev->reshape_position != MaxSector) - j = mddev->reshape_position; - } else { - /* recovery follows the physical size of devices */ - max_sectors = mddev->dev_sectors; - j = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < j) - j = rdev->recovery_offset; - rcu_read_unlock(); - - /* If there is a bitmap, we need to make sure all - * writes that started before we added a spare - * complete before we start doing a recovery. - * Otherwise the write might complete and (via - * bitmap_endwrite) set a bit in the bitmap after the - * recovery has checked that bit and skipped that - * region. - */ - if (mddev->bitmap) { - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } - } + max_sectors = md_sync_max_sectors(mddev, action); + j = md_sync_position(mddev, action); pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); @@ -9090,7 +9204,8 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; - sectors = mddev->pers->sync_request(mddev, j, &skipped); + sectors = mddev->pers->sync_request(mddev, j, max_sectors, + &skipped); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); break; @@ -9155,7 +9270,8 @@ void md_do_sync(struct md_thread *thread) msleep(500); goto repeat; } - if (!is_mddev_idle(mddev, 0)) { + if (!sync_io_within_limit(mddev) && + !is_mddev_idle(mddev, 0)) { /* * Give other IO more of a chance. * The faster the devices, the less we wait. @@ -9180,7 +9296,7 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } - mddev->pers->sync_request(mddev, max_sectors, &skipped); + mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > MD_RESYNC_ACTIVE) { @@ -9404,6 +9520,13 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) return true; } + /* Check if resync is in progress. */ + if (mddev->recovery_cp < MaxSector) { + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + /* * Remove any failed drives, then add spares if possible. Spares are * also removed and re-added, to allow the personality to fail the @@ -9420,13 +9543,6 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) return true; } - /* Check if recovery is in progress. */ - if (mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - return true; - } - /* Delay to choose resync/check/repair in md_do_sync(). */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) return true; @@ -9476,7 +9592,7 @@ static void md_start_sync(struct work_struct *ws) * stored on all devices. So make sure all bitmap pages get written. */ if (spares) - md_bitmap_write_all(mddev->bitmap); + mddev->bitmap_ops->write_all(mddev); name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? "reshape" : "resync"; @@ -9564,7 +9680,7 @@ static void unregister_sync_thread(struct mddev *mddev) void md_check_recovery(struct mddev *mddev) { if (mddev->bitmap) - md_bitmap_daemon_work(mddev); + mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { @@ -9733,7 +9849,7 @@ void md_reap_sync_thread(struct mddev *mddev) * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by * clustered raid */ if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) - md_cluster_ops->resync_finish(mddev); + mddev->cluster_ops->resync_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -9741,13 +9857,13 @@ void md_reap_sync_thread(struct mddev *mddev) clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); /* - * We call md_cluster_ops->update_size here because sync_size could + * We call mddev->cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, * so it is time to update size across cluster. */ if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9762,9 +9878,7 @@ EXPORT_SYMBOL(md_reap_sync_thread); void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); - wait_event_timeout(rdev->blocked_wait, - !test_bit(Blocked, &rdev->flags) && - !test_bit(BlockedBadBlocks, &rdev->flags), + wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev), msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } @@ -9787,54 +9901,65 @@ EXPORT_SYMBOL(md_finish_reshape); /* Bad block management */ -/* Returns 1 on success, 0 on failure */ -int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +/* Returns true on success, false on failure */ +bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { struct mddev *mddev = rdev->mddev; - int rv; + + /* + * Recording new badblocks for faulty rdev will force unnecessary + * super block updating. This is fragile for external management because + * userspace daemon may trying to remove this device and deadlock may + * occur. This will be probably solved in the mdadm, but it is safer to + * avoid it. + */ + if (test_bit(Faulty, &rdev->flags)) + return true; + if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_set(&rdev->badblocks, s, sectors, 0); - if (rv == 0) { - /* Make sure they get written out promptly */ - if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); - sysfs_notify_dirent_safe(rdev->sysfs_state); - set_mask_bits(&mddev->sb_flags, 0, - BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); - md_wakeup_thread(rdev->mddev->thread); - return 1; - } else - return 0; + + if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) + return false; + + /* Make sure they get written out promptly */ + if (test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); + md_wakeup_thread(rdev->mddev->thread); + return true; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); -int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { - int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_clear(&rdev->badblocks, s, sectors); - if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) + + if (!badblocks_clear(&rdev->badblocks, s, sectors)) + return; + + if (test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_badblocks); - return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { - struct mddev *mddev, *n; + struct mddev *mddev; int need_delay = 0; spin_lock(&all_mddevs_lock); - list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); @@ -9846,8 +9971,8 @@ static int md_notify_reboot(struct notifier_block *this, mddev_unlock(mddev); } need_delay = 1; - mddev_put(mddev); spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } spin_unlock(&all_mddevs_lock); @@ -9935,7 +10060,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (ret) pr_info("md-cluster: resize failed\n"); else - md_bitmap_update_sb(mddev->bitmap); + mddev->bitmap_ops->update_sb(mddev->bitmap); } /* Check for change of roles in the active devices */ @@ -9963,8 +10088,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) */ if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & - MD_FEATURE_RESHAPE_ACTIVE)) { - rdev2->saved_raid_disk = role; + MD_FEATURE_RESHAPE_ACTIVE) && + !mddev->cluster_ops->resync_status_get(mddev)) { + /* + * -1 to make raid1_add_disk() set conf->fullsync + * to 1. This could avoid skipping sync when the + * remote node is down during resyncing. + */ + if ((le32_to_cpu(sb->feature_map) + & MD_FEATURE_RECOVERY_OFFSET)) + rdev2->saved_raid_disk = -1; + else + rdev2->saved_raid_disk = role; ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %pg\n", rdev2->bdev); @@ -10170,7 +10305,7 @@ void md_autostart_arrays(int part) static __exit void md_exit(void) { - struct mddev *mddev, *n; + struct mddev *mddev; int delay = 1; unregister_blkdev(MD_MAJOR,"md"); @@ -10191,7 +10326,7 @@ static __exit void md_exit(void) remove_proc_entry("mdstat", NULL); spin_lock(&all_mddevs_lock); - list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); @@ -10203,8 +10338,8 @@ static __exit void md_exit(void) * the mddev for destruction by a workqueue, and the * destroy_workqueue() below will wait for that to complete. */ - mddev_put(mddev); spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } spin_unlock(&all_mddevs_lock); |