diff options
Diffstat (limited to 'drivers/md/md.c')
| -rw-r--r-- | drivers/md/md.c | 1420 |
1 files changed, 982 insertions, 438 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index aebe12b0ee27..e5922a682953 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -79,16 +79,10 @@ static const char *action_name[NR_SYNC_ACTIONS] = { [ACTION_IDLE] = "idle", }; -/* pers_list is a list of registered personalities protected by pers_lock. */ -static LIST_HEAD(pers_list); -static DEFINE_SPINLOCK(pers_lock); +static DEFINE_XARRAY(md_submodule); static const struct kobj_type md_ktype; -const struct md_cluster_operations *md_cluster_ops; -EXPORT_SYMBOL(md_cluster_ops); -static struct module *md_cluster_mod; - static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; @@ -100,13 +94,12 @@ static struct workqueue_struct *md_wq; * workqueue whith reconfig_mutex grabbed. */ static struct workqueue_struct *md_misc_wq; -struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); -static void md_wakeup_thread_directly(struct md_thread __rcu *thread); +static void md_wakeup_thread_directly(struct md_thread __rcu **thread); /* * Default number of read corrections we'll attempt on an rdev @@ -117,32 +110,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* Default safemode delay: 200 msec */ #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' - * is 1000 KB/sec, so the extra system load does not show up that much. - * Increase it if you want to have more _guaranteed_ speed. Note that - * the RAID driver will use the maximum available bandwidth if the IO - * subsystem is idle. There is also an 'absolute maximum' reconstruction - * speed limit - in case reconstruction slows down your system despite - * idle IO detection. + * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit' + * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load + * does not show up that much. Increase it if you want to have more guaranteed + * speed. Note that the RAID driver will use the maximum bandwidth + * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle. + * + * Background sync IO speed control: + * + * - below speed min: + * no limit; + * - above speed min and below speed max: + * a) if mddev is idle, then no limit; + * b) if mddev is busy handling normal IO, then limit inflight sync IO + * to sync_io_depth; + * - above speed max: + * sync IO can't be issued; * - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. - * or /sys/block/mdX/md/sync_speed_{min,max} + * Following configurations can be changed via /proc/sys/dev/raid/ for system + * or /sys/block/mdX/md/ for one array. */ - static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; -static inline int speed_min(struct mddev *mddev) +static int sysctl_sync_io_depth = 32; + +static int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } -static inline int speed_max(struct mddev *mddev) +static int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? mddev->sync_speed_max : sysctl_speed_limit_max; } +static int sync_io_depth(struct mddev *mddev) +{ + return mddev->sync_io_depth ? + mddev->sync_io_depth : sysctl_sync_io_depth; +} + static void rdev_uninit_serial(struct md_rdev *rdev) { if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) @@ -294,19 +303,26 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) static struct ctl_table_header *raid_table_header; -static struct ctl_table raid_table[] = { +static const struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, + .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_io_depth", + .data = &sysctl_sync_io_depth, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_dointvec, }, }; @@ -322,6 +338,8 @@ static int start_readonly; * so all the races disappear. */ static bool create_on_open = true; +static bool legacy_async_del_gendisk = true; +static bool check_new_feature = true; /* * We have a system wide 'event count' that is incremented @@ -619,9 +637,12 @@ static void __mddev_put(struct mddev *mddev) mddev->ctime || mddev->hold_active) return; - /* Array is not configured at all, and not held active, so destroy it */ + /* + * If array is freed by stopping array, MD_DELETED is set by + * do_md_stop(), MD_DELETED is still set here in case mddev is freed + * directly by closing a mddev that is created by create_on_open. + */ set_bit(MD_DELETED, &mddev->flags); - /* * Call queue_work inside the spinlock so that flush_workqueue() after * mddev_find will succeed in waiting for the work to be done. @@ -629,6 +650,12 @@ static void __mddev_put(struct mddev *mddev) queue_work(md_misc_wq, &mddev->del_work); } +static void mddev_put_locked(struct mddev *mddev) +{ + if (atomic_dec_and_test(&mddev->active)) + __mddev_put(mddev); +} + void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) @@ -650,8 +677,66 @@ static void active_io_release(struct percpu_ref *ref) static void no_op(struct percpu_ref *r) {} +static bool mddev_set_bitmap_ops(struct mddev *mddev) +{ + struct bitmap_operations *old = mddev->bitmap_ops; + struct md_submodule_head *head; + + if (mddev->bitmap_id == ID_BITMAP_NONE || + (old && old->head.id == mddev->bitmap_id)) + return true; + + xa_lock(&md_submodule); + head = xa_load(&md_submodule, mddev->bitmap_id); + + if (!head) { + pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + if (head->type != MD_BITMAP) { + pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + mddev->bitmap_ops = (void *)head; + xa_unlock(&md_submodule); + + if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) { + if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) + pr_warn("md: cannot register extra bitmap attributes for %s\n", + mdname(mddev)); + else + /* + * Inform user with KOBJ_CHANGE about new bitmap + * attributes. + */ + kobject_uevent(&mddev->kobj, KOBJ_CHANGE); + } + return true; + +err: + xa_unlock(&md_submodule); + return false; +} + +static void mddev_clear_bitmap_ops(struct mddev *mddev) +{ + if (!mddev_is_dm(mddev) && mddev->bitmap_ops && + mddev->bitmap_ops->group) + sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group); + + mddev->bitmap_ops = NULL; +} + int mddev_init(struct mddev *mddev) { + int err = 0; + + if (!IS_ENABLED(CONFIG_MD_BITMAP)) + mddev->bitmap_id = ID_BITMAP_NONE; + else + mddev->bitmap_id = ID_BITMAP; if (percpu_ref_init(&mddev->active_io, active_io_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) @@ -659,10 +744,23 @@ int mddev_init(struct mddev *mddev) if (percpu_ref_init(&mddev->writes_pending, no_op, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { - percpu_ref_exit(&mddev->active_io); - return -ENOMEM; + err = -ENOMEM; + goto exit_acitve_io; } + err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (err) + goto exit_writes_pending; + + err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (err) + goto exit_bio_set; + + err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, + offsetof(struct md_io_clone, bio_clone), 0); + if (err) + goto exit_sync_set; + /* We want to start with the refcount at zero */ percpu_ref_put(&mddev->writes_pending); @@ -686,17 +784,29 @@ int mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; - mddev_set_bitmap_ops(mddev); INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); return 0; + +exit_sync_set: + bioset_exit(&mddev->sync_set); +exit_bio_set: + bioset_exit(&mddev->bio_set); +exit_writes_pending: + percpu_ref_exit(&mddev->writes_pending); +exit_acitve_io: + percpu_ref_exit(&mddev->active_io); + return err; } EXPORT_SYMBOL_GPL(mddev_init); void mddev_destroy(struct mddev *mddev) { + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); + bioset_exit(&mddev->io_clone_set); percpu_ref_exit(&mddev->active_io); percpu_ref_exit(&mddev->writes_pending); } @@ -850,6 +960,22 @@ void mddev_unlock(struct mddev *mddev) kobject_del(&rdev->kobj); export_rdev(rdev, mddev); } + + if (!legacy_async_del_gendisk) { + /* + * Call del_gendisk after release reconfig_mutex to avoid + * deadlock (e.g. call del_gendisk under the lock and an + * access to sysfs files waits the lock) + * And MD_DELETED is only used for md raid which is set in + * do_md_stop. dm raid only uses md_stop to stop. So dm raid + * doesn't need to check MD_DELETED when getting reconfig lock + */ + if (test_bit(MD_DELETED, &mddev->flags) && + !test_and_set_bit(MD_DO_DELETE, &mddev->flags)) { + kobject_del(&mddev->kobj); + del_gendisk(mddev->gendisk); + } + } } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -888,16 +1014,40 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) } EXPORT_SYMBOL_GPL(md_find_rdev_rcu); -static struct md_personality *find_pers(int level, char *clevel) +static struct md_personality *get_pers(int level, char *clevel) { - struct md_personality *pers; - list_for_each_entry(pers, &pers_list, list) { - if (level != LEVEL_NONE && pers->level == level) - return pers; - if (strcmp(pers->name, clevel)==0) - return pers; + struct md_personality *ret = NULL; + struct md_submodule_head *head; + unsigned long i; + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_PERSONALITY) + continue; + if ((level != LEVEL_NONE && head->id == level) || + !strcmp(head->name, clevel)) { + if (try_module_get(head->owner)) + ret = (void *)head; + break; + } } - return NULL; + xa_unlock(&md_submodule); + + if (!ret) { + if (level != LEVEL_NONE) + pr_warn("md: personality for level %d is not loaded!\n", + level); + else + pr_warn("md: personality for level %s is not loaded!\n", + clevel); + } + + return ret; +} + +static void put_pers(struct md_personality *pers) +{ + module_put(pers->head.owner); } /* return the offset of the super block in 512byte sectors */ @@ -956,15 +1106,26 @@ static void super_written(struct bio *bio) wake_up(&mddev->sb_wait); } -void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page) +/** + * md_write_metadata - write metadata to underlying disk, including + * array superblock, badblocks, bitmap superblock and bitmap bits. + * @mddev: the array to write + * @rdev: the underlying disk to write + * @sector: the offset to @rdev + * @size: the length of the metadata + * @page: the metadata + * @offset: the offset to @page + * + * Write @size bytes of @page start from @offset, to @sector of @rdev, Increment + * mddev->pending_writes before returning, and decrement it on completion, + * waking up sb_wait. Caller must call md_super_wait() after issuing io to all + * rdev. If an error occurred, md_error() will be called, and the @rdev will be + * kicked out from @mddev. + */ +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page, + unsigned int offset) { - /* write first size bytes of page to sector of rdev - * Increment mddev->pending_writes before returning - * and decrement it on completion, waking up sb_wait - * if zero is reached. - * If an error occurred, call md_error - */ struct bio *bio; if (!page) @@ -982,7 +1143,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector; - __bio_add_page(bio, page, size, 0); + __bio_add_page(bio, page, size, offset); bio->bi_private = rdev; bio->bi_end_io = super_written; @@ -1180,7 +1341,7 @@ int md_check_no_bitmap(struct mddev *mddev) if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; pr_warn("%s: bitmaps are not supported for %s\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); return 1; } EXPORT_SYMBOL(md_check_no_bitmap); @@ -1292,6 +1453,9 @@ static u64 md_bitmap_events_cleared(struct mddev *mddev) struct md_bitmap_stats stats; int err; + if (!md_bitmap_enabled(mddev, false)) + return 0; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return 0; @@ -1355,13 +1519,13 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else { if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { - mddev->recovery_cp = sb->recovery_cp; + mddev->resync_offset = sb->recovery_cp; } else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; } memcpy(mddev->uuid+0, &sb->set_uuid0, 4); @@ -1487,10 +1651,10 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) mddev->minor_version = sb->minor_version; if (mddev->in_sync) { - sb->recovery_cp = mddev->recovery_cp; + sb->recovery_cp = mddev->resync_offset; sb->cp_events_hi = (mddev->events>>32); sb->cp_events_lo = (u32)mddev->events; - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) sb->state = (1<< MD_SB_CLEAN); } else sb->recovery_cp = 0; @@ -1589,8 +1753,8 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } @@ -1688,9 +1852,13 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ } if (sb->pad0 || sb->pad3[0] || - memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) - /* Some padding is non-zero, might be a new feature */ - return -EINVAL; + memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) { + pr_warn("Some padding is non-zero on %pg, might be a new feature\n", + rdev->bdev); + if (check_new_feature) + return -EINVAL; + pr_warn("check_new_feature is disabled, data corruption possible\n"); + } rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); @@ -1748,7 +1916,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (badblocks_set(&rdev->badblocks, sector, count, 1)) + if (!badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -1831,6 +1999,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->dev_sectors = le64_to_cpu(sb->size); + mddev->logical_block_size = le32_to_cpu(sb->logical_block_size); mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.space = 0; @@ -1841,7 +2010,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc mddev->bitmap_info.default_space = (4096-1024) >> 9; mddev->reshape_backwards = 0; - mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + mddev->resync_offset = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; @@ -2027,7 +2196,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->utime = cpu_to_le64((__u64)mddev->utime); sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) - sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + sb->resync_offset = cpu_to_le64(mddev->resync_offset); else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) sb->resync_offset = cpu_to_le64(MaxSector); else @@ -2040,6 +2209,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->chunksize = cpu_to_le32(mddev->chunk_sectors); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); + sb->logical_block_size = cpu_to_le32(mddev->logical_block_size); if (test_bit(FailFast, &rdev->flags)) sb->devflags |= FailFast1; else @@ -2238,8 +2408,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; @@ -2249,13 +2419,15 @@ static int super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { + struct mddev *mddev = rdev->mddev; + /* All necessary checks on new >= old have been done */ if (new_offset >= rdev->data_offset) return 1; /* with 1.0 metadata, there is no metadata to tread on * so we can always move back */ - if (rdev->mddev->minor_version == 0) + if (mddev->minor_version == 0) return 1; /* otherwise we must be sure not to step on @@ -2267,8 +2439,7 @@ super_1_allow_new_offset(struct md_rdev *rdev, if (rdev->sb_start + (32+4)*2 > new_offset) return 0; - if (!rdev->mddev->bitmap_info.file) { - struct mddev *mddev = rdev->mddev; + if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) { struct md_bitmap_stats stats; int err; @@ -2359,19 +2530,6 @@ int md_integrity_register(struct mddev *mddev) return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || - (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { - /* - * No need to handle the failure of bioset_integrity_create, - * because the function is called by md_run() -> pers->run(), - * md_run calls bioset_exit -> bioset_integrity_free in case - * of failure case. - */ - pr_err("md: failed to create integrity pool for %s\n", - mdname(mddev)); - return -EINVAL; - } return 0; } EXPORT_SYMBOL(md_integrity_register); @@ -2630,6 +2788,7 @@ void md_update_sb(struct mddev *mddev, int force_change) if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev)); return; } @@ -2639,11 +2798,11 @@ repeat: force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) nospares = 1; - ret = md_cluster_ops->metadata_update_start(mddev); + ret = mddev->cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) - md_cluster_ops->metadata_update_cancel(mddev); + mddev->cluster_ops->metadata_update_cancel(mddev); bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)); @@ -2720,7 +2879,7 @@ repeat: /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ if (nospares - && (mddev->in_sync && mddev->recovery_cp == MaxSector) + && (mddev->in_sync && mddev->resync_offset == MaxSector) && mddev->can_decrease_events && mddev->events != 1) { mddev->events--; @@ -2753,24 +2912,24 @@ repeat: mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: - mddev->bitmap_ops->update_sb(mddev->bitmap); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ if (!test_bit(Faulty, &rdev->flags)) { - md_super_write(mddev,rdev, - rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); pr_debug("md: (write) %pg's sb offset: %llu\n", rdev->bdev, (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; if (rdev->badblocks.size) { - md_super_write(mddev, rdev, - rdev->badblocks.sector, - rdev->badblocks.size << 9, - rdev->bb_page); + md_write_metadata(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page, 0); rdev->badblocks.size = 0; } @@ -2783,7 +2942,7 @@ rewrite: /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_finish(mddev); + mddev->cluster_ops->metadata_update_finish(mddev); if (mddev->in_sync != sync_req || !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), @@ -2942,7 +3101,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) else { err = 0; if (mddev_is_clustered(mddev)) - err = md_cluster_ops->remove_disk(mddev, rdev); + err = mddev->cluster_ops->remove_disk(mddev, rdev); if (err == 0) { md_kick_rdev_from_array(rdev); @@ -3052,7 +3211,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * by this node eventually */ if (!mddev_is_clustered(rdev->mddev) || - (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { + (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { clear_bit(Faulty, &rdev->flags); err = add_bound_rdev(rdev); } @@ -3860,7 +4019,7 @@ level_show(struct mddev *mddev, char *page) spin_lock(&mddev->lock); p = mddev->pers; if (p) - ret = sprintf(page, "%s\n", p->name); + ret = sprintf(page, "%s\n", p->head.name); else if (mddev->clevel[0]) ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) @@ -3917,7 +4076,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) rv = -EINVAL; if (!mddev->pers->quiesce) { pr_warn("md: %s: %s does not support online personality change\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); goto out_unlock; } @@ -3931,24 +4090,20 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (request_module("md-%s", clevel) != 0) request_module("md-level-%s", clevel); - spin_lock(&pers_lock); - pers = find_pers(level, clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - pr_warn("md: personality %s not loaded\n", clevel); + pers = get_pers(level, clevel); + if (!pers) { rv = -EINVAL; goto out_unlock; } - spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! */ - module_put(pers->owner); + put_pers(pers); rv = len; goto out_unlock; } if (!pers->takeover) { - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); rv = -EINVAL; @@ -3969,7 +4124,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; mddev->reshape_backwards = 0; - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s would not accept array\n", mdname(mddev), clevel); rv = PTR_ERR(priv); @@ -3984,7 +4139,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) oldpriv = mddev->private; mddev->pers = pers; mddev->private = priv; - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; @@ -4026,7 +4181,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->to_remove = &md_redundancy_group; } - module_put(oldpers->owner); + put_pers(oldpers); rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) @@ -4057,7 +4212,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) * it must always be in_sync */ mddev->in_sync = 1; - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); } pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -4103,6 +4258,86 @@ static struct md_sysfs_entry md_new_level = __ATTR(new_level, 0664, new_level_show, new_level_store); static ssize_t +bitmap_type_show(struct mddev *mddev, char *page) +{ + struct md_submodule_head *head; + unsigned long i; + ssize_t len = 0; + + if (mddev->bitmap_id == ID_BITMAP_NONE) + len += sprintf(page + len, "[none] "); + else + len += sprintf(page + len, "none "); + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_BITMAP) + continue; + + if (mddev->bitmap_id == head->id) + len += sprintf(page + len, "[%s] ", head->name); + else + len += sprintf(page + len, "%s ", head->name); + } + xa_unlock(&md_submodule); + + len += sprintf(page + len, "\n"); + return len; +} + +static ssize_t +bitmap_type_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct md_submodule_head *head; + enum md_submodule_id id; + unsigned long i; + int err = 0; + + xa_lock(&md_submodule); + + if (mddev->bitmap_ops) { + err = -EBUSY; + goto out; + } + + if (cmd_match(buf, "none")) { + mddev->bitmap_id = ID_BITMAP_NONE; + goto out; + } + + xa_for_each(&md_submodule, i, head) { + if (head->type == MD_BITMAP && cmd_match(buf, head->name)) { + mddev->bitmap_id = head->id; + goto out; + } + } + + err = kstrtoint(buf, 10, &id); + if (err) + goto out; + + if (id == ID_BITMAP_NONE) { + mddev->bitmap_id = id; + goto out; + } + + head = xa_load(&md_submodule, id); + if (head && head->type == MD_BITMAP) { + mddev->bitmap_id = id; + goto out; + } + + err = -ENOENT; + +out: + xa_unlock(&md_submodule); + return err ? err : len; +} + +static struct md_sysfs_entry md_bitmap_type = +__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store); + +static ssize_t layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ @@ -4260,9 +4495,9 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(struct mddev *mddev, char *page) { - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) return sprintf(page, "none\n"); - return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); + return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); } static ssize_t @@ -4288,7 +4523,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) err = -EBUSY; if (!err) { - mddev->recovery_cp = n; + mddev->resync_offset = n; if (mddev->pers) set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } @@ -4633,6 +4868,9 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) unsigned long chunk, end_chunk; int err; + if (!md_bitmap_enabled(mddev, false)) + return len; + err = mddev_lock(mddev); if (err) return err; @@ -4792,9 +5030,42 @@ out_unlock: static struct md_sysfs_entry md_metadata = __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); +static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors) +{ + return rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sectors; +} + +static enum sync_action md_get_active_sync_action(struct mddev *mddev) +{ + struct md_rdev *rdev; + bool is_recover = false; + + if (mddev->resync_offset < MaxSector) + return ACTION_RESYNC; + + if (mddev->reshape_position != MaxSector) + return ACTION_RESHAPE; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev_needs_recovery(rdev, MaxSector)) { + is_recover = true; + break; + } + } + rcu_read_unlock(); + + return is_recover ? ACTION_RECOVER : ACTION_IDLE; +} + enum sync_action md_sync_action(struct mddev *mddev) { unsigned long recovery = mddev->recovery; + enum sync_action active_action; /* * frozen has the highest priority, means running sync_thread will be @@ -4818,8 +5089,17 @@ enum sync_action md_sync_action(struct mddev *mddev) !test_bit(MD_RECOVERY_NEEDED, &recovery)) return ACTION_IDLE; - if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || - mddev->reshape_position != MaxSector) + /* + * Check if any sync operation (resync/recover/reshape) is + * currently active. This ensures that only one sync operation + * can run at a time. Returns the type of active operation, or + * ACTION_IDLE if none are active. + */ + active_action = md_get_active_sync_action(mddev); + if (active_action != ACTION_IDLE) + return active_action; + + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return ACTION_RESHAPE; if (test_bit(MD_RECOVERY_RECOVER, &recovery)) @@ -4893,7 +5173,7 @@ static void stop_sync_thread(struct mddev *mddev, bool locked) * Thread might be blocked waiting for metadata update which will now * never happen */ - md_wakeup_thread_directly(mddev->sync_thread); + md_wakeup_thread_directly(&mddev->sync_thread); if (work_pending(&mddev->sync_work)) flush_work(&mddev->sync_work); @@ -5084,7 +5364,7 @@ static ssize_t sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), - mddev->sync_speed_min ? "local": "system"); + mddev->sync_speed_min ? "local" : "system"); } static ssize_t @@ -5093,7 +5373,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len) unsigned int min; int rv; - if (strncmp(buf, "system", 6)==0) { + if (strncmp(buf, "system", 6) == 0) { min = 0; } else { rv = kstrtouint(buf, 10, &min); @@ -5113,7 +5393,7 @@ static ssize_t sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), - mddev->sync_speed_max ? "local": "system"); + mddev->sync_speed_max ? "local" : "system"); } static ssize_t @@ -5122,7 +5402,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len) unsigned int max; int rv; - if (strncmp(buf, "system", 6)==0) { + if (strncmp(buf, "system", 6) == 0) { max = 0; } else { rv = kstrtouint(buf, 10, &max); @@ -5139,6 +5419,35 @@ static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t +sync_io_depth_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), + mddev->sync_io_depth ? "local" : "system"); +} + +static ssize_t +sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int max; + int rv; + + if (strncmp(buf, "system", 6) == 0) { + max = 0; + } else { + rv = kstrtouint(buf, 10, &max); + if (rv < 0) + return rv; + if (max == 0) + return -EINVAL; + } + mddev->sync_io_depth = max; + return len; +} + +static struct md_sysfs_entry md_sync_io_depth = +__ATTR_RW(sync_io_depth); + +static ssize_t degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); @@ -5584,7 +5893,7 @@ __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, static ssize_t serialize_policy_show(struct mddev *mddev, char *page) { - if (mddev->pers == NULL || (mddev->pers->level != 1)) + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) return sprintf(page, "n/a\n"); else return sprintf(page, "%d\n", mddev->serialize_policy); @@ -5610,7 +5919,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) err = mddev_suspend_and_lock(mddev); if (err) return err; - if (mddev->pers == NULL || (mddev->pers->level != 1)) { + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { pr_err("md: serialize_policy is only effective for raid1\n"); err = -EINVAL; goto unlock; @@ -5630,10 +5939,73 @@ static struct md_sysfs_entry md_serialize_policy = __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, serialize_policy_store); +static int mddev_set_logical_block_size(struct mddev *mddev, + unsigned int lbs) +{ + int err = 0; + struct queue_limits lim; + + if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) { + pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n", + mdname(mddev), lbs); + return -EINVAL; + } + + lim = queue_limits_start_update(mddev->gendisk->queue); + lim.logical_block_size = lbs; + pr_info("%s: logical_block_size is changed, data may be lost\n", + mdname(mddev)); + err = queue_limits_commit_update(mddev->gendisk->queue, &lim); + if (err) + return err; + + mddev->logical_block_size = lbs; + /* New lbs will be written to superblock after array is running */ + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + return 0; +} + +static ssize_t +lbs_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%u\n", mddev->logical_block_size); +} + +static ssize_t +lbs_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int lbs; + int err = -EBUSY; + + /* Only 1.x meta supports configurable LBS */ + if (mddev->major_version == 0) + return -EINVAL; + + if (mddev->pers) + return -EBUSY; + + err = kstrtouint(buf, 10, &lbs); + if (err < 0) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + goto unlock; + + err = mddev_set_logical_block_size(mddev, lbs); + +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_logical_block_size = +__ATTR(logical_block_size, 0644, lbs_show, lbs_store); static struct attribute *md_default_attrs[] = { &md_level.attr, &md_new_level.attr, + &md_bitmap_type.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, @@ -5651,6 +6023,7 @@ static struct attribute *md_default_attrs[] = { &md_consistency_policy.attr, &md_fail_last_dev.attr, &md_serialize_policy.attr, + &md_logical_block_size.attr, NULL, }; @@ -5664,6 +6037,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, + &md_sync_io_depth.attr, &md_sync_speed.attr, &md_sync_force_parallel.attr, &md_sync_completed.attr, @@ -5682,7 +6056,6 @@ static const struct attribute_group md_redundancy_group = { static const struct attribute_group *md_attr_groups[] = { &md_default_group, - &md_bitmap_group, NULL, }; @@ -5714,19 +6087,30 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; + struct kernfs_node *kn = NULL; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + if (entry->store == array_state_store && cmd_match(page, "clear")) + kn = sysfs_break_active_protection(kobj, attr); + spin_lock(&all_mddevs_lock); if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); + if (kn) + sysfs_unbreak_active_protection(kn); return -EBUSY; } spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); mddev_put(mddev); + + if (kn) + sysfs_unbreak_active_protection(kn); + return rv; } @@ -5734,12 +6118,13 @@ static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); - if (mddev->sysfs_state) - sysfs_put(mddev->sysfs_state); - if (mddev->sysfs_level) - sysfs_put(mddev->sysfs_level); - - del_gendisk(mddev->gendisk); + if (legacy_async_del_gendisk) { + if (mddev->sysfs_state) + sysfs_put(mddev->sysfs_state); + if (mddev->sysfs_level) + sysfs_put(mddev->sysfs_level); + del_gendisk(mddev->gendisk); + } put_disk(mddev->gendisk); } @@ -5769,6 +6154,17 @@ int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, return -EINVAL; } + /* + * Before RAID adding folio support, the logical_block_size + * should be smaller than the page size. + */ + if (lim->logical_block_size > PAGE_SIZE) { + pr_err("%s: logical_block_size must not larger than PAGE_SIZE\n", + mdname(mddev)); + return -EINVAL; + } + mddev->logical_block_size = lim->logical_block_size; + return 0; } EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); @@ -5781,6 +6177,13 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) if (mddev_is_dm(mddev)) return 0; + if (queue_logical_block_size(rdev->bdev->bd_disk->queue) > + queue_logical_block_size(mddev->gendisk->queue)) { + pr_err("%s: incompatible logical_block_size, can not add\n", + mdname(mddev)); + return -EINVAL; + } + lim = queue_limits_start_update(mddev->gendisk->queue); queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); @@ -5943,6 +6346,9 @@ static int md_alloc_and_put(dev_t dev, char *name) { struct mddev *mddev = md_alloc(dev, name); + if (legacy_async_del_gendisk) + pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n"); + if (IS_ERR(mddev)) return PTR_ERR(mddev); mddev_put(mddev); @@ -5988,7 +6394,7 @@ static int add_named_array(const char *val, const struct kernel_param *kp) static void md_safemode_timeout(struct timer_list *t) { - struct mddev *mddev = from_timer(mddev, t, safemode_timer); + struct mddev *mddev = timer_container_of(mddev, t, safemode_timer); mddev->safemode = 1; if (mddev->external) @@ -5999,6 +6405,26 @@ static void md_safemode_timeout(struct timer_list *t) static int start_dirty_degraded; +static int md_bitmap_create(struct mddev *mddev) +{ + if (mddev->bitmap_id == ID_BITMAP_NONE) + return -EINVAL; + + if (!mddev_set_bitmap_ops(mddev)) + return -ENOENT; + + return mddev->bitmap_ops->create(mddev); +} + +static void md_bitmap_destroy(struct mddev *mddev) +{ + if (!md_bitmap_registered(mddev)) + return; + + mddev->bitmap_ops->destroy(mddev); + mddev_clear_bitmap_ops(mddev); +} + int md_run(struct mddev *mddev) { int err; @@ -6078,50 +6504,20 @@ int md_run(struct mddev *mddev) nowait = nowait && bdev_nowait(rdev->bdev); } - if (!bioset_initialized(&mddev->bio_set)) { - err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (err) - return err; - } - if (!bioset_initialized(&mddev->sync_set)) { - err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (err) - goto exit_bio_set; - } - - if (!bioset_initialized(&mddev->io_clone_set)) { - err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, - offsetof(struct md_io_clone, bio_clone), 0); - if (err) - goto exit_sync_set; - } - - spin_lock(&pers_lock); - pers = find_pers(mddev->level, mddev->clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - if (mddev->level != LEVEL_NONE) - pr_warn("md: personality for level %d is not loaded!\n", - mddev->level); - else - pr_warn("md: personality for level %s is not loaded!\n", - mddev->clevel); - err = -EINVAL; - goto abort; - } - spin_unlock(&pers_lock); - if (mddev->level != pers->level) { - mddev->level = pers->level; - mddev->new_level = pers->level; + pers = get_pers(mddev->level, mddev->clevel); + if (!pers) + return -EINVAL; + if (mddev->level != pers->head.id) { + mddev->level = pers->head.id; + mddev->new_level = pers->head.id; } - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ - module_put(pers->owner); - err = -EINVAL; - goto abort; + put_pers(pers); + return -EINVAL; } if (pers->sync_request) { @@ -6174,7 +6570,7 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = mddev->bitmap_ops->create(mddev, -1); + err = md_bitmap_create(mddev); if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); @@ -6246,14 +6642,8 @@ bitmap_abort: if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; - module_put(pers->owner); - mddev->bitmap_ops->destroy(mddev); -abort: - bioset_exit(&mddev->io_clone_set); -exit_sync_set: - bioset_exit(&mddev->sync_set); -exit_bio_set: - bioset_exit(&mddev->bio_set); + put_pers(pers); + md_bitmap_destroy(mddev); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -6267,10 +6657,12 @@ int do_md_run(struct mddev *mddev) if (err) goto out; - err = mddev->bitmap_ops->load(mddev); - if (err) { - mddev->bitmap_ops->destroy(mddev); - goto out; + if (md_bitmap_registered(mddev)) { + err = mddev->bitmap_ops->load(mddev); + if (err) { + md_bitmap_destroy(mddev); + goto out; + } } if (mddev_is_clustered(mddev)) @@ -6354,7 +6746,7 @@ static void md_clean(struct mddev *mddev) mddev->external_size = 0; mddev->dev_sectors = 0; mddev->raid_disks = 0; - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; @@ -6362,21 +6754,29 @@ static void md_clean(struct mddev *mddev) mddev->persistent = 0; mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; + /* - * Don't clear MD_CLOSING, or mddev can be opened again. - * 'hold_active != 0' means mddev is still in the creation - * process and will be used later. + * For legacy_async_del_gendisk mode, it can stop the array in the + * middle of assembling it, then it still can access the array. So + * it needs to clear MD_CLOSING. If not legacy_async_del_gendisk, + * it can't open the array again after stopping it. So it doesn't + * clear MD_CLOSING. */ - if (mddev->hold_active) - mddev->flags = 0; - else + if (legacy_async_del_gendisk && mddev->hold_active) { + clear_bit(MD_CLOSING, &mddev->flags); + } else { + /* if UNTIL_STOP is set, it's cleared here */ + mddev->hold_active = 0; + /* Don't clear MD_CLOSING, or mddev can be opened again. */ mddev->flags &= BIT_ULL_MASK(MD_CLOSING); + } mddev->sb_flags = 0; mddev->ro = MD_RDWR; mddev->metadata_type[0] = 0; mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; mddev->layout = 0; + mddev->logical_block_size = 0; mddev->max_disks = 0; mddev->events = 0; mddev->can_decrease_events = 0; @@ -6407,14 +6807,15 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - mddev->bitmap_ops->flush(mddev); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || @@ -6441,7 +6842,8 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - mddev->bitmap_ops->wait_behind_writes(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -6457,7 +6859,7 @@ static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; @@ -6465,14 +6867,8 @@ static void __md_stop(struct mddev *mddev) if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; - if (pers->sync_request && mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - module_put(pers->owner); + put_pers(pers); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - - bioset_exit(&mddev->bio_set); - bioset_exit(&mddev->sync_set); - bioset_exit(&mddev->io_clone_set); } void md_stop(struct mddev *mddev) @@ -6563,6 +6959,10 @@ static int do_md_stop(struct mddev *mddev, int mode) if (!md_is_rdwr(mddev)) set_disk_ro(disk, 0); + if (mode == 2 && mddev->pers->sync_request && + mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + __md_stop_writes(mddev); __md_stop(mddev); @@ -6595,10 +6995,9 @@ static int do_md_stop(struct mddev *mddev, int mode) mddev->bitmap_info.offset = 0; export_array(mddev); - md_clean(mddev); - if (mddev->hold_active == UNTIL_STOP) - mddev->hold_active = 0; + if (!legacy_async_del_gendisk) + set_bit(MD_DELETED, &mddev->flags); } md_new_event(); sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -6983,7 +7382,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) set_bit(Candidate, &rdev->flags); else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk(mddev, rdev); + err = mddev->cluster_ops->add_new_disk(mddev, rdev); if (err) { export_rdev(rdev, mddev); return err; @@ -7000,14 +7399,14 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { if (!err) { - err = md_cluster_ops->new_disk_ack(mddev, - err == 0); + err = mddev->cluster_ops->new_disk_ack( + mddev, err == 0); if (err) md_kick_rdev_from_array(rdev); } } else { if (err) - md_cluster_ops->add_new_disk_cancel(mddev); + mddev->cluster_ops->add_new_disk_cancel(mddev); else err = add_bound_rdev(rdev); } @@ -7087,10 +7486,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) { - if (md_cluster_ops->remove_disk(mddev, rdev)) - goto busy; - } + if (mddev_is_clustered(mddev) && + mddev->cluster_ops->remove_disk(mddev, rdev)) + goto busy; md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -7179,6 +7577,9 @@ static int set_bitmap_file(struct mddev *mddev, int fd) { int err = 0; + if (!md_bitmap_registered(mddev)) + return -EINVAL; + if (mddev->pers) { if (!mddev->pers->quiesce || !mddev->thread) return -EBUSY; @@ -7235,16 +7636,16 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - err = mddev->bitmap_ops->create(mddev, -1); + err = md_bitmap_create(mddev); if (!err) err = mddev->bitmap_ops->load(mddev); if (err) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); fd = -1; } } else if (fd < 0) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } } @@ -7309,9 +7710,9 @@ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) * openned */ if (info->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->persistent = ! info->not_persistent; mddev->external = 0; @@ -7393,7 +7794,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) rv = mddev->pers->resize(mddev, num_sectors); if (!rv) { if (mddev_is_clustered(mddev)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); else if (!mddev_is_dm(mddev)) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); @@ -7441,6 +7842,28 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return rv; } +static int get_cluster_ops(struct mddev *mddev) +{ + xa_lock(&md_submodule); + mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); + if (mddev->cluster_ops && + !try_module_get(mddev->cluster_ops->head.owner)) + mddev->cluster_ops = NULL; + xa_unlock(&md_submodule); + + return mddev->cluster_ops == NULL ? -ENOENT : 0; +} + +static void put_cluster_ops(struct mddev *mddev) +{ + if (!mddev->cluster_ops) + return; + + mddev->cluster_ops->leave(mddev); + module_put(mddev->cluster_ops->head.owner); + mddev->cluster_ops = NULL; +} + /* * update_array_info is used to change the configuration of an * on-line array. @@ -7529,12 +7952,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - rv = mddev->bitmap_ops->create(mddev, -1); + rv = md_bitmap_create(mddev); if (!rv) rv = mddev->bitmap_ops->load(mddev); if (rv) - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } else { struct md_bitmap_stats stats; @@ -7549,19 +7972,18 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ - if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { + if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); rv = -EPERM; - md_cluster_ops->unlock_all_bitmaps(mddev); + mddev->cluster_ops->unlock_all_bitmaps(mddev); goto err; } mddev->bitmap_info.nodes = 0; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; } } @@ -7598,9 +8020,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ -static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mddev *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = disk->private_data; geo->heads = 2; geo->sectors = 4; @@ -7842,7 +8264,7 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, case CLUSTERED_DISK_NACK: if (mddev_is_clustered(mddev)) - md_cluster_ops->new_disk_ack(mddev, false); + mddev->cluster_ops->new_disk_ack(mddev, false); else err = -EINVAL; goto unlock; @@ -8045,22 +8467,21 @@ static int md_thread(void *arg) return 0; } -static void md_wakeup_thread_directly(struct md_thread __rcu *thread) +static void md_wakeup_thread_directly(struct md_thread __rcu **thread) { struct md_thread *t; rcu_read_lock(); - t = rcu_dereference(thread); + t = rcu_dereference(*thread); if (t) wake_up_process(t->tsk); rcu_read_unlock(); } -void md_wakeup_thread(struct md_thread __rcu *thread) +void __md_wakeup_thread(struct md_thread __rcu *thread) { struct md_thread *t; - rcu_read_lock(); t = rcu_dereference(thread); if (t) { pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); @@ -8068,9 +8489,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread) if (wq_has_sleeper(&t->wqueue)) wake_up(&t->wqueue); } - rcu_read_unlock(); } -EXPORT_SYMBOL(md_wakeup_thread); +EXPORT_SYMBOL(__md_wakeup_thread); struct md_thread *md_register_thread(void (*run) (struct md_thread *), struct mddev *mddev, const char *name) @@ -8124,7 +8544,8 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) return; mddev->pers->error_handler(mddev, rdev); - if (mddev->pers->level == 0) + if (mddev->pers->head.id == ID_RAID0 || + mddev->pers->head.id == ID_LINEAR) return; if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) @@ -8162,14 +8583,17 @@ static void status_unused(struct seq_file *seq) static void status_personalities(struct seq_file *seq) { - struct md_personality *pers; + struct md_submodule_head *head; + unsigned long i; seq_puts(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - spin_unlock(&pers_lock); + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) + if (head->type == MD_PERSONALITY) + seq_printf(seq, "[%s] ", head->name); + xa_unlock(&md_submodule); + seq_puts(seq, "\n"); } @@ -8225,7 +8649,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, "\tresync=REMOTE"); return 1; } - if (mddev->recovery_cp < MaxSector) { + if (mddev->resync_offset < MaxSector) { seq_printf(seq, "\tresync=PENDING"); return 1; } @@ -8338,6 +8762,9 @@ static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) unsigned long chunk_kb; int err; + if (!md_bitmap_enabled(mddev, false)) + return; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return; @@ -8376,6 +8803,10 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; spin_unlock(&all_mddevs_lock); + + /* prevent bitmap to be freed after checking */ + mutex_lock(&mddev->bitmap_info.mutex); + spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : ", mdname(mddev)); @@ -8388,7 +8819,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " (read-only)"); if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); - seq_printf(seq, " %s", mddev->pers->name); + seq_printf(seq, " %s", mddev->pers->head.name); } else { seq_printf(seq, "inactive"); } @@ -8451,14 +8882,13 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } spin_unlock(&mddev->lock); + mutex_unlock(&mddev->bitmap_info.mutex); spin_lock(&all_mddevs_lock); if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) status_unused(seq); - if (atomic_dec_and_test(&mddev->active)) - __mddev_put(mddev); - + mddev_put_locked(mddev); return 0; } @@ -8509,67 +8939,34 @@ static const struct proc_ops mdstat_proc_ops = { .proc_poll = mdstat_poll, }; -int register_md_personality(struct md_personality *p) -{ - pr_debug("md: %s personality registered for level %d\n", - p->name, p->level); - spin_lock(&pers_lock); - list_add_tail(&p->list, &pers_list); - spin_unlock(&pers_lock); - return 0; -} -EXPORT_SYMBOL(register_md_personality); - -int unregister_md_personality(struct md_personality *p) -{ - pr_debug("md: %s personality unregistered\n", p->name); - spin_lock(&pers_lock); - list_del_init(&p->list); - spin_unlock(&pers_lock); - return 0; -} -EXPORT_SYMBOL(unregister_md_personality); - -int register_md_cluster_operations(const struct md_cluster_operations *ops, - struct module *module) +int register_md_submodule(struct md_submodule_head *msh) { - int ret = 0; - spin_lock(&pers_lock); - if (md_cluster_ops != NULL) - ret = -EALREADY; - else { - md_cluster_ops = ops; - md_cluster_mod = module; - } - spin_unlock(&pers_lock); - return ret; + return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL); } -EXPORT_SYMBOL(register_md_cluster_operations); +EXPORT_SYMBOL_GPL(register_md_submodule); -int unregister_md_cluster_operations(void) +void unregister_md_submodule(struct md_submodule_head *msh) { - spin_lock(&pers_lock); - md_cluster_ops = NULL; - spin_unlock(&pers_lock); - return 0; + xa_erase(&md_submodule, msh->id); } -EXPORT_SYMBOL(unregister_md_cluster_operations); +EXPORT_SYMBOL_GPL(unregister_md_submodule); int md_setup_cluster(struct mddev *mddev, int nodes) { - int ret; - if (!md_cluster_ops) + int ret = get_cluster_ops(mddev); + + if (ret) { request_module("md-cluster"); - spin_lock(&pers_lock); + ret = get_cluster_ops(mddev); + } + /* ensure module won't be unloaded */ - if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { + if (ret) { pr_warn("can't find md-cluster module or get its reference.\n"); - spin_unlock(&pers_lock); - return -ENOENT; + return ret; } - spin_unlock(&pers_lock); - ret = md_cluster_ops->join(mddev, nodes); + ret = mddev->cluster_ops->join(mddev, nodes); if (!ret) mddev->safemode_delay = 0; return ret; @@ -8577,56 +8974,58 @@ int md_setup_cluster(struct mddev *mddev, int nodes) void md_cluster_stop(struct mddev *mddev) { - if (!md_cluster_ops) - return; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); } -static int is_mddev_idle(struct mddev *mddev, int init) +static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init) { + unsigned long last_events = rdev->last_events; + + if (!bdev_is_partition(rdev->bdev)) + return true; + + /* + * If rdev is partition, and user doesn't issue IO to the array, the + * array is still not idle if user issues IO to other partitions. + */ + rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0, + sectors) - + part_stat_read_accum(rdev->bdev, sectors); + + return init || rdev->last_events <= last_events; +} + +/* + * mddev is idle if following conditions are matched since last check: + * 1) mddev doesn't have normal IO completed; + * 2) mddev doesn't have inflight normal IO; + * 3) if any member disk is partition, and other partitions don't have IO + * completed; + * + * Noted this checking rely on IO accounting is enabled. + */ +static bool is_mddev_idle(struct mddev *mddev, int init) +{ + unsigned long last_events = mddev->normal_io_events; + struct gendisk *disk; struct md_rdev *rdev; - int idle; - int curr_events; + bool idle = true; - idle = 1; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) { - struct gendisk *disk = rdev->bdev->bd_disk; + disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk; + if (!disk) + return true; - if (!init && !blk_queue_io_stat(disk->queue)) - continue; + mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); + if (!init && (mddev->normal_io_events > last_events || + bdev_count_inflight(disk->part0))) + idle = false; - curr_events = (int)part_stat_read_accum(disk->part0, sectors) - - atomic_read(&disk->sync_io); - /* sync IO will cause sync_io to increase before the disk_stats - * as sync_io is counted when a request starts, and - * disk_stats is counted when it completes. - * So resync activity will cause curr_events to be smaller than - * when there was no such activity. - * non-sync IO will cause disk_stat to increase without - * increasing sync_io so curr_events will (eventually) - * be larger than it was before. Once it becomes - * substantially larger, the test below will cause - * the array to appear non-idle, and resync will slow - * down. - * If there is a lot of outstanding resync activity when - * we set last_event to curr_events, then all that activity - * completing might cause the array to appear non-idle - * and resync will be slowed down even though there might - * not have been non-resync activity. This will only - * happen once though. 'last_events' will soon reflect - * the state where there is little or no outstanding - * resync requests, and further resync activity will - * always make curr_events less than last_events. - * - */ - if (init || curr_events - rdev->last_events > 64) { - rdev->last_events = curr_events; - idle = 0; - } - } + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (!is_rdev_holder_idle(rdev, init)) + idle = false; rcu_read_unlock(); + return idle; } @@ -8745,12 +9144,38 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); +static void md_bitmap_start(struct mddev *mddev, + struct md_io_clone *md_io_clone) +{ + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? + mddev->bitmap_ops->start_discard : + mddev->bitmap_ops->start_write; + + if (mddev->pers->bitmap_sector) + mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, + &md_io_clone->sectors); + + fn(mddev, md_io_clone->offset, md_io_clone->sectors); +} + +static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) +{ + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? + mddev->bitmap_ops->end_discard : + mddev->bitmap_ops->end_write; + + fn(mddev, md_io_clone->offset, md_io_clone->sectors); +} + static void md_end_clone_io(struct bio *bio) { struct md_io_clone *md_io_clone = bio->bi_private; struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) + md_bitmap_end(mddev, md_io_clone); + if (bio->bi_status && !orig_bio->bi_status) orig_bio->bi_status = bio->bi_status; @@ -8775,6 +9200,13 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio) if (blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone->start_time = bio_start_io_acct(*bio); + if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) { + md_io_clone->offset = (*bio)->bi_iter.bi_sector; + md_io_clone->sectors = bio_sectors(*bio); + md_io_clone->rw = op_stat_group(bio_op(*bio)); + md_bitmap_start(mddev, md_io_clone); + } + clone->bi_end_io = md_end_clone_io; clone->bi_private = md_io_clone; *bio = clone; @@ -8793,6 +9225,9 @@ void md_free_cloned_bio(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) + md_bitmap_end(mddev, md_io_clone); + if (bio->bi_status && !orig_bio->bi_status) orig_bio->bi_status = bio->bi_status; @@ -8856,6 +9291,39 @@ static sector_t md_sync_max_sectors(struct mddev *mddev, } } +/* + * If lazy recovery is requested and all rdevs are in sync, select the rdev with + * the higest index to perfore recovery to build initial xor data, this is the + * same as old bitmap. + */ +static bool mddev_select_lazy_recover_rdev(struct mddev *mddev) +{ + struct md_rdev *recover_rdev = NULL; + struct md_rdev *rdev; + bool ret = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + + if (test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + break; + + if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk) + recover_rdev = rdev; + } + + if (recover_rdev) { + clear_bit(In_sync, &recover_rdev->flags); + ret = true; + } + + rcu_read_unlock(); + return ret; +} + static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) { sector_t start = 0; @@ -8867,7 +9335,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) return mddev->resync_min; case ACTION_RESYNC: if (!mddev->bitmap) - return mddev->recovery_cp; + return mddev->resync_offset; return 0; case ACTION_RESHAPE: /* @@ -8883,14 +9351,18 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) start = MaxSector; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < start) + if (rdev_needs_recovery(rdev, start)) start = rdev->recovery_offset; rcu_read_unlock(); + /* + * If there are no spares, and raid456 lazy initial recover is + * requested. + */ + if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) && + start == MaxSector && mddev_select_lazy_recover_rdev(mddev)) + start = 0; + /* If there is a bitmap, we need to make sure all * writes that started before we added a spare * complete before we start doing a recovery. @@ -8909,6 +9381,16 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) } } +static bool sync_io_within_limit(struct mddev *mddev) +{ + /* + * For raid456, sync IO is stripe(4k) per IO, for other levels, it's + * RESYNC_PAGES(64k) per IO. + */ + return atomic_read(&mddev->recovery_active) < + (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev); +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) @@ -8944,7 +9426,7 @@ void md_do_sync(struct md_thread *thread) } if (mddev_is_clustered(mddev)) { - ret = md_cluster_ops->resync_start(mddev); + ret = mddev->cluster_ops->resync_start(mddev); if (ret) goto skip; @@ -8958,6 +9440,11 @@ void md_do_sync(struct md_thread *thread) } action = md_sync_action(mddev); + if (action == ACTION_FROZEN || action == ACTION_IDLE) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto skip; + } + desc = md_sync_action_name(action); mddev->last_sync_action = action; @@ -8971,7 +9458,7 @@ void md_do_sync(struct md_thread *thread) * */ if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_start_notify(mddev); + mddev->cluster_ops->resync_start_notify(mddev); do { int mddev2_minor = -1; mddev->curr_resync = MD_RESYNC_DELAYED; @@ -9088,8 +9575,8 @@ void md_do_sync(struct md_thread *thread) atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && - j > mddev->recovery_cp) - mddev->recovery_cp = j; + j > mddev->resync_offset) + mddev->resync_offset = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9111,6 +9598,12 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; + if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) { + sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j); + if (sectors) + goto update; + } + sectors = mddev->pers->sync_request(mddev, j, max_sectors, &skipped); if (sectors == 0) { @@ -9126,6 +9619,7 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; +update: j += sectors; if (j > max_sectors) /* when skipping, extra large numbers can be returned. */ @@ -9177,7 +9671,8 @@ void md_do_sync(struct md_thread *thread) msleep(500); goto repeat; } - if (!is_mddev_idle(mddev, 0)) { + if (!sync_io_within_limit(mddev) && + !is_mddev_idle(mddev, 0)) { /* * Give other IO more of a chance. * The faster the devices, the less we wait. @@ -9208,19 +9703,19 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - if (mddev->curr_resync >= mddev->recovery_cp) { + if (mddev->curr_resync >= mddev->resync_offset) { pr_debug("md: checkpointing %s of %s.\n", desc, mdname(mddev)); if (test_bit(MD_RECOVERY_ERROR, &mddev->recovery)) - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync_completed; else - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync; } } else - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; @@ -9228,12 +9723,8 @@ void md_do_sync(struct md_thread *thread) test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < mddev->curr_resync) + if (mddev->delta_disks >= 0 && + rdev_needs_recovery(rdev, mddev->curr_resync)) rdev->recovery_offset = mddev->curr_resync; rcu_read_unlock(); } @@ -9324,6 +9815,12 @@ static bool rdev_is_spare(struct md_rdev *rdev) static bool rdev_addable(struct md_rdev *rdev) { + struct mddev *mddev; + + mddev = READ_ONCE(rdev->mddev); + if (!mddev) + return false; + /* rdev is already used, don't add it again. */ if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || test_bit(Faulty, &rdev->flags)) @@ -9334,7 +9831,7 @@ static bool rdev_addable(struct md_rdev *rdev) return true; /* Allow to add if array is read-write. */ - if (md_is_rdwr(rdev->mddev)) + if (md_is_rdwr(mddev)) return true; /* @@ -9362,17 +9859,11 @@ static bool md_spares_need_change(struct mddev *mddev) return false; } -static int remove_and_add_spares(struct mddev *mddev, - struct md_rdev *this) +static int remove_spares(struct mddev *mddev, struct md_rdev *this) { struct md_rdev *rdev; - int spares = 0; int removed = 0; - if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - /* Mustn't remove devices when resync thread is running */ - return 0; - rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev_removeable(rdev) && !mddev->pers->hot_remove_disk(mddev, rdev)) { @@ -9386,6 +9877,21 @@ static int remove_and_add_spares(struct mddev *mddev, if (removed && mddev->kobj.sd) sysfs_notify_dirent_safe(mddev->sysfs_degraded); + return removed; +} + +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this) +{ + struct md_rdev *rdev; + int spares = 0; + int removed = 0; + + if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + /* Mustn't remove devices when resync thread is running */ + return 0; + + removed = remove_spares(mddev, this); if (this && removed) goto no_add; @@ -9423,6 +9929,16 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); + return true; + } + + /* Check if resync is in progress. */ + if (mddev->resync_offset < MaxSector) { + remove_spares(mddev, NULL); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); return true; } @@ -9432,7 +9948,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) * re-add. */ *spares = remove_and_add_spares(mddev, NULL); - if (*spares) { + if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -9442,13 +9958,6 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) return true; } - /* Check if recovery is in progress. */ - if (mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - return true; - } - /* Delay to choose resync/check/repair in md_do_sync(). */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) return true; @@ -9497,7 +10006,7 @@ static void md_start_sync(struct work_struct *ws) * We are adding a device or devices to an array which has the bitmap * stored on all devices. So make sure all bitmap pages get written. */ - if (spares) + if (spares && md_bitmap_enabled(mddev, true)) mddev->bitmap_ops->write_all(mddev); name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? @@ -9561,6 +10070,52 @@ static void unregister_sync_thread(struct mddev *mddev) md_reap_sync_thread(mddev); } +static bool md_should_do_recovery(struct mddev *mddev) +{ + /* + * As long as one of the following flags is set, + * recovery needs to do or cleanup. + */ + if (test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + return true; + + /* + * If no flags are set and it is in read-only status, + * there is nothing to do. + */ + if (!md_is_rdwr(mddev)) + return false; + + /* + * MD_SB_CHANGE_PENDING indicates that the array is switching from clean to + * active, and no action is needed for now. + * All other MD_SB_* flags require to update the superblock. + */ + if (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) + return true; + + /* + * If the array is not using external metadata and there has been no data + * written for some time, then the array's status needs to be set to + * in_sync. + */ + if (mddev->external == 0 && mddev->safemode == 1) + return true; + + /* + * When the system is about to restart or the process receives an signal, + * the array needs to be synchronized as soon as possible. + * Once the data synchronization is completed, need to change the array + * status to in_sync. + */ + if (mddev->safemode == 2 && !mddev->in_sync && + mddev->resync_offset == MaxSector) + return true; + + return false; +} + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -9585,7 +10140,7 @@ static void unregister_sync_thread(struct mddev *mddev) */ void md_check_recovery(struct mddev *mddev) { - if (mddev->bitmap) + if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work) mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { @@ -9597,18 +10152,7 @@ void md_check_recovery(struct mddev *mddev) flush_signals(current); } - if (!md_is_rdwr(mddev) && - !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && - !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) - return; - if ( ! ( - (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || - test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || - test_bit(MD_RECOVERY_DONE, &mddev->recovery) || - (mddev->external == 0 && mddev->safemode == 1) || - (mddev->safemode == 2 - && !mddev->in_sync && mddev->recovery_cp == MaxSector) - )) + if (!md_should_do_recovery(mddev)) return; if (mddev_trylock(mddev)) { @@ -9652,6 +10196,7 @@ void md_check_recovery(struct mddev *mddev) } clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); @@ -9664,8 +10209,8 @@ void md_check_recovery(struct mddev *mddev) * remove disk. */ rdev_for_each_safe(rdev, tmp, mddev) { - if (test_and_clear_bit(ClusterRemove, &rdev->flags) && - rdev->raid_disk < 0) + if (rdev->raid_disk < 0 && + test_and_clear_bit(ClusterRemove, &rdev->flags)) md_kick_rdev_from_array(rdev); } } @@ -9755,21 +10300,22 @@ void md_reap_sync_thread(struct mddev *mddev) * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by * clustered raid */ if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) - md_cluster_ops->resync_finish(mddev); + mddev->cluster_ops->resync_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); /* - * We call md_cluster_ops->update_size here because sync_size could + * We call mddev->cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, * so it is time to update size across cluster. */ if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9807,12 +10353,11 @@ EXPORT_SYMBOL(md_finish_reshape); /* Bad block management */ -/* Returns 1 on success, 0 on failure */ -int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +/* Returns true on success, false on failure */ +bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { struct mddev *mddev = rdev->mddev; - int rv; /* * Recording new badblocks for faulty rdev will force unnecessary @@ -9822,50 +10367,50 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, * avoid it. */ if (test_bit(Faulty, &rdev->flags)) - return 1; + return true; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_set(&rdev->badblocks, s, sectors, 0); - if (rv == 0) { - /* Make sure they get written out promptly */ - if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); - sysfs_notify_dirent_safe(rdev->sysfs_state); - set_mask_bits(&mddev->sb_flags, 0, - BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); - md_wakeup_thread(rdev->mddev->thread); - return 1; - } else - return 0; + + if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) + return false; + + /* Make sure they get written out promptly */ + if (test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); + md_wakeup_thread(rdev->mddev->thread); + return true; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); -int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { - int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_clear(&rdev->badblocks, s, sectors); - if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) + + if (!badblocks_clear(&rdev->badblocks, s, sectors)) + return; + + if (test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_badblocks); - return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { - struct mddev *mddev, *n; - int need_delay = 0; + struct mddev *mddev; spin_lock(&all_mddevs_lock); - list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); @@ -9876,21 +10421,11 @@ static int md_notify_reboot(struct notifier_block *this, mddev->safemode = 2; mddev_unlock(mddev); } - need_delay = 1; - mddev_put(mddev); spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } spin_unlock(&all_mddevs_lock); - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ... - */ - if (need_delay) - msleep(1000); - return NOTIFY_DONE; } @@ -9909,8 +10444,16 @@ static void md_geninit(void) static int __init md_init(void) { - int ret = -ENOMEM; + int ret = md_bitmap_init(); + + if (ret) + return ret; + + ret = md_llbitmap_init(); + if (ret) + goto err_bitmap; + ret = -ENOMEM; md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) goto err_wq; @@ -9919,11 +10462,6 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, - 0); - if (!md_bitmap_wq) - goto err_bitmap_wq; - ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; @@ -9942,12 +10480,13 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_bitmap_wq); -err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); err_wq: + md_llbitmap_exit(); +err_bitmap: + md_bitmap_exit(); return ret; } @@ -9965,14 +10504,17 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); if (ret) pr_info("md-cluster: resize failed\n"); - else + else if (md_bitmap_enabled(mddev, false)) mddev->bitmap_ops->update_sb(mddev->bitmap); } /* Check for change of roles in the active devices */ rdev_for_each_safe(rdev2, tmp, mddev) { - if (test_bit(Faulty, &rdev2->flags)) + if (test_bit(Faulty, &rdev2->flags)) { + if (test_bit(ClusterRemove, &rdev2->flags)) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); continue; + } /* Check if the roles changed */ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); @@ -9995,7 +10537,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && - !md_cluster_ops->resync_status_get(mddev)) { + !mddev->cluster_ops->resync_status_get(mddev)) { /* * -1 to make raid1_add_disk() set conf->fullsync * to 1. This could avoid skipping sync when the @@ -10211,7 +10753,7 @@ void md_autostart_arrays(int part) static __exit void md_exit(void) { - struct mddev *mddev, *n; + struct mddev *mddev; int delay = 1; unregister_blkdev(MD_MAJOR,"md"); @@ -10232,7 +10774,7 @@ static __exit void md_exit(void) remove_proc_entry("mdstat", NULL); spin_lock(&all_mddevs_lock); - list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); @@ -10244,14 +10786,14 @@ static __exit void md_exit(void) * the mddev for destruction by a workqueue, and the * destroy_workqueue() below will wait for that to complete. */ - mddev_put(mddev); spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); - destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); + md_bitmap_exit(); } subsys_initcall(md_init); @@ -10270,6 +10812,8 @@ module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); module_param(create_on_open, bool, S_IRUSR|S_IWUSR); +module_param(legacy_async_del_gendisk, bool, 0600); +module_param(check_new_feature, bool, 0600); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); |
