diff options
Diffstat (limited to 'drivers/md/md.c')
| -rw-r--r-- | drivers/md/md.c | 8916 |
1 files changed, 5518 insertions, 3398 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index dddc87bcf64a..e5922a682953 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* md.c : Multiple Devices driver for Linux - Copyright (C) 1998, 1999, 2000 Ingo Molnar + Copyright (C) 1998, 1999, 2000 Ingo Molnar completely rewritten, based on the MD driver code from Marc Zyngier @@ -22,18 +23,26 @@ - persistent bitmap code Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + Errors, Warnings, etc. + Please use: + pr_crit() for error conditions that risk data loss + pr_err() for error conditions that are unexpected, like an IO error + or internal inconsistency + pr_warn() for error conditions that could have been predicated, like + adding a device to an array when it has incompatible metadata + pr_info() for every interesting, very rare events, like an array starting + or stopping, or resync starting or stopping + pr_debug() for everything else. + */ +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> #include <linux/kthread.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> +#include <linux/badblocks.h> #include <linux/sysctl.h> #include <linux/seq_file.h> #include <linux/fs.h> @@ -43,6 +52,7 @@ #include <linux/hdreg.h> #include <linux/proc_fs.h> #include <linux/random.h> +#include <linux/major.h> #include <linux/module.h> #include <linux/reboot.h> #include <linux/file.h> @@ -50,32 +60,46 @@ #include <linux/delay.h> #include <linux/raid/md_p.h> #include <linux/raid/md_u.h> +#include <linux/raid/detect.h> #include <linux/slab.h> -#include "md.h" -#include "bitmap.h" +#include <linux/percpu-refcount.h> +#include <linux/part_stat.h> -#ifndef MODULE -static void autostart_arrays(int part); -#endif +#include "md.h" +#include "md-bitmap.h" +#include "md-cluster.h" + +static const char *action_name[NR_SYNC_ACTIONS] = { + [ACTION_RESYNC] = "resync", + [ACTION_RECOVER] = "recover", + [ACTION_CHECK] = "check", + [ACTION_REPAIR] = "repair", + [ACTION_RESHAPE] = "reshape", + [ACTION_FROZEN] = "frozen", + [ACTION_IDLE] = "idle", +}; -/* pers_list is a list of registered personalities protected - * by pers_lock. - * pers_lock does extra service to protect accesses to - * mddev->thread when the mutex cannot be held. - */ -static LIST_HEAD(pers_list); -static DEFINE_SPINLOCK(pers_lock); +static DEFINE_XARRAY(md_submodule); -static void md_print_devices(void); +static const struct kobj_type md_ktype; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; + +/* + * This workqueue is used for sync_work to register new sync_thread, and for + * del_work to remove rdev, and for event_work that is only set by dm-raid. + * + * Noted that sync_work will grab reconfig_mutex, hence never flush this + * workqueue whith reconfig_mutex grabbed. + */ static struct workqueue_struct *md_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); - -#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } +static void mddev_detach(struct mddev *mddev); +static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); +static void md_wakeup_thread_directly(struct md_thread __rcu **thread); /* * Default number of read corrections we'll attempt on an rdev @@ -83,145 +107,239 @@ static int remove_and_add_spares(struct mddev *mddev, * count by 2 for every hour elapsed between read errors. */ #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 +/* Default safemode delay: 200 msec */ +#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' - * is 1000 KB/sec, so the extra system load does not show up that much. - * Increase it if you want to have more _guaranteed_ speed. Note that - * the RAID driver will use the maximum available bandwidth if the IO - * subsystem is idle. There is also an 'absolute maximum' reconstruction - * speed limit - in case reconstruction slows down your system despite - * idle IO detection. + * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit' + * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load + * does not show up that much. Increase it if you want to have more guaranteed + * speed. Note that the RAID driver will use the maximum bandwidth + * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle. * - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. - * or /sys/block/mdX/md/sync_speed_{min,max} + * Background sync IO speed control: + * + * - below speed min: + * no limit; + * - above speed min and below speed max: + * a) if mddev is idle, then no limit; + * b) if mddev is busy handling normal IO, then limit inflight sync IO + * to sync_io_depth; + * - above speed max: + * sync IO can't be issued; + * + * Following configurations can be changed via /proc/sys/dev/raid/ for system + * or /sys/block/mdX/md/ for one array. */ - static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; -static inline int speed_min(struct mddev *mddev) +static int sysctl_sync_io_depth = 32; + +static int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } -static inline int speed_max(struct mddev *mddev) +static int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? mddev->sync_speed_max : sysctl_speed_limit_max; } -static struct ctl_table_header *raid_table_header; +static int sync_io_depth(struct mddev *mddev) +{ + return mddev->sync_io_depth ? + mddev->sync_io_depth : sysctl_sync_io_depth; +} -static ctl_table raid_table[] = { - { - .procname = "speed_limit_min", - .data = &sysctl_speed_limit_min, - .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, - .proc_handler = proc_dointvec, - }, - { - .procname = "speed_limit_max", - .data = &sysctl_speed_limit_max, - .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, - .proc_handler = proc_dointvec, - }, - { } -}; +static void rdev_uninit_serial(struct md_rdev *rdev) +{ + if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) + return; -static ctl_table raid_dir_table[] = { - { - .procname = "raid", - .maxlen = 0, - .mode = S_IRUGO|S_IXUGO, - .child = raid_table, - }, - { } -}; + kvfree(rdev->serial); + rdev->serial = NULL; +} -static ctl_table raid_root_table[] = { - { - .procname = "dev", - .maxlen = 0, - .mode = 0555, - .child = raid_dir_table, - }, - { } -}; +static void rdevs_uninit_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; -static const struct block_device_operations md_fops; + rdev_for_each(rdev, mddev) + rdev_uninit_serial(rdev); +} -static int start_readonly; +static int rdev_init_serial(struct md_rdev *rdev) +{ + /* serial_nums equals with BARRIER_BUCKETS_NR */ + int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); + struct serial_in_rdev *serial = NULL; -/* bio_clone_mddev - * like bio_clone, but with a local bio set - */ + if (test_bit(CollisionCheck, &rdev->flags)) + return 0; -struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, - struct mddev *mddev) -{ - struct bio *b; + serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, + GFP_KERNEL); + if (!serial) + return -ENOMEM; - if (!mddev || !mddev->bio_set) - return bio_alloc(gfp_mask, nr_iovecs); + for (i = 0; i < serial_nums; i++) { + struct serial_in_rdev *serial_tmp = &serial[i]; - b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set); - if (!b) - return NULL; - return b; + spin_lock_init(&serial_tmp->serial_lock); + serial_tmp->serial_rb = RB_ROOT_CACHED; + init_waitqueue_head(&serial_tmp->serial_io_wait); + } + + rdev->serial = serial; + set_bit(CollisionCheck, &rdev->flags); + + return 0; } -EXPORT_SYMBOL_GPL(bio_alloc_mddev); -struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev) +static int rdevs_init_serial(struct mddev *mddev) { - if (!mddev || !mddev->bio_set) - return bio_clone(bio, gfp_mask); + struct md_rdev *rdev; + int ret = 0; + + rdev_for_each(rdev, mddev) { + ret = rdev_init_serial(rdev); + if (ret) + break; + } + + /* Free all resources if pool is not existed */ + if (ret && !mddev->serial_info_pool) + rdevs_uninit_serial(mddev); - return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); + return ret; } -EXPORT_SYMBOL_GPL(bio_clone_mddev); -void md_trim_bio(struct bio *bio, int offset, int size) +/* + * rdev needs to enable serial stuffs if it meets the conditions: + * 1. it is multi-queue device flaged with writemostly. + * 2. the write-behind mode is enabled. + */ +static int rdev_need_serial(struct md_rdev *rdev) { - /* 'bio' is a cloned bio which we need to trim to match - * the given offset and size. - * This requires adjusting bi_sector, bi_size, and bi_io_vec - */ - int i; - struct bio_vec *bvec; - int sofar = 0; + return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && + rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && + test_bit(WriteMostly, &rdev->flags)); +} + +/* + * Init resource for rdev(s), then create serial_info_pool if: + * 1. rdev is the first device which return true from rdev_enable_serial. + * 2. rdev is NULL, means we want to enable serialization for all rdevs. + */ +void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) +{ + int ret = 0; - size <<= 9; - if (offset == 0 && size == bio->bi_size) + if (rdev && !rdev_need_serial(rdev) && + !test_bit(CollisionCheck, &rdev->flags)) return; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); + if (!rdev) + ret = rdevs_init_serial(mddev); + else + ret = rdev_init_serial(rdev); + if (ret) + return; - bio_advance(bio, offset << 9); + if (mddev->serial_info_pool == NULL) { + /* + * already in memalloc noio context by + * mddev_suspend() + */ + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); + if (!mddev->serial_info_pool) { + rdevs_uninit_serial(mddev); + pr_err("can't alloc memory pool for serialization\n"); + } + } +} - bio->bi_size = size; +/* + * Free resource from rdev(s), and destroy serial_info_pool under conditions: + * 1. rdev is the last device flaged with CollisionCheck. + * 2. when bitmap is destroyed while policy is not enabled. + * 3. for disable policy, the pool is destroyed only when no rdev needs it. + */ +void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) +{ + if (rdev && !test_bit(CollisionCheck, &rdev->flags)) + return; - /* avoid any complications with bi_idx being non-zero*/ - if (bio->bi_idx) { - memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, - (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); - bio->bi_vcnt -= bio->bi_idx; - bio->bi_idx = 0; - } - /* Make sure vcnt and last bv are not too big */ - bio_for_each_segment(bvec, bio, i) { - if (sofar + bvec->bv_len > size) - bvec->bv_len = size - sofar; - if (bvec->bv_len == 0) { - bio->bi_vcnt = i; - break; + if (mddev->serial_info_pool) { + struct md_rdev *temp; + int num = 0; /* used to track if other rdevs need the pool */ + + rdev_for_each(temp, mddev) { + if (!rdev) { + if (!mddev->serialize_policy || + !rdev_need_serial(temp)) + rdev_uninit_serial(temp); + else + num++; + } else if (temp != rdev && + test_bit(CollisionCheck, &temp->flags)) + num++; + } + + if (rdev) + rdev_uninit_serial(rdev); + + if (num) + pr_info("The mempool could be used by other devices\n"); + else { + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; } - sofar += bvec->bv_len; } } -EXPORT_SYMBOL_GPL(md_trim_bio); + +static struct ctl_table_header *raid_table_header; + +static const struct ctl_table raid_table[] = { + { + .procname = "speed_limit_min", + .data = &sysctl_speed_limit_min, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "speed_limit_max", + .data = &sysctl_speed_limit_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_io_depth", + .data = &sysctl_sync_io_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int start_readonly; + +/* + * The original mechanism for creating an md device is to create + * a device node in /dev and to open it. This causes races with device-close. + * The preferred method is to write to the "new_array" module parameter. + * This can avoid races. + * Setting create_on_open to false disables the original mechanism + * so all the races disappear. + */ +static bool create_on_open = true; +static bool legacy_async_del_gendisk = true; +static bool check_new_feature = true; /* * We have a system wide 'event count' that is incremented @@ -235,22 +353,13 @@ EXPORT_SYMBOL_GPL(md_trim_bio); */ static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); static atomic_t md_event_count; -void md_new_event(struct mddev *mddev) +void md_new_event(void) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); } EXPORT_SYMBOL_GPL(md_new_event); -/* Alternate version that can be called from interrupts - * when calling sysfs_notify isn't needed. - */ -static void md_new_event_inintr(struct mddev *mddev) -{ - atomic_inc(&md_event_count); - wake_up(&md_event_waiters); -} - /* * Enables to iterate over all existing md arrays * all_mddevs_lock protects this list. @@ -258,30 +367,10 @@ static void md_new_event_inintr(struct mddev *mddev) static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); - -/* - * iterates through all used mddevs in the system. - * We take care to grab the all_mddevs_lock whenever navigating - * the list, and to always hold a refcount when unlocked. - * Any code which breaks out of this loop while own - * a reference to the current mddev and must mddev_put it. - */ -#define for_each_mddev(_mddev,_tmp) \ - \ - for (({ spin_lock(&all_mddevs_lock); \ - _tmp = all_mddevs.next; \ - _mddev = NULL;}); \ - ({ if (_tmp != &all_mddevs) \ - mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ - spin_unlock(&all_mddevs_lock); \ - if (_mddev) mddev_put(_mddev); \ - _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ - _tmp != &all_mddevs;}); \ - ({ spin_lock(&all_mddevs_lock); \ - _tmp = _tmp->next;}) \ - ) - - +static bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -289,338 +378,541 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ -static void md_make_request(struct request_queue *q, struct bio *bio) +static bool is_suspended(struct mddev *mddev, struct bio *bio) { - const int rw = bio_data_dir(bio); - struct mddev *mddev = q->queuedata; - int cpu; - unsigned int sectors; + if (is_md_suspended(mddev)) + return true; + if (bio_data_dir(bio) != WRITE) + return false; + if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) + return false; + if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) + return false; + if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) + return false; + return true; +} - if (mddev == NULL || mddev->pers == NULL - || !mddev->ready) { - bio_io_error(bio); - return; - } - if (mddev->ro == 1 && unlikely(rw == WRITE)) { - bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); - return; - } - smp_rmb(); /* Ensure implications of 'active' are visible */ - rcu_read_lock(); - if (mddev->suspended) { +bool md_handle_request(struct mddev *mddev, struct bio *bio) +{ +check_suspended: + if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); + /* Bail out if REQ_NOWAIT is set for the bio */ + if (bio->bi_opf & REQ_NOWAIT) { + bio_wouldblock_error(bio); + return true; + } for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) + if (!is_suspended(mddev, bio)) break; - rcu_read_unlock(); schedule(); - rcu_read_lock(); } finish_wait(&mddev->sb_wait, &__wait); } - atomic_inc(&mddev->active_io); - rcu_read_unlock(); + if (!percpu_ref_tryget_live(&mddev->active_io)) + goto check_suspended; - /* - * save the sectors now since our bio can - * go away inside make_request - */ - sectors = bio_sectors(bio); - mddev->pers->make_request(mddev, bio); + if (!mddev->pers->make_request(mddev, bio)) { + percpu_ref_put(&mddev->active_io); + if (!mddev->gendisk && mddev->pers->prepare_suspend) + return false; + goto check_suspended; + } + + percpu_ref_put(&mddev->active_io); + return true; +} +EXPORT_SYMBOL(md_handle_request); - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); - part_stat_unlock(); +static void md_submit_bio(struct bio *bio) +{ + const int rw = bio_data_dir(bio); + struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; - if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) - wake_up(&mddev->sb_wait); + if (mddev == NULL || mddev->pers == NULL) { + bio_io_error(bio); + return; + } + + if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { + bio_io_error(bio); + return; + } + + bio = bio_split_to_limits(bio); + if (!bio) + return; + + if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { + if (bio_sectors(bio) != 0) + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return; + } + + /* bio could be mergeable after passing to underlayer */ + bio->bi_opf &= ~REQ_NOMERGE; + + md_handle_request(mddev, bio); } -/* mddev_suspend makes sure no new requests are submitted - * to the device, and that any requests that have been submitted - * are completely handled. - * Once ->stop is called and completes, the module will be completely - * unused. +/* + * Make sure no new requests are submitted to the device, and any requests that + * have been submitted are completely handled. */ -void mddev_suspend(struct mddev *mddev) +int mddev_suspend(struct mddev *mddev, bool interruptible) { - BUG_ON(mddev->suspended); - mddev->suspended = 1; - synchronize_rcu(); - wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); - mddev->pers->quiesce(mddev, 1); + int err = 0; + + /* + * hold reconfig_mutex to wait for normal io will deadlock, because + * other context can't update super_block, and normal io can rely on + * updating super_block. + */ + lockdep_assert_not_held(&mddev->reconfig_mutex); + + if (interruptible) + err = mutex_lock_interruptible(&mddev->suspend_mutex); + else + mutex_lock(&mddev->suspend_mutex); + if (err) + return err; + + if (mddev->suspended) { + WRITE_ONCE(mddev->suspended, mddev->suspended + 1); + mutex_unlock(&mddev->suspend_mutex); + return 0; + } + + percpu_ref_kill(&mddev->active_io); + if (interruptible) + err = wait_event_interruptible(mddev->sb_wait, + percpu_ref_is_zero(&mddev->active_io)); + else + wait_event(mddev->sb_wait, + percpu_ref_is_zero(&mddev->active_io)); + if (err) { + percpu_ref_resurrect(&mddev->active_io); + mutex_unlock(&mddev->suspend_mutex); + return err; + } + + /* + * For raid456, io might be waiting for reshape to make progress, + * allow new reshape to start while waiting for io to be done to + * prevent deadlock. + */ + WRITE_ONCE(mddev->suspended, mddev->suspended + 1); + + /* restrict memory reclaim I/O during raid array is suspend */ + mddev->noio_flag = memalloc_noio_save(); - del_timer_sync(&mddev->safemode_timer); + mutex_unlock(&mddev->suspend_mutex); + return 0; } EXPORT_SYMBOL_GPL(mddev_suspend); -void mddev_resume(struct mddev *mddev) +static void __mddev_resume(struct mddev *mddev, bool recovery_needed) { - mddev->suspended = 0; + lockdep_assert_not_held(&mddev->reconfig_mutex); + + mutex_lock(&mddev->suspend_mutex); + WRITE_ONCE(mddev->suspended, mddev->suspended - 1); + if (mddev->suspended) { + mutex_unlock(&mddev->suspend_mutex); + return; + } + + /* entred the memalloc scope from mddev_suspend() */ + memalloc_noio_restore(mddev->noio_flag); + + percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 0); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (recovery_needed) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + + mutex_unlock(&mddev->suspend_mutex); +} + +void mddev_resume(struct mddev *mddev) +{ + return __mddev_resume(mddev, true); } EXPORT_SYMBOL_GPL(mddev_resume); -int mddev_congested(struct mddev *mddev, int bits) +/* sync bdev before setting device to readonly or stopping raid*/ +static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) { - return mddev->suspended; + mutex_lock(&mddev->open_mutex); + if (mddev->pers && atomic_read(&mddev->openers) > opener_num) { + mutex_unlock(&mddev->open_mutex); + return -EBUSY; + } + if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { + mutex_unlock(&mddev->open_mutex); + return -EBUSY; + } + mutex_unlock(&mddev->open_mutex); + + sync_blockdev(mddev->gendisk->part0); + return 0; } -EXPORT_SYMBOL(mddev_congested); /* - * Generic flush handling for md + * The only difference from bio_chain_endio() is that the current + * bi_status of bio does not affect the bi_status of parent. */ - -static void md_end_flush(struct bio *bio, int err) +static void md_end_flush(struct bio *bio) { - struct md_rdev *rdev = bio->bi_private; - struct mddev *mddev = rdev->mddev; + struct bio *parent = bio->bi_private; - rdev_dec_pending(rdev, mddev); + /* + * If any flush io error before the power failure, + * disk data may be lost. + */ + if (bio->bi_status) + pr_err("md: %pg flush io error %d\n", bio->bi_bdev, + blk_status_to_errno(bio->bi_status)); - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pre-request flush has finished */ - queue_work(md_wq, &mddev->flush_work); - } bio_put(bio); + bio_endio(parent); } -static void md_submit_flush_data(struct work_struct *ws); - -static void submit_flushes(struct work_struct *ws) +bool md_flush_request(struct mddev *mddev, struct bio *bio) { - struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct md_rdev *rdev; + struct bio *new; - INIT_WORK(&mddev->flush_work, md_submit_flush_data); - atomic_set(&mddev->flush_pending, 1); - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Faulty, &rdev->flags)) { - /* Take two references, one is dropped - * when request finishes, one after - * we reclaim rcu_read_lock - */ - struct bio *bi; - atomic_inc(&rdev->nr_pending); - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); - bi->bi_end_io = md_end_flush; - bi->bi_private = rdev; - bi->bi_bdev = rdev->bdev; - atomic_inc(&mddev->flush_pending); - submit_bio(WRITE_FLUSH, bi); - rcu_read_lock(); - rdev_dec_pending(rdev, mddev); - } - rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) - queue_work(md_wq, &mddev->flush_work); -} + /* + * md_flush_reqeust() should be called under md_handle_request() and + * 'active_io' is already grabbed. Hence it's safe to get rdev directly + * without rcu protection. + */ + WARN_ON(percpu_ref_is_zero(&mddev->active_io)); -static void md_submit_flush_data(struct work_struct *ws) -{ - struct mddev *mddev = container_of(ws, struct mddev, flush_work); - struct bio *bio = mddev->flush_bio; + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; - if (bio->bi_size == 0) - /* an empty barrier - all done */ - bio_endio(bio, 0); - else { - bio->bi_rw &= ~REQ_FLUSH; - mddev->pers->make_request(mddev, bio); + new = bio_alloc_bioset(rdev->bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, + &mddev->bio_set); + new->bi_private = bio; + new->bi_end_io = md_end_flush; + bio_inc_remaining(bio); + submit_bio(new); } - mddev->flush_bio = NULL; - wake_up(&mddev->sb_wait); -} - -void md_flush_request(struct mddev *mddev, struct bio *bio) -{ - spin_lock_irq(&mddev->write_lock); - wait_event_lock_irq(mddev->sb_wait, - !mddev->flush_bio, - mddev->write_lock); - mddev->flush_bio = bio; - spin_unlock_irq(&mddev->write_lock); + if (bio_sectors(bio) == 0) { + bio_endio(bio); + return true; + } - INIT_WORK(&mddev->flush_work, submit_flushes); - queue_work(md_wq, &mddev->flush_work); + bio->bi_opf &= ~REQ_PREFLUSH; + return false; } EXPORT_SYMBOL(md_flush_request); -void md_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct mddev *mddev = cb->data; - md_wakeup_thread(mddev->thread); - kfree(cb); -} -EXPORT_SYMBOL(md_unplug); - static inline struct mddev *mddev_get(struct mddev *mddev) { + lockdep_assert_held(&all_mddevs_lock); + + if (test_bit(MD_DELETED, &mddev->flags)) + return NULL; atomic_inc(&mddev->active); return mddev; } static void mddev_delayed_delete(struct work_struct *ws); -static void mddev_put(struct mddev *mddev) +static void __mddev_put(struct mddev *mddev) +{ + if (mddev->raid_disks || !list_empty(&mddev->disks) || + mddev->ctime || mddev->hold_active) + return; + + /* + * If array is freed by stopping array, MD_DELETED is set by + * do_md_stop(), MD_DELETED is still set here in case mddev is freed + * directly by closing a mddev that is created by create_on_open. + */ + set_bit(MD_DELETED, &mddev->flags); + /* + * Call queue_work inside the spinlock so that flush_workqueue() after + * mddev_find will succeed in waiting for the work to be done. + */ + queue_work(md_misc_wq, &mddev->del_work); +} + +static void mddev_put_locked(struct mddev *mddev) { - struct bio_set *bs = NULL; + if (atomic_dec_and_test(&mddev->active)) + __mddev_put(mddev); +} +void mddev_put(struct mddev *mddev) +{ if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; - if (!mddev->raid_disks && list_empty(&mddev->disks) && - mddev->ctime == 0 && !mddev->hold_active) { - /* Array is not configured at all, and not held active, - * so destroy it */ - list_del_init(&mddev->all_mddevs); - bs = mddev->bio_set; - mddev->bio_set = NULL; - if (mddev->gendisk) { - /* We did a probe so need to clean up. Call - * queue_work inside the spinlock so that - * flush_workqueue() after mddev_find will - * succeed in waiting for the work to be done. + + __mddev_put(mddev); + spin_unlock(&all_mddevs_lock); +} + +static void md_safemode_timeout(struct timer_list *t); +static void md_start_sync(struct work_struct *ws); + +static void active_io_release(struct percpu_ref *ref) +{ + struct mddev *mddev = container_of(ref, struct mddev, active_io); + + wake_up(&mddev->sb_wait); +} + +static void no_op(struct percpu_ref *r) {} + +static bool mddev_set_bitmap_ops(struct mddev *mddev) +{ + struct bitmap_operations *old = mddev->bitmap_ops; + struct md_submodule_head *head; + + if (mddev->bitmap_id == ID_BITMAP_NONE || + (old && old->head.id == mddev->bitmap_id)) + return true; + + xa_lock(&md_submodule); + head = xa_load(&md_submodule, mddev->bitmap_id); + + if (!head) { + pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + if (head->type != MD_BITMAP) { + pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + mddev->bitmap_ops = (void *)head; + xa_unlock(&md_submodule); + + if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) { + if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) + pr_warn("md: cannot register extra bitmap attributes for %s\n", + mdname(mddev)); + else + /* + * Inform user with KOBJ_CHANGE about new bitmap + * attributes. */ - INIT_WORK(&mddev->del_work, mddev_delayed_delete); - queue_work(md_misc_wq, &mddev->del_work); - } else - kfree(mddev); + kobject_uevent(&mddev->kobj, KOBJ_CHANGE); } - spin_unlock(&all_mddevs_lock); - if (bs) - bioset_free(bs); + return true; + +err: + xa_unlock(&md_submodule); + return false; } -void mddev_init(struct mddev *mddev) +static void mddev_clear_bitmap_ops(struct mddev *mddev) { + if (!mddev_is_dm(mddev) && mddev->bitmap_ops && + mddev->bitmap_ops->group) + sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group); + + mddev->bitmap_ops = NULL; +} + +int mddev_init(struct mddev *mddev) +{ + int err = 0; + + if (!IS_ENABLED(CONFIG_MD_BITMAP)) + mddev->bitmap_id = ID_BITMAP_NONE; + else + mddev->bitmap_id = ID_BITMAP; + + if (percpu_ref_init(&mddev->active_io, active_io_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + return -ENOMEM; + + if (percpu_ref_init(&mddev->writes_pending, no_op, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + err = -ENOMEM; + goto exit_acitve_io; + } + + err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (err) + goto exit_writes_pending; + + err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (err) + goto exit_bio_set; + + err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, + offsetof(struct md_io_clone, bio_clone), 0); + if (err) + goto exit_sync_set; + + /* We want to start with the refcount at zero */ + percpu_ref_put(&mddev->writes_pending); + mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); - init_timer(&mddev->safemode_timer); + INIT_LIST_HEAD(&mddev->deleting); + timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); - atomic_set(&mddev->active_io, 0); - spin_lock_init(&mddev->write_lock); - atomic_set(&mddev->flush_pending, 0); + atomic_set(&mddev->sync_seq, 0); + spin_lock_init(&mddev->lock); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; mddev->reshape_backwards = 0; - mddev->last_sync_action = "none"; + mddev->last_sync_action = ACTION_IDLE; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; + + INIT_WORK(&mddev->sync_work, md_start_sync); + INIT_WORK(&mddev->del_work, mddev_delayed_delete); + + return 0; + +exit_sync_set: + bioset_exit(&mddev->sync_set); +exit_bio_set: + bioset_exit(&mddev->bio_set); +exit_writes_pending: + percpu_ref_exit(&mddev->writes_pending); +exit_acitve_io: + percpu_ref_exit(&mddev->active_io); + return err; } EXPORT_SYMBOL_GPL(mddev_init); -static struct mddev * mddev_find(dev_t unit) +void mddev_destroy(struct mddev *mddev) { - struct mddev *mddev, *new = NULL; + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); + bioset_exit(&mddev->io_clone_set); + percpu_ref_exit(&mddev->active_io); + percpu_ref_exit(&mddev->writes_pending); +} +EXPORT_SYMBOL_GPL(mddev_destroy); - if (unit && MAJOR(unit) != MD_MAJOR) - unit &= ~((1<<MdpMinorShift)-1); +static struct mddev *mddev_find_locked(dev_t unit) +{ + struct mddev *mddev; - retry: - spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == unit) + return mddev; - if (unit) { - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == unit) { - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - kfree(new); - return mddev; - } + return NULL; +} - if (new) { - list_add(&new->all_mddevs, &all_mddevs); - spin_unlock(&all_mddevs_lock); - new->hold_active = UNTIL_IOCTL; - return new; - } - } else if (new) { - /* find an unused unit number */ - static int next_minor = 512; - int start = next_minor; - int is_free = 0; - int dev = 0; - while (!is_free) { - dev = MKDEV(MD_MAJOR, next_minor); - next_minor++; - if (next_minor > MINORMASK) - next_minor = 0; - if (next_minor == start) { - /* Oh dear, all in use. */ - spin_unlock(&all_mddevs_lock); - kfree(new); - return NULL; - } - - is_free = 1; - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == dev) { - is_free = 0; - break; - } - } - new->unit = dev; - new->md_minor = MINOR(dev); - new->hold_active = UNTIL_STOP; - list_add(&new->all_mddevs, &all_mddevs); - spin_unlock(&all_mddevs_lock); - return new; +/* find an unused unit number */ +static dev_t mddev_alloc_unit(void) +{ + static int next_minor = 512; + int start = next_minor; + bool is_free = 0; + dev_t dev = 0; + + while (!is_free) { + dev = MKDEV(MD_MAJOR, next_minor); + next_minor++; + if (next_minor > MINORMASK) + next_minor = 0; + if (next_minor == start) + return 0; /* Oh dear, all in use. */ + is_free = !mddev_find_locked(dev); } - spin_unlock(&all_mddevs_lock); + + return dev; +} + +static struct mddev *mddev_alloc(dev_t unit) +{ + struct mddev *new; + int error; + + if (unit && MAJOR(unit) != MD_MAJOR) + unit &= ~((1 << MdpMinorShift) - 1); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) - return NULL; + return ERR_PTR(-ENOMEM); - new->unit = unit; - if (MAJOR(unit) == MD_MAJOR) - new->md_minor = MINOR(unit); - else - new->md_minor = MINOR(unit) >> MdpMinorShift; + error = mddev_init(new); + if (error) + goto out_free_new; - mddev_init(new); + spin_lock(&all_mddevs_lock); + if (unit) { + error = -EEXIST; + if (mddev_find_locked(unit)) + goto out_destroy_new; + new->unit = unit; + if (MAJOR(unit) == MD_MAJOR) + new->md_minor = MINOR(unit); + else + new->md_minor = MINOR(unit) >> MdpMinorShift; + new->hold_active = UNTIL_IOCTL; + } else { + error = -ENODEV; + new->unit = mddev_alloc_unit(); + if (!new->unit) + goto out_destroy_new; + new->md_minor = MINOR(new->unit); + new->hold_active = UNTIL_STOP; + } - goto retry; -} + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + return new; -static inline int mddev_lock(struct mddev * mddev) -{ - return mutex_lock_interruptible(&mddev->reconfig_mutex); +out_destroy_new: + spin_unlock(&all_mddevs_lock); + mddev_destroy(new); +out_free_new: + kfree(new); + return ERR_PTR(error); } -static inline int mddev_is_locked(struct mddev *mddev) +static void mddev_free(struct mddev *mddev) { - return mutex_is_locked(&mddev->reconfig_mutex); -} + spin_lock(&all_mddevs_lock); + list_del(&mddev->all_mddevs); + spin_unlock(&all_mddevs_lock); -static inline int mddev_trylock(struct mddev * mddev) -{ - return mutex_trylock(&mddev->reconfig_mutex); + mddev_destroy(mddev); + kfree(mddev); } -static struct attribute_group md_redundancy_group; +static const struct attribute_group md_redundancy_group; -static void mddev_unlock(struct mddev * mddev) +void mddev_unlock(struct mddev *mddev) { + struct md_rdev *rdev; + struct md_rdev *tmp; + LIST_HEAD(delete); + + if (!list_empty(&mddev->deleting)) + list_splice_init(&mddev->deleting, &delete); + if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as * an access to the files will try to take reconfig_mutex @@ -634,7 +926,7 @@ static void mddev_unlock(struct mddev * mddev) * test it under the same mutex to ensure its correct value * is seen. */ - struct attribute_group *to_remove = mddev->to_remove; + const struct attribute_group *to_remove = mddev->to_remove; mddev->to_remove = NULL; mddev->sysfs_active = 1; mutex_unlock(&mddev->reconfig_mutex); @@ -647,33 +939,47 @@ static void mddev_unlock(struct mddev * mddev) sysfs_remove_group(&mddev->kobj, &md_redundancy_group); if (mddev->sysfs_action) sysfs_put(mddev->sysfs_action); + if (mddev->sysfs_completed) + sysfs_put(mddev->sysfs_completed); + if (mddev->sysfs_degraded) + sysfs_put(mddev->sysfs_degraded); mddev->sysfs_action = NULL; + mddev->sysfs_completed = NULL; + mddev->sysfs_degraded = NULL; } } mddev->sysfs_active = 0; } else mutex_unlock(&mddev->reconfig_mutex); - /* As we've dropped the mutex we need a spinlock to - * make sure the thread doesn't disappear - */ - spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); - spin_unlock(&pers_lock); -} - -static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) -{ - struct md_rdev *rdev; + wake_up(&mddev->sb_wait); - rdev_for_each(rdev, mddev) - if (rdev->desc_nr == nr) - return rdev; + list_for_each_entry_safe(rdev, tmp, &delete, same_set) { + list_del_init(&rdev->same_set); + kobject_del(&rdev->kobj); + export_rdev(rdev, mddev); + } - return NULL; + if (!legacy_async_del_gendisk) { + /* + * Call del_gendisk after release reconfig_mutex to avoid + * deadlock (e.g. call del_gendisk under the lock and an + * access to sysfs files waits the lock) + * And MD_DELETED is only used for md raid which is set in + * do_md_stop. dm raid only uses md_stop to stop. So dm raid + * doesn't need to check MD_DELETED when getting reconfig lock + */ + if (test_bit(MD_DELETED, &mddev->flags) && + !test_and_set_bit(MD_DO_DELETE, &mddev->flags)) { + kobject_del(&mddev->kobj); + del_gendisk(mddev->gendisk); + } + } } +EXPORT_SYMBOL_GPL(mddev_unlock); -static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) +struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) { struct md_rdev *rdev; @@ -683,6 +989,7 @@ static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) return NULL; } +EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) { @@ -695,7 +1002,7 @@ static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) return NULL; } -static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) +struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; @@ -705,37 +1012,55 @@ static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) return NULL; } +EXPORT_SYMBOL_GPL(md_find_rdev_rcu); -static struct md_personality *find_pers(int level, char *clevel) +static struct md_personality *get_pers(int level, char *clevel) { - struct md_personality *pers; - list_for_each_entry(pers, &pers_list, list) { - if (level != LEVEL_NONE && pers->level == level) - return pers; - if (strcmp(pers->name, clevel)==0) - return pers; + struct md_personality *ret = NULL; + struct md_submodule_head *head; + unsigned long i; + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_PERSONALITY) + continue; + if ((level != LEVEL_NONE && head->id == level) || + !strcmp(head->name, clevel)) { + if (try_module_get(head->owner)) + ret = (void *)head; + break; + } } - return NULL; + xa_unlock(&md_submodule); + + if (!ret) { + if (level != LEVEL_NONE) + pr_warn("md: personality for level %d is not loaded!\n", + level); + else + pr_warn("md: personality for level %s is not loaded!\n", + clevel); + } + + return ret; +} + +static void put_pers(struct md_personality *pers) +{ + module_put(pers->head.owner); } /* return the offset of the super block in 512byte sectors */ static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) { - sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; - return MD_NEW_SIZE_SECTORS(num_sectors); + return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); } -static int alloc_disk_sb(struct md_rdev * rdev) +static int alloc_disk_sb(struct md_rdev *rdev) { - if (rdev->sb_page) - MD_BUG(); - rdev->sb_page = alloc_page(GFP_KERNEL); - if (!rdev->sb_page) { - printk(KERN_ALERT "md: out of memory.\n"); + if (!rdev->sb_page) return -ENOMEM; - } - return 0; } @@ -752,130 +1077,146 @@ void md_rdev_clear(struct md_rdev *rdev) put_page(rdev->bb_page); rdev->bb_page = NULL; } - kfree(rdev->badblocks.page); - rdev->badblocks.page = NULL; + badblocks_exit(&rdev->badblocks); } EXPORT_SYMBOL_GPL(md_rdev_clear); -static void super_written(struct bio *bio, int error) +static void super_written(struct bio *bio) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; - if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { - printk("md: super_written gets error=%d, uptodate=%d\n", - error, test_bit(BIO_UPTODATE, &bio->bi_flags)); - WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); + if (bio->bi_status) { + pr_err("md: %s gets error=%d\n", __func__, + blk_status_to_errno(bio->bi_status)); md_error(mddev, rdev); - } + if (!test_bit(Faulty, &rdev->flags) + && (bio->bi_opf & MD_FAILFAST)) { + set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); + set_bit(LastDev, &rdev->flags); + } + } else + clear_bit(LastDev, &rdev->flags); + + bio_put(bio); + + rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); - bio_put(bio); } -void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page) +/** + * md_write_metadata - write metadata to underlying disk, including + * array superblock, badblocks, bitmap superblock and bitmap bits. + * @mddev: the array to write + * @rdev: the underlying disk to write + * @sector: the offset to @rdev + * @size: the length of the metadata + * @page: the metadata + * @offset: the offset to @page + * + * Write @size bytes of @page start from @offset, to @sector of @rdev, Increment + * mddev->pending_writes before returning, and decrement it on completion, + * waking up sb_wait. Caller must call md_super_wait() after issuing io to all + * rdev. If an error occurred, md_error() will be called, and the @rdev will be + * kicked out from @mddev. + */ +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page, + unsigned int offset) { - /* write first size bytes of page to sector of rdev - * Increment mddev->pending_writes before returning - * and decrement it on completion, waking up sb_wait - * if zero is reached. - * If an error occurred, call md_error - */ - struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); + struct bio *bio; + + if (!page) + return; + + if (test_bit(Faulty, &rdev->flags)) + return; + + bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, + 1, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META + | REQ_PREFLUSH | REQ_FUA, + GFP_NOIO, &mddev->sync_set); + + atomic_inc(&rdev->nr_pending); - bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; - bio->bi_sector = sector; - bio_add_page(bio, page, size, 0); + bio->bi_iter.bi_sector = sector; + __bio_add_page(bio, page, size, offset); bio->bi_private = rdev; bio->bi_end_io = super_written; + if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && + test_bit(FailFast, &rdev->flags) && + !test_bit(LastDev, &rdev->flags)) + bio->bi_opf |= MD_FAILFAST; + atomic_inc(&mddev->pending_writes); - submit_bio(WRITE_FLUSH_FUA, bio); + submit_bio(bio); } -void md_super_wait(struct mddev *mddev) +int md_super_wait(struct mddev *mddev) { /* wait for all superblock writes that were scheduled to complete */ - DEFINE_WAIT(wq); - for(;;) { - prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); - if (atomic_read(&mddev->pending_writes)==0) - break; - schedule(); - } - finish_wait(&mddev->sb_wait, &wq); -} - -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio->bi_private); + wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); + if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) + return -EAGAIN; + return 0; } int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, - struct page *page, int rw, bool metadata_op) + struct page *page, blk_opf_t opf, bool metadata_op) { - struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); - struct completion event; - int ret; + struct bio bio; + struct bio_vec bvec; - rw |= REQ_SYNC; + if (metadata_op && rdev->meta_bdev) + bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); + else + bio_init(&bio, rdev->bdev, &bvec, 1, opf); - bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? - rdev->meta_bdev : rdev->bdev; if (metadata_op) - bio->bi_sector = sector + rdev->sb_start; + bio.bi_iter.bi_sector = sector + rdev->sb_start; else if (rdev->mddev->reshape_position != MaxSector && (rdev->mddev->reshape_backwards == (sector >= rdev->mddev->reshape_position))) - bio->bi_sector = sector + rdev->new_data_offset; + bio.bi_iter.bi_sector = sector + rdev->new_data_offset; else - bio->bi_sector = sector + rdev->data_offset; - bio_add_page(bio, page, size, 0); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - ret = test_bit(BIO_UPTODATE, &bio->bi_flags); - bio_put(bio); - return ret; + bio.bi_iter.bi_sector = sector + rdev->data_offset; + __bio_add_page(&bio, page, size, 0); + + submit_bio_wait(&bio); + + return !bio.bi_status; } EXPORT_SYMBOL_GPL(sync_page_io); -static int read_disk_sb(struct md_rdev * rdev, int size) +static int read_disk_sb(struct md_rdev *rdev, int size) { - char b[BDEVNAME_SIZE]; - if (!rdev->sb_page) { - MD_BUG(); - return -EINVAL; - } if (rdev->sb_loaded) return 0; - - if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) + if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) goto fail; rdev->sb_loaded = 1; return 0; fail: - printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", - bdevname(rdev->bdev,b)); + pr_err("md: disabled device %pg, could not read superblock.\n", + rdev->bdev); return -EINVAL; } -static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) +static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) { - return sb1->set_uuid0 == sb2->set_uuid0 && + return sb1->set_uuid0 == sb2->set_uuid0 && sb1->set_uuid1 == sb2->set_uuid1 && sb1->set_uuid2 == sb2->set_uuid2 && sb1->set_uuid3 == sb2->set_uuid3; } -static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) { int ret; mdp_super_t *tmp1, *tmp2; @@ -885,7 +1226,6 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) if (!tmp1 || !tmp2) { ret = 0; - printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); goto abort; } @@ -905,14 +1245,13 @@ abort: return ret; } - static u32 md_csum_fold(u32 csum) { csum = (csum & 0xffff) + (csum >> 16); return (csum & 0xffff) + (csum >> 16); } -static unsigned int calc_sb_csum(mdp_super_t * sb) +static unsigned int calc_sb_csum(mdp_super_t *sb) { u64 newcsum = 0; u32 *sb32 = (u32*)sb; @@ -926,7 +1265,6 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) newcsum += sb32[i]; csum = (newcsum & 0xffffffff) + (newcsum>>32); - #ifdef CONFIG_ALPHA /* This used to use csum_partial, which was wrong for several * reasons including that different results are returned on @@ -943,7 +1281,6 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) return csum; } - /* * Handle superblock details. * We want to be able to handle multiple superblock formats @@ -981,6 +1318,7 @@ struct super_type { struct md_rdev *refdev, int minor_version); int (*validate_super)(struct mddev *mddev, + struct md_rdev *freshest, struct md_rdev *rdev); void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); @@ -1002,20 +1340,20 @@ int md_check_no_bitmap(struct mddev *mddev) { if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; - printk(KERN_ERR "%s: bitmaps are not supported for %s\n", - mdname(mddev), mddev->pers->name); + pr_warn("%s: bitmaps are not supported for %s\n", + mdname(mddev), mddev->pers->head.name); return 1; } EXPORT_SYMBOL(md_check_no_bitmap); /* - * load_super for 0.90.0 + * load_super for 0.90.0 */ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdp_super_t *sb; int ret; + bool spare_disk = true; /* * Calculate the position of the superblock (512byte sectors), @@ -1026,25 +1364,24 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); - if (ret) return ret; + if (ret) + return ret; ret = -EINVAL; - bdevname(rdev->bdev, b); sb = page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { - printk(KERN_ERR "md: invalid raid superblock magic on %s\n", - b); + pr_warn("md: invalid raid superblock magic on %pg\n", + rdev->bdev); goto abort; } if (sb->major_version != 0 || sb->minor_version < 90 || sb->minor_version > 91) { - printk(KERN_WARNING "Bad version number %d.%d on %s\n", - sb->major_version, sb->minor_version, - b); + pr_warn("Bad version number %d.%d on %pg\n", + sb->major_version, sb->minor_version, rdev->bdev); goto abort; } @@ -1052,8 +1389,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor goto abort; if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { - printk(KERN_WARNING "md: invalid superblock checksum on %s\n", - b); + pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); goto abort; } @@ -1063,32 +1399,37 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor rdev->sb_size = MD_SB_BYTES; rdev->badblocks.shift = -1; - if (sb->level == LEVEL_MULTIPATH) - rdev->desc_nr = -1; - else - rdev->desc_nr = sb->this_disk.number; + rdev->desc_nr = sb->this_disk.number; + + /* not spare disk */ + if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && + sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) + spare_disk = false; if (!refdev) { - ret = 1; + if (!spare_disk) + ret = 1; + else + ret = 0; } else { __u64 ev1, ev2; mdp_super_t *refsb = page_address(refdev->sb_page); - if (!uuid_equal(refsb, sb)) { - printk(KERN_WARNING "md: %s has different UUID to %s\n", - b, bdevname(refdev->bdev,b2)); + if (!md_uuid_equal(refsb, sb)) { + pr_warn("md: %pg has different UUID to %pg\n", + rdev->bdev, refdev->bdev); goto abort; } - if (!sb_equal(refsb, sb)) { - printk(KERN_WARNING "md: %s has same UUID" - " but different superblock to %s\n", - b, bdevname(refdev->bdev, b2)); + if (!md_sb_equal(refsb, sb)) { + pr_warn("md: %pg has same UUID but different superblock to %pg\n", + rdev->bdev, refdev->bdev); goto abort; } ev1 = md_event(sb); ev2 = md_event(refsb); - if (ev1 > ev2) + + if (!spare_disk && ev1 > ev2) ret = 1; - else + else ret = 0; } rdev->sectors = rdev->sb_start; @@ -1096,8 +1437,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor * (not needed for Linear and RAID0 as metadata doesn't * record this size) */ - if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) - rdev->sectors = (2ULL << 32) - 2; + if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) + rdev->sectors = (sector_t)(2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */ @@ -1107,10 +1448,26 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor return ret; } +static u64 md_bitmap_events_cleared(struct mddev *mddev) +{ + struct md_bitmap_stats stats; + int err; + + if (!md_bitmap_enabled(mddev, false)) + return 0; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return 0; + + return stats.events_cleared; +} + /* * validate_super for 0.90.0 + * note: we are not using "freshest" for 0.9 superblock */ -static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) +static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { mdp_disk_t *desc; mdp_super_t *sb = page_address(rdev->sb_page); @@ -1119,6 +1476,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { @@ -1157,15 +1515,17 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0) + mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else { - if (sb->events_hi == sb->cp_events_hi && + if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { - mddev->recovery_cp = sb->recovery_cp; + mddev->resync_offset = sb->recovery_cp; } else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; } memcpy(mddev->uuid+0, &sb->set_uuid0, 4); @@ -1180,7 +1540,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; mddev->bitmap_info.space = - mddev->bitmap_info.space; + mddev->bitmap_info.default_space; } } else if (mddev->pers == NULL) { @@ -1189,42 +1549,44 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) ++ev1; if (sb->disks[rdev->desc_nr].state & ( (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) - if (ev1 < mddev->events) + if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. */ - if (ev1 < mddev->bitmap->events_cleared) + if (ev1 < md_bitmap_events_cleared(mddev)) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0; } - if (mddev->level != LEVEL_MULTIPATH) { - desc = sb->disks + rdev->desc_nr; + desc = sb->disks + rdev->desc_nr; - if (desc->state & (1<<MD_DISK_FAULTY)) - set_bit(Faulty, &rdev->flags); - else if (desc->state & (1<<MD_DISK_SYNC) /* && - desc->raid_disk < mddev->raid_disks */) { - set_bit(In_sync, &rdev->flags); + if (desc->state & (1<<MD_DISK_FAULTY)) + set_bit(Faulty, &rdev->flags); + else if (desc->state & (1<<MD_DISK_SYNC)) { + set_bit(In_sync, &rdev->flags); + rdev->raid_disk = desc->raid_disk; + rdev->saved_raid_disk = desc->raid_disk; + } else if (desc->state & (1<<MD_DISK_ACTIVE)) { + /* active but not in sync implies recovery up to + * reshape position. We don't know exactly where + * that is, so set to zero for now + */ + if (mddev->minor_version >= 91) { + rdev->recovery_offset = 0; rdev->raid_disk = desc->raid_disk; - } else if (desc->state & (1<<MD_DISK_ACTIVE)) { - /* active but not in sync implies recovery up to - * reshape position. We don't know exactly where - * that is, so set to zero for now */ - if (mddev->minor_version >= 91) { - rdev->recovery_offset = 0; - rdev->raid_disk = desc->raid_disk; - } } - if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) - set_bit(WriteMostly, &rdev->flags); - } else /* MULTIPATH are always insync */ - set_bit(In_sync, &rdev->flags); + } + if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); + if (desc->state & (1<<MD_DISK_FAILFAST)) + set_bit(FailFast, &rdev->flags); return 0; } @@ -1237,7 +1599,6 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) struct md_rdev *rdev2; int next_spare = mddev->raid_disks; - /* make rdev->sb match mddev data.. * * 1/ zero out disks @@ -1266,13 +1627,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) memcpy(&sb->set_uuid2, mddev->uuid+8, 4); memcpy(&sb->set_uuid3, mddev->uuid+12,4); - sb->ctime = mddev->ctime; + sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); sb->level = mddev->level; sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; - sb->utime = mddev->utime; + sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); sb->state = 0; sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; @@ -1290,10 +1651,10 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) mddev->minor_version = sb->minor_version; if (mddev->in_sync) { - sb->recovery_cp = mddev->recovery_cp; + sb->recovery_cp = mddev->resync_offset; sb->cp_events_hi = (mddev->events>>32); sb->cp_events_lo = (u32)mddev->events; - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) sb->state = (1<< MD_SB_CLEAN); } else sb->recovery_cp = 0; @@ -1349,6 +1710,8 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) } if (test_bit(WriteMostly, &rdev2->flags)) d->state |= (1<<MD_DISK_WRITEMOSTLY); + if (test_bit(FailFast, &rdev2->flags)) + d->state |= (1<<MD_DISK_FAILFAST); } /* now set the "removed" and "faulty" bits on any missing devices */ for (i=0 ; i < mddev->raid_disks ; i++) { @@ -1387,11 +1750,12 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. */ - if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) - num_sectors = (2ULL << 32) - 2; - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); - md_super_wait(rdev->mddev); + if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) + num_sectors = (sector_t)(2ULL << 32) - 2; + do { + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); + } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } @@ -1406,7 +1770,7 @@ super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) * version 1 superblock */ -static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) +static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) { __le32 disk_csum; u32 csum; @@ -1428,16 +1792,14 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) return cpu_to_le32(csum); } -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged); static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; int ret; sector_t sb_start; sector_t sectors; - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; + bool spare_disk = true; /* * Calculate the position of the superblock in 512byte sectors. @@ -1449,8 +1811,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ */ switch(minor_version) { case 0: - sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; - sb_start -= 8*2; + sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; sb_start &= ~(sector_t)(4*2-1); break; case 1: @@ -1470,7 +1831,6 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ret = read_disk_sb(rdev, 4096); if (ret) return ret; - sb = page_address(rdev->sb_page); if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || @@ -1481,20 +1841,24 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ return -EINVAL; if (calc_sb_1_csum(sb) != sb->sb_csum) { - printk("md: invalid superblock checksum on %s\n", - bdevname(rdev->bdev,b)); + pr_warn("md: invalid superblock checksum on %pg\n", + rdev->bdev); return -EINVAL; } if (le64_to_cpu(sb->data_size) < 10) { - printk("md: data_size too small on %s\n", - bdevname(rdev->bdev,b)); + pr_warn("md: data_size too small on %pg\n", + rdev->bdev); return -EINVAL; } if (sb->pad0 || sb->pad3[0] || - memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) - /* Some padding is non-zero, might be a new feature */ - return -EINVAL; + memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) { + pr_warn("Some padding is non-zero on %pg, might be a new feature\n", + rdev->bdev); + if (check_new_feature) + return -EINVAL; + pr_warn("check_new_feature is disabled, data corruption possible\n"); + } rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); @@ -1516,10 +1880,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; - if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) - rdev->desc_nr = -1; - else - rdev->desc_nr = le32_to_cpu(sb->dev_number); + rdev->desc_nr = le32_to_cpu(sb->dev_number); if (!rdev->bb_page) { rdev->bb_page = alloc_page(GFP_KERNEL); @@ -1533,7 +1894,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ */ s32 offset; sector_t bb_sector; - u64 *bbp; + __le64 *bbp; int i; int sectors = le16_to_cpu(sb->bblog_size); if (sectors > (PAGE_SIZE / 512)) @@ -1543,9 +1904,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ return -EINVAL; bb_sector = (long long)offset; if (!sync_page_io(rdev, bb_sector, sectors << 9, - rdev->bb_page, READ, true)) + rdev->bb_page, REQ_OP_READ, true)) return -EIO; - bbp = (u64 *)page_address(rdev->bb_page); + bbp = (__le64 *)page_address(rdev->bb_page); rdev->badblocks.shift = sb->bblog_shift; for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { u64 bb = le64_to_cpu(*bbp); @@ -1555,15 +1916,34 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (md_set_badblocks(&rdev->badblocks, - sector, count, 1) == 0) + if (!badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) rdev->badblocks.shift = 0; + if ((le32_to_cpu(sb->feature_map) & + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { + rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); + rdev->ppl.size = le16_to_cpu(sb->ppl.size); + rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; + } + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && + sb->level != 0) + return -EINVAL; + + /* not spare disk */ + if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) + spare_disk = false; + if (!refdev) { - ret = 1; + if (!spare_disk) + ret = 1; + else + ret = 0; } else { __u64 ev1, ev2; struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); @@ -1572,24 +1952,22 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sb->level != refsb->level || sb->layout != refsb->layout || sb->chunksize != refsb->chunksize) { - printk(KERN_WARNING "md: %s has strangely different" - " superblock to %s\n", - bdevname(rdev->bdev,b), - bdevname(refdev->bdev,b2)); + pr_warn("md: %pg has strangely different superblock to %pg\n", + rdev->bdev, + refdev->bdev); return -EINVAL; } ev1 = le64_to_cpu(sb->events); ev2 = le64_to_cpu(refsb->events); - if (ev1 > ev2) + if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; } - if (minor_version) { - sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); - sectors -= rdev->data_offset; - } else + if (minor_version) + sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; + else sectors = rdev->sb_start; if (sectors < le64_to_cpu(sb->data_size)) return -EINVAL; @@ -1597,14 +1975,16 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ return ret; } -static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) +static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); + int role; rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { @@ -1612,13 +1992,14 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->patch_version = 0; mddev->external = 0; mddev->chunk_sectors = le32_to_cpu(sb->chunksize); - mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); - mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); + mddev->ctime = le64_to_cpu(sb->ctime); + mddev->utime = le64_to_cpu(sb->utime); mddev->level = le32_to_cpu(sb->level); mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->dev_sectors = le64_to_cpu(sb->size); + mddev->logical_block_size = le32_to_cpu(sb->logical_block_size); mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.space = 0; @@ -1629,7 +2010,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->bitmap_info.default_space = (4096-1024) >> 9; mddev->reshape_backwards = 0; - mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + mddev->resync_offset = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; @@ -1672,55 +2053,128 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0 && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) + mddev->layout = -1; + + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) + set_bit(MD_HAS_JOURNAL, &mddev->flags); + + if (le32_to_cpu(sb->feature_map) & + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { + if (le32_to_cpu(sb->feature_map) & + (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) + return -EINVAL; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && + (le32_to_cpu(sb->feature_map) & + MD_FEATURE_MULTIPLE_PPLS)) + return -EINVAL; + set_bit(MD_HAS_PPL, &mddev->flags); + } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for - * spares (which don't need an event count) */ - ++ev1; + * spares (which don't need an event count). + * Similar to mdadm, we allow event counter difference of 1 + * from the freshest device. + */ if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && - le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) - if (ev1 < mddev->events) + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) + if (ev1 + 1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an * older device, but not too old. */ - if (ev1 < mddev->bitmap->events_cleared) + if (ev1 < md_bitmap_events_cleared(mddev)) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0; } - if (mddev->level != LEVEL_MULTIPATH) { - int role; - if (rdev->desc_nr < 0 || - rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { - role = 0xffff; - rdev->desc_nr = -1; - } else - role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); - switch(role) { - case 0xffff: /* spare */ - break; - case 0xfffe: /* faulty */ - set_bit(Faulty, &rdev->flags); - break; - default: - if ((le32_to_cpu(sb->feature_map) & - MD_FEATURE_RECOVERY_OFFSET)) - rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); - else + + if (rdev->desc_nr < 0 || + rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { + role = MD_DISK_ROLE_SPARE; + rdev->desc_nr = -1; + } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { + /* + * If we are assembling, and our event counter is smaller than the + * highest event counter, we cannot trust our superblock about the role. + * It could happen that our rdev was marked as Faulty, and all other + * superblocks were updated with +1 event counter. + * Then, before the next superblock update, which typically happens when + * remove_and_add_spares() removes the device from the array, there was + * a crash or reboot. + * If we allow current rdev without consulting the freshest superblock, + * we could cause data corruption. + * Note that in this case our event counter is smaller by 1 than the + * highest, otherwise, this rdev would not be allowed into array; + * both kernel and mdadm allow event counter difference of 1. + */ + struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); + u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); + + if (rdev->desc_nr >= freshest_max_dev) { + /* this is unexpected, better not proceed */ + pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", + mdname(mddev), rdev->bdev, rdev->desc_nr, + freshest->bdev, freshest_max_dev); + return -EUCLEAN; + } + + role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); + pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", + mdname(mddev), rdev->bdev, role, role, freshest->bdev); + } else { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + } + switch (role) { + case MD_DISK_ROLE_SPARE: /* spare */ + break; + case MD_DISK_ROLE_FAULTY: /* faulty */ + set_bit(Faulty, &rdev->flags); + break; + case MD_DISK_ROLE_JOURNAL: /* journal device */ + if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { + /* journal device without journal feature */ + pr_warn("md: journal device provided without journal feature, ignoring the device\n"); + return -EINVAL; + } + set_bit(Journal, &rdev->flags); + rdev->journal_tail = le64_to_cpu(sb->journal_tail); + rdev->raid_disk = 0; + break; + default: + rdev->saved_raid_disk = role; + if ((le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_OFFSET)) { + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + if (!(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_BITMAP)) + rdev->saved_raid_disk = -1; + } else { + /* + * If the array is FROZEN, then the device can't + * be in_sync with rest of array. + */ + if (!test_bit(MD_RECOVERY_FROZEN, + &mddev->recovery)) set_bit(In_sync, &rdev->flags); - rdev->raid_disk = role; - break; } - if (sb->devflags & WriteMostly1) - set_bit(WriteMostly, &rdev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) - set_bit(Replacement, &rdev->flags); - } else /* MULTIPATH are always insync */ - set_bit(In_sync, &rdev->flags); + rdev->raid_disk = role; + break; + } + if (sb->devflags & WriteMostly1) + set_bit(WriteMostly, &rdev->flags); + if (sb->devflags & FailFast1) + set_bit(FailFast, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) + set_bit(Replacement, &rdev->flags); return 0; } @@ -1742,7 +2196,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->utime = cpu_to_le64((__u64)mddev->utime); sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) - sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + sb->resync_offset = cpu_to_le64(mddev->resync_offset); + else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) + sb->resync_offset = cpu_to_le64(MaxSector); else sb->resync_offset = cpu_to_le64(0); @@ -1753,6 +2209,11 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->chunksize = cpu_to_le32(mddev->chunk_sectors); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); + sb->logical_block_size = cpu_to_le32(mddev->logical_block_size); + if (test_bit(FailFast, &rdev->flags)) + sb->devflags |= FailFast1; + else + sb->devflags &= ~FailFast1; if (test_bit(WriteMostly, &rdev->flags)) sb->devflags |= WriteMostly1; @@ -1766,13 +2227,19 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } - if (rdev->raid_disk >= 0 && + if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); + if (rdev->saved_raid_disk >= 0 && mddev->bitmap) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } + /* Note: recovery_offset and journal_tail share space */ + if (test_bit(Journal, &rdev->flags)) + sb->journal_tail = cpu_to_le64(rdev->journal_tail); if (test_bit(Replacement, &rdev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_REPLACEMENT); @@ -1796,6 +2263,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) } } + if (mddev_is_clustered(mddev)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); + if (rdev->badblocks.count == 0) /* Nothing to do for bad blocks*/ ; else if (sb->bblog_offset == 0) @@ -1803,7 +2273,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) md_error(mddev, rdev); else { struct badblocks *bb = &rdev->badblocks; - u64 *bbp = (u64 *)page_address(rdev->bb_page); + __le64 *bbp = (__le64 *)page_address(rdev->bb_page); u64 *p = bb->page; sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); if (bb->changed) { @@ -1846,23 +2316,56 @@ retry: max_dev = le32_to_cpu(sb->max_dev); for (i=0; i<max_dev;i++) - sb->dev_roles[i] = cpu_to_le16(0xfffe); - + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); + + if (test_bit(MD_HAS_PPL, &mddev->flags)) { + if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); + else + sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); + sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); + sb->ppl.size = cpu_to_le16(rdev->ppl.size); + } + rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) - sb->dev_roles[i] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else if (test_bit(Journal, &rdev2->flags)) + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else - sb->dev_roles[i] = cpu_to_le16(0xffff); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); } sb->sb_csum = calc_sb_1_csum(sb); } +static sector_t super_1_choose_bm_space(sector_t dev_size) +{ + sector_t bm_space; + + /* if the device is bigger than 8Gig, save 64k for bitmap + * usage, if bigger than 200Gig, save 128k + */ + if (dev_size < 64*2) + bm_space = 0; + else if (dev_size - 64*2 >= 200*1024*1024*2) + bm_space = 128*2; + else if (dev_size - 4*2 > 8*1024*1024*2) + bm_space = 64*2; + else + bm_space = 4*2; + return bm_space; +} + static unsigned long long super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { @@ -1874,8 +2377,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ - max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; - max_sectors -= rdev->data_offset; + max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; } else if (rdev->mddev->bitmap_info.offset) { @@ -1883,21 +2385,32 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) return 0; } else { /* minor version 0; superblock after data */ - sector_t sb_start; - sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; + sector_t sb_start, bm_space; + sector_t dev_size = bdev_nr_sectors(rdev->bdev); + + /* 8K is for superblock */ + sb_start = dev_size - 8*2; sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->sectors + sb_start - rdev->sb_start; + + bm_space = super_1_choose_bm_space(dev_size); + + /* Space that can be used to store date needs to decrease + * superblock bitmap space and bad block space(4K) + */ + max_sectors = sb_start - bm_space - 4*2; + if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); - sb->super_offset = rdev->sb_start; + sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); - md_super_wait(rdev->mddev); + do { + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); + } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } @@ -1906,14 +2419,15 @@ static int super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { + struct mddev *mddev = rdev->mddev; + /* All necessary checks on new >= old have been done */ - struct bitmap *bitmap; if (new_offset >= rdev->data_offset) return 1; /* with 1.0 metadata, there is no metadata to tread on * so we can always move back */ - if (rdev->mddev->minor_version == 0) + if (mddev->minor_version == 0) return 1; /* otherwise we must be sure not to step on @@ -1924,11 +2438,17 @@ super_1_allow_new_offset(struct md_rdev *rdev, */ if (rdev->sb_start + (32+4)*2 > new_offset) return 0; - bitmap = rdev->mddev->bitmap; - if (bitmap && !rdev->mddev->bitmap_info.file && - rdev->sb_start + rdev->mddev->bitmap_info.offset + - bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) - return 0; + + if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) { + struct md_bitmap_stats stats; + int err; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (!err && rdev->sb_start + mddev->bitmap_info.offset + + stats.file_pages * (PAGE_SIZE >> 9) > new_offset) + return 0; + } + if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) return 0; @@ -1973,13 +2493,22 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) struct md_rdev *rdev, *rdev2; rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev1) - rdev_for_each_rcu(rdev2, mddev2) - if (rdev->bdev->bd_contains == - rdev2->bdev->bd_contains) { + rdev_for_each_rcu(rdev, mddev1) { + if (test_bit(Faulty, &rdev->flags) || + test_bit(Journal, &rdev->flags) || + rdev->raid_disk == -1) + continue; + rdev_for_each_rcu(rdev2, mddev2) { + if (test_bit(Faulty, &rdev2->flags) || + test_bit(Journal, &rdev2->flags) || + rdev2->raid_disk == -1) + continue; + if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { rcu_read_unlock(); return 1; } + } + } rcu_read_unlock(); return 0; } @@ -1995,93 +2524,38 @@ static LIST_HEAD(pending_raid_disks); */ int md_integrity_register(struct mddev *mddev) { - struct md_rdev *rdev, *reference = NULL; - if (list_empty(&mddev->disks)) return 0; /* nothing to do */ - if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) - return 0; /* shouldn't register, or already is */ - rdev_for_each(rdev, mddev) { - /* skip spares and non-functional disks */ - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->raid_disk < 0) - continue; - if (!reference) { - /* Use the first rdev as the reference */ - reference = rdev; - continue; - } - /* does this rdev's profile match the reference profile? */ - if (blk_integrity_compare(reference->bdev->bd_disk, - rdev->bdev->bd_disk) < 0) - return -EINVAL; - } - if (!reference || !bdev_get_integrity(reference->bdev)) - return 0; - /* - * All component devices are integrity capable and have matching - * profiles, register the common profile for the md device. - */ - if (blk_integrity_register(mddev->gendisk, - bdev_get_integrity(reference->bdev)) != 0) { - printk(KERN_ERR "md: failed to register integrity for %s\n", - mdname(mddev)); - return -EINVAL; - } - printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { - printk(KERN_ERR "md: failed to create integrity pool for %s\n", - mdname(mddev)); - return -EINVAL; - } + if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) + return 0; /* shouldn't register */ + + pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); return 0; } EXPORT_SYMBOL(md_integrity_register); -/* Disable data integrity if non-capable/non-matching disk is being added */ -void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) +static bool rdev_read_only(struct md_rdev *rdev) { - struct blk_integrity *bi_rdev; - struct blk_integrity *bi_mddev; - - if (!mddev->gendisk) - return; - - bi_rdev = bdev_get_integrity(rdev->bdev); - bi_mddev = blk_get_integrity(mddev->gendisk); - - if (!bi_mddev) /* nothing to do */ - return; - if (rdev->raid_disk < 0) /* skip spares */ - return; - if (bi_rdev && blk_integrity_compare(mddev->gendisk, - rdev->bdev->bd_disk) >= 0) - return; - printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); - blk_integrity_unregister(mddev->gendisk); + return bdev_read_only(rdev->bdev) || + (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); } -EXPORT_SYMBOL(md_integrity_add_rdev); -static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) +static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) { char b[BDEVNAME_SIZE]; - struct kobject *ko; - char *s; int err; - if (rdev->mddev) { - MD_BUG(); - return -EINVAL; - } - /* prevent duplicates */ if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; + if (rdev_read_only(rdev) && mddev->pers) + return -EROFS; + /* make sure rdev->sectors exceeds mddev->dev_sectors */ - if (rdev->sectors && (mddev->dev_sectors == 0 || - rdev->sectors < mddev->dev_sectors)) { + if (!test_bit(Journal, &rdev->flags) && + rdev->sectors && + (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -2097,35 +2571,46 @@ static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) * If it is -1, assign a free number, else * check number is not in use */ + rcu_read_lock(); if (rdev->desc_nr < 0) { int choice = 0; - if (mddev->pers) choice = mddev->raid_disks; - while (find_rdev_nr(mddev, choice)) + if (mddev->pers) + choice = mddev->raid_disks; + while (md_find_rdev_nr_rcu(mddev, choice)) choice++; rdev->desc_nr = choice; } else { - if (find_rdev_nr(mddev, rdev->desc_nr)) + if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { + rcu_read_unlock(); return -EBUSY; + } } - if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { - printk(KERN_WARNING "md: %s: array is limited to %d devices\n", - mdname(mddev), mddev->max_disks); + rcu_read_unlock(); + if (!test_bit(Journal, &rdev->flags) && + mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { + pr_warn("md: %s: array is limited to %d devices\n", + mdname(mddev), mddev->max_disks); return -EBUSY; } - bdevname(rdev->bdev,b); - while ( (s=strchr(b, '/')) != NULL) - *s = '!'; + snprintf(b, sizeof(b), "%pg", rdev->bdev); + strreplace(b, '/', '!'); rdev->mddev = mddev; - printk(KERN_INFO "md: bind<%s>\n", b); + pr_debug("md: bind<%s>\n", b); + + if (mddev->raid_disks) + mddev_create_serial_pool(mddev, rdev); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; - ko = &part_to_dev(rdev->bdev->bd_part)->kobj; - if (sysfs_create_link(&rdev->kobj, ko, "block")) - /* failure here is OK */; + /* failure here is OK */ + err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); + rdev->sysfs_unack_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); + rdev->sysfs_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_link_disk_holder(rdev->bdev, mddev->gendisk); @@ -2136,247 +2621,99 @@ static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) return 0; fail: - printk(KERN_WARNING "md: failed to register dev-%s for %s\n", - b, mdname(mddev)); + pr_warn("md: failed to register dev-%s for %s\n", + b, mdname(mddev)); + mddev_destroy_serial_pool(mddev, rdev); return err; } -static void md_delayed_delete(struct work_struct *ws) +void md_autodetect_dev(dev_t dev); + +/* just for claiming the bdev */ +static struct md_rdev claim_rdev; + +static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) { - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); - kobject_del(&rdev->kobj); + pr_debug("md: export_rdev(%pg)\n", rdev->bdev); + md_rdev_clear(rdev); +#ifndef MODULE + if (test_bit(AutoDetected, &rdev->flags)) + md_autodetect_dev(rdev->bdev->bd_dev); +#endif + fput(rdev->bdev_file); + rdev->bdev = NULL; kobject_put(&rdev->kobj); } -static void unbind_rdev_from_array(struct md_rdev * rdev) +static void md_kick_rdev_from_array(struct md_rdev *rdev) { - char b[BDEVNAME_SIZE]; - if (!rdev->mddev) { - MD_BUG(); - return; - } + struct mddev *mddev = rdev->mddev; + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); - printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); - rdev->mddev = NULL; + pr_debug("md: unbind<%pg>\n", rdev->bdev); + mddev_destroy_serial_pool(rdev->mddev, rdev); + WRITE_ONCE(rdev->mddev, NULL); sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); + sysfs_put(rdev->sysfs_unack_badblocks); + sysfs_put(rdev->sysfs_badblocks); rdev->sysfs_state = NULL; + rdev->sysfs_unack_badblocks = NULL; + rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; - /* We need to delay this, otherwise we can deadlock when - * writing to 'remove' to "dev/state". We also need - * to delay it due to rcu usage. - */ - synchronize_rcu(); - INIT_WORK(&rdev->del_work, md_delayed_delete); - kobject_get(&rdev->kobj); - queue_work(md_misc_wq, &rdev->del_work); -} - -/* - * prevent the device from being mounted, repartitioned or - * otherwise reused by a RAID array (or any other kernel - * subsystem), by bd_claiming the device. - */ -static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) -{ - int err = 0; - struct block_device *bdev; - char b[BDEVNAME_SIZE]; - - bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - shared ? (struct md_rdev *)lock_rdev : rdev); - if (IS_ERR(bdev)) { - printk(KERN_ERR "md: could not open %s.\n", - __bdevname(dev, b)); - return PTR_ERR(bdev); - } - rdev->bdev = bdev; - return err; -} - -static void unlock_rdev(struct md_rdev *rdev) -{ - struct block_device *bdev = rdev->bdev; - rdev->bdev = NULL; - if (!bdev) - MD_BUG(); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -} -void md_autodetect_dev(dev_t dev); - -static void export_rdev(struct md_rdev * rdev) -{ - char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: export_rdev(%s)\n", - bdevname(rdev->bdev,b)); - if (rdev->mddev) - MD_BUG(); - md_rdev_clear(rdev); -#ifndef MODULE - if (test_bit(AutoDetected, &rdev->flags)) - md_autodetect_dev(rdev->bdev->bd_dev); -#endif - unlock_rdev(rdev); - kobject_put(&rdev->kobj); -} + synchronize_rcu(); -static void kick_rdev_from_array(struct md_rdev * rdev) -{ - unbind_rdev_from_array(rdev); - export_rdev(rdev); + /* + * kobject_del() will wait for all in progress writers to be done, where + * reconfig_mutex is held, hence it can't be called under + * reconfig_mutex and it's delayed to mddev_unlock(). + */ + list_add(&rdev->same_set, &mddev->deleting); } static void export_array(struct mddev *mddev) { - struct md_rdev *rdev, *tmp; + struct md_rdev *rdev; - rdev_for_each_safe(rdev, tmp, mddev) { - if (!rdev->mddev) { - MD_BUG(); - continue; - } - kick_rdev_from_array(rdev); + while (!list_empty(&mddev->disks)) { + rdev = list_first_entry(&mddev->disks, struct md_rdev, + same_set); + md_kick_rdev_from_array(rdev); } - if (!list_empty(&mddev->disks)) - MD_BUG(); mddev->raid_disks = 0; mddev->major_version = 0; } -static void print_desc(mdp_disk_t *desc) -{ - printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, - desc->major,desc->minor,desc->raid_disk,desc->state); -} - -static void print_sb_90(mdp_super_t *sb) -{ - int i; - - printk(KERN_INFO - "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", - sb->major_version, sb->minor_version, sb->patch_version, - sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, - sb->ctime); - printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", - sb->level, sb->size, sb->nr_disks, sb->raid_disks, - sb->md_minor, sb->layout, sb->chunk_size); - printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" - " FD:%d SD:%d CSUM:%08x E:%08lx\n", - sb->utime, sb->state, sb->active_disks, sb->working_disks, - sb->failed_disks, sb->spare_disks, - sb->sb_csum, (unsigned long)sb->events_lo); - - printk(KERN_INFO); - for (i = 0; i < MD_SB_DISKS; i++) { - mdp_disk_t *desc; - - desc = sb->disks + i; - if (desc->number || desc->major || desc->minor || - desc->raid_disk || (desc->state && (desc->state != 4))) { - printk(" D %2d: ", i); - print_desc(desc); - } - } - printk(KERN_INFO "md: THIS: "); - print_desc(&sb->this_disk); -} - -static void print_sb_1(struct mdp_superblock_1 *sb) -{ - __u8 *uuid; - - uuid = sb->set_uuid; - printk(KERN_INFO - "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" - "md: Name: \"%s\" CT:%llu\n", - le32_to_cpu(sb->major_version), - le32_to_cpu(sb->feature_map), - uuid, - sb->set_name, - (unsigned long long)le64_to_cpu(sb->ctime) - & MD_SUPERBLOCK_1_TIME_SEC_MASK); - - uuid = sb->device_uuid; - printk(KERN_INFO - "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" - " RO:%llu\n" - "md: Dev:%08x UUID: %pU\n" - "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" - "md: (MaxDev:%u) \n", - le32_to_cpu(sb->level), - (unsigned long long)le64_to_cpu(sb->size), - le32_to_cpu(sb->raid_disks), - le32_to_cpu(sb->layout), - le32_to_cpu(sb->chunksize), - (unsigned long long)le64_to_cpu(sb->data_offset), - (unsigned long long)le64_to_cpu(sb->data_size), - (unsigned long long)le64_to_cpu(sb->super_offset), - (unsigned long long)le64_to_cpu(sb->recovery_offset), - le32_to_cpu(sb->dev_number), - uuid, - sb->devflags, - (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, - (unsigned long long)le64_to_cpu(sb->events), - (unsigned long long)le64_to_cpu(sb->resync_offset), - le32_to_cpu(sb->sb_csum), - le32_to_cpu(sb->max_dev) - ); -} - -static void print_rdev(struct md_rdev *rdev, int major_version) -{ - char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", - bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, - test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), - rdev->desc_nr); - if (rdev->sb_loaded) { - printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); - switch (major_version) { - case 0: - print_sb_90(page_address(rdev->sb_page)); - break; - case 1: - print_sb_1(page_address(rdev->sb_page)); - break; +static bool set_in_sync(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->lock); + if (!mddev->in_sync) { + mddev->sync_checkers++; + spin_unlock(&mddev->lock); + percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); + spin_lock(&mddev->lock); + if (!mddev->in_sync && + percpu_ref_is_zero(&mddev->writes_pending)) { + mddev->in_sync = 1; + /* + * Ensure ->in_sync is visible before we clear + * ->sync_checkers. + */ + smp_mb(); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + sysfs_notify_dirent_safe(mddev->sysfs_state); } - } else - printk(KERN_INFO "md: no rdev superblock!\n"); -} - -static void md_print_devices(void) -{ - struct list_head *tmp; - struct md_rdev *rdev; - struct mddev *mddev; - char b[BDEVNAME_SIZE]; - - printk("\n"); - printk("md: **********************************\n"); - printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); - printk("md: **********************************\n"); - for_each_mddev(mddev, tmp) { - - if (mddev->bitmap) - bitmap_print_sb(mddev->bitmap); - else - printk("%s: ", mdname(mddev)); - rdev_for_each(rdev, mddev) - printk("<%s>", bdevname(rdev->bdev,b)); - printk("\n"); - - rdev_for_each(rdev, mddev) - print_rdev(rdev, mddev->major_version); + if (--mddev->sync_checkers == 0) + percpu_ref_switch_to_percpu(&mddev->writes_pending); } - printk("md: **********************************\n"); - printk("\n"); + if (mddev->safemode == 1) + mddev->safemode = 0; + return mddev->in_sync; } - -static void sync_sbs(struct mddev * mddev, int nospares) +static void sync_sbs(struct mddev *mddev, int nospares) { /* Update each superblock (in-memory image), but * if we are allowed to, skip spares which already @@ -2399,37 +2736,107 @@ static void sync_sbs(struct mddev * mddev, int nospares) } } -static void md_update_sb(struct mddev * mddev, int force_change) +static bool does_sb_need_changing(struct mddev *mddev) +{ + struct md_rdev *rdev = NULL, *iter; + struct mdp_superblock_1 *sb; + int role; + + /* Find a good rdev */ + rdev_for_each(iter, mddev) + if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { + rdev = iter; + break; + } + + /* No good device found. */ + if (!rdev) + return false; + + sb = page_address(rdev->sb_page); + /* Check if a device has become faulty or a spare become active */ + rdev_for_each(rdev, mddev) { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + /* Device activated? */ + if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags)) + return true; + /* Device turned faulty? */ + if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) + return true; + } + + /* Check if any mddev parameters have changed */ + if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || + (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || + (mddev->layout != le32_to_cpu(sb->layout)) || + (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || + (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) + return true; + + return false; +} + +void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; int nospares = 0; int any_badblocks_changed = 0; + int ret = -1; - if (mddev->ro) { + if (!md_is_rdwr(mddev)) { if (force_change) - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev)); return; } + repeat: - /* First make sure individual recovery_offsets are correct */ + if (mddev_is_clustered(mddev)) { + if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) + force_change = 1; + if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) + nospares = 1; + ret = mddev->cluster_ops->metadata_update_start(mddev); + /* Has someone else has updated the sb */ + if (!does_sb_need_changing(mddev)) { + if (ret == 0) + mddev->cluster_ops->metadata_update_cancel(mddev); + bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), + BIT(MD_SB_CHANGE_DEVS) | + BIT(MD_SB_CHANGE_CLEAN)); + return; + } + } + + /* + * First make sure individual recovery_offsets are correct + * curr_resync_completed can only be used during recovery. + * During reshape/resync it might use array-addresses rather + * that device addresses. + */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; - } + } if (!mddev->persistent) { - clear_bit(MD_CHANGE_CLEAN, &mddev->flags); - clear_bit(MD_CHANGE_DEVS, &mddev->flags); + clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->external) { - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { rdev->badblocks.changed = 0; - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } clear_bit(Blocked, &rdev->flags); @@ -2441,13 +2848,13 @@ repeat: return; } - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); - mddev->utime = get_seconds(); + mddev->utime = ktime_get_real_seconds(); - if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) force_change = 1; - if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) + if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) /* just a clean<-> dirty transition, possibly leave spares alone, * though if events isn't the right even/odd, we will have to do * spares after all @@ -2472,7 +2879,7 @@ repeat: /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ if (nospares - && (mddev->in_sync && mddev->recovery_cp == MaxSector) + && (mddev->in_sync && mddev->resync_offset == MaxSector) && mddev->can_decrease_events && mddev->events != 1) { mddev->events--; @@ -2483,15 +2890,12 @@ repeat: mddev->can_decrease_events = nospares; } - if (!mddev->events) { - /* - * oops, this 64-bit counter should never wrap. - * Either we are in around ~1 trillion A.C., assuming - * 1 reboot per second, or we have a bug: - */ - MD_BUG(); - mddev->events --; - } + /* + * This 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug... + */ + WARN_ON(mddev->events == 0); rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) @@ -2501,71 +2905,94 @@ repeat: } sync_sbs(mddev, nospares); - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev), mddev->in_sync); - bitmap_update_sb(mddev->bitmap); + mddev_add_trace_msg(mddev, "md md_update_sb"); +rewrite: + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { - char b[BDEVNAME_SIZE]; - if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ - if (!test_bit(Faulty, &rdev->flags) && - rdev->saved_raid_disk == -1) { - md_super_write(mddev,rdev, - rdev->sb_start, rdev->sb_size, - rdev->sb_page); - pr_debug("md: (write) %s's sb offset: %llu\n", - bdevname(rdev->bdev, b), + if (!test_bit(Faulty, &rdev->flags)) { + md_write_metadata(mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); + pr_debug("md: (write) %pg's sb offset: %llu\n", + rdev->bdev, (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; if (rdev->badblocks.size) { - md_super_write(mddev, rdev, - rdev->badblocks.sector, - rdev->badblocks.size << 9, - rdev->bb_page); + md_write_metadata(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page, 0); rdev->badblocks.size = 0; } - } else if (test_bit(Faulty, &rdev->flags)) - pr_debug("md: %s (skipping faulty)\n", - bdevname(rdev->bdev, b)); - else - pr_debug("(skipping incremental s/r "); - - if (mddev->level == LEVEL_MULTIPATH) - /* only need to write one superblock... */ - break; + } else + pr_debug("md: %pg (skipping faulty)\n", + rdev->bdev); } - md_super_wait(mddev); - /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ + if (md_super_wait(mddev) < 0) + goto rewrite; + /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ + + if (mddev_is_clustered(mddev) && ret == 0) + mddev->cluster_ops->metadata_update_finish(mddev); - spin_lock_irq(&mddev->write_lock); if (mddev->in_sync != sync_req || - test_bit(MD_CHANGE_DEVS, &mddev->flags)) { + !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) /* have to write it out again */ - spin_unlock_irq(&mddev->write_lock); goto repeat; - } - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) clear_bit(Blocked, &rdev->flags); if (any_badblocks_changed) - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } } +EXPORT_SYMBOL(md_update_sb); + +static int add_bound_rdev(struct md_rdev *rdev) +{ + struct mddev *mddev = rdev->mddev; + int err = 0; + bool add_journal = test_bit(Journal, &rdev->flags); + + if (!mddev->pers->hot_remove_disk || add_journal) { + /* If there is hot_add_disk but no hot_remove_disk + * then added disks for geometry changes, + * and should be added immediately. + */ + super_types[mddev->major_version]. + validate_super(mddev, NULL/*freshest*/, rdev); + err = mddev->pers->hot_add_disk(mddev, rdev); + if (err) { + md_kick_rdev_from_array(rdev); + return err; + } + } + sysfs_notify_dirent_safe(rdev->sysfs_state); + + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + if (mddev->degraded) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_new_event(); + return 0; +} /* words written to sysfs files may, or may not, be \n terminated. * We want to accept with case. For this we use cmd_match. @@ -2596,45 +3023,41 @@ struct rdev_sysfs_entry { static ssize_t state_show(struct md_rdev *rdev, char *page) { - char *sep = ""; + char *sep = ","; size_t len = 0; - - if (test_bit(Faulty, &rdev->flags) || - rdev->badblocks.unacked_exist) { - len+= sprintf(page+len, "%sfaulty",sep); - sep = ","; - } - if (test_bit(In_sync, &rdev->flags)) { - len += sprintf(page+len, "%sin_sync",sep); - sep = ","; - } - if (test_bit(WriteMostly, &rdev->flags)) { - len += sprintf(page+len, "%swrite_mostly",sep); - sep = ","; - } - if (test_bit(Blocked, &rdev->flags) || + unsigned long flags = READ_ONCE(rdev->flags); + + if (test_bit(Faulty, &flags) || + (!test_bit(ExternalBbl, &flags) && + rdev->badblocks.unacked_exist)) + len += sprintf(page+len, "faulty%s", sep); + if (test_bit(In_sync, &flags)) + len += sprintf(page+len, "in_sync%s", sep); + if (test_bit(Journal, &flags)) + len += sprintf(page+len, "journal%s", sep); + if (test_bit(WriteMostly, &flags)) + len += sprintf(page+len, "write_mostly%s", sep); + if (test_bit(Blocked, &flags) || (rdev->badblocks.unacked_exist - && !test_bit(Faulty, &rdev->flags))) { - len += sprintf(page+len, "%sblocked", sep); - sep = ","; - } - if (!test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags)) { - len += sprintf(page+len, "%sspare", sep); - sep = ","; - } - if (test_bit(WriteErrorSeen, &rdev->flags)) { - len += sprintf(page+len, "%swrite_error", sep); - sep = ","; - } - if (test_bit(WantReplacement, &rdev->flags)) { - len += sprintf(page+len, "%swant_replacement", sep); - sep = ","; - } - if (test_bit(Replacement, &rdev->flags)) { - len += sprintf(page+len, "%sreplacement", sep); - sep = ","; - } + && !test_bit(Faulty, &flags))) + len += sprintf(page+len, "blocked%s", sep); + if (!test_bit(Faulty, &flags) && + !test_bit(Journal, &flags) && + !test_bit(In_sync, &flags)) + len += sprintf(page+len, "spare%s", sep); + if (test_bit(WriteErrorSeen, &flags)) + len += sprintf(page+len, "write_error%s", sep); + if (test_bit(WantReplacement, &flags)) + len += sprintf(page+len, "want_replacement%s", sep); + if (test_bit(Replacement, &flags)) + len += sprintf(page+len, "replacement%s", sep); + if (test_bit(ExternalBbl, &flags)) + len += sprintf(page+len, "external_bbl%s", sep); + if (test_bit(FailFast, &flags)) + len += sprintf(page+len, "failfast%s", sep); + + if (len) + len -= strlen(sep); return len+sprintf(page+len, "\n"); } @@ -2650,38 +3073,59 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * blocked - sets the Blocked flags * -blocked - clears the Blocked and possibly simulates an error * insync - sets Insync providing device isn't active + * -insync - clear Insync for a device with a slot assigned, + * so that it gets rebuilt based on bitmap * write_error - sets WriteErrorSeen * -write_error - clears WriteErrorSeen + * {,-}failfast - set/clear FailFast */ + + struct mddev *mddev = rdev->mddev; int err = -EINVAL; + bool need_update_sb = false; + if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); - if (test_bit(Faulty, &rdev->flags)) - err = 0; - else + + if (test_bit(MD_BROKEN, &rdev->mddev->flags)) err = -EBUSY; + else + err = 0; } else if (cmd_match(buf, "remove")) { + if (rdev->mddev->pers) { + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(rdev->mddev, rdev); + } if (rdev->raid_disk >= 0) err = -EBUSY; else { - struct mddev *mddev = rdev->mddev; - kick_rdev_from_array(rdev); - if (mddev->pers) - md_update_sb(mddev, 1); - md_new_event(mddev); err = 0; + if (mddev_is_clustered(mddev)) + err = mddev->cluster_ops->remove_disk(mddev, rdev); + + if (err == 0) { + md_kick_rdev_from_array(rdev); + if (mddev->pers) + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + md_new_event(); + } } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); + mddev_create_serial_pool(rdev->mddev, rdev); + need_update_sb = true; err = 0; } else if (cmd_match(buf, "-writemostly")) { + mddev_destroy_serial_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); + need_update_sb = true; err = 0; } else if (cmd_match(buf, "blocked")) { set_bit(Blocked, &rdev->flags); err = 0; } else if (cmd_match(buf, "-blocked")) { if (!test_bit(Faulty, &rdev->flags) && + !test_bit(ExternalBbl, &rdev->flags) && rdev->badblocks.unacked_exist) { /* metadata handler doesn't understand badblocks, * so we need to fail the device @@ -2692,12 +3136,27 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); err = 0; } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; + } else if (cmd_match(buf, "failfast")) { + set_bit(FailFast, &rdev->flags); + need_update_sb = true; + err = 0; + } else if (cmd_match(buf, "-failfast")) { + clear_bit(FailFast, &rdev->flags); + need_update_sb = true; + err = 0; + } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags)) { + if (rdev->mddev->pers == NULL) { + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + err = 0; + } } else if (cmd_match(buf, "write_error")) { set_bit(WriteErrorSeen, &rdev->flags); err = 0; @@ -2710,10 +3169,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * check if recovery is needed. */ if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Replacement, &rdev->flags)) set_bit(WantReplacement, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); err = 0; } else if (cmd_match(buf, "-want_replacement")) { /* Clearing 'want_replacement' is always allowed. @@ -2740,13 +3199,40 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) clear_bit(Replacement, &rdev->flags); err = 0; } + } else if (cmd_match(buf, "re-add")) { + if (!rdev->mddev->pers) + err = -EINVAL; + else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && + rdev->saved_raid_disk >= 0) { + /* clear_bit is performed _after_ all the devices + * have their local Faulty bit cleared. If any writes + * happen in the meantime in the local node, they + * will land in the local bitmap, which will be synced + * by this node eventually + */ + if (!mddev_is_clustered(rdev->mddev) || + (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { + clear_bit(Faulty, &rdev->flags); + err = add_bound_rdev(rdev); + } + } else + err = -EBUSY; + } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { + set_bit(ExternalBbl, &rdev->flags); + rdev->badblocks.shift = 0; + err = 0; + } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { + clear_bit(ExternalBbl, &rdev->flags); + err = 0; } + if (need_update_sb) + md_update_sb(mddev, 1); if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); return err ? err : len; } static struct rdev_sysfs_entry rdev_state = -__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); +__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); static ssize_t errors_show(struct md_rdev *rdev, char *page) @@ -2757,13 +3243,14 @@ errors_show(struct md_rdev *rdev, char *page) static ssize_t errors_store(struct md_rdev *rdev, const char *buf, size_t len) { - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); - if (*buf && (*e == 0 || *e == '\n')) { - atomic_set(&rdev->corrected_errors, n); - return len; - } - return -EINVAL; + unsigned int n; + int rv; + + rv = kstrtouint(buf, 10, &n); + if (rv < 0) + return rv; + atomic_set(&rdev->corrected_errors, n); + return len; } static struct rdev_sysfs_entry rdev_errors = __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); @@ -2771,7 +3258,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t slot_show(struct md_rdev *rdev, char *page) { - if (rdev->raid_disk < 0) + if (test_bit(Journal, &rdev->flags)) + return sprintf(page, "journal\n"); + else if (rdev->raid_disk < 0) return sprintf(page, "none\n"); else return sprintf(page, "%d\n", rdev->raid_disk); @@ -2780,13 +3269,21 @@ slot_show(struct md_rdev *rdev, char *page) static ssize_t slot_store(struct md_rdev *rdev, const char *buf, size_t len) { - char *e; + int slot; int err; - int slot = simple_strtoul(buf, &e, 10); + + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; if (strncmp(buf, "none", 4)==0) slot = -1; - else if (e==buf || (*e && *e!= '\n')) - return -EINVAL; + else { + err = kstrtouint(buf, 10, (unsigned int *)&slot); + if (err < 0) + return err; + if (slot < 0) + /* overflow */ + return -ENOSPC; + } if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also * updating the 'rd%d' link, and communicating @@ -2805,11 +3302,11 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) if (rdev->raid_disk >= 0) return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { /* Activating a spare .. or possibly reactivating * if we ever get bitmaps working here. */ + int err; if (rdev->raid_disk != -1) return -EBUSY; @@ -2830,15 +3327,15 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) else rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); - err = rdev->mddev->pers-> - hot_add_disk(rdev->mddev, rdev); + clear_bit(Bitmap_sync, &rdev->flags); + err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); if (err) { rdev->raid_disk = -1; return err; } else sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; + /* failure here is OK */; + sysfs_link_rdev(rdev->mddev, rdev); /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && @@ -2854,7 +3351,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) return len; } - static struct rdev_sysfs_entry rdev_slot = __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); @@ -2899,7 +3395,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev, if (kstrtoull(buf, 10, &new_offset) < 0) return -EINVAL; - if (mddev->sync_thread) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; if (new_offset == rdev->data_offset) /* reset is always permitted */ @@ -2947,14 +3443,35 @@ rdev_size_show(struct md_rdev *rdev, char *page) return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } -static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) +static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) { /* check if two start/length pairs overlap */ - if (s1+l1 <= s2) - return 0; - if (s2+l2 <= s1) - return 0; - return 1; + if (a->data_offset + a->sectors <= b->data_offset) + return false; + if (b->data_offset + b->sectors <= a->data_offset) + return false; + return true; +} + +static bool md_rdev_overlaps(struct md_rdev *rdev) +{ + struct mddev *mddev; + struct md_rdev *rdev2; + + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev->flags)) + continue; + rdev_for_each(rdev2, mddev) { + if (rdev != rdev2 && rdev->bdev == rdev2->bdev && + md_rdevs_overlap(rdev, rdev2)) { + spin_unlock(&all_mddevs_lock); + return true; + } + } + } + spin_unlock(&all_mddevs_lock); + return false; } static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) @@ -2983,6 +3500,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) sector_t oldsectors = rdev->sectors; sector_t sectors; + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; if (rdev->data_offset != rdev->new_data_offset) @@ -2994,7 +3513,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) if (!sectors) return -EBUSY; } else if (!sectors) - sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - + sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!my_mddev->pers->resize) /* Cannot change size for RAID0 or Linear etc */ @@ -3004,47 +3523,21 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) return -EINVAL; /* component must fit device */ rdev->sectors = sectors; - if (sectors > oldsectors && my_mddev->external) { - /* need to check that all other rdevs with the same ->bdev - * do not overlap. We need to unlock the mddev to avoid - * a deadlock. We have already changed rdev->sectors, and if - * we have to change it back, we will have the lock again. - */ - struct mddev *mddev; - int overlap = 0; - struct list_head *tmp; - mddev_unlock(my_mddev); - for_each_mddev(mddev, tmp) { - struct md_rdev *rdev2; - - mddev_lock(mddev); - rdev_for_each(rdev2, mddev) - if (rdev->bdev == rdev2->bdev && - rdev != rdev2 && - overlaps(rdev->data_offset, rdev->sectors, - rdev2->data_offset, - rdev2->sectors)) { - overlap = 1; - break; - } - mddev_unlock(mddev); - if (overlap) { - mddev_put(mddev); - break; - } - } - mddev_lock(my_mddev); - if (overlap) { - /* Someone else could have slipped in a size - * change here, but doing so is just silly. - * We put oldsectors back because we *know* it is - * safe, and trust userspace not to race with - * itself - */ - rdev->sectors = oldsectors; - return -EBUSY; - } + /* + * Check that all other rdevs with the same bdev do not overlap. This + * check does not provide a hard guarantee, it just helps avoid + * dangerous mistakes. + */ + if (sectors > oldsectors && my_mddev->external && + md_rdev_overlaps(rdev)) { + /* + * Someone else could have slipped in a size change here, but + * doing so is just silly. We put oldsectors back because we + * know it is safe, and trust userspace not to race with itself. + */ + rdev->sectors = oldsectors; + return -EBUSY; } return len; } @@ -3052,7 +3545,6 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) static struct rdev_sysfs_entry rdev_size = __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); - static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) { unsigned long long recovery_start = rdev->recovery_offset; @@ -3088,12 +3580,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_ static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); - -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack); -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); - +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. This is largely for testing. + */ static ssize_t bb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 0); @@ -3109,7 +3606,6 @@ static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) static struct rdev_sysfs_entry rdev_bad_blocks = __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); - static ssize_t ubb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 1); @@ -3121,6 +3617,78 @@ static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) static struct rdev_sysfs_entry rdev_unack_bad_blocks = __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); +static ssize_t +ppl_sector_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); +} + +static ssize_t +ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned long long sector; + + if (kstrtoull(buf, 10, §or) < 0) + return -EINVAL; + if (sector != (sector_t)sector) + return -EINVAL; + + if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && + rdev->raid_disk >= 0) + return -EBUSY; + + if (rdev->mddev->persistent) { + if (rdev->mddev->major_version == 0) + return -EINVAL; + if ((sector > rdev->sb_start && + sector - rdev->sb_start > S16_MAX) || + (sector < rdev->sb_start && + rdev->sb_start - sector > -S16_MIN)) + return -EINVAL; + rdev->ppl.offset = sector - rdev->sb_start; + } else if (!rdev->mddev->external) { + return -EBUSY; + } + rdev->ppl.sector = sector; + return len; +} + +static struct rdev_sysfs_entry rdev_ppl_sector = +__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); + +static ssize_t +ppl_size_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%u\n", rdev->ppl.size); +} + +static ssize_t +ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned int size; + + if (kstrtouint(buf, 10, &size) < 0) + return -EINVAL; + + if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && + rdev->raid_disk >= 0) + return -EBUSY; + + if (rdev->mddev->persistent) { + if (rdev->mddev->major_version == 0) + return -EINVAL; + if (size > U16_MAX) + return -EINVAL; + } else if (!rdev->mddev->external) { + return -EBUSY; + } + rdev->ppl.size = size; + return len; +} + +static struct rdev_sysfs_entry rdev_ppl_size = +__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, @@ -3131,28 +3699,22 @@ static struct attribute *rdev_default_attrs[] = { &rdev_recovery_start.attr, &rdev_bad_blocks.attr, &rdev_unack_bad_blocks.attr, + &rdev_ppl_sector.attr, + &rdev_ppl_size.attr, NULL, }; +ATTRIBUTE_GROUPS(rdev_default); static ssize_t rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); - struct mddev *mddev = rdev->mddev; - ssize_t rv; if (!entry->show) return -EIO; - - rv = mddev ? mddev_lock(mddev) : -EBUSY; - if (!rv) { - if (rdev->mddev == NULL) - rv = -EBUSY; - else - rv = entry->show(rdev, page); - mddev_unlock(mddev); - } - return rv; + if (!rdev->mddev) + return -ENODEV; + return entry->show(rdev, page); } static ssize_t @@ -3161,21 +3723,39 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + struct kernfs_node *kn = NULL; + bool suspend = false; ssize_t rv; - struct mddev *mddev = rdev->mddev; + struct mddev *mddev = READ_ONCE(rdev->mddev); if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - rv = mddev ? mddev_lock(mddev): -EBUSY; + if (!mddev) + return -ENODEV; + + if (entry->store == state_store) { + if (cmd_match(page, "remove")) + kn = sysfs_break_active_protection(kobj, attr); + if (cmd_match(page, "remove") || cmd_match(page, "re-add") || + cmd_match(page, "writemostly") || + cmd_match(page, "-writemostly")) + suspend = true; + } + + rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); if (!rv) { if (rdev->mddev == NULL) - rv = -EBUSY; + rv = -ENODEV; else rv = entry->store(rdev, page, length); - mddev_unlock(mddev); + suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); } + + if (kn) + sysfs_unbreak_active_protection(kn); + return rv; } @@ -3188,10 +3768,10 @@ static const struct sysfs_ops rdev_sysfs_ops = { .show = rdev_attr_show, .store = rdev_attr_store, }; -static struct kobj_type rdev_ktype = { +static const struct kobj_type rdev_ktype = { .release = rdev_free, .sysfs_ops = &rdev_sysfs_ops, - .default_attrs = rdev_default_attrs, + .default_groups = rdev_default_groups, }; int md_rdev_init(struct md_rdev *rdev) @@ -3203,8 +3783,7 @@ int md_rdev_init(struct md_rdev *rdev) rdev->data_offset = 0; rdev->new_data_offset = 0; rdev->sb_events = 0; - rdev->last_read_error.tv_sec = 0; - rdev->last_read_error.tv_nsec = 0; + rdev->last_read_error = 0; rdev->sb_loaded = 0; rdev->bb_page = NULL; atomic_set(&rdev->nr_pending, 0); @@ -3218,16 +3797,10 @@ int md_rdev_init(struct md_rdev *rdev) * This reserves the space even on arrays where it cannot * be used - I wonder if that matters */ - rdev->badblocks.count = 0; - rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ - rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); - seqlock_init(&rdev->badblocks.lock); - if (rdev->badblocks.page == NULL) - return -ENOMEM; - - return 0; + return badblocks_init(&rdev->badblocks, 0); } EXPORT_SYMBOL_GPL(md_rdev_init); + /* * Import a device. If 'super_format' >= 0, then sanity check the superblock * @@ -3240,64 +3813,65 @@ EXPORT_SYMBOL_GPL(md_rdev_init); */ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { - char b[BDEVNAME_SIZE]; - int err; struct md_rdev *rdev; sector_t size; + int err; rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); - if (!rdev) { - printk(KERN_ERR "md: could not alloc mem for new device!\n"); + if (!rdev) return ERR_PTR(-ENOMEM); - } err = md_rdev_init(rdev); if (err) - goto abort_free; + goto out_free_rdev; err = alloc_disk_sb(rdev); if (err) - goto abort_free; + goto out_clear_rdev; - err = lock_rdev(rdev, newdev, super_format == -2); - if (err) - goto abort_free; + rdev->bdev_file = bdev_file_open_by_dev(newdev, + BLK_OPEN_READ | BLK_OPEN_WRITE, + super_format == -2 ? &claim_rdev : rdev, NULL); + if (IS_ERR(rdev->bdev_file)) { + pr_warn("md: could not open device unknown-block(%u,%u).\n", + MAJOR(newdev), MINOR(newdev)); + err = PTR_ERR(rdev->bdev_file); + goto out_clear_rdev; + } + rdev->bdev = file_bdev(rdev->bdev_file); kobject_init(&rdev->kobj, &rdev_ktype); - size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; + size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; if (!size) { - printk(KERN_WARNING - "md: %s has zero or unknown size, marking faulty!\n", - bdevname(rdev->bdev,b)); + pr_warn("md: %pg has zero or unknown size, marking faulty!\n", + rdev->bdev); err = -EINVAL; - goto abort_free; + goto out_blkdev_put; } if (super_format >= 0) { err = super_types[super_format]. load_super(rdev, NULL, super_minor); if (err == -EINVAL) { - printk(KERN_WARNING - "md: %s does not have a valid v%d.%d " - "superblock, not importing!\n", - bdevname(rdev->bdev,b), - super_format, super_minor); - goto abort_free; + pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", + rdev->bdev, + super_format, super_minor); + goto out_blkdev_put; } if (err < 0) { - printk(KERN_WARNING - "md: could not read %s's sb, not importing!\n", - bdevname(rdev->bdev,b)); - goto abort_free; + pr_warn("md: could not read %pg's sb, not importing!\n", + rdev->bdev); + goto out_blkdev_put; } } return rdev; -abort_free: - if (rdev->bdev) - unlock_rdev(rdev); +out_blkdev_put: + fput(rdev->bdev_file); +out_clear_rdev: md_rdev_clear(rdev); +out_free_rdev: kfree(rdev); return ERR_PTR(err); } @@ -3306,12 +3880,10 @@ abort_free: * Check a full RAID array for plausibility */ - -static void analyze_sbs(struct mddev * mddev) +static int analyze_sbs(struct mddev *mddev) { int i; struct md_rdev *rdev, *freshest, *tmp; - char b[BDEVNAME_SIZE]; freshest = NULL; rdev_for_each_safe(rdev, tmp, mddev) @@ -3323,53 +3895,54 @@ static void analyze_sbs(struct mddev * mddev) case 0: break; default: - printk( KERN_ERR \ - "md: fatal superblock inconsistency in %s" - " -- removing from array\n", - bdevname(rdev->bdev,b)); - kick_rdev_from_array(rdev); + pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", + rdev->bdev); + md_kick_rdev_from_array(rdev); } + /* Cannot find a valid fresh disk */ + if (!freshest) { + pr_warn("md: cannot find a valid disk\n"); + return -EINVAL; + } super_types[mddev->major_version]. - validate_super(mddev, freshest); + validate_super(mddev, NULL/*freshest*/, freshest); i = 0; rdev_for_each_safe(rdev, tmp, mddev) { if (mddev->max_disks && (rdev->desc_nr >= mddev->max_disks || i > mddev->max_disks)) { - printk(KERN_WARNING - "md: %s: %s: only %d devices permitted\n", - mdname(mddev), bdevname(rdev->bdev, b), - mddev->max_disks); - kick_rdev_from_array(rdev); + pr_warn("md: %s: %pg: only %d devices permitted\n", + mdname(mddev), rdev->bdev, + mddev->max_disks); + md_kick_rdev_from_array(rdev); continue; } - if (rdev != freshest) + if (rdev != freshest) { if (super_types[mddev->major_version]. - validate_super(mddev, rdev)) { - printk(KERN_WARNING "md: kicking non-fresh %s" - " from array!\n", - bdevname(rdev->bdev,b)); - kick_rdev_from_array(rdev); + validate_super(mddev, freshest, rdev)) { + pr_warn("md: kicking non-fresh %pg from array!\n", + rdev->bdev); + md_kick_rdev_from_array(rdev); continue; } - if (mddev->level == LEVEL_MULTIPATH) { - rdev->desc_nr = i++; - rdev->raid_disk = rdev->desc_nr; - set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { + } + if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && + !test_bit(Journal, &rdev->flags)) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } } + + return 0; } /* Read a fixed-point number. * Numbers in sysfs attributes should be in "standard" units where * possible, so time should be in seconds. - * However we internally use a a much smaller unit such as + * However we internally use a a much smaller unit such as * milliseconds or jiffies. * This function takes a decimal number with a possible fractional * component, and produces an integer which is the result of @@ -3398,39 +3971,40 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) return -EINVAL; if (decimals < 0) decimals = 0; - while (decimals < scale) { - result *= 10; - decimals ++; - } - *res = result; + *res = result * int_pow(10, scale - decimals); return 0; } - -static void md_safemode_timeout(unsigned long data); - static ssize_t safe_delay_show(struct mddev *mddev, char *page) { - int msec = (mddev->safemode_delay*1000)/HZ; - return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); + unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; + + return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); } static ssize_t safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) { unsigned long msec; - if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) + if (mddev_is_clustered(mddev)) { + pr_warn("md: Safemode is disabled for clustered mode\n"); + return -EINVAL; + } + + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) return -EINVAL; if (msec == 0) mddev->safemode_delay = 0; else { unsigned long old_delay = mddev->safemode_delay; - mddev->safemode_delay = (msec*HZ)/1000; - if (mddev->safemode_delay == 0) - mddev->safemode_delay = 1; - if (mddev->safemode_delay < old_delay) - md_safemode_timeout((unsigned long)mddev); + unsigned long new_delay = (msec*HZ)/1000; + + if (new_delay == 0) + new_delay = 1; + mddev->safemode_delay = new_delay; + if (new_delay < old_delay || old_delay == 0) + mod_timer(&mddev->safemode_timer, jiffies+1); } return len; } @@ -3440,39 +4014,52 @@ __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); static ssize_t level_show(struct mddev *mddev, char *page) { - struct md_personality *p = mddev->pers; + struct md_personality *p; + int ret; + spin_lock(&mddev->lock); + p = mddev->pers; if (p) - return sprintf(page, "%s\n", p->name); + ret = sprintf(page, "%s\n", p->head.name); else if (mddev->clevel[0]) - return sprintf(page, "%s\n", mddev->clevel); + ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) - return sprintf(page, "%d\n", mddev->level); + ret = sprintf(page, "%d\n", mddev->level); else - return 0; + ret = 0; + spin_unlock(&mddev->lock); + return ret; } static ssize_t level_store(struct mddev *mddev, const char *buf, size_t len) { char clevel[16]; - ssize_t rv = len; - struct md_personality *pers; + ssize_t rv; + size_t slen = len; + struct md_personality *pers, *oldpers; long level; - void *priv; + void *priv, *oldpriv; struct md_rdev *rdev; + if (slen == 0 || slen >= sizeof(clevel)) + return -EINVAL; + + rv = mddev_suspend_and_lock(mddev); + if (rv) + return rv; + if (mddev->pers == NULL) { - if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') - len--; - mddev->clevel[len] = 0; + memcpy(mddev->clevel, buf, slen); + if (mddev->clevel[slen-1] == '\n') + slen--; + mddev->clevel[slen] = 0; mddev->level = LEVEL_NONE; - return rv; + rv = len; + goto out_unlock; } + rv = -EROFS; + if (!md_is_rdwr(mddev)) + goto out_unlock; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape @@ -3480,48 +4067,47 @@ level_store(struct mddev *mddev, const char *buf, size_t len) * - new personality will access other array. */ - if (mddev->sync_thread || + rv = -EBUSY; + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->reshape_position != MaxSector || mddev->sysfs_active) - return -EBUSY; + goto out_unlock; + rv = -EINVAL; if (!mddev->pers->quiesce) { - printk(KERN_WARNING "md: %s: %s does not support online personality change\n", - mdname(mddev), mddev->pers->name); - return -EINVAL; + pr_warn("md: %s: %s does not support online personality change\n", + mdname(mddev), mddev->pers->head.name); + goto out_unlock; } /* Now find the new personality */ - if (len == 0 || len >= sizeof(clevel)) - return -EINVAL; - strncpy(clevel, buf, len); - if (clevel[len-1] == '\n') - len--; - clevel[len] = 0; + memcpy(clevel, buf, slen); + if (clevel[slen-1] == '\n') + slen--; + clevel[slen] = 0; if (kstrtol(clevel, 10, &level)) level = LEVEL_NONE; if (request_module("md-%s", clevel) != 0) request_module("md-level-%s", clevel); - spin_lock(&pers_lock); - pers = find_pers(level, clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - printk(KERN_WARNING "md: personality %s not loaded\n", clevel); - return -EINVAL; + pers = get_pers(level, clevel); + if (!pers) { + rv = -EINVAL; + goto out_unlock; } - spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! */ - module_put(pers->owner); - return rv; + put_pers(pers); + rv = len; + goto out_unlock; } if (!pers->takeover) { - module_put(pers->owner); - printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", - mdname(mddev), clevel); - return -EINVAL; + put_pers(pers); + pr_warn("md: %s: %s does not support personality takeover\n", + mdname(mddev), clevel); + rv = -EINVAL; + goto out_unlock; } rdev_for_each(rdev, mddev) @@ -3538,33 +4124,31 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; mddev->reshape_backwards = 0; - module_put(pers->owner); - printk(KERN_WARNING "md: %s: %s would not accept array\n", - mdname(mddev), clevel); - return PTR_ERR(priv); + put_pers(pers); + pr_warn("md: %s: %s would not accept array\n", + mdname(mddev), clevel); + rv = PTR_ERR(priv); + goto out_unlock; } /* Looks like we have a winner */ - mddev_suspend(mddev); - mddev->pers->stop(mddev); - - if (mddev->pers->sync_request == NULL && - pers->sync_request != NULL) { - /* need to add the md_redundancy_group */ - if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) - printk(KERN_WARNING - "md: cannot register extra attributes for %s\n", - mdname(mddev)); - mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); - } - if (mddev->pers->sync_request != NULL && - pers->sync_request == NULL) { - /* need to remove the md_redundancy_group */ - if (mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - } + mddev_detach(mddev); + + spin_lock(&mddev->lock); + oldpers = mddev->pers; + oldpriv = mddev->private; + mddev->pers = pers; + mddev->private = priv; + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = mddev->new_chunk_sectors; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->degraded = 0; + spin_unlock(&mddev->lock); - if (mddev->pers->sync_request == NULL && + if (oldpers->sync_request == NULL && mddev->external) { /* We are converting from a no-redundancy array * to a redundancy array and metadata is managed @@ -3578,6 +4162,27 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->safemode = 0; } + oldpers->free(mddev, oldpriv); + + if (oldpers->sync_request == NULL && + pers->sync_request != NULL) { + /* need to add the md_redundancy_group */ + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + pr_warn("md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); + } + if (oldpers->sync_request != NULL && + pers->sync_request == NULL) { + /* need to remove the md_redundancy_group */ + if (mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + } + + put_pers(oldpers); + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; @@ -3597,40 +4202,140 @@ level_store(struct mddev *mddev, const char *buf, size_t len) clear_bit(In_sync, &rdev->flags); else { if (sysfs_link_rdev(mddev, rdev)) - printk(KERN_WARNING "md: cannot register rd%d" - " for %s after level change\n", - rdev->raid_disk, mdname(mddev)); + pr_warn("md: cannot register rd%d for %s after level change\n", + rdev->raid_disk, mdname(mddev)); } } - module_put(mddev->pers->owner); - mddev->pers = pers; - mddev->private = priv; - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); - mddev->level = mddev->new_level; - mddev->layout = mddev->new_layout; - mddev->chunk_sectors = mddev->new_chunk_sectors; - mddev->delta_disks = 0; - mddev->reshape_backwards = 0; - mddev->degraded = 0; - if (mddev->pers->sync_request == NULL) { + if (pers->sync_request == NULL) { /* this is now an array without redundancy, so * it must always be in_sync */ mddev->in_sync = 1; - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); } pers->run(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - mddev_resume(mddev); - sysfs_notify(&mddev->kobj, NULL, "level"); - md_new_event(mddev); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + if (!mddev->thread) + md_update_sb(mddev, 1); + sysfs_notify_dirent_safe(mddev->sysfs_level); + md_new_event(); + rv = len; +out_unlock: + mddev_unlock_and_resume(mddev); return rv; } static struct md_sysfs_entry md_level = __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); +static ssize_t +new_level_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->new_level); +} + +static ssize_t +new_level_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int n; + int err; + + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; + err = mddev_lock(mddev); + if (err) + return err; + + mddev->new_level = n; + md_update_sb(mddev, 1); + + mddev_unlock(mddev); + return len; +} +static struct md_sysfs_entry md_new_level = +__ATTR(new_level, 0664, new_level_show, new_level_store); + +static ssize_t +bitmap_type_show(struct mddev *mddev, char *page) +{ + struct md_submodule_head *head; + unsigned long i; + ssize_t len = 0; + + if (mddev->bitmap_id == ID_BITMAP_NONE) + len += sprintf(page + len, "[none] "); + else + len += sprintf(page + len, "none "); + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_BITMAP) + continue; + + if (mddev->bitmap_id == head->id) + len += sprintf(page + len, "[%s] ", head->name); + else + len += sprintf(page + len, "%s ", head->name); + } + xa_unlock(&md_submodule); + + len += sprintf(page + len, "\n"); + return len; +} + +static ssize_t +bitmap_type_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct md_submodule_head *head; + enum md_submodule_id id; + unsigned long i; + int err = 0; + + xa_lock(&md_submodule); + + if (mddev->bitmap_ops) { + err = -EBUSY; + goto out; + } + + if (cmd_match(buf, "none")) { + mddev->bitmap_id = ID_BITMAP_NONE; + goto out; + } + + xa_for_each(&md_submodule, i, head) { + if (head->type == MD_BITMAP && cmd_match(buf, head->name)) { + mddev->bitmap_id = head->id; + goto out; + } + } + + err = kstrtoint(buf, 10, &id); + if (err) + goto out; + + if (id == ID_BITMAP_NONE) { + mddev->bitmap_id = id; + goto out; + } + + head = xa_load(&md_submodule, id); + if (head && head->type == MD_BITMAP) { + mddev->bitmap_id = id; + goto out; + } + + err = -ENOENT; + +out: + xa_unlock(&md_submodule); + return err ? err : len; +} + +static struct md_sysfs_entry md_bitmap_type = +__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store); static ssize_t layout_show(struct mddev *mddev, char *page) @@ -3646,33 +4351,38 @@ layout_show(struct mddev *mddev, char *page) static ssize_t layout_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); + unsigned int n; + int err; - if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { - int err; if (mddev->pers->check_reshape == NULL) - return -EBUSY; - mddev->new_layout = n; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_layout = mddev->layout; - return err; + err = -EBUSY; + else if (!md_is_rdwr(mddev)) + err = -EROFS; + else { + mddev->new_layout = n; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_layout = mddev->layout; } } else { mddev->new_layout = n; if (mddev->reshape_position == MaxSector) mddev->layout = n; } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_layout = __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); - static ssize_t raid_disks_show(struct mddev *mddev, char *page) { @@ -3690,38 +4400,53 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks); static ssize_t raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - int rv = 0; - unsigned long n = simple_strtoul(buf, &e, 10); + unsigned int n; + int err; - if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) - rv = update_raid_disks(mddev, n); + err = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; + err = -EINVAL; rdev_for_each(rdev, mddev) { if (olddisks < n && rdev->data_offset < rdev->new_data_offset) - return -EINVAL; + goto out_unlock; if (olddisks > n && rdev->data_offset > rdev->new_data_offset) - return -EINVAL; + goto out_unlock; } + err = 0; mddev->delta_disks = n - olddisks; mddev->raid_disks = n; mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; - return rv ? rv : len; +out_unlock: + mddev_unlock(mddev); + return err ? err : len; } static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t +uuid_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%pU\n", mddev->uuid); +} +static struct md_sysfs_entry md_uuid = +__ATTR(uuid, S_IRUGO, uuid_show, NULL); + +static ssize_t chunk_size_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector && @@ -3735,28 +4460,34 @@ chunk_size_show(struct mddev *mddev, char *page) static ssize_t chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); + unsigned long n; + int err; - if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = kstrtoul(buf, 10, &n); + if (err < 0) + return err; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { - int err; if (mddev->pers->check_reshape == NULL) - return -EBUSY; - mddev->new_chunk_sectors = n >> 9; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_chunk_sectors = mddev->chunk_sectors; - return err; + err = -EBUSY; + else if (!md_is_rdwr(mddev)) + err = -EROFS; + else { + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_chunk_sectors = mddev->chunk_sectors; } } else { mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) mddev->chunk_sectors = n >> 9; } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); @@ -3764,31 +4495,44 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(struct mddev *mddev, char *page) { - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) return sprintf(page, "none\n"); - return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); + return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); } static ssize_t resync_start_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long long n = simple_strtoull(buf, &e, 10); + unsigned long long n; + int err; - if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - return -EBUSY; if (cmd_match(buf, "none")) n = MaxSector; - else if (!*buf || (*e && *e != '\n')) - return -EINVAL; + else { + err = kstrtoull(buf, 10, &n); + if (err < 0) + return err; + if (n != (sector_t)n) + return -EINVAL; + } - mddev->recovery_cp = n; - if (mddev->pers) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - return len; + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + err = -EBUSY; + + if (!err) { + mddev->resync_offset = n; + if (mddev->pers) + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + } + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_resync_start = -__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); +__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, + resync_start_show, resync_start_store); /* * The array state can be: @@ -3825,12 +4569,16 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); * active-idle * like active, but no writes have been seen for a while (100msec). * + * broken +* Array is failed. It's useful because mounted-arrays aren't stopped +* when array is failed, so this state will at least alert the user that +* something is wrong. */ enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, - write_pending, active_idle, bad_word}; + write_pending, active_idle, broken, bad_word}; static char *array_states[] = { "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", - "write-pending", "active-idle", NULL }; + "write-pending", "active-idle", "broken", NULL }; static int match_word(const char *word, char **list) { @@ -3846,25 +4594,30 @@ array_state_show(struct mddev *mddev, char *page) { enum array_state st = inactive; - if (mddev->pers) + if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { switch(mddev->ro) { - case 1: + case MD_RDONLY: st = readonly; break; - case 2: + case MD_AUTO_READ: st = read_auto; break; - case 0: - if (mddev->in_sync) - st = clean; - else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) + case MD_RDWR: + spin_lock(&mddev->lock); + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) st = write_pending; + else if (mddev->in_sync) + st = clean; else if (mddev->safemode) st = active_idle; else st = active; + spin_unlock(&mddev->lock); } - else { + + if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) + st = broken; + } else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && mddev->dev_sectors == 0) @@ -3875,102 +4628,143 @@ array_state_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", array_states[st]); } -static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); -static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); -static int do_md_run(struct mddev * mddev); +static int do_md_stop(struct mddev *mddev, int ro); +static int md_set_readonly(struct mddev *mddev); static int restart_array(struct mddev *mddev); static ssize_t array_state_store(struct mddev *mddev, const char *buf, size_t len) { - int err = -EINVAL; + int err = 0; enum array_state st = match_word(buf, array_states); - switch(st) { + + /* No lock dependent actions */ + switch (st) { + case suspended: /* not supported yet */ + case write_pending: /* cannot be set */ + case active_idle: /* cannot be set */ + case broken: /* cannot be set */ case bad_word: - break; + return -EINVAL; case clear: - /* stopping an active array */ - err = do_md_stop(mddev, 0, NULL); + case readonly: + case inactive: + case read_auto: + if (!mddev->pers || !md_is_rdwr(mddev)) + break; + /* write sysfs will not open mddev and opener should be 0 */ + err = mddev_set_closing_and_sync_blockdev(mddev, 0); + if (err) + return err; + break; + default: break; + } + + if (mddev->pers && (st == active || st == clean) && + mddev->ro != MD_RDONLY) { + /* don't take reconfig_mutex when toggling between + * clean and active + */ + spin_lock(&mddev->lock); + if (st == active) { + restart_array(mddev); + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); + wake_up(&mddev->sb_wait); + } else /* st == clean */ { + restart_array(mddev); + if (!set_in_sync(mddev)) + err = -EBUSY; + } + if (!err) + sysfs_notify_dirent_safe(mddev->sysfs_state); + spin_unlock(&mddev->lock); + return err ?: len; + } + err = mddev_lock(mddev); + if (err) + return err; + + switch (st) { case inactive: - /* stopping an active array */ + /* stop an active array, return 0 otherwise */ if (mddev->pers) - err = do_md_stop(mddev, 2, NULL); - else - err = 0; /* already inactive */ + err = do_md_stop(mddev, 2); + break; + case clear: + err = do_md_stop(mddev, 0); break; - case suspended: - break; /* not supported yet */ case readonly: if (mddev->pers) - err = md_set_readonly(mddev, NULL); + err = md_set_readonly(mddev); else { - mddev->ro = 1; + mddev->ro = MD_RDONLY; set_disk_ro(mddev->gendisk, 1); err = do_md_run(mddev); } break; case read_auto: if (mddev->pers) { - if (mddev->ro == 0) - err = md_set_readonly(mddev, NULL); - else if (mddev->ro == 1) + if (md_is_rdwr(mddev)) + err = md_set_readonly(mddev); + else if (mddev->ro == MD_RDONLY) err = restart_array(mddev); if (err == 0) { - mddev->ro = 2; + mddev->ro = MD_AUTO_READ; set_disk_ro(mddev->gendisk, 0); } } else { - mddev->ro = 2; + mddev->ro = MD_AUTO_READ; err = do_md_run(mddev); } break; case clean: if (mddev->pers) { - restart_array(mddev); - spin_lock_irq(&mddev->write_lock); - if (atomic_read(&mddev->writes_pending) == 0) { - if (mddev->in_sync == 0) { - mddev->in_sync = 1; - if (mddev->safemode == 1) - mddev->safemode = 0; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - } - err = 0; - } else + err = restart_array(mddev); + if (err) + break; + spin_lock(&mddev->lock); + if (!set_in_sync(mddev)) err = -EBUSY; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); } else err = -EINVAL; break; case active: if (mddev->pers) { - restart_array(mddev); - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + err = restart_array(mddev); + if (err) + break; + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); wake_up(&mddev->sb_wait); err = 0; } else { - mddev->ro = 0; + mddev->ro = MD_RDWR; set_disk_ro(mddev->gendisk, 0); err = do_md_run(mddev); } break; - case write_pending: - case active_idle: - /* these cannot be set */ + default: + err = -EINVAL; break; } - if (err) - return err; - else { + + if (!err) { if (mddev->hold_active == UNTIL_IOCTL) mddev->hold_active = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); - return len; } + mddev_unlock(mddev); + + if (st == readonly || st == read_auto || st == inactive || + (err && st == clear)) + clear_bit(MD_CLOSING, &mddev->flags); + + return err ?: len; } static struct md_sysfs_entry md_array_state = -__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); +__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); static ssize_t max_corrected_read_errors_show(struct mddev *mddev, char *page) { @@ -3981,14 +4775,16 @@ max_corrected_read_errors_show(struct mddev *mddev, char *page) { static ssize_t max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); + unsigned int n; + int rv; - if (*buf && (*e == 0 || *e == '\n')) { - atomic_set(&mddev->max_corr_read_errors, n); - return len; - } - return -EINVAL; + rv = kstrtouint(buf, 10, &n); + if (rv < 0) + return rv; + if (n > INT_MAX) + return -EINVAL; + atomic_set(&mddev->max_corr_read_errors, n); + return len; } static struct md_sysfs_entry max_corr_read_errors = @@ -4028,7 +4824,9 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - + err = mddev_suspend_and_lock(mddev); + if (err) + return err; if (mddev->persistent) { rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); @@ -4046,12 +4844,17 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) else rdev = md_import_device(dev, -1, -1); - if (IS_ERR(rdev)) + if (IS_ERR(rdev)) { + mddev_unlock_and_resume(mddev); return PTR_ERR(rdev); + } err = bind_rdev_to_array(rdev, mddev); out: if (err) - export_rdev(rdev); + export_rdev(rdev, mddev); + mddev_unlock_and_resume(mddev); + if (!err) + md_new_event(); return err ? err : len; } @@ -4063,24 +4866,38 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) { char *end; unsigned long chunk, end_chunk; + int err; + + if (!md_bitmap_enabled(mddev, false)) + return len; + err = mddev_lock(mddev); + if (err) + return err; if (!mddev->bitmap) goto out; /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ while (*buf) { chunk = end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; + if (buf == end) + break; + if (*end == '-') { /* range */ buf = end + 1; end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; + if (buf == end) + break; } - if (*end && !isspace(*end)) break; - bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); + + if (*end && !isspace(*end)) + break; + + mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); buf = skip_spaces(end); } - bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ + mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ out: + mddev_unlock(mddev); return len; } @@ -4108,9 +4925,13 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err < 0) return err; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { err = update_size(mddev, sectors); - md_update_sb(mddev, 1); + if (err == 0) + md_update_sb(mddev, 1); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) @@ -4118,13 +4939,13 @@ size_store(struct mddev *mddev, const char *buf, size_t len) else err = -ENOSPC; } + mddev_unlock(mddev); return err ? err : len; } static struct md_sysfs_entry md_size = __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); - /* Metadata version. * This is one of * 'none' for arrays with no metadata (good luck...) @@ -4148,27 +4969,34 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) { int major, minor; char *e; + int err; /* Changing the details of 'external' metadata is * always permitted. Otherwise there must be * no devices attached to the array. */ + + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; if (mddev->external && strncmp(buf, "external:", 9) == 0) ; else if (!list_empty(&mddev->disks)) - return -EBUSY; + goto out_unlock; + err = 0; if (cmd_match(buf, "none")) { mddev->persistent = 0; mddev->external = 0; mddev->major_version = 0; mddev->minor_version = 90; - return len; + goto out_unlock; } if (strncmp(buf, "external:", 9) == 0) { size_t namelen = len-9; if (namelen >= sizeof(mddev->metadata_type)) namelen = sizeof(mddev->metadata_type)-1; - strncpy(mddev->metadata_type, buf+9, namelen); + memcpy(mddev->metadata_type, buf+9, namelen); mddev->metadata_type[namelen] = 0; if (namelen && mddev->metadata_type[namelen-1] == '\n') mddev->metadata_type[--namelen] = 0; @@ -4176,110 +5004,348 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) mddev->external = 1; mddev->major_version = 0; mddev->minor_version = 90; - return len; + goto out_unlock; } major = simple_strtoul(buf, &e, 10); + err = -EINVAL; if (e==buf || *e != '.') - return -EINVAL; + goto out_unlock; buf = e+1; minor = simple_strtoul(buf, &e, 10); if (e==buf || (*e && *e != '\n') ) - return -EINVAL; + goto out_unlock; + err = -ENOENT; if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) - return -ENOENT; + goto out_unlock; mddev->major_version = major; mddev->minor_version = minor; mddev->persistent = 1; mddev->external = 0; - return len; + err = 0; +out_unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_metadata = -__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); +__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); + +static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors) +{ + return rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sectors; +} + +static enum sync_action md_get_active_sync_action(struct mddev *mddev) +{ + struct md_rdev *rdev; + bool is_recover = false; + + if (mddev->resync_offset < MaxSector) + return ACTION_RESYNC; + + if (mddev->reshape_position != MaxSector) + return ACTION_RESHAPE; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev_needs_recovery(rdev, MaxSector)) { + is_recover = true; + break; + } + } + rcu_read_unlock(); + + return is_recover ? ACTION_RECOVER : ACTION_IDLE; +} + +enum sync_action md_sync_action(struct mddev *mddev) +{ + unsigned long recovery = mddev->recovery; + enum sync_action active_action; + + /* + * frozen has the highest priority, means running sync_thread will be + * stopped immediately, and no new sync_thread can start. + */ + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) + return ACTION_FROZEN; + + /* + * read-only array can't register sync_thread, and it can only + * add/remove spares. + */ + if (!md_is_rdwr(mddev)) + return ACTION_IDLE; + + /* + * idle means no sync_thread is running, and no new sync_thread is + * requested. + */ + if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && + !test_bit(MD_RECOVERY_NEEDED, &recovery)) + return ACTION_IDLE; + + /* + * Check if any sync operation (resync/recover/reshape) is + * currently active. This ensures that only one sync operation + * can run at a time. Returns the type of active operation, or + * ACTION_IDLE if none are active. + */ + active_action = md_get_active_sync_action(mddev); + if (active_action != ACTION_IDLE) + return active_action; + + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) + return ACTION_RESHAPE; + + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + return ACTION_RECOVER; + + if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + /* + * MD_RECOVERY_CHECK must be paired with + * MD_RECOVERY_REQUESTED. + */ + if (test_bit(MD_RECOVERY_CHECK, &recovery)) + return ACTION_CHECK; + if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) + return ACTION_REPAIR; + return ACTION_RESYNC; + } + + /* + * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no + * sync_action is specified. + */ + return ACTION_IDLE; +} + +enum sync_action md_sync_action_by_name(const char *page) +{ + enum sync_action action; + + for (action = 0; action < NR_SYNC_ACTIONS; ++action) { + if (cmd_match(page, action_name[action])) + return action; + } + + return NR_SYNC_ACTIONS; +} + +const char *md_sync_action_name(enum sync_action action) +{ + return action_name[action]; +} static ssize_t action_show(struct mddev *mddev, char *page) { - char *type = "idle"; - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - type = "frozen"; - else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - type = "reshape"; - else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - type = "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) - type = "check"; - else - type = "repair"; - } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) - type = "recover"; + enum sync_action action = md_sync_action(mddev); + + return sprintf(page, "%s\n", md_sync_action_name(action)); +} + +/** + * stop_sync_thread() - wait for sync_thread to stop if it's running. + * @mddev: the array. + * @locked: if set, reconfig_mutex will still be held after this function + * return; if not set, reconfig_mutex will be released after this + * function return. + */ +static void stop_sync_thread(struct mddev *mddev, bool locked) +{ + int sync_seq = atomic_read(&mddev->sync_seq); + + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + if (!locked) + mddev_unlock(mddev); + return; } - return sprintf(page, "%s\n", type); + + mddev_unlock(mddev); + + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(&mddev->sync_thread); + if (work_pending(&mddev->sync_work)) + flush_work(&mddev->sync_work); + + wait_event(resync_wait, + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && + sync_seq != atomic_read(&mddev->sync_seq))); + + if (locked) + mddev_lock_nointr(mddev); +} + +void md_idle_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true); +} +EXPORT_SYMBOL_GPL(md_idle_sync_thread); + +void md_frozen_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true); +} +EXPORT_SYMBOL_GPL(md_frozen_sync_thread); + +void md_unfrozen_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + sysfs_notify_dirent_safe(mddev->sysfs_action); +} +EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); + +static int mddev_start_reshape(struct mddev *mddev) +{ + int ret; + + if (mddev->pers->start_reshape == NULL) + return -EINVAL; + + if (mddev->reshape_position == MaxSector || + mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev)) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = mddev->pers->start_reshape(mddev); + if (ret) + return ret; + } else { + /* + * If reshape is still in progress, and md_check_recovery() can + * continue to reshape, don't restart reshape because data can + * be corrupted for raid456. + */ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + } + + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + return 0; } static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { + int ret; + enum sync_action action; + if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (cmd_match(page, "frozen")) - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +retry: + if (work_busy(&mddev->sync_work)) + flush_work(&mddev->sync_work); - if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); + ret = mddev_lock(mddev); + if (ret) + return ret; + + if (work_busy(&mddev->sync_work)) { + mddev_unlock(mddev); + goto retry; + } + + action = md_sync_action_by_name(page); + + /* TODO: mdadm rely on "idle" to start sync_thread. */ + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + switch (action) { + case ACTION_FROZEN: + md_frozen_sync_thread(mddev); + ret = len; + goto out; + case ACTION_IDLE: + md_idle_sync_thread(mddev); + break; + case ACTION_RESHAPE: + case ACTION_RECOVER: + case ACTION_CHECK: + case ACTION_REPAIR: + case ACTION_RESYNC: + ret = -EBUSY; + goto out; + default: + ret = -EINVAL; + goto out; } - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) - return -EBUSY; - else if (cmd_match(page, "resync")) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - else if (cmd_match(page, "recover")) { - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - } else if (cmd_match(page, "reshape")) { - int err; - if (mddev->pers->start_reshape == NULL) - return -EINVAL; - err = mddev->pers->start_reshape(mddev); - if (err) - return err; - sysfs_notify(&mddev->kobj, NULL, "degraded"); } else { - if (cmd_match(page, "check")) + switch (action) { + case ACTION_FROZEN: + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = len; + goto out; + case ACTION_RESHAPE: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = mddev_start_reshape(mddev); + if (ret) + goto out; + break; + case ACTION_RECOVER: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + break; + case ACTION_CHECK: set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (!cmd_match(page, "repair")) - return -EINVAL; - set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + fallthrough; + case ACTION_REPAIR: + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + fallthrough; + case ACTION_RESYNC: + case ACTION_IDLE: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + break; + default: + ret = -EINVAL; + goto out; + } } - if (mddev->ro == 2) { + + if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ - mddev->ro = 0; + mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); - return len; + ret = len; + +out: + mddev_unlock(mddev); + return ret; } static struct md_sysfs_entry md_scan_mode = -__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); +__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); static ssize_t last_sync_action_show(struct mddev *mddev, char *page) { - return sprintf(page, "%s\n", mddev->last_sync_action); + return sprintf(page, "%s\n", + md_sync_action_name(mddev->last_sync_action)); } static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); @@ -4298,21 +5364,24 @@ static ssize_t sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), - mddev->sync_speed_min ? "local": "system"); + mddev->sync_speed_min ? "local" : "system"); } static ssize_t sync_min_store(struct mddev *mddev, const char *buf, size_t len) { - int min; - char *e; - if (strncmp(buf, "system", 6)==0) { - mddev->sync_speed_min = 0; - return len; + unsigned int min; + int rv; + + if (strncmp(buf, "system", 6) == 0) { + min = 0; + } else { + rv = kstrtouint(buf, 10, &min); + if (rv < 0) + return rv; + if (min == 0) + return -EINVAL; } - min = simple_strtoul(buf, &e, 10); - if (buf == e || (*e && *e != '\n') || min <= 0) - return -EINVAL; mddev->sync_speed_min = min; return len; } @@ -4324,21 +5393,24 @@ static ssize_t sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), - mddev->sync_speed_max ? "local": "system"); + mddev->sync_speed_max ? "local" : "system"); } static ssize_t sync_max_store(struct mddev *mddev, const char *buf, size_t len) { - int max; - char *e; - if (strncmp(buf, "system", 6)==0) { - mddev->sync_speed_max = 0; - return len; + unsigned int max; + int rv; + + if (strncmp(buf, "system", 6) == 0) { + max = 0; + } else { + rv = kstrtouint(buf, 10, &max); + if (rv < 0) + return rv; + if (max == 0) + return -EINVAL; } - max = simple_strtoul(buf, &e, 10); - if (buf == e || (*e && *e != '\n') || max <= 0) - return -EINVAL; mddev->sync_speed_max = max; return len; } @@ -4347,6 +5419,35 @@ static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t +sync_io_depth_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), + mddev->sync_io_depth ? "local" : "system"); +} + +static ssize_t +sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int max; + int rv; + + if (strncmp(buf, "system", 6) == 0) { + max = 0; + } else { + rv = kstrtouint(buf, 10, &max); + if (rv < 0) + return rv; + if (max == 0) + return -EINVAL; + } + mddev->sync_io_depth = max; + return len; +} + +static struct md_sysfs_entry md_sync_io_depth = +__ATTR_RW(sync_io_depth); + +static ssize_t degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); @@ -4387,7 +5488,7 @@ static ssize_t sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; - if (mddev->curr_resync == 0) + if (mddev->curr_resync == MD_RESYNC_NONE) return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; @@ -4406,8 +5507,8 @@ sync_completed_show(struct mddev *mddev, char *page) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); - if (mddev->curr_resync == 1 || - mddev->curr_resync == 2) + if (mddev->curr_resync == MD_RESYNC_YIELDED || + mddev->curr_resync == MD_RESYNC_DELAYED) return sprintf(page, "delayed\n"); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || @@ -4420,7 +5521,8 @@ sync_completed_show(struct mddev *mddev, char *page) return sprintf(page, "%llu / %llu\n", resync, max_sectors); } -static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); +static struct md_sysfs_entry md_sync_completed = + __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); static ssize_t min_sync_show(struct mddev *mddev, char *page) @@ -4432,22 +5534,27 @@ static ssize_t min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; + int err; + if (kstrtoull(buf, 10, &min)) return -EINVAL; + + spin_lock(&mddev->lock); + err = -EINVAL; if (min > mddev->resync_max) - return -EINVAL; + goto out_unlock; + + err = -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; + goto out_unlock; - /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { - sector_t temp = min; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; - } - mddev->resync_min = min; + /* Round down to multiple of 4K for safety */ + mddev->resync_min = round_down(min, 8); + err = 0; - return len; +out_unlock: + spin_unlock(&mddev->lock); + return err ?: len; } static struct md_sysfs_entry md_min_sync = @@ -4465,29 +5572,41 @@ max_sync_show(struct mddev *mddev, char *page) static ssize_t max_sync_store(struct mddev *mddev, const char *buf, size_t len) { + int err; + spin_lock(&mddev->lock); if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { unsigned long long max; + int chunk; + + err = -EINVAL; if (kstrtoull(buf, 10, &max)) - return -EINVAL; + goto out_unlock; if (max < mddev->resync_min) - return -EINVAL; - if (max < mddev->resync_max && - mddev->ro == 0 && + goto out_unlock; + + err = -EBUSY; + if (max < mddev->resync_max && md_is_rdwr(mddev) && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; + goto out_unlock; /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { + chunk = mddev->chunk_sectors; + if (chunk) { sector_t temp = max; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; + + err = -EINVAL; + if (sector_div(temp, chunk)) + goto out_unlock; } mddev->resync_max = max; } wake_up(&mddev->recovery_wait); - return len; + err = 0; +out_unlock: + spin_unlock(&mddev->lock); + return err ?: len; } static struct md_sysfs_entry md_max_sync = @@ -4496,65 +5615,60 @@ __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); static ssize_t suspend_lo_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); + return sprintf(page, "%llu\n", + (unsigned long long)READ_ONCE(mddev->suspend_lo)); } static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_lo; + unsigned long long new; + int err; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; - if (buf == e || (*e && *e != '\n')) + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) return -EINVAL; - mddev->suspend_lo = new; - if (new >= old) - /* Shrinking suspended region */ - mddev->pers->quiesce(mddev, 2); - else { - /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } + err = mddev_suspend(mddev, true); + if (err) + return err; + + WRITE_ONCE(mddev->suspend_lo, new); + mddev_resume(mddev); + return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); - static ssize_t suspend_hi_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); + return sprintf(page, "%llu\n", + (unsigned long long)READ_ONCE(mddev->suspend_hi)); } static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_hi; + unsigned long long new; + int err; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; - if (buf == e || (*e && *e != '\n')) + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) return -EINVAL; - mddev->suspend_hi = new; - if (new <= old) - /* Shrinking suspended region */ - mddev->pers->quiesce(mddev, 2); - else { - /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } + err = mddev_suspend(mddev, true); + if (err) + return err; + + WRITE_ONCE(mddev->suspend_hi, new); + mddev_resume(mddev); + return len; } static struct md_sysfs_entry md_suspend_hi = @@ -4574,12 +5688,20 @@ static ssize_t reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { struct md_rdev *rdev; - char *e; - unsigned long long new = simple_strtoull(buf, &e, 10); - if (mddev->pers) - return -EBUSY; - if (buf == e || (*e && *e != '\n')) + unsigned long long new; + int err; + + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; + if (mddev->pers) + goto unlock; mddev->reshape_position = new; mddev->delta_disks = 0; mddev->reshape_backwards = 0; @@ -4588,7 +5710,10 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) mddev->new_chunk_sectors = mddev->chunk_sectors; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_reshape_position = @@ -4606,6 +5731,8 @@ static ssize_t reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) { int backwards = 0; + int err; + if (cmd_match(buf, "forwards")) backwards = 0; else if (cmd_match(buf, "backwards")) @@ -4615,16 +5742,19 @@ reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->reshape_backwards == backwards) return len; + err = mddev_lock(mddev); + if (err) + return err; /* check if we are allowed to change */ if (mddev->delta_disks) - return -EBUSY; - - if (mddev->persistent && + err = -EBUSY; + else if (mddev->persistent && mddev->major_version == 0) - return -EINVAL; - - mddev->reshape_backwards = backwards; - return len; + err = -EINVAL; + else + mddev->reshape_backwards = backwards; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_reshape_direction = @@ -4645,6 +5775,17 @@ static ssize_t array_size_store(struct mddev *mddev, const char *buf, size_t len) { sector_t sectors; + int err; + + err = mddev_lock(mddev); + if (err) + return err; + + /* cluster raid doesn't support change array_sectors */ + if (mddev_is_clustered(mddev)) { + mddev_unlock(mddev); + return -EINVAL; + } if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) @@ -4655,29 +5796,219 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len) mddev->external_size = 0; } else { if (strict_blocks_to_sectors(buf, §ors) < 0) - return -EINVAL; - if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) - return -E2BIG; - - mddev->external_size = 1; + err = -EINVAL; + else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) + err = -E2BIG; + else + mddev->external_size = 1; } - mddev->array_sectors = sectors; - if (mddev->pers) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (!err) { + mddev->array_sectors = sectors; + if (mddev->pers) + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_array_size = __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, array_size_store); +static ssize_t +consistency_policy_show(struct mddev *mddev, char *page) +{ + int ret; + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + ret = sprintf(page, "journal\n"); + } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { + ret = sprintf(page, "ppl\n"); + } else if (mddev->bitmap) { + ret = sprintf(page, "bitmap\n"); + } else if (mddev->pers) { + if (mddev->pers->sync_request) + ret = sprintf(page, "resync\n"); + else + ret = sprintf(page, "none\n"); + } else { + ret = sprintf(page, "unknown\n"); + } + + return ret; +} + +static ssize_t +consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err = 0; + + if (mddev->pers) { + if (mddev->pers->change_consistency_policy) + err = mddev->pers->change_consistency_policy(mddev, buf); + else + err = -EBUSY; + } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { + set_bit(MD_HAS_PPL, &mddev->flags); + } else { + err = -EINVAL; + } + + return err ? err : len; +} + +static struct md_sysfs_entry md_consistency_policy = +__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, + consistency_policy_store); + +static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->fail_last_dev); +} + +/* + * Setting fail_last_dev to true to allow last device to be forcibly removed + * from RAID1/RAID10. + */ +static ssize_t +fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) +{ + int ret; + bool value; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + if (value != mddev->fail_last_dev) + mddev->fail_last_dev = value; + + return len; +} +static struct md_sysfs_entry md_fail_last_dev = +__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, + fail_last_dev_store); + +static ssize_t serialize_policy_show(struct mddev *mddev, char *page) +{ + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) + return sprintf(page, "n/a\n"); + else + return sprintf(page, "%d\n", mddev->serialize_policy); +} + +/* + * Setting serialize_policy to true to enforce write IO is not reordered + * for raid1. + */ +static ssize_t +serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return err; + + if (value == mddev->serialize_policy) + return len; + + err = mddev_suspend_and_lock(mddev); + if (err) + return err; + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { + pr_err("md: serialize_policy is only effective for raid1\n"); + err = -EINVAL; + goto unlock; + } + + if (value) + mddev_create_serial_pool(mddev, NULL); + else + mddev_destroy_serial_pool(mddev, NULL); + mddev->serialize_policy = value; +unlock: + mddev_unlock_and_resume(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_serialize_policy = +__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, + serialize_policy_store); + +static int mddev_set_logical_block_size(struct mddev *mddev, + unsigned int lbs) +{ + int err = 0; + struct queue_limits lim; + + if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) { + pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n", + mdname(mddev), lbs); + return -EINVAL; + } + + lim = queue_limits_start_update(mddev->gendisk->queue); + lim.logical_block_size = lbs; + pr_info("%s: logical_block_size is changed, data may be lost\n", + mdname(mddev)); + err = queue_limits_commit_update(mddev->gendisk->queue, &lim); + if (err) + return err; + + mddev->logical_block_size = lbs; + /* New lbs will be written to superblock after array is running */ + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + return 0; +} + +static ssize_t +lbs_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%u\n", mddev->logical_block_size); +} + +static ssize_t +lbs_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int lbs; + int err = -EBUSY; + + /* Only 1.x meta supports configurable LBS */ + if (mddev->major_version == 0) + return -EINVAL; + + if (mddev->pers) + return -EBUSY; + + err = kstrtouint(buf, 10, &lbs); + if (err < 0) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + goto unlock; + + err = mddev_set_logical_block_size(mddev, lbs); + +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_logical_block_size = +__ATTR(logical_block_size, 0644, lbs_show, lbs_store); + static struct attribute *md_default_attrs[] = { &md_level.attr, + &md_new_level.attr, + &md_bitmap_type.attr, &md_layout.attr, &md_raid_disks.attr, + &md_uuid.attr, &md_chunk_size.attr, &md_size.attr, &md_resync_start.attr, @@ -4689,15 +6020,24 @@ static struct attribute *md_default_attrs[] = { &md_reshape_direction.attr, &md_array_size.attr, &max_corr_read_errors.attr, + &md_consistency_policy.attr, + &md_fail_last_dev.attr, + &md_serialize_policy.attr, + &md_logical_block_size.attr, NULL, }; +static const struct attribute_group md_default_group = { + .attrs = md_default_attrs, +}; + static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, &md_last_scan_mode.attr, &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, + &md_sync_io_depth.attr, &md_sync_speed.attr, &md_sync_force_parallel.attr, &md_sync_completed.attr, @@ -4709,11 +6049,15 @@ static struct attribute *md_redundancy_attrs[] = { &md_degraded.attr, NULL, }; -static struct attribute_group md_redundancy_group = { +static const struct attribute_group md_redundancy_group = { .name = NULL, .attrs = md_redundancy_attrs, }; +static const struct attribute_group *md_attr_groups[] = { + &md_default_group, + NULL, +}; static ssize_t md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) @@ -4725,18 +6069,13 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; spin_lock(&all_mddevs_lock); - if (list_empty(&mddev->all_mddevs)) { + if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } - mddev_get(mddev); spin_unlock(&all_mddevs_lock); - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->show(mddev, page); - mddev_unlock(mddev); - } + rv = entry->show(mddev, page); mddev_put(mddev); return rv; } @@ -4748,95 +6087,188 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; + struct kernfs_node *kn = NULL; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + if (entry->store == array_state_store && cmd_match(page, "clear")) + kn = sysfs_break_active_protection(kobj, attr); + spin_lock(&all_mddevs_lock); - if (list_empty(&mddev->all_mddevs)) { + if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); + if (kn) + sysfs_unbreak_active_protection(kn); return -EBUSY; } - mddev_get(mddev); spin_unlock(&all_mddevs_lock); - if (entry->store == new_dev_store) - flush_workqueue(md_misc_wq); - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->store(mddev, page, length); - mddev_unlock(mddev); - } + rv = entry->store(mddev, page, length); mddev_put(mddev); + + if (kn) + sysfs_unbreak_active_protection(kn); + return rv; } -static void md_free(struct kobject *ko) +static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); - if (mddev->sysfs_state) - sysfs_put(mddev->sysfs_state); - - if (mddev->gendisk) { + if (legacy_async_del_gendisk) { + if (mddev->sysfs_state) + sysfs_put(mddev->sysfs_state); + if (mddev->sysfs_level) + sysfs_put(mddev->sysfs_level); del_gendisk(mddev->gendisk); - put_disk(mddev->gendisk); } - if (mddev->queue) - blk_cleanup_queue(mddev->queue); - - kfree(mddev); + put_disk(mddev->gendisk); } static const struct sysfs_ops md_sysfs_ops = { .show = md_attr_show, .store = md_attr_store, }; -static struct kobj_type md_ktype = { - .release = md_free, +static const struct kobj_type md_ktype = { + .release = md_kobj_release, .sysfs_ops = &md_sysfs_ops, - .default_attrs = md_default_attrs, + .default_groups = md_attr_groups, }; int mdp_major = 0; +/* stack the limit for all rdevs into lim */ +int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, + unsigned int flags) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, + mddev->gendisk->disk_name); + if ((flags & MDDEV_STACK_INTEGRITY) && + !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) + return -EINVAL; + } + + /* + * Before RAID adding folio support, the logical_block_size + * should be smaller than the page size. + */ + if (lim->logical_block_size > PAGE_SIZE) { + pr_err("%s: logical_block_size must not larger than PAGE_SIZE\n", + mdname(mddev)); + return -EINVAL; + } + mddev->logical_block_size = lim->logical_block_size; + + return 0; +} +EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); + +/* apply the extra stacking limits from a new rdev into mddev */ +int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + struct queue_limits lim; + + if (mddev_is_dm(mddev)) + return 0; + + if (queue_logical_block_size(rdev->bdev->bd_disk->queue) > + queue_logical_block_size(mddev->gendisk->queue)) { + pr_err("%s: incompatible logical_block_size, can not add\n", + mdname(mddev)); + return -EINVAL; + } + + lim = queue_limits_start_update(mddev->gendisk->queue); + queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, + mddev->gendisk->disk_name); + + if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { + pr_err("%s: incompatible integrity profile for %pg\n", + mdname(mddev), rdev->bdev); + queue_limits_cancel_update(mddev->gendisk->queue); + return -ENXIO; + } + + return queue_limits_commit_update(mddev->gendisk->queue, &lim); +} +EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); + +/* update the optimal I/O size after a reshape */ +void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) +{ + struct queue_limits lim; + + if (mddev_is_dm(mddev)) + return; + + /* don't bother updating io_opt if we can't suspend the array */ + if (mddev_suspend(mddev, false) < 0) + return; + lim = queue_limits_start_update(mddev->gendisk->queue); + lim.io_opt = lim.io_min * nr_stripes; + queue_limits_commit_update(mddev->gendisk->queue, &lim); + mddev_resume(mddev); +} +EXPORT_SYMBOL_GPL(mddev_update_io_opt); + static void mddev_delayed_delete(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - sysfs_remove_group(&mddev->kobj, &md_bitmap_group); - kobject_del(&mddev->kobj); kobject_put(&mddev->kobj); } -static int md_alloc(dev_t dev, char *name) +void md_init_stacking_limits(struct queue_limits *lim) { + blk_set_stacking_limits(lim); + lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | + BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; +} +EXPORT_SYMBOL_GPL(md_init_stacking_limits); + +struct mddev *md_alloc(dev_t dev, char *name) +{ + /* + * If dev is zero, name is the name of a device to allocate with + * an arbitrary minor number. It will be "md_???" + * If dev is non-zero it must be a device number with a MAJOR of + * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then + * the device is being created by opening a node in /dev. + * If "name" is not NULL, the device is being created by + * writing to /sys/module/md_mod/parameters/new_array. + */ static DEFINE_MUTEX(disks_mutex); - struct mddev *mddev = mddev_find(dev); + struct mddev *mddev; struct gendisk *disk; int partitioned; int shift; int unit; int error; - if (!mddev) - return -ENODEV; - - partitioned = (MAJOR(mddev->unit) != MD_MAJOR); - shift = partitioned ? MdpMinorShift : 0; - unit = MINOR(mddev->unit) >> shift; - - /* wait for any previous instance of this device to be - * completely removed (mddev_delayed_delete). + /* + * Wait for any previous instance of this device to be completely + * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); mutex_lock(&disks_mutex); - error = -EEXIST; - if (mddev->gendisk) - goto abort; + mddev = mddev_alloc(dev); + if (IS_ERR(mddev)) { + error = PTR_ERR(mddev); + goto out_unlock; + } + + partitioned = (MAJOR(mddev->unit) != MD_MAJOR); + shift = partitioned ? MdpMinorShift : 0; + unit = MINOR(mddev->unit) >> shift; - if (name) { + if (name && !dev) { /* Need to ensure that 'name' is not a duplicate. */ struct mddev *mddev2; @@ -4846,28 +6278,26 @@ static int md_alloc(dev_t dev, char *name) if (mddev2->gendisk && strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); - goto abort; + error = -EEXIST; + goto out_free_mddev; } spin_unlock(&all_mddevs_lock); } + if (name && dev) + /* + * Creating /dev/mdNNN via "newarray", so adjust hold_active. + */ + mddev->hold_active = UNTIL_STOP; - error = -ENOMEM; - mddev->queue = blk_alloc_queue(GFP_KERNEL); - if (!mddev->queue) - goto abort; - mddev->queue->queuedata = mddev; - - blk_queue_make_request(mddev->queue, md_make_request); - blk_set_stacking_limits(&mddev->queue->limits); - - disk = alloc_disk(1 << shift); - if (!disk) { - blk_cleanup_queue(mddev->queue); - mddev->queue = NULL; - goto abort; + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + error = PTR_ERR(disk); + goto out_free_mddev; } + disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; + disk->minors = 1 << shift; if (name) strcpy(disk->disk_name, name); else if (partitioned) @@ -4876,88 +6306,131 @@ static int md_alloc(dev_t dev, char *name) sprintf(disk->disk_name, "md%d", unit); disk->fops = &md_fops; disk->private_data = mddev; - disk->queue = mddev->queue; - blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); - /* Allow extended partitions. This makes the - * 'mdp' device redundant, but we can't really - * remove it now. - */ - disk->flags |= GENHD_FL_EXT_DEVT; + + disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; - /* As soon as we call add_disk(), another thread could get - * through to md_open, so make sure it doesn't get too far - */ - mutex_lock(&mddev->open_mutex); - add_disk(disk); + error = add_disk(disk); + if (error) + goto out_put_disk; - error = kobject_init_and_add(&mddev->kobj, &md_ktype, - &disk_to_dev(disk)->kobj, "%s", "md"); + kobject_init(&mddev->kobj, &md_ktype); + error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); if (error) { - /* This isn't possible, but as kobject_init_and_add is marked - * __must_check, we must do something with the result + /* + * The disk is already live at this point. Clear the hold flag + * and let mddev_put take care of the deletion, as it isn't any + * different from a normal close on last release now. */ - printk(KERN_WARNING "md: cannot register %s/md - name in use\n", - disk->disk_name); - error = 0; + mddev->hold_active = 0; + mutex_unlock(&disks_mutex); + mddev_put(mddev); + return ERR_PTR(error); } - if (mddev->kobj.sd && - sysfs_create_group(&mddev->kobj, &md_bitmap_group)) - printk(KERN_DEBUG "pointless warning\n"); - mutex_unlock(&mddev->open_mutex); - abort: + + kobject_uevent(&mddev->kobj, KOBJ_ADD); + mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); + mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); mutex_unlock(&disks_mutex); - if (!error && mddev->kobj.sd) { - kobject_uevent(&mddev->kobj, KOBJ_ADD); - mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); - } + return mddev; + +out_put_disk: + put_disk(disk); +out_free_mddev: + mddev_free(mddev); +out_unlock: + mutex_unlock(&disks_mutex); + return ERR_PTR(error); +} + +static int md_alloc_and_put(dev_t dev, char *name) +{ + struct mddev *mddev = md_alloc(dev, name); + + if (legacy_async_del_gendisk) + pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n"); + + if (IS_ERR(mddev)) + return PTR_ERR(mddev); mddev_put(mddev); - return error; + return 0; } -static struct kobject *md_probe(dev_t dev, int *part, void *data) +static void md_probe(dev_t dev) { - md_alloc(dev, NULL); - return NULL; + if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) + return; + if (create_on_open) + md_alloc_and_put(dev, NULL); } -static int add_named_array(const char *val, struct kernel_param *kp) +static int add_named_array(const char *val, const struct kernel_param *kp) { - /* val must be "md_*" where * is not all digits. - * We allocate an array with a large free minor number, and + /* + * val must be "md_*" or "mdNNN". + * For "md_*" we allocate an array with a large free minor number, and * set the name to val. val must not already be an active name. + * For "mdNNN" we allocate an array with the minor number NNN + * which must not already be in use. */ int len = strlen(val); char buf[DISK_NAME_LEN]; + unsigned long devnum; while (len && val[len-1] == '\n') len--; if (len >= DISK_NAME_LEN) return -E2BIG; - strlcpy(buf, val, len+1); - if (strncmp(buf, "md_", 3) != 0) - return -EINVAL; - return md_alloc(0, buf); + strscpy(buf, val, len+1); + if (strncmp(buf, "md_", 3) == 0) + return md_alloc_and_put(0, buf); + if (strncmp(buf, "md", 2) == 0 && + isdigit(buf[2]) && + kstrtoul(buf+2, 10, &devnum) == 0 && + devnum <= MINORMASK) + return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); + + return -EINVAL; } -static void md_safemode_timeout(unsigned long data) +static void md_safemode_timeout(struct timer_list *t) { - struct mddev *mddev = (struct mddev *) data; + struct mddev *mddev = timer_container_of(mddev, t, safemode_timer); + + mddev->safemode = 1; + if (mddev->external) + sysfs_notify_dirent_safe(mddev->sysfs_state); - if (!atomic_read(&mddev->writes_pending)) { - mddev->safemode = 1; - if (mddev->external) - sysfs_notify_dirent_safe(mddev->sysfs_state); - } md_wakeup_thread(mddev->thread); } static int start_dirty_degraded; +static int md_bitmap_create(struct mddev *mddev) +{ + if (mddev->bitmap_id == ID_BITMAP_NONE) + return -EINVAL; + + if (!mddev_set_bitmap_ops(mddev)) + return -ENOENT; + + return mddev->bitmap_ops->create(mddev); +} + +static void md_bitmap_destroy(struct mddev *mddev) +{ + if (!md_bitmap_registered(mddev)) + return; + + mddev->bitmap_ops->destroy(mddev); + mddev_clear_bitmap_ops(mddev); +} + int md_run(struct mddev *mddev) { int err; struct md_rdev *rdev; struct md_personality *pers; + bool nowait = true; if (list_empty(&mddev->disks)) /* cannot run an array with no devices.. */ @@ -4975,7 +6448,9 @@ int md_run(struct mddev *mddev) if (!mddev->raid_disks) { if (!mddev->persistent) return -EINVAL; - analyze_sbs(mddev); + err = analyze_sbs(mddev); + if (err) + return -EINVAL; } if (mddev->level != LEVEL_NONE) @@ -4988,11 +6463,20 @@ int md_run(struct mddev *mddev) * the only valid external interface is through the md * device. */ + mddev->has_superblocks = false; rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev); + if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { + mddev->ro = MD_RDONLY; + if (!mddev_is_dm(mddev)) + set_disk_ro(mddev->gendisk, 1); + } + + if (rdev->sb_page) + mddev->has_superblocks = true; /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, @@ -5004,49 +6488,35 @@ int md_run(struct mddev *mddev) if (mddev->dev_sectors && rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { - printk("md: %s: data overlaps metadata\n", - mdname(mddev)); + pr_warn("md: %s: data overlaps metadata\n", + mdname(mddev)); return -EINVAL; } } else { if (rdev->sb_start + rdev->sb_size/512 > rdev->data_offset) { - printk("md: %s: metadata overlaps data\n", - mdname(mddev)); + pr_warn("md: %s: metadata overlaps data\n", + mdname(mddev)); return -EINVAL; } } sysfs_notify_dirent_safe(rdev->sysfs_state); + nowait = nowait && bdev_nowait(rdev->bdev); } - if (mddev->bio_set == NULL) - mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); - - spin_lock(&pers_lock); - pers = find_pers(mddev->level, mddev->clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - if (mddev->level != LEVEL_NONE) - printk(KERN_WARNING "md: personality for level %d is not loaded!\n", - mddev->level); - else - printk(KERN_WARNING "md: personality for level %s is not loaded!\n", - mddev->clevel); + pers = get_pers(mddev->level, mddev->clevel); + if (!pers) return -EINVAL; + if (mddev->level != pers->head.id) { + mddev->level = pers->head.id; + mddev->new_level = pers->head.id; } - mddev->pers = pers; - spin_unlock(&pers_lock); - if (mddev->level != pers->level) { - mddev->level = pers->level; - mddev->new_level = pers->level; - } - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ - mddev->pers = NULL; - module_put(pers->owner); + put_pers(pers); return -EINVAL; } @@ -5054,201 +6524,259 @@ int md_run(struct mddev *mddev) /* Warn if this is a potentially silly * configuration. */ - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; struct md_rdev *rdev2; int warned = 0; rdev_for_each(rdev, mddev) rdev_for_each(rdev2, mddev) { if (rdev < rdev2 && - rdev->bdev->bd_contains == - rdev2->bdev->bd_contains) { - printk(KERN_WARNING - "%s: WARNING: %s appears to be" - " on the same physical disk as" - " %s.\n", - mdname(mddev), - bdevname(rdev->bdev,b), - bdevname(rdev2->bdev,b2)); + rdev->bdev->bd_disk == + rdev2->bdev->bd_disk) { + pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", + mdname(mddev), + rdev->bdev, + rdev2->bdev); warned = 1; } } if (warned) - printk(KERN_WARNING - "True protection against single-disk" - " failure might be compromised.\n"); + pr_warn("True protection against single-disk failure might be compromised.\n"); } - mddev->recovery = 0; + /* dm-raid expect sync_thread to be frozen until resume */ + if (mddev->gendisk) + mddev->recovery = 0; + /* may be over-ridden by personality */ mddev->resync_max_sectors = mddev->dev_sectors; mddev->ok_start_degraded = start_dirty_degraded; - if (start_readonly && mddev->ro == 0) - mddev->ro = 2; /* read-only, but switch on first write */ + if (start_readonly && md_is_rdwr(mddev)) + mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ - err = mddev->pers->run(mddev); + err = pers->run(mddev); if (err) - printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { - WARN_ONCE(!mddev->external_size, "%s: default size too small," - " but 'external_size' not in effect?\n", __func__); - printk(KERN_ERR - "md: invalid array_size %llu > default size %llu\n", - (unsigned long long)mddev->array_sectors / 2, - (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + pr_warn("md: pers->run() failed ...\n"); + else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { + WARN_ONCE(!mddev->external_size, + "%s: default size too small, but 'external_size' not in effect?\n", + __func__); + pr_warn("md: invalid array_size %llu > default size %llu\n", + (unsigned long long)mddev->array_sectors / 2, + (unsigned long long)pers->size(mddev, 0, 0) / 2); err = -EINVAL; - mddev->pers->stop(mddev); } - if (err == 0 && mddev->pers->sync_request && + if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = bitmap_create(mddev); - if (err) { - printk(KERN_ERR "%s: failed to create bitmap (%d)\n", - mdname(mddev), err); - mddev->pers->stop(mddev); - } + err = md_bitmap_create(mddev); + if (err) + pr_warn("%s: failed to create bitmap (%d)\n", + mdname(mddev), err); } - if (err) { - module_put(mddev->pers->owner); - mddev->pers = NULL; - bitmap_destroy(mddev); - return err; + if (err) + goto bitmap_abort; + + if (mddev->bitmap_info.max_write_behind > 0) { + bool create_pool = false; + + rdev_for_each(rdev, mddev) { + if (test_bit(WriteMostly, &rdev->flags) && + rdev_init_serial(rdev)) + create_pool = true; + } + if (create_pool && mddev->serial_info_pool == NULL) { + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); + if (!mddev->serial_info_pool) { + err = -ENOMEM; + goto bitmap_abort; + } + } } - if (mddev->pers->sync_request) { + + if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) - printk(KERN_WARNING - "md: cannot register extra attributes for %s\n", - mdname(mddev)); + pr_warn("md: cannot register extra attributes for %s\n", + mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); - } else if (mddev->ro == 2) /* auto-readonly not meaningful */ - mddev->ro = 0; + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); + } else if (mddev->ro == MD_AUTO_READ) + mddev->ro = MD_RDWR; - atomic_set(&mddev->writes_pending,0); atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; - mddev->safemode_timer.function = md_safemode_timeout; - mddev->safemode_timer.data = (unsigned long) mddev; - mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ + if (mddev_is_clustered(mddev)) + mddev->safemode_delay = 0; + else + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; mddev->in_sync = 1; smp_wmb(); - mddev->ready = 1; + spin_lock(&mddev->lock); + mddev->pers = pers; + spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; - + sysfs_link_rdev(mddev, rdev); /* failure here is OK */ + + if (mddev->degraded && md_is_rdwr(mddev)) + /* This ensures that recovering status is reported immediately + * via sysfs - until a lack of spares is confirmed. + */ + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - - if (mddev->flags) + + if (mddev->sb_flags) md_update_sb(mddev, 0); - md_new_event(mddev); - sysfs_notify_dirent_safe(mddev->sysfs_state); - sysfs_notify_dirent_safe(mddev->sysfs_action); - sysfs_notify(&mddev->kobj, NULL, "degraded"); + md_new_event(); return 0; + +bitmap_abort: + mddev_detach(mddev); + if (mddev->private) + pers->free(mddev, mddev->private); + mddev->private = NULL; + put_pers(pers); + md_bitmap_destroy(mddev); + return err; } EXPORT_SYMBOL_GPL(md_run); -static int do_md_run(struct mddev *mddev) +int do_md_run(struct mddev *mddev) { int err; + set_bit(MD_NOT_READY, &mddev->flags); err = md_run(mddev); if (err) goto out; - err = bitmap_load(mddev); - if (err) { - bitmap_destroy(mddev); - goto out; + + if (md_bitmap_registered(mddev)) { + err = mddev->bitmap_ops->load(mddev); + if (err) { + md_bitmap_destroy(mddev); + goto out; + } } - md_wakeup_thread(mddev->thread); + if (mddev_is_clustered(mddev)) + md_allow_write(mddev); + + /* run start up tasks that require md_thread */ + md_start(mddev); + md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); + clear_bit(MD_NOT_READY, &mddev->flags); mddev->changed = 1; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); + sysfs_notify_dirent_safe(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); out: + clear_bit(MD_NOT_READY, &mddev->flags); return err; } +int md_start(struct mddev *mddev) +{ + int ret = 0; + + if (mddev->pers->start) { + set_bit(MD_RECOVERY_WAIT, &mddev->recovery); + ret = mddev->pers->start(mddev); + clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); + md_wakeup_thread(mddev->sync_thread); + } + return ret; +} +EXPORT_SYMBOL_GPL(md_start); + static int restart_array(struct mddev *mddev) { struct gendisk *disk = mddev->gendisk; + struct md_rdev *rdev; + bool has_journal = false; + bool has_readonly = false; /* Complain if it has no devices */ if (list_empty(&mddev->disks)) return -ENXIO; if (!mddev->pers) return -EINVAL; - if (!mddev->ro) + if (md_is_rdwr(mddev)) return -EBUSY; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + has_journal = true; + if (rdev_read_only(rdev)) + has_readonly = true; + } + rcu_read_unlock(); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) + /* Don't restart rw with journal missing/faulty */ + return -EINVAL; + if (has_readonly) + return -EROFS; + mddev->safemode = 0; - mddev->ro = 0; + mddev->ro = MD_RDWR; set_disk_ro(disk, 0); - printk(KERN_INFO "md: %s switched to read-write mode.\n", - mdname(mddev)); + pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); /* Kick recovery or resync if necessary */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } -/* similar to deny_write_access, but accounts for our holding a reference - * to the file ourselves */ -static int deny_bitmap_write_access(struct file * file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) > 1) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; - } - atomic_set(&inode->i_writecount, -1); - spin_unlock(&inode->i_lock); - - return 0; -} - -void restore_bitmap_write_access(struct file *file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - atomic_set(&inode->i_writecount, 1); - spin_unlock(&inode->i_lock); -} - static void md_clean(struct mddev *mddev) { mddev->array_sectors = 0; mddev->external_size = 0; mddev->dev_sectors = 0; mddev->raid_disks = 0; - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; - mddev->external = 0; + /* we still need mddev->external in export_rdev, do not clear it yet */ mddev->persistent = 0; mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; - mddev->flags = 0; - mddev->ro = 0; + + /* + * For legacy_async_del_gendisk mode, it can stop the array in the + * middle of assembling it, then it still can access the array. So + * it needs to clear MD_CLOSING. If not legacy_async_del_gendisk, + * it can't open the array again after stopping it. So it doesn't + * clear MD_CLOSING. + */ + if (legacy_async_del_gendisk && mddev->hold_active) { + clear_bit(MD_CLOSING, &mddev->flags); + } else { + /* if UNTIL_STOP is set, it's cleared here */ + mddev->hold_active = 0; + /* Don't clear MD_CLOSING, or mddev can be opened again. */ + mddev->flags &= BIT_ULL_MASK(MD_CLOSING); + } + mddev->sb_flags = 0; + mddev->ro = MD_RDWR; mddev->metadata_type[0] = 0; mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; mddev->layout = 0; + mddev->logical_block_size = 0; mddev->max_disks = 0; mddev->events = 0; mddev->can_decrease_events = 0; @@ -5257,7 +6785,7 @@ static void md_clean(struct mddev *mddev) mddev->new_level = LEVEL_NONE; mddev->new_layout = 0; mddev->new_chunk_sectors = 0; - mddev->curr_resync = 0; + mddev->curr_resync = MD_RESYNC_NONE; atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; @@ -5266,93 +6794,138 @@ static void md_clean(struct mddev *mddev) mddev->changed = 0; mddev->degraded = 0; mddev->safemode = 0; - mddev->merge_check_needed = 0; + mddev->private = NULL; + mddev->cluster_info = NULL; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.default_space = 0; mddev->bitmap_info.chunksize = 0; mddev->bitmap_info.daemon_sleep = 0; mddev->bitmap_info.max_write_behind = 0; + mddev->bitmap_info.nodes = 0; } static void __md_stop_writes(struct mddev *mddev) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } + timer_delete_sync(&mddev->safemode_timer); - del_timer_sync(&mddev->safemode_timer); + if (mddev->pers && mddev->pers->quiesce) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } - bitmap_flush(mddev); - md_super_wait(mddev); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->flush(mddev); - if (mddev->ro == 0 && - (!mddev->in_sync || mddev->flags)) { + if (md_is_rdwr(mddev) && + ((!mddev->in_sync && !mddev_is_clustered(mddev)) || + mddev->sb_flags)) { /* mark array as shutdown cleanly */ - mddev->in_sync = 1; + if (!mddev_is_clustered(mddev)) + mddev->in_sync = 1; md_update_sb(mddev, 1); } + /* disable policy to guarantee rdevs free resources for serialization */ + mddev->serialize_policy = 0; + mddev_destroy_serial_pool(mddev, NULL); } void md_stop_writes(struct mddev *mddev) { - mddev_lock(mddev); + mddev_lock_nointr(mddev); + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true); __md_stop_writes(mddev); mddev_unlock(mddev); } EXPORT_SYMBOL_GPL(md_stop_writes); +static void mddev_detach(struct mddev *mddev) +{ + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->wait_behind_writes(mddev); + if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + md_unregister_thread(mddev, &mddev->thread); + + /* the unplug fn references 'conf' */ + if (!mddev_is_dm(mddev)) + blk_sync_queue(mddev->gendisk->queue); +} + static void __md_stop(struct mddev *mddev) { - mddev->ready = 0; - mddev->pers->stop(mddev); - if (mddev->pers->sync_request && mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - module_put(mddev->pers->owner); + struct md_personality *pers = mddev->pers; + + md_bitmap_destroy(mddev); + mddev_detach(mddev); + spin_lock(&mddev->lock); mddev->pers = NULL; + spin_unlock(&mddev->lock); + if (mddev->private) + pers->free(mddev, mddev->private); + mddev->private = NULL; + put_pers(pers); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } void md_stop(struct mddev *mddev) { + lockdep_assert_held(&mddev->reconfig_mutex); + /* stop the array and free an attached data structures. * This is called from dm-raid */ + __md_stop_writes(mddev); __md_stop(mddev); - bitmap_destroy(mddev); - if (mddev->bio_set) - bioset_free(mddev->bio_set); } EXPORT_SYMBOL_GPL(md_stop); -static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) +/* ensure 'mddev->pers' exist before calling md_set_readonly() */ +static int md_set_readonly(struct mddev *mddev) { int err = 0; - mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > !!bdev) { - printk("md: %s still in use.\n",mdname(mddev)); + int did_freeze = 0; + + if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + return -EBUSY; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + } + + stop_sync_thread(mddev, false); + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + mddev_lock_nointr(mddev); + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + pr_warn("md: %s still in use.\n",mdname(mddev)); err = -EBUSY; goto out; } - if (bdev) - sync_blockdev(bdev); - if (mddev->pers) { - __md_stop_writes(mddev); + __md_stop_writes(mddev); + + if (mddev->ro == MD_RDONLY) { err = -ENXIO; - if (mddev->ro==1) - goto out; - mddev->ro = 1; - set_disk_ro(mddev->gendisk, 1); + goto out; + } + + mddev->ro = MD_RDONLY; + set_disk_ro(mddev->gendisk, 1); + +out: + if (!err || did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_state); - err = 0; } -out: - mutex_unlock(&mddev->open_mutex); + return err; } @@ -5360,35 +6933,38 @@ out: * 0 - completely stop and dis-assemble array * 2 - stop but do not disassemble array */ -static int do_md_stop(struct mddev * mddev, int mode, - struct block_device *bdev) +static int do_md_stop(struct mddev *mddev, int mode) { struct gendisk *disk = mddev->gendisk; struct md_rdev *rdev; + int did_freeze = 0; - mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > !!bdev || - mddev->sysfs_active) { - printk("md: %s still in use.\n",mdname(mddev)); - mutex_unlock(&mddev->open_mutex); - return -EBUSY; + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - if (bdev) - /* It is possible IO was issued on some other - * open file which was closed before we took ->open_mutex. - * As that was not the last close __blkdev_put will not - * have called sync_blockdev, so we must. - */ - sync_blockdev(bdev); + stop_sync_thread(mddev, true); + + if (mddev->sysfs_active || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + pr_warn("md: %s still in use.\n",mdname(mddev)); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + return -EBUSY; + } if (mddev->pers) { - if (mddev->ro) + if (!md_is_rdwr(mddev)) set_disk_ro(disk, 0); + if (mode == 2 && mddev->pers->sync_request && + mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + __md_stop_writes(mddev); __md_stop(mddev); - mddev->queue->merge_bvec_fn = NULL; - mddev->queue->backing_dev_info.congested_fn = NULL; /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -5397,38 +6973,33 @@ static int do_md_stop(struct mddev * mddev, int mode, if (rdev->raid_disk >= 0) sysfs_unlink_rdev(mddev, rdev); - set_capacity(disk, 0); - mutex_unlock(&mddev->open_mutex); + set_capacity_and_notify(disk, 0); mddev->changed = 1; - revalidate_disk(disk); - if (mddev->ro) - mddev->ro = 0; - } else - mutex_unlock(&mddev->open_mutex); + if (!md_is_rdwr(mddev)) + mddev->ro = MD_RDWR; + } /* * Free resources if final stop */ if (mode == 0) { - printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); + pr_info("md: %s stopped.\n", mdname(mddev)); - bitmap_destroy(mddev); if (mddev->bitmap_info.file) { - restore_bitmap_write_access(mddev->bitmap_info.file); - fput(mddev->bitmap_info.file); + struct file *f = mddev->bitmap_info.file; + spin_lock(&mddev->lock); mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); } mddev->bitmap_info.offset = 0; export_array(mddev); - md_clean(mddev); - kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); - if (mddev->hold_active == UNTIL_STOP) - mddev->hold_active = 0; + if (!legacy_async_del_gendisk) + set_bit(MD_DELETED, &mddev->flags); } - blk_integrity_unregister(disk); - md_new_event(mddev); + md_new_event(); sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } @@ -5442,18 +7013,17 @@ static void autorun_array(struct mddev *mddev) if (list_empty(&mddev->disks)) return; - printk(KERN_INFO "md: running: "); + pr_info("md: running: "); rdev_for_each(rdev, mddev) { - char b[BDEVNAME_SIZE]; - printk("<%s>", bdevname(rdev->bdev,b)); + pr_cont("<%pg>", rdev->bdev); } - printk("\n"); + pr_cont("\n"); err = do_md_run(mddev); if (err) { - printk(KERN_WARNING "md: do_md_run() returned %d\n", err); - do_md_stop(mddev, 0, NULL); + pr_warn("md: do_md_run() returned %d\n", err); + do_md_stop(mddev, 0); } } @@ -5473,9 +7043,8 @@ static void autorun_devices(int part) { struct md_rdev *rdev0, *rdev, *tmp; struct mddev *mddev; - char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: autorun ...\n"); + pr_info("md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { int unit; dev_t dev; @@ -5483,13 +7052,12 @@ static void autorun_devices(int part) rdev0 = list_entry(pending_raid_disks.next, struct md_rdev, same_set); - printk(KERN_INFO "md: considering %s ...\n", - bdevname(rdev0->bdev,b)); + pr_debug("md: considering %pg ...\n", rdev0->bdev); INIT_LIST_HEAD(&candidates); rdev_for_each_list(rdev, tmp, &pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) { - printk(KERN_INFO "md: adding %s ...\n", - bdevname(rdev->bdev,b)); + pr_debug("md: adding %pg ...\n", + rdev->bdev); list_move(&rdev->same_set, &candidates); } /* @@ -5506,54 +7074,47 @@ static void autorun_devices(int part) unit = MINOR(dev); } if (rdev0->preferred_minor != unit) { - printk(KERN_INFO "md: unit number in %s is bad: %d\n", - bdevname(rdev0->bdev, b), rdev0->preferred_minor); + pr_warn("md: unit number in %pg is bad: %d\n", + rdev0->bdev, rdev0->preferred_minor); break; } - md_probe(dev, NULL, NULL); - mddev = mddev_find(dev); - if (!mddev || !mddev->gendisk) { - if (mddev) - mddev_put(mddev); - printk(KERN_ERR - "md: cannot allocate memory for md drive.\n"); + mddev = md_alloc(dev, NULL); + if (IS_ERR(mddev)) break; - } - if (mddev_lock(mddev)) - printk(KERN_WARNING "md: %s locked, cannot run\n", - mdname(mddev)); + + if (mddev_suspend_and_lock(mddev)) + pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version || !list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: %s already running, cannot run %s\n", - mdname(mddev), bdevname(rdev0->bdev,b)); - mddev_unlock(mddev); + pr_warn("md: %s already running, cannot run %pg\n", + mdname(mddev), rdev0->bdev); + mddev_unlock_and_resume(mddev); } else { - printk(KERN_INFO "md: created %s\n", mdname(mddev)); + pr_debug("md: created %s\n", mdname(mddev)); mddev->persistent = 1; rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) - export_rdev(rdev); + export_rdev(rdev, mddev); } autorun_array(mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } /* on success, candidates will be empty, on error * it won't... */ rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); - export_rdev(rdev); + export_rdev(rdev, mddev); } mddev_put(mddev); } - printk(KERN_INFO "md: ... autorun DONE.\n"); + pr_info("md: ... autorun DONE.\n"); } #endif /* !MODULE */ -static int get_version(void __user * arg) +static int get_version(void __user *arg) { mdu_version_t ver; @@ -5567,7 +7128,7 @@ static int get_version(void __user * arg) return 0; } -static int get_array_info(struct mddev * mddev, void __user * arg) +static int get_array_info(struct mddev *mddev, void __user *arg) { mdu_array_info_t info; int nr,working,insync,failed,spare; @@ -5582,7 +7143,10 @@ static int get_array_info(struct mddev * mddev, void __user * arg) else { working++; if (test_bit(In_sync, &rdev->flags)) - insync++; + insync++; + else if (test_bit(Journal, &rdev->flags)) + /* TODO: add journal count to md_u.h */ + ; else spare++; } @@ -5592,7 +7156,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; info.patch_version = MD_PATCHLEVEL_VERSION; - info.ctime = mddev->ctime; + info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); info.level = mddev->level; info.size = mddev->dev_sectors / 2; if (info.size != mddev->dev_sectors / 2) /* overflow */ @@ -5602,12 +7166,14 @@ static int get_array_info(struct mddev * mddev, void __user * arg) info.md_minor = mddev->md_minor; info.not_persistent= !mddev->persistent; - info.utime = mddev->utime; + info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); info.state = 0; if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); if (mddev->bitmap && mddev->bitmap_info.offset) - info.state = (1<<MD_SB_BITMAP_PRESENT); + info.state |= (1<<MD_SB_BITMAP_PRESENT); + if (mddev_is_clustered(mddev)) + info.state |= (1<<MD_SB_CLUSTERED); info.active_disks = insync; info.working_disks = working; info.failed_disks = failed; @@ -5622,48 +7188,39 @@ static int get_array_info(struct mddev * mddev, void __user * arg) return 0; } -static int get_bitmap_file(struct mddev * mddev, void __user * arg) +static int get_bitmap_file(struct mddev *mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ - char *ptr, *buf = NULL; - int err = -ENOMEM; - - if (md_allow_write(mddev)) - file = kmalloc(sizeof(*file), GFP_NOIO); - else - file = kmalloc(sizeof(*file), GFP_KERNEL); + char *ptr; + int err; + file = kzalloc(sizeof(*file), GFP_NOIO); if (!file) - goto out; + return -ENOMEM; - /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap || !mddev->bitmap->storage.file) { - file->pathname[0] = '\0'; - goto copy_out; + err = 0; + spin_lock(&mddev->lock); + /* bitmap enabled */ + if (mddev->bitmap_info.file) { + ptr = file_path(mddev->bitmap_info.file, file->pathname, + sizeof(file->pathname)); + if (IS_ERR(ptr)) + err = PTR_ERR(ptr); + else + memmove(file->pathname, ptr, + sizeof(file->pathname)-(ptr-file->pathname)); } + spin_unlock(&mddev->lock); - buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); - if (!buf) - goto out; - - ptr = d_path(&mddev->bitmap->storage.file->f_path, - buf, sizeof(file->pathname)); - if (IS_ERR(ptr)) - goto out; - - strcpy(file->pathname, ptr); - -copy_out: - err = 0; - if (copy_to_user(arg, file, sizeof(*file))) + if (err == 0 && + copy_to_user(arg, file, sizeof(*file))) err = -EFAULT; -out: - kfree(buf); + kfree(file); return err; } -static int get_disk_info(struct mddev * mddev, void __user * arg) +static int get_disk_info(struct mddev *mddev, void __user * arg) { mdu_disk_info_t info; struct md_rdev *rdev; @@ -5672,7 +7229,7 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) return -EFAULT; rcu_read_lock(); - rdev = find_rdev_nr_rcu(mddev, info.number); + rdev = md_find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); @@ -5684,8 +7241,12 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) info.state |= (1<<MD_DISK_ACTIVE); info.state |= (1<<MD_DISK_SYNC); } + if (test_bit(Journal, &rdev->flags)) + info.state |= (1<<MD_DISK_JOURNAL); if (test_bit(WriteMostly, &rdev->flags)) info.state |= (1<<MD_DISK_WRITEMOSTLY); + if (test_bit(FailFast, &rdev->flags)) + info.state |= (1<<MD_DISK_FAILFAST); } else { info.major = info.minor = 0; info.raid_disk = -1; @@ -5699,12 +7260,18 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) return 0; } -static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) +int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) { - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; struct md_rdev *rdev; dev_t dev = MKDEV(info->major,info->minor); + if (mddev_is_clustered(mddev) && + !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { + pr_warn("%s: Cannot add to clustered mddev.\n", + mdname(mddev)); + return -EINVAL; + } + if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) return -EOVERFLOW; @@ -5713,8 +7280,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) /* expecting a device which has a superblock */ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: md_import_device returned %ld\n", + pr_warn("md: md_import_device returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } @@ -5725,31 +7291,29 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) { - printk(KERN_WARNING - "md: %s has different UUID to %s\n", - bdevname(rdev->bdev,b), - bdevname(rdev0->bdev,b2)); - export_rdev(rdev); + pr_warn("md: %pg has different UUID to %pg\n", + rdev->bdev, + rdev0->bdev); + export_rdev(rdev, mddev); return -EINVAL; } } err = bind_rdev_to_array(rdev, mddev); if (err) - export_rdev(rdev); + export_rdev(rdev, mddev); return err; } /* - * add_new_disk can be used once the array is assembled + * md_add_new_disk can be used once the array is assembled * to add "hot spares". They must already have a superblock * written */ if (mddev->pers) { int err; if (!mddev->pers->hot_add_disk) { - printk(KERN_WARNING - "%s: personality does not support diskops!\n", - mdname(mddev)); + pr_warn("%s: personality does not support diskops!\n", + mdname(mddev)); return -EINVAL; } if (mddev->persistent) @@ -5758,8 +7322,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) else rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: md_import_device returned %ld\n", + pr_warn("md: md_import_device returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } @@ -5768,66 +7331,97 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) if (info->state & (1<<MD_DISK_SYNC) && info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - set_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); } else rdev->raid_disk = -1; + rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. - validate_super(mddev, rdev); + validate_super(mddev, NULL/*freshest*/, rdev); if ((info->state & (1<<MD_DISK_SYNC)) && rdev->raid_disk != info->raid_disk) { /* This was a hot-add request, but events doesn't * match, so reject it. */ - export_rdev(rdev); + export_rdev(rdev, mddev); return -EINVAL; } - if (test_bit(In_sync, &rdev->flags)) - rdev->saved_raid_disk = rdev->raid_disk; - else - rdev->saved_raid_disk = -1; - clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); else clear_bit(WriteMostly, &rdev->flags); + if (info->state & (1<<MD_DISK_FAILFAST)) + set_bit(FailFast, &rdev->flags); + else + clear_bit(FailFast, &rdev->flags); + + if (info->state & (1<<MD_DISK_JOURNAL)) { + struct md_rdev *rdev2; + bool has_journal = false; + + /* make sure no existing journal disk */ + rdev_for_each(rdev2, mddev) { + if (test_bit(Journal, &rdev2->flags)) { + has_journal = true; + break; + } + } + if (has_journal || mddev->bitmap) { + export_rdev(rdev, mddev); + return -EBUSY; + } + set_bit(Journal, &rdev->flags); + } + /* + * check whether the device shows up in other nodes + */ + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) + set_bit(Candidate, &rdev->flags); + else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { + /* --add initiated by this node */ + err = mddev->cluster_ops->add_new_disk(mddev, rdev); + if (err) { + export_rdev(rdev, mddev); + return err; + } + } + } rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); - if (!err && !mddev->pers->hot_remove_disk) { - /* If there is hot_add_disk but no hot_remove_disk - * then added disks for geometry changes, - * and should be added immediately. - */ - super_types[mddev->major_version]. - validate_super(mddev, rdev); - err = mddev->pers->hot_add_disk(mddev, rdev); - if (err) - unbind_rdev_from_array(rdev); - } + if (err) - export_rdev(rdev); - else - sysfs_notify_dirent_safe(rdev->sysfs_state); + export_rdev(rdev, mddev); + + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) { + if (!err) { + err = mddev->cluster_ops->new_disk_ack( + mddev, err == 0); + if (err) + md_kick_rdev_from_array(rdev); + } + } else { + if (err) + mddev->cluster_ops->add_new_disk_cancel(mddev); + else + err = add_bound_rdev(rdev); + } + + } else if (!err) + err = add_bound_rdev(rdev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - if (mddev->degraded) - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (!err) - md_new_event(mddev); - md_wakeup_thread(mddev->thread); return err; } - /* otherwise, add_new_disk is only allowed + /* otherwise, md_add_new_disk is only allowed * for major_version==0 superblocks */ if (mddev->major_version != 0) { - printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", - mdname(mddev)); + pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); return -EINVAL; } @@ -5835,8 +7429,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) int err; rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: error, md_import_device() returned %ld\n", + pr_warn("md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } @@ -5852,17 +7445,19 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); + if (info->state & (1<<MD_DISK_FAILFAST)) + set_bit(FailFast, &rdev->flags); if (!mddev->persistent) { - printk(KERN_INFO "md: nonpersistent superblock ...\n"); - rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; + pr_debug("md: nonpersistent superblock ...\n"); + rdev->sb_start = bdev_nr_sectors(rdev->bdev); } else rdev->sb_start = calc_dev_sboffset(rdev); rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); if (err) { - export_rdev(rdev); + export_rdev(rdev, mddev); return err; } } @@ -5870,35 +7465,46 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) return 0; } -static int hot_remove_disk(struct mddev * mddev, dev_t dev) +static int hot_remove_disk(struct mddev *mddev, dev_t dev) { - char b[BDEVNAME_SIZE]; struct md_rdev *rdev; + if (!mddev->pers) + return -ENODEV; + rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; + if (rdev->raid_disk < 0) + goto kick_rdev; + clear_bit(Blocked, &rdev->flags); remove_and_add_spares(mddev, rdev); if (rdev->raid_disk >= 0) goto busy; - kick_rdev_from_array(rdev); - md_update_sb(mddev, 1); - md_new_event(mddev); +kick_rdev: + if (mddev_is_clustered(mddev) && + mddev->cluster_ops->remove_disk(mddev, rdev)) + goto busy; + + md_kick_rdev_from_array(rdev); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + if (!mddev->thread) + md_update_sb(mddev, 1); + md_new_event(); return 0; busy: - printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", - bdevname(rdev->bdev,b), mdname(mddev)); + pr_debug("md: cannot remove active disk %pg from %s ...\n", + rdev->bdev, mdname(mddev)); return -EBUSY; } -static int hot_add_disk(struct mddev * mddev, dev_t dev) +static int hot_add_disk(struct mddev *mddev, dev_t dev) { - char b[BDEVNAME_SIZE]; int err; struct md_rdev *rdev; @@ -5906,22 +7512,19 @@ static int hot_add_disk(struct mddev * mddev, dev_t dev) return -ENODEV; if (mddev->major_version != 0) { - printk(KERN_WARNING "%s: HOT_ADD may only be used with" - " version-0 superblocks.\n", + pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", mdname(mddev)); return -EINVAL; } if (!mddev->pers->hot_add_disk) { - printk(KERN_WARNING - "%s: personality does not support diskops!\n", + pr_warn("%s: personality does not support diskops!\n", mdname(mddev)); return -EINVAL; } rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: error, md_import_device() returned %ld\n", + pr_warn("md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); return -EINVAL; } @@ -5929,17 +7532,17 @@ static int hot_add_disk(struct mddev * mddev, dev_t dev) if (mddev->persistent) rdev->sb_start = calc_dev_sboffset(rdev); else - rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; + rdev->sb_start = bdev_nr_sectors(rdev->bdev); rdev->sectors = rdev->sb_start; if (test_bit(Faulty, &rdev->flags)) { - printk(KERN_WARNING - "md: can not hot-add faulty %s disk to %s!\n", - bdevname(rdev->bdev,b), mdname(mddev)); + pr_warn("md: can not hot-add faulty %pg disk to %s!\n", + rdev->bdev, mdname(mddev)); err = -EINVAL; goto abort_export; } + clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; rdev->saved_raid_disk = -1; @@ -5954,84 +7557,113 @@ static int hot_add_disk(struct mddev * mddev, dev_t dev) rdev->raid_disk = -1; - md_update_sb(mddev, 1); - + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + if (!mddev->thread) + md_update_sb(mddev, 1); /* * Kick recovery, maybe this spare has to be added to the * array immediately. */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - md_new_event(mddev); + md_new_event(); return 0; abort_export: - export_rdev(rdev); + export_rdev(rdev, mddev); return err; } static int set_bitmap_file(struct mddev *mddev, int fd) { - int err; + int err = 0; + + if (!md_bitmap_registered(mddev)) + return -EINVAL; if (mddev->pers) { - if (!mddev->pers->quiesce) + if (!mddev->pers->quiesce || !mddev->thread) return -EBUSY; if (mddev->recovery || mddev->sync_thread) return -EBUSY; /* we should be able to change the bitmap.. */ } - if (fd >= 0) { - if (mddev->bitmap) + struct inode *inode; + struct file *f; + + if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */ - mddev->bitmap_info.file = fget(fd); - if (mddev->bitmap_info.file == NULL) { - printk(KERN_ERR "%s: error: failed to get bitmap file\n", - mdname(mddev)); + if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { + pr_warn("%s: bitmap files not supported by this kernel\n", + mdname(mddev)); + return -EINVAL; + } + pr_warn("%s: using deprecated bitmap file support\n", + mdname(mddev)); + + f = fget(fd); + + if (f == NULL) { + pr_warn("%s: error: failed to get bitmap file\n", + mdname(mddev)); return -EBADF; } - err = deny_bitmap_write_access(mddev->bitmap_info.file); + inode = f->f_mapping->host; + if (!S_ISREG(inode->i_mode)) { + pr_warn("%s: error: bitmap file must be a regular file\n", + mdname(mddev)); + err = -EBADF; + } else if (!(f->f_mode & FMODE_WRITE)) { + pr_warn("%s: error: bitmap file must open for write\n", + mdname(mddev)); + err = -EBADF; + } else if (atomic_read(&inode->i_writecount) != 1) { + pr_warn("%s: error: bitmap file is already in use\n", + mdname(mddev)); + err = -EBUSY; + } if (err) { - printk(KERN_ERR "%s: error: bitmap file is already in use\n", - mdname(mddev)); - fput(mddev->bitmap_info.file); - mddev->bitmap_info.file = NULL; + fput(f); return err; } + mddev->bitmap_info.file = f; mddev->bitmap_info.offset = 0; /* file overrides offset */ } else if (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); if (fd >= 0) { - err = bitmap_create(mddev); + err = md_bitmap_create(mddev); if (!err) - err = bitmap_load(mddev); - } - if (fd < 0 || err) { - bitmap_destroy(mddev); - fd = -1; /* make sure to put the file */ + err = mddev->bitmap_ops->load(mddev); + + if (err) { + md_bitmap_destroy(mddev); + fd = -1; + } + } else if (fd < 0) { + md_bitmap_destroy(mddev); } - mddev->pers->quiesce(mddev, 0); } + if (fd < 0) { - if (mddev->bitmap_info.file) { - restore_bitmap_write_access(mddev->bitmap_info.file); - fput(mddev->bitmap_info.file); + struct file *f = mddev->bitmap_info.file; + if (f) { + spin_lock(&mddev->lock); + mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); } - mddev->bitmap_info.file = NULL; } return err; } /* - * set_array_info is used two different ways + * md_set_array_info is used two different ways * The original usage is when creating a new array. * In this usage, raid_disks is > 0 and it together with * level, size, not_persistent,layout,chunksize determine the @@ -6043,17 +7675,15 @@ static int set_bitmap_file(struct mddev *mddev, int fd) * The minor and patch _version numbers are also kept incase the * super_block handler wishes to interpret them. */ -static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) +int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) { - if (info->raid_disks == 0) { /* just setting version number for superblock loading */ if (info->major_version < 0 || info->major_version >= ARRAY_SIZE(super_types) || super_types[info->major_version].name == NULL) { /* maybe try to auto-load a module? */ - printk(KERN_INFO - "md: superblock version %d not known\n", + pr_warn("md: superblock version %d not known\n", info->major_version); return -EINVAL; } @@ -6064,13 +7694,13 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) /* ensure mddev_put doesn't delete this now that there * is some minimal configuration. */ - mddev->ctime = get_seconds(); + mddev->ctime = ktime_get_real_seconds(); return 0; } mddev->major_version = MD_MAJOR_VERSION; mddev->minor_version = MD_MINOR_VERSION; mddev->patch_version = MD_PATCHLEVEL_VERSION; - mddev->ctime = get_seconds(); + mddev->ctime = ktime_get_real_seconds(); mddev->level = info->level; mddev->clevel[0] = 0; @@ -6080,20 +7710,24 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) * openned */ if (info->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->persistent = ! info->not_persistent; mddev->external = 0; mddev->layout = info->layout; + if (mddev->level == 0) + /* Cannot trust RAID0 layout info here */ + mddev->layout = -1; mddev->chunk_sectors = info->chunk_size >> 9; - mddev->max_disks = MD_SB_DISKS; - - if (mddev->persistent) - mddev->flags = 0; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (mddev->persistent) { + mddev->max_disks = MD_SB_DISKS; + mddev->flags = 0; + mddev->sb_flags = 0; + } + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); @@ -6117,7 +7751,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) { - WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->external_size) return; @@ -6131,6 +7765,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) struct md_rdev *rdev; int rv; int fit = (num_sectors == 0); + sector_t old_dev_sectors = mddev->dev_sectors; if (mddev->pers->resize == NULL) return -EINVAL; @@ -6143,8 +7778,10 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) * of each device. If num_sectors is zero, we find the largest size * that fits. */ - if (mddev->sync_thread) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; + if (!md_is_rdwr(mddev)) + return -EROFS; rdev_for_each(rdev, mddev) { sector_t avail = rdev->sectors; @@ -6155,8 +7792,13 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) - revalidate_disk(mddev->gendisk); + if (!rv) { + if (mddev_is_clustered(mddev)) + mddev->cluster_ops->update_size(mddev, old_dev_sectors); + else if (!mddev_is_dm(mddev)) + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); + } return rv; } @@ -6167,10 +7809,14 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; + if (!md_is_rdwr(mddev)) + return -EROFS; if (raid_disks <= 0 || (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; - if (mddev->sync_thread || mddev->reshape_position != MaxSector) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || + mddev->reshape_position != MaxSector) return -EBUSY; rdev_for_each(rdev, mddev) { @@ -6196,6 +7842,27 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return rv; } +static int get_cluster_ops(struct mddev *mddev) +{ + xa_lock(&md_submodule); + mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); + if (mddev->cluster_ops && + !try_module_get(mddev->cluster_ops->head.owner)) + mddev->cluster_ops = NULL; + xa_unlock(&md_submodule); + + return mddev->cluster_ops == NULL ? -ENOENT : 0; +} + +static void put_cluster_ops(struct mddev *mddev) +{ + if (!mddev->cluster_ops) + return; + + mddev->cluster_ops->leave(mddev); + module_put(mddev->cluster_ops->head.owner); + mddev->cluster_ops = NULL; +} /* * update_array_info is used to change the configuration of an @@ -6221,7 +7888,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->ctime != info->ctime || mddev->level != info->level || /* mddev->layout != info->layout || */ - !mddev->persistent != info->not_persistent|| + mddev->persistent != !info->not_persistent || mddev->chunk_sectors != info->chunk_size >> 9 || /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ ((state^info->state) & 0xfffffe00) @@ -6263,41 +7930,67 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = update_raid_disks(mddev, info->raid_disks); if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { - if (mddev->pers->quiesce == NULL) - return -EINVAL; - if (mddev->recovery || mddev->sync_thread) - return -EBUSY; + if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { + rv = -EINVAL; + goto err; + } + if (mddev->recovery || mddev->sync_thread) { + rv = -EBUSY; + goto err; + } if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { /* add the bitmap */ - if (mddev->bitmap) - return -EEXIST; - if (mddev->bitmap_info.default_offset == 0) - return -EINVAL; + if (mddev->bitmap) { + rv = -EEXIST; + goto err; + } + if (mddev->bitmap_info.default_offset == 0) { + rv = -EINVAL; + goto err; + } mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - mddev->pers->quiesce(mddev, 1); - rv = bitmap_create(mddev); + rv = md_bitmap_create(mddev); if (!rv) - rv = bitmap_load(mddev); + rv = mddev->bitmap_ops->load(mddev); + if (rv) - bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + md_bitmap_destroy(mddev); } else { - /* remove the bitmap */ - if (!mddev->bitmap) - return -ENOENT; - if (mddev->bitmap->storage.file) - return -EINVAL; - mddev->pers->quiesce(mddev, 1); - bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + struct md_bitmap_stats stats; + + rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (rv) + goto err; + + if (stats.file) { + rv = -EINVAL; + goto err; + } + + if (mddev->bitmap_info.nodes) { + /* hold PW on all the bitmap lock */ + if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { + pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); + rv = -EPERM; + mddev->cluster_ops->unlock_all_bitmaps(mddev); + goto err; + } + + mddev->bitmap_info.nodes = 0; + put_cluster_ops(mddev); + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; + } + md_bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; } } md_update_sb(mddev, 1); return rv; +err: + return rv; } static int set_disk_faulty(struct mddev *mddev, dev_t dev) @@ -6309,12 +8002,12 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) return -ENODEV; rcu_read_lock(); - rdev = find_rdev_rcu(mddev, dev); + rdev = md_find_rdev_rcu(mddev, dev); if (!rdev) err = -ENODEV; else { md_error(mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) + if (test_bit(MD_BROKEN, &mddev->flags)) err = -EBUSY; } rcu_read_unlock(); @@ -6327,9 +8020,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ -static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mddev *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = disk->private_data; geo->heads = 2; geo->sectors = 4; @@ -6337,135 +8030,147 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static int md_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +static inline int md_ioctl_valid(unsigned int cmd) { - int err = 0; - void __user *argp = (void __user *)arg; - struct mddev *mddev = NULL; - int ro; - switch (cmd) { - case RAID_VERSION: case GET_ARRAY_INFO: case GET_DISK_INFO: - break; - default: + case RAID_VERSION: + return 0; + case ADD_NEW_DISK: + case GET_BITMAP_FILE: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case RESTART_ARRAY_RW: + case RUN_ARRAY: + case SET_ARRAY_INFO: + case SET_BITMAP_FILE: + case SET_DISK_FAULTY: + case STOP_ARRAY: + case STOP_ARRAY_RO: + case CLUSTERED_DISK_NACK: if (!capable(CAP_SYS_ADMIN)) return -EACCES; + return 0; + default: + return -ENOTTY; } +} - /* - * Commands dealing with the RAID driver but not any - * particular array: - */ +static bool md_ioctl_need_suspend(unsigned int cmd) +{ switch (cmd) { - case RAID_VERSION: - err = get_version(argp); - goto done; + case ADD_NEW_DISK: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case SET_BITMAP_FILE: + case SET_ARRAY_INFO: + return true; + default: + return false; + } +} - case PRINT_RAID_DEBUG: - err = 0; - md_print_devices(); - goto done; +static int __md_set_array_info(struct mddev *mddev, void __user *argp) +{ + mdu_array_info_t info; + int err; -#ifndef MODULE - case RAID_AUTORUN: - err = 0; - autostart_arrays(arg); - goto done; -#endif - default:; + if (!argp) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, argp, sizeof(info))) + return -EFAULT; + + if (mddev->pers) { + err = update_array_info(mddev, &info); + if (err) + pr_warn("md: couldn't update array info. %d\n", err); + return err; } + if (!list_empty(&mddev->disks)) { + pr_warn("md: array %s already has disks!\n", mdname(mddev)); + return -EBUSY; + } + + if (mddev->raid_disks) { + pr_warn("md: array %s already initialised!\n", mdname(mddev)); + return -EBUSY; + } + + err = md_set_array_info(mddev, &info); + if (err) + pr_warn("md: couldn't set array info. %d\n", err); + + return err; +} + +static int md_ioctl(struct block_device *bdev, blk_mode_t mode, + unsigned int cmd, unsigned long arg) +{ + int err = 0; + void __user *argp = (void __user *)arg; + struct mddev *mddev = NULL; + + err = md_ioctl_valid(cmd); + if (err) + return err; + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + if (cmd == RAID_VERSION) + return get_version(argp); + /* * Commands creating/starting a new array: */ mddev = bdev->bd_disk->private_data; - if (!mddev) { - BUG(); - goto abort; - } - /* Some actions do not requires the mutex */ switch (cmd) { case GET_ARRAY_INFO: if (!mddev->raid_disks && !mddev->external) - err = -ENODEV; - else - err = get_array_info(mddev, argp); - goto abort; + return -ENODEV; + return get_array_info(mddev, argp); case GET_DISK_INFO: if (!mddev->raid_disks && !mddev->external) - err = -ENODEV; - else - err = get_disk_info(mddev, argp); - goto abort; + return -ENODEV; + return get_disk_info(mddev, argp); case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto abort; + return set_disk_faulty(mddev, new_decode_dev(arg)); + + case GET_BITMAP_FILE: + return get_bitmap_file(mddev, argp); + } + + if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { + /* Need to flush page cache, and ensure no-one else opens + * and writes + */ + err = mddev_set_closing_and_sync_blockdev(mddev, 1); + if (err) + return err; } - if (cmd == ADD_NEW_DISK) - /* need to ensure md_delayed_delete() has completed */ - flush_workqueue(md_misc_wq); + if (!md_is_rdwr(mddev)) + flush_work(&mddev->sync_work); - if (cmd == HOT_REMOVE_DISK) - /* need to ensure recovery thread has run */ - wait_event_interruptible_timeout(mddev->sb_wait, - !test_bit(MD_RECOVERY_NEEDED, - &mddev->flags), - msecs_to_jiffies(5000)); - err = mddev_lock(mddev); + err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : + mddev_lock(mddev); if (err) { - printk(KERN_INFO - "md: ioctl lock interrupted, reason %d, cmd %d\n", - err, cmd); - goto abort; + pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", + err, cmd); + goto out; } if (cmd == SET_ARRAY_INFO) { - mdu_array_info_t info; - if (!arg) - memset(&info, 0, sizeof(info)); - else if (copy_from_user(&info, argp, sizeof(info))) { - err = -EFAULT; - goto abort_unlock; - } - if (mddev->pers) { - err = update_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't update" - " array info. %d\n", err); - goto abort_unlock; - } - goto done_unlock; - } - if (!list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: array %s already has disks!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - if (mddev->raid_disks) { - printk(KERN_WARNING - "md: array %s already initialised!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - err = set_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't set" - " array info. %d\n", err); - goto abort_unlock; - } - goto done_unlock; + err = __md_set_array_info(mddev, argp); + goto unlock; } /* @@ -6478,36 +8183,33 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE && cmd != GET_BITMAP_FILE) { err = -ENODEV; - goto abort_unlock; + goto unlock; } /* * Commands even a read-only array can execute: */ switch (cmd) { - case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto done_unlock; - case RESTART_ARRAY_RW: err = restart_array(mddev); - goto done_unlock; + goto unlock; case STOP_ARRAY: - err = do_md_stop(mddev, 0, bdev); - goto done_unlock; + err = do_md_stop(mddev, 0); + goto unlock; case STOP_ARRAY_RO: - err = md_set_readonly(mddev, bdev); - goto done_unlock; + if (mddev->pers) + err = md_set_readonly(mddev); + goto unlock; case HOT_REMOVE_DISK: err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; + goto unlock; case ADD_NEW_DISK: /* We can support ADD_NEW_DISK on read-only arrays - * on if we are re-adding a preexisting device. + * only if we are re-adding a preexisting device. * So require mddev->pers and MD_DISK_SYNC. */ if (mddev->pers) { @@ -6518,67 +8220,34 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, /* Need to clear read-only for this */ break; else - err = add_new_disk(mddev, &info); - goto done_unlock; + err = md_add_new_disk(mddev, &info); + goto unlock; } break; - - case BLKROSET: - if (get_user(ro, (int __user *)(arg))) { - err = -EFAULT; - goto done_unlock; - } - err = -EINVAL; - - /* if the bdev is going readonly the value of mddev->ro - * does not matter, no writes are coming - */ - if (ro) - goto done_unlock; - - /* are we are already prepared for writes? */ - if (mddev->ro != 1) - goto done_unlock; - - /* transitioning to readauto need only happen for - * arrays that call md_write_start - */ - if (mddev->pers) { - err = restart_array(mddev); - if (err == 0) { - mddev->ro = 2; - set_disk_ro(mddev->gendisk, 0); - } - } - goto done_unlock; } /* * The remaining ioctls are changing the state of the * superblock, so we do not allow them on read-only arrays. - * However non-MD ioctls (e.g. get-size) will still come through - * here and hit the 'default' below, so only disallow - * 'md' ioctls, and switch to rw mode if started auto-readonly. */ - if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { - if (mddev->ro == 2) { - mddev->ro = 0; - sysfs_notify_dirent_safe(mddev->sysfs_state); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - /* mddev_unlock will wake thread */ - /* If a device failed while we were read-only, we - * need to make sure the metadata is updated now. - */ - if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { - mddev_unlock(mddev); - wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_DEVS, &mddev->flags) && - !test_bit(MD_CHANGE_PENDING, &mddev->flags)); - mddev_lock(mddev); - } - } else { + if (!md_is_rdwr(mddev) && mddev->pers) { + if (mddev->ro != MD_AUTO_READ) { err = -EROFS; - goto abort_unlock; + goto unlock; + } + mddev->ro = MD_RDWR; + sysfs_notify_dirent_safe(mddev->sysfs_state); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + /* mddev_unlock will wake thread */ + /* If a device failed while we were read-only, we + * need to make sure the metadata is updated now. + */ + if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { + mddev_unlock(mddev); + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + mddev_lock_nointr(mddev); } } @@ -6589,43 +8258,49 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, if (copy_from_user(&info, argp, sizeof(info))) err = -EFAULT; else - err = add_new_disk(mddev, &info); - goto done_unlock; + err = md_add_new_disk(mddev, &info); + goto unlock; } + case CLUSTERED_DISK_NACK: + if (mddev_is_clustered(mddev)) + mddev->cluster_ops->new_disk_ack(mddev, false); + else + err = -EINVAL; + goto unlock; + case HOT_ADD_DISK: err = hot_add_disk(mddev, new_decode_dev(arg)); - goto done_unlock; + goto unlock; case RUN_ARRAY: err = do_md_run(mddev); - goto done_unlock; + goto unlock; case SET_BITMAP_FILE: err = set_bitmap_file(mddev, (int)arg); - goto done_unlock; + goto unlock; default: err = -EINVAL; - goto abort_unlock; + goto unlock; } -done_unlock: -abort_unlock: +unlock: if (mddev->hold_active == UNTIL_IOCTL && err != -EINVAL) mddev->hold_active = 0; - mddev_unlock(mddev); - return err; -done: - if (err) - MD_BUG(); -abort: + md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : + mddev_unlock(mddev); + +out: + if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) + clear_bit(MD_CLOSING, &mddev->flags); return err; } #ifdef CONFIG_COMPAT -static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, +static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -6644,68 +8319,99 @@ static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif /* CONFIG_COMPAT */ -static int md_open(struct block_device *bdev, fmode_t mode) +static int md_set_read_only(struct block_device *bdev, bool ro) { + struct mddev *mddev = bdev->bd_disk->private_data; + int err; + + err = mddev_lock(mddev); + if (err) + return err; + + if (!mddev->raid_disks && !mddev->external) { + err = -ENODEV; + goto out_unlock; + } + /* - * Succeed if we can lock the mddev, which confirms that - * it isn't being stopped right now. + * Transitioning to read-auto need only happen for arrays that call + * md_write_start and which are not ready for writes yet. */ - struct mddev *mddev = mddev_find(bdev->bd_dev); + if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { + err = restart_array(mddev); + if (err) + goto out_unlock; + mddev->ro = MD_AUTO_READ; + } + +out_unlock: + mddev_unlock(mddev); + return err; +} + +static int md_open(struct gendisk *disk, blk_mode_t mode) +{ + struct mddev *mddev; int err; + spin_lock(&all_mddevs_lock); + mddev = mddev_get(disk->private_data); + spin_unlock(&all_mddevs_lock); if (!mddev) return -ENODEV; - if (mddev->gendisk != bdev->bd_disk) { - /* we are racing with mddev_put which is discarding this - * bd_disk. - */ - mddev_put(mddev); - /* Wait until bdev->bd_disk is definitely gone */ - flush_workqueue(md_misc_wq); - /* Then retry the open from the top */ - return -ERESTARTSYS; - } - BUG_ON(mddev != bdev->bd_disk->private_data); - - if ((err = mutex_lock_interruptible(&mddev->open_mutex))) + err = mutex_lock_interruptible(&mddev->open_mutex); + if (err) goto out; - err = 0; + err = -ENODEV; + if (test_bit(MD_CLOSING, &mddev->flags)) + goto out_unlock; + atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); - check_disk_change(bdev); - out: + disk_check_media_change(disk); + return 0; + +out_unlock: + mutex_unlock(&mddev->open_mutex); +out: + mddev_put(mddev); return err; } -static void md_release(struct gendisk *disk, fmode_t mode) +static void md_release(struct gendisk *disk) { - struct mddev *mddev = disk->private_data; + struct mddev *mddev = disk->private_data; BUG_ON(!mddev); atomic_dec(&mddev->openers); mddev_put(mddev); } -static int md_media_changed(struct gendisk *disk) +static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) { struct mddev *mddev = disk->private_data; + unsigned int ret = 0; - return mddev->changed; + if (mddev->changed) + ret = DISK_EVENT_MEDIA_CHANGE; + mddev->changed = 0; + return ret; } -static int md_revalidate(struct gendisk *disk) +static void md_free_disk(struct gendisk *disk) { struct mddev *mddev = disk->private_data; - mddev->changed = 0; - return 0; + mddev_free(mddev); } -static const struct block_device_operations md_fops = + +const struct block_device_operations md_fops = { .owner = THIS_MODULE, + .submit_bio = md_submit_bio, .open = md_open, .release = md_release, .ioctl = md_ioctl, @@ -6713,11 +8419,12 @@ static const struct block_device_operations md_fops = .compat_ioctl = md_compat_ioctl, #endif .getgeo = md_getgeo, - .media_changed = md_media_changed, - .revalidate_disk= md_revalidate, + .check_events = md_check_events, + .set_read_only = md_set_read_only, + .free_disk = md_free_disk, }; -static int md_thread(void * arg) +static int md_thread(void *arg) { struct md_thread *thread = arg; @@ -6747,10 +8454,12 @@ static int md_thread(void * arg) wait_event_interruptible_timeout (thread->wqueue, test_bit(THREAD_WAKEUP, &thread->flags) - || kthread_should_stop(), + || kthread_should_stop() || kthread_should_park(), thread->timeout); clear_bit(THREAD_WAKEUP, &thread->flags); + if (kthread_should_park()) + kthread_parkme(); if (!kthread_should_stop()) thread->run(thread); } @@ -6758,14 +8467,30 @@ static int md_thread(void * arg) return 0; } -void md_wakeup_thread(struct md_thread *thread) +static void md_wakeup_thread_directly(struct md_thread __rcu **thread) +{ + struct md_thread *t; + + rcu_read_lock(); + t = rcu_dereference(*thread); + if (t) + wake_up_process(t->tsk); + rcu_read_unlock(); +} + +void __md_wakeup_thread(struct md_thread __rcu *thread) { - if (thread) { - pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - set_bit(THREAD_WAKEUP, &thread->flags); - wake_up(&thread->wqueue); + struct md_thread *t; + + t = rcu_dereference(thread); + if (t) { + pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); + set_bit(THREAD_WAKEUP, &t->flags); + if (wq_has_sleeper(&t->wqueue)) + wake_up(&t->wqueue); } } +EXPORT_SYMBOL(__md_wakeup_thread); struct md_thread *md_register_thread(void (*run) (struct md_thread *), struct mddev *mddev, const char *name) @@ -6791,47 +8516,51 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *), } return thread; } +EXPORT_SYMBOL(md_register_thread); -void md_unregister_thread(struct md_thread **threadp) +void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) { - struct md_thread *thread = *threadp; + struct md_thread *thread = rcu_dereference_protected(*threadp, + lockdep_is_held(&mddev->reconfig_mutex)); + if (!thread) return; - pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); - /* Locking ensures that mddev_unlock does not wake_up a - * non-existent thread - */ - spin_lock(&pers_lock); - *threadp = NULL; - spin_unlock(&pers_lock); + rcu_assign_pointer(*threadp, NULL); + synchronize_rcu(); + + pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); kfree(thread); } +EXPORT_SYMBOL(md_unregister_thread); void md_error(struct mddev *mddev, struct md_rdev *rdev) { - if (!mddev) { - MD_BUG(); - return; - } - if (!rdev || test_bit(Faulty, &rdev->flags)) return; if (!mddev->pers || !mddev->pers->error_handler) return; - mddev->pers->error_handler(mddev,rdev); - if (mddev->degraded) + mddev->pers->error_handler(mddev, rdev); + + if (mddev->pers->head.id == ID_RAID0 || + mddev->pers->head.id == ID_LINEAR) + return; + + if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + if (!test_bit(MD_BROKEN, &mddev->flags)) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); - md_new_event_inintr(mddev); + md_new_event(); } +EXPORT_SYMBOL(md_error); /* seq_file implementation /proc/mdstat */ @@ -6843,10 +8572,8 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "unused devices: "); list_for_each_entry(rdev, &pending_raid_disks, same_set) { - char b[BDEVNAME_SIZE]; i++; - seq_printf(seq, "%s ", - bdevname(rdev->bdev,b)); + seq_printf(seq, "%pg ", rdev->bdev); } if (!i) seq_printf(seq, "<none>"); @@ -6854,34 +8581,86 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "\n"); } +static void status_personalities(struct seq_file *seq) +{ + struct md_submodule_head *head; + unsigned long i; + + seq_puts(seq, "Personalities : "); -static void status_resync(struct seq_file *seq, struct mddev * mddev) + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) + if (head->type == MD_PERSONALITY) + seq_printf(seq, "[%s] ", head->name); + xa_unlock(&md_submodule); + + seq_puts(seq, "\n"); +} + +static int status_resync(struct seq_file *seq, struct mddev *mddev) { sector_t max_sectors, resync, res; - unsigned long dt, db; - sector_t rt; - int scale; + unsigned long dt, db = 0; + sector_t rt, curr_mark_cnt, resync_mark_cnt; + int scale, recovery_active; unsigned int per_milli; - if (mddev->curr_resync <= 3) - resync = 0; - else - resync = mddev->curr_resync - - atomic_read(&mddev->recovery_active); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; - /* - * Should not happen. - */ - if (!max_sectors) { - MD_BUG(); - return; + resync = mddev->curr_resync; + if (resync < MD_RESYNC_ACTIVE) { + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + /* Still cleaning up */ + resync = max_sectors; + } else if (resync > max_sectors) { + resync = max_sectors; + } else { + res = atomic_read(&mddev->recovery_active); + /* + * Resync has started, but the subtraction has overflowed or + * yielded one of the special values. Force it to active to + * ensure the status reports an active resync. + */ + if (resync < res || resync - res < MD_RESYNC_ACTIVE) + resync = MD_RESYNC_ACTIVE; + else + resync -= res; } + + if (resync == MD_RESYNC_NONE) { + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags) && + rdev->recovery_offset != MaxSector && + rdev->recovery_offset) { + seq_printf(seq, "\trecover=REMOTE"); + return 1; + } + if (mddev->reshape_position != MaxSector) + seq_printf(seq, "\treshape=REMOTE"); + else + seq_printf(seq, "\tresync=REMOTE"); + return 1; + } + if (mddev->resync_offset < MaxSector) { + seq_printf(seq, "\tresync=PENDING"); + return 1; + } + return 0; + } + if (resync < MD_RESYNC_ACTIVE) { + seq_printf(seq, "\tresync=DELAYED"); + return 1; + } + + WARN_ON(max_sectors == 0); /* Pick 'scale' such that (resync>>scale)*1000 will fit * in a sector_t, and (max_sectors>>scale) will fit in a * u32, as those are the requirements for sector_div. @@ -6922,22 +8701,30 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev) * db: blocks written from mark until now * rt: remaining time * - * rt is a sector_t, so could be 32bit or 64bit. - * So we divide before multiply in case it is 32bit and close - * to the limit. - * We scale the divisor (db) by 32 to avoid losing precision - * near the end of resync when the number of remaining sectors - * is close to 'db'. - * We then divide rt by 32 after multiplying by db to compensate. - * The '+1' avoids division by zero if db is very small. + * rt is a sector_t, which is always 64bit now. We are keeping + * the original algorithm, but it is not really necessary. + * + * Original algorithm: + * So we divide before multiply in case it is 32bit and close + * to the limit. + * We scale the divisor (db) by 32 to avoid losing precision + * near the end of resync when the number of remaining sectors + * is close to 'db'. + * We then divide rt by 32 after multiplying by db to compensate. + * The '+1' avoids division by zero if db is very small. */ dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; - db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) - - mddev->resync_mark_cnt; + + curr_mark_cnt = mddev->curr_mark_cnt; + recovery_active = atomic_read(&mddev->recovery_active); + resync_mark_cnt = mddev->resync_mark_cnt; + + if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) + db = curr_mark_cnt - (recovery_active + resync_mark_cnt); rt = max_sectors - resync; /* number of remaining sectors */ - sector_div(rt, db/32+1); + rt = div64_u64(rt, db/32+1); rt *= dt; rt >>= 5; @@ -6945,114 +8732,107 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev) ((unsigned long)rt % 60)/6); seq_printf(seq, " speed=%ldK/sec", db/2/dt); + return 1; } static void *md_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(&all_mddevs_lock) { - struct list_head *tmp; - loff_t l = *pos; - struct mddev *mddev; - - if (l >= 0x10000) - return NULL; - if (!l--) - /* header */ - return (void*)1; - + seq->poll_event = atomic_read(&md_event_count); spin_lock(&all_mddevs_lock); - list_for_each(tmp,&all_mddevs) - if (!l--) { - mddev = list_entry(tmp, struct mddev, all_mddevs); - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - return mddev; - } - spin_unlock(&all_mddevs_lock); - if (!l--) - return (void*)2;/* tail */ - return NULL; + + return seq_list_start_head(&all_mddevs, *pos); } static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct list_head *tmp; - struct mddev *next_mddev, *mddev = v; - - ++*pos; - if (v == (void*)2) - return NULL; + return seq_list_next(v, &all_mddevs, pos); +} - spin_lock(&all_mddevs_lock); - if (v == (void*)1) - tmp = all_mddevs.next; - else - tmp = mddev->all_mddevs.next; - if (tmp != &all_mddevs) - next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); - else { - next_mddev = (void*)2; - *pos = 0x10000; - } +static void md_seq_stop(struct seq_file *seq, void *v) + __releases(&all_mddevs_lock) +{ spin_unlock(&all_mddevs_lock); - - if (v != (void*)1) - mddev_put(mddev); - return next_mddev; - } -static void md_seq_stop(struct seq_file *seq, void *v) +static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) { - struct mddev *mddev = v; + struct md_bitmap_stats stats; + unsigned long used_pages; + unsigned long chunk_kb; + int err; - if (mddev && v != (void*)1 && v != (void*)2) - mddev_put(mddev); + if (!md_bitmap_enabled(mddev, false)) + return; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return; + + chunk_kb = mddev->bitmap_info.chunksize >> 10; + used_pages = stats.pages - stats.missing_pages; + + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", + used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, + chunk_kb ? "KB" : "B"); + + if (stats.file) { + seq_puts(seq, ", file: "); + seq_file_path(seq, stats.file, " \t\n"); + } + + seq_putc(seq, '\n'); } static int md_seq_show(struct seq_file *seq, void *v) { - struct mddev *mddev = v; + struct mddev *mddev; sector_t sectors; struct md_rdev *rdev; - if (v == (void*)1) { - struct md_personality *pers; - seq_printf(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - - spin_unlock(&pers_lock); - seq_printf(seq, "\n"); - seq->poll_event = atomic_read(&md_event_count); + if (v == &all_mddevs) { + status_personalities(seq); + if (list_empty(&all_mddevs)) + status_unused(seq); return 0; } - if (v == (void*)2) { - status_unused(seq); + + mddev = list_entry(v, struct mddev, all_mddevs); + if (!mddev_get(mddev)) return 0; - } - if (mddev_lock(mddev) < 0) - return -EINTR; + spin_unlock(&all_mddevs_lock); + + /* prevent bitmap to be freed after checking */ + mutex_lock(&mddev->bitmap_info.mutex); + spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { - seq_printf(seq, "%s : %sactive", mdname(mddev), - mddev->pers ? "" : "in"); + seq_printf(seq, "%s : ", mdname(mddev)); if (mddev->pers) { - if (mddev->ro==1) + if (test_bit(MD_BROKEN, &mddev->flags)) + seq_printf(seq, "broken"); + else + seq_printf(seq, "active"); + if (mddev->ro == MD_RDONLY) seq_printf(seq, " (read-only)"); - if (mddev->ro==2) + if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); - seq_printf(seq, " %s", mddev->pers->name); + seq_printf(seq, " %s", mddev->pers->head.name); + } else { + seq_printf(seq, "inactive"); } sectors = 0; - rdev_for_each(rdev, mddev) { - char b[BDEVNAME_SIZE]; - seq_printf(seq, " %s[%d]", - bdevname(rdev->bdev,b), rdev->desc_nr); + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); + if (test_bit(WriteMostly, &rdev->flags)) seq_printf(seq, "(W)"); + if (test_bit(Journal, &rdev->flags)) + seq_printf(seq, "(J)"); if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; @@ -7063,6 +8843,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "(R)"); sectors += rdev->sectors; } + rcu_read_unlock(); if (!list_empty(&mddev->disks)) { if (mddev->pers) @@ -7088,25 +8869,26 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev->pers) { mddev->pers->status(seq, mddev); - seq_printf(seq, "\n "); + seq_printf(seq, "\n "); if (mddev->pers->sync_request) { - if (mddev->curr_resync > 2) { - status_resync(seq, mddev); + if (status_resync(seq, mddev)) seq_printf(seq, "\n "); - } else if (mddev->curr_resync >= 1) - seq_printf(seq, "\tresync=DELAYED\n "); - else if (mddev->recovery_cp < MaxSector) - seq_printf(seq, "\tresync=PENDING\n "); } } else seq_printf(seq, "\n "); - bitmap_status(seq, mddev->bitmap); + md_bitmap_status(seq, mddev); seq_printf(seq, "\n"); } - mddev_unlock(mddev); - + spin_unlock(&mddev->lock); + mutex_unlock(&mddev->bitmap_info.mutex); + spin_lock(&all_mddevs_lock); + + if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) + status_unused(seq); + + mddev_put_locked(mddev); return 0; } @@ -7131,89 +8913,119 @@ static int md_seq_open(struct inode *inode, struct file *file) return error; } -static unsigned int mdstat_poll(struct file *filp, poll_table *wait) +static int md_unloading; +static __poll_t mdstat_poll(struct file *filp, poll_table *wait) { struct seq_file *seq = filp->private_data; - int mask; + __poll_t mask; + if (md_unloading) + return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; poll_wait(filp, &md_event_waiters, wait); /* always allow read */ - mask = POLLIN | POLLRDNORM; + mask = EPOLLIN | EPOLLRDNORM; if (seq->poll_event != atomic_read(&md_event_count)) - mask |= POLLERR | POLLPRI; + mask |= EPOLLERR | EPOLLPRI; return mask; } -static const struct file_operations md_seq_fops = { - .owner = THIS_MODULE, - .open = md_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, - .poll = mdstat_poll, +static const struct proc_ops mdstat_proc_ops = { + .proc_open = md_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, + .proc_poll = mdstat_poll, }; -int register_md_personality(struct md_personality *p) +int register_md_submodule(struct md_submodule_head *msh) { - spin_lock(&pers_lock); - list_add_tail(&p->list, &pers_list); - printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); - spin_unlock(&pers_lock); - return 0; + return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL); } +EXPORT_SYMBOL_GPL(register_md_submodule); -int unregister_md_personality(struct md_personality *p) +void unregister_md_submodule(struct md_submodule_head *msh) { - printk(KERN_INFO "md: %s personality unregistered\n", p->name); - spin_lock(&pers_lock); - list_del_init(&p->list); - spin_unlock(&pers_lock); - return 0; + xa_erase(&md_submodule, msh->id); } +EXPORT_SYMBOL_GPL(unregister_md_submodule); -static int is_mddev_idle(struct mddev *mddev, int init) +int md_setup_cluster(struct mddev *mddev, int nodes) { - struct md_rdev * rdev; - int idle; - int curr_events; + int ret = get_cluster_ops(mddev); - idle = 1; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) { - struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + - (int)part_stat_read(&disk->part0, sectors[1]) - - atomic_read(&disk->sync_io); - /* sync IO will cause sync_io to increase before the disk_stats - * as sync_io is counted when a request starts, and - * disk_stats is counted when it completes. - * So resync activity will cause curr_events to be smaller than - * when there was no such activity. - * non-sync IO will cause disk_stat to increase without - * increasing sync_io so curr_events will (eventually) - * be larger than it was before. Once it becomes - * substantially larger, the test below will cause - * the array to appear non-idle, and resync will slow - * down. - * If there is a lot of outstanding resync activity when - * we set last_event to curr_events, then all that activity - * completing might cause the array to appear non-idle - * and resync will be slowed down even though there might - * not have been non-resync activity. This will only - * happen once though. 'last_events' will soon reflect - * the state where there is little or no outstanding - * resync requests, and further resync activity will - * always make curr_events less than last_events. - * - */ - if (init || curr_events - rdev->last_events > 64) { - rdev->last_events = curr_events; - idle = 0; - } + if (ret) { + request_module("md-cluster"); + ret = get_cluster_ops(mddev); + } + + /* ensure module won't be unloaded */ + if (ret) { + pr_warn("can't find md-cluster module or get its reference.\n"); + return ret; } + + ret = mddev->cluster_ops->join(mddev, nodes); + if (!ret) + mddev->safemode_delay = 0; + return ret; +} + +void md_cluster_stop(struct mddev *mddev) +{ + put_cluster_ops(mddev); +} + +static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init) +{ + unsigned long last_events = rdev->last_events; + + if (!bdev_is_partition(rdev->bdev)) + return true; + + /* + * If rdev is partition, and user doesn't issue IO to the array, the + * array is still not idle if user issues IO to other partitions. + */ + rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0, + sectors) - + part_stat_read_accum(rdev->bdev, sectors); + + return init || rdev->last_events <= last_events; +} + +/* + * mddev is idle if following conditions are matched since last check: + * 1) mddev doesn't have normal IO completed; + * 2) mddev doesn't have inflight normal IO; + * 3) if any member disk is partition, and other partitions don't have IO + * completed; + * + * Noted this checking rely on IO accounting is enabled. + */ +static bool is_mddev_idle(struct mddev *mddev, int init) +{ + unsigned long last_events = mddev->normal_io_events; + struct gendisk *disk; + struct md_rdev *rdev; + bool idle = true; + + disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk; + if (!disk) + return true; + + mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); + if (!init && (mddev->normal_io_events > last_events || + bdev_count_inflight(disk->part0))) + idle = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (!is_rdev_holder_idle(rdev, init)) + idle = false; rcu_read_unlock(); + return idle; } @@ -7229,96 +9041,355 @@ void md_done_sync(struct mddev *mddev, int blocks, int ok) // stop recovery, signal do_sync .... } } - +EXPORT_SYMBOL(md_done_sync); /* md_write_start(mddev, bi) * If we need to update some array metadata (e.g. 'active' flag * in superblock) before writing, schedule a superblock update * and wait for it to complete. + * A return value of 'false' means that the write wasn't recorded + * and cannot proceed as the array is being suspend. */ void md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; + if (bio_data_dir(bi) != WRITE) return; - BUG_ON(mddev->ro == 1); - if (mddev->ro == 2) { + BUG_ON(mddev->ro == MD_RDONLY); + if (mddev->ro == MD_AUTO_READ) { /* need to switch to read/write */ - mddev->ro = 0; + mddev->ro = MD_RDWR; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); did_change = 1; } - atomic_inc(&mddev->writes_pending); + rcu_read_lock(); + percpu_ref_get(&mddev->writes_pending); + smp_mb(); /* Match smp_mb in set_in_sync() */ if (mddev->safemode == 1) mddev->safemode = 0; - if (mddev->in_sync) { - spin_lock_irq(&mddev->write_lock); + /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ + if (mddev->in_sync || mddev->sync_checkers) { + spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); md_wakeup_thread(mddev->thread); did_change = 1; } - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); } + rcu_read_unlock(); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); + if (!mddev->has_superblocks) + return; wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); +} +EXPORT_SYMBOL(md_write_start); + +/* md_write_inc can only be called when md_write_start() has + * already been called at least once of the current request. + * It increments the counter and is useful when a single request + * is split into several parts. Each part causes an increment and + * so needs a matching md_write_end(). + * Unlike md_write_start(), it is safe to call md_write_inc() inside + * a spinlocked region. + */ +void md_write_inc(struct mddev *mddev, struct bio *bi) +{ + if (bio_data_dir(bi) != WRITE) + return; + WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); + percpu_ref_get(&mddev->writes_pending); } +EXPORT_SYMBOL(md_write_inc); void md_write_end(struct mddev *mddev) { - if (atomic_dec_and_test(&mddev->writes_pending)) { - if (mddev->safemode == 2) - md_wakeup_thread(mddev->thread); - else if (mddev->safemode_delay) - mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); + percpu_ref_put(&mddev->writes_pending); + + if (mddev->safemode == 2) + md_wakeup_thread(mddev->thread); + else if (mddev->safemode_delay) + /* The roundup() ensures this only performs locking once + * every ->safemode_delay jiffies + */ + mod_timer(&mddev->safemode_timer, + roundup(jiffies, mddev->safemode_delay) + + mddev->safemode_delay); +} + +EXPORT_SYMBOL(md_write_end); + +/* This is used by raid0 and raid10 */ +void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, + struct bio *bio, sector_t start, sector_t size) +{ + struct bio *discard_bio = NULL; + + if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, + &discard_bio) || !discard_bio) + return; + + bio_chain(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); + mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); + submit_bio_noacct(discard_bio); +} +EXPORT_SYMBOL_GPL(md_submit_discard_bio); + +static void md_bitmap_start(struct mddev *mddev, + struct md_io_clone *md_io_clone) +{ + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? + mddev->bitmap_ops->start_discard : + mddev->bitmap_ops->start_write; + + if (mddev->pers->bitmap_sector) + mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, + &md_io_clone->sectors); + + fn(mddev, md_io_clone->offset, md_io_clone->sectors); +} + +static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) +{ + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? + mddev->bitmap_ops->end_discard : + mddev->bitmap_ops->end_write; + + fn(mddev, md_io_clone->offset, md_io_clone->sectors); +} + +static void md_end_clone_io(struct bio *bio) +{ + struct md_io_clone *md_io_clone = bio->bi_private; + struct bio *orig_bio = md_io_clone->orig_bio; + struct mddev *mddev = md_io_clone->mddev; + + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) + md_bitmap_end(mddev, md_io_clone); + + if (bio->bi_status && !orig_bio->bi_status) + orig_bio->bi_status = bio->bi_status; + + if (md_io_clone->start_time) + bio_end_io_acct(orig_bio, md_io_clone->start_time); + + bio_put(bio); + bio_endio(orig_bio); + percpu_ref_put(&mddev->active_io); +} + +static void md_clone_bio(struct mddev *mddev, struct bio **bio) +{ + struct block_device *bdev = (*bio)->bi_bdev; + struct md_io_clone *md_io_clone; + struct bio *clone = + bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); + + md_io_clone = container_of(clone, struct md_io_clone, bio_clone); + md_io_clone->orig_bio = *bio; + md_io_clone->mddev = mddev; + if (blk_queue_io_stat(bdev->bd_disk->queue)) + md_io_clone->start_time = bio_start_io_acct(*bio); + + if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) { + md_io_clone->offset = (*bio)->bi_iter.bi_sector; + md_io_clone->sectors = bio_sectors(*bio); + md_io_clone->rw = op_stat_group(bio_op(*bio)); + md_bitmap_start(mddev, md_io_clone); } + + clone->bi_end_io = md_end_clone_io; + clone->bi_private = md_io_clone; + *bio = clone; +} + +void md_account_bio(struct mddev *mddev, struct bio **bio) +{ + percpu_ref_get(&mddev->active_io); + md_clone_bio(mddev, bio); +} +EXPORT_SYMBOL_GPL(md_account_bio); + +void md_free_cloned_bio(struct bio *bio) +{ + struct md_io_clone *md_io_clone = bio->bi_private; + struct bio *orig_bio = md_io_clone->orig_bio; + struct mddev *mddev = md_io_clone->mddev; + + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) + md_bitmap_end(mddev, md_io_clone); + + if (bio->bi_status && !orig_bio->bi_status) + orig_bio->bi_status = bio->bi_status; + + if (md_io_clone->start_time) + bio_end_io_acct(orig_bio, md_io_clone->start_time); + + bio_put(bio); + percpu_ref_put(&mddev->active_io); } +EXPORT_SYMBOL_GPL(md_free_cloned_bio); /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. - * - * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock - * is dropped, so return -EAGAIN after notifying userspace. */ -int md_allow_write(struct mddev *mddev) +void md_allow_write(struct mddev *mddev) { if (!mddev->pers) - return 0; - if (mddev->ro) - return 0; + return; + if (!md_is_rdwr(mddev)) + return; if (!mddev->pers->sync_request) - return 0; + return; - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); md_update_sb(mddev, 0); sysfs_notify_dirent_safe(mddev->sysfs_state); + /* wait for the dirty state to be recorded in the metadata */ + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); +} +EXPORT_SYMBOL_GPL(md_allow_write); - if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) - return -EAGAIN; - else +static sector_t md_sync_max_sectors(struct mddev *mddev, + enum sync_action action) +{ + switch (action) { + case ACTION_RESYNC: + case ACTION_CHECK: + case ACTION_REPAIR: + atomic64_set(&mddev->resync_mismatches, 0); + fallthrough; + case ACTION_RESHAPE: + return mddev->resync_max_sectors; + case ACTION_RECOVER: + return mddev->dev_sectors; + default: return 0; + } +} + +/* + * If lazy recovery is requested and all rdevs are in sync, select the rdev with + * the higest index to perfore recovery to build initial xor data, this is the + * same as old bitmap. + */ +static bool mddev_select_lazy_recover_rdev(struct mddev *mddev) +{ + struct md_rdev *recover_rdev = NULL; + struct md_rdev *rdev; + bool ret = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + + if (test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + break; + + if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk) + recover_rdev = rdev; + } + + if (recover_rdev) { + clear_bit(In_sync, &recover_rdev->flags); + ret = true; + } + + rcu_read_unlock(); + return ret; +} + +static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) +{ + sector_t start = 0; + struct md_rdev *rdev; + + switch (action) { + case ACTION_CHECK: + case ACTION_REPAIR: + return mddev->resync_min; + case ACTION_RESYNC: + if (!mddev->bitmap) + return mddev->resync_offset; + return 0; + case ACTION_RESHAPE: + /* + * If the original node aborts reshaping then we continue the + * reshaping, so set again to avoid restart reshape from the + * first beginning + */ + if (mddev_is_clustered(mddev) && + mddev->reshape_position != MaxSector) + return mddev->reshape_position; + return 0; + case ACTION_RECOVER: + start = MaxSector; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev_needs_recovery(rdev, start)) + start = rdev->recovery_offset; + rcu_read_unlock(); + + /* + * If there are no spares, and raid456 lazy initial recover is + * requested. + */ + if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) && + start == MaxSector && mddev_select_lazy_recover_rdev(mddev)) + start = 0; + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. + */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return start; + default: + return MaxSector; + } +} + +static bool sync_io_within_limit(struct mddev *mddev) +{ + /* + * For raid456, sync IO is stripe(4k) per IO, for other levels, it's + * RESYNC_PAGES(64k) per IO. + */ + return atomic_read(&mddev->recovery_active) < + (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev); } -EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) @@ -7327,49 +9398,57 @@ void md_do_sync(struct md_thread *thread) { struct mddev *mddev = thread->mddev; struct mddev *mddev2; - unsigned int currspeed = 0, - window; - sector_t max_sectors,j, io_sectors; + unsigned int currspeed = 0, window; + sector_t max_sectors,j, io_sectors, recovery_done; unsigned long mark[SYNC_MARKS]; unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; - struct list_head *tmp; sector_t last_check; int skipped = 0; struct md_rdev *rdev; - char *desc, *action = NULL; + enum sync_action action; + const char *desc; struct blk_plug plug; + int ret; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; - if (mddev->ro) /* never try to sync a read-only array */ - return; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { - desc = "data-check"; - action = "check"; - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - desc = "requested-resync"; - action = "repair"; - } else - desc = "resync"; - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - desc = "reshape"; - else - desc = "recovery"; + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto skip; + + if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || + !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto skip; + } - mddev->last_sync_action = action ?: desc; + if (mddev_is_clustered(mddev)) { + ret = mddev->cluster_ops->resync_start(mddev); + if (ret) + goto skip; - /* we overload curr_resync somewhat here. - * 0 == not engaged in resync at all - * 2 == checking that there is no conflict with another sync - * 1 == like 2, but have yielded to allow conflicting resync to - * commense - * other == active in resync - this many blocks - * + set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); + if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + && ((unsigned long long)mddev->curr_resync_completed + < (unsigned long long)mddev->resync_max_sectors)) + goto skip; + } + + action = md_sync_action(mddev); + if (action == ACTION_FROZEN || action == ACTION_IDLE) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto skip; + } + + desc = md_sync_action_name(action); + mddev->last_sync_action = action; + + /* * Before starting a resync we must have set curr_resync to * 2, and then checked that every "conflicting" array has curr_resync * less than ours. When we find one that is the same or higher @@ -7378,29 +9457,33 @@ void md_do_sync(struct md_thread *thread) * This will mean we have to start checking from the beginning again. * */ - + if (mddev_is_clustered(mddev)) + mddev->cluster_ops->resync_start_notify(mddev); do { - mddev->curr_resync = 2; + int mddev2_minor = -1; + mddev->curr_resync = MD_RESYNC_DELAYED; try_again: - if (kthread_should_stop()) - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; - for_each_mddev(mddev2, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev2->flags)) + continue; if (mddev2 == mddev) continue; if (!mddev->parallel_resync && mddev2->curr_resync && match_mddev_units(mddev, mddev2)) { DEFINE_WAIT(wq); - if (mddev < mddev2 && mddev->curr_resync == 2) { + if (mddev < mddev2 && + mddev->curr_resync == MD_RESYNC_DELAYED) { /* arbitrarily yield */ - mddev->curr_resync = 1; + mddev->curr_resync = MD_RESYNC_YIELDED; wake_up(&resync_wait); } - if (mddev > mddev2 && mddev->curr_resync == 1) + if (mddev > mddev2 && + mddev->curr_resync == MD_RESYNC_YIELDED) /* no need to wait here, we can wait the next * time 'round when curr_resync == 2 */ @@ -7410,13 +9493,16 @@ void md_do_sync(struct md_thread *thread) * be caught by 'softlockup' */ prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); - if (!kthread_should_stop() && + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev2->curr_resync >= mddev->curr_resync) { - printk(KERN_INFO "md: delaying %s of %s" - " until %s has finished (they" - " share one or more physical units)\n", - desc, mdname(mddev), mdname(mddev2)); - mddev_put(mddev2); + if (mddev2_minor != mddev2->md_minor) { + mddev2_minor = mddev2->md_minor; + pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", + desc, mdname(mddev), + mdname(mddev2)); + } + spin_unlock(&all_mddevs_lock); + if (signal_pending(current)) flush_signals(current); schedule(); @@ -7426,43 +9512,16 @@ void md_do_sync(struct md_thread *thread) finish_wait(&resync_wait, &wq); } } - } while (mddev->curr_resync < 2); - - j = 0; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - /* resync follows the size requested by the personality, - * which defaults to physical size, but can be virtual size - */ - max_sectors = mddev->resync_max_sectors; - atomic64_set(&mddev->resync_mismatches, 0); - /* we don't use the checkpoint if there's a bitmap */ - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - j = mddev->resync_min; - else if (!mddev->bitmap) - j = mddev->recovery_cp; + spin_unlock(&all_mddevs_lock); + } while (mddev->curr_resync < MD_RESYNC_DELAYED); - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sectors = mddev->resync_max_sectors; - else { - /* recovery follows the physical size of devices */ - max_sectors = mddev->dev_sectors; - j = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < j) - j = rdev->recovery_offset; - rcu_read_unlock(); - } + max_sectors = md_sync_max_sectors(mddev, action); + j = md_sync_position(mddev, action); - printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); - printk(KERN_INFO "md: minimum _guaranteed_ speed:" - " %d KB/sec/disk.\n", speed_min(mddev)); - printk(KERN_INFO "md: using maximum available idle IO bandwidth " - "(but not more than %d KB/sec) for %s.\n", - speed_max(mddev), desc); + pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); + pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); + pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", + speed_max(mddev), desc); is_mddev_idle(mddev, 1); /* this initializes IO event counters */ @@ -7478,23 +9537,22 @@ void md_do_sync(struct md_thread *thread) /* * Tune reconstruction: */ - window = 32*(PAGE_SIZE/512); - printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", - window/2, (unsigned long long)max_sectors/2); + window = 32 * (PAGE_SIZE / 512); + pr_debug("md: using %dk window, over a total of %lluk.\n", + window/2, (unsigned long long)max_sectors/2); atomic_set(&mddev->recovery_active, 0); last_check = 0; - if (j>2) { - printk(KERN_INFO - "md: resuming %s of %s from checkpoint.\n", - desc, mdname(mddev)); + if (j >= MD_RESYNC_ACTIVE) { + pr_debug("md: resuming %s of %s from checkpoint.\n", + desc, mdname(mddev)); mddev->curr_resync = j; } else - mddev->curr_resync = 3; /* no longer delayed */ + mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ mddev->curr_resync_completed = j; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - md_new_event(mddev); + sysfs_notify_dirent_safe(mddev->sysfs_completed); + md_new_event(); update_time = jiffies; blk_start_plug(&plug); @@ -7509,21 +9567,23 @@ void md_do_sync(struct md_thread *thread) > (max_sectors >> 4)) || time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || (j - mddev->curr_resync_completed)*2 - >= mddev->resync_max - mddev->curr_resync_completed + >= mddev->resync_max - mddev->curr_resync_completed || + mddev->curr_resync_completed > mddev->resync_max )) { /* time to update curr_resync_completed */ wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && - j > mddev->recovery_cp) - mddev->recovery_cp = j; + j > mddev->resync_offset) + mddev->resync_offset = j; update_time = jiffies; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } - while (j >= mddev->resync_max && !kthread_should_stop()) { + while (j >= mddev->resync_max && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* As this condition is controlled by user-space, * we can block indefinitely, so use '_interruptible' * to avoid triggering warnings. @@ -7531,17 +9591,24 @@ void md_do_sync(struct md_thread *thread) flush_signals(current); /* just in case */ wait_event_interruptible(mddev->recovery_wait, mddev->resync_max > j - || kthread_should_stop()); + || test_bit(MD_RECOVERY_INTR, + &mddev->recovery)); } - if (kthread_should_stop()) - goto interrupted; + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + + if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) { + sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j); + if (sectors) + goto update; + } - sectors = mddev->pers->sync_request(mddev, j, &skipped, - currspeed < speed_min(mddev)); + sectors = mddev->pers->sync_request(mddev, j, max_sectors, + &skipped); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; + break; } if (!skipped) { /* actual IO requested */ @@ -7552,15 +9619,19 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; +update: j += sectors; - if (j > 2) + if (j > max_sectors) + /* when skipping, extra large numbers can be returned. */ + j = max_sectors; + if (j >= MD_RESYNC_ACTIVE) mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be * visible in /proc/mdstat */ - md_new_event(mddev); + md_new_event(); if (last_check + window > io_sectors || j == max_sectors) continue; @@ -7578,10 +9649,8 @@ void md_do_sync(struct md_thread *thread) last_mark = next; } - - if (kthread_should_stop()) - goto interrupted; - + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; /* * this loop exits only if either when we are slower than @@ -7593,63 +9662,96 @@ void md_do_sync(struct md_thread *thread) */ cond_resched(); - currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 + recovery_done = io_sectors - atomic_read(&mddev->recovery_active); + currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 /((jiffies-mddev->resync_mark)/HZ +1) +1; if (currspeed > speed_min(mddev)) { - if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev, 0)) { + if (currspeed > speed_max(mddev)) { msleep(500); goto repeat; } + if (!sync_io_within_limit(mddev) && + !is_mddev_idle(mddev, 0)) { + /* + * Give other IO more of a chance. + * The faster the devices, the less we wait. + */ + wait_event(mddev->recovery_wait, + !atomic_read(&mddev->recovery_active)); + } } } - printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); + pr_info("md: %s: %s %s.\n",mdname(mddev), desc, + test_bit(MD_RECOVERY_INTR, &mddev->recovery) + ? "interrupted" : "done"); /* * this also signals 'finished resyncing' to md_stop */ - out: blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); - /* tell personality that we are finished */ - mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + mddev->curr_resync >= MD_RESYNC_ACTIVE) { + mddev->curr_resync_completed = mddev->curr_resync; + sysfs_notify_dirent_safe(mddev->sysfs_completed); + } + mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync > 2) { + mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - if (mddev->curr_resync >= mddev->recovery_cp) { - printk(KERN_INFO - "md: checkpointing %s of %s.\n", - desc, mdname(mddev)); + if (mddev->curr_resync >= mddev->resync_offset) { + pr_debug("md: checkpointing %s of %s.\n", + desc, mdname(mddev)); if (test_bit(MD_RECOVERY_ERROR, &mddev->recovery)) - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync_completed; else - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync; } } else - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < mddev->curr_resync) - rdev->recovery_offset = mddev->curr_resync; - rcu_read_unlock(); + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (mddev->delta_disks >= 0 && + rdev_needs_recovery(rdev, mddev->curr_resync)) + rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); + } } } skip: - set_bit(MD_CHANGE_DEVS, &mddev->flags); + /* set CHANGE_PENDING here since maybe another update is needed, + * so other nodes are informed. It should be harmless for normal + * raid */ + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + mddev->delta_disks > 0 && + mddev->pers->finish_reshape && + mddev->pers->size && + !mddev_is_dm(mddev)) { + mddev_lock_nointr(mddev); + md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); + mddev_unlock(mddev); + if (!mddev_is_clustered(mddev)) + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); + } + + spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) @@ -7657,23 +9759,126 @@ void md_do_sync(struct md_thread *thread) mddev->resync_max = MaxSector; } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; - mddev->curr_resync = 0; - wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); + mddev->curr_resync = MD_RESYNC_NONE; + spin_unlock(&mddev->lock); + + wake_up(&resync_wait); md_wakeup_thread(mddev->thread); return; +} +EXPORT_SYMBOL_GPL(md_do_sync); + +static bool rdev_removeable(struct md_rdev *rdev) +{ + /* rdev is not used. */ + if (rdev->raid_disk < 0) + return false; + + /* There are still inflight io, don't remove this rdev. */ + if (atomic_read(&rdev->nr_pending)) + return false; - interrupted: /* - * got a signal, exit. + * An error occurred but has not yet been acknowledged by the metadata + * handler, don't remove this rdev. */ - printk(KERN_INFO - "md: md_do_sync() got signal ... exiting\n"); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; + if (test_bit(Blocked, &rdev->flags)) + return false; + + /* Fautly rdev is not used, it's safe to remove it. */ + if (test_bit(Faulty, &rdev->flags)) + return true; + + /* Journal disk can only be removed if it's faulty. */ + if (test_bit(Journal, &rdev->flags)) + return false; + + /* + * 'In_sync' is cleared while 'raid_disk' is valid, which means + * replacement has just become active from pers->spare_active(), and + * then pers->hot_remove_disk() will replace this rdev with replacement. + */ + if (!test_bit(In_sync, &rdev->flags)) + return true; + return false; +} + +static bool rdev_is_spare(struct md_rdev *rdev) +{ + return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags); +} + +static bool rdev_addable(struct md_rdev *rdev) +{ + struct mddev *mddev; + + mddev = READ_ONCE(rdev->mddev); + if (!mddev) + return false; + + /* rdev is already used, don't add it again. */ + if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || + test_bit(Faulty, &rdev->flags)) + return false; + + /* Allow to add journal disk. */ + if (test_bit(Journal, &rdev->flags)) + return true; + + /* Allow to add if array is read-write. */ + if (md_is_rdwr(mddev)) + return true; + + /* + * For read-only array, only allow to readd a rdev. And if bitmap is + * used, don't allow to readd a rdev that is too old. + */ + if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) + return true; + + return false; +} + +static bool md_spares_need_change(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev_removeable(rdev) || rdev_addable(rdev)) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + return false; +} + +static int remove_spares(struct mddev *mddev, struct md_rdev *this) +{ + struct md_rdev *rdev; + int removed = 0; + + rdev_for_each(rdev, mddev) { + if ((this == NULL || rdev == this) && rdev_removeable(rdev) && + !mddev->pers->hot_remove_disk(mddev, rdev)) { + sysfs_unlink_rdev(mddev, rdev); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + removed++; + } + } + + if (removed && mddev->kobj.sd) + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + + return removed; } -EXPORT_SYMBOL_GPL(md_do_sync); static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this) @@ -7682,69 +9887,235 @@ static int remove_and_add_spares(struct mddev *mddev, int spares = 0; int removed = 0; - rdev_for_each(rdev, mddev) - if ((this == NULL || rdev == this) && - rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - (test_bit(Faulty, &rdev->flags) || - ! test_bit(In_sync, &rdev->flags)) && - atomic_read(&rdev->nr_pending)==0) { - if (mddev->pers->hot_remove_disk( - mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->raid_disk = -1; - removed++; - } - } - if (removed && mddev->kobj.sd) - sysfs_notify(&mddev->kobj, NULL, "degraded"); + if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + /* Mustn't remove devices when resync thread is running */ + return 0; - if (this) + removed = remove_spares(mddev, this); + if (this && removed) goto no_add; rdev_for_each(rdev, mddev) { - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) - spares++; - if (rdev->raid_disk >= 0) - continue; - if (test_bit(Faulty, &rdev->flags)) - continue; - if (mddev->ro && - rdev->saved_raid_disk < 0) - continue; - - rdev->recovery_offset = 0; - if (rdev->saved_raid_disk >= 0 && mddev->in_sync) { - spin_lock_irq(&mddev->write_lock); - if (mddev->in_sync) - /* OK, this device, which is in_sync, - * will definitely be noticed before - * the next write, so recovery isn't - * needed. - */ - rdev->recovery_offset = mddev->recovery_cp; - spin_unlock_irq(&mddev->write_lock); - } - if (mddev->ro && rdev->recovery_offset != MaxSector) - /* not safe to add this disk now */ + if (this && this != rdev) continue; - if (mddev->pers-> - hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; + if (rdev_is_spare(rdev)) spares++; - md_new_event(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (!rdev_addable(rdev)) + continue; + if (!test_bit(Journal, &rdev->flags)) + rdev->recovery_offset = 0; + if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { + /* failure here is OK */ + sysfs_link_rdev(mddev, rdev); + if (!test_bit(Journal, &rdev->flags)) + spares++; + md_new_event(); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } no_add: if (removed) - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); return spares; } +static bool md_choose_sync_action(struct mddev *mddev, int *spares) +{ + /* Check if reshape is in progress first. */ + if (mddev->reshape_position != MaxSector) { + if (mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev) != 0) + return false; + + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); + return true; + } + + /* Check if resync is in progress. */ + if (mddev->resync_offset < MaxSector) { + remove_spares(mddev, NULL); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); + return true; + } + + /* + * Remove any failed drives, then add spares if possible. Spares are + * also removed and re-added, to allow the personality to fail the + * re-add. + */ + *spares = remove_and_add_spares(mddev, NULL); + if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) { + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + + /* Start new recovery. */ + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + + /* Delay to choose resync/check/repair in md_do_sync(). */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + return true; + + /* Nothing to be done */ + return false; +} + +static void md_start_sync(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, sync_work); + int spares = 0; + bool suspend = false; + char *name; + + /* + * If reshape is still in progress, spares won't be added or removed + * from conf until reshape is done. + */ + if (mddev->reshape_position == MaxSector && + md_spares_need_change(mddev)) { + suspend = true; + mddev_suspend(mddev, false); + } + + mddev_lock_nointr(mddev); + if (!md_is_rdwr(mddev)) { + /* + * On a read-only array we can: + * - remove failed devices + * - add already-in_sync devices if the array itself is in-sync. + * As we only add devices that are already in-sync, we can + * activate the spares immediately. + */ + remove_and_add_spares(mddev, NULL); + goto not_running; + } + + if (!md_choose_sync_action(mddev, &spares)) + goto not_running; + + if (!mddev->pers->sync_request) + goto not_running; + + /* + * We are adding a device or devices to an array which has the bitmap + * stored on all devices. So make sure all bitmap pages get written. + */ + if (spares && md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->write_all(mddev); + + name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? + "reshape" : "resync"; + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, name)); + if (!mddev->sync_thread) { + pr_warn("%s: could not start resync thread...\n", + mdname(mddev)); + /* leave the spares where they are, it shouldn't hurt */ + goto not_running; + } + + mddev_unlock(mddev); + /* + * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should + * not set it again. Otherwise, we may cause issue like this one: + * https://bugzilla.kernel.org/show_bug.cgi?id=218200 + * Therefore, use __mddev_resume(mddev, false). + */ + if (suspend) + __mddev_resume(mddev, false); + md_wakeup_thread(mddev->sync_thread); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(); + return; + +not_running: + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev_unlock(mddev); + /* + * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should + * not set it again. Otherwise, we may cause issue like this one: + * https://bugzilla.kernel.org/show_bug.cgi?id=218200 + * Therefore, use __mddev_resume(mddev, false). + */ + if (suspend) + __mddev_resume(mddev, false); + + wake_up(&resync_wait); + if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + mddev->sysfs_action) + sysfs_notify_dirent_safe(mddev->sysfs_action); +} + +static void unregister_sync_thread(struct mddev *mddev) +{ + if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { + /* resync/recovery still happening */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + return; + } + + if (WARN_ON_ONCE(!mddev->sync_thread)) + return; + + md_reap_sync_thread(mddev); +} + +static bool md_should_do_recovery(struct mddev *mddev) +{ + /* + * As long as one of the following flags is set, + * recovery needs to do or cleanup. + */ + if (test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + return true; + + /* + * If no flags are set and it is in read-only status, + * there is nothing to do. + */ + if (!md_is_rdwr(mddev)) + return false; + + /* + * MD_SB_CHANGE_PENDING indicates that the array is switching from clean to + * active, and no action is needed for now. + * All other MD_SB_* flags require to update the superblock. + */ + if (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) + return true; + + /* + * If the array is not using external metadata and there has been no data + * written for some time, then the array's status needs to be set to + * in_sync. + */ + if (mddev->external == 0 && mddev->safemode == 1) + return true; + + /* + * When the system is about to restart or the process receives an signal, + * the array needs to be synchronized as soon as possible. + * Once the data synchronization is completed, need to change the array + * status to in_sync. + */ + if (mddev->safemode == 2 && !mddev->in_sync && + mddev->resync_offset == MaxSector) + return true; + + return false; +} + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -7769,212 +10140,197 @@ no_add: */ void md_check_recovery(struct mddev *mddev) { - if (mddev->suspended) - return; - - if (mddev->bitmap) - bitmap_daemon_work(mddev); + if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work) + mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { - printk(KERN_INFO "md: %s in immediate safe mode\n", - mdname(mddev)); + pr_debug("md: %s in immediate safe mode\n", + mdname(mddev)); mddev->safemode = 2; } flush_signals(current); } - if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) - return; - if ( ! ( - (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || - test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || - test_bit(MD_RECOVERY_DONE, &mddev->recovery) || - (mddev->external == 0 && mddev->safemode == 1) || - (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) - && !mddev->in_sync && mddev->recovery_cp == MaxSector) - )) + if (!md_should_do_recovery(mddev)) return; if (mddev_trylock(mddev)) { - int spares = 0; - - if (mddev->ro) { - /* On a read-only array we can: - * - remove failed devices - * - add already-in_sync devices if the array itself - * is in-sync. - * As we only add devices that are already in-sync, - * we can activate the spares immediately. + bool try_set_sync = mddev->safemode != 0; + + if (!mddev->external && mddev->safemode == 1) + mddev->safemode = 0; + + if (!md_is_rdwr(mddev)) { + struct md_rdev *rdev; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + unregister_sync_thread(mddev); + goto unlock; + } + + if (!mddev->external && mddev->in_sync) + /* + * 'Blocked' flag not needed as failed devices + * will be recorded if array switched to read/write. + * Leaving it set will prevent the device + * from being removed. + */ + rdev_for_each(rdev, mddev) + clear_bit(Blocked, &rdev->flags); + + /* + * There is no thread, but we need to call + * ->spare_active and clear saved_raid_disk + */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); + + /* + * Let md_start_sync() to remove and add rdevs to the + * array. */ + if (md_spares_need_change(mddev)) { + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + queue_work(md_misc_wq, &mddev->sync_work); + } + + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - remove_and_add_spares(mddev, NULL); - mddev->pers->spare_active(mddev); + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + goto unlock; } - if (!mddev->external) { - int did_change = 0; - spin_lock_irq(&mddev->write_lock); - if (mddev->safemode && - !atomic_read(&mddev->writes_pending) && - !mddev->in_sync && - mddev->recovery_cp == MaxSector) { - mddev->in_sync = 1; - did_change = 1; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (mddev_is_clustered(mddev)) { + struct md_rdev *rdev, *tmp; + /* kick the device if another node issued a + * remove disk. + */ + rdev_for_each_safe(rdev, tmp, mddev) { + if (rdev->raid_disk < 0 && + test_and_clear_bit(ClusterRemove, &rdev->flags)) + md_kick_rdev_from_array(rdev); } - if (mddev->safemode == 1) - mddev->safemode = 0; - spin_unlock_irq(&mddev->write_lock); - if (did_change) - sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags) + if (try_set_sync && !mddev->external && !mddev->in_sync) { + spin_lock(&mddev->lock); + set_in_sync(mddev); + spin_unlock(&mddev->lock); + } + + if (mddev->sb_flags) md_update_sb(mddev, 0); - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { - /* resync/recovery still happening */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - goto unlock; - } - if (mddev->sync_thread) { - md_reap_sync_thread(mddev); + /* + * Never start a new sync thread if MD_RECOVERY_RUNNING is + * still set. + */ + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + unregister_sync_thread(mddev); goto unlock; } + /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ mddev->curr_resync_completed = 0; + spin_lock(&mddev->lock); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + spin_unlock(&mddev->lock); /* Clear some bits that don't mean anything, but * might be left set */ clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); - if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || - test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - goto unlock; - /* no recovery is running. - * remove any failed drives, then - * add spares if possible. - * Spares are also removed and re-added, to allow - * the personality to fail the re-add. - */ - - if (mddev->reshape_position != MaxSector) { - if (mddev->pers->check_reshape == NULL || - mddev->pers->check_reshape(mddev) != 0) - /* Cannot proceed */ - goto unlock; - set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if ((spares = remove_and_add_spares(mddev, NULL))) { - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - /* nothing to be done ... */ - goto unlock; - - if (mddev->pers->sync_request) { - if (spares) { - /* We are adding a device or devices to an array - * which has the bitmap stored on all devices. - * So make sure all bitmap pages get written - */ - bitmap_write_all(mddev->bitmap); - } - mddev->sync_thread = md_register_thread(md_do_sync, - mddev, - "resync"); - if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); - /* leave the spares where they are, it shouldn't hurt */ - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - } else - md_wakeup_thread(mddev->sync_thread); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); + if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && + !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + queue_work(md_misc_wq, &mddev->sync_work); + } else { + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + wake_up(&resync_wait); } + unlock: wake_up(&mddev->sb_wait); - - if (!mddev->sync_thread) { - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - if (test_and_clear_bit(MD_RECOVERY_RECOVER, - &mddev->recovery)) - if (mddev->sysfs_action) - sysfs_notify_dirent_safe(mddev->sysfs_action); - } mddev_unlock(mddev); } } +EXPORT_SYMBOL(md_check_recovery); void md_reap_sync_thread(struct mddev *mddev) { struct md_rdev *rdev; + sector_t old_dev_sectors = mddev->dev_sectors; + bool is_reshaped = false; /* resync has finished, collect result */ - md_unregister_thread(&mddev->sync_thread); + md_unregister_thread(mddev, &mddev->sync_thread); + atomic_inc(&mddev->sync_seq); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + mddev->degraded != mddev->raid_disks) { /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) { - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - set_bit(MD_CHANGE_DEVS, &mddev->flags); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) + mddev->pers->finish_reshape) { mddev->pers->finish_reshape(mddev); + if (mddev_is_clustered(mddev)) + is_reshaped = true; + } /* If array is no-longer degraded, then any saved_raid_disk - * information must be scrapped. Also if any device is now - * In_sync we must scrape the saved_raid_disk for that device - * do the superblock for an incrementally recovered device - * written out. + * information must be scrapped. */ - rdev_for_each(rdev, mddev) - if (!mddev->degraded || - test_bit(In_sync, &rdev->flags)) + if (!mddev->degraded) + rdev_for_each(rdev, mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); + /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can + * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by + * clustered raid */ + if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) + mddev->cluster_ops->resync_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); + /* + * We call mddev->cluster_ops->update_size here because sync_size could + * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, + * so it is time to update size across cluster. + */ + if (mddev_is_clustered(mddev) && is_reshaped + && !test_bit(MD_CLOSING, &mddev->flags)) + mddev->cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_completed); sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); + md_new_event(); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); + wake_up(&resync_wait); } +EXPORT_SYMBOL(md_reap_sync_thread); void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); - wait_event_timeout(rdev->blocked_wait, - !test_bit(Blocked, &rdev->flags) && - !test_bit(BlockedBadBlocks, &rdev->flags), + wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev), msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } @@ -7995,526 +10351,80 @@ void md_finish_reshape(struct mddev *mddev) } EXPORT_SYMBOL(md_finish_reshape); -/* Bad block management. - * We can record which blocks on each device are 'bad' and so just - * fail those blocks, or that stripe, rather than the whole device. - * Entries in the bad-block table are 64bits wide. This comprises: - * Length of bad-range, in sectors: 0-511 for lengths 1-512 - * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) - * A 'shift' can be set so that larger blocks are tracked and - * consequently larger devices can be covered. - * 'Acknowledged' flag - 1 bit. - the most significant bit. - * - * Locking of the bad-block table uses a seqlock so md_is_badblock - * might need to retry if it is very unlucky. - * We will sometimes want to check for bad blocks in a bi_end_io function, - * so we use the write_seqlock_irq variant. - * - * When looking for a bad block we specify a range and want to - * know if any block in the range is bad. So we binary-search - * to the last range that starts at-or-before the given endpoint, - * (or "before the sector after the target range") - * then see if it ends after the given start. - * We return - * 0 if there are no known bad blocks in the range - * 1 if there are known bad block which are all acknowledged - * -1 if there are bad blocks which have not yet been acknowledged in metadata. - * plus the start/length of the first bad section we overlap. - */ -int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) -{ - int hi; - int lo; - u64 *p = bb->page; - int rv; - sector_t target = s + sectors; - unsigned seq; - - if (bb->shift > 0) { - /* round the start down, and the end up */ - s >>= bb->shift; - target += (1<<bb->shift) - 1; - target >>= bb->shift; - sectors = target - s; - } - /* 'target' is now the first block after the bad range */ - -retry: - seq = read_seqbegin(&bb->lock); - lo = 0; - rv = 0; - hi = bb->count; +/* Bad block management */ - /* Binary search between lo and hi for 'target' - * i.e. for the last range that starts before 'target' - */ - /* INVARIANT: ranges before 'lo' and at-or-after 'hi' - * are known not to be the last range before target. - * VARIANT: hi-lo is the number of possible - * ranges, and decreases until it reaches 1 - */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - /* This could still be the one, earlier ranges - * could not. */ - lo = mid; - else - /* This and later ranges are definitely out. */ - hi = mid; - } - /* 'lo' might be the last that started before target, but 'hi' isn't */ - if (hi > lo) { - /* need to check all range that end after 's' to see if - * any are unacknowledged. - */ - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - if (BB_OFFSET(p[lo]) < target) { - /* starts before the end, and finishes after - * the start, so they must overlap - */ - if (rv != -1 && BB_ACK(p[lo])) - rv = 1; - else - rv = -1; - *first_bad = BB_OFFSET(p[lo]); - *bad_sectors = BB_LEN(p[lo]); - } - lo--; - } - } - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return rv; -} -EXPORT_SYMBOL_GPL(md_is_badblock); - -/* - * Add a range of bad blocks to the table. - * This might extend the table, or might contract it - * if two adjacent ranges can be merged. - * We binary-search to find the 'insertion' point, then - * decide how best to handle it. - */ -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +/* Returns true on success, false on failure */ +bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { - u64 *p; - int lo, hi; - int rv = 1; - - if (bb->shift < 0) - /* badblocks are disabled */ - return 0; - - if (bb->shift) { - /* round the start down, and the end up */ - sector_t next = s + sectors; - s >>= bb->shift; - next += (1<<bb->shift) - 1; - next >>= bb->shift; - sectors = next - s; - } - - write_seqlock_irq(&bb->lock); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts at-or-before 's' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a <= s) - lo = mid; - else - hi = mid; - } - if (hi > lo && BB_OFFSET(p[lo]) > s) - hi = lo; - - if (hi > lo) { - /* we found a range that might merge with the start - * of our new range - */ - sector_t a = BB_OFFSET(p[lo]); - sector_t e = a + BB_LEN(p[lo]); - int ack = BB_ACK(p[lo]); - if (e >= s) { - /* Yes, we can merge with a previous range */ - if (s == a && s + sectors >= e) - /* new range covers old */ - ack = acknowledged; - else - ack = ack && acknowledged; - - if (e < s + sectors) - e = s + sectors; - if (e - a <= BB_MAX_LEN) { - p[lo] = BB_MAKE(a, e-a, ack); - s = e; - } else { - /* does not all fit in one range, - * make p[lo] maximal - */ - if (BB_LEN(p[lo]) != BB_MAX_LEN) - p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - } - } - if (sectors && hi < bb->count) { - /* 'hi' points to the first range that starts after 's'. - * Maybe we can merge with the start of that range */ - sector_t a = BB_OFFSET(p[hi]); - sector_t e = a + BB_LEN(p[hi]); - int ack = BB_ACK(p[hi]); - if (a <= s + sectors) { - /* merging is possible */ - if (e <= s + sectors) { - /* full overlap */ - e = s + sectors; - ack = acknowledged; - } else - ack = ack && acknowledged; - - a = s; - if (e - a <= BB_MAX_LEN) { - p[hi] = BB_MAKE(a, e-a, ack); - s = e; - } else { - p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - lo = hi; - hi++; - } - } - if (sectors == 0 && hi < bb->count) { - /* we might be able to combine lo and hi */ - /* Note: 's' is at the end of 'lo' */ - sector_t a = BB_OFFSET(p[hi]); - int lolen = BB_LEN(p[lo]); - int hilen = BB_LEN(p[hi]); - int newlen = lolen + hilen - (s - a); - if (s >= a && newlen < BB_MAX_LEN) { - /* yes, we can combine them */ - int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); - p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); - memmove(p + hi, p + hi + 1, - (bb->count - hi - 1) * 8); - bb->count--; - } - } - while (sectors) { - /* didn't merge (it all). - * Need to add a range just before 'hi' */ - if (bb->count >= MD_MAX_BADBLOCKS) { - /* No room for more */ - rv = 0; - break; - } else { - int this_sectors = sectors; - memmove(p + hi + 1, p + hi, - (bb->count - hi) * 8); - bb->count++; - - if (this_sectors > BB_MAX_LEN) - this_sectors = BB_MAX_LEN; - p[hi] = BB_MAKE(s, this_sectors, acknowledged); - sectors -= this_sectors; - s += this_sectors; - } - } - - bb->changed = 1; - if (!acknowledged) - bb->unacked_exist = 1; - write_sequnlock_irq(&bb->lock); + struct mddev *mddev = rdev->mddev; - return rv; -} + /* + * Recording new badblocks for faulty rdev will force unnecessary + * super block updating. This is fragile for external management because + * userspace daemon may trying to remove this device and deadlock may + * occur. This will be probably solved in the mdadm, but it is safer to + * avoid it. + */ + if (test_bit(Faulty, &rdev->flags)) + return true; -int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) -{ - int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = md_set_badblocks(&rdev->badblocks, - s, sectors, 0); - if (rv) { - /* Make sure they get written out promptly */ - sysfs_notify_dirent_safe(rdev->sysfs_state); - set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); - md_wakeup_thread(rdev->mddev->thread); - } - return rv; -} -EXPORT_SYMBOL_GPL(rdev_set_badblocks); -/* - * Remove a range of bad blocks from the table. - * This may involve extending the table if we spilt a region, - * but it must not fail. So if the table becomes full, we just - * drop the remove request. - */ -static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) -{ - u64 *p; - int lo, hi; - sector_t target = s + sectors; - int rv = 0; + if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) + return false; - if (bb->shift > 0) { - /* When clearing we round the start up and the end down. - * This should not matter as the shift should align with - * the block size and no rounding should ever be needed. - * However it is better the think a block is bad when it - * isn't than to think a block is not bad when it is. - */ - s += (1<<bb->shift) - 1; - s >>= bb->shift; - target >>= bb->shift; - sectors = target - s; - } - - write_seqlock_irq(&bb->lock); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts before 'target' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - lo = mid; - else - hi = mid; - } - if (hi > lo) { - /* p[lo] is the last range that could overlap the - * current range. Earlier ranges could also overlap, - * but only this one can overlap the end of the range. - */ - if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { - /* Partial overlap, leave the tail of this range */ - int ack = BB_ACK(p[lo]); - sector_t a = BB_OFFSET(p[lo]); - sector_t end = a + BB_LEN(p[lo]); - - if (a < s) { - /* we need to split this range */ - if (bb->count >= MD_MAX_BADBLOCKS) { - rv = 0; - goto out; - } - memmove(p+lo+1, p+lo, (bb->count - lo) * 8); - bb->count++; - p[lo] = BB_MAKE(a, s-a, ack); - lo++; - } - p[lo] = BB_MAKE(target, end - target, ack); - /* there is no longer an overlap */ - hi = lo; - lo--; - } - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - /* This range does overlap */ - if (BB_OFFSET(p[lo]) < s) { - /* Keep the early parts of this range. */ - int ack = BB_ACK(p[lo]); - sector_t start = BB_OFFSET(p[lo]); - p[lo] = BB_MAKE(start, s - start, ack); - /* now low doesn't overlap, so.. */ - break; - } - lo--; - } - /* 'lo' is strictly before, 'hi' is strictly after, - * anything between needs to be discarded - */ - if (hi - lo > 1) { - memmove(p+lo+1, p+hi, (bb->count - hi) * 8); - bb->count -= (hi - lo - 1); - } - } - - bb->changed = 1; -out: - write_sequnlock_irq(&bb->lock); - return rv; + /* Make sure they get written out promptly */ + if (test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); + md_wakeup_thread(rdev->mddev->thread); + return true; } +EXPORT_SYMBOL_GPL(rdev_set_badblocks); -int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - return md_clear_badblocks(&rdev->badblocks, - s, sectors); -} -EXPORT_SYMBOL_GPL(rdev_clear_badblocks); -/* - * Acknowledge all bad blocks in a list. - * This only succeeds if ->changed is clear. It is used by - * in-kernel metadata updates - */ -void md_ack_all_badblocks(struct badblocks *bb) -{ - if (bb->page == NULL || bb->changed) - /* no point even trying */ + if (!badblocks_clear(&rdev->badblocks, s, sectors)) return; - write_seqlock_irq(&bb->lock); - - if (bb->changed == 0 && bb->unacked_exist) { - u64 *p = bb->page; - int i; - for (i = 0; i < bb->count ; i++) { - if (!BB_ACK(p[i])) { - sector_t start = BB_OFFSET(p[i]); - int len = BB_LEN(p[i]); - p[i] = BB_MAKE(start, len, 1); - } - } - bb->unacked_exist = 0; - } - write_sequnlock_irq(&bb->lock); -} -EXPORT_SYMBOL_GPL(md_ack_all_badblocks); - -/* sysfs access to bad-blocks list. - * We present two files. - * 'bad-blocks' lists sector numbers and lengths of ranges that - * are recorded as bad. The list is truncated to fit within - * the one-page limit of sysfs. - * Writing "sector length" to this file adds an acknowledged - * bad block list. - * 'unacknowledged-bad-blocks' lists bad blocks that have not yet - * been acknowledged. Writing to this file adds bad blocks - * without acknowledging them. This is largely for testing. - */ - -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack) -{ - size_t len; - int i; - u64 *p = bb->page; - unsigned seq; - - if (bb->shift < 0) - return 0; - -retry: - seq = read_seqbegin(&bb->lock); - - len = 0; - i = 0; - - while (len < PAGE_SIZE && i < bb->count) { - sector_t s = BB_OFFSET(p[i]); - unsigned int length = BB_LEN(p[i]); - int ack = BB_ACK(p[i]); - i++; - - if (unack && ack) - continue; - len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", - (unsigned long long)s << bb->shift, - length << bb->shift); - } - if (unack && len == 0) - bb->unacked_exist = 0; - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return len; -} - -#define DO_DEBUG 1 - -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) -{ - unsigned long long sector; - int length; - char newline; -#ifdef DO_DEBUG - /* Allow clearing via sysfs *only* for testing/debugging. - * Normally only a successful write may clear a badblock - */ - int clear = 0; - if (page[0] == '-') { - clear = 1; - page++; - } -#endif /* DO_DEBUG */ - - switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { - case 3: - if (newline != '\n') - return -EINVAL; - case 2: - if (length <= 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - -#ifdef DO_DEBUG - if (clear) { - md_clear_badblocks(bb, sector, length); - return len; - } -#endif /* DO_DEBUG */ - if (md_set_badblocks(bb, sector, length, !unack)) - return len; - else - return -ENOSPC; + if (test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify_dirent_safe(rdev->sysfs_badblocks); } +EXPORT_SYMBOL_GPL(rdev_clear_badblocks); static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { - struct list_head *tmp; struct mddev *mddev; - int need_delay = 0; - for_each_mddev(mddev, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { + if (!mddev_get(mddev)) + continue; + spin_unlock(&all_mddevs_lock); if (mddev_trylock(mddev)) { if (mddev->pers) __md_stop_writes(mddev); - mddev->safemode = 2; + if (mddev->persistent) + mddev->safemode = 2; mddev_unlock(mddev); } - need_delay = 1; + spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ... - */ - if (need_delay) - mdelay(1000*1); + spin_unlock(&all_mddevs_lock); return NOTIFY_DONE; } @@ -8529,13 +10439,21 @@ static void md_geninit(void) { pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); - proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); + proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); } static int __init md_init(void) { - int ret = -ENOMEM; + int ret = md_bitmap_init(); + + if (ret) + return ret; + ret = md_llbitmap_init(); + if (ret) + goto err_bitmap; + + ret = -ENOMEM; md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) goto err_wq; @@ -8544,20 +10462,17 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) + ret = __register_blkdev(MD_MAJOR, "md", md_probe); + if (ret < 0) goto err_md; - if ((ret = register_blkdev(0, "mdp")) < 0) + ret = __register_blkdev(0, "mdp", md_probe); + if (ret < 0) goto err_mdp; mdp_major = ret; - blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, - md_probe, NULL, NULL); - blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, - md_probe, NULL, NULL); - register_reboot_notifier(&md_notifier); - raid_table_header = register_sysctl_table(raid_root_table); + raid_table_header = register_sysctl("dev/raid", raid_table); md_geninit(); return 0; @@ -8569,9 +10484,204 @@ err_md: err_misc_wq: destroy_workqueue(md_wq); err_wq: + md_llbitmap_exit(); +err_bitmap: + md_bitmap_exit(); return ret; } +static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + struct md_rdev *rdev2, *tmp; + int role, ret; + + /* + * If size is changed in another node then we need to + * do resize as well. + */ + if (mddev->dev_sectors != le64_to_cpu(sb->size)) { + ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); + if (ret) + pr_info("md-cluster: resize failed\n"); + else if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->update_sb(mddev->bitmap); + } + + /* Check for change of roles in the active devices */ + rdev_for_each_safe(rdev2, tmp, mddev) { + if (test_bit(Faulty, &rdev2->flags)) { + if (test_bit(ClusterRemove, &rdev2->flags)) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + continue; + } + + /* Check if the roles changed */ + role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); + + if (test_bit(Candidate, &rdev2->flags)) { + if (role == MD_DISK_ROLE_FAULTY) { + pr_info("md: Removing Candidate device %pg because add failed\n", + rdev2->bdev); + md_kick_rdev_from_array(rdev2); + continue; + } + else + clear_bit(Candidate, &rdev2->flags); + } + + if (role != rdev2->raid_disk) { + /* + * got activated except reshape is happening. + */ + if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && + !(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RESHAPE_ACTIVE) && + !mddev->cluster_ops->resync_status_get(mddev)) { + /* + * -1 to make raid1_add_disk() set conf->fullsync + * to 1. This could avoid skipping sync when the + * remote node is down during resyncing. + */ + if ((le32_to_cpu(sb->feature_map) + & MD_FEATURE_RECOVERY_OFFSET)) + rdev2->saved_raid_disk = -1; + else + rdev2->saved_raid_disk = role; + ret = remove_and_add_spares(mddev, rdev2); + pr_info("Activated spare: %pg\n", + rdev2->bdev); + /* wakeup mddev->thread here, so array could + * perform resync with the new activated disk */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + /* device faulty + * We just want to do the minimum to mark the disk + * as faulty. The recovery is performed by the + * one who initiated the error. + */ + if (role == MD_DISK_ROLE_FAULTY || + role == MD_DISK_ROLE_JOURNAL) { + md_error(mddev, rdev2); + clear_bit(Blocked, &rdev2->flags); + } + } + } + + if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { + ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + if (ret) + pr_warn("md: updating array disks failed. %d\n", ret); + } + + /* + * Since mddev->delta_disks has already updated in update_raid_disks, + * so it is time to check reshape. + */ + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + /* + * reshape is happening in the remote node, we need to + * update reshape_position and call start_reshape. + */ + mddev->reshape_position = le64_to_cpu(sb->reshape_position); + if (mddev->pers->update_reshape_pos) + mddev->pers->update_reshape_pos(mddev); + if (mddev->pers->start_reshape) + mddev->pers->start_reshape(mddev); + } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + mddev->reshape_position != MaxSector && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + /* reshape is just done in another node. */ + mddev->reshape_position = MaxSector; + if (mddev->pers->update_reshape_pos) + mddev->pers->update_reshape_pos(mddev); + } + + /* Finally set the event to be up to date */ + mddev->events = le64_to_cpu(sb->events); +} + +static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + int err; + struct page *swapout = rdev->sb_page; + struct mdp_superblock_1 *sb; + + /* Store the sb page of the rdev in the swapout temporary + * variable in case we err in the future + */ + rdev->sb_page = NULL; + err = alloc_disk_sb(rdev); + if (err == 0) { + ClearPageUptodate(rdev->sb_page); + rdev->sb_loaded = 0; + err = super_types[mddev->major_version]. + load_super(rdev, NULL, mddev->minor_version); + } + if (err < 0) { + pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", + __func__, __LINE__, rdev->desc_nr, err); + if (rdev->sb_page) + put_page(rdev->sb_page); + rdev->sb_page = swapout; + rdev->sb_loaded = 1; + return err; + } + + sb = page_address(rdev->sb_page); + /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET + * is not set + */ + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + + /* The other node finished recovery, call spare_active to set + * device In_sync and mddev->degraded + */ + if (rdev->recovery_offset == MaxSector && + !test_bit(In_sync, &rdev->flags) && + mddev->pers->spare_active(mddev)) + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + + put_page(swapout); + return 0; +} + +void md_reload_sb(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev = NULL, *iter; + int err; + + /* Find the rdev */ + rdev_for_each_rcu(iter, mddev) { + if (iter->desc_nr == nr) { + rdev = iter; + break; + } + } + + if (!rdev) { + pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); + return; + } + + err = read_rdev(mddev, rdev); + if (err < 0) + return; + + check_sb_changes(mddev, rdev); + + /* Read all rdev's to update recovery_offset */ + rdev_for_each_rcu(rdev, mddev) { + if (!test_bit(Faulty, &rdev->flags)) + read_rdev(mddev, rdev); + } +} +EXPORT_SYMBOL(md_reload_sb); + #ifndef MODULE /* @@ -8579,6 +10689,7 @@ err_wq: * at boot time. */ +static DEFINE_MUTEX(detected_devices_mutex); static LIST_HEAD(all_detected_devices); struct detected_devices_node { struct list_head list; @@ -8592,15 +10703,13 @@ void md_autodetect_dev(dev_t dev) node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); if (node_detected_dev) { node_detected_dev->dev = dev; + mutex_lock(&detected_devices_mutex); list_add_tail(&node_detected_dev->list, &all_detected_devices); - } else { - printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" - ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); + mutex_unlock(&detected_devices_mutex); } } - -static void autostart_arrays(int part) +void md_autostart_arrays(int part) { struct md_rdev *rdev; struct detected_devices_node *node_detected_dev; @@ -8610,8 +10719,9 @@ static void autostart_arrays(int part) i_scanned = 0; i_passed = 0; - printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + pr_info("md: Autodetecting RAID arrays.\n"); + mutex_lock(&detected_devices_mutex); while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { i_scanned++; node_detected_dev = list_entry(all_detected_devices.next, @@ -8619,21 +10729,22 @@ static void autostart_arrays(int part) list_del(&node_detected_dev->list); dev = node_detected_dev->dev; kfree(node_detected_dev); + mutex_unlock(&detected_devices_mutex); rdev = md_import_device(dev,0, 90); + mutex_lock(&detected_devices_mutex); if (IS_ERR(rdev)) continue; - if (test_bit(Faulty, &rdev->flags)) { - MD_BUG(); + if (test_bit(Faulty, &rdev->flags)) continue; - } + set_bit(AutoDetected, &rdev->flags); list_add(&rdev->same_set, &pending_raid_disks); i_passed++; } + mutex_unlock(&detected_devices_mutex); - printk(KERN_INFO "md: Scanned %d and added %d devices.\n", - i_scanned, i_passed); + pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); autorun_devices(part); } @@ -8643,58 +10754,67 @@ static void autostart_arrays(int part) static __exit void md_exit(void) { struct mddev *mddev; - struct list_head *tmp; - - blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); - blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); + int delay = 1; unregister_blkdev(MD_MAJOR,"md"); unregister_blkdev(mdp_major, "mdp"); unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); + + /* We cannot unload the modules while some process is + * waiting for us in select() or poll() - wake them up + */ + md_unloading = 1; + while (waitqueue_active(&md_event_waiters)) { + /* not safe to leave yet */ + wake_up(&md_event_waiters); + msleep(delay); + delay += delay; + } remove_proc_entry("mdstat", NULL); - for_each_mddev(mddev, tmp) { + + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { + if (!mddev_get(mddev)) + continue; + spin_unlock(&all_mddevs_lock); export_array(mddev); + mddev->ctime = 0; mddev->hold_active = 0; + /* + * As the mddev is now fully clear, mddev_put will schedule + * the mddev for destruction by a workqueue, and the + * destroy_workqueue() below will wait for that to complete. + */ + spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } + spin_unlock(&all_mddevs_lock); + destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); + md_bitmap_exit(); } subsys_initcall(md_init); module_exit(md_exit) -static int get_ro(char *buffer, struct kernel_param *kp) +static int get_ro(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "%d", start_readonly); + return sprintf(buffer, "%d\n", start_readonly); } -static int set_ro(const char *val, struct kernel_param *kp) +static int set_ro(const char *val, const struct kernel_param *kp) { - char *e; - int num = simple_strtoul(val, &e, 10); - if (*val && (*e == '\0' || *e == '\n')) { - start_readonly = num; - return 0; - } - return -EINVAL; + return kstrtouint(val, 10, (unsigned int *)&start_readonly); } module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); - module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); +module_param(create_on_open, bool, S_IRUSR|S_IWUSR); +module_param(legacy_async_del_gendisk, bool, 0600); +module_param(check_new_feature, bool, 0600); -EXPORT_SYMBOL(register_md_personality); -EXPORT_SYMBOL(unregister_md_personality); -EXPORT_SYMBOL(md_error); -EXPORT_SYMBOL(md_done_sync); -EXPORT_SYMBOL(md_write_start); -EXPORT_SYMBOL(md_write_end); -EXPORT_SYMBOL(md_register_thread); -EXPORT_SYMBOL(md_unregister_thread); -EXPORT_SYMBOL(md_wakeup_thread); -EXPORT_SYMBOL(md_check_recovery); -EXPORT_SYMBOL(md_reap_sync_thread); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); |
