summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-09-05 17:52:22 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-09-05 17:52:22 -0700
commit2a013e37ce691a7c072df27b35e9790fc8f5a82f (patch)
tree0d81e512ebb93484602a0802396ed7c1f465d3b4 /drivers
parent17447717a3266965e257d3eae79d89539ce3ec0a (diff)
parente89c6fdf9e0eb1b5a03574d4ca73e83eae8deb91 (diff)
Merge tag 'md/4.3' of git://neil.brown.name/md
Pull md updates from Neil Brown: - an assortment of little fixes, several for minor races only likely to be hit during testing - further cluster-md-raid1 development, not ready for real use yet. - new RAID6 syndrome code for ARM NEON - fix a race where a write can return before failure of one device is properly recorded in metadata, so an immediate crash might result in that write being lost. * tag 'md/4.3' of git://neil.brown.name/md: (33 commits) md/raid5: ensure device failure recorded before write request returns. md/raid5: use bio_list for the list of bios to return. md/raid10: ensure device failure recorded before write request returns. md/raid1: ensure device failure recorded before write request returns. md-cluster: remove inappropriate try_module_get from join() md: extend spinlock protection in register_md_cluster_operations md-cluster: Read the disk bitmap sb and check if it needs recovery md-cluster: only call complete(&cinfo->completion) when node join cluster md-cluster: add missed lockres_free md-cluster: remove the unused sb_lock md-cluster: init suspend_list and suspend_lock early in join md-cluster: add the error check if failed to get dlm lock md-cluster: init completion within lockres_init md-cluster: fix deadlock issue on message lock md-cluster: transfer the resync ownership to another node md-cluster: split recover_slot for future code reuse md-cluster: use %pU to print UUIDs md: setup safemode_timer before it's being used md/raid5: handle possible race as reshape completes. md: sync sync_completed has correct value as recovery finishes. ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/md/md-cluster.c159
-rw-r--r--drivers/md/md.c110
-rw-r--r--drivers/md/raid0.c75
-rw-r--r--drivers/md/raid1.c30
-rw-r--r--drivers/md/raid1.h5
-rw-r--r--drivers/md/raid10.c33
-rw-r--r--drivers/md/raid10.h6
-rw-r--r--drivers/md/raid5.c140
-rw-r--r--drivers/md/raid5.h5
9 files changed, 373 insertions, 190 deletions
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 0072190515e0..11e3bc9d2a4b 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -45,6 +45,7 @@ struct resync_info {
/* md_cluster_info flags */
#define MD_CLUSTER_WAITING_FOR_NEWDISK 1
#define MD_CLUSTER_SUSPEND_READ_BALANCING 2
+#define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3
struct md_cluster_info {
@@ -52,7 +53,6 @@ struct md_cluster_info {
dlm_lockspace_t *lockspace;
int slot_number;
struct completion completion;
- struct dlm_lock_resource *sb_lock;
struct mutex sb_mutex;
struct dlm_lock_resource *bitmap_lockres;
struct list_head suspend_list;
@@ -75,6 +75,7 @@ enum msg_type {
NEWDISK,
REMOVE,
RE_ADD,
+ BITMAP_NEEDS_SYNC,
};
struct cluster_msg {
@@ -99,7 +100,6 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
int ret = 0;
- init_completion(&res->completion);
ret = dlm_lock(res->ls, mode, &res->lksb,
res->flags, res->name, strlen(res->name),
0, sync_ast, res, res->bast);
@@ -124,6 +124,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
if (!res)
return NULL;
+ init_completion(&res->completion);
res->ls = cinfo->lockspace;
res->mddev = mddev;
namelen = strlen(name);
@@ -165,11 +166,24 @@ out_err:
static void lockres_free(struct dlm_lock_resource *res)
{
+ int ret;
+
if (!res)
return;
- init_completion(&res->completion);
- dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+ /* cancel a lock request or a conversion request that is blocked */
+ res->flags |= DLM_LKF_CANCEL;
+retry:
+ ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+ if (unlikely(ret != 0)) {
+ pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);
+
+ /* if a lock conversion is cancelled, then the lock is put
+ * back to grant queue, need to ensure it is unlocked */
+ if (ret == -DLM_ECANCEL)
+ goto retry;
+ }
+ res->flags &= ~DLM_LKF_CANCEL;
wait_for_completion(&res->completion);
kfree(res->name);
@@ -177,18 +191,6 @@ static void lockres_free(struct dlm_lock_resource *res)
kfree(res);
}
-static char *pretty_uuid(char *dest, char *src)
-{
- int i, len = 0;
-
- for (i = 0; i < 16; i++) {
- if (i == 4 || i == 6 || i == 8 || i == 10)
- len += sprintf(dest + len, "-");
- len += sprintf(dest + len, "%02x", (__u8)src[i]);
- }
- return dest;
-}
-
static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
sector_t lo, sector_t hi)
{
@@ -281,16 +283,11 @@ static void recover_prep(void *arg)
set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
-static void recover_slot(void *arg, struct dlm_slot *slot)
+static void __recover_slot(struct mddev *mddev, int slot)
{
- struct mddev *mddev = arg;
struct md_cluster_info *cinfo = mddev->cluster_info;
- pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
- mddev->bitmap_info.cluster_name,
- slot->nodeid, slot->slot,
- cinfo->slot_number);
- set_bit(slot->slot - 1, &cinfo->recovery_map);
+ set_bit(slot, &cinfo->recovery_map);
if (!cinfo->recovery_thread) {
cinfo->recovery_thread = md_register_thread(recover_bitmaps,
mddev, "recover");
@@ -302,6 +299,20 @@ static void recover_slot(void *arg, struct dlm_slot *slot)
md_wakeup_thread(cinfo->recovery_thread);
}
+static void recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct mddev *mddev = arg;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
+ mddev->bitmap_info.cluster_name,
+ slot->nodeid, slot->slot,
+ cinfo->slot_number);
+ /* deduct one since dlm slot starts from one while the num of
+ * cluster-md begins with 0 */
+ __recover_slot(mddev, slot->slot - 1);
+}
+
static void recover_done(void *arg, struct dlm_slot *slots,
int num_slots, int our_slot,
uint32_t generation)
@@ -310,10 +321,17 @@ static void recover_done(void *arg, struct dlm_slot *slots,
struct md_cluster_info *cinfo = mddev->cluster_info;
cinfo->slot_number = our_slot;
- complete(&cinfo->completion);
+ /* completion is only need to be complete when node join cluster,
+ * it doesn't need to run during another node's failure */
+ if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
+ complete(&cinfo->completion);
+ clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
+ }
clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
+/* the ops is called when node join the cluster, and do lock recovery
+ * if node failure occurs */
static const struct dlm_lockspace_ops md_ls_ops = {
.recover_prep = recover_prep,
.recover_slot = recover_slot,
@@ -388,7 +406,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
int len;
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
- pretty_uuid(disk_uuid + len, cmsg->uuid);
+ sprintf(disk_uuid + len, "%pU", cmsg->uuid);
snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
init_completion(&cinfo->newdisk_completion);
@@ -457,6 +475,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
__func__, __LINE__, msg->slot);
process_readd_disk(mddev, msg);
break;
+ case BITMAP_NEEDS_SYNC:
+ pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
+ __func__, __LINE__, msg->slot);
+ __recover_slot(mddev, msg->slot);
+ break;
default:
pr_warn("%s:%d Received unknown message from %d\n",
__func__, __LINE__, msg->slot);
@@ -472,6 +495,7 @@ static void recv_daemon(struct md_thread *thread)
struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
struct cluster_msg msg;
+ int ret;
/*get CR on Message*/
if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
@@ -484,13 +508,21 @@ static void recv_daemon(struct md_thread *thread)
process_recvd_msg(thread->mddev, &msg);
/*release CR on ack_lockres*/
- dlm_unlock_sync(ack_lockres);
- /*up-convert to EX on message_lockres*/
- dlm_lock_sync(message_lockres, DLM_LOCK_EX);
+ ret = dlm_unlock_sync(ack_lockres);
+ if (unlikely(ret != 0))
+ pr_info("unlock ack failed return %d\n", ret);
+ /*up-convert to PR on message_lockres*/
+ ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
+ if (unlikely(ret != 0))
+ pr_info("lock PR on msg failed return %d\n", ret);
/*get CR on ack_lockres again*/
- dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+ ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+ if (unlikely(ret != 0))
+ pr_info("lock CR on ack failed return %d\n", ret);
/*release CR on message_lockres*/
- dlm_unlock_sync(message_lockres);
+ ret = dlm_unlock_sync(message_lockres);
+ if (unlikely(ret != 0))
+ pr_info("unlock msg failed return %d\n", ret);
}
/* lock_comm()
@@ -519,7 +551,7 @@ static void unlock_comm(struct md_cluster_info *cinfo)
* The function:
* 1. Grabs the message lockresource in EX mode
* 2. Copies the message to the message LVB
- * 3. Downconverts message lockresource to CR
+ * 3. Downconverts message lockresource to CW
* 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
* and the other nodes read the message. The thread will wait here until all other
* nodes have released ack lock resource.
@@ -540,12 +572,12 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
sizeof(struct cluster_msg));
- /*down-convert EX to CR on Message*/
- error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);
+ /*down-convert EX to CW on Message*/
+ error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
if (error) {
- pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n",
+ pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
error);
- goto failed_message;
+ goto failed_ack;
}
/*up-convert CR to EX on Ack*/
@@ -565,7 +597,13 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
}
failed_ack:
- dlm_unlock_sync(cinfo->message_lockres);
+ error = dlm_unlock_sync(cinfo->message_lockres);
+ if (unlikely(error != 0)) {
+ pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
+ error);
+ /* in case the message can't be released due to some reason */
+ goto failed_ack;
+ }
failed_message:
return error;
}
@@ -587,6 +625,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
struct dlm_lock_resource *bm_lockres;
struct suspend_info *s;
char str[64];
+ sector_t lo, hi;
for (i = 0; i < total_slots; i++) {
@@ -617,9 +656,24 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
lockres_free(bm_lockres);
continue;
}
- if (ret)
+ if (ret) {
+ lockres_free(bm_lockres);
goto out;
- /* TODO: Read the disk bitmap sb and check if it needs recovery */
+ }
+
+ /* Read the disk bitmap sb and check if it needs recovery */
+ ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
+ if (ret) {
+ pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
+ lockres_free(bm_lockres);
+ continue;
+ }
+ if ((hi > 0) && (lo < mddev->recovery_cp)) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ mddev->recovery_cp = lo;
+ md_check_recovery(mddev);
+ }
+
dlm_unlock_sync(bm_lockres);
lockres_free(bm_lockres);
}
@@ -633,20 +687,20 @@ static int join(struct mddev *mddev, int nodes)
int ret, ops_rv;
char str[64];
- if (!try_module_get(THIS_MODULE))
- return -ENOENT;
-
cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
if (!cinfo)
return -ENOMEM;
+ INIT_LIST_HEAD(&cinfo->suspend_list);
+ spin_lock_init(&cinfo->suspend_lock);
init_completion(&cinfo->completion);
+ set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
mutex_init(&cinfo->sb_mutex);
mddev->cluster_info = cinfo;
memset(str, 0, 64);
- pretty_uuid(str, mddev->uuid);
+ sprintf(str, "%pU", mddev->uuid);
ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
DLM_LSFL_FS, LVB_SIZE,
&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
@@ -659,12 +713,6 @@ static int join(struct mddev *mddev, int nodes)
ret = -ERANGE;
goto err;
}
- cinfo->sb_lock = lockres_init(mddev, "cmd-super",
- NULL, 0);
- if (!cinfo->sb_lock) {
- ret = -ENOMEM;
- goto err;
- }
/* Initiate the communication resources */
ret = -ENOMEM;
cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
@@ -705,9 +753,6 @@ static int join(struct mddev *mddev, int nodes)
goto err;
}
- INIT_LIST_HEAD(&cinfo->suspend_list);
- spin_lock_init(&cinfo->suspend_lock);
-
ret = gather_all_resync_info(mddev, nodes);
if (ret)
goto err;
@@ -719,12 +764,10 @@ err:
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
lockres_free(cinfo->bitmap_lockres);
- lockres_free(cinfo->sb_lock);
if (cinfo->lockspace)
dlm_release_lockspace(cinfo->lockspace, 2);
mddev->cluster_info = NULL;
kfree(cinfo);
- module_put(THIS_MODULE);
return ret;
}
@@ -740,7 +783,6 @@ static int leave(struct mddev *mddev)
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
- lockres_free(cinfo->sb_lock);
lockres_free(cinfo->bitmap_lockres);
dlm_release_lockspace(cinfo->lockspace, 2);
return 0;
@@ -817,8 +859,17 @@ static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
static void resync_finish(struct mddev *mddev)
{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg;
+ int slot = cinfo->slot_number - 1;
+
pr_info("%s:%d\n", __func__, __LINE__);
resync_send(mddev, RESYNCING, 0, 0);
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
+ cmsg.slot = cpu_to_le32(slot);
+ sendmsg(cinfo, &cmsg);
+ }
}
static int area_resyncing(struct mddev *mddev, int direction,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 40332625713b..4f5ecbe94ccb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -483,6 +483,8 @@ static void mddev_put(struct mddev *mddev)
bioset_free(bs);
}
+static void md_safemode_timeout(unsigned long data);
+
void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
@@ -490,7 +492,8 @@ void mddev_init(struct mddev *mddev)
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
- init_timer(&mddev->safemode_timer);
+ setup_timer(&mddev->safemode_timer, md_safemode_timeout,
+ (unsigned long) mddev);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
@@ -3255,8 +3258,6 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
return 0;
}
-static void md_safemode_timeout(unsigned long data);
-
static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
@@ -4189,6 +4190,8 @@ action_show(struct mddev *mddev, char *page)
type = "repair";
} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
type = "recover";
+ else if (mddev->reshape_position != MaxSector)
+ type = "reshape";
}
return sprintf(page, "%s\n", type);
}
@@ -5180,8 +5183,6 @@ int md_run(struct mddev *mddev)
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
- mddev->safemode_timer.function = md_safemode_timeout;
- mddev->safemode_timer.data = (unsigned long) mddev;
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
@@ -5194,6 +5195,11 @@ int md_run(struct mddev *mddev)
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
+ if (mddev->degraded && !mddev->ro)
+ /* This ensures that recovering status is reported immediately
+ * via sysfs - until a lack of spares is confirmed.
+ */
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
if (mddev->flags & MD_UPDATE_SB_FLAGS)
@@ -5741,16 +5747,16 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
err = 0;
spin_lock(&mddev->lock);
- /* bitmap disabled, zero the first byte and copy out */
- if (!mddev->bitmap_info.file)
- file->pathname[0] = '\0';
- else if ((ptr = file_path(mddev->bitmap_info.file,
- file->pathname, sizeof(file->pathname))),
- IS_ERR(ptr))
- err = PTR_ERR(ptr);
- else
- memmove(file->pathname, ptr,
- sizeof(file->pathname)-(ptr-file->pathname));
+ /* bitmap enabled */
+ if (mddev->bitmap_info.file) {
+ ptr = file_path(mddev->bitmap_info.file, file->pathname,
+ sizeof(file->pathname));
+ if (IS_ERR(ptr))
+ err = PTR_ERR(ptr);
+ else
+ memmove(file->pathname, ptr,
+ sizeof(file->pathname)-(ptr-file->pathname));
+ }
spin_unlock(&mddev->lock);
if (err == 0 &&
@@ -7069,7 +7075,7 @@ static void status_unused(struct seq_file *seq)
seq_printf(seq, "\n");
}
-static void status_resync(struct seq_file *seq, struct mddev *mddev)
+static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
unsigned long dt, db;
@@ -7077,18 +7083,32 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
int scale;
unsigned int per_milli;
- if (mddev->curr_resync <= 3)
- resync = 0;
- else
- resync = mddev->curr_resync
- - atomic_read(&mddev->recovery_active);
-
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
else
max_sectors = mddev->dev_sectors;
+ resync = mddev->curr_resync;
+ if (resync <= 3) {
+ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ /* Still cleaning up */
+ resync = max_sectors;
+ } else
+ resync -= atomic_read(&mddev->recovery_active);
+
+ if (resync == 0) {
+ if (mddev->recovery_cp < MaxSector) {
+ seq_printf(seq, "\tresync=PENDING");
+ return 1;
+ }
+ return 0;
+ }
+ if (resync < 3) {
+ seq_printf(seq, "\tresync=DELAYED");
+ return 1;
+ }
+
WARN_ON(max_sectors == 0);
/* Pick 'scale' such that (resync>>scale)*1000 will fit
* in a sector_t, and (max_sectors>>scale) will fit in a
@@ -7153,6 +7173,7 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
((unsigned long)rt % 60)/6);
seq_printf(seq, " speed=%ldK/sec", db/2/dt);
+ return 1;
}
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
@@ -7298,13 +7319,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
mddev->pers->status(seq, mddev);
seq_printf(seq, "\n ");
if (mddev->pers->sync_request) {
- if (mddev->curr_resync > 2) {
- status_resync(seq, mddev);
+ if (status_resync(seq, mddev))
seq_printf(seq, "\n ");
- } else if (mddev->curr_resync >= 1)
- seq_printf(seq, "\tresync=DELAYED\n ");
- else if (mddev->recovery_cp < MaxSector)
- seq_printf(seq, "\tresync=PENDING\n ");
}
} else
seq_printf(seq, "\n ");
@@ -7387,15 +7403,19 @@ int unregister_md_personality(struct md_personality *p)
}
EXPORT_SYMBOL(unregister_md_personality);
-int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
+int register_md_cluster_operations(struct md_cluster_operations *ops,
+ struct module *module)
{
- if (md_cluster_ops != NULL)
- return -EALREADY;
+ int ret = 0;
spin_lock(&pers_lock);
- md_cluster_ops = ops;
- md_cluster_mod = module;
+ if (md_cluster_ops != NULL)
+ ret = -EALREADY;
+ else {
+ md_cluster_ops = ops;
+ md_cluster_mod = module;
+ }
spin_unlock(&pers_lock);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(register_md_cluster_operations);
@@ -7793,7 +7813,8 @@ void md_do_sync(struct md_thread *thread)
> (max_sectors >> 4)) ||
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
(j - mddev->curr_resync_completed)*2
- >= mddev->resync_max - mddev->curr_resync_completed
+ >= mddev->resync_max - mddev->curr_resync_completed ||
+ mddev->curr_resync_completed > mddev->resync_max
)) {
/* time to update curr_resync_completed */
wait_event(mddev->recovery_wait,
@@ -7838,6 +7859,9 @@ void md_do_sync(struct md_thread *thread)
break;
j += sectors;
+ if (j > max_sectors)
+ /* when skipping, extra large numbers can be returned. */
+ j = max_sectors;
if (j > 2)
mddev->curr_resync = j;
if (mddev_is_clustered(mddev))
@@ -7906,12 +7930,15 @@ void md_do_sync(struct md_thread *thread)
blk_finish_plug(&plug);
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+ if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ mddev->curr_resync > 2) {
+ mddev->curr_resync_completed = mddev->curr_resync;
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ }
/* tell personality that we are finished */
mddev->pers->sync_request(mddev, max_sectors, &skipped);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_finish(mddev);
-
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@@ -7945,6 +7972,9 @@ void md_do_sync(struct md_thread *thread)
}
}
skip:
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->resync_finish(mddev);
+
set_bit(MD_CHANGE_DEVS, &mddev->flags);
spin_lock(&mddev->lock);
@@ -7955,11 +7985,11 @@ void md_do_sync(struct md_thread *thread)
mddev->resync_max = MaxSector;
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
+ set_bit(MD_RECOVERY_DONE, &mddev->recovery);
mddev->curr_resync = 0;
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
- set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return;
}
@@ -8128,6 +8158,7 @@ void md_check_recovery(struct mddev *mddev)
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
@@ -8574,6 +8605,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
/* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
md_wakeup_thread(rdev->mddev->thread);
}
return rv;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 59cda501a224..63e619b2f44e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -83,7 +83,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
- bool discard_supported = false;
+ unsigned short blksize = 512;
if (!conf)
return -ENOMEM;
@@ -98,6 +98,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors;
+ blksize = max(blksize, queue_logical_block_size(
+ rdev1->bdev->bd_disk->queue));
+
rdev_for_each(rdev2, mddev) {
pr_debug("md/raid0:%s: comparing %s(%llu)"
" with %s(%llu)\n",
@@ -134,6 +137,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
+ /*
+ * now since we have the hard sector sizes, we can make sure
+ * chunk size is a multiple of that sector size
+ */
+ if ((mddev->chunk_sectors << 9) % blksize) {
+ printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
+ mdname(mddev),
+ mddev->chunk_sectors << 9, blksize);
+ err = -EINVAL;
+ goto abort;
+ }
+
err = -ENOMEM;
conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
conf->nr_strip_zones, GFP_KERNEL);
@@ -188,16 +203,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
dev[j] = rdev1;
- if (mddev->queue)
- disk_stack_limits(mddev->gendisk, rdev1->bdev,
- rdev1->data_offset << 9);
-
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
-
- if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
- discard_supported = true;
}
if (cnt != mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
@@ -258,28 +266,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
(unsigned long long)smallest->sectors);
}
- /*
- * now since we have the hard sector sizes, we can make sure
- * chunk size is a multiple of that sector size
- */
- if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
- printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
- mdname(mddev),
- mddev->chunk_sectors << 9);
- goto abort;
- }
-
- if (mddev->queue) {
- blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
- blk_queue_io_opt(mddev->queue,
- (mddev->chunk_sectors << 9) * mddev->raid_disks);
-
- if (!discard_supported)
- queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
- else
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
- }
-
pr_debug("md/raid0:%s: done.\n", mdname(mddev));
*private_conf = conf;
@@ -378,12 +364,6 @@ static int raid0_run(struct mddev *mddev)
if (md_check_no_bitmap(mddev))
return -EINVAL;
- if (mddev->queue) {
- blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
- blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
- blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
- }
-
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
ret = create_strip_zones(mddev, &conf);
@@ -392,6 +372,29 @@ static int raid0_run(struct mddev *mddev)
mddev->private = conf;
}
conf = mddev->private;
+ if (mddev->queue) {
+ struct md_rdev *rdev;
+ bool discard_supported = false;
+
+ rdev_for_each(rdev, mddev) {
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
+ }
+ blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+
+ blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+ blk_queue_io_opt(mddev->queue,
+ (mddev->chunk_sectors << 9) * mddev->raid_disks);
+
+ if (!discard_supported)
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ }
/* calculate array device size */
md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f39d69f884de..4517f06c41ba 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1474,6 +1474,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid1:%s: Disk failure on %s, disabling device.\n"
"md/raid1:%s: Operation continuing on %d devices.\n",
@@ -2235,6 +2236,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
int m;
+ bool fail = false;
for (m = 0; m < conf->raid_disks * 2 ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2247,6 +2249,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
* narrow down and record precise write
* errors.
*/
+ fail = true;
if (!narrow_write_error(r1_bio, m)) {
md_error(conf->mddev,
conf->mirrors[m].rdev);
@@ -2258,7 +2261,13 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
}
if (test_bit(R1BIO_WriteError, &r1_bio->state))
close_write(r1_bio);
- raid_end_bio_io(r1_bio);
+ if (fail) {
+ spin_lock_irq(&conf->device_lock);
+ list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ raid_end_bio_io(r1_bio);
}
static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
@@ -2364,6 +2373,23 @@ static void raid1d(struct md_thread *thread)
md_check_recovery(mddev);
+ if (!list_empty_careful(&conf->bio_end_io_list) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ LIST_HEAD(tmp);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ list_add(&tmp, &conf->bio_end_io_list);
+ list_del_init(&conf->bio_end_io_list);
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ while (!list_empty(&tmp)) {
+ r1_bio = list_first_entry(&conf->bio_end_io_list,
+ struct r1bio, retry_list);
+ list_del(&r1_bio->retry_list);
+ raid_end_bio_io(r1_bio);
+ }
+ }
+
blk_start_plug(&plug);
for (;;) {
@@ -2763,6 +2789,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
+ INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
@@ -3057,6 +3084,7 @@ static int raid1_reshape(struct mddev *mddev)
unfreeze_array(conf);
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 14ebb288c1ef..c52d7139c5d7 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -61,6 +61,11 @@ struct r1conf {
* block, or anything else.
*/
struct list_head retry_list;
+ /* A separate list of r1bio which just need raid_end_bio_io called.
+ * This mustn't happen for writes which had any errors if the superblock
+ * needs to be written.
+ */
+ struct list_head bio_end_io_list;
/* queue pending writes to be submitted on unplug */
struct bio_list pending_bio_list;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b0fce2ebf7ad..0fc33eb88855 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1589,6 +1589,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
printk(KERN_ALERT
"md/raid10:%s: Disk failure on %s, disabling device.\n"
@@ -2623,6 +2624,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
}
put_buf(r10_bio);
} else {
+ bool fail = false;
for (m = 0; m < conf->copies; m++) {
int dev = r10_bio->devs[m].devnum;
struct bio *bio = r10_bio->devs[m].bio;
@@ -2634,6 +2636,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
r10_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev);
} else if (bio != NULL && bio->bi_error) {
+ fail = true;
if (!narrow_write_error(r10_bio, m)) {
md_error(conf->mddev, rdev);
set_bit(R10BIO_Degraded,
@@ -2654,7 +2657,13 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
if (test_bit(R10BIO_WriteError,
&r10_bio->state))
close_write(r10_bio);
- raid_end_bio_io(r10_bio);
+ if (fail) {
+ spin_lock_irq(&conf->device_lock);
+ list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ raid_end_bio_io(r10_bio);
}
}
@@ -2669,6 +2678,23 @@ static void raid10d(struct md_thread *thread)
md_check_recovery(mddev);
+ if (!list_empty_careful(&conf->bio_end_io_list) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ LIST_HEAD(tmp);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ list_add(&tmp, &conf->bio_end_io_list);
+ list_del_init(&conf->bio_end_io_list);
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ while (!list_empty(&tmp)) {
+ r10_bio = list_first_entry(&conf->bio_end_io_list,
+ struct r10bio, retry_list);
+ list_del(&r10_bio->retry_list);
+ raid_end_bio_io(r10_bio);
+ }
+ }
+
blk_start_plug(&plug);
for (;;) {
@@ -3443,6 +3469,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
conf->reshape_safe = conf->reshape_progress;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
+ INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
@@ -4097,7 +4124,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
* at a time, possibly less if that exceeds RESYNC_PAGES,
* or we hit a bad block or something.
* This might mean we pause for normal IO in the middle of
- * a chunk, but that is not a problem was mddev->reshape_position
+ * a chunk, but that is not a problem as mddev->reshape_position
* can record any location.
*
* If we will want to write to a location that isn't
@@ -4121,7 +4148,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
*
* In all this the minimum difference in data offsets
* (conf->offset_diff - always positive) allows a bit of slack,
- * so next can be after 'safe', but not by more than offset_disk
+ * so next can be after 'safe', but not by more than offset_diff
*
* We need to prepare all the bios here before we start any IO
* to ensure the size we choose is acceptable to all devices.
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 5ee6473ddc2c..6fc2c75759bf 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -53,6 +53,12 @@ struct r10conf {
sector_t offset_diff;
struct list_head retry_list;
+ /* A separate list of r1bio which just need raid_end_bio_io called.
+ * This mustn't happen for writes which had any errors if the superblock
+ * needs to be written.
+ */
+ struct list_head bio_end_io_list;
+
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
int pending_count;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b29e89cb815b..15ef2c641b2b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -223,18 +223,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
return slot;
}
-static void return_io(struct bio *return_bi)
+static void return_io(struct bio_list *return_bi)
{
- struct bio *bi = return_bi;
- while (bi) {
-
- return_bi = bi->bi_next;
- bi->bi_next = NULL;
+ struct bio *bi;
+ while ((bi = bio_list_pop(return_bi)) != NULL) {
bi->bi_iter.bi_size = 0;
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
bio_endio(bi);
- bi = return_bi;
}
}
@@ -1177,7 +1173,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
- struct bio *return_bi = NULL;
+ struct bio_list return_bi = BIO_EMPTY_LIST;
int i;
pr_debug("%s: stripe %llu\n", __func__,
@@ -1201,17 +1197,15 @@ static void ops_complete_biofill(void *stripe_head_ref)
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(rbi)) {
- rbi->bi_next = return_bi;
- return_bi = rbi;
- }
+ if (!raid5_dec_bi_active_stripes(rbi))
+ bio_list_add(&return_bi, rbi);
rbi = rbi2;
}
}
}
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
- return_io(return_bi);
+ return_io(&return_bi);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
@@ -2517,6 +2511,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid:%s: Disk failure on %s, disabling device.\n"
"md/raid:%s: Operation continuing on %d devices.\n",
@@ -3069,7 +3064,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
static void
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks,
- struct bio **return_bi)
+ struct bio_list *return_bi)
{
int i;
BUG_ON(sh->batch_head);
@@ -3114,8 +3109,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
- bi->bi_next = *return_bi;
- *return_bi = bi;
+ bio_list_add(return_bi, bi);
}
bi = nextbi;
}
@@ -3139,8 +3133,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
- bi->bi_next = *return_bi;
- *return_bi = bi;
+ bio_list_add(return_bi, bi);
}
bi = bi2;
}
@@ -3163,10 +3156,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
r5_next_bio(bi, sh->dev[i].sector);
bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi)) {
- bi->bi_next = *return_bi;
- *return_bi = bi;
- }
+ if (!raid5_dec_bi_active_stripes(bi))
+ bio_list_add(return_bi, bi);
bi = nextbi;
}
}
@@ -3445,7 +3436,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
* never LOCKED, so we don't need to test 'failed' directly.
*/
static void handle_stripe_clean_event(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio **return_bi)
+ struct stripe_head *sh, int disks, struct bio_list *return_bi)
{
int i;
struct r5dev *dev;
@@ -3479,8 +3470,7 @@ returnbi:
wbi2 = r5_next_bio(wbi, dev->sector);
if (!raid5_dec_bi_active_stripes(wbi)) {
md_write_end(conf->mddev);
- wbi->bi_next = *return_bi;
- *return_bi = wbi;
+ bio_list_add(return_bi, wbi);
}
wbi = wbi2;
}
@@ -4613,7 +4603,15 @@ finish:
md_wakeup_thread(conf->mddev->thread);
}
- return_io(s.return_bi);
+ if (!bio_list_empty(&s.return_bi)) {
+ if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
+ spin_lock_irq(&conf->device_lock);
+ bio_list_merge(&conf->return_bi, &s.return_bi);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ return_io(&s.return_bi);
+ }
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}
@@ -4672,12 +4670,12 @@ static int raid5_congested(struct mddev *mddev, int bits)
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
+ struct r5conf *conf = mddev->private;
sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
- unsigned int chunk_sectors = mddev->chunk_sectors;
+ unsigned int chunk_sectors;
unsigned int bio_sectors = bio_sectors(bio);
- if (mddev->new_chunk_sectors < mddev->chunk_sectors)
- chunk_sectors = mddev->new_chunk_sectors;
+ chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
return chunk_sectors >=
((sector & (chunk_sectors - 1)) + bio_sectors);
}
@@ -5325,6 +5323,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sector_t stripe_addr;
int reshape_sectors;
struct list_head stripes;
+ sector_t retn;
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
@@ -5332,6 +5331,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress;
+ } else if (mddev->reshape_backwards &&
+ conf->reshape_progress == MaxSector) {
+ /* shouldn't happen, but just in case, finish up.*/
+ sector_nr = MaxSector;
} else if (!mddev->reshape_backwards &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress;
@@ -5340,7 +5343,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
mddev->curr_resync_completed = sector_nr;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
*skipped = 1;
- return sector_nr;
+ retn = sector_nr;
+ goto finish;
}
}
@@ -5348,10 +5352,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
* If old and new chunk sizes differ, we need to process the
* largest of these
*/
- if (mddev->new_chunk_sectors > mddev->chunk_sectors)
- reshape_sectors = mddev->new_chunk_sectors;
- else
- reshape_sectors = mddev->chunk_sectors;
+
+ reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
/* We update the metadata at least every 10 seconds, or when
* the data about to be copied would over-write the source of
@@ -5366,11 +5368,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
if (mddev->reshape_backwards) {
- writepos -= min_t(sector_t, reshape_sectors, writepos);
+ BUG_ON(writepos < reshape_sectors);
+ writepos -= reshape_sectors;
readpos += reshape_sectors;
safepos += reshape_sectors;
} else {
writepos += reshape_sectors;
+ /* readpos and safepos are worst-case calculations.
+ * A negative number is overly pessimistic, and causes
+ * obvious problems for unsigned storage. So clip to 0.
+ */
readpos -= min_t(sector_t, reshape_sectors, readpos);
safepos -= min_t(sector_t, reshape_sectors, safepos);
}
@@ -5513,7 +5520,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
* then we need to write out the superblock.
*/
sector_nr += reshape_sectors;
- if ((sector_nr - mddev->curr_resync_completed) * 2
+ retn = reshape_sectors;
+finish:
+ if (mddev->curr_resync_completed > mddev->resync_max ||
+ (sector_nr - mddev->curr_resync_completed) * 2
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
@@ -5538,7 +5548,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
ret:
- return reshape_sectors;
+ return retn;
}
static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
@@ -5794,6 +5804,18 @@ static void raid5d(struct md_thread *thread)
md_check_recovery(mddev);
+ if (!bio_list_empty(&conf->return_bi) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ struct bio_list tmp = BIO_EMPTY_LIST;
+ spin_lock_irq(&conf->device_lock);
+ if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ bio_list_merge(&tmp, &conf->return_bi);
+ bio_list_init(&conf->return_bi);
+ }
+ spin_unlock_irq(&conf->device_lock);
+ return_io(&tmp);
+ }
+
blk_start_plug(&plug);
handled = 0;
spin_lock_irq(&conf->device_lock);
@@ -6234,8 +6256,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
/* size is defined by the smallest of previous and new size */
raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
- sectors &= ~((sector_t)mddev->chunk_sectors - 1);
- sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
+ sectors &= ~((sector_t)conf->chunk_sectors - 1);
+ sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
return sectors * (raid_disks - conf->max_degraded);
}
@@ -6453,6 +6475,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
+ bio_list_init(&conf->return_bi);
init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
@@ -6542,6 +6565,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (conf->reshape_progress != MaxSector) {
conf->prev_chunk_sectors = mddev->chunk_sectors;
conf->prev_algo = mddev->layout;
+ } else {
+ conf->prev_chunk_sectors = conf->chunk_sectors;
+ conf->prev_algo = conf->algorithm;
}
conf->min_nr_stripes = NR_STRIPES;
@@ -6661,6 +6687,8 @@ static int run(struct mddev *mddev)
sector_t here_new, here_old;
int old_disks;
int max_degraded = (mddev->level == 6 ? 2 : 1);
+ int chunk_sectors;
+ int new_data_disks;
if (mddev->new_level != mddev->level) {
printk(KERN_ERR "md/raid:%s: unsupported reshape "
@@ -6672,28 +6700,25 @@ static int run(struct mddev *mddev)
/* reshape_position must be on a new-stripe boundary, and one
* further up in new geometry must map after here in old
* geometry.
+ * If the chunk sizes are different, then as we perform reshape
+ * in units of the largest of the two, reshape_position needs
+ * be a multiple of the largest chunk size times new data disks.
*/
here_new = mddev->reshape_position;
- if (sector_div(here_new, mddev->new_chunk_sectors *
- (mddev->raid_disks - max_degraded))) {
+ chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
+ new_data_disks = mddev->raid_disks - max_degraded;
+ if (sector_div(here_new, chunk_sectors * new_data_disks)) {
printk(KERN_ERR "md/raid:%s: reshape_position not "
"on a stripe boundary\n", mdname(mddev));
return -EINVAL;
}
- reshape_offset = here_new * mddev->new_chunk_sectors;
+ reshape_offset = here_new * chunk_sectors;
/* here_new is the stripe we will write to */
here_old = mddev->reshape_position;
- sector_div(here_old, mddev->chunk_sectors *
- (old_disks-max_degraded));
+ sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
/* here_old is the first stripe that we might need to read
* from */
if (mddev->delta_disks == 0) {
- if ((here_new * mddev->new_chunk_sectors !=
- here_old * mddev->chunk_sectors)) {
- printk(KERN_ERR "md/raid:%s: reshape position is"
- " confused - aborting\n", mdname(mddev));
- return -EINVAL;
- }
/* We cannot be sure it is safe to start an in-place
* reshape. It is only safe if user-space is monitoring
* and taking constant backups.
@@ -6712,10 +6737,10 @@ static int run(struct mddev *mddev)
return -EINVAL;
}
} else if (mddev->reshape_backwards
- ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
- here_old * mddev->chunk_sectors)
- : (here_new * mddev->new_chunk_sectors >=
- here_old * mddev->chunk_sectors + (-min_offset_diff))) {
+ ? (here_new * chunk_sectors + min_offset_diff <=
+ here_old * chunk_sectors)
+ : (here_new * chunk_sectors >=
+ here_old * chunk_sectors + (-min_offset_diff))) {
/* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "md/raid:%s: reshape_position too early for "
"auto-recovery - aborting.\n",
@@ -6967,7 +6992,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
int i;
seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
- mddev->chunk_sectors / 2, mddev->layout);
+ conf->chunk_sectors / 2, mddev->layout);
seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
@@ -7173,7 +7198,9 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
* worth it.
*/
sector_t newsize;
- sectors &= ~((sector_t)mddev->chunk_sectors - 1);
+ struct r5conf *conf = mddev->private;
+
+ sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
if (mddev->external_size &&
mddev->array_sectors > newsize)
@@ -7412,6 +7439,7 @@ static void end_reshape(struct r5conf *conf)
rdev->data_offset = rdev->new_data_offset;
smp_wmb();
conf->reshape_progress = MaxSector;
+ conf->mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index d05144278690..828c2925e68f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -265,7 +265,7 @@ struct stripe_head_state {
int dec_preread_active;
unsigned long ops_request;
- struct bio *return_bi;
+ struct bio_list return_bi;
struct md_rdev *blocked_rdev;
int handle_bad_blocks;
};
@@ -476,6 +476,9 @@ struct r5conf {
int skip_copy; /* Don't copy data from bio to stripe cache */
struct list_head *last_hold; /* detect hold_list promotions */
+ /* bios to have bi_end_io called after metadata is synced */
+ struct bio_list return_bi;
+
atomic_t reshape_stripes; /* stripes with pending writes for reshape */
/* unfortunately we need two cache names as we temporarily have
* two caches.