summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/md.rst32
-rw-r--r--Documentation/md/md-cluster.txt2
-rw-r--r--Documentation/md/raid5-ppl.txt44
-rw-r--r--block/bio.c61
-rw-r--r--drivers/md/Makefile2
-rw-r--r--drivers/md/bitmap.c59
-rw-r--r--drivers/md/bitmap.h3
-rw-r--r--drivers/md/linear.c75
-rw-r--r--drivers/md/md-cluster.c223
-rw-r--r--drivers/md/md-cluster.h1
-rw-r--r--drivers/md/md.c414
-rw-r--r--drivers/md/md.h71
-rw-r--r--drivers/md/raid0.c78
-rw-r--r--drivers/md/raid1.c679
-rw-r--r--drivers/md/raid1.h13
-rw-r--r--drivers/md/raid10.c736
-rw-r--r--drivers/md/raid10.h1
-rw-r--r--drivers/md/raid5-cache.c362
-rw-r--r--drivers/md/raid5-log.h115
-rw-r--r--drivers/md/raid5-ppl.c1271
-rw-r--r--drivers/md/raid5.c643
-rw-r--r--drivers/md/raid5.h106
-rw-r--r--include/linux/bio.h11
-rw-r--r--include/linux/percpu-refcount.h1
-rw-r--r--include/uapi/linux/raid/md_p.h45
-rw-r--r--lib/percpu-refcount.c17
26 files changed, 3582 insertions, 1483 deletions
diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst
index 1e61bf50595c..84de718f24a4 100644
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@@ -276,14 +276,14 @@ All md devices contain:
array creation it will default to 0, though starting the array as
``clean`` will set it much larger.
- new_dev
+ new_dev
This file can be written but not read. The value written should
be a block device number as major:minor. e.g. 8:0
This will cause that device to be attached to the array, if it is
available. It will then appear at md/dev-XXX (depending on the
name of the device) and further configuration is then possible.
- safe_mode_delay
+ safe_mode_delay
When an md array has seen no write requests for a certain period
of time, it will be marked as ``clean``. When another write
request arrives, the array is marked as ``dirty`` before the write
@@ -292,7 +292,7 @@ All md devices contain:
period as a number of seconds. The default is 200msec (0.200).
Writing a value of 0 disables safemode.
- array_state
+ array_state
This file contains a single word which describes the current
state of the array. In many cases, the state can be set by
writing the word for the desired state, however some states
@@ -401,7 +401,30 @@ All md devices contain:
once the array becomes non-degraded, and this fact has been
recorded in the metadata.
+ consistency_policy
+ This indicates how the array maintains consistency in case of unexpected
+ shutdown. It can be:
+ none
+ Array has no redundancy information, e.g. raid0, linear.
+
+ resync
+ Full resync is performed and all redundancy is regenerated when the
+ array is started after unclean shutdown.
+
+ bitmap
+ Resync assisted by a write-intent bitmap.
+
+ journal
+ For raid4/5/6, journal device is used to log transactions and replay
+ after unclean shutdown.
+
+ ppl
+ For raid5 only, Partial Parity Log is used to close the write hole and
+ eliminate resync.
+
+ The accepted values when writing to this file are ``ppl`` and ``resync``,
+ used to enable and disable PPL.
As component devices are added to an md array, they appear in the ``md``
@@ -563,6 +586,9 @@ Each directory contains:
adds bad blocks without acknowledging them. This is largely
for testing.
+ ppl_sector, ppl_size
+ Location and size (in sectors) of the space used for Partial Parity Log
+ on this device.
An active md device will also contain an entry for each active device
diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt
index 38883276d31c..2663d49dd8a0 100644
--- a/Documentation/md/md-cluster.txt
+++ b/Documentation/md/md-cluster.txt
@@ -321,4 +321,4 @@ The algorithm is:
There are somethings which are not supported by cluster MD yet.
-- update size and change array_sectors.
+- change array_sectors.
diff --git a/Documentation/md/raid5-ppl.txt b/Documentation/md/raid5-ppl.txt
new file mode 100644
index 000000000000..127072b09363
--- /dev/null
+++ b/Documentation/md/raid5-ppl.txt
@@ -0,0 +1,44 @@
+Partial Parity Log
+
+Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue
+addressed by PPL is that after a dirty shutdown, parity of a particular stripe
+may become inconsistent with data on other member disks. If the array is also
+in degraded state, there is no way to recalculate parity, because one of the
+disks is missing. This can lead to silent data corruption when rebuilding the
+array or using it as degraded - data calculated from parity for array blocks
+that have not been touched by a write request during the unclean shutdown can
+be incorrect. Such condition is known as the RAID5 Write Hole. Because of
+this, md by default does not allow starting a dirty degraded array.
+
+Partial parity for a write operation is the XOR of stripe data chunks not
+modified by this write. It is just enough data needed for recovering from the
+write hole. XORing partial parity with the modified chunks produces parity for
+the stripe, consistent with its state before the write operation, regardless of
+which chunk writes have completed. If one of the not modified data disks of
+this stripe is missing, this updated parity can be used to recover its
+contents. PPL recovery is also performed when starting an array after an
+unclean shutdown and all disks are available, eliminating the need to resync
+the array. Because of this, using write-intent bitmap and PPL together is not
+supported.
+
+When handling a write request PPL writes partial parity before new data and
+parity are dispatched to disks. PPL is a distributed log - it is stored on
+array member drives in the metadata area, on the parity drive of a particular
+stripe. It does not require a dedicated journaling drive. Write performance is
+reduced by up to 30%-40% but it scales with the number of drives in the array
+and the journaling drive does not become a bottleneck or a single point of
+failure.
+
+Unlike raid5-cache, the other solution in md for closing the write hole, PPL is
+not a true journal. It does not protect from losing in-flight data, only from
+silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is
+performed for this stripe (parity is not updated). So it is possible to have
+arbitrary data in the written part of a stripe if that disk is lost. In such
+case the behavior is the same as in plain raid5.
+
+PPL is available for md version-1 metadata and external (specifically IMSM)
+metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
+
+Currently, volatile write-back cache should be disabled on all member drives
+when using PPL. Otherwise it cannot guarantee consistency in case of power
+failure.
diff --git a/block/bio.c b/block/bio.c
index f4d207180266..888e7801c638 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -633,20 +633,21 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
}
EXPORT_SYMBOL(bio_clone_fast);
-static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs, int offset,
- int size)
+/**
+ * bio_clone_bioset - clone a bio
+ * @bio_src: bio to clone
+ * @gfp_mask: allocation priority
+ * @bs: bio_set to allocate from
+ *
+ * Clone bio. Caller will own the returned bio, but not the actual data it
+ * points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+ struct bio_set *bs)
{
struct bvec_iter iter;
struct bio_vec bv;
struct bio *bio;
- struct bvec_iter iter_src = bio_src->bi_iter;
-
- /* for supporting partial clone */
- if (offset || size != bio_src->bi_iter.bi_size) {
- bio_advance_iter(bio_src, &iter_src, offset);
- iter_src.bi_size = size;
- }
/*
* Pre immutable biovecs, __bio_clone() used to just do a memcpy from
@@ -670,8 +671,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
* __bio_clone_fast() anyways.
*/
- bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src,
- &iter_src), bs);
+ bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
if (!bio)
return NULL;
bio->bi_bdev = bio_src->bi_bdev;
@@ -688,7 +688,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
break;
default:
- __bio_for_each_segment(bv, bio_src, iter, iter_src)
+ bio_for_each_segment(bv, bio_src, iter)
bio->bi_io_vec[bio->bi_vcnt++] = bv;
break;
}
@@ -707,44 +707,9 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
return bio;
}
-
-/**
- * bio_clone_bioset - clone a bio
- * @bio_src: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- *
- * Clone bio. Caller will own the returned bio, but not the actual data it
- * points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs)
-{
- return __bio_clone_bioset(bio_src, gfp_mask, bs, 0,
- bio_src->bi_iter.bi_size);
-}
EXPORT_SYMBOL(bio_clone_bioset);
/**
- * bio_clone_bioset_partial - clone a partial bio
- * @bio_src: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- * @offset: cloned starting from the offset
- * @size: size for the cloned bio
- *
- * Clone bio. Caller will own the returned bio, but not the actual data it
- * points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs, int offset,
- int size)
-{
- return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size);
-}
-EXPORT_SYMBOL(bio_clone_bioset_partial);
-
-/**
* bio_add_pc_page - attempt to add page to bio
* @q: the target queue
* @bio: destination bio
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1af87a0..4d48714ccc6b 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,7 +18,7 @@ dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
-raid456-y += raid5.o raid5-cache.o
+raid456-y += raid5.o raid5-cache.o raid5-ppl.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9fb2ccac958a..bf7419a56454 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -471,6 +471,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
kunmap_atomic(sb);
write_page(bitmap, bitmap->storage.sb_page, 1);
}
+EXPORT_SYMBOL(bitmap_update_sb);
/* print out the bitmap file superblock */
void bitmap_print_sb(struct bitmap *bitmap)
@@ -696,7 +697,7 @@ re_read:
out:
kunmap_atomic(sb);
- /* Assiging chunksize is required for "re_read" */
+ /* Assigning chunksize is required for "re_read" */
bitmap->mddev->bitmap_info.chunksize = chunksize;
if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
err = md_setup_cluster(bitmap->mddev, nodes);
@@ -1727,7 +1728,7 @@ void bitmap_flush(struct mddev *mddev)
/*
* free memory that was allocated
*/
-static void bitmap_free(struct bitmap *bitmap)
+void bitmap_free(struct bitmap *bitmap)
{
unsigned long k, pages;
struct bitmap_page *bp;
@@ -1761,6 +1762,21 @@ static void bitmap_free(struct bitmap *bitmap)
kfree(bp);
kfree(bitmap);
}
+EXPORT_SYMBOL(bitmap_free);
+
+void bitmap_wait_behind_writes(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ /* wait for behind writes to complete */
+ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+ pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
+ mdname(mddev));
+ /* need to kick something here to make sure I/O goes? */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+}
void bitmap_destroy(struct mddev *mddev)
{
@@ -1769,6 +1785,8 @@ void bitmap_destroy(struct mddev *mddev)
if (!bitmap) /* there was no bitmap */
return;
+ bitmap_wait_behind_writes(mddev);
+
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
mddev->bitmap = NULL; /* disconnect from the md device */
@@ -1920,6 +1938,27 @@ out:
}
EXPORT_SYMBOL_GPL(bitmap_load);
+struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
+{
+ int rv = 0;
+ struct bitmap *bitmap;
+
+ bitmap = bitmap_create(mddev, slot);
+ if (IS_ERR(bitmap)) {
+ rv = PTR_ERR(bitmap);
+ return ERR_PTR(rv);
+ }
+
+ rv = bitmap_init_from_disk(bitmap, 0);
+ if (rv) {
+ bitmap_free(bitmap);
+ return ERR_PTR(rv);
+ }
+
+ return bitmap;
+}
+EXPORT_SYMBOL(get_bitmap_from_slot);
+
/* Loads the bitmap associated with slot and copies the resync information
* to our bitmap
*/
@@ -1929,14 +1968,13 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
int rv = 0, i, j;
sector_t block, lo = 0, hi = 0;
struct bitmap_counts *counts;
- struct bitmap *bitmap = bitmap_create(mddev, slot);
-
- if (IS_ERR(bitmap))
- return PTR_ERR(bitmap);
+ struct bitmap *bitmap;
- rv = bitmap_init_from_disk(bitmap, 0);
- if (rv)
- goto err;
+ bitmap = get_bitmap_from_slot(mddev, slot);
+ if (IS_ERR(bitmap)) {
+ pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
+ return -1;
+ }
counts = &bitmap->counts;
for (j = 0; j < counts->chunks; j++) {
@@ -1963,8 +2001,7 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
bitmap_unplug(mddev->bitmap);
*low = lo;
*high = hi;
-err:
- bitmap_free(bitmap);
+
return rv;
}
EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 5b6dd63dda91..d15721ac07a6 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -267,8 +267,11 @@ void bitmap_daemon_work(struct mddev *mddev);
int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, int init);
+struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
int bitmap_copy_from_slot(struct mddev *mddev, int slot,
sector_t *lo, sector_t *hi, bool clear_bits);
+void bitmap_free(struct bitmap *bitmap);
+void bitmap_wait_behind_writes(struct mddev *mddev);
#endif
#endif
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 377a8a3672e3..df6f2c98eca7 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -249,54 +249,49 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
{
char b[BDEVNAME_SIZE];
struct dev_info *tmp_dev;
- struct bio *split;
sector_t start_sector, end_sector, data_offset;
+ sector_t bio_sector = bio->bi_iter.bi_sector;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
return;
}
- do {
- sector_t bio_sector = bio->bi_iter.bi_sector;
- tmp_dev = which_dev(mddev, bio_sector);
- start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
- end_sector = tmp_dev->end_sector;
- data_offset = tmp_dev->rdev->data_offset;
- bio->bi_bdev = tmp_dev->rdev->bdev;
-
- if (unlikely(bio_sector >= end_sector ||
- bio_sector < start_sector))
- goto out_of_bounds;
-
- if (unlikely(bio_end_sector(bio) > end_sector)) {
- /* This bio crosses a device boundary, so we have to
- * split it.
- */
- split = bio_split(bio, end_sector - bio_sector,
- GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
+ tmp_dev = which_dev(mddev, bio_sector);
+ start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+ end_sector = tmp_dev->end_sector;
+ data_offset = tmp_dev->rdev->data_offset;
+
+ if (unlikely(bio_sector >= end_sector ||
+ bio_sector < start_sector))
+ goto out_of_bounds;
+
+ if (unlikely(bio_end_sector(bio) > end_sector)) {
+ /* This bio crosses a device boundary, so we have to split it */
+ struct bio *split = bio_split(bio, end_sector - bio_sector,
+ GFP_NOIO, mddev->bio_set);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ }
- split->bi_iter.bi_sector = split->bi_iter.bi_sector -
- start_sector + data_offset;
-
- if (unlikely((bio_op(split) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
- /* Just ignore it */
- bio_endio(split);
- } else {
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
- split, disk_devt(mddev->gendisk),
- bio_sector);
- mddev_check_writesame(mddev, split);
- mddev_check_write_zeroes(mddev, split);
- generic_make_request(split);
- }
- } while (split != bio);
+ bio->bi_bdev = tmp_dev->rdev->bdev;
+ bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
+ start_sector + data_offset;
+
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(bio);
+ } else {
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+ bio, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_writesame(mddev, bio);
+ mddev_check_write_zeroes(mddev, bio);
+ generic_make_request(bio);
+ }
return;
out_of_bounds:
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 321ecac23027..7299ce2f08a8 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -67,9 +67,10 @@ struct resync_info {
* set up all the related infos such as bitmap and personality */
#define MD_CLUSTER_ALREADY_IN_CLUSTER 6
#define MD_CLUSTER_PENDING_RECV_EVENT 7
-
+#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8
struct md_cluster_info {
+ struct mddev *mddev; /* the md device which md_cluster_info belongs to */
/* dlm lock space and resources for clustered raid. */
dlm_lockspace_t *lockspace;
int slot_number;
@@ -103,6 +104,7 @@ enum msg_type {
REMOVE,
RE_ADD,
BITMAP_NEEDS_SYNC,
+ CHANGE_CAPACITY,
};
struct cluster_msg {
@@ -523,11 +525,17 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
+ int got_lock = 0;
struct md_cluster_info *cinfo = mddev->cluster_info;
mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
- set_bit(MD_RELOAD_SB, &mddev->flags);
+
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
- md_wakeup_thread(mddev->thread);
+ wait_event(mddev->thread->wqueue,
+ (got_lock = mddev_trylock(mddev)) ||
+ test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
+ md_reload_sb(mddev, mddev->good_device_nr);
+ if (got_lock)
+ mddev_unlock(mddev);
}
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
@@ -572,6 +580,10 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
case METADATA_UPDATED:
process_metadata_update(mddev, msg);
break;
+ case CHANGE_CAPACITY:
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ break;
case RESYNCING:
process_suspend_info(mddev, le32_to_cpu(msg->slot),
le64_to_cpu(msg->low),
@@ -646,11 +658,29 @@ out:
* Takes the lock on the TOKEN lock resource so no other
* node can communicate while the operation is underway.
*/
-static int lock_token(struct md_cluster_info *cinfo)
+static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
{
- int error;
+ int error, set_bit = 0;
+ struct mddev *mddev = cinfo->mddev;
+ /*
+ * If the resync thread runs after the raid1d thread, process_metadata_update
+ * cannot continue while raid1d holds reconfig_mutex (and raid1d is blocked
+ * since another node already got EX on Token and is waiting for EX on Ack),
+ * so let resync wake up the thread in case the flag is set.
+ */
+ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+ &cinfo->state)) {
+ error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+ &cinfo->state);
+ WARN_ON_ONCE(error);
+ md_wakeup_thread(mddev->thread);
+ set_bit = 1;
+ }
error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
+ if (set_bit)
+ clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
+
if (error)
pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
__func__, __LINE__, error);
@@ -663,12 +693,12 @@ static int lock_token(struct md_cluster_info *cinfo)
/* lock_comm()
* Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
*/
-static int lock_comm(struct md_cluster_info *cinfo)
+static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
{
wait_event(cinfo->wait,
!test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
- return lock_token(cinfo);
+ return lock_token(cinfo, mddev_locked);
}
static void unlock_comm(struct md_cluster_info *cinfo)
@@ -743,11 +773,12 @@ failed_message:
return error;
}
-static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
+static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg,
+ bool mddev_locked)
{
int ret;
- lock_comm(cinfo);
+ lock_comm(cinfo, mddev_locked);
ret = __sendmsg(cinfo, cmsg);
unlock_comm(cinfo);
return ret;
@@ -834,6 +865,7 @@ static int join(struct mddev *mddev, int nodes)
mutex_init(&cinfo->recv_mutex);
mddev->cluster_info = cinfo;
+ cinfo->mddev = mddev;
memset(str, 0, 64);
sprintf(str, "%pU", mddev->uuid);
@@ -908,6 +940,7 @@ static int join(struct mddev *mddev, int nodes)
return 0;
err:
+ set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
@@ -943,7 +976,7 @@ static void resync_bitmap(struct mddev *mddev)
int err;
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
- err = sendmsg(cinfo, &cmsg);
+ err = sendmsg(cinfo, &cmsg, 1);
if (err)
pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
__func__, __LINE__, err);
@@ -963,6 +996,7 @@ static int leave(struct mddev *mddev)
if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
resync_bitmap(mddev);
+ set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
@@ -997,16 +1031,30 @@ static int slot_number(struct mddev *mddev)
static int metadata_update_start(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
+ int ret;
+
+ /*
+ * metadata_update_start is always called with the protection of
+ * reconfig_mutex, so set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD here.
+ */
+ ret = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+ &cinfo->state);
+ WARN_ON_ONCE(ret);
+ md_wakeup_thread(mddev->thread);
wait_event(cinfo->wait,
!test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));
/* If token is already locked, return 0 */
- if (cinfo->token_lockres->mode == DLM_LOCK_EX)
+ if (cinfo->token_lockres->mode == DLM_LOCK_EX) {
+ clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
return 0;
+ }
- return lock_token(cinfo);
+ ret = lock_token(cinfo, 1);
+ clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
+ return ret;
}
static int metadata_update_finish(struct mddev *mddev)
@@ -1043,6 +1091,141 @@ static void metadata_update_cancel(struct mddev *mddev)
unlock_comm(cinfo);
}
+/*
+ * return 0 if all the bitmaps have the same sync_size
+ */
+int cluster_check_sync_size(struct mddev *mddev)
+{
+ int i, rv;
+ bitmap_super_t *sb;
+ unsigned long my_sync_size, sync_size = 0;
+ int node_num = mddev->bitmap_info.nodes;
+ int current_slot = md_cluster_ops->slot_number(mddev);
+ struct bitmap *bitmap = mddev->bitmap;
+ char str[64];
+ struct dlm_lock_resource *bm_lockres;
+
+ sb = kmap_atomic(bitmap->storage.sb_page);
+ my_sync_size = sb->sync_size;
+ kunmap_atomic(sb);
+
+ for (i = 0; i < node_num; i++) {
+ if (i == current_slot)
+ continue;
+
+ bitmap = get_bitmap_from_slot(mddev, i);
+ if (IS_ERR(bitmap)) {
+ pr_err("can't get bitmap from slot %d\n", i);
+ return -1;
+ }
+
+ /*
+ * If we can hold the bitmap lock of one node then
+ * the slot is not occupied, update the sb.
+ */
+ snprintf(str, 64, "bitmap%04d", i);
+ bm_lockres = lockres_init(mddev, str, NULL, 1);
+ if (!bm_lockres) {
+ pr_err("md-cluster: Cannot initialize %s\n", str);
+ bitmap_free(bitmap);
+ return -1;
+ }
+ bm_lockres->flags |= DLM_LKF_NOQUEUE;
+ rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+ if (!rv)
+ bitmap_update_sb(bitmap);
+ lockres_free(bm_lockres);
+
+ sb = kmap_atomic(bitmap->storage.sb_page);
+ if (sync_size == 0)
+ sync_size = sb->sync_size;
+ else if (sync_size != sb->sync_size) {
+ kunmap_atomic(sb);
+ bitmap_free(bitmap);
+ return -1;
+ }
+ kunmap_atomic(sb);
+ bitmap_free(bitmap);
+ }
+
+ return (my_sync_size == sync_size) ? 0 : -1;
+}
+
+/*
+ * Updating the size for cluster raid is a little more complex; we perform it
+ * in these steps:
+ * 1. hold token lock and update superblock in initiator node.
+ * 2. send METADATA_UPDATED msg to other nodes.
+ * 3. The initiator node continues to check each bitmap's sync_size; if all
+ * bitmaps have the same value of sync_size, then we can set capacity and
+ * let the other nodes do the same. If one node can't update sync_size
+ * accordingly, we need to revert to the previous value.
+ */
+static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg;
+ struct md_rdev *rdev;
+ int ret = 0;
+ int raid_slot = -1;
+
+ md_update_sb(mddev, 1);
+ lock_comm(cinfo, 1);
+
+ memset(&cmsg, 0, sizeof(cmsg));
+ cmsg.type = cpu_to_le32(METADATA_UPDATED);
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) {
+ raid_slot = rdev->desc_nr;
+ break;
+ }
+ if (raid_slot >= 0) {
+ cmsg.raid_slot = cpu_to_le32(raid_slot);
+ /*
+ * We can only change capacity after all the nodes can do it,
+ * so we need to wait until the other nodes have received the msg
+ * and handled the change
+ */
+ ret = __sendmsg(cinfo, &cmsg);
+ if (ret) {
+ pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
+ __func__, __LINE__);
+ unlock_comm(cinfo);
+ return;
+ }
+ } else {
+ pr_err("md-cluster: No good device id found to send\n");
+ unlock_comm(cinfo);
+ return;
+ }
+
+ /*
+ * check the sync_size from other nodes' bitmaps; if sync_size
+ * has already been updated in other nodes as expected, send a
+ * CHANGE_CAPACITY msg to permit the change of capacity
+ */
+ if (cluster_check_sync_size(mddev) == 0) {
+ memset(&cmsg, 0, sizeof(cmsg));
+ cmsg.type = cpu_to_le32(CHANGE_CAPACITY);
+ ret = __sendmsg(cinfo, &cmsg);
+ if (ret)
+ pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
+ __func__, __LINE__);
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ } else {
+ /* revert to previous sectors */
+ ret = mddev->pers->resize(mddev, old_dev_sectors);
+ if (!ret)
+ revalidate_disk(mddev->gendisk);
+ ret = __sendmsg(cinfo, &cmsg);
+ if (ret)
+ pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
+ __func__, __LINE__);
+ }
+ unlock_comm(cinfo);
+}
+
static int resync_start(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1069,7 +1252,14 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
cmsg.low = cpu_to_le64(lo);
cmsg.high = cpu_to_le64(hi);
- return sendmsg(cinfo, &cmsg);
+ /*
+ * mddev_lock is held if resync_info_update is called from
+ * resync_finish (md_reap_sync_thread -> resync_finish)
+ */
+ if (lo == 0 && hi == 0)
+ return sendmsg(cinfo, &cmsg, 1);
+ else
+ return sendmsg(cinfo, &cmsg, 0);
}
static int resync_finish(struct mddev *mddev)
@@ -1119,7 +1309,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
cmsg.type = cpu_to_le32(NEWDISK);
memcpy(cmsg.uuid, uuid, 16);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
- lock_comm(cinfo);
+ lock_comm(cinfo, 1);
ret = __sendmsg(cinfo, &cmsg);
if (ret)
return ret;
@@ -1179,7 +1369,7 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct md_cluster_info *cinfo = mddev->cluster_info;
cmsg.type = cpu_to_le32(REMOVE);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
- return sendmsg(cinfo, &cmsg);
+ return sendmsg(cinfo, &cmsg, 1);
}
static int lock_all_bitmaps(struct mddev *mddev)
@@ -1243,7 +1433,7 @@ static int gather_bitmaps(struct md_rdev *rdev)
cmsg.type = cpu_to_le32(RE_ADD);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
- err = sendmsg(cinfo, &cmsg);
+ err = sendmsg(cinfo, &cmsg, 1);
if (err)
goto out;
@@ -1281,6 +1471,7 @@ static struct md_cluster_operations cluster_ops = {
.gather_bitmaps = gather_bitmaps,
.lock_all_bitmaps = lock_all_bitmaps,
.unlock_all_bitmaps = unlock_all_bitmaps,
+ .update_size = update_size,
};
static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index e765499ba591..274016177983 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -27,6 +27,7 @@ struct md_cluster_operations {
int (*gather_bitmaps)(struct md_rdev *rdev);
int (*lock_all_bitmaps)(struct mddev *mddev);
void (*unlock_all_bitmaps)(struct mddev *mddev);
+ void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
};
#endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f6ae1d67bcd0..82f798be964f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -65,6 +65,8 @@
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
+#include <linux/percpu-refcount.h>
+
#include <trace/events/block.h>
#include "md.h"
#include "bitmap.h"
@@ -172,6 +174,16 @@ static const struct block_device_operations md_fops;
static int start_readonly;
+/*
+ * The original mechanism for creating an md device is to create
+ * a device node in /dev and to open it. This causes races with device-close.
+ * The preferred method is to write to the "new_array" module parameter.
+ * This can avoid races.
+ * Setting create_on_open to false disables the original mechanism
+ * so all the races disappear.
+ */
+static bool create_on_open = true;
+
/* bio_clone_mddev
* like bio_clone, but with a local bio set
*/
@@ -1507,6 +1519,12 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
} else if (sb->bblog_offset != 0)
rdev->badblocks.shift = 0;
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+ rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
+ rdev->ppl.size = le16_to_cpu(sb->ppl.size);
+ rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
+ }
+
if (!refdev) {
ret = 1;
} else {
@@ -1619,6 +1637,13 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
+
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+ if (le32_to_cpu(sb->feature_map) &
+ (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
+ return -EINVAL;
+ set_bit(MD_HAS_PPL, &mddev->flags);
+ }
} else if (mddev->pers == NULL) {
/* Insist of good event counter while assembling, except for
* spares (which don't need an event count) */
@@ -1832,6 +1857,12 @@ retry:
if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
+ if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
+ sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
+ sb->ppl.size = cpu_to_le16(rdev->ppl.size);
+ }
+
rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
@@ -2072,6 +2103,10 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
if (find_rdev(mddev, rdev->bdev->bd_dev))
return -EEXIST;
+ if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
+ mddev->pers)
+ return -EROFS;
+
/* make sure rdev->sectors exceeds mddev->dev_sectors */
if (!test_bit(Journal, &rdev->flags) &&
rdev->sectors &&
@@ -2233,6 +2268,33 @@ static void export_array(struct mddev *mddev)
mddev->major_version = 0;
}
+static bool set_in_sync(struct mddev *mddev)
+{
+ WARN_ON_ONCE(!spin_is_locked(&mddev->lock));
+ if (!mddev->in_sync) {
+ mddev->sync_checkers++;
+ spin_unlock(&mddev->lock);
+ percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
+ spin_lock(&mddev->lock);
+ if (!mddev->in_sync &&
+ percpu_ref_is_zero(&mddev->writes_pending)) {
+ mddev->in_sync = 1;
+ /*
+ * Ensure ->in_sync is visible before we clear
+ * ->sync_checkers.
+ */
+ smp_mb();
+ set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ }
+ if (--mddev->sync_checkers == 0)
+ percpu_ref_switch_to_percpu(&mddev->writes_pending);
+ }
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ return mddev->in_sync;
+}
+
static void sync_sbs(struct mddev *mddev, int nospares)
{
/* Update each superblock (in-memory image), but
@@ -3131,6 +3193,78 @@ static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
static struct rdev_sysfs_entry rdev_unack_bad_blocks =
__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
+static ssize_t
+ppl_sector_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
+}
+
+static ssize_t
+ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+ unsigned long long sector;
+
+ if (kstrtoull(buf, 10, &sector) < 0)
+ return -EINVAL;
+ if (sector != (sector_t)sector)
+ return -EINVAL;
+
+ if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
+ rdev->raid_disk >= 0)
+ return -EBUSY;
+
+ if (rdev->mddev->persistent) {
+ if (rdev->mddev->major_version == 0)
+ return -EINVAL;
+ if ((sector > rdev->sb_start &&
+ sector - rdev->sb_start > S16_MAX) ||
+ (sector < rdev->sb_start &&
+ rdev->sb_start - sector > -S16_MIN))
+ return -EINVAL;
+ rdev->ppl.offset = sector - rdev->sb_start;
+ } else if (!rdev->mddev->external) {
+ return -EBUSY;
+ }
+ rdev->ppl.sector = sector;
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_ppl_sector =
+__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
+
+static ssize_t
+ppl_size_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%u\n", rdev->ppl.size);
+}
+
+static ssize_t
+ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+ unsigned int size;
+
+ if (kstrtouint(buf, 10, &size) < 0)
+ return -EINVAL;
+
+ if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
+ rdev->raid_disk >= 0)
+ return -EBUSY;
+
+ if (rdev->mddev->persistent) {
+ if (rdev->mddev->major_version == 0)
+ return -EINVAL;
+ if (size > U16_MAX)
+ return -EINVAL;
+ } else if (!rdev->mddev->external) {
+ return -EBUSY;
+ }
+ rdev->ppl.size = size;
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_ppl_size =
+__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
+
static struct attribute *rdev_default_attrs[] = {
&rdev_state.attr,
&rdev_errors.attr,
@@ -3141,6 +3275,8 @@ static struct attribute *rdev_default_attrs[] = {
&rdev_recovery_start.attr,
&rdev_bad_blocks.attr,
&rdev_unack_bad_blocks.attr,
+ &rdev_ppl_sector.attr,
+ &rdev_ppl_size.attr,
NULL,
};
static ssize_t
@@ -3903,6 +4039,7 @@ array_state_show(struct mddev *mddev, char *page)
st = read_auto;
break;
case 0:
+ spin_lock(&mddev->lock);
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
st = write_pending;
else if (mddev->in_sync)
@@ -3911,6 +4048,7 @@ array_state_show(struct mddev *mddev, char *page)
st = active_idle;
else
st = active;
+ spin_unlock(&mddev->lock);
}
else {
if (list_empty(&mddev->disks) &&
@@ -3931,7 +4069,7 @@ static int restart_array(struct mddev *mddev);
static ssize_t
array_state_store(struct mddev *mddev, const char *buf, size_t len)
{
- int err;
+ int err = 0;
enum array_state st = match_word(buf, array_states);
if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
@@ -3944,18 +4082,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
wake_up(&mddev->sb_wait);
- err = 0;
} else /* st == clean */ {
restart_array(mddev);
- if (atomic_read(&mddev->writes_pending) == 0) {
- if (mddev->in_sync == 0) {
- mddev->in_sync = 1;
- if (mddev->safemode == 1)
- mddev->safemode = 0;
- set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
- }
- err = 0;
- } else
+ if (!set_in_sync(mddev))
err = -EBUSY;
}
if (!err)
@@ -4013,15 +4142,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
if (err)
break;
spin_lock(&mddev->lock);
- if (atomic_read(&mddev->writes_pending) == 0) {
- if (mddev->in_sync == 0) {
- mddev->in_sync = 1;
- if (mddev->safemode == 1)
- mddev->safemode = 0;
- set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
- }
- err = 0;
- } else
+ if (!set_in_sync(mddev))
err = -EBUSY;
spin_unlock(&mddev->lock);
} else
@@ -4843,8 +4964,10 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len)
return err;
/* cluster raid doesn't support change array_sectors */
- if (mddev_is_clustered(mddev))
+ if (mddev_is_clustered(mddev)) {
+ mddev_unlock(mddev);
return -EINVAL;
+ }
if (strncmp(buf, "default", 7) == 0) {
if (mddev->pers)
@@ -4877,6 +5000,52 @@ static struct md_sysfs_entry md_array_size =
__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
array_size_store);
+static ssize_t
+consistency_policy_show(struct mddev *mddev, char *page)
+{
+ int ret;
+
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+ ret = sprintf(page, "journal\n");
+ } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+ ret = sprintf(page, "ppl\n");
+ } else if (mddev->bitmap) {
+ ret = sprintf(page, "bitmap\n");
+ } else if (mddev->pers) {
+ if (mddev->pers->sync_request)
+ ret = sprintf(page, "resync\n");
+ else
+ ret = sprintf(page, "none\n");
+ } else {
+ ret = sprintf(page, "unknown\n");
+ }
+
+ return ret;
+}
+
+static ssize_t
+consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int err = 0;
+
+ if (mddev->pers) {
+ if (mddev->pers->change_consistency_policy)
+ err = mddev->pers->change_consistency_policy(mddev, buf);
+ else
+ err = -EBUSY;
+ } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
+ set_bit(MD_HAS_PPL, &mddev->flags);
+ } else {
+ err = -EINVAL;
+ }
+
+ return err ? err : len;
+}
+
+static struct md_sysfs_entry md_consistency_policy =
+__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
+ consistency_policy_store);
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,
@@ -4892,6 +5061,7 @@ static struct attribute *md_default_attrs[] = {
&md_reshape_direction.attr,
&md_array_size.attr,
&max_corr_read_errors.attr,
+ &md_consistency_policy.attr,
NULL,
};
@@ -4976,6 +5146,7 @@ static void md_free(struct kobject *ko)
del_gendisk(mddev->gendisk);
put_disk(mddev->gendisk);
}
+ percpu_ref_exit(&mddev->writes_pending);
kfree(mddev);
}
@@ -5001,8 +5172,19 @@ static void mddev_delayed_delete(struct work_struct *ws)
kobject_put(&mddev->kobj);
}
+static void no_op(struct percpu_ref *r) {}
+
static int md_alloc(dev_t dev, char *name)
{
+ /*
+ * If dev is zero, name is the name of a device to allocate with
+ * an arbitrary minor number. It will be "md_???"
+ * If dev is non-zero it must be a device number with a MAJOR of
+ * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then
+ * the device is being created by opening a node in /dev.
+ * If "name" is not NULL, the device is being created by
+ * writing to /sys/module/md_mod/parameters/new_array.
+ */
static DEFINE_MUTEX(disks_mutex);
struct mddev *mddev = mddev_find(dev);
struct gendisk *disk;
@@ -5028,7 +5210,7 @@ static int md_alloc(dev_t dev, char *name)
if (mddev->gendisk)
goto abort;
- if (name) {
+ if (name && !dev) {
/* Need to ensure that 'name' is not a duplicate.
*/
struct mddev *mddev2;
@@ -5042,6 +5224,11 @@ static int md_alloc(dev_t dev, char *name)
}
spin_unlock(&all_mddevs_lock);
}
+ if (name && dev)
+ /*
+ * Creating /dev/mdNNN via "new_array", so adjust hold_active.
+ */
+ mddev->hold_active = UNTIL_STOP;
error = -ENOMEM;
mddev->queue = blk_alloc_queue(GFP_KERNEL);
@@ -5052,6 +5239,10 @@ static int md_alloc(dev_t dev, char *name)
blk_queue_make_request(mddev->queue, md_make_request);
blk_set_stacking_limits(&mddev->queue->limits);
+ if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
+ goto abort;
+ /* We want to start with the refcount at zero */
+ percpu_ref_put(&mddev->writes_pending);
disk = alloc_disk(1 << shift);
if (!disk) {
blk_cleanup_queue(mddev->queue);
@@ -5108,38 +5299,48 @@ static int md_alloc(dev_t dev, char *name)
static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
- md_alloc(dev, NULL);
+ if (create_on_open)
+ md_alloc(dev, NULL);
return NULL;
}
static int add_named_array(const char *val, struct kernel_param *kp)
{
- /* val must be "md_*" where * is not all digits.
- * We allocate an array with a large free minor number, and
+ /*
+ * val must be "md_*" or "mdNNN".
+ * For "md_*" we allocate an array with a large free minor number, and
* set the name to val. val must not already be an active name.
+ * For "mdNNN" we allocate an array with the minor number NNN
+ * which must not already be in use.
*/
int len = strlen(val);
char buf[DISK_NAME_LEN];
+ unsigned long devnum;
while (len && val[len-1] == '\n')
len--;
if (len >= DISK_NAME_LEN)
return -E2BIG;
strlcpy(buf, val, len+1);
- if (strncmp(buf, "md_", 3) != 0)
- return -EINVAL;
- return md_alloc(0, buf);
+ if (strncmp(buf, "md_", 3) == 0)
+ return md_alloc(0, buf);
+ if (strncmp(buf, "md", 2) == 0 &&
+ isdigit(buf[2]) &&
+ kstrtoul(buf+2, 10, &devnum) == 0 &&
+ devnum <= MINORMASK)
+ return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
+
+ return -EINVAL;
}
static void md_safemode_timeout(unsigned long data)
{
struct mddev *mddev = (struct mddev *) data;
- if (!atomic_read(&mddev->writes_pending)) {
- mddev->safemode = 1;
- if (mddev->external)
- sysfs_notify_dirent_safe(mddev->sysfs_state);
- }
+ mddev->safemode = 1;
+ if (mddev->external)
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+
md_wakeup_thread(mddev->thread);
}
@@ -5185,6 +5386,13 @@ int md_run(struct mddev *mddev)
continue;
sync_blockdev(rdev->bdev);
invalidate_bdev(rdev->bdev);
+ if (mddev->ro != 1 &&
+ (bdev_read_only(rdev->bdev) ||
+ bdev_read_only(rdev->meta_bdev))) {
+ mddev->ro = 1;
+ if (mddev->gendisk)
+ set_disk_ro(mddev->gendisk, 1);
+ }
/* perform some consistency tests on the device.
* We don't want the data to overlap the metadata,
@@ -5344,7 +5552,6 @@ int md_run(struct mddev *mddev)
} else if (mddev->ro == 2) /* auto-readonly not meaningful */
mddev->ro = 0;
- atomic_set(&mddev->writes_pending,0);
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
@@ -5410,6 +5617,9 @@ out:
static int restart_array(struct mddev *mddev)
{
struct gendisk *disk = mddev->gendisk;
+ struct md_rdev *rdev;
+ bool has_journal = false;
+ bool has_readonly = false;
/* Complain if it has no devices */
if (list_empty(&mddev->disks))
@@ -5418,24 +5628,21 @@ static int restart_array(struct mddev *mddev)
return -EINVAL;
if (!mddev->ro)
return -EBUSY;
- if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
- struct md_rdev *rdev;
- bool has_journal = false;
-
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev) {
- if (test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags)) {
- has_journal = true;
- break;
- }
- }
- rcu_read_unlock();
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags))
+ has_journal = true;
+ if (bdev_read_only(rdev->bdev))
+ has_readonly = true;
+ }
+ rcu_read_unlock();
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
/* Don't restart rw with journal missing/faulty */
- if (!has_journal)
return -EINVAL;
- }
+ if (has_readonly)
+ return -EROFS;
mddev->safemode = 0;
mddev->ro = 0;
@@ -5535,15 +5742,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
static void mddev_detach(struct mddev *mddev)
{
- struct bitmap *bitmap = mddev->bitmap;
- /* wait for behind writes to complete */
- if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
- pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
- mdname(mddev));
- /* need to kick something here to make sure I/O goes? */
- wait_event(bitmap->behind_wait,
- atomic_read(&bitmap->behind_writes) == 0);
- }
+ bitmap_wait_behind_writes(mddev);
if (mddev->pers && mddev->pers->quiesce) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
@@ -5556,6 +5755,7 @@ static void mddev_detach(struct mddev *mddev)
static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
+ bitmap_destroy(mddev);
mddev_detach(mddev);
/* Ensure ->event_work is done */
flush_workqueue(md_misc_wq);
@@ -5576,7 +5776,6 @@ void md_stop(struct mddev *mddev)
* This is called from dm-raid
*/
__md_stop(mddev);
- bitmap_destroy(mddev);
if (mddev->bio_set)
bioset_free(mddev->bio_set);
}
@@ -5714,7 +5913,6 @@ static int do_md_stop(struct mddev *mddev, int mode,
if (mode == 0) {
pr_info("md: %s stopped.\n", mdname(mddev));
- bitmap_destroy(mddev);
if (mddev->bitmap_info.file) {
struct file *f = mddev->bitmap_info.file;
spin_lock(&mddev->lock);
@@ -6493,10 +6691,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
struct md_rdev *rdev;
int rv;
int fit = (num_sectors == 0);
-
- /* cluster raid doesn't support update size */
- if (mddev_is_clustered(mddev))
- return -EINVAL;
+ sector_t old_dev_sectors = mddev->dev_sectors;
if (mddev->pers->resize == NULL)
return -EINVAL;
@@ -6525,7 +6720,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
}
rv = mddev->pers->resize(mddev, num_sectors);
if (!rv) {
- if (mddev->queue) {
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->update_size(mddev, old_dev_sectors);
+ else if (mddev->queue) {
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
}
@@ -6776,6 +6973,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
void __user *argp = (void __user *)arg;
struct mddev *mddev = NULL;
int ro;
+ bool did_set_md_closing = false;
if (!md_ioctl_valid(cmd))
return -ENOTTY;
@@ -6865,7 +7063,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
err = -EBUSY;
goto out;
}
+ WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
set_bit(MD_CLOSING, &mddev->flags);
+ did_set_md_closing = true;
mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev);
}
@@ -7058,6 +7258,8 @@ unlock:
mddev->hold_active = 0;
mddev_unlock(mddev);
out:
+ if(did_set_md_closing)
+ clear_bit(MD_CLOSING, &mddev->flags);
return err;
}
#ifdef CONFIG_COMPAT
@@ -7208,8 +7410,8 @@ void md_wakeup_thread(struct md_thread *thread)
{
if (thread) {
pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
- set_bit(THREAD_WAKEUP, &thread->flags);
- wake_up(&thread->wqueue);
+ if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags))
+ wake_up(&thread->wqueue);
}
}
EXPORT_SYMBOL(md_wakeup_thread);
@@ -7756,10 +7958,13 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
md_wakeup_thread(mddev->sync_thread);
did_change = 1;
}
- atomic_inc(&mddev->writes_pending);
+ rcu_read_lock();
+ percpu_ref_get(&mddev->writes_pending);
+ smp_mb(); /* Match smp_mb in set_in_sync() */
if (mddev->safemode == 1)
mddev->safemode = 0;
- if (mddev->in_sync) {
+ /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
+ if (mddev->in_sync || !mddev->sync_checkers) {
spin_lock(&mddev->lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
@@ -7770,6 +7975,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
}
spin_unlock(&mddev->lock);
}
+ rcu_read_unlock();
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
wait_event(mddev->sb_wait,
@@ -7777,15 +7983,38 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
}
EXPORT_SYMBOL(md_write_start);
+/* md_write_inc can only be called when md_write_start() has
+ * already been called at least once for the current request.
+ * It increments the counter and is useful when a single request
+ * is split into several parts. Each part causes an increment and
+ * so needs a matching md_write_end().
+ * Unlike md_write_start(), it is safe to call md_write_inc() inside
+ * a spinlocked region.
+ */
+void md_write_inc(struct mddev *mddev, struct bio *bi)
+{
+ if (bio_data_dir(bi) != WRITE)
+ return;
+ WARN_ON_ONCE(mddev->in_sync || mddev->ro);
+ percpu_ref_get(&mddev->writes_pending);
+}
+EXPORT_SYMBOL(md_write_inc);
+
void md_write_end(struct mddev *mddev)
{
- if (atomic_dec_and_test(&mddev->writes_pending)) {
- if (mddev->safemode == 2)
- md_wakeup_thread(mddev->thread);
- else if (mddev->safemode_delay)
- mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
- }
+ percpu_ref_put(&mddev->writes_pending);
+
+ if (mddev->safemode == 2)
+ md_wakeup_thread(mddev->thread);
+ else if (mddev->safemode_delay)
+ /* The roundup() ensures this only performs locking once
+ * every ->safemode_delay jiffies
+ */
+ mod_timer(&mddev->safemode_timer,
+ roundup(jiffies, mddev->safemode_delay) +
+ mddev->safemode_delay);
}
+
EXPORT_SYMBOL(md_write_end);
/* md_allow_write(mddev)
@@ -8385,9 +8614,8 @@ void md_check_recovery(struct mddev *mddev)
(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
- test_bit(MD_RELOAD_SB, &mddev->flags) ||
(mddev->external == 0 && mddev->safemode == 1) ||
- (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
+ (mddev->safemode == 2
&& !mddev->in_sync && mddev->recovery_cp == MaxSector)
))
return;
@@ -8434,27 +8662,12 @@ void md_check_recovery(struct mddev *mddev)
rdev->raid_disk < 0)
md_kick_rdev_from_array(rdev);
}
-
- if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags))
- md_reload_sb(mddev, mddev->good_device_nr);
}
- if (!mddev->external) {
- int did_change = 0;
+ if (!mddev->external && !mddev->in_sync) {
spin_lock(&mddev->lock);
- if (mddev->safemode &&
- !atomic_read(&mddev->writes_pending) &&
- !mddev->in_sync &&
- mddev->recovery_cp == MaxSector) {
- mddev->in_sync = 1;
- did_change = 1;
- set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
- }
- if (mddev->safemode == 1)
- mddev->safemode = 0;
+ set_in_sync(mddev);
spin_unlock(&mddev->lock);
- if (did_change)
- sysfs_notify_dirent_safe(mddev->sysfs_state);
}
if (mddev->sb_flags)
@@ -8747,6 +8960,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
int role, ret;
char b[BDEVNAME_SIZE];
+ /*
+ * If size is changed in another node then we need to
+ * do resize as well.
+ */
+ if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
+ ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
+ if (ret)
+ pr_info("md-cluster: resize failed\n");
+ else
+ bitmap_update_sb(mddev->bitmap);
+ }
+
/* Check for change of roles in the active devices */
rdev_for_each(rdev2, mddev) {
if (test_bit(Faulty, &rdev2->flags))
@@ -8997,6 +9222,7 @@ static int set_ro(const char *val, struct kernel_param *kp)
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
+module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1e76d64ce180..4e75d121bfcc 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -122,6 +122,13 @@ struct md_rdev {
* sysfs entry */
struct badblocks badblocks;
+
+ struct {
+ short offset; /* Offset from superblock to start of PPL.
+ * Not used by external metadata. */
+ unsigned int size; /* Size in sectors of the PPL space */
+ sector_t sector; /* First sector of the PPL space */
+ } ppl;
};
enum flag_bits {
Faulty, /* device is known to have a fault */
@@ -219,9 +226,6 @@ enum mddev_flags {
* it then */
MD_JOURNAL_CLEAN, /* A raid with journal is already clean */
MD_HAS_JOURNAL, /* The raid array has journal feature set */
- MD_RELOAD_SB, /* Reload the superblock because another node
- * updated it.
- */
MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
* already took resync lock, need to
* release the lock */
@@ -229,6 +233,7 @@ enum mddev_flags {
* supported as calls to md_error() will
* never cause the array to become failed.
*/
+ MD_HAS_PPL, /* The raid array has PPL feature set */
};
enum mddev_sb_flags {
@@ -404,7 +409,8 @@ struct mddev {
*/
unsigned int safemode_delay;
struct timer_list safemode_timer;
- atomic_t writes_pending;
+ struct percpu_ref writes_pending;
+ int sync_checkers; /* # of threads checking writes_pending */
struct request_queue *queue; /* for plugging ... */
struct bitmap *bitmap; /* the bitmap for the device */
@@ -540,6 +546,8 @@ struct md_personality
/* congested implements bdi.congested_fn().
* Will not be called while array is 'suspended' */
int (*congested)(struct mddev *mddev, int bits);
+ /* Changes the consistency policy of an active array. */
+ int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
};
struct md_sysfs_entry {
@@ -641,6 +649,7 @@ extern void md_wakeup_thread(struct md_thread *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
extern void md_write_start(struct mddev *mddev, struct bio *bi);
+extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
@@ -716,4 +725,58 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio
!bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
mddev->queue->limits.max_write_zeroes_sectors = 0;
}
+
+/* Maximum size of each resync request */
+#define RESYNC_BLOCK_SIZE (64*1024)
+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
+
+/* for managing resync I/O pages */
+struct resync_pages {
+ unsigned idx; /* for get/put page from the pool */
+ void *raid_bio;
+ struct page *pages[RESYNC_PAGES];
+};
+
+static inline int resync_alloc_pages(struct resync_pages *rp,
+ gfp_t gfp_flags)
+{
+ int i;
+
+ for (i = 0; i < RESYNC_PAGES; i++) {
+ rp->pages[i] = alloc_page(gfp_flags);
+ if (!rp->pages[i])
+ goto out_free;
+ }
+
+ return 0;
+
+out_free:
+ while (--i >= 0)
+ put_page(rp->pages[i]);
+ return -ENOMEM;
+}
+
+static inline void resync_free_pages(struct resync_pages *rp)
+{
+ int i;
+
+ for (i = 0; i < RESYNC_PAGES; i++)
+ put_page(rp->pages[i]);
+}
+
+static inline void resync_get_all_pages(struct resync_pages *rp)
+{
+ int i;
+
+ for (i = 0; i < RESYNC_PAGES; i++)
+ get_page(rp->pages[i]);
+}
+
+static inline struct page *resync_fetch_page(struct resync_pages *rp,
+ unsigned idx)
+{
+ if (WARN_ON_ONCE(idx >= RESYNC_PAGES))
+ return NULL;
+ return rp->pages[idx];
+}
#endif /* _MD_MD_H */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ce7a6a56cf73..84e58596594d 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -29,7 +29,8 @@
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \
- (1L << MD_FAILFAST_SUPPORTED))
+ (1L << MD_FAILFAST_SUPPORTED) |\
+ (1L << MD_HAS_PPL))
static int raid0_congested(struct mddev *mddev, int bits)
{
@@ -462,53 +463,54 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
{
struct strip_zone *zone;
struct md_rdev *tmp_dev;
- struct bio *split;
+ sector_t bio_sector;
+ sector_t sector;
+ unsigned chunk_sects;
+ unsigned sectors;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
return;
}
- do {
- sector_t bio_sector = bio->bi_iter.bi_sector;
- sector_t sector = bio_sector;
- unsigned chunk_sects = mddev->chunk_sectors;
+ bio_sector = bio->bi_iter.bi_sector;
+ sector = bio_sector;
+ chunk_sects = mddev->chunk_sectors;
- unsigned sectors = chunk_sects -
- (likely(is_power_of_2(chunk_sects))
- ? (sector & (chunk_sects-1))
- : sector_div(sector, chunk_sects));
+ sectors = chunk_sects -
+ (likely(is_power_of_2(chunk_sects))
+ ? (sector & (chunk_sects-1))
+ : sector_div(sector, chunk_sects));
- /* Restore due to sector_div */
- sector = bio_sector;
+ /* Restore due to sector_div */
+ sector = bio_sector;
- if (sectors < bio_sectors(bio)) {
- split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
+ if (sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, sectors, GFP_NOIO, mddev->bio_set);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ }
- zone = find_zone(mddev->private, &sector);
- tmp_dev = map_sector(mddev, zone, sector, &sector);
- split->bi_bdev = tmp_dev->bdev;
- split->bi_iter.bi_sector = sector + zone->dev_start +
- tmp_dev->data_offset;
-
- if (unlikely((bio_op(split) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
- /* Just ignore it */
- bio_endio(split);
- } else {
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
- split, disk_devt(mddev->gendisk),
- bio_sector);
- mddev_check_writesame(mddev, split);
- mddev_check_write_zeroes(mddev, split);
- generic_make_request(split);
- }
- } while (split != bio);
+ zone = find_zone(mddev->private, &sector);
+ tmp_dev = map_sector(mddev, zone, sector, &sector);
+ bio->bi_bdev = tmp_dev->bdev;
+ bio->bi_iter.bi_sector = sector + zone->dev_start +
+ tmp_dev->data_offset;
+
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(bio);
+ } else {
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+ bio, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_writesame(mddev, bio);
+ mddev_check_write_zeroes(mddev, bio);
+ generic_make_request(bio);
+ }
}
static void raid0_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b59cc100320a..7ed59351fe97 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -47,7 +47,8 @@
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
- (1L << MD_JOURNAL_CLEAN))
+ (1L << MD_JOURNAL_CLEAN) | \
+ (1L << MD_HAS_PPL))
/*
* Number of guaranteed r1bios in case of extreme VM load:
@@ -80,6 +81,24 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#define raid1_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
+/*
+ * 'struct resync_pages' stores actual pages used for doing the resync
+ * IO, and it is per-bio, so make .bi_private point to it.
+ */
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+ return bio->bi_private;
+}
+
+/*
+ * for resync bio, r1bio pointer can be retrieved from the per-bio
+ * 'struct resync_pages'.
+ */
+static inline struct r1bio *get_resync_r1bio(struct bio *bio)
+{
+ return get_resync_pages(bio)->raid_bio;
+}
+
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct pool_info *pi = data;
@@ -94,10 +113,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
kfree(r1_bio);
}
-#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_DEPTH 32
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
@@ -109,12 +126,18 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
struct r1bio *r1_bio;
struct bio *bio;
int need_pages;
- int i, j;
+ int j;
+ struct resync_pages *rps;
r1_bio = r1bio_pool_alloc(gfp_flags, pi);
if (!r1_bio)
return NULL;
+ rps = kmalloc(sizeof(struct resync_pages) * pi->raid_disks,
+ gfp_flags);
+ if (!rps)
+ goto out_free_r1bio;
+
/*
* Allocate bios : 1 for reading, n-1 for writing
*/
@@ -134,19 +157,22 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
need_pages = pi->raid_disks;
else
need_pages = 1;
- for (j = 0; j < need_pages; j++) {
+ for (j = 0; j < pi->raid_disks; j++) {
+ struct resync_pages *rp = &rps[j];
+
bio = r1_bio->bios[j];
- bio->bi_vcnt = RESYNC_PAGES;
- if (bio_alloc_pages(bio, gfp_flags))
- goto out_free_pages;
- }
- /* If not user-requests, copy the page pointers to all bios */
- if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
- for (i=0; i<RESYNC_PAGES ; i++)
- for (j=1; j<pi->raid_disks; j++)
- r1_bio->bios[j]->bi_io_vec[i].bv_page =
- r1_bio->bios[0]->bi_io_vec[i].bv_page;
+ if (j < need_pages) {
+ if (resync_alloc_pages(rp, gfp_flags))
+ goto out_free_pages;
+ } else {
+ memcpy(rp, &rps[0], sizeof(*rp));
+ resync_get_all_pages(rp);
+ }
+
+ rp->idx = 0;
+ rp->raid_bio = r1_bio;
+ bio->bi_private = rp;
}
r1_bio->master_bio = NULL;
@@ -155,11 +181,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
out_free_pages:
while (--j >= 0)
- bio_free_pages(r1_bio->bios[j]);
+ resync_free_pages(&rps[j]);
out_free_bio:
while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]);
+ kfree(rps);
+
+out_free_r1bio:
r1bio_pool_free(r1_bio, data);
return NULL;
}
@@ -167,18 +196,18 @@ out_free_bio:
static void r1buf_pool_free(void *__r1_bio, void *data)
{
struct pool_info *pi = data;
- int i,j;
+ int i;
struct r1bio *r1bio = __r1_bio;
+ struct resync_pages *rp = NULL;
- for (i = 0; i < RESYNC_PAGES; i++)
- for (j = pi->raid_disks; j-- ;) {
- if (j == 0 ||
- r1bio->bios[j]->bi_io_vec[i].bv_page !=
- r1bio->bios[0]->bi_io_vec[i].bv_page)
- safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
- }
- for (i=0 ; i < pi->raid_disks; i++)
+ for (i = pi->raid_disks; i--; ) {
+ rp = get_resync_pages(r1bio->bios[i]);
+ resync_free_pages(rp);
bio_put(r1bio->bios[i]);
+ }
+
+ /* resync pages array stored in the 1st bio's .bi_private */
+ kfree(rp);
r1bio_pool_free(r1bio, data);
}
@@ -245,35 +274,17 @@ static void reschedule_retry(struct r1bio *r1_bio)
static void call_bio_endio(struct r1bio *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
- int done;
struct r1conf *conf = r1_bio->mddev->private;
- sector_t bi_sector = bio->bi_iter.bi_sector;
-
- if (bio->bi_phys_segments) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- bio->bi_phys_segments--;
- done = (bio->bi_phys_segments == 0);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * make_request() might be waiting for
- * bi_phys_segments to decrease
- */
- wake_up(&conf->wait_barrier);
- } else
- done = 1;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
bio->bi_error = -EIO;
- if (done) {
- bio_endio(bio);
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf, bi_sector);
- }
+ bio_endio(bio);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf, r1_bio->sector);
}
static void raid_end_bio_io(struct r1bio *r1_bio)
@@ -377,12 +388,9 @@ static void close_write(struct r1bio *r1_bio)
{
/* it really is the end of this request */
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- /* free extra copy of the data pages */
- int i = r1_bio->behind_page_count;
- while (i--)
- safe_put_page(r1_bio->behind_bvecs[i].bv_page);
- kfree(r1_bio->behind_bvecs);
- r1_bio->behind_bvecs = NULL;
+ bio_free_pages(r1_bio->behind_master_bio);
+ bio_put(r1_bio->behind_master_bio);
+ r1_bio->behind_master_bio = NULL;
}
/* clear the bitmap if all writes complete successfully */
bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
@@ -484,6 +492,10 @@ static void raid1_end_write_request(struct bio *bio)
}
if (behind) {
+ /* we release behind master bio when all writes are done */
+ if (r1_bio->behind_master_bio == bio)
+ to_put = NULL;
+
if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining);
@@ -775,6 +787,30 @@ static int raid1_congested(struct mddev *mddev, int bits)
return ret;
}
+static void flush_bio_list(struct r1conf *conf, struct bio *bio)
+{
+ /* flush any pending bitmap writes to disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ struct md_rdev *rdev = (void*)bio->bi_bdev;
+ bio->bi_next = NULL;
+ bio->bi_bdev = rdev->bdev;
+ if (test_bit(Faulty, &rdev->flags)) {
+ bio->bi_error = -EIO;
+ bio_endio(bio);
+ } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio);
+ else
+ generic_make_request(bio);
+ bio = next;
+ }
+}
+
static void flush_pending_writes(struct r1conf *conf)
{
/* Any writes that have been queued but are awaiting
@@ -787,27 +823,7 @@ static void flush_pending_writes(struct r1conf *conf)
bio = bio_list_get(&conf->pending_bio_list);
conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock);
- /* flush any pending bitmap writes to
- * disk before proceeding w/ I/O */
- bitmap_unplug(conf->mddev->bitmap);
- wake_up(&conf->wait_barrier);
-
- while (bio) { /* submit pending writes */
- struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_bdev;
- bio->bi_next = NULL;
- bio->bi_bdev = rdev->bdev;
- if (test_bit(Faulty, &rdev->flags)) {
- bio->bi_error = -EIO;
- bio_endio(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
- /* Just ignore it */
- bio_endio(bio);
- else
- generic_make_request(bio);
- bio = next;
- }
+ flush_bio_list(conf, bio);
} else
spin_unlock_irq(&conf->device_lock);
}
@@ -869,7 +885,7 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
conf->resync_lock);
- atomic_inc(&conf->nr_pending[idx]);
+ atomic_inc(&conf->nr_sync_pending);
spin_unlock_irq(&conf->resync_lock);
}
@@ -880,7 +896,7 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
atomic_dec(&conf->barrier[idx]);
- atomic_dec(&conf->nr_pending[idx]);
+ atomic_dec(&conf->nr_sync_pending);
wake_up(&conf->wait_barrier);
}
@@ -1017,7 +1033,8 @@ static int get_unqueued_pending(struct r1conf *conf)
{
int idx, ret;
- for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+ ret = atomic_read(&conf->nr_sync_pending);
+ for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
ret += atomic_read(&conf->nr_pending[idx]) -
atomic_read(&conf->nr_queued[idx]);
@@ -1068,39 +1085,49 @@ static void unfreeze_array(struct r1conf *conf)
wake_up(&conf->wait_barrier);
}
-/* duplicate the data pages for behind I/O
- */
-static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
+static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio,
+ struct bio *bio)
{
- int i;
- struct bio_vec *bvec;
- struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
- GFP_NOIO);
- if (unlikely(!bvecs))
- return;
+ int size = bio->bi_iter.bi_size;
+ unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ int i = 0;
+ struct bio *behind_bio = NULL;
+
+ behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev);
+ if (!behind_bio)
+ goto fail;
- bio_for_each_segment_all(bvec, bio, i) {
- bvecs[i] = *bvec;
- bvecs[i].bv_page = alloc_page(GFP_NOIO);
- if (unlikely(!bvecs[i].bv_page))
- goto do_sync_io;
- memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
- kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
- kunmap(bvecs[i].bv_page);
- kunmap(bvec->bv_page);
- }
- r1_bio->behind_bvecs = bvecs;
- r1_bio->behind_page_count = bio->bi_vcnt;
+ /* discard op, we don't support writezero/writesame yet */
+ if (!bio_has_data(bio))
+ goto skip_copy;
+
+ while (i < vcnt && size) {
+ struct page *page;
+ int len = min_t(int, PAGE_SIZE, size);
+
+ page = alloc_page(GFP_NOIO);
+ if (unlikely(!page))
+ goto free_pages;
+
+ bio_add_page(behind_bio, page, len, 0);
+
+ size -= len;
+ i++;
+ }
+
+ bio_copy_data(behind_bio, bio);
+skip_copy:
+ r1_bio->behind_master_bio = behind_bio;;
set_bit(R1BIO_BehindIO, &r1_bio->state);
- return;
-do_sync_io:
- for (i = 0; i < bio->bi_vcnt; i++)
- if (bvecs[i].bv_page)
- put_page(bvecs[i].bv_page);
- kfree(bvecs);
+ return behind_bio;
+
+free_pages:
pr_debug("%dB behind alloc failed, doing sync I/O\n",
bio->bi_iter.bi_size);
+ bio_free_pages(behind_bio);
+fail:
+ return behind_bio;
}
struct raid1_plug_cb {
@@ -1130,91 +1157,102 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
- bitmap_unplug(mddev->bitmap);
- wake_up(&conf->wait_barrier);
-
- while (bio) { /* submit pending writes */
- struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_bdev;
- bio->bi_next = NULL;
- bio->bi_bdev = rdev->bdev;
- if (test_bit(Faulty, &rdev->flags)) {
- bio->bi_error = -EIO;
- bio_endio(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
- /* Just ignore it */
- bio_endio(bio);
- else
- generic_make_request(bio);
- bio = next;
- }
+ flush_bio_list(conf, bio);
kfree(plug);
}
+static void init_r1bio(struct r1bio *r1_bio, struct mddev *mddev, struct bio *bio)
+{
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = bio_sectors(bio);
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_iter.bi_sector;
+}
+
static inline struct r1bio *
-alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
+alloc_r1bio(struct mddev *mddev, struct bio *bio)
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
- r1_bio->master_bio = bio;
- r1_bio->sectors = bio_sectors(bio) - sectors_handled;
- r1_bio->state = 0;
- r1_bio->mddev = mddev;
- r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
-
+ /* Ensure no bio records IO_BLOCKED */
+ memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
+ init_r1bio(r1_bio, mddev, bio);
return r1_bio;
}
-static void raid1_read_request(struct mddev *mddev, struct bio *bio)
+static void raid1_read_request(struct mddev *mddev, struct bio *bio,
+ int max_read_sectors, struct r1bio *r1_bio)
{
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
- struct r1bio *r1_bio;
struct bio *read_bio;
struct bitmap *bitmap = mddev->bitmap;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
- int sectors_handled;
int max_sectors;
int rdisk;
+ bool print_msg = !!r1_bio;
+ char b[BDEVNAME_SIZE];
/*
- * Still need barrier for READ in case that whole
- * array is frozen.
+ * If r1_bio is set, we are blocking the raid1d thread
+ * so there is a tiny risk of deadlock. So ask for
+ * emergency memory if needed.
*/
- wait_read_barrier(conf, bio->bi_iter.bi_sector);
+ gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
- r1_bio = alloc_r1bio(mddev, bio, 0);
+ if (print_msg) {
+ /* Need to get the block device name carefully */
+ struct md_rdev *rdev;
+ rcu_read_lock();
+ rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
+ if (rdev)
+ bdevname(rdev->bdev, b);
+ else
+ strcpy(b, "???");
+ rcu_read_unlock();
+ }
/*
- * We might need to issue multiple reads to different
- * devices if there are bad blocks around, so we keep
- * track of the number of reads in bio->bi_phys_segments.
- * If this is 0, there is only one r1_bio and no locking
- * will be needed when requests complete. If it is
- * non-zero, then it is the number of not-completed requests.
+ * Still need barrier for READ in case that whole
+ * array is frozen.
*/
- bio->bi_phys_segments = 0;
- bio_clear_flag(bio, BIO_SEG_VALID);
+ wait_read_barrier(conf, bio->bi_iter.bi_sector);
+
+ if (!r1_bio)
+ r1_bio = alloc_r1bio(mddev, bio);
+ else
+ init_r1bio(r1_bio, mddev, bio);
+ r1_bio->sectors = max_read_sectors;
/*
* make_request() can abort the operation when read-ahead is being
* used and no empty request is available.
*/
-read_again:
rdisk = read_balance(conf, r1_bio, &max_sectors);
if (rdisk < 0) {
/* couldn't find anywhere to read from */
+ if (print_msg) {
+ pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
+ mdname(mddev),
+ b,
+ (unsigned long long)r1_bio->sector);
+ }
raid_end_bio_io(r1_bio);
return;
}
mirror = conf->mirrors + rdisk;
+ if (print_msg)
+ pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n",
+ mdname(mddev),
+ (unsigned long long)r1_bio->sector,
+ bdevname(mirror->rdev->bdev, b));
+
if (test_bit(WriteMostly, &mirror->rdev->flags) &&
bitmap) {
/*
@@ -1225,11 +1263,20 @@ read_again:
wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0);
}
+
+ if (max_sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, max_sectors,
+ gfp, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = max_sectors;
+ }
+
r1_bio->read_disk = rdisk;
- read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
+ read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
r1_bio->bios[rdisk] = read_bio;
@@ -1248,35 +1295,11 @@ read_again:
read_bio, disk_devt(mddev->gendisk),
r1_bio->sector);
- if (max_sectors < r1_bio->sectors) {
- /*
- * could not read all from this device, so we will need another
- * r1_bio.
- */
- sectors_handled = (r1_bio->sector + max_sectors
- - bio->bi_iter.bi_sector);
- r1_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
-
- /*
- * Cannot call generic_make_request directly as that will be
- * queued in __make_request and subsequent mempool_alloc might
- * block waiting for it. So hand bio over to raid1d.
- */
- reschedule_retry(r1_bio);
-
- r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
- goto read_again;
- } else
- generic_make_request(read_bio);
+ generic_make_request(read_bio);
}
-static void raid1_write_request(struct mddev *mddev, struct bio *bio)
+static void raid1_write_request(struct mddev *mddev, struct bio *bio,
+ int max_write_sectors)
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
@@ -1287,7 +1310,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
struct blk_plug_cb *cb;
struct raid1_plug_cb *plug = NULL;
int first_clone;
- int sectors_handled;
int max_sectors;
/*
@@ -1326,17 +1348,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
}
wait_barrier(conf, bio->bi_iter.bi_sector);
- r1_bio = alloc_r1bio(mddev, bio, 0);
-
- /* We might need to issue multiple writes to different
- * devices if there are bad blocks around, so we keep
- * track of the number of writes in bio->bi_phys_segments.
- * If this is 0, there is only one r1_bio and no locking
- * will be needed when requests complete. If it is
- * non-zero, then it is the number of not-completed requests.
- */
- bio->bi_phys_segments = 0;
- bio_clear_flag(bio, BIO_SEG_VALID);
+ r1_bio = alloc_r1bio(mddev, bio);
+ r1_bio->sectors = max_write_sectors;
if (conf->pending_count >= max_queued_requests) {
md_wakeup_thread(mddev->thread);
@@ -1435,31 +1448,26 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
goto retry_write;
}
- if (max_sectors < r1_bio->sectors) {
- /* We are splitting this write into multiple parts, so
- * we need to prepare for allocating another r1_bio.
- */
+ if (max_sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, max_sectors,
+ GFP_NOIO, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
}
- sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
first_clone = 1;
+
for (i = 0; i < disks; i++) {
struct bio *mbio = NULL;
- sector_t offset;
if (!r1_bio->bios[i])
continue;
- offset = r1_bio->sector - bio->bi_iter.bi_sector;
if (first_clone) {
/* do behind I/O ?
@@ -1470,11 +1478,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
!waitqueue_active(&bitmap->behind_wait)) {
- mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
- mddev->bio_set,
- offset << 9,
- max_sectors << 9);
- alloc_behind_pages(mbio, r1_bio);
+ mbio = alloc_behind_master_bio(r1_bio, bio);
}
bitmap_startwrite(bitmap, r1_bio->sector,
@@ -1485,26 +1489,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
}
if (!mbio) {
- if (r1_bio->behind_bvecs)
- mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
- mddev->bio_set,
- offset << 9,
- max_sectors << 9);
- else {
+ if (r1_bio->behind_master_bio)
+ mbio = bio_clone_fast(r1_bio->behind_master_bio,
+ GFP_NOIO,
+ mddev->bio_set);
+ else
mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- bio_trim(mbio, offset, max_sectors);
- }
}
- if (r1_bio->behind_bvecs) {
- struct bio_vec *bvec;
- int j;
-
- /*
- * We trimmed the bio, so _all is legit
- */
- bio_for_each_segment_all(bvec, mbio, j)
- bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
+ if (r1_bio->behind_master_bio) {
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
@@ -1548,17 +1541,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
if (!plug)
md_wakeup_thread(mddev->thread);
}
- /* Mustn't call r1_bio_write_done before this next test,
- * as it could result in the bio being freed.
- */
- if (sectors_handled < bio_sectors(bio)) {
- r1_bio_write_done(r1_bio);
- /* We need another r1_bio. It has already been counted
- * in bio->bi_phys_segments
- */
- r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
- goto retry_write;
- }
r1_bio_write_done(r1_bio);
@@ -1568,7 +1550,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
static void raid1_make_request(struct mddev *mddev, struct bio *bio)
{
- struct bio *split;
sector_t sectors;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
@@ -1576,43 +1557,20 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio)
return;
}
- /* if bio exceeds barrier unit boundary, split it */
- do {
- sectors = align_to_barrier_unit_end(
- bio->bi_iter.bi_sector, bio_sectors(bio));
- if (sectors < bio_sectors(bio)) {
- split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
-
- if (bio_data_dir(split) == READ) {
- raid1_read_request(mddev, split);
+ /*
+ * There is a limit to the maximum size, but
+ * the read/write handler might find a lower limit
+ * due to bad blocks. To avoid multiple splits,
+ * we pass the maximum number of sectors down
+ * and let the lower level perform the split.
+ */
+ sectors = align_to_barrier_unit_end(
+ bio->bi_iter.bi_sector, bio_sectors(bio));
- /*
- * If a bio is splitted, the first part of bio will
- * pass barrier but the bio is queued in
- * current->bio_list (see generic_make_request). If
- * there is a raise_barrier() called here, the second
- * part of bio can't pass barrier. But since the first
- * part bio isn't dispatched to underlaying disks yet,
- * the barrier is never released, hence raise_barrier
- * will alays wait. We have a deadlock.
- * Note, this only happens in read path. For write
- * path, the first part of bio is dispatched in a
- * schedule() call (because of blk plug) or offloaded
- * to raid10d.
- * Quitting from the function immediately can change
- * the bio order queued in bio_list and avoid the deadlock.
- */
- if (split != bio) {
- generic_make_request(bio);
- break;
- }
- } else
- raid1_write_request(mddev, split);
- } while (split != bio);
+ if (bio_data_dir(bio) == READ)
+ raid1_read_request(mddev, bio, sectors, NULL);
+ else
+ raid1_write_request(mddev, bio, sectors);
}
static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -1874,9 +1832,9 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
p->rdev = repl;
conf->mirrors[conf->raid_disks + number].rdev = NULL;
unfreeze_array(conf);
- clear_bit(WantReplacement, &rdev->flags);
- } else
- clear_bit(WantReplacement, &rdev->flags);
+ }
+
+ clear_bit(WantReplacement, &rdev->flags);
err = md_integrity_register(mddev);
}
abort:
@@ -1887,7 +1845,7 @@ abort:
static void end_sync_read(struct bio *bio)
{
- struct r1bio *r1_bio = bio->bi_private;
+ struct r1bio *r1_bio = get_resync_r1bio(bio);
update_head_pos(r1_bio->read_disk, r1_bio);
@@ -1906,7 +1864,7 @@ static void end_sync_read(struct bio *bio)
static void end_sync_write(struct bio *bio)
{
int uptodate = !bio->bi_error;
- struct r1bio *r1_bio = bio->bi_private;
+ struct r1bio *r1_bio = get_resync_r1bio(bio);
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
sector_t first_bad;
@@ -1985,6 +1943,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+ struct page **pages = get_resync_pages(bio)->pages;
sector_t sect = r1_bio->sector;
int sectors = r1_bio->sectors;
int idx = 0;
@@ -2018,7 +1977,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
*/
rdev = conf->mirrors[d].rdev;
if (sync_page_io(rdev, sect, s<<9,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
REQ_OP_READ, 0, false)) {
success = 1;
break;
@@ -2073,7 +2032,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
continue;
rdev = conf->mirrors[d].rdev;
if (r1_sync_page_io(rdev, sect, s,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
WRITE) == 0) {
r1_bio->bios[d]->bi_end_io = NULL;
rdev_dec_pending(rdev, mddev);
@@ -2088,7 +2047,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
continue;
rdev = conf->mirrors[d].rdev;
if (r1_sync_page_io(rdev, sect, s,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
READ) != 0)
atomic_add(s, &rdev->corrected_errors);
}
@@ -2122,7 +2081,9 @@ static void process_checks(struct r1bio *r1_bio)
int j;
int size;
int error;
+ struct bio_vec *bi;
struct bio *b = r1_bio->bios[i];
+ struct resync_pages *rp = get_resync_pages(b);
if (b->bi_end_io != end_sync_read)
continue;
/* fixup the bio for reuse, but preserve errno */
@@ -2135,12 +2096,11 @@ static void process_checks(struct r1bio *r1_bio)
conf->mirrors[i].rdev->data_offset;
b->bi_bdev = conf->mirrors[i].rdev->bdev;
b->bi_end_io = end_sync_read;
- b->bi_private = r1_bio;
+ rp->raid_bio = r1_bio;
+ b->bi_private = rp;
size = b->bi_iter.bi_size;
- for (j = 0; j < vcnt ; j++) {
- struct bio_vec *bi;
- bi = &b->bi_io_vec[j];
+ bio_for_each_segment_all(bi, b, j) {
bi->bv_offset = 0;
if (size > PAGE_SIZE)
bi->bv_len = PAGE_SIZE;
@@ -2162,20 +2122,24 @@ static void process_checks(struct r1bio *r1_bio)
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
int error = sbio->bi_error;
+ struct page **ppages = get_resync_pages(pbio)->pages;
+ struct page **spages = get_resync_pages(sbio)->pages;
+ struct bio_vec *bi;
+ int page_len[RESYNC_PAGES] = { 0 };
if (sbio->bi_end_io != end_sync_read)
continue;
/* Now we can 'fixup' the error value */
sbio->bi_error = 0;
+ bio_for_each_segment_all(bi, sbio, j)
+ page_len[j] = bi->bv_len;
+
if (!error) {
for (j = vcnt; j-- ; ) {
- struct page *p, *s;
- p = pbio->bi_io_vec[j].bv_page;
- s = sbio->bi_io_vec[j].bv_page;
- if (memcmp(page_address(p),
- page_address(s),
- sbio->bi_io_vec[j].bv_len))
+ if (memcmp(page_address(ppages[j]),
+ page_address(spages[j]),
+ page_len[j]))
break;
}
} else
@@ -2222,6 +2186,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
(i == r1_bio->read_disk ||
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
continue;
+ if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+ continue;
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
@@ -2391,18 +2357,11 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
/* Write at 'sector' for 'sectors'*/
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- unsigned vcnt = r1_bio->behind_page_count;
- struct bio_vec *vec = r1_bio->behind_bvecs;
-
- while (!vec->bv_page) {
- vec++;
- vcnt--;
- }
-
- wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
- memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
-
- wbio->bi_vcnt = vcnt;
+ wbio = bio_clone_fast(r1_bio->behind_master_bio,
+ GFP_NOIO,
+ mddev->bio_set);
+ /* We really need a _all clone */
+ wbio->bi_iter = (struct bvec_iter){ 0 };
} else {
wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
mddev->bio_set);
@@ -2501,11 +2460,8 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
{
- int disk;
- int max_sectors;
struct mddev *mddev = conf->mddev;
struct bio *bio;
- char b[BDEVNAME_SIZE];
struct md_rdev *rdev;
dev_t bio_dev;
sector_t bio_sector;
@@ -2521,7 +2477,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
*/
bio = r1_bio->bios[r1_bio->read_disk];
- bdevname(bio->bi_bdev, b);
bio_dev = bio->bi_bdev->bd_dev;
bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector;
bio_put(bio);
@@ -2539,62 +2494,12 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
}
rdev_dec_pending(rdev, conf->mddev);
+ allow_barrier(conf, r1_bio->sector);
+ bio = r1_bio->master_bio;
-read_more:
- disk = read_balance(conf, r1_bio, &max_sectors);
- if (disk == -1) {
- pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
- mdname(mddev), b, (unsigned long long)r1_bio->sector);
- raid_end_bio_io(r1_bio);
- } else {
- const unsigned long do_sync
- = r1_bio->master_bio->bi_opf & REQ_SYNC;
- r1_bio->read_disk = disk;
- bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
- mddev->bio_set);
- bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
- r1_bio->bios[r1_bio->read_disk] = bio;
- rdev = conf->mirrors[disk].rdev;
- pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n",
- mdname(mddev),
- (unsigned long long)r1_bio->sector,
- bdevname(rdev->bdev, b));
- bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
- bio->bi_bdev = rdev->bdev;
- bio->bi_end_io = raid1_end_read_request;
- bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
- if (test_bit(FailFast, &rdev->flags) &&
- test_bit(R1BIO_FailFast, &r1_bio->state))
- bio->bi_opf |= MD_FAILFAST;
- bio->bi_private = r1_bio;
- if (max_sectors < r1_bio->sectors) {
- /* Drat - have to split this up more */
- struct bio *mbio = r1_bio->master_bio;
- int sectors_handled = (r1_bio->sector + max_sectors
- - mbio->bi_iter.bi_sector);
- r1_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (mbio->bi_phys_segments == 0)
- mbio->bi_phys_segments = 2;
- else
- mbio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
- bio, bio_dev, bio_sector);
- generic_make_request(bio);
- bio = NULL;
-
- r1_bio = alloc_r1bio(mddev, mbio, sectors_handled);
- set_bit(R1BIO_ReadError, &r1_bio->state);
-
- goto read_more;
- } else {
- trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
- bio, bio_dev, bio_sector);
- generic_make_request(bio);
- }
- }
+ /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
+ r1_bio->state = 0;
+ raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
}
static void raid1d(struct md_thread *thread)
@@ -2660,10 +2565,7 @@ static void raid1d(struct md_thread *thread)
else if (test_bit(R1BIO_ReadError, &r1_bio->state))
handle_read_error(conf, r1_bio);
else
- /* just a partial read to be scheduled from separate
- * context
- */
- generic_make_request(r1_bio->bios[r1_bio->read_disk]);
+ WARN_ON_ONCE(1);
cond_resched();
if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -2793,7 +2695,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
for (i = 0; i < conf->raid_disks * 2; i++) {
struct md_rdev *rdev;
bio = r1_bio->bios[i];
- bio_reset(bio);
rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev == NULL ||
@@ -2849,7 +2750,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
atomic_inc(&rdev->nr_pending);
bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
- bio->bi_private = r1_bio;
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
}
@@ -2935,31 +2835,25 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
}
for (i = 0 ; i < conf->raid_disks * 2; i++) {
+ struct resync_pages *rp;
+
bio = r1_bio->bios[i];
+ rp = get_resync_pages(bio);
if (bio->bi_end_io) {
- page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
- if (bio_add_page(bio, page, len, 0) == 0) {
- /* stop here */
- bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
- while (i > 0) {
- i--;
- bio = r1_bio->bios[i];
- if (bio->bi_end_io==NULL)
- continue;
- /* remove last page from this bio */
- bio->bi_vcnt--;
- bio->bi_iter.bi_size -= len;
- bio_clear_flag(bio, BIO_SEG_VALID);
- }
- goto bio_full;
- }
+ page = resync_fetch_page(rp, rp->idx++);
+
+ /*
+ * won't fail because the vec table is big
+ * enough to hold all these pages
+ */
+ bio_add_page(bio, page, len, 0);
}
}
nr_sectors += len>>9;
sector_nr += len>>9;
sync_blocks -= (len>>9);
- } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
- bio_full:
+ } while (get_resync_pages(r1_bio->bios[disk]->bi_private)->idx < RESYNC_PAGES);
+
r1_bio->sectors = nr_sectors;
if (mddev_is_clustered(mddev) &&
@@ -3059,12 +2953,15 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf->r1bio_pool)
goto abort;
+ conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+ if (!conf->bio_split)
+ goto abort;
+
conf->poolinfo->mddev = mddev;
err = -EINVAL;
spin_lock_init(&conf->device_lock);
rdev_for_each(rdev, mddev) {
- struct request_queue *q;
int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
@@ -3077,8 +2974,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (disk->rdev)
goto abort;
disk->rdev = rdev;
- q = bdev_get_queue(rdev->bdev);
-
disk->head_position = 0;
disk->seq_start = MaxSector;
}
@@ -3140,6 +3035,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
kfree(conf->nr_waiting);
kfree(conf->nr_queued);
kfree(conf->barrier);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3247,6 +3144,8 @@ static void raid1_free(struct mddev *mddev, void *priv)
kfree(conf->nr_waiting);
kfree(conf->nr_queued);
kfree(conf->barrier);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index dd22a37d0d83..c8894ef1e9d2 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -84,6 +84,7 @@ struct r1conf {
*/
wait_queue_head_t wait_barrier;
spinlock_t resync_lock;
+ atomic_t nr_sync_pending;
atomic_t *nr_pending;
atomic_t *nr_waiting;
atomic_t *nr_queued;
@@ -107,6 +108,8 @@ struct r1conf {
mempool_t *r1bio_pool;
mempool_t *r1buf_pool;
+ struct bio_set *bio_split;
+
/* temporary buffer to synchronous IO when attempting to repair
* a read error.
*/
@@ -153,9 +156,13 @@ struct r1bio {
int read_disk;
struct list_head retry_list;
- /* Next two are only valid when R1BIO_BehindIO is set */
- struct bio_vec *behind_bvecs;
- int behind_page_count;
+
+ /*
+ * When R1BIO_BehindIO is set, we store pages for write behind
+ * in behind_master_bio.
+ */
+ struct bio *behind_master_bio;
+
/*
* if the IO is in WRITE direction, then multiple bios are used.
* We choose the number when they are allocated.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 28ec3a93acee..6b86a0032cf8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -110,6 +110,24 @@ static void end_reshape(struct r10conf *conf);
#define raid10_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
+/*
+ * 'struct resync_pages' stores actual pages used for doing the resync
+ * IO, and it is per-bio, so make .bi_private point to it.
+ */
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+ return bio->bi_private;
+}
+
+/*
+ * for resync bio, r10bio pointer can be retrieved from the per-bio
+ * 'struct resync_pages'.
+ */
+static inline struct r10bio *get_resync_r10bio(struct bio *bio)
+{
+ return get_resync_pages(bio)->raid_bio;
+}
+
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct r10conf *conf = data;
@@ -125,9 +143,6 @@ static void r10bio_pool_free(void *r10_bio, void *data)
kfree(r10_bio);
}
-/* Maximum size of each resync request */
-#define RESYNC_BLOCK_SIZE (64*1024)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
@@ -143,11 +158,11 @@ static void r10bio_pool_free(void *r10_bio, void *data)
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
struct r10conf *conf = data;
- struct page *page;
struct r10bio *r10_bio;
struct bio *bio;
- int i, j;
- int nalloc;
+ int j;
+ int nalloc, nalloc_rp;
+ struct resync_pages *rps;
r10_bio = r10bio_pool_alloc(gfp_flags, conf);
if (!r10_bio)
@@ -159,6 +174,15 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
else
nalloc = 2; /* recovery */
+ /* allocate once for all bios */
+ if (!conf->have_replacement)
+ nalloc_rp = nalloc;
+ else
+ nalloc_rp = nalloc * 2;
+ rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
+ if (!rps)
+ goto out_free_r10bio;
+
/*
* Allocate bios.
*/
@@ -178,36 +202,40 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
* Allocate RESYNC_PAGES data pages and attach them
* where needed.
*/
- for (j = 0 ; j < nalloc; j++) {
+ for (j = 0; j < nalloc; j++) {
struct bio *rbio = r10_bio->devs[j].repl_bio;
+ struct resync_pages *rp, *rp_repl;
+
+ rp = &rps[j];
+ if (rbio)
+ rp_repl = &rps[nalloc + j];
+
bio = r10_bio->devs[j].bio;
- for (i = 0; i < RESYNC_PAGES; i++) {
- if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
- &conf->mddev->recovery)) {
- /* we can share bv_page's during recovery
- * and reshape */
- struct bio *rbio = r10_bio->devs[0].bio;
- page = rbio->bi_io_vec[i].bv_page;
- get_page(page);
- } else
- page = alloc_page(gfp_flags);
- if (unlikely(!page))
+
+ if (!j || test_bit(MD_RECOVERY_SYNC,
+ &conf->mddev->recovery)) {
+ if (resync_alloc_pages(rp, gfp_flags))
goto out_free_pages;
+ } else {
+ memcpy(rp, &rps[0], sizeof(*rp));
+ resync_get_all_pages(rp);
+ }
- bio->bi_io_vec[i].bv_page = page;
- if (rbio)
- rbio->bi_io_vec[i].bv_page = page;
+ rp->idx = 0;
+ rp->raid_bio = r10_bio;
+ bio->bi_private = rp;
+ if (rbio) {
+ memcpy(rp_repl, rp, sizeof(*rp));
+ rbio->bi_private = rp_repl;
}
}
return r10_bio;
out_free_pages:
- for ( ; i > 0 ; i--)
- safe_put_page(bio->bi_io_vec[i-1].bv_page);
- while (j--)
- for (i = 0; i < RESYNC_PAGES ; i++)
- safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
+ while (--j >= 0)
+ resync_free_pages(&rps[j * 2]);
+
j = 0;
out_free_bio:
for ( ; j < nalloc; j++) {
@@ -216,30 +244,34 @@ out_free_bio:
if (r10_bio->devs[j].repl_bio)
bio_put(r10_bio->devs[j].repl_bio);
}
+ kfree(rps);
+out_free_r10bio:
r10bio_pool_free(r10_bio, conf);
return NULL;
}
static void r10buf_pool_free(void *__r10_bio, void *data)
{
- int i;
struct r10conf *conf = data;
struct r10bio *r10bio = __r10_bio;
int j;
+ struct resync_pages *rp = NULL;
- for (j=0; j < conf->copies; j++) {
+ for (j = conf->copies; j--; ) {
struct bio *bio = r10bio->devs[j].bio;
- if (bio) {
- for (i = 0; i < RESYNC_PAGES; i++) {
- safe_put_page(bio->bi_io_vec[i].bv_page);
- bio->bi_io_vec[i].bv_page = NULL;
- }
- bio_put(bio);
- }
+
+ rp = get_resync_pages(bio);
+ resync_free_pages(rp);
+ bio_put(bio);
+
bio = r10bio->devs[j].repl_bio;
if (bio)
bio_put(bio);
}
+
+ /* resync pages array stored in the 1st bio's .bi_private */
+ kfree(rp);
+
r10bio_pool_free(r10bio, conf);
}
@@ -301,27 +333,18 @@ static void reschedule_retry(struct r10bio *r10_bio)
static void raid_end_bio_io(struct r10bio *r10_bio)
{
struct bio *bio = r10_bio->master_bio;
- int done;
struct r10conf *conf = r10_bio->mddev->private;
- if (bio->bi_phys_segments) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- bio->bi_phys_segments--;
- done = (bio->bi_phys_segments == 0);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- } else
- done = 1;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
bio->bi_error = -EIO;
- if (done) {
- bio_endio(bio);
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf);
- }
+
+ bio_endio(bio);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf);
+
free_r10bio(r10_bio);
}
@@ -1095,12 +1118,41 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
struct bio *read_bio;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
- int sectors_handled;
int max_sectors;
sector_t sectors;
struct md_rdev *rdev;
- int slot;
+ char b[BDEVNAME_SIZE];
+ int slot = r10_bio->read_slot;
+ struct md_rdev *err_rdev = NULL;
+ gfp_t gfp = GFP_NOIO;
+ if (r10_bio->devs[slot].rdev) {
+ /*
+ * This is an error retry, but we cannot
+ * safely dereference the rdev in the r10_bio,
+ * we must use the one in conf.
+ * If it has already been disconnected (unlikely)
+ * we lose the device name in error messages.
+ */
+ int disk;
+ /*
+ * As we are blocking raid10, it is a little safer to
+ * use __GFP_HIGH.
+ */
+ gfp = GFP_NOIO | __GFP_HIGH;
+
+ rcu_read_lock();
+ disk = r10_bio->devs[slot].devnum;
+ err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (err_rdev)
+ bdevname(err_rdev->bdev, b);
+ else {
+ strcpy(b, "???");
+ /* This never gets dereferenced */
+ err_rdev = r10_bio->devs[slot].rdev;
+ }
+ rcu_read_unlock();
+ }
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
@@ -1108,7 +1160,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
*/
wait_barrier(conf);
- sectors = bio_sectors(bio);
+ sectors = r10_bio->sectors;
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1125,17 +1177,33 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
wait_barrier(conf);
}
-read_again:
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
+ if (err_rdev) {
+ pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
+ mdname(mddev), b,
+ (unsigned long long)r10_bio->sector);
+ }
raid_end_bio_io(r10_bio);
return;
}
+ if (err_rdev)
+ pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b),
+ (unsigned long long)r10_bio->sector);
+ if (max_sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, max_sectors,
+ gfp, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = max_sectors;
+ }
slot = r10_bio->read_slot;
- read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
+ read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
@@ -1154,55 +1222,86 @@ read_again:
trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
- if (max_sectors < r10_bio->sectors) {
- /*
- * Could not read all from this device, so we will need another
- * r10_bio.
- */
- sectors_handled = (r10_bio->sector + max_sectors
- - bio->bi_iter.bi_sector);
- r10_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- /*
- * Cannot call generic_make_request directly as that will be
- * queued in __generic_make_request and subsequent
- * mempool_alloc might block waiting for it. so hand bio over
- * to raid10d.
- */
- reschedule_retry(r10_bio);
-
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
- r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio) - sectors_handled;
- r10_bio->state = 0;
- r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
- goto read_again;
- } else
- generic_make_request(read_bio);
+ generic_make_request(read_bio);
return;
}
-static void raid10_write_request(struct mddev *mddev, struct bio *bio,
- struct r10bio *r10_bio)
+static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
+ struct bio *bio, bool replacement,
+ int n_copy)
{
- struct r10conf *conf = mddev->private;
- int i;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
unsigned long flags;
- struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid10_plug_cb *plug = NULL;
+ struct r10conf *conf = mddev->private;
+ struct md_rdev *rdev;
+ int devnum = r10_bio->devs[n_copy].devnum;
+ struct bio *mbio;
+
+ if (replacement) {
+ rdev = conf->mirrors[devnum].replacement;
+ if (rdev == NULL) {
+ /* Replacement just got moved to main 'rdev' */
+ smp_mb();
+ rdev = conf->mirrors[devnum].rdev;
+ }
+ } else
+ rdev = conf->mirrors[devnum].rdev;
+
+ mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ if (replacement)
+ r10_bio->devs[n_copy].repl_bio = mbio;
+ else
+ r10_bio->devs[n_copy].bio = mbio;
+
+ mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
+ choose_data_offset(r10_bio, rdev));
+ mbio->bi_bdev = rdev->bdev;
+ mbio->bi_end_io = raid10_end_write_request;
+ bio_set_op_attrs(mbio, op, do_sync | do_fua);
+ if (!replacement && test_bit(FailFast,
+ &conf->mirrors[devnum].rdev->flags)
+ && enough(conf, devnum))
+ mbio->bi_opf |= MD_FAILFAST;
+ mbio->bi_private = r10_bio;
+
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
+ mbio, disk_devt(conf->mddev->gendisk),
+ r10_bio->sector);
+ /* flush_pending_writes() needs access to the rdev so...*/
+ mbio->bi_bdev = (void *)rdev;
+
+ atomic_inc(&r10_bio->remaining);
+
+ cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
+ if (cb)
+ plug = container_of(cb, struct raid10_plug_cb, cb);
+ else
+ plug = NULL;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (plug) {
+ bio_list_add(&plug->pending, mbio);
+ plug->pending_cnt++;
+ } else {
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (!plug)
+ md_wakeup_thread(mddev->thread);
+}
+
+static void raid10_write_request(struct mddev *mddev, struct bio *bio,
+ struct r10bio *r10_bio)
+{
+ struct r10conf *conf = mddev->private;
+ int i;
+ struct md_rdev *blocked_rdev;
sector_t sectors;
- int sectors_handled;
int max_sectors;
md_write_start(mddev, bio);
@@ -1214,7 +1313,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
*/
wait_barrier(conf);
- sectors = bio_sectors(bio);
+ sectors = r10_bio->sectors;
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1262,9 +1361,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
* on which we have seen a write error, we want to avoid
* writing to those blocks. This potentially requires several
* writes to write around the bad blocks. Each set of writes
- * gets its own r10_bio with a set of bios attached. The number
- * of r10_bios is recored in bio->bi_phys_segments just as with
- * the read case.
+ * gets its own r10_bio with a set of bios attached.
*/
r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
@@ -1384,145 +1481,31 @@ retry_write:
goto retry_write;
}
- if (max_sectors < r10_bio->sectors) {
- /* We are splitting this into multiple parts, so
- * we need to prepare for allocating another r10_bio.
- */
+ if (max_sectors < r10_bio->sectors)
r10_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
+
+ if (r10_bio->sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, r10_bio->sectors,
+ GFP_NOIO, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r10_bio->master_bio = bio;
}
- sectors_handled = r10_bio->sector + max_sectors -
- bio->bi_iter.bi_sector;
atomic_set(&r10_bio->remaining, 1);
bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
for (i = 0; i < conf->copies; i++) {
- struct bio *mbio;
- int d = r10_bio->devs[i].devnum;
- if (r10_bio->devs[i].bio) {
- struct md_rdev *rdev = conf->mirrors[d].rdev;
- mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
- r10_bio->devs[i].bio = mbio;
-
- mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
- choose_data_offset(r10_bio, rdev));
- mbio->bi_bdev = rdev->bdev;
- mbio->bi_end_io = raid10_end_write_request;
- bio_set_op_attrs(mbio, op, do_sync | do_fua);
- if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) &&
- enough(conf, d))
- mbio->bi_opf |= MD_FAILFAST;
- mbio->bi_private = r10_bio;
-
- if (conf->mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
- mbio, disk_devt(conf->mddev->gendisk),
- r10_bio->sector);
- /* flush_pending_writes() needs access to the rdev so...*/
- mbio->bi_bdev = (void*)rdev;
-
- atomic_inc(&r10_bio->remaining);
-
- cb = blk_check_plugged(raid10_unplug, mddev,
- sizeof(*plug));
- if (cb)
- plug = container_of(cb, struct raid10_plug_cb,
- cb);
- else
- plug = NULL;
- spin_lock_irqsave(&conf->device_lock, flags);
- if (plug) {
- bio_list_add(&plug->pending, mbio);
- plug->pending_cnt++;
- } else {
- bio_list_add(&conf->pending_bio_list, mbio);
- conf->pending_count++;
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!plug)
- md_wakeup_thread(mddev->thread);
- }
-
- if (r10_bio->devs[i].repl_bio) {
- struct md_rdev *rdev = conf->mirrors[d].replacement;
- if (rdev == NULL) {
- /* Replacement just got moved to main 'rdev' */
- smp_mb();
- rdev = conf->mirrors[d].rdev;
- }
- mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
- r10_bio->devs[i].repl_bio = mbio;
-
- mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
- choose_data_offset(r10_bio, rdev));
- mbio->bi_bdev = rdev->bdev;
- mbio->bi_end_io = raid10_end_write_request;
- bio_set_op_attrs(mbio, op, do_sync | do_fua);
- mbio->bi_private = r10_bio;
-
- if (conf->mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
- mbio, disk_devt(conf->mddev->gendisk),
- r10_bio->sector);
- /* flush_pending_writes() needs access to the rdev so...*/
- mbio->bi_bdev = (void*)rdev;
-
- atomic_inc(&r10_bio->remaining);
-
- cb = blk_check_plugged(raid10_unplug, mddev,
- sizeof(*plug));
- if (cb)
- plug = container_of(cb, struct raid10_plug_cb,
- cb);
- else
- plug = NULL;
- spin_lock_irqsave(&conf->device_lock, flags);
- if (plug) {
- bio_list_add(&plug->pending, mbio);
- plug->pending_cnt++;
- } else {
- bio_list_add(&conf->pending_bio_list, mbio);
- conf->pending_count++;
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!plug)
- md_wakeup_thread(mddev->thread);
- }
- }
-
- /* Don't remove the bias on 'remaining' (one_write_done) until
- * after checking if we need to go around again.
- */
-
- if (sectors_handled < bio_sectors(bio)) {
- one_write_done(r10_bio);
- /* We need another r10_bio. It has already been counted
- * in bio->bi_phys_segments.
- */
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
- r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-
- r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
- r10_bio->state = 0;
- goto retry_write;
+ if (r10_bio->devs[i].bio)
+ raid10_write_one_disk(mddev, r10_bio, bio, false, i);
+ if (r10_bio->devs[i].repl_bio)
+ raid10_write_one_disk(mddev, r10_bio, bio, true, i);
}
one_write_done(r10_bio);
}
-static void __make_request(struct mddev *mddev, struct bio *bio)
+static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
@@ -1530,21 +1513,12 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio);
+ r10_bio->sectors = sectors;
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
-
- /*
- * We might need to issue multiple reads to different devices if there
- * are bad blocks around, so we keep track of the number of reads in
- * bio->bi_phys_segments. If this is 0, there is only one r10_bio and
- * no locking will be needed when the request completes. If it is
- * non-zero, then it is the number of not-completed requests.
- */
- bio->bi_phys_segments = 0;
- bio_clear_flag(bio, BIO_SEG_VALID);
+ memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio);
@@ -1557,54 +1531,26 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
struct r10conf *conf = mddev->private;
sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
int chunk_sects = chunk_mask + 1;
-
- struct bio *split;
+ int sectors = bio_sectors(bio);
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
return;
}
- do {
-
- /*
- * If this request crosses a chunk boundary, we need to split
- * it.
- */
- if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
- bio_sectors(bio) > chunk_sects
- && (conf->geo.near_copies < conf->geo.raid_disks
- || conf->prev.near_copies <
- conf->prev.raid_disks))) {
- split = bio_split(bio, chunk_sects -
- (bio->bi_iter.bi_sector &
- (chunk_sects - 1)),
- GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
-
- /*
- * If a bio is splitted, the first part of bio will pass
- * barrier but the bio is queued in current->bio_list (see
- * generic_make_request). If there is a raise_barrier() called
- * here, the second part of bio can't pass barrier. But since
- * the first part bio isn't dispatched to underlaying disks
- * yet, the barrier is never released, hence raise_barrier will
- * alays wait. We have a deadlock.
- * Note, this only happens in read path. For write path, the
- * first part of bio is dispatched in a schedule() call
- * (because of blk plug) or offloaded to raid10d.
- * Quitting from the function immediately can change the bio
- * order queued in bio_list and avoid the deadlock.
- */
- __make_request(mddev, split);
- if (split != bio && bio_data_dir(bio) == READ) {
- generic_make_request(bio);
- break;
- }
- } while (split != bio);
+ /*
+ * If this request crosses a chunk boundary, we need to split
+ * it.
+ */
+ if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+ sectors > chunk_sects
+ && (conf->geo.near_copies < conf->geo.raid_disks
+ || conf->prev.near_copies <
+ conf->prev.raid_disks)))
+ sectors = chunk_sects -
+ (bio->bi_iter.bi_sector &
+ (chunk_sects - 1));
+ __make_request(mddev, bio, sectors);
/* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
@@ -1928,13 +1874,9 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* but will never see neither -- if they are careful.
*/
p->replacement = NULL;
- clear_bit(WantReplacement, &rdev->flags);
- } else
- /* We might have just remove the Replacement as faulty
- * Clear the flag just in case
- */
- clear_bit(WantReplacement, &rdev->flags);
+ }
+ clear_bit(WantReplacement, &rdev->flags);
err = md_integrity_register(mddev);
abort:
@@ -1943,17 +1885,9 @@ abort:
return err;
}
-static void end_sync_read(struct bio *bio)
+static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
{
- struct r10bio *r10_bio = bio->bi_private;
struct r10conf *conf = r10_bio->mddev->private;
- int d;
-
- if (bio == r10_bio->master_bio) {
- /* this is a reshape read */
- d = r10_bio->read_slot; /* really the read dev */
- } else
- d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
if (!bio->bi_error)
set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1977,6 +1911,23 @@ static void end_sync_read(struct bio *bio)
}
}
+static void end_sync_read(struct bio *bio)
+{
+ struct r10bio *r10_bio = get_resync_r10bio(bio);
+ struct r10conf *conf = r10_bio->mddev->private;
+ int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
+
+ __end_sync_read(r10_bio, bio, d);
+}
+
+static void end_reshape_read(struct bio *bio)
+{
+ /* reshape read bio isn't allocated from r10buf_pool */
+ struct r10bio *r10_bio = bio->bi_private;
+
+ __end_sync_read(r10_bio, bio, r10_bio->read_slot);
+}
+
static void end_sync_request(struct r10bio *r10_bio)
{
struct mddev *mddev = r10_bio->mddev;
@@ -2006,7 +1957,7 @@ static void end_sync_request(struct r10bio *r10_bio)
static void end_sync_write(struct bio *bio)
{
- struct r10bio *r10_bio = bio->bi_private;
+ struct r10bio *r10_bio = get_resync_r10bio(bio);
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
@@ -2065,6 +2016,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
int i, first;
struct bio *tbio, *fbio;
int vcnt;
+ struct page **tpages, **fpages;
atomic_set(&r10_bio->remaining, 1);
@@ -2080,12 +2032,14 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
fbio = r10_bio->devs[i].bio;
fbio->bi_iter.bi_size = r10_bio->sectors << 9;
fbio->bi_iter.bi_idx = 0;
+ fpages = get_resync_pages(fbio)->pages;
vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
/* now find blocks with errors */
for (i=0 ; i < conf->copies ; i++) {
int j, d;
struct md_rdev *rdev;
+ struct resync_pages *rp;
tbio = r10_bio->devs[i].bio;
@@ -2093,6 +2047,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
if (i == first)
continue;
+
+ tpages = get_resync_pages(tbio)->pages;
d = r10_bio->devs[i].devnum;
rdev = conf->mirrors[d].rdev;
if (!r10_bio->devs[i].bio->bi_error) {
@@ -2105,8 +2061,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
int len = PAGE_SIZE;
if (sectors < (len / 512))
len = sectors * 512;
- if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
- page_address(tbio->bi_io_vec[j].bv_page),
+ if (memcmp(page_address(fpages[j]),
+ page_address(tpages[j]),
len))
break;
sectors -= len/512;
@@ -2127,11 +2083,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* First we need to fixup bv_offset, bv_len and
* bi_vecs, as the read request might have corrupted these
*/
+ rp = get_resync_pages(tbio);
bio_reset(tbio);
tbio->bi_vcnt = vcnt;
tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
- tbio->bi_private = r10_bio;
+ rp->raid_bio = r10_bio;
+ tbio->bi_private = rp;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
tbio->bi_end_io = end_sync_write;
bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
@@ -2202,6 +2160,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
int idx = 0;
int dr = r10_bio->devs[0].devnum;
int dw = r10_bio->devs[1].devnum;
+ struct page **pages = get_resync_pages(bio)->pages;
while (sectors) {
int s = sectors;
@@ -2217,7 +2176,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
ok = sync_page_io(rdev,
addr,
s << 9,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
REQ_OP_READ, 0, false);
if (ok) {
rdev = conf->mirrors[dw].rdev;
@@ -2225,7 +2184,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
ok = sync_page_io(rdev,
addr,
s << 9,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
REQ_OP_WRITE, 0, false);
if (!ok) {
set_bit(WriteErrorSeen, &rdev->flags);
@@ -2625,9 +2584,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
struct bio *bio;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev = r10_bio->devs[slot].rdev;
- char b[BDEVNAME_SIZE];
- unsigned long do_sync;
- int max_sectors;
dev_t bio_dev;
sector_t bio_last_sector;
@@ -2640,7 +2596,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
* frozen.
*/
bio = r10_bio->devs[slot].bio;
- bdevname(bio->bi_bdev, b);
bio_dev = bio->bi_bdev->bd_dev;
bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
bio_put(bio);
@@ -2656,69 +2611,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
md_error(mddev, rdev);
rdev_dec_pending(rdev, mddev);
-
-read_more:
- rdev = read_balance(conf, r10_bio, &max_sectors);
- if (rdev == NULL) {
- pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
- mdname(mddev), b,
- (unsigned long long)r10_bio->sector);
- raid_end_bio_io(r10_bio);
- return;
- }
-
- do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC);
- slot = r10_bio->read_slot;
- pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
- mdname(mddev),
- bdevname(rdev->bdev, b),
- (unsigned long long)r10_bio->sector);
- bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set);
- bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
- r10_bio->devs[slot].bio = bio;
- r10_bio->devs[slot].rdev = rdev;
- bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
- + choose_data_offset(r10_bio, rdev);
- bio->bi_bdev = rdev->bdev;
- bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
- if (test_bit(FailFast, &rdev->flags) &&
- test_bit(R10BIO_FailFast, &r10_bio->state))
- bio->bi_opf |= MD_FAILFAST;
- bio->bi_private = r10_bio;
- bio->bi_end_io = raid10_end_read_request;
- trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
- bio, bio_dev,
- bio_last_sector - r10_bio->sectors);
-
- if (max_sectors < r10_bio->sectors) {
- /* Drat - have to split this up more */
- struct bio *mbio = r10_bio->master_bio;
- int sectors_handled =
- r10_bio->sector + max_sectors
- - mbio->bi_iter.bi_sector;
- r10_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (mbio->bi_phys_segments == 0)
- mbio->bi_phys_segments = 2;
- else
- mbio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- generic_make_request(bio);
-
- r10_bio = mempool_alloc(conf->r10bio_pool,
- GFP_NOIO);
- r10_bio->master_bio = mbio;
- r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
- r10_bio->state = 0;
- set_bit(R10BIO_ReadError,
- &r10_bio->state);
- r10_bio->mddev = mddev;
- r10_bio->sector = mbio->bi_iter.bi_sector
- + sectors_handled;
-
- goto read_more;
- } else
- generic_make_request(bio);
+ allow_barrier(conf);
+ r10_bio->state = 0;
+ raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
}
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
@@ -2805,6 +2700,11 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
conf->nr_queued++;
spin_unlock_irq(&conf->device_lock);
+ /*
+ * In case freeze_array() is waiting for condition
+ * nr_pending == nr_queued + extra to be true.
+ */
+ wake_up(&conf->wait_barrier);
md_wakeup_thread(conf->mddev->thread);
} else {
if (test_bit(R10BIO_WriteError,
@@ -2879,13 +2779,8 @@ static void raid10d(struct md_thread *thread)
recovery_request_write(mddev, r10_bio);
else if (test_bit(R10BIO_ReadError, &r10_bio->state))
handle_read_error(mddev, r10_bio);
- else {
- /* just a partial read to be scheduled from a
- * separate context
- */
- int slot = r10_bio->read_slot;
- generic_make_request(r10_bio->devs[slot].bio);
- }
+ else
+ WARN_ON_ONCE(1);
cond_resched();
if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -3199,10 +3094,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
}
bio = r10_bio->devs[0].bio;
- bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (test_bit(FailFast, &rdev->flags))
@@ -3226,10 +3119,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (!test_bit(In_sync, &mrdev->flags)) {
bio = r10_bio->devs[1].bio;
- bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_iter.bi_sector = to_addr
@@ -3254,10 +3145,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mreplace == NULL || bio == NULL ||
test_bit(Faulty, &mreplace->flags))
break;
- bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_iter.bi_sector = to_addr +
@@ -3379,7 +3268,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->devs[i].repl_bio->bi_end_io = NULL;
bio = r10_bio->devs[i].bio;
- bio_reset(bio);
bio->bi_error = -EIO;
rcu_read_lock();
rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -3404,7 +3292,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
atomic_inc(&r10_bio->remaining);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3423,13 +3310,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* Need to set up for writing to the replacement */
bio = r10_bio->devs[i].repl_bio;
- bio_reset(bio);
bio->bi_error = -EIO;
sector = r10_bio->devs[i].addr;
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3468,27 +3353,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (len == 0)
break;
for (bio= biolist ; bio ; bio=bio->bi_next) {
- struct bio *bio2;
- page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
- if (bio_add_page(bio, page, len, 0))
- continue;
-
- /* stop here */
- bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
- for (bio2 = biolist;
- bio2 && bio2 != bio;
- bio2 = bio2->bi_next) {
- /* remove last page from this bio */
- bio2->bi_vcnt--;
- bio2->bi_iter.bi_size -= len;
- bio_clear_flag(bio2, BIO_SEG_VALID);
- }
- goto bio_full;
+ struct resync_pages *rp = get_resync_pages(bio);
+ page = resync_fetch_page(rp, rp->idx++);
+ /*
+ * won't fail because the vec table is big enough
+ * to hold all these pages
+ */
+ bio_add_page(bio, page, len, 0);
}
nr_sectors += len>>9;
sector_nr += len>>9;
- } while (biolist->bi_vcnt < RESYNC_PAGES);
- bio_full:
+ } while (get_resync_pages(biolist)->idx < RESYNC_PAGES);
r10_bio->sectors = nr_sectors;
while (biolist) {
@@ -3496,7 +3371,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
biolist = biolist->bi_next;
bio->bi_next = NULL;
- r10_bio = bio->bi_private;
+ r10_bio = get_resync_r10bio(bio);
r10_bio->sectors = nr_sectors;
if (bio->bi_end_io == end_sync_read) {
@@ -3678,6 +3553,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
if (!conf->r10bio_pool)
goto out;
+ conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+ if (!conf->bio_split)
+ goto out;
+
calc_sectors(conf, mddev->dev_sectors);
if (mddev->reshape_position == MaxSector) {
conf->prev = conf->geo;
@@ -3715,6 +3594,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
mempool_destroy(conf->r10bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3760,7 +3641,6 @@ static int raid10_run(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
long long diff;
- struct request_queue *q;
disk_idx = rdev->raid_disk;
if (disk_idx < 0)
@@ -3779,7 +3659,6 @@ static int raid10_run(struct mddev *mddev)
goto out_free_conf;
disk->rdev = rdev;
}
- q = bdev_get_queue(rdev->bdev);
diff = (rdev->new_data_offset - rdev->data_offset);
if (!mddev->reshape_backwards)
diff = -diff;
@@ -3796,6 +3675,7 @@ static int raid10_run(struct mddev *mddev)
if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
discard_supported = true;
+ first = 0;
}
if (mddev->queue) {
@@ -3925,6 +3805,8 @@ static void raid10_free(struct mddev *mddev, void *priv)
kfree(conf->mirrors);
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
@@ -4198,6 +4080,7 @@ static int raid10_start_reshape(struct mddev *mddev)
diff = 0;
if (first || diff < min_offset_diff)
min_offset_diff = diff;
+ first = 0;
}
}
@@ -4388,6 +4271,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
struct bio *blist;
struct bio *bio, *read_bio;
int sectors_done = 0;
+ struct page **pages;
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
@@ -4508,7 +4392,7 @@ read_more:
read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+ rdev->data_offset);
read_bio->bi_private = r10_bio;
- read_bio->bi_end_io = end_sync_read;
+ read_bio->bi_end_io = end_reshape_read;
bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
read_bio->bi_error = 0;
@@ -4538,11 +4422,9 @@ read_more:
if (!rdev2 || test_bit(Faulty, &rdev2->flags))
continue;
- bio_reset(b);
b->bi_bdev = rdev2->bdev;
b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
rdev2->new_data_offset;
- b->bi_private = r10_bio;
b->bi_end_io = end_reshape_write;
bio_set_op_attrs(b, REQ_OP_WRITE, 0);
b->bi_next = blist;
@@ -4552,31 +4434,22 @@ read_more:
/* Now add as many pages as possible to all of these bios. */
nr_sectors = 0;
+ pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
- struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
+ struct page *page = pages[s / (PAGE_SIZE >> 9)];
int len = (max_sectors - s) << 9;
if (len > PAGE_SIZE)
len = PAGE_SIZE;
for (bio = blist; bio ; bio = bio->bi_next) {
- struct bio *bio2;
- if (bio_add_page(bio, page, len, 0))
- continue;
-
- /* Didn't fit, must stop */
- for (bio2 = blist;
- bio2 && bio2 != bio;
- bio2 = bio2->bi_next) {
- /* Remove last page from this bio */
- bio2->bi_vcnt--;
- bio2->bi_iter.bi_size -= len;
- bio_clear_flag(bio2, BIO_SEG_VALID);
- }
- goto bio_full;
+ /*
+ * won't fail because the vec table is big enough
+ * to hold all these pages
+ */
+ bio_add_page(bio, page, len, 0);
}
sector_nr += len >> 9;
nr_sectors += len >> 9;
}
-bio_full:
rcu_read_unlock();
r10_bio->sectors = nr_sectors;
@@ -4690,7 +4563,10 @@ static int handle_reshape_read_error(struct mddev *mddev,
struct r10bio *r10b = &on_stack.r10_bio;
int slot = 0;
int idx = 0;
- struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
+ struct page **pages;
+
+ /* reshape IOs share pages from .devs[0].bio */
+ pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
r10b->sector = r10_bio->sector;
__raid10_find_phys(&conf->prev, r10b);
@@ -4719,7 +4595,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
success = sync_page_io(rdev,
addr,
s << 9,
- bvec[idx].bv_page,
+ pages[idx],
REQ_OP_READ, 0, false);
rdev_dec_pending(rdev, mddev);
rcu_read_lock();
@@ -4747,7 +4623,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
static void end_reshape_write(struct bio *bio)
{
- struct r10bio *r10_bio = bio->bi_private;
+ struct r10bio *r10_bio = get_resync_r10bio(bio);
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3162615e57bd..735ce1a3d260 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -82,6 +82,7 @@ struct r10conf {
mempool_t *r10bio_pool;
mempool_t *r10buf_pool;
struct page *tmppage;
+ struct bio_set *bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 3f307be01b10..b6194e082e48 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -30,6 +30,7 @@
* underneath hardware sector size. only works with PAGE_SIZE == 4096
*/
#define BLOCK_SECTORS (8)
+#define BLOCK_SECTOR_SHIFT (3)
/*
* log->max_free_space is min(1/4 disk size, 10G reclaimable space).
@@ -43,7 +44,7 @@
/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
/* start flush with these full stripes */
-#define R5C_FULL_STRIPE_FLUSH_BATCH 256
+#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4)
/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
@@ -307,8 +308,7 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
}
static void
-r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
- struct bio_list *return_bi)
+r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
{
struct bio *wbi, *wbi2;
@@ -317,24 +317,21 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(wbi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, wbi);
- }
+ md_write_end(conf->mddev);
+ bio_endio(wbi);
wbi = wbi2;
}
}
void r5c_handle_cached_data_endio(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio_list *return_bi)
+ struct stripe_head *sh, int disks)
{
int i;
for (i = sh->disks; i--; ) {
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
- r5c_return_dev_pending_writes(conf, &sh->dev[i],
- return_bi);
+ r5c_return_dev_pending_writes(conf, &sh->dev[i]);
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state),
@@ -343,6 +340,8 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
}
}
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+
/* Check whether we should flush some stripes to free up stripe cache */
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
@@ -381,7 +380,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
* or a full stripe (chunk size / 4k stripes).
*/
if (atomic_read(&conf->r5c_cached_full_stripes) >=
- min(R5C_FULL_STRIPE_FLUSH_BATCH,
+ min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
conf->chunk_sectors >> STRIPE_SHIFT))
r5l_wake_reclaim(conf->log, 0);
}
@@ -590,7 +589,7 @@ static void r5l_log_endio(struct bio *bio)
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
- if (log->need_cache_flush)
+ if (log->need_cache_flush && !list_empty(&io->stripe_list))
r5l_move_to_end_ios(log);
else
r5l_log_run_stripes(log);
@@ -618,9 +617,11 @@ static void r5l_log_endio(struct bio *bio)
bio_endio(bi);
atomic_dec(&io->pending_stripe);
}
- if (atomic_read(&io->pending_stripe) == 0)
- __r5l_stripe_write_finished(io);
}
+
+ /* finish flush-only io_unit and PAYLOAD_FLUSH-only io_unit */
+ if (atomic_read(&io->pending_stripe) == 0)
+ __r5l_stripe_write_finished(io);
}
static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
@@ -842,6 +843,41 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
r5_reserve_log_entry(log, io);
}
+static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
+{
+ struct mddev *mddev = log->rdev->mddev;
+ struct r5conf *conf = mddev->private;
+ struct r5l_io_unit *io;
+ struct r5l_payload_flush *payload;
+ int meta_size;
+
+ /*
+ * payload_flush requires extra writes to the journal.
+ * To avoid handling the extra IO in quiesce, just skip
+ * flush_payload
+ */
+ if (conf->quiesce)
+ return;
+
+ mutex_lock(&log->io_mutex);
+ meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
+
+ if (r5l_get_meta(log, meta_size)) {
+ mutex_unlock(&log->io_mutex);
+ return;
+ }
+
+ /* current implementation is one stripe per flush payload */
+ io = log->current_io;
+ payload = page_address(io->meta_page) + io->meta_offset;
+ payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
+ payload->header.flags = cpu_to_le16(0);
+ payload->size = cpu_to_le32(sizeof(__le64));
+ payload->flush_stripes[0] = cpu_to_le64(sect);
+ io->meta_offset += meta_size;
+ mutex_unlock(&log->io_mutex);
+}
+
static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
int data_pages, int parity_pages)
{
@@ -1393,7 +1429,7 @@ static void r5c_do_reclaim(struct r5conf *conf)
stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
- R5C_FULL_STRIPE_FLUSH_BATCH)
+ R5C_FULL_STRIPE_FLUSH_BATCH(conf))
/*
* if stripe cache pressure moderate, or if there is many full
* stripes,flush all full stripes
@@ -1552,6 +1588,8 @@ bool r5l_log_disk_error(struct r5conf *conf)
return ret;
}
+#define R5L_RECOVERY_PAGE_POOL_SIZE 256
+
struct r5l_recovery_ctx {
struct page *meta_page; /* current meta */
sector_t meta_total_blocks; /* total size of current meta and data */
@@ -1560,18 +1598,131 @@ struct r5l_recovery_ctx {
int data_parity_stripes; /* number of data_parity stripes */
int data_only_stripes; /* number of data_only stripes */
struct list_head cached_list;
+
+ /*
+ * read ahead page pool (ra_pool)
+ * in recovery, log is read sequentially. It is not efficient to
+ * read every page with sync_page_io(). The read ahead page pool
+ * reads multiple pages with one IO, so further log read can
+ * just copy data from the pool.
+ */
+ struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
+ sector_t pool_offset; /* offset of first page in the pool */
+ int total_pages; /* total allocated pages */
+ int valid_pages; /* pages with valid data */
+ struct bio *ra_bio; /* bio to do the read ahead */
};
+static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ struct page *page;
+
+ ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, log->bs);
+ if (!ctx->ra_bio)
+ return -ENOMEM;
+
+ ctx->valid_pages = 0;
+ ctx->total_pages = 0;
+ while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) {
+ page = alloc_page(GFP_KERNEL);
+
+ if (!page)
+ break;
+ ctx->ra_pool[ctx->total_pages] = page;
+ ctx->total_pages += 1;
+ }
+
+ if (ctx->total_pages == 0) {
+ bio_put(ctx->ra_bio);
+ return -ENOMEM;
+ }
+
+ ctx->pool_offset = 0;
+ return 0;
+}
+
+static void r5l_recovery_free_ra_pool(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ int i;
+
+ for (i = 0; i < ctx->total_pages; ++i)
+ put_page(ctx->ra_pool[i]);
+ bio_put(ctx->ra_bio);
+}
+
+/*
+ * fetch ctx->valid_pages pages from offset
+ * In normal cases, ctx->valid_pages == ctx->total_pages after the call.
+ * However, if the offset is close to the end of the journal device,
+ * ctx->valid_pages could be smaller than ctx->total_pages
+ */
+static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx,
+ sector_t offset)
+{
+ bio_reset(ctx->ra_bio);
+ ctx->ra_bio->bi_bdev = log->rdev->bdev;
+ bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
+ ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;
+
+ ctx->valid_pages = 0;
+ ctx->pool_offset = offset;
+
+ while (ctx->valid_pages < ctx->total_pages) {
+ bio_add_page(ctx->ra_bio,
+ ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0);
+ ctx->valid_pages += 1;
+
+ offset = r5l_ring_add(log, offset, BLOCK_SECTORS);
+
+ if (offset == 0) /* reached end of the device */
+ break;
+ }
+
+ return submit_bio_wait(ctx->ra_bio);
+}
+
+/*
+ * try to read a page from the read ahead page pool; if the page is not in the
+ * pool, call r5l_recovery_fetch_ra_pool
+ */
+static int r5l_recovery_read_page(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx,
+ struct page *page,
+ sector_t offset)
+{
+ int ret;
+
+ if (offset < ctx->pool_offset ||
+ offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) {
+ ret = r5l_recovery_fetch_ra_pool(log, ctx, offset);
+ if (ret)
+ return ret;
+ }
+
+ BUG_ON(offset < ctx->pool_offset ||
+ offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);
+
+ memcpy(page_address(page),
+ page_address(ctx->ra_pool[(offset - ctx->pool_offset) >>
+ BLOCK_SECTOR_SHIFT]),
+ PAGE_SIZE);
+ return 0;
+}
+
static int r5l_recovery_read_meta_block(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
struct page *page = ctx->meta_page;
struct r5l_meta_block *mb;
u32 crc, stored_crc;
+ int ret;
- if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
- false))
- return -EIO;
+ ret = r5l_recovery_read_page(log, ctx, page, ctx->pos);
+ if (ret != 0)
+ return ret;
mb = page_address(page);
stored_crc = le32_to_cpu(mb->checksum);
@@ -1653,8 +1804,7 @@ static void r5l_recovery_load_data(struct r5l_log *log,
raid5_compute_sector(conf,
le64_to_cpu(payload->location), 0,
&dd_idx, sh);
- sync_page_io(log->rdev, log_offset, PAGE_SIZE,
- sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
+ r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
sh->dev[dd_idx].log_checksum =
le32_to_cpu(payload->checksum[0]);
ctx->meta_total_blocks += BLOCK_SECTORS;
@@ -1673,17 +1823,15 @@ static void r5l_recovery_load_parity(struct r5l_log *log,
struct r5conf *conf = mddev->private;
ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
- sync_page_io(log->rdev, log_offset, PAGE_SIZE,
- sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
+ r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset);
sh->dev[sh->pd_idx].log_checksum =
le32_to_cpu(payload->checksum[0]);
set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
if (sh->qd_idx >= 0) {
- sync_page_io(log->rdev,
- r5l_ring_add(log, log_offset, BLOCK_SECTORS),
- PAGE_SIZE, sh->dev[sh->qd_idx].page,
- REQ_OP_READ, 0, false);
+ r5l_recovery_read_page(
+ log, ctx, sh->dev[sh->qd_idx].page,
+ r5l_ring_add(log, log_offset, BLOCK_SECTORS));
sh->dev[sh->qd_idx].log_checksum =
le32_to_cpu(payload->checksum[1]);
set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
@@ -1814,14 +1962,15 @@ r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
/* if matches return 0; otherwise return -EINVAL */
static int
-r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
+r5l_recovery_verify_data_checksum(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx,
+ struct page *page,
sector_t log_offset, __le32 log_checksum)
{
void *addr;
u32 checksum;
- sync_page_io(log->rdev, log_offset, PAGE_SIZE,
- page, REQ_OP_READ, 0, false);
+ r5l_recovery_read_page(log, ctx, page, log_offset);
addr = kmap_atomic(page);
checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
kunmap_atomic(addr);
@@ -1843,6 +1992,7 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
struct page *page;
struct r5l_payload_data_parity *payload;
+ struct r5l_payload_flush *payload_flush;
page = alloc_page(GFP_KERNEL);
if (!page)
@@ -1850,33 +2000,42 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
while (mb_offset < le32_to_cpu(mb->meta_size)) {
payload = (void *)mb + mb_offset;
+ payload_flush = (void *)mb + mb_offset;
- if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
if (r5l_recovery_verify_data_checksum(
- log, page, log_offset,
+ log, ctx, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
- } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
+ } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
if (r5l_recovery_verify_data_checksum(
- log, page, log_offset,
+ log, ctx, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
if (conf->max_degraded == 2 && /* q for RAID 6 */
r5l_recovery_verify_data_checksum(
- log, page,
+ log, ctx, page,
r5l_ring_add(log, log_offset,
BLOCK_SECTORS),
payload->checksum[1]) < 0)
goto mismatch;
- } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
+ } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
+ /* nothing to do for R5LOG_PAYLOAD_FLUSH here */
+ } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
goto mismatch;
- log_offset = r5l_ring_add(log, log_offset,
- le32_to_cpu(payload->size));
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
+ mb_offset += sizeof(struct r5l_payload_flush) +
+ le32_to_cpu(payload_flush->size);
+ } else {
+ /* DATA or PARITY payload */
+ log_offset = r5l_ring_add(log, log_offset,
+ le32_to_cpu(payload->size));
+ mb_offset += sizeof(struct r5l_payload_data_parity) +
+ sizeof(__le32) *
+ (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+ }
- mb_offset += sizeof(struct r5l_payload_data_parity) +
- sizeof(__le32) *
- (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
}
put_page(page);
@@ -1904,6 +2063,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
struct r5conf *conf = mddev->private;
struct r5l_meta_block *mb;
struct r5l_payload_data_parity *payload;
+ struct r5l_payload_flush *payload_flush;
int mb_offset;
sector_t log_offset;
sector_t stripe_sect;
@@ -1929,7 +2089,31 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
int dd;
payload = (void *)mb + mb_offset;
- stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
+ payload_flush = (void *)mb + mb_offset;
+
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
+ int i, count;
+
+ count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
+ for (i = 0; i < count; ++i) {
+ stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
+ sh = r5c_recovery_lookup_stripe(cached_stripe_list,
+ stripe_sect);
+ if (sh) {
+ WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
+ r5l_recovery_reset_stripe(sh);
+ list_del_init(&sh->lru);
+ raid5_release_stripe(sh);
+ }
+ }
+
+ mb_offset += sizeof(struct r5l_payload_flush) +
+ le32_to_cpu(payload_flush->size);
+ continue;
+ }
+
+ /* DATA or PARITY payload */
+ stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
raid5_compute_sector(
conf, le64_to_cpu(payload->location), 0, &dd,
NULL)
@@ -1967,7 +2151,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
list_add_tail(&sh->lru, cached_stripe_list);
}
- if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
r5l_recovery_replay_one_stripe(conf, sh, ctx);
@@ -1975,7 +2159,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
}
r5l_recovery_load_data(log, sh, ctx, payload,
log_offset);
- } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
+ } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
r5l_recovery_load_parity(log, sh, ctx, payload,
log_offset);
else
@@ -2177,7 +2361,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
payload = (void *)mb + offset;
payload->header.type = cpu_to_le16(
R5LOG_PAYLOAD_DATA);
- payload->size = BLOCK_SECTORS;
+ payload->size = cpu_to_le32(BLOCK_SECTORS);
payload->location = cpu_to_le64(
raid5_compute_blocknr(sh, i, 0));
addr = kmap_atomic(dev->page);
@@ -2241,55 +2425,70 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
static int r5l_recovery_log(struct r5l_log *log)
{
struct mddev *mddev = log->rdev->mddev;
- struct r5l_recovery_ctx ctx;
+ struct r5l_recovery_ctx *ctx;
int ret;
sector_t pos;
- ctx.pos = log->last_checkpoint;
- ctx.seq = log->last_cp_seq;
- ctx.meta_page = alloc_page(GFP_KERNEL);
- ctx.data_only_stripes = 0;
- ctx.data_parity_stripes = 0;
- INIT_LIST_HEAD(&ctx.cached_list);
-
- if (!ctx.meta_page)
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
return -ENOMEM;
- ret = r5c_recovery_flush_log(log, &ctx);
- __free_page(ctx.meta_page);
+ ctx->pos = log->last_checkpoint;
+ ctx->seq = log->last_cp_seq;
+ INIT_LIST_HEAD(&ctx->cached_list);
+ ctx->meta_page = alloc_page(GFP_KERNEL);
- if (ret)
- return ret;
+ if (!ctx->meta_page) {
+ ret = -ENOMEM;
+ goto meta_page;
+ }
- pos = ctx.pos;
- ctx.seq += 10000;
+ if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
+ ret = -ENOMEM;
+ goto ra_pool;
+ }
+ ret = r5c_recovery_flush_log(log, ctx);
- if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
+ if (ret)
+ goto error;
+
+ pos = ctx->pos;
+ ctx->seq += 10000;
+
+ if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
pr_debug("md/raid:%s: starting from clean shutdown\n",
mdname(mddev));
else
pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
- mdname(mddev), ctx.data_only_stripes,
- ctx.data_parity_stripes);
-
- if (ctx.data_only_stripes == 0) {
- log->next_checkpoint = ctx.pos;
- r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
- ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
- } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+ mdname(mddev), ctx->data_only_stripes,
+ ctx->data_parity_stripes);
+
+ if (ctx->data_only_stripes == 0) {
+ log->next_checkpoint = ctx->pos;
+ r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
+ ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+ } else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
mdname(mddev));
- return -EIO;
+ ret = -EIO;
+ goto error;
}
- log->log_start = ctx.pos;
- log->seq = ctx.seq;
+ log->log_start = ctx->pos;
+ log->seq = ctx->seq;
log->last_checkpoint = pos;
r5l_write_super(log, pos);
- r5c_recovery_flush_data_only_stripes(log, &ctx);
- return 0;
+ r5c_recovery_flush_data_only_stripes(log, ctx);
+ ret = 0;
+error:
+ r5l_recovery_free_ra_pool(log, ctx);
+ra_pool:
+ __free_page(ctx->meta_page);
+meta_page:
+ kfree(ctx);
+ return ret;
}
static void r5l_write_super(struct r5l_log *log, sector_t cp)
@@ -2618,11 +2817,11 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
atomic_dec(&conf->r5c_flushing_full_stripes);
atomic_dec(&conf->r5c_cached_full_stripes);
}
+
+ r5l_append_flush_payload(log, sh->sector);
}
-int
-r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
- struct stripe_head_state *s)
+int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
int pages = 0;
@@ -2785,6 +2984,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
struct request_queue *q = bdev_get_queue(rdev->bdev);
struct r5l_log *log;
+ char b[BDEVNAME_SIZE];
+
+ pr_debug("md/raid:%s: using device %s as journal\n",
+ mdname(conf->mddev), bdevname(rdev->bdev, b));
if (PAGE_SIZE != 4096)
return -EINVAL;
@@ -2887,8 +3090,13 @@ io_kc:
return -EINVAL;
}
-void r5l_exit_log(struct r5l_log *log)
+void r5l_exit_log(struct r5conf *conf)
{
+ struct r5l_log *log = conf->log;
+
+ conf->log = NULL;
+ synchronize_rcu();
+
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
mempool_destroy(log->meta_pool);
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
new file mode 100644
index 000000000000..27097101ccca
--- /dev/null
+++ b/drivers/md/raid5-log.h
@@ -0,0 +1,115 @@
+#ifndef _RAID5_LOG_H
+#define _RAID5_LOG_H
+
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern void r5l_exit_log(struct r5conf *conf);
+extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
+extern void r5l_write_stripe_run(struct r5l_log *log);
+extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
+extern void r5l_stripe_write_finished(struct stripe_head *sh);
+extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+extern void r5l_quiesce(struct r5l_log *log, int state);
+extern bool r5l_log_disk_error(struct r5conf *conf);
+extern bool r5c_is_writeback(struct r5l_log *log);
+extern int
+r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s, int disks);
+extern void
+r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s);
+extern void r5c_release_extra_page(struct stripe_head *sh);
+extern void r5c_use_extra_page(struct stripe_head *sh);
+extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+extern void r5c_handle_cached_data_endio(struct r5conf *conf,
+ struct stripe_head *sh, int disks);
+extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
+extern void r5c_make_stripe_write_out(struct stripe_head *sh);
+extern void r5c_flush_cache(struct r5conf *conf, int num);
+extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
+extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+extern struct md_sysfs_entry r5c_journal_mode;
+extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+
+extern struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx);
+extern int ppl_init_log(struct r5conf *conf);
+extern void ppl_exit_log(struct r5conf *conf);
+extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
+extern void ppl_write_stripe_run(struct r5conf *conf);
+extern void ppl_stripe_write_finished(struct stripe_head *sh);
+extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
+
+static inline bool raid5_has_ppl(struct r5conf *conf)
+{
+ return test_bit(MD_HAS_PPL, &conf->mddev->flags);
+}
+
+static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s)
+{
+ struct r5conf *conf = sh->raid_conf;
+
+ if (conf->log) {
+ if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+ /* writing out phase */
+ if (s->waiting_extra_page)
+ return 0;
+ return r5l_write_stripe(conf->log, sh);
+ } else if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
+ /* caching phase */
+ return r5c_cache_data(conf->log, sh);
+ }
+ } else if (raid5_has_ppl(conf)) {
+ return ppl_write_stripe(conf, sh);
+ }
+
+ return -EAGAIN;
+}
+
+static inline void log_stripe_write_finished(struct stripe_head *sh)
+{
+ struct r5conf *conf = sh->raid_conf;
+
+ if (conf->log)
+ r5l_stripe_write_finished(sh);
+ else if (raid5_has_ppl(conf))
+ ppl_stripe_write_finished(sh);
+}
+
+static inline void log_write_stripe_run(struct r5conf *conf)
+{
+ if (conf->log)
+ r5l_write_stripe_run(conf->log);
+ else if (raid5_has_ppl(conf))
+ ppl_write_stripe_run(conf);
+}
+
+static inline void log_exit(struct r5conf *conf)
+{
+ if (conf->log)
+ r5l_exit_log(conf);
+ else if (raid5_has_ppl(conf))
+ ppl_exit_log(conf);
+}
+
+static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev,
+ bool ppl)
+{
+ if (journal_dev)
+ return r5l_init_log(conf, journal_dev);
+ else if (ppl)
+ return ppl_init_log(conf);
+
+ return 0;
+}
+
+static inline int log_modify(struct r5conf *conf, struct md_rdev *rdev, bool add)
+{
+ if (raid5_has_ppl(conf))
+ return ppl_modify_log(conf, rdev, add);
+
+ return 0;
+}
+
+#endif
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
new file mode 100644
index 000000000000..5d25bebf3328
--- /dev/null
+++ b/drivers/md/raid5-ppl.c
@@ -0,0 +1,1271 @@
+/*
+ * Partial Parity Log for closing the RAID5 write hole
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/flex_array.h>
+#include <linux/async_tx.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
+#include "raid5.h"
+
+/*
+ * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
+ * partial parity data. The header contains an array of entries
+ * (struct ppl_header_entry) which describe the logged write requests.
+ * Partial parity for the entries comes after the header, written in the same
+ * sequence as the entries:
+ *
+ * Header
+ * entry0
+ * ...
+ * entryN
+ * PP data
+ * PP for entry0
+ * ...
+ * PP for entryN
+ *
+ * An entry describes one or more consecutive stripe_heads, up to a full
+ * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
+ * number of stripe_heads in the entry and n is the number of modified data
+ * disks. Every stripe_head in the entry must write to the same data disks.
+ * An example of a valid case described by a single entry (writes to the first
+ * stripe of a 4 disk array, 16k chunk size):
+ *
+ * sh->sector dd0 dd1 dd2 ppl
+ * +-----+-----+-----+
+ * 0 | --- | --- | --- | +----+
+ * 8 | -W- | -W- | --- | | pp | data_sector = 8
+ * 16 | -W- | -W- | --- | | pp | data_size = 3 * 2 * 4k
+ * 24 | -W- | -W- | --- | | pp | pp_size = 3 * 4k
+ * +-----+-----+-----+ +----+
+ *
+ * data_sector is the first raid sector of the modified data, data_size is the
+ * total size of modified data and pp_size is the size of partial parity for
+ * this entry. Entries for full stripe writes contain no partial parity
+ * (pp_size = 0), they only mark the stripes for which parity should be
+ * recalculated after an unclean shutdown. Every entry holds a checksum of its
+ * partial parity, the header also has a checksum of the header itself.
+ *
+ * A write request is always logged to the PPL instance stored on the parity
+ * disk of the corresponding stripe. For each member disk there is one ppl_log
+ * used to handle logging for this disk, independently from others. They are
+ * grouped in child_logs array in struct ppl_conf, which is assigned to
+ * r5conf->log_private.
+ *
+ * ppl_io_unit represents a full PPL write, header_page contains the ppl_header.
+ * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head
+ * can be appended to the last entry if it meets the conditions for a valid
+ * entry described above, otherwise a new entry is added. Checksums of entries
+ * are calculated incrementally as stripes containing partial parity are being
+ * added. ppl_submit_iounit() calculates the checksum of the header and submits
+ * a bio containing the header page and partial parity pages (sh->ppl_page) for
+ * all stripes of the io_unit. When the PPL write completes, the stripes
+ * associated with the io_unit are released and raid5d starts writing their data
+ * and parity. When all stripes are written, the io_unit is freed and the next
+ * can be submitted.
+ *
+ * An io_unit is used to gather stripes until it is submitted or becomes full
+ * (if the maximum number of entries or size of PPL is reached). Another io_unit
+ * can't be submitted until the previous has completed (PPL and stripe
+ * data+parity is written). The log->io_list tracks all io_units of a log
+ * (for a single member disk). New io_units are added to the end of the list
+ * and the first io_unit is submitted, if it is not submitted already.
+ * The current io_unit accepting new stripes is always at the end of the list.
+ */
+
+struct ppl_conf {
+ struct mddev *mddev;
+
+ /* array of child logs, one for each raid disk */
+ struct ppl_log *child_logs;
+ int count;
+
+ int block_size; /* the logical block size used for data_sector
+ * in ppl_header_entry */
+ u32 signature; /* raid array identifier */
+ atomic64_t seq; /* current log write sequence number */
+
+ struct kmem_cache *io_kc;
+ mempool_t *io_pool;
+ struct bio_set *bs;
+
+ /* used only for recovery */
+ int recovered_entries;
+ int mismatch_count;
+
+ /* stripes to retry if failed to allocate io_unit */
+ struct list_head no_mem_stripes;
+ spinlock_t no_mem_stripes_lock;
+};
+
+struct ppl_log {
+ struct ppl_conf *ppl_conf; /* shared between all log instances */
+
+ struct md_rdev *rdev; /* array member disk associated with
+ * this log instance */
+ struct mutex io_mutex;
+ struct ppl_io_unit *current_io; /* current io_unit accepting new data
+ * always at the end of io_list */
+ spinlock_t io_list_lock;
+ struct list_head io_list; /* all io_units of this log */
+};
+
+#define PPL_IO_INLINE_BVECS 32
+
+struct ppl_io_unit {
+ struct ppl_log *log;
+
+ struct page *header_page; /* for ppl_header */
+
+ unsigned int entries_count; /* number of entries in ppl_header */
+ unsigned int pp_size; /* current total size of partial parity */
+
+ u64 seq; /* sequence number of this log write */
+ struct list_head log_sibling; /* log->io_list */
+
+ struct list_head stripe_list; /* stripes added to the io_unit */
+ atomic_t pending_stripes; /* how many stripes not written to raid */
+
+ bool submitted; /* true if write to log started */
+
+ /* inline bio and its biovec for submitting the iounit */
+ struct bio bio;
+ struct bio_vec biovec[PPL_IO_INLINE_BVECS];
+};
+
+struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
+{
+ int disks = sh->disks;
+ struct page **srcs = flex_array_get(percpu->scribble, 0);
+ int count = 0, pd_idx = sh->pd_idx, i;
+ struct async_submit_ctl submit;
+
+ pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+ /*
+ * Partial parity is the XOR of stripe data chunks that are not changed
+ * during the write request. Depending on available data
+ * (read-modify-write vs. reconstruct-write case) we calculate it
+ * differently.
+ */
+ if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+ /*
+ * rmw: xor old data and parity from updated disks
+ * This is calculated earlier by ops_run_prexor5() so just copy
+ * the parity dev page.
+ */
+ srcs[count++] = sh->dev[pd_idx].page;
+ } else if (sh->reconstruct_state == reconstruct_state_drain_run) {
+ /* rcw: xor data from all not updated disks */
+ for (i = disks; i--;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_UPTODATE, &dev->flags))
+ srcs[count++] = dev->page;
+ }
+ } else {
+ return tx;
+ }
+
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
+ NULL, sh, flex_array_get(percpu->scribble, 0)
+ + sizeof(struct page *) * (sh->disks + 2));
+
+ if (count == 1)
+ tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
+ &submit);
+ else
+ tx = async_xor(sh->ppl_page, srcs, 0, count, PAGE_SIZE,
+ &submit);
+
+ return tx;
+}
+
+static void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data)
+{
+ struct kmem_cache *kc = pool_data;
+ struct ppl_io_unit *io;
+
+ io = kmem_cache_alloc(kc, gfp_mask);
+ if (!io)
+ return NULL;
+
+ io->header_page = alloc_page(gfp_mask);
+ if (!io->header_page) {
+ kmem_cache_free(kc, io);
+ return NULL;
+ }
+
+ return io;
+}
+
+static void ppl_io_pool_free(void *element, void *pool_data)
+{
+ struct kmem_cache *kc = pool_data;
+ struct ppl_io_unit *io = element;
+
+ __free_page(io->header_page);
+ kmem_cache_free(kc, io);
+}
+
+/*
+ * Get an io_unit from the pool without sleeping (GFP_NOWAIT) and initialize
+ * it for @log: empty lists, an inline bio and a cleared header page carrying
+ * the configured signature and the next generation number.
+ * @sh is not used here. Returns NULL when the pool has nothing to give.
+ */
+static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
+					  struct stripe_head *sh)
+{
+	struct ppl_conf *conf = log->ppl_conf;
+	struct ppl_io_unit *unit = mempool_alloc(conf->io_pool, GFP_NOWAIT);
+	struct ppl_header *hdr;
+	struct page *saved_page;
+
+	if (!unit)
+		return NULL;
+
+	/* wipe the whole unit but keep the header page that came with it */
+	saved_page = unit->header_page;
+	memset(unit, 0, sizeof(*unit));
+	unit->header_page = saved_page;
+
+	unit->log = log;
+	INIT_LIST_HEAD(&unit->log_sibling);
+	INIT_LIST_HEAD(&unit->stripe_list);
+	atomic_set(&unit->pending_stripes, 0);
+	bio_init(&unit->bio, unit->biovec, PPL_IO_INLINE_BVECS);
+
+	hdr = page_address(unit->header_page);
+	clear_page(hdr);
+	memset(hdr->reserved, 0xff, PPL_HDR_RESERVED);
+	hdr->signature = cpu_to_le32(conf->signature);
+
+	/* the sequence number doubles as the on-disk generation */
+	unit->seq = atomic64_add_return(1, &conf->seq);
+	hdr->generation = cpu_to_le64(unit->seq);
+
+	return unit;
+}
+
+/*
+ * Add stripe @sh to the io_unit currently being built for @log, starting a
+ * new io_unit when there is none or the current one is full.
+ *
+ * The stripe is either appended to the last header entry (when it directly
+ * follows the previously logged stripe and covers the same disks) or gets a
+ * new entry. For anything but a full stripe write the partial parity page is
+ * accounted for and folded into the entry checksum.
+ *
+ * Returns 0 on success or -ENOMEM if no io_unit could be obtained.
+ */
+static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct ppl_io_unit *io = log->current_io;
+	struct ppl_header_entry *entry = NULL;
+	struct ppl_header *hdr;
+	unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
+	sector_t first_sector = 0;
+	int nr_data = 0;
+	int d;
+
+	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);
+
+	/* a full io_unit cannot take this stripe, pretend there is none */
+	if (io && (io->pp_size == entry_space ||
+		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
+		pr_debug("%s: add io_unit blocked by seq: %llu\n",
+			 __func__, io->seq);
+		io = NULL;
+	}
+
+	if (!io) {
+		io = ppl_new_iounit(log, sh);
+		if (!io)
+			return -ENOMEM;
+
+		/* queue it behind the io_units already owned by this log */
+		spin_lock_irq(&log->io_list_lock);
+		list_add_tail(&io->log_sibling, &log->io_list);
+		spin_unlock_irq(&log->io_list_lock);
+
+		log->current_io = io;
+	}
+
+	/* count the data disks being written and find their lowest sector */
+	for (d = 0; d < sh->disks; d++) {
+		struct r5dev *dev = &sh->dev[d];
+
+		if (d == sh->pd_idx || !test_bit(R5_Wantwrite, &dev->flags))
+			continue;
+
+		if (nr_data == 0 || dev->sector < first_sector)
+			first_sector = dev->sector;
+		nr_data++;
+	}
+	BUG_ON(!nr_data);
+
+	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
+		 io->seq, (unsigned long long)first_sector, nr_data);
+
+	hdr = page_address(io->header_page);
+
+	if (io->entries_count > 0) {
+		struct ppl_header_entry *last =
+				&hdr->entries[io->entries_count - 1];
+		struct stripe_head *prev_sh = list_last_entry(
+				&io->stripe_list, struct stripe_head, log_list);
+		u64 last_sector = le64_to_cpu(last->data_sector);
+		u32 last_size = le32_to_cpu(last->data_size);
+
+		/*
+		 * The stripe can extend the last entry only if it comes right
+		 * after the last logged stripe, lies in the same chunk and
+		 * writes to the same disks. Shifting by the chunk size
+		 * logarithm avoids a 64-bit division.
+		 */
+		if (sh->sector == prev_sh->sector + STRIPE_SECTORS) {
+			int chunk_shift = ilog2(conf->chunk_sectors);
+
+			if ((first_sector >> chunk_shift ==
+			     last_sector >> chunk_shift) &&
+			    ((first_sector - last_sector) * nr_data ==
+			     last_size >> 9))
+				entry = last;
+		}
+	}
+
+	if (!entry) {
+		/* start a new entry; the checksum is seeded with all ones */
+		entry = &hdr->entries[io->entries_count++];
+		entry->data_sector = cpu_to_le64(first_sector);
+		entry->parity_disk = cpu_to_le32(sh->pd_idx);
+		entry->checksum = cpu_to_le32(~0);
+	}
+
+	le32_add_cpu(&entry->data_size, nr_data << PAGE_SHIFT);
+
+	/* a full stripe write carries no partial parity */
+	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
+		u32 crc;
+
+		le32_add_cpu(&entry->pp_size, PAGE_SIZE);
+		io->pp_size += PAGE_SIZE;
+		crc = crc32c_le(le32_to_cpu(entry->checksum),
+				page_address(sh->ppl_page), PAGE_SIZE);
+		entry->checksum = cpu_to_le32(crc);
+	}
+
+	list_add_tail(&sh->log_list, &io->stripe_list);
+	atomic_inc(&io->pending_stripes);
+	sh->ppl_io = io;
+
+	return 0;
+}
+
+/*
+ * Try to log stripe @sh in the PPL of its parity disk.
+ *
+ * Returns 0 when the stripe was trapped by the log: it was either added to an
+ * io_unit or, if no io_unit could be allocated, parked on no_mem_stripes.
+ * Returns -EAGAIN when the stripe is not logged: it already has an io_unit,
+ * is syncing, has no ppl_page, its parity device is not both Wantwrite and
+ * Insync, or the log device is missing or Faulty.
+ */
+int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	struct ppl_log *log;
+	int ret = -EAGAIN;
+
+	if (sh->ppl_io || test_bit(STRIPE_SYNCING, &sh->state) ||
+	    !sh->ppl_page)
+		goto not_logged;
+
+	if (!test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
+	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags))
+		goto not_logged;
+
+	log = &ppl_conf->child_logs[sh->pd_idx];
+
+	mutex_lock(&log->io_mutex);
+
+	if (log->rdev && !test_bit(Faulty, &log->rdev->flags)) {
+		set_bit(STRIPE_LOG_TRAPPED, &sh->state);
+		clear_bit(STRIPE_DELAYED, &sh->state);
+		atomic_inc(&sh->count);
+
+		/* out of io_units: park the stripe until one is released */
+		if (ppl_log_stripe(log, sh)) {
+			spin_lock_irq(&ppl_conf->no_mem_stripes_lock);
+			list_add_tail(&sh->log_list,
+				      &ppl_conf->no_mem_stripes);
+			spin_unlock_irq(&ppl_conf->no_mem_stripes_lock);
+		}
+
+		ret = 0;
+	}
+
+	mutex_unlock(&log->io_mutex);
+
+	return ret;
+
+not_logged:
+	clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+	return -EAGAIN;
+}
+
+/*
+ * Completion handler for a PPL write. On error the member device holding the
+ * log is reported to md_error(). In every case all stripes attached to the
+ * io_unit are detached, flagged STRIPE_HANDLE and released.
+ */
+static void ppl_log_endio(struct bio *bio)
+{
+	struct ppl_io_unit *io = bio->bi_private;
+	struct ppl_log *log = io->log;
+	struct stripe_head *cur, *tmp;
+
+	pr_debug("%s: seq: %llu\n", __func__, io->seq);
+
+	if (bio->bi_error)
+		md_error(log->ppl_conf->mddev, log->rdev);
+
+	/* _safe variant: each stripe is unlinked while walking the list */
+	list_for_each_entry_safe(cur, tmp, &io->stripe_list, log_list) {
+		list_del_init(&cur->log_list);
+
+		set_bit(STRIPE_HANDLE, &cur->state);
+		raid5_release_stripe(cur);
+	}
+}
+
+/* Trace one bio belonging to @io and hand it to the block layer. */
+static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
+{
+	char devname[BDEVNAME_SIZE];
+
+	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
+		 __func__, io->seq, bio->bi_iter.bi_size,
+		 (unsigned long long)bio->bi_iter.bi_sector,
+		 bdevname(bio->bi_bdev, devname));
+
+	submit_bio(bio);
+}
+
+/*
+ * Finalize the header of @io and write the header page followed by the
+ * partial parity pages of its stripes to the PPL area of the log device.
+ *
+ * If the log device is gone or Faulty nothing is written and the io_unit is
+ * completed right away through ppl_log_endio(). When the pages do not fit
+ * into one bio, further bios are allocated and chained to the previous one.
+ */
+static void ppl_submit_iounit(struct ppl_io_unit *io)
+{
+	struct ppl_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct ppl_header *hdr = page_address(io->header_page);
+	struct bio *cur = &io->bio;
+	struct stripe_head *sh;
+	int sector_shift;
+	int n;
+
+	cur->bi_private = io;
+
+	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
+		ppl_log_endio(cur);
+		return;
+	}
+
+	/*
+	 * Put the entries into their final form: data_sector is scaled from
+	 * 512-byte sectors to block_size units and the running checksum is
+	 * inverted.
+	 */
+	sector_shift = ilog2(ppl_conf->block_size >> 9);
+	for (n = 0; n < io->entries_count; n++) {
+		struct ppl_header_entry *entry = &hdr->entries[n];
+		u64 sector = le64_to_cpu(entry->data_sector);
+		u32 crc = le32_to_cpu(entry->checksum);
+
+		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
+			 __func__, io->seq, n, sector,
+			 le32_to_cpu(entry->pp_size),
+			 le32_to_cpu(entry->data_size));
+
+		entry->data_sector = cpu_to_le64(sector >> sector_shift);
+		entry->checksum = cpu_to_le32(~crc);
+	}
+
+	hdr->entries_count = cpu_to_le32(io->entries_count);
+	hdr->checksum = cpu_to_le32(~crc32c_le(~0, hdr, PPL_HEADER_SIZE));
+
+	/* the first bio is the one embedded in the io_unit */
+	cur->bi_end_io = ppl_log_endio;
+	cur->bi_opf = REQ_OP_WRITE | REQ_FUA;
+	cur->bi_bdev = log->rdev->bdev;
+	cur->bi_iter.bi_sector = log->rdev->ppl.sector;
+	bio_add_page(cur, io->header_page, PAGE_SIZE, 0);
+
+	list_for_each_entry(sh, &io->stripe_list, log_list) {
+		struct bio *full;
+
+		/* entries for full stripe writes have no partial parity */
+		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
+			continue;
+
+		if (bio_add_page(cur, sh->ppl_page, PAGE_SIZE, 0))
+			continue;
+
+		/*
+		 * The page did not fit: continue in a new bio that starts
+		 * where the full one ends, chain it and submit the full one.
+		 */
+		full = cur;
+		cur = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, ppl_conf->bs);
+		cur->bi_opf = full->bi_opf;
+		cur->bi_bdev = full->bi_bdev;
+		cur->bi_iter.bi_sector = bio_end_sector(full);
+		bio_add_page(cur, sh->ppl_page, PAGE_SIZE, 0);
+
+		bio_chain(cur, full);
+		ppl_submit_iounit_bio(io, full);
+	}
+
+	ppl_submit_iounit_bio(io, cur);
+}
+
+/*
+ * Submit the io_unit at the head of @log's io_list unless it has already
+ * been submitted. If that io_unit is the one still being filled, it stops
+ * being the current one so that later stripes start a new io_unit.
+ */
+static void ppl_submit_current_io(struct ppl_log *log)
+{
+	struct ppl_io_unit *head;
+
+	spin_lock_irq(&log->io_list_lock);
+	head = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
+					log_sibling);
+	if (head && head->submitted)
+		head = NULL;
+	spin_unlock_irq(&log->io_list_lock);
+
+	if (!head)
+		return;
+
+	head->submitted = true;
+
+	if (log->current_io == head)
+		log->current_io = NULL;
+
+	ppl_submit_iounit(head);
+}
+
+/*
+ * Walk all child logs of the array and, holding each log's io_mutex, submit
+ * its pending io_unit if there is one.
+ */
+void ppl_write_stripe_run(struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	int idx;
+
+	for (idx = 0; idx < ppl_conf->count; idx++) {
+		struct ppl_log *child = &ppl_conf->child_logs[idx];
+
+		mutex_lock(&child->io_mutex);
+		ppl_submit_current_io(child);
+		mutex_unlock(&child->io_mutex);
+	}
+}
+
+/*
+ * Retire @io: unlink it from its log, return it to the io_unit pool and, now
+ * that an io_unit is free again, wake one stripe that was parked on
+ * no_mem_stripes. Interrupts stay disabled across both locked sections.
+ */
+static void ppl_io_unit_finished(struct ppl_io_unit *io)
+{
+	struct ppl_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct stripe_head *waiter;
+	unsigned long irq_flags;
+
+	pr_debug("%s: seq: %llu\n", __func__, io->seq);
+
+	local_irq_save(irq_flags);
+
+	spin_lock(&log->io_list_lock);
+	list_del(&io->log_sibling);
+	spin_unlock(&log->io_list_lock);
+
+	mempool_free(io, ppl_conf->io_pool);
+
+	spin_lock(&ppl_conf->no_mem_stripes_lock);
+	waiter = list_first_entry_or_null(&ppl_conf->no_mem_stripes,
+					  struct stripe_head, log_list);
+	if (waiter) {
+		list_del_init(&waiter->log_list);
+		set_bit(STRIPE_HANDLE, &waiter->state);
+		raid5_release_stripe(waiter);
+	}
+	spin_unlock(&ppl_conf->no_mem_stripes_lock);
+
+	local_irq_restore(irq_flags);
+}
+
+/*
+ * Detach @sh from its io_unit, if it has one, and retire the io_unit once
+ * its last pending stripe is gone.
+ */
+void ppl_stripe_write_finished(struct stripe_head *sh)
+{
+	struct ppl_io_unit *io = sh->ppl_io;
+
+	sh->ppl_io = NULL;
+
+	if (!io)
+		return;
+
+	if (atomic_dec_and_test(&io->pending_stripes))
+		ppl_io_unit_finished(io);
+}
+
+/*
+ * Xor the first @size bytes of @page2 into @page1 and wait for the operation
+ * to finish before returning.
+ */
+static void ppl_xor(int size, struct page *page1, struct page *page2)
+{
+	struct page *srcs[2];
+	struct async_submit_ctl ctl;
+	struct dma_async_tx_descriptor *desc;
+
+	/* page1 is both the destination and the first source */
+	srcs[0] = page1;
+	srcs[1] = page2;
+
+	init_async_submit(&ctl, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
+			  NULL, NULL, NULL, NULL);
+	desc = async_xor(page1, srcs, 0, 2, size, &ctl);
+
+	async_tx_quiesce(&desc);
+}
+
+/*
+ * PPL recovery strategy: xor partial parity and data from all modified data
+ * disks within a stripe and write the result as the new stripe parity. If all
+ * stripe data disks are modified (full stripe write), no partial parity is
+ * available, so just xor the data disks.
+ *
+ * Recovery of a PPL entry shall occur only if all modified data disks are
+ * available and read from all of them succeeds.
+ *
+ * A PPL entry applies to a stripe, partial parity size for an entry is at most
+ * the size of the chunk. Examples of possible cases for a single entry:
+ *
+ * case 0: single data disk write:
+ * data0 data1 data2 ppl parity
+ * +--------+--------+--------+ +--------------------+
+ * | ------ | ------ | ------ | +----+ | (no change) |
+ * | ------ | -data- | ------ | | pp | -> | data1 ^ pp |
+ * | ------ | -data- | ------ | | pp | -> | data1 ^ pp |
+ * | ------ | ------ | ------ | +----+ | (no change) |
+ * +--------+--------+--------+ +--------------------+
+ * pp_size = data_size
+ *
+ * case 1: more than one data disk write:
+ * data0 data1 data2 ppl parity
+ * +--------+--------+--------+ +--------------------+
+ * | ------ | ------ | ------ | +----+ | (no change) |
+ * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
+ * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
+ * | ------ | ------ | ------ | +----+ | (no change) |
+ * +--------+--------+--------+ +--------------------+
+ * pp_size = data_size / modified_data_disks
+ *
+ * case 2: write to all data disks (also full stripe write):
+ * data0 data1 data2 parity
+ * +--------+--------+--------+ +--------------------+
+ * | ------ | ------ | ------ | | (no change) |
+ * | -data- | -data- | -data- | --------> | xor all data |
+ * | ------ | ------ | ------ | --------> | (no change) |
+ * | ------ | ------ | ------ | | (no change) |
+ * +--------+--------+--------+ +--------------------+
+ * pp_size = 0
+ *
+ * The following cases are possible only in other implementations. The recovery
+ * code can handle them, but they are not generated at runtime because they can
+ * be reduced to cases 0, 1 and 2:
+ *
+ * case 3:
+ * data0 data1 data2 ppl parity
+ * +--------+--------+--------+ +----+ +--------------------+
+ * | ------ | -data- | -data- | | pp | | data1 ^ data2 ^ pp |
+ * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
+ * | -data- | -data- | -data- | | -- | -> | xor all data |
+ * | -data- | -data- | ------ | | pp | | data0 ^ data1 ^ pp |
+ * +--------+--------+--------+ +----+ +--------------------+
+ * pp_size = chunk_size
+ *
+ * case 4:
+ * data0 data1 data2 ppl parity
+ * +--------+--------+--------+ +----+ +--------------------+
+ * | ------ | -data- | ------ | | pp | | data1 ^ pp |
+ * | ------ | ------ | ------ | | -- | -> | (no change) |
+ * | ------ | ------ | ------ | | -- | -> | (no change) |
+ * | -data- | ------ | ------ | | pp | | data0 ^ pp |
+ * +--------+--------+--------+ +----+ +--------------------+
+ * pp_size = chunk_size
+ */
+static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
+ sector_t ppl_sector)
+{
+ struct ppl_conf *ppl_conf = log->ppl_conf;
+ struct mddev *mddev = ppl_conf->mddev;
+ struct r5conf *conf = mddev->private;
+ int block_size = ppl_conf->block_size;
+ struct page *page1;
+ struct page *page2;
+ sector_t r_sector_first;
+ sector_t r_sector_last;
+ int strip_sectors;
+ int data_disks;
+ int i;
+ int ret = 0;
+ char b[BDEVNAME_SIZE];
+ unsigned int pp_size = le32_to_cpu(e->pp_size);
+ unsigned int data_size = le32_to_cpu(e->data_size);
+
+ /*
+ * page1 accumulates the xor result that is written out as parity,
+ * page2 is the buffer every source block is read into.
+ */
+ page1 = alloc_page(GFP_KERNEL);
+ page2 = alloc_page(GFP_KERNEL);
+
+ if (!page1 || !page2) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* data_sector is stored in block_size units, convert to 512-byte sectors */
+ r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);
+
+ /*
+ * Work out how many data disks the entry spans, how many sectors have
+ * to be processed per disk and where the modified array range ends.
+ */
+ if ((pp_size >> 9) < conf->chunk_sectors) {
+ if (pp_size > 0) {
+ data_disks = data_size / pp_size;
+ strip_sectors = pp_size >> 9;
+ } else {
+ /* no partial parity: the entry covers all data disks */
+ data_disks = conf->raid_disks - conf->max_degraded;
+ strip_sectors = (data_size >> 9) / data_disks;
+ }
+ r_sector_last = r_sector_first +
+ (data_disks - 1) * conf->chunk_sectors +
+ strip_sectors;
+ } else {
+ /* partial parity covers at least a whole chunk */
+ data_disks = conf->raid_disks - conf->max_degraded;
+ strip_sectors = conf->chunk_sectors;
+ r_sector_last = r_sector_first + (data_size >> 9);
+ }
+
+ pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
+ (unsigned long long)r_sector_first,
+ (unsigned long long)r_sector_last);
+
+ /* if start and end is 4k aligned, use a 4k block */
+ if (block_size == 512 &&
+ (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
+ (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
+ block_size = STRIPE_SIZE;
+
+ /* iterate through blocks in strip */
+ for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
+ bool update_parity = false;
+ sector_t parity_sector;
+ struct md_rdev *parity_rdev;
+ struct stripe_head sh;
+ int disk;
+ int indent = 0;
+
+ pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
+ indent += 2;
+
+ /* start each block from zero so the xors below build the parity */
+ memset(page_address(page1), 0, PAGE_SIZE);
+
+ /* iterate through data member disks */
+ for (disk = 0; disk < data_disks; disk++) {
+ int dd_idx;
+ struct md_rdev *rdev;
+ sector_t sector;
+ sector_t r_sector = r_sector_first + i +
+ (disk * conf->chunk_sectors);
+
+ pr_debug("%s:%*s data member disk %d start\n",
+ __func__, indent, "", disk);
+ indent += 2;
+
+ /* sectors at or past the end of the entry were not modified */
+ if (r_sector >= r_sector_last) {
+ pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
+ __func__, indent, "",
+ (unsigned long long)r_sector);
+ indent -= 2;
+ continue;
+ }
+
+ update_parity = true;
+
+ /* map raid sector to member disk */
+ sector = raid5_compute_sector(conf, r_sector, 0,
+ &dd_idx, NULL);
+ pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
+ __func__, indent, "",
+ (unsigned long long)r_sector, dd_idx,
+ (unsigned long long)sector);
+
+ /* a missing data disk means this block's parity is left alone */
+ rdev = conf->disks[dd_idx].rdev;
+ if (!rdev) {
+ pr_debug("%s:%*s data member disk %d missing\n",
+ __func__, indent, "", dd_idx);
+ update_parity = false;
+ break;
+ }
+
+ pr_debug("%s:%*s reading data member disk %s sector %llu\n",
+ __func__, indent, "", bdevname(rdev->bdev, b),
+ (unsigned long long)sector);
+ if (!sync_page_io(rdev, sector, block_size, page2,
+ REQ_OP_READ, 0, false)) {
+ md_error(mddev, rdev);
+ pr_debug("%s:%*s read failed!\n", __func__,
+ indent, "");
+ ret = -EIO;
+ goto out;
+ }
+
+ /* fold the data block into the accumulated parity */
+ ppl_xor(block_size, page1, page2);
+
+ indent -= 2;
+ }
+
+ if (!update_parity)
+ continue;
+
+ /* add the logged partial parity, if the entry has any */
+ if (pp_size > 0) {
+ pr_debug("%s:%*s reading pp disk sector %llu\n",
+ __func__, indent, "",
+ (unsigned long long)(ppl_sector + i));
+ if (!sync_page_io(log->rdev,
+ ppl_sector - log->rdev->data_offset + i,
+ block_size, page2, REQ_OP_READ, 0,
+ false)) {
+ pr_debug("%s:%*s read failed!\n", __func__,
+ indent, "");
+ md_error(mddev, log->rdev);
+ ret = -EIO;
+ goto out;
+ }
+
+ ppl_xor(block_size, page1, page2);
+ }
+
+ /* map raid sector to parity disk */
+ parity_sector = raid5_compute_sector(conf, r_sector_first + i,
+ 0, &disk, &sh);
+ /* the computed parity disk must be the one recorded in the entry */
+ BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
+ parity_rdev = conf->disks[sh.pd_idx].rdev;
+
+ /* ...and it must be the device this log was read from */
+ BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
+ pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
+ __func__, indent, "",
+ (unsigned long long)parity_sector,
+ bdevname(parity_rdev->bdev, b));
+ if (!sync_page_io(parity_rdev, parity_sector, block_size,
+ page1, REQ_OP_WRITE, 0, false)) {
+ pr_debug("%s:%*s parity write error!\n", __func__,
+ indent, "");
+ md_error(mddev, parity_rdev);
+ ret = -EIO;
+ goto out;
+ }
+ }
+out:
+ /* either page may be NULL when we get here from the allocation check */
+ if (page1)
+ __free_page(page1);
+ if (page2)
+ __free_page(page2);
+ return ret;
+}
+
+/*
+ * Recover from the PPL described by @pplhdr, which was read from the member
+ * device of @log.
+ *
+ * For every header entry the partial parity is read back from the log area
+ * and its crc32c is compared with the checksum stored in the entry. Matching
+ * entries are passed to ppl_recover_entry() and counted in
+ * ppl_conf->recovered_entries; entries that do not match are counted in
+ * ppl_conf->mismatch_count and skipped. After the last entry the device
+ * cache is flushed.
+ *
+ * Returns 0 on success, -ENOMEM if the read buffer cannot be allocated, -EIO
+ * if reading the log fails, or the error returned by ppl_recover_entry() or
+ * by the flush.
+ */
+static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr)
+{
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct md_rdev *rdev = log->rdev;
+	struct mddev *mddev = rdev->mddev;
+	sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
+	u32 entries_count = le32_to_cpu(pplhdr->entries_count);
+	struct page *page;
+	int i;
+	int ret = 0;
+
+	/*
+	 * entries_count comes from disk and is used to index
+	 * pplhdr->entries[]. The write path never stores more than
+	 * PPL_HDR_MAX_ENTRIES entries in a header, so a larger value cannot
+	 * describe a valid log and would make the loop below walk past the
+	 * entries that exist. Handle it like a checksum mismatch: count it
+	 * and recover nothing from this log.
+	 */
+	if (entries_count > PPL_HDR_MAX_ENTRIES) {
+		pr_debug("%s: disk: %d invalid entries_count: %u\n",
+			 __func__, rdev->raid_disk, entries_count);
+		ppl_conf->mismatch_count++;
+		return 0;
+	}
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	/* iterate through all PPL entries saved */
+	for (i = 0; i < entries_count; i++) {
+		struct ppl_header_entry *e = &pplhdr->entries[i];
+		u32 pp_size = le32_to_cpu(e->pp_size);
+		sector_t sector = ppl_sector;
+		int ppl_entry_sectors = pp_size >> 9;
+		u32 crc, crc_stored;
+
+		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
+			 __func__, rdev->raid_disk, i,
+			 (unsigned long long)ppl_sector, pp_size);
+
+		crc = ~0;
+		crc_stored = le32_to_cpu(e->checksum);
+
+		/*
+		 * read partial parity for this entry, at most one page at a
+		 * time, and calculate its checksum
+		 */
+		while (pp_size) {
+			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;
+
+			if (!sync_page_io(rdev, sector - rdev->data_offset,
+					  s, page, REQ_OP_READ, 0, false)) {
+				md_error(mddev, rdev);
+				ret = -EIO;
+				goto out;
+			}
+
+			crc = crc32c_le(crc, page_address(page), s);
+
+			pp_size -= s;
+			sector += s >> 9;
+		}
+
+		crc = ~crc;
+
+		if (crc != crc_stored) {
+			/*
+			 * Don't recover this entry if the checksum does not
+			 * match, but keep going and try to recover other
+			 * entries.
+			 */
+			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
+				 __func__, crc_stored, crc);
+			ppl_conf->mismatch_count++;
+		} else {
+			ret = ppl_recover_entry(log, e, ppl_sector);
+			if (ret)
+				goto out;
+			ppl_conf->recovered_entries++;
+		}
+
+		/* the next entry's partial parity follows this one's */
+		ppl_sector += ppl_entry_sectors;
+	}
+
+	/* flush the disk cache after recovery if necessary */
+	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
+out:
+	__free_page(page);
+	return ret;
+}
+
+/*
+ * Write a PPL header without entries to the log area of @log's member
+ * device. The header holds only the reserved pattern, the configured
+ * signature and its checksum.
+ *
+ * Returns 0 on success, -ENOMEM if the page cannot be allocated, or -EIO if
+ * the write fails (the device is then also reported through md_error()).
+ */
+static int ppl_write_empty_header(struct ppl_log *log)
+{
+	struct md_rdev *rdev = log->rdev;
+	struct ppl_header *hdr;
+	struct page *hdr_page;
+	int ret = -EIO;
+
+	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
+		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);
+
+	hdr_page = alloc_page(GFP_NOIO | __GFP_ZERO);
+	if (!hdr_page)
+		return -ENOMEM;
+
+	/* the page is zeroed, so only the non-zero fields need filling in */
+	hdr = page_address(hdr_page);
+	memset(hdr->reserved, 0xff, PPL_HDR_RESERVED);
+	hdr->signature = cpu_to_le32(log->ppl_conf->signature);
+	hdr->checksum = cpu_to_le32(~crc32c_le(~0, hdr, PAGE_SIZE));
+
+	if (sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
+			 PPL_HEADER_SIZE, hdr_page, REQ_OP_WRITE | REQ_FUA, 0,
+			 false))
+		ret = 0;
+	else
+		md_error(rdev->mddev, rdev);
+
+	__free_page(hdr_page);
+	return ret;
+}
+
+/*
+ * Load the PPL of one member device: read the header, verify its checksum
+ * and signature and, when a dirty array is being started, recover from it.
+ *
+ * A checksum or signature mismatch is not an error; it is counted in
+ * ppl_conf->mismatch_count and recovery is skipped. While the array is being
+ * started (no personality yet) an empty header is written afterwards unless
+ * an error occurred. For external metadata the signature found on disk is
+ * adopted instead of being checked.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ppl_load_distributed(struct ppl_log *log)
+{
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct md_rdev *rdev = log->rdev;
+	struct mddev *mddev = rdev->mddev;
+	struct ppl_header *hdr;
+	struct page *hdr_page;
+	u32 crc_disk, crc_calc;
+	u32 sig;
+	int ret = 0;
+
+	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);
+
+	hdr_page = alloc_page(GFP_KERNEL);
+	if (!hdr_page)
+		return -ENOMEM;
+
+	/* read PPL header */
+	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
+			  PAGE_SIZE, hdr_page, REQ_OP_READ, 0, false)) {
+		md_error(mddev, rdev);
+		ret = -EIO;
+		goto out;
+	}
+	hdr = page_address(hdr_page);
+
+	/* the checksum is verified with the checksum field itself zeroed */
+	crc_disk = le32_to_cpu(hdr->checksum);
+	hdr->checksum = 0;
+	crc_calc = ~crc32c_le(~0, hdr, PAGE_SIZE);
+
+	if (crc_calc != crc_disk) {
+		pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
+			 __func__, crc_disk, crc_calc);
+		ppl_conf->mismatch_count++;
+		goto out;
+	}
+
+	sig = le32_to_cpu(hdr->signature);
+
+	if (!mddev->external) {
+		if (ppl_conf->signature != sig) {
+			pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
+				 __func__, sig, ppl_conf->signature);
+			ppl_conf->mismatch_count++;
+			goto out;
+		}
+	} else {
+		/*
+		 * For external metadata the header signature is set and
+		 * validated in userspace.
+		 */
+		ppl_conf->signature = sig;
+	}
+
+	/* attempt to recover from log if we are starting a dirty array */
+	if (!mddev->pers && mddev->recovery_cp != MaxSector)
+		ret = ppl_recover(log, hdr);
+out:
+	/* write empty header if we are starting the array */
+	if (!ret && !mddev->pers)
+		ret = ppl_write_empty_header(log);
+
+	__free_page(hdr_page);
+
+	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
+		 __func__, ret, ppl_conf->mismatch_count,
+		 ppl_conf->recovered_entries);
+	return ret;
+}
+
+/*
+ * Load the PPL from every member device that is present, stopping at the
+ * first error. With external metadata the signature read from each device
+ * must equal the one read from the first device, otherwise -EINVAL is
+ * returned.
+ */
+static int ppl_load(struct ppl_conf *ppl_conf)
+{
+	bool have_signature = false;
+	u32 first_signature = 0;
+	int ret = 0;
+	int idx;
+
+	for (idx = 0; idx < ppl_conf->count; idx++) {
+		struct ppl_log *log = &ppl_conf->child_logs[idx];
+
+		/* skip missing drive */
+		if (!log->rdev)
+			continue;
+
+		ret = ppl_load_distributed(log);
+		if (ret)
+			break;
+
+		/*
+		 * For external metadata we can't check if the signature is
+		 * correct on a single drive, but we can check if it is the same
+		 * on all drives.
+		 */
+		if (!ppl_conf->mddev->external)
+			continue;
+
+		if (!have_signature) {
+			first_signature = ppl_conf->signature;
+			have_signature = true;
+			continue;
+		}
+
+		if (first_signature != ppl_conf->signature) {
+			pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
+				mdname(ppl_conf->mddev));
+			ret = -EINVAL;
+			break;
+		}
+	}
+
+	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
+		 __func__, ret, ppl_conf->mismatch_count,
+		 ppl_conf->recovered_entries);
+	return ret;
+}
+
+/*
+ * Clear MD_HAS_PPL on the array and free everything hanging off @ppl_conf,
+ * then @ppl_conf itself. Also used on the error path of ppl_init_log(), so
+ * members may still be unset.
+ */
+static void __ppl_exit_log(struct ppl_conf *ppl_conf)
+{
+	struct mddev *mddev = ppl_conf->mddev;
+
+	clear_bit(MD_HAS_PPL, &mddev->flags);
+
+	if (ppl_conf->bs)
+		bioset_free(ppl_conf->bs);
+	kfree(ppl_conf->child_logs);
+
+	/* the pool hands its elements back to the cache, so it goes first */
+	mempool_destroy(ppl_conf->io_pool);
+	kmem_cache_destroy(ppl_conf->io_kc);
+
+	kfree(ppl_conf);
+}
+
+/* Tear down the PPL of @conf, if it has one, and forget the pointer to it. */
+void ppl_exit_log(struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+
+	if (!ppl_conf)
+		return;
+
+	__ppl_exit_log(ppl_conf);
+	conf->log_private = NULL;
+}
+
+/*
+ * Check that the PPL area configured for @rdev is usable and trim its size.
+ *
+ * The area must hold the header plus at least one stripe worth of partial
+ * parity and must not overlap the data area or, for native metadata, the
+ * superblock. On success rdev->ppl.size is replaced by the header size plus
+ * the payload size rounded down to a multiple of STRIPE_SECTORS.
+ *
+ * Returns 0, -ENOSPC if the area is too small or -EINVAL on an overlap.
+ */
+static int ppl_validate_rdev(struct md_rdev *rdev)
+{
+	char name[BDEVNAME_SIZE];
+	int payload_sectors;
+	int new_size;
+	bool overlap;
+
+	payload_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);
+
+	/* round down so the payload is cleanly divisible by stripe size */
+	if (payload_sectors > 0)
+		payload_sectors = rounddown(payload_sectors, STRIPE_SECTORS);
+
+	if (payload_sectors <= 0) {
+		pr_warn("md/raid:%s: PPL space too small on %s\n",
+			mdname(rdev->mddev), bdevname(rdev->bdev, name));
+		return -ENOSPC;
+	}
+
+	new_size = payload_sectors + (PPL_HEADER_SIZE >> 9);
+
+	/* a log in front of the data must end before it, otherwise the log
+	 * must start after the data ends */
+	if (rdev->ppl.sector < rdev->data_offset)
+		overlap = rdev->ppl.sector + new_size > rdev->data_offset;
+	else
+		overlap = rdev->data_offset + rdev->sectors > rdev->ppl.sector;
+
+	if (overlap) {
+		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
+			mdname(rdev->mddev), bdevname(rdev->bdev, name));
+		return -EINVAL;
+	}
+
+	/* ppl.offset is compared against the superblock, which sits at 0 */
+	if (!rdev->mddev->external) {
+		if (rdev->ppl.offset > 0)
+			overlap = rdev->ppl.offset < (rdev->sb_size >> 9);
+		else
+			overlap = rdev->ppl.offset + new_size > 0;
+
+		if (overlap) {
+			pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
+				mdname(rdev->mddev),
+				bdevname(rdev->bdev, name));
+			return -EINVAL;
+		}
+	}
+
+	rdev->ppl.size = new_size;
+
+	return 0;
+}
+
+/*
+ * Set up the distributed Partial Parity Log for @conf.
+ *
+ * PPL is refused (-EINVAL) unless PAGE_SIZE is 4096, the array is RAID5 and
+ * neither a bitmap nor a journal is in use. Otherwise the ppl_conf with its
+ * io_unit cache/pool, bioset and one child log per raid disk is allocated,
+ * every present member is validated and the logs are loaded (and recovered
+ * from, where applicable).
+ *
+ * On success conf->log_private points to the new ppl_conf and MD_HAS_PPL is
+ * set; on failure everything is torn down again and a negative errno is
+ * returned.
+ */
+int ppl_init_log(struct r5conf *conf)
+{
+	struct mddev *mddev = conf->mddev;
+	struct ppl_conf *ppl_conf;
+	bool wc_enabled = false;
+	int ret;
+	int d;
+
+	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
+		 mdname(mddev));
+
+	if (PAGE_SIZE != 4096)
+		return -EINVAL;
+
+	if (mddev->level != 5) {
+		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
+			mdname(mddev), mddev->level);
+		return -EINVAL;
+	}
+
+	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
+		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+		pr_warn("md/raid:%s PPL is not compatible with journal\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+
+	ppl_conf = kzalloc(sizeof(*ppl_conf), GFP_KERNEL);
+	if (!ppl_conf)
+		return -ENOMEM;
+
+	ppl_conf->mddev = mddev;
+
+	/* each of the allocations below fails with -ENOMEM */
+	ret = -ENOMEM;
+
+	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
+	if (!ppl_conf->io_kc)
+		goto err;
+
+	ppl_conf->io_pool = mempool_create(conf->raid_disks, ppl_io_pool_alloc,
+					   ppl_io_pool_free, ppl_conf->io_kc);
+	if (!ppl_conf->io_pool)
+		goto err;
+
+	ppl_conf->bs = bioset_create(conf->raid_disks, 0);
+	if (!ppl_conf->bs)
+		goto err;
+
+	ppl_conf->count = conf->raid_disks;
+	ppl_conf->child_logs = kcalloc(ppl_conf->count,
+				       sizeof(*ppl_conf->child_logs),
+				       GFP_KERNEL);
+	if (!ppl_conf->child_logs)
+		goto err;
+
+	atomic64_set(&ppl_conf->seq, 0);
+	INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
+	spin_lock_init(&ppl_conf->no_mem_stripes_lock);
+
+	if (mddev->external) {
+		/* the signature is taken from the headers when loading */
+		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
+	} else {
+		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid,
+						 sizeof(mddev->uuid));
+		ppl_conf->block_size = 512;
+	}
+
+	for (d = 0; d < ppl_conf->count; d++) {
+		struct ppl_log *log = &ppl_conf->child_logs[d];
+		struct md_rdev *rdev = conf->disks[d].rdev;
+
+		mutex_init(&log->io_mutex);
+		spin_lock_init(&log->io_list_lock);
+		INIT_LIST_HEAD(&log->io_list);
+
+		log->ppl_conf = ppl_conf;
+		log->rdev = rdev;
+
+		/* nothing to validate for a missing member */
+		if (!rdev)
+			continue;
+
+		ret = ppl_validate_rdev(rdev);
+		if (ret)
+			goto err;
+
+		if (test_bit(QUEUE_FLAG_WC,
+			     &bdev_get_queue(rdev->bdev)->queue_flags))
+			wc_enabled = true;
+	}
+
+	if (wc_enabled)
+		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
+			mdname(mddev));
+
+	/* load and possibly recover the logs from the member disks */
+	ret = ppl_load(ppl_conf);
+	if (ret)
+		goto err;
+
+	if (mddev->pers) {
+		/* no mismatch allowed when enabling PPL for a running array */
+		if (ppl_conf->mismatch_count > 0) {
+			ret = -EINVAL;
+			goto err;
+		}
+	} else if (mddev->recovery_cp == 0 && !mddev->degraded &&
+		   ppl_conf->recovered_entries > 0 &&
+		   ppl_conf->mismatch_count == 0) {
+		/*
+		 * If we are starting a dirty array and the recovery succeeds
+		 * without any issues, set the array as clean.
+		 */
+		mddev->recovery_cp = MaxSector;
+		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+	}
+
+	conf->log_private = ppl_conf;
+	set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
+
+	return 0;
+err:
+	__ppl_exit_log(ppl_conf);
+	return ret;
+}
+
+/*
+ * Attach @rdev to (add == true) or detach it from (add == false) the child
+ * log of its raid slot, under that log's io_mutex.
+ *
+ * When adding, the PPL area of @rdev is validated and an empty header is
+ * written to it. Returns 0 on success or when @rdev has no raid slot,
+ * -EINVAL for a NULL @rdev, -ENODEV if the slot is beyond the number of
+ * child logs, or the error from validation / writing the header.
+ */
+int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	struct ppl_log *child;
+	char name[BDEVNAME_SIZE];
+	int ret = 0;
+
+	if (!rdev)
+		return -EINVAL;
+
+	pr_debug("%s: disk: %d operation: %s dev: %s\n",
+		 __func__, rdev->raid_disk, add ? "add" : "remove",
+		 bdevname(rdev->bdev, name));
+
+	if (rdev->raid_disk < 0)
+		return 0;
+
+	if (rdev->raid_disk >= ppl_conf->count)
+		return -ENODEV;
+
+	child = &ppl_conf->child_logs[rdev->raid_disk];
+
+	mutex_lock(&child->io_mutex);
+	if (!add) {
+		child->rdev = NULL;
+	} else {
+		ret = ppl_validate_rdev(rdev);
+		if (ret == 0) {
+			child->rdev = rdev;
+			ret = ppl_write_empty_header(child);
+		}
+	}
+	mutex_unlock(&child->io_mutex);
+
+	return ret;
+}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2efdb0d67460..2e38cfac5b1d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -58,11 +58,13 @@
#include <linux/sched/signal.h>
#include <trace/events/block.h>
+#include <linux/list_sort.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"
+#include "raid5-log.h"
#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
@@ -156,17 +158,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
return slot;
}
-static void return_io(struct bio_list *return_bi)
-{
- struct bio *bi;
- while ((bi = bio_list_pop(return_bi)) != NULL) {
- bi->bi_iter.bi_size = 0;
- trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
- bi, 0);
- bio_endio(bi);
- }
-}
-
static void print_raid5_conf (struct r5conf *conf);
static int stripe_operations_active(struct stripe_head *sh)
@@ -176,6 +167,13 @@ static int stripe_operations_active(struct stripe_head *sh)
test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
+/*
+ * A stripe is low priority when it is flagged as an r5c full or partial
+ * stripe and is not in the caching phase.
+ */
+static bool stripe_is_lowprio(struct stripe_head *sh)
+{
+	bool r5c_stripe = test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
+			  test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+
+	if (!r5c_stripe)
+		return false;
+
+	return !test_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
@@ -191,7 +189,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
if (list_empty(&sh->lru)) {
struct r5worker_group *group;
group = conf->worker_groups + cpu_to_group(cpu);
- list_add_tail(&sh->lru, &group->handle_list);
+ if (stripe_is_lowprio(sh))
+ list_add_tail(&sh->lru, &group->loprio_list);
+ else
+ list_add_tail(&sh->lru, &group->handle_list);
group->stripes_cnt++;
sh->group = group;
}
@@ -254,7 +255,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
clear_bit(STRIPE_DELAYED, &sh->state);
clear_bit(STRIPE_BIT_DELAY, &sh->state);
if (conf->worker_cnt_per_group == 0) {
- list_add_tail(&sh->lru, &conf->handle_list);
+ if (stripe_is_lowprio(sh))
+ list_add_tail(&sh->lru,
+ &conf->loprio_list);
+ else
+ list_add_tail(&sh->lru,
+ &conf->handle_list);
} else {
raid5_wakeup_stripe_thread(sh);
return;
@@ -481,6 +487,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
sh->dev[i].page = page;
sh->dev[i].orig_page = page;
}
+
return 0;
}
@@ -729,7 +736,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -863,41 +870,107 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
return 1;
}
-static void flush_deferred_bios(struct r5conf *conf)
+static void dispatch_bio_list(struct bio_list *tmp)
{
- struct bio_list tmp;
struct bio *bio;
- if (!conf->batch_bio_dispatch || !conf->group_cnt)
+ while ((bio = bio_list_pop(tmp)))
+ generic_make_request(bio);
+}
+
+/*
+ * list_sort() comparison callback: order r5pending_data entries by ascending
+ * sector. @priv is unused.
+ */
+static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
+{
+	const struct r5pending_data *lhs = list_entry(a,
+				struct r5pending_data, sibling);
+	const struct r5pending_data *rhs = list_entry(b,
+				struct r5pending_data, sibling);
+
+	/* 1, -1 or 0 without subtracting the (possibly 64-bit) sectors */
+	return (lhs->sector > rhs->sector) - (lhs->sector < rhs->sector);
+}
+
+/*
+ * Move the bios of @target pending entries onto @list and recycle those
+ * entries onto conf->free_list. The pending list is sorted by sector first;
+ * dispatching resumes at conf->next_pending_data when it is set. Does
+ * nothing when no data is pending; hits BUG_ON if fewer than @target entries
+ * could be taken. Both callers visible in this patch hold
+ * conf->pending_bios_lock around the call.
+ */
+static void dispatch_defer_bios(struct r5conf *conf, int target,
+ struct bio_list *list)
+{
+ struct r5pending_data *data;
+ struct list_head *first, *next = NULL;
+ int cnt = 0;
+
+ if (conf->pending_data_cnt == 0)
+ return;
+
+ list_sort(NULL, &conf->pending_list, cmp_stripe);
+
+ /* remember the lowest-sector entry so the head can be put back later */
+ first = conf->pending_list.next;
+
+ /* temporarily move the head */
+ if (conf->next_pending_data)
+ list_move_tail(&conf->pending_list,
+ &conf->next_pending_data->sibling);
+
+ while (!list_empty(&conf->pending_list)) {
+ data = list_first_entry(&conf->pending_list,
+ struct r5pending_data, sibling);
+ /* the remembered first entry is being consumed, advance it */
+ if (&data->sibling == first)
+ first = data->sibling.next;
+ next = data->sibling.next;
+
+ bio_list_merge(list, &data->bios);
+ list_move(&data->sibling, &conf->free_list);
+ cnt++;
+ if (cnt >= target)
+ break;
+ }
+ conf->pending_data_cnt -= cnt;
+ BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
+
+ /* continue after the last dispatched entry next time, if there is one */
+ if (next != &conf->pending_list)
+ conf->next_pending_data = list_entry(next,
+ struct r5pending_data, sibling);
+ else
+ conf->next_pending_data = NULL;
+ /* list isn't empty */
+ if (first != &conf->pending_list)
+ list_move_tail(&conf->pending_list, first);
+}
+
+static void flush_deferred_bios(struct r5conf *conf)
+{
+ struct bio_list tmp = BIO_EMPTY_LIST;
+
+ if (conf->pending_data_cnt == 0)
return;
- bio_list_init(&tmp);
spin_lock(&conf->pending_bios_lock);
- bio_list_merge(&tmp, &conf->pending_bios);
- bio_list_init(&conf->pending_bios);
+ dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
+ BUG_ON(conf->pending_data_cnt != 0);
spin_unlock(&conf->pending_bios_lock);
- while ((bio = bio_list_pop(&tmp)))
- generic_make_request(bio);
+ dispatch_bio_list(&tmp);
}
-static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
+static void defer_issue_bios(struct r5conf *conf, sector_t sector,
+ struct bio_list *bios)
{
- /*
- * change group_cnt will drain all bios, so this is safe
- *
- * A read generally means a read-modify-write, which usually means a
- * randwrite, so we don't delay it
- */
- if (!conf->batch_bio_dispatch || !conf->group_cnt ||
- bio_op(bio) == REQ_OP_READ) {
- generic_make_request(bio);
- return;
- }
+ struct bio_list tmp = BIO_EMPTY_LIST;
+ struct r5pending_data *ent;
+
spin_lock(&conf->pending_bios_lock);
- bio_list_add(&conf->pending_bios, bio);
+ ent = list_first_entry(&conf->free_list, struct r5pending_data,
+ sibling);
+ list_move_tail(&ent->sibling, &conf->pending_list);
+ ent->sector = sector;
+ bio_list_init(&ent->bios);
+ bio_list_merge(&ent->bios, bios);
+ conf->pending_data_cnt++;
+ if (conf->pending_data_cnt >= PENDING_IO_MAX)
+ dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
+
spin_unlock(&conf->pending_bios_lock);
- md_wakeup_thread(conf->mddev->thread);
+
+ dispatch_bio_list(&tmp);
}
static void
@@ -910,21 +983,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
struct r5conf *conf = sh->raid_conf;
int i, disks = sh->disks;
struct stripe_head *head_sh = sh;
+ struct bio_list pending_bios = BIO_EMPTY_LIST;
+ bool should_defer;
might_sleep();
- if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
- /* writing out phase */
- if (s->waiting_extra_page)
- return;
- if (r5l_write_stripe(conf->log, sh) == 0)
- return;
- } else { /* caching phase */
- if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
- r5c_cache_data(conf->log, sh, s);
- return;
- }
- }
+ if (log_stripe(sh, s) == 0)
+ return;
+
+ should_defer = conf->batch_bio_dispatch && conf->group_cnt;
for (i = disks; i--; ) {
int op, op_flags = 0;
@@ -1080,7 +1147,10 @@ again:
trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
bi, disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
- defer_bio_issue(conf, bi);
+ if (should_defer && op_is_write(op))
+ bio_list_add(&pending_bios, bi);
+ else
+ generic_make_request(bi);
}
if (rrdev) {
if (s->syncing || s->expanding || s->expanded
@@ -1125,7 +1195,10 @@ again:
trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
rbi, disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
- defer_bio_issue(conf, rbi);
+ if (should_defer && op_is_write(op))
+ bio_list_add(&pending_bios, rbi);
+ else
+ generic_make_request(rbi);
}
if (!rdev && !rrdev) {
if (op_is_write(op))
@@ -1143,6 +1216,9 @@ again:
if (sh != head_sh)
goto again;
}
+
+ if (should_defer && !bio_list_empty(&pending_bios))
+ defer_issue_bios(conf, head_sh->sector, &pending_bios);
}
static struct dma_async_tx_descriptor *
@@ -1212,7 +1288,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
- struct bio_list return_bi = BIO_EMPTY_LIST;
int i;
pr_debug("%s: stripe %llu\n", __func__,
@@ -1236,16 +1311,13 @@ static void ops_complete_biofill(void *stripe_head_ref)
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(rbi))
- bio_list_add(&return_bi, rbi);
+ bio_endio(rbi);
rbi = rbi2;
}
}
}
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
- return_io(&return_bi);
-
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
}
@@ -2014,6 +2086,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
tx = ops_run_prexor6(sh, percpu, tx);
}
+ if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+ tx = ops_run_partial_parity(sh, percpu, tx);
+
if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
tx = ops_run_biodrain(sh, tx);
overlap_clear++;
@@ -2046,8 +2121,15 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
+static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) /* Free a stripe_head: release its ppl_page if one is set, then return sh to slab cache sc. */
+{
+ if (sh->ppl_page) /* alloc_stripe() only allocates this page when raid5_has_ppl(conf) */
+ __free_page(sh->ppl_page);
+ kmem_cache_free(sc, sh);
+}
+
static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
- int disks)
+ int disks, struct r5conf *conf)
{
struct stripe_head *sh;
int i;
@@ -2061,6 +2143,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
INIT_LIST_HEAD(&sh->r5c);
INIT_LIST_HEAD(&sh->log_list);
atomic_set(&sh->count, 1);
+ sh->raid_conf = conf;
sh->log_start = MaxSector;
for (i = 0; i < disks; i++) {
struct r5dev *dev = &sh->dev[i];
@@ -2068,6 +2151,14 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
bio_init(&dev->req, &dev->vec, 1);
bio_init(&dev->rreq, &dev->rvec, 1);
}
+
+ if (raid5_has_ppl(conf)) {
+ sh->ppl_page = alloc_page(gfp);
+ if (!sh->ppl_page) {
+ free_stripe(sc, sh);
+ sh = NULL;
+ }
+ }
}
return sh;
}
@@ -2075,15 +2166,13 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
{
struct stripe_head *sh;
- sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
+ sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
if (!sh)
return 0;
- sh->raid_conf = conf;
-
if (grow_buffers(sh, gfp)) {
shrink_buffers(sh);
- kmem_cache_free(conf->slab_cache, sh);
+ free_stripe(conf->slab_cache, sh);
return 0;
}
sh->hash_lock_index =
@@ -2210,7 +2299,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
* pages have been transferred over, and the old kmem_cache is
* freed when all stripes are done.
* 3/ reallocate conf->disks to be suitable bigger. If this fails,
- * we simple return a failre status - no need to clean anything up.
+ * we simple return a failure status - no need to clean anything up.
* 4/ allocate new pages for the new slots in the new stripe_heads.
* If this fails, we don't bother trying the shrink the
* stripe_heads down again, we just leave them as they are.
@@ -2228,9 +2317,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
int i;
int hash, cnt;
- if (newsize <= conf->pool_size)
- return 0; /* never bother to shrink */
-
err = md_allow_write(conf->mddev);
if (err)
return err;
@@ -2246,11 +2332,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
mutex_lock(&conf->cache_size_mutex);
for (i = conf->max_nr_stripes; i; i--) {
- nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
+ nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
if (!nsh)
break;
- nsh->raid_conf = conf;
list_add(&nsh->lru, &newstripes);
}
if (i) {
@@ -2258,7 +2343,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
while (!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
list_del(&nsh->lru);
- kmem_cache_free(sc, nsh);
+ free_stripe(sc, nsh);
}
kmem_cache_destroy(sc);
mutex_unlock(&conf->cache_size_mutex);
@@ -2284,7 +2369,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
nsh->dev[i].orig_page = osh->dev[i].page;
}
nsh->hash_lock_index = hash;
- kmem_cache_free(conf->slab_cache, osh);
+ free_stripe(conf->slab_cache, osh);
cnt++;
if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
!!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
@@ -2323,6 +2408,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
err = -ENOMEM;
mutex_unlock(&conf->cache_size_mutex);
+
+ conf->slab_cache = sc;
+ conf->active_name = 1-conf->active_name;
+
/* Step 4, return new stripes to service */
while(!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2340,8 +2429,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
}
/* critical section pass, GFP_NOIO no longer needed */
- conf->slab_cache = sc;
- conf->active_name = 1-conf->active_name;
if (!err)
conf->pool_size = newsize;
return err;
@@ -2359,7 +2446,7 @@ static int drop_one_stripe(struct r5conf *conf)
return 0;
BUG_ON(atomic_read(&sh->count));
shrink_buffers(sh);
- kmem_cache_free(conf->slab_cache, sh);
+ free_stripe(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes);
conf->max_nr_stripes--;
return 1;
@@ -3082,6 +3169,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
+ if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
+ test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+ !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+ test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+ set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
__func__, (unsigned long long)sh->sector,
s->locked, s->ops_request);
@@ -3103,14 +3196,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
(unsigned long long)bi->bi_iter.bi_sector,
(unsigned long long)sh->sector);
- /*
- * If several bio share a stripe. The bio bi_phys_segments acts as a
- * reference count to avoid race. The reference count should already be
- * increased before this function is called (for example, in
- * raid5_make_request()), so other bio sharing this stripe will not free the
- * stripe. If a stripe is owned by one stripe, the stripe lock will
- * protect it.
- */
spin_lock_irq(&sh->stripe_lock);
/* Don't allow new IO added to stripes in batch list */
if (sh->batch_head)
@@ -3129,6 +3214,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
+ if (forwrite && raid5_has_ppl(conf)) {
+ /*
+ * With PPL only writes to consecutive data chunks within a
+ * stripe are allowed because for a single stripe_head we can
+ * only have one PPL entry at a time, which describes one data
+ * range. Not really an overlap, but wait_for_overlap can be
+ * used to handle this.
+ */
+ sector_t sector;
+ sector_t first = 0;
+ sector_t last = 0;
+ int count = 0;
+ int i;
+
+ for (i = 0; i < sh->disks; i++) {
+ if (i != sh->pd_idx &&
+ (i == dd_idx || sh->dev[i].towrite)) {
+ sector = sh->dev[i].sector;
+ if (count == 0 || sector < first)
+ first = sector;
+ if (sector > last)
+ last = sector;
+ count++;
+ }
+ }
+
+ if (first + conf->chunk_sectors * (count - 1) != last)
+ goto overlap;
+ }
+
if (!forwrite || previous)
clear_bit(STRIPE_BATCH_READY, &sh->state);
@@ -3136,7 +3251,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip)
bi->bi_next = *bip;
*bip = bi;
- raid5_inc_bi_active_stripes(bi);
+ bio_inc_remaining(bi);
+ md_write_inc(conf->mddev, bi);
if (forwrite) {
/* check if page is covered */
@@ -3213,8 +3329,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
static void
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s, int disks,
- struct bio_list *return_bi)
+ struct stripe_head_state *s, int disks)
{
int i;
BUG_ON(sh->batch_head);
@@ -3250,7 +3365,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
bitmap_end = 1;
- r5l_stripe_write_finished(sh);
+ log_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
@@ -3260,10 +3375,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, bi);
- }
+ md_write_end(conf->mddev);
+ bio_endio(bi);
bi = nextbi;
}
if (bitmap_end)
@@ -3284,10 +3397,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, bi);
- }
+ md_write_end(conf->mddev);
+ bio_endio(bi);
bi = bi2;
}
@@ -3312,8 +3423,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
r5_next_bio(bi, sh->dev[i].sector);
bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi))
- bio_list_add(return_bi, bi);
+ bio_endio(bi);
bi = nextbi;
}
}
@@ -3449,7 +3559,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
/* Pre-reads at not permitted until after short delay
* to gather multiple requests. However if this
- * device is no Insync, the block could only be be computed
+ * device is no Insync, the block could only be computed
* and there is no need to delay that.
*/
return 0;
@@ -3468,7 +3578,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
/* If we are forced to do a reconstruct-write, either because
* the current RAID6 implementation only supports that, or
- * or because parity cannot be trusted and we are currently
+ * because parity cannot be trusted and we are currently
* recovering it, there is extra need to be careful.
* If one of the devices that we would need to read, because
* it is not being overwritten (and maybe not written at all)
@@ -3508,9 +3618,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
BUG_ON(test_bit(R5_Wantread, &dev->flags));
BUG_ON(sh->batch_head);
+
+ /*
+ * In the raid6 case if the only non-uptodate disk is P
+ * then we already trusted P to compute the other failed
+ * drives. It is safe to compute rather than re-read P.
+ * In other cases we only compute blocks from failed
+ * devices, otherwise check/repair might fail to detect
+ * a real inconsistency.
+ */
+
if ((s->uptodate == disks - 1) &&
+ ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
(s->failed && (disk_idx == s->failed_num[0] ||
- disk_idx == s->failed_num[1]))) {
+ disk_idx == s->failed_num[1])))) {
/* have disk failed, and we're requested to fetch it;
* do compute it
*/
@@ -3612,7 +3733,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
* never LOCKED, so we don't need to test 'failed' directly.
*/
static void handle_stripe_clean_event(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio_list *return_bi)
+ struct stripe_head *sh, int disks)
{
int i;
struct r5dev *dev;
@@ -3644,10 +3765,8 @@ returnbi:
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(wbi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, wbi);
- }
+ md_write_end(conf->mddev);
+ bio_endio(wbi);
wbi = wbi2;
}
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -3669,7 +3788,7 @@ returnbi:
discard_pending = 1;
}
- r5l_stripe_write_finished(sh);
+ log_stripe_write_finished(sh);
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -4556,7 +4675,8 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
goto finish;
- if (s.handle_bad_blocks) {
+ if (s.handle_bad_blocks ||
+ test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
}
@@ -4589,7 +4709,7 @@ static void handle_stripe(struct stripe_head *sh)
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
if (s.to_read+s.to_write+s.written)
- handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+ handle_failed_stripe(conf, sh, &s, disks);
if (s.syncing + s.replacing)
handle_failed_sync(conf, sh, &s);
}
@@ -4655,11 +4775,11 @@ static void handle_stripe(struct stripe_head *sh)
&& !test_bit(R5_LOCKED, &qdev->flags)
&& (test_bit(R5_UPTODATE, &qdev->flags) ||
test_bit(R5_Discard, &qdev->flags))))))
- handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+ handle_stripe_clean_event(conf, sh, disks);
if (s.just_cached)
- r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
- r5l_stripe_write_finished(sh);
+ r5c_handle_cached_data_endio(conf, sh, disks);
+ log_stripe_write_finished(sh);
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
@@ -4886,16 +5006,6 @@ finish:
md_wakeup_thread(conf->mddev->thread);
}
- if (!bio_list_empty(&s.return_bi)) {
- if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
- spin_lock_irq(&conf->device_lock);
- bio_list_merge(&conf->return_bi, &s.return_bi);
- spin_unlock_irq(&conf->device_lock);
- md_wakeup_thread(conf->mddev->thread);
- } else
- return_io(&s.return_bi);
- }
-
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}
@@ -4984,12 +5094,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
md_wakeup_thread(conf->mddev->thread);
}
-static struct bio *remove_bio_from_retry(struct r5conf *conf)
+static struct bio *remove_bio_from_retry(struct r5conf *conf,
+ unsigned int *offset)
{
struct bio *bi;
bi = conf->retry_read_aligned;
if (bi) {
+ *offset = conf->retry_read_offset;
conf->retry_read_aligned = NULL;
return bi;
}
@@ -4997,11 +5109,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
if(bi) {
conf->retry_read_aligned_list = bi->bi_next;
bi->bi_next = NULL;
- /*
- * this sets the active strip count to 1 and the processed
- * strip count to zero (upper 8 bits)
- */
- raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
+ *offset = 0;
}
return bi;
@@ -5136,24 +5244,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
{
struct bio *split;
+ sector_t sector = raid_bio->bi_iter.bi_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
+ unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); /* sectors from the bio start to the end of its chunk; the mask assumes chunk_sects is a power of two - TODO confirm */
- do {
- sector_t sector = raid_bio->bi_iter.bi_sector;
- unsigned chunk_sects = mddev->chunk_sectors;
- unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
-
- if (sectors < bio_sectors(raid_bio)) {
- split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
- bio_chain(split, raid_bio);
- } else
- split = raid_bio;
+ if (sectors < bio_sectors(raid_bio)) { /* the bio extends past the end of this chunk */
+ struct r5conf *conf = mddev->private;
+ split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split); /* front 'sectors' sectors, allocated from the per-array bio_split set instead of fs_bio_set */
+ bio_chain(split, raid_bio);
+ generic_make_request(raid_bio); /* resubmit raid_bio, which after the split is the remainder */
+ raid_bio = split; /* from here on only the front piece is handled */
+ }
- if (!raid5_read_one_chunk(mddev, split)) {
- if (split != raid_bio)
- generic_make_request(raid_bio);
- return split;
- }
- } while (split != raid_bio);
+ if (!raid5_read_one_chunk(mddev, raid_bio)) /* zero result: give the bio back to the caller; otherwise NULL is returned */
+ return raid_bio;
return NULL;
}
@@ -5170,19 +5274,27 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
*/
static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
{
- struct stripe_head *sh = NULL, *tmp;
+ struct stripe_head *sh, *tmp;
struct list_head *handle_list = NULL;
- struct r5worker_group *wg = NULL;
+ struct r5worker_group *wg;
+ bool second_try = !r5c_is_writeback(conf->log);
+ bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
+again:
+ wg = NULL;
+ sh = NULL;
if (conf->worker_cnt_per_group == 0) {
- handle_list = &conf->handle_list;
+ handle_list = try_loprio ? &conf->loprio_list :
+ &conf->handle_list;
} else if (group != ANY_GROUP) {
- handle_list = &conf->worker_groups[group].handle_list;
+ handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
+ &conf->worker_groups[group].handle_list;
wg = &conf->worker_groups[group];
} else {
int i;
for (i = 0; i < conf->group_cnt; i++) {
- handle_list = &conf->worker_groups[i].handle_list;
+ handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
+ &conf->worker_groups[i].handle_list;
wg = &conf->worker_groups[i];
if (!list_empty(handle_list))
break;
@@ -5233,8 +5345,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
wg = NULL;
}
- if (!sh)
- return NULL;
+ if (!sh) {
+ if (second_try)
+ return NULL;
+ second_try = true;
+ try_loprio = !try_loprio;
+ goto again;
+ }
if (wg) {
wg->stripes_cnt--;
@@ -5323,7 +5440,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
struct r5conf *conf = mddev->private;
sector_t logical_sector, last_sector;
struct stripe_head *sh;
- int remaining;
int stripe_sectors;
if (mddev->reshape_position != MaxSector)
@@ -5334,7 +5450,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
bi->bi_next = NULL;
- bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+ md_write_start(mddev, bi);
stripe_sectors = conf->chunk_sectors *
(conf->raid_disks - conf->max_degraded);
@@ -5380,7 +5496,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
continue;
sh->dev[d].towrite = bi;
set_bit(R5_OVERWRITE, &sh->dev[d].flags);
- raid5_inc_bi_active_stripes(bi);
+ bio_inc_remaining(bi);
+ md_write_inc(mddev, bi);
sh->overwrite_disks++;
}
spin_unlock_irq(&sh->stripe_lock);
@@ -5403,11 +5520,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
release_stripe_plug(mddev, sh);
}
- remaining = raid5_dec_bi_active_stripes(bi);
- if (remaining == 0) {
- md_write_end(mddev);
- bio_endio(bi);
- }
+ md_write_end(mddev);
+ bio_endio(bi);
}
static void raid5_make_request(struct mddev *mddev, struct bio * bi)
@@ -5418,7 +5532,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
sector_t logical_sector, last_sector;
struct stripe_head *sh;
const int rw = bio_data_dir(bi);
- int remaining;
DEFINE_WAIT(w);
bool do_prepare;
bool do_flush = false;
@@ -5440,8 +5553,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = bi->bi_opf & REQ_PREFLUSH;
}
- md_write_start(mddev, bi);
-
/*
* If array is degraded, better not do chunk aligned read because
* later we might have to read it again in order to reconstruct
@@ -5462,7 +5573,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
- bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+ md_write_start(mddev, bi);
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5597,16 +5708,9 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
}
finish_wait(&conf->wait_for_overlap, &w);
- remaining = raid5_dec_bi_active_stripes(bi);
- if (remaining == 0) {
-
- if ( rw == WRITE )
- md_write_end(mddev);
-
- trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
- bi, 0);
- bio_endio(bi);
- }
+ if (rw == WRITE)
+ md_write_end(mddev);
+ bio_endio(bi);
}
static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
@@ -5955,7 +6059,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return STRIPE_SECTORS;
}
-static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
+static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
+ unsigned int offset)
{
/* We may not be able to submit a whole bio at once as there
* may not be enough stripe_heads available.
@@ -5971,7 +6076,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
int dd_idx;
sector_t sector, logical_sector, last_sector;
int scnt = 0;
- int remaining;
int handled = 0;
logical_sector = raid_bio->bi_iter.bi_sector &
@@ -5985,7 +6089,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
sector += STRIPE_SECTORS,
scnt++) {
- if (scnt < raid5_bi_processed_stripes(raid_bio))
+ if (scnt < offset)
/* already done this stripe */
continue;
@@ -5993,15 +6097,15 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
if (!sh) {
/* failed to get a stripe - must wait */
- raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
+ conf->retry_read_offset = scnt;
return handled;
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
raid5_release_stripe(sh);
- raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
+ conf->retry_read_offset = scnt;
return handled;
}
@@ -6010,12 +6114,9 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
raid5_release_stripe(sh);
handled++;
}
- remaining = raid5_dec_bi_active_stripes(raid_bio);
- if (remaining == 0) {
- trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
- raid_bio, 0);
- bio_endio(raid_bio);
- }
+
+ bio_endio(raid_bio);
+
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
return handled;
@@ -6058,7 +6159,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
- r5l_write_stripe_run(conf->log);
+ log_write_stripe_run(conf);
cond_resched();
@@ -6075,6 +6176,7 @@ static void raid5_do_work(struct work_struct *work)
struct r5worker *worker = container_of(work, struct r5worker, work);
struct r5worker_group *group = worker->group;
struct r5conf *conf = group->conf;
+ struct mddev *mddev = conf->mddev;
int group_id = group - conf->worker_groups;
int handled;
struct blk_plug plug;
@@ -6095,6 +6197,9 @@ static void raid5_do_work(struct work_struct *work)
if (!batch_size && !released)
break;
handled += batch_size;
+ wait_event_lock_irq(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+ conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@@ -6122,24 +6227,13 @@ static void raid5d(struct md_thread *thread)
md_check_recovery(mddev);
- if (!bio_list_empty(&conf->return_bi) &&
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
- struct bio_list tmp = BIO_EMPTY_LIST;
- spin_lock_irq(&conf->device_lock);
- if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
- bio_list_merge(&tmp, &conf->return_bi);
- bio_list_init(&conf->return_bi);
- }
- spin_unlock_irq(&conf->device_lock);
- return_io(&tmp);
- }
-
blk_start_plug(&plug);
handled = 0;
spin_lock_irq(&conf->device_lock);
while (1) {
struct bio *bio;
int batch_size, released;
+ unsigned int offset;
released = release_stripe_list(conf, conf->temp_inactive_list);
if (released)
@@ -6157,10 +6251,10 @@ static void raid5d(struct md_thread *thread)
}
raid5_activate_delayed(conf);
- while ((bio = remove_bio_from_retry(conf))) {
+ while ((bio = remove_bio_from_retry(conf, &offset))) {
int ok;
spin_unlock_irq(&conf->device_lock);
- ok = retry_aligned_read(conf, bio);
+ ok = retry_aligned_read(conf, bio, offset);
spin_lock_irq(&conf->device_lock);
if (!ok)
break;
@@ -6544,6 +6638,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
group = &(*worker_groups)[i];
INIT_LIST_HEAD(&group->handle_list);
+ INIT_LIST_HEAD(&group->loprio_list);
group->conf = conf;
group->workers = workers + i * cnt;
@@ -6634,8 +6729,8 @@ static void free_conf(struct r5conf *conf)
{
int i;
- if (conf->log)
- r5l_exit_log(conf->log);
+ log_exit(conf);
+
if (conf->shrinker.nr_deferred)
unregister_shrinker(&conf->shrinker);
@@ -6646,7 +6741,10 @@ static void free_conf(struct r5conf *conf)
if (conf->disks[i].extra_page)
put_page(conf->disks[i].extra_page);
kfree(conf->disks);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf->stripe_hashtbl);
+ kfree(conf->pending_data);
kfree(conf);
}
@@ -6756,6 +6854,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
if (conf == NULL)
goto abort;
+ INIT_LIST_HEAD(&conf->free_list);
+ INIT_LIST_HEAD(&conf->pending_list);
+ conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
+ PENDING_IO_MAX, GFP_KERNEL);
+ if (!conf->pending_data)
+ goto abort;
+ for (i = 0; i < PENDING_IO_MAX; i++)
+ list_add(&conf->pending_data[i].sibling, &conf->free_list);
/* Don't enable multi-threading by default*/
if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
&new_group)) {
@@ -6771,15 +6877,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
+ INIT_LIST_HEAD(&conf->loprio_list);
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
- bio_list_init(&conf->return_bi);
init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
atomic_set(&conf->active_aligned_reads, 0);
- bio_list_init(&conf->pending_bios);
spin_lock_init(&conf->pending_bios_lock);
conf->batch_bio_dispatch = true;
rdev_for_each(rdev, mddev) {
@@ -6813,6 +6918,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
}
+ conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+ if (!conf->bio_split)
+ goto abort;
conf->mddev = mddev;
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -7097,6 +7205,13 @@ static int raid5_run(struct mddev *mddev)
BUG_ON(mddev->delta_disks != 0);
}
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+ test_bit(MD_HAS_PPL, &mddev->flags)) {
+ pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+ mdname(mddev));
+ clear_bit(MD_HAS_PPL, &mddev->flags);
+ }
+
if (mddev->private == NULL)
conf = setup_conf(mddev);
else
@@ -7188,7 +7303,10 @@ static int raid5_run(struct mddev *mddev)
if (mddev->degraded > dirty_parity_disks &&
mddev->recovery_cp != MaxSector) {
- if (mddev->ok_start_degraded)
+ if (test_bit(MD_HAS_PPL, &mddev->flags))
+ pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
+ mdname(mddev));
+ else if (mddev->ok_start_degraded)
pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
mdname(mddev));
else {
@@ -7254,14 +7372,6 @@ static int raid5_run(struct mddev *mddev)
mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
- /*
- * We use 16-bit counter of active stripes in bi_phys_segments
- * (minus one for over-loaded initialization)
- */
- blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
- blk_queue_max_discard_sectors(mddev->queue,
- 0xfffe * STRIPE_SECTORS);
-
blk_queue_max_write_same_sectors(mddev->queue, 0);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
@@ -7299,14 +7409,8 @@ static int raid5_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
}
- if (journal_dev) {
- char b[BDEVNAME_SIZE];
-
- pr_debug("md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(journal_dev->bdev, b));
- if (r5l_init_log(conf, journal_dev))
- goto abort;
- }
+ if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
+ goto abort;
return 0;
abort:
@@ -7420,17 +7524,16 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags) && conf->log) {
- struct r5l_log *log;
/*
* we can't wait pending write here, as this is called in
* raid5d, wait will deadlock.
+ * neilb: there is no locking about new writes here,
+ * so this cannot be safe.
*/
- if (atomic_read(&mddev->writes_pending))
+ if (atomic_read(&conf->active_stripes)) {
return -EBUSY;
- log = conf->log;
- conf->log = NULL;
- synchronize_rcu();
- r5l_exit_log(log);
+ }
+ log_exit(conf);
return 0;
}
if (rdev == p->rdev)
@@ -7469,6 +7572,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
*rdevp = rdev;
}
}
+ if (!err) {
+ err = log_modify(conf, rdev, false);
+ if (err)
+ goto abort;
+ }
if (p->replacement) {
/* We must have just cleared 'rdev' */
p->rdev = p->replacement;
@@ -7477,12 +7585,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* but will never see neither - if they are careful
*/
p->replacement = NULL;
- clear_bit(WantReplacement, &rdev->flags);
- } else
- /* We might have just removed the Replacement as faulty-
- * clear the bit just in case
- */
- clear_bit(WantReplacement, &rdev->flags);
+
+ if (!err)
+ err = log_modify(conf, p->rdev, true);
+ }
+
+ clear_bit(WantReplacement, &rdev->flags);
abort:
print_raid5_conf(conf);
@@ -7499,7 +7607,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int last = conf->raid_disks - 1;
if (test_bit(Journal, &rdev->flags)) {
- char b[BDEVNAME_SIZE];
if (conf->log)
return -EBUSY;
@@ -7508,9 +7615,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* The array is in readonly mode if journal is missing, so no
* write requests running. We should be safe
*/
- r5l_init_log(conf, rdev);
- pr_debug("md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(rdev->bdev, b));
+ log_init(conf, rdev, false);
return 0;
}
if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7537,10 +7642,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (p->rdev == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->raid_disk = disk;
- err = 0;
if (rdev->saved_raid_disk != disk)
conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev);
+
+ err = log_modify(conf, rdev, true);
+
goto out;
}
}
@@ -7574,7 +7681,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
sector_t newsize;
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7625,7 +7732,7 @@ static int check_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&
@@ -7658,6 +7765,9 @@ static int check_reshape(struct mddev *mddev)
mddev->chunk_sectors)
) < 0)
return -ENOMEM;
+
+ if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
+ return 0; /* never bother to shrink */
return resize_stripes(conf, (conf->previous_raid_disks
+ mddev->delta_disks));
}
@@ -8148,6 +8258,68 @@ static void *raid6_takeover(struct mddev *mddev)
return setup_conf(mddev);
}
+/*
+ * Switch the consistency policy of a raid4/5/6 array according to the
+ * keyword at the start of @buf.
+ *
+ * "ppl":    enable PPL through log_init(). Only accepted when PPL is not
+ *           already active and conf->level is 5; otherwise -EINVAL.
+ * "resync": if PPL is active, tear it down with log_exit(). Otherwise, if
+ *           MD_HAS_JOURNAL is set and r5l_log_disk_error() reports an
+ *           error, clear MD_HAS_JOURNAL, unless some rdev still carries the
+ *           Journal flag, in which case -EBUSY is returned. Any other
+ *           state yields -EINVAL.
+ *
+ * The keywords are matched with strncmp() on their own length, so only the
+ * leading characters of @buf are compared.
+ *
+ * The whole operation runs with mddev_lock() held. Returns 0 on success
+ * (after calling md_update_sb()), the error from mddev_lock(), -ENODEV when
+ * mddev->private is NULL, -EINVAL/-EBUSY as described above, or the error
+ * returned by log_init() / resize_stripes().
+ */
+static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
+{
+ struct r5conf *conf;
+ int err;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf) {
+ mddev_unlock(mddev);
+ return -ENODEV;
+ }
+
+ if (strncmp(buf, "ppl", 3) == 0) {
+ /* ppl only works with RAID 5 */
+ if (!raid5_has_ppl(conf) && conf->level == 5) {
+ err = log_init(conf, NULL, true);
+ if (!err) {
+ err = resize_stripes(conf, conf->pool_size);
+ /* roll back log_init() if the stripes could not be resized */
+ if (err)
+ log_exit(conf);
+ }
+ } else
+ err = -EINVAL;
+ } else if (strncmp(buf, "resync", 6) == 0) {
+ if (raid5_has_ppl(conf)) {
+ /* log_exit() is only called while the array is suspended */
+ mddev_suspend(mddev);
+ log_exit(conf);
+ mddev_resume(mddev);
+ err = resize_stripes(conf, conf->pool_size);
+ } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
+ r5l_log_disk_error(conf)) {
+ bool journal_dev_exists = false;
+ struct md_rdev *rdev;
+
+ /* look for any member device still flagged as Journal */
+ rdev_for_each(rdev, mddev)
+ if (test_bit(Journal, &rdev->flags)) {
+ journal_dev_exists = true;
+ break;
+ }
+
+ if (!journal_dev_exists) {
+ mddev_suspend(mddev);
+ clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+ mddev_resume(mddev);
+ } else /* the journal device must be removed first */
+ err = -EBUSY;
+ } else
+ err = -EINVAL;
+ } else {
+ err = -EINVAL;
+ }
+
+ /* only a successful policy change triggers a superblock update */
+ if (!err)
+ md_update_sb(mddev, 1);
+
+ mddev_unlock(mddev);
+
+ return err;
+}
+
+
static struct md_personality raid6_personality =
{
.name = "raid6",
@@ -8170,6 +8342,7 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static struct md_personality raid5_personality =
{
@@ -8193,6 +8366,7 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static struct md_personality raid4_personality =
@@ -8217,6 +8391,7 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4bb27b97bf6b..625c7f16fd6b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -224,10 +224,16 @@ struct stripe_head {
spinlock_t batch_lock; /* only header's lock is useful */
struct list_head batch_list; /* protected by head's batch lock*/
- struct r5l_io_unit *log_io;
+ union {
+ struct r5l_io_unit *log_io;
+ struct ppl_io_unit *ppl_io;
+ };
+
struct list_head log_list;
sector_t log_start; /* first meta block on the journal */
struct list_head r5c; /* for r5c_cache->stripe_in_journal */
+
+ struct page *ppl_page; /* partial parity of this stripe */
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
@@ -272,7 +278,6 @@ struct stripe_head_state {
int dec_preread_active;
unsigned long ops_request;
- struct bio_list return_bi;
struct md_rdev *blocked_rdev;
int handle_bad_blocks;
int log_failed;
@@ -400,6 +405,7 @@ enum {
STRIPE_OP_BIODRAIN,
STRIPE_OP_RECONSTRUCT,
STRIPE_OP_CHECK,
+ STRIPE_OP_PARTIAL_PARITY,
};
/*
@@ -481,50 +487,6 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
return NULL;
}
-/*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
- */
-static inline int raid5_bi_processed_stripes(struct bio *bio)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
- return (atomic_read(segments) >> 16) & 0xffff;
-}
-
-static inline int raid5_dec_bi_active_stripes(struct bio *bio)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
- return atomic_sub_return(1, segments) & 0xffff;
-}
-
-static inline void raid5_inc_bi_active_stripes(struct bio *bio)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
- atomic_inc(segments);
-}
-
-static inline void raid5_set_bi_processed_stripes(struct bio *bio,
- unsigned int cnt)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
- int old, new;
-
- do {
- old = atomic_read(segments);
- new = (old & 0xffff) | (cnt << 16);
- } while (atomic_cmpxchg(segments, old, new) != old);
-}
-
-static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
- atomic_set(segments, cnt);
-}
-
/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
* This is because we sometimes take all the spinlocks
* and creating that much locking depth can cause
@@ -542,6 +504,7 @@ struct r5worker {
struct r5worker_group {
struct list_head handle_list;
+ struct list_head loprio_list;
struct r5conf *conf;
struct r5worker *workers;
int stripes_cnt;
@@ -571,6 +534,14 @@ enum r5_cache_state {
*/
};
+/*
+ * NOTE(review): the users of these two limits are not part of this hunk;
+ * presumably they bound the number of r5pending_data entries and how many
+ * are dispatched per flush - confirm against raid5.c.
+ */
+#define PENDING_IO_MAX 512
+#define PENDING_IO_ONE_FLUSH 128
+/*
+ * A list of bios tagged with the stripe sector they belong to. Entries can
+ * be chained through 'sibling'; presumably onto the free_list/pending_list
+ * heads in struct r5conf - TODO confirm.
+ */
+struct r5pending_data {
+ struct list_head sibling;
+ sector_t sector; /* stripe sector */
+ struct bio_list bios;
+};
+
struct r5conf {
struct hlist_head *stripe_hashtbl;
/* only protect corresponding hash list and inactive_list */
@@ -608,10 +579,12 @@ struct r5conf {
*/
struct list_head handle_list; /* stripes needing handling */
+ struct list_head loprio_list; /* low priority stripes */
struct list_head hold_list; /* preread ready stripes */
struct list_head delayed_list; /* stripes that have plugged requests */
struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
struct bio *retry_read_aligned; /* currently retrying aligned bios */
+ unsigned int retry_read_offset; /* sector offset into retry_read_aligned */
struct bio *retry_read_aligned_list; /* aligned bios retry list */
atomic_t preread_active_stripes; /* stripes with scheduled io */
atomic_t active_aligned_reads;
@@ -621,9 +594,6 @@ struct r5conf {
int skip_copy; /* Don't copy data from bio to stripe cache */
struct list_head *last_hold; /* detect hold_list promotions */
- /* bios to have bi_end_io called after metadata is synced */
- struct bio_list return_bi;
-
atomic_t reshape_stripes; /* stripes with pending writes for reshape */
/* unfortunately we need two cache names as we temporarily have
* two caches.
@@ -676,6 +646,7 @@ struct r5conf {
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
struct disk_info *disks;
+ struct bio_set *bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
@@ -686,10 +657,15 @@ struct r5conf {
int group_cnt;
int worker_cnt_per_group;
struct r5l_log *log;
+ void *log_private;
- struct bio_list pending_bios;
spinlock_t pending_bios_lock;
bool batch_bio_dispatch;
+ struct r5pending_data *pending_data;
+ struct list_head free_list;
+ struct list_head pending_list;
+ int pending_data_cnt;
+ struct r5pending_data *next_pending_data;
};
@@ -765,34 +741,4 @@ extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
extern int raid5_calc_degraded(struct r5conf *conf);
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5l_log *log);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
-extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int state);
-extern bool r5l_log_disk_error(struct r5conf *conf);
-extern bool r5c_is_writeback(struct r5l_log *log);
-extern int
-r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s, int disks);
-extern void
-r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s);
-extern void r5c_release_extra_page(struct stripe_head *sh);
-extern void r5c_use_extra_page(struct stripe_head *sh);
-extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
-extern void r5c_handle_cached_data_endio(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio_list *return_bi);
-extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
- struct stripe_head_state *s);
-extern void r5c_make_stripe_write_out(struct stripe_head *sh);
-extern void r5c_flush_cache(struct r5conf *conf, int num);
-extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
-extern void r5c_check_cached_full_stripe(struct r5conf *conf);
-extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev);
-extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
#endif
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4931756d86d9..d1b04b0e99cf 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
#define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
-static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
+static inline unsigned bio_segments(struct bio *bio)
{
unsigned segs = 0;
struct bio_vec bv;
@@ -205,17 +205,12 @@ static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
break;
}
- __bio_for_each_segment(bv, bio, iter, *bvec)
+ bio_for_each_segment(bv, bio, iter)
segs++;
return segs;
}
-static inline unsigned bio_segments(struct bio *bio)
-{
- return __bio_segments(bio, &bio->bi_iter);
-}
-
/*
* get a reference to a bio, so it won't disappear. the intended use is
* something like:
@@ -389,8 +384,6 @@ extern void bio_put(struct bio *);
extern void __bio_clone_fast(struct bio *, struct bio *);
extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
-extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t,
- struct bio_set *, int, int);
extern struct bio_set *fs_bio_set;
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 3a481a49546e..c13dceb87b60 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -99,6 +99,7 @@ int __must_check percpu_ref_init(struct percpu_ref *ref,
void percpu_ref_exit(struct percpu_ref *ref);
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch);
+void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
percpu_ref_func_t *confirm_kill);
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 9930f3e9040f..d500bd224979 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -242,10 +242,18 @@ struct mdp_superblock_1 {
__le32 chunksize; /* in 512byte sectors */
__le32 raid_disks;
- __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts
- * NOTE: signed, so bitmap can be before superblock
- * only meaningful of feature_map[0] is set.
- */
+ union {
+ __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts
+ * NOTE: signed, so bitmap can be before superblock
+ * only meaningful if feature_map[0] is set.
+ */
+
+ /* only meaningful when feature_map[MD_FEATURE_PPL] is set */
+ struct {
+ __le16 offset; /* sectors from start of superblock that ppl starts (signed) */
+ __le16 size; /* ppl size in sectors */
+ } ppl;
+ };
/* These are only valid with feature bit '4' */
__le32 new_level; /* new level we are reshaping to */
@@ -318,6 +326,7 @@ struct mdp_superblock_1 {
*/
#define MD_FEATURE_CLUSTERED 256 /* clustered MD */
#define MD_FEATURE_JOURNAL 512 /* support write cache */
+#define MD_FEATURE_PPL 1024 /* support PPL */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
@@ -328,6 +337,7 @@ struct mdp_superblock_1 {
|MD_FEATURE_RECOVERY_BITMAP \
|MD_FEATURE_CLUSTERED \
|MD_FEATURE_JOURNAL \
+ |MD_FEATURE_PPL \
)
struct r5l_payload_header {
@@ -388,4 +398,31 @@ struct r5l_meta_block {
#define R5LOG_VERSION 0x1
#define R5LOG_MAGIC 0x6433c509
+
+/*
+ * One element of the entries[] array in struct ppl_header. The structure is
+ * packed, so its size is the sum of its fields: one __le64 and four __le32,
+ * i.e. 24 bytes. All fields are little-endian.
+ */
+struct ppl_header_entry {
+ __le64 data_sector; /* raid sector of the new data */
+ __le32 pp_size; /* length of partial parity */
+ __le32 data_size; /* length of data */
+ __le32 parity_disk; /* member disk containing parity */
+ __le32 checksum; /* checksum of partial parity data for this
+ * entry (~crc32c) */
+} __attribute__ ((__packed__));
+
+/*
+ * Layout arithmetic for struct ppl_header:
+ * PPL_HDR_ENTRY_SPACE is what remains of PPL_HEADER_SIZE bytes after the
+ * reserved area and the fixed header fields (four __le32: signature,
+ * padding, entries_count, checksum; one __le64: generation).
+ * PPL_HDR_MAX_ENTRIES is the number of whole ppl_header_entry structures
+ * that fit in that space, so the header never exceeds PPL_HEADER_SIZE.
+ */
+#define PPL_HEADER_SIZE 4096
+#define PPL_HDR_RESERVED 512
+#define PPL_HDR_ENTRY_SPACE \
+ (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(__le32) - sizeof(__le64))
+#define PPL_HDR_MAX_ENTRIES \
+ (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
+
+/*
+ * PPL header: a reserved area, fixed little-endian fields, then an array of
+ * up to PPL_HDR_MAX_ENTRIES entries. Packed, so there is no implicit
+ * padding between fields.
+ */
+struct ppl_header {
+ __u8 reserved[PPL_HDR_RESERVED];/* reserved space, fill with 0xff */
+ __le32 signature; /* signature (family number of volume) */
+ __le32 padding; /* zero pad */
+ __le64 generation; /* generation number of the header */
+ __le32 entries_count; /* number of entries in entry array */
+ __le32 checksum; /* checksum of the header (~crc32c) */
+ struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
+} __attribute__ ((__packed__));
+
#endif
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 9ac959ef4cae..fe03c6d52761 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -260,6 +260,22 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
+EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);
+
+/**
+ * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode
+ * @ref: percpu_ref to switch to atomic mode
+ *
+ * Schedule switching the ref to atomic mode, and wait for the
+ * switch to complete. Caller must ensure that no other thread
+ * will switch back to percpu mode.
+ *
+ * This function waits with wait_event() and may therefore sleep.
+ */
+void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
+{
+ /* NULL: no confirm_switch callback is supplied */
+ percpu_ref_switch_to_atomic(ref, NULL);
+ /* block on percpu_ref_switch_waitq until ref->confirm_switch is cleared */
+ wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
+}
+EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);
/**
* percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
@@ -290,6 +306,7 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
+EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
/**
* percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation