From 9c47008d13add50ec4597a8b9eee200c515282c8 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 9 Apr 2009 00:27:12 +0100 Subject: dm: add integrity support This patch provides support for data integrity passthrough in the device mapper. - If one or more component devices support integrity an integrity profile is preallocated for the DM device. - If all component devices have compatible profiles the DM device is flagged as capable. - Handle integrity metadata when splitting and cloning bios. Signed-off-by: Martin K. Petersen Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 21 +++++++++++++++++++++ drivers/md/dm-table.c | 40 ++++++++++++++++++++++++++++++++++++++++ drivers/md/dm.c | 15 +++++++++++++++ 3 files changed, 76 insertions(+) (limited to 'drivers') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index f01096549a93..823ceba6efa8 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1047,6 +1047,19 @@ static int populate_table(struct dm_table *table, return dm_table_complete(table); } +static int table_prealloc_integrity(struct dm_table *t, + struct mapped_device *md) +{ + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *dd; + + list_for_each_entry(dd, devices, list) + if (bdev_get_integrity(dd->dm_dev.bdev)) + return blk_integrity_register(dm_disk(md), NULL); + + return 0; +} + static int table_load(struct dm_ioctl *param, size_t param_size) { int r; @@ -1068,6 +1081,14 @@ static int table_load(struct dm_ioctl *param, size_t param_size) goto out; } + r = table_prealloc_integrity(t, md); + if (r) { + DMERR("%s: could not register integrity profile.", + dm_device_name(md)); + dm_table_destroy(t); + goto out; + } + down_write(&_hash_lock); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e8361b191b9b..02d0b489fad6 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -879,6 +879,45 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } +/* + * Set the integrity profile for this device if all devices used have + * matching profiles. 
+ */ +static void dm_table_set_integrity(struct dm_table *t) +{ + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *prev = NULL, *dd = NULL; + + if (!blk_get_integrity(dm_disk(t->md))) + return; + + list_for_each_entry(dd, devices, list) { + if (prev && + blk_integrity_compare(prev->dm_dev.bdev->bd_disk, + dd->dm_dev.bdev->bd_disk) < 0) { + DMWARN("%s: integrity not set: %s and %s mismatch", + dm_device_name(t->md), + prev->dm_dev.bdev->bd_disk->disk_name, + dd->dm_dev.bdev->bd_disk->disk_name); + goto no_integrity; + } + prev = dd; + } + + if (!prev || !bdev_get_integrity(prev->dm_dev.bdev)) + goto no_integrity; + + blk_integrity_register(dm_disk(t->md), + bdev_get_integrity(prev->dm_dev.bdev)); + + return; + +no_integrity: + blk_integrity_register(dm_disk(t->md), NULL); + + return; +} + void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) { /* @@ -899,6 +938,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) else queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); + dm_table_set_integrity(t); } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 788ba96a6256..25d86e2c01f2 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -700,6 +700,12 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, clone->bi_io_vec->bv_len = clone->bi_size; clone->bi_flags |= 1 << BIO_CLONED; + if (bio_integrity(bio)) { + bio_integrity_clone(clone, bio, GFP_NOIO); + bio_integrity_trim(clone, + bio_sector_offset(bio, idx, offset), len); + } + return clone; } @@ -721,6 +727,14 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, clone->bi_size = to_bytes(len); clone->bi_flags &= ~(1 << BIO_SEG_VALID); + if (bio_integrity(bio)) { + bio_integrity_clone(clone, bio, GFP_NOIO); + + if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) + bio_integrity_trim(clone, + bio_sector_offset(bio, idx, 0), len); + } + return clone; } @@ -1193,6 +1207,7 @@ static void free_dev(struct mapped_device *md) mempool_destroy(md->tio_pool); mempool_destroy(md->io_pool); bioset_free(md->bs); + blk_integrity_unregister(md->disk); del_gendisk(md->disk); free_minor(minor); -- cgit From 692d0eb9e02cf81fb387ff891f53840db2f3110a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 9 Apr 2009 00:27:13 +0100 Subject: dm: remove limited barrier support Prepare for full barrier implementation: first remove the restricted support. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-linear.c | 1 - drivers/md/dm-table.c | 19 ------------------- drivers/md/dm.c | 15 ++++++++++----- drivers/md/dm.h | 1 - 4 files changed, 10 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index bfa107f59d96..79fb53e51c70 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -142,7 +142,6 @@ static struct target_type linear_target = { .status = linear_status, .ioctl = linear_ioctl, .merge = linear_merge, - .features = DM_TARGET_SUPPORTS_BARRIERS, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 02d0b489fad6..429b50b975d5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -52,8 +52,6 @@ struct dm_table { sector_t *highs; struct dm_target *targets; - unsigned barriers_supported:1; - /* * Indicates the rw permissions for the new logical * device. 
This should be a combination of FMODE_READ @@ -243,7 +241,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); atomic_set(&t->holders, 0); - t->barriers_supported = 1; if (!num_targets) num_targets = KEYS_PER_NODE; @@ -751,10 +748,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, /* FIXME: the plan is to combine high here and then have * the merge fn apply the target level restrictions. */ combine_restrictions_low(&t->limits, &tgt->limits); - - if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS)) - t->barriers_supported = 0; - return 0; bad: @@ -799,12 +792,6 @@ int dm_table_complete(struct dm_table *t) check_for_valid_limits(&t->limits); - /* - * We only support barriers if there is exactly one underlying device. - */ - if (!list_is_singular(&t->devices)) - t->barriers_supported = 0; - /* how many indexes will the btree have ? */ leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); @@ -1059,12 +1046,6 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) return t->md; } -int dm_table_barrier_ok(struct dm_table *t) -{ - return t->barriers_supported; -} -EXPORT_SYMBOL(dm_table_barrier_ok); - EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 25d86e2c01f2..ab3b5d84df65 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -851,11 +851,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) bio_io_error(bio); return; } - if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) { - dm_table_put(ci.map); - bio_endio(bio, -EOPNOTSUPP); - return; - } + ci.md = md; ci.bio = bio; ci.io = alloc_io(md); @@ -937,6 +933,15 @@ static int dm_request(struct request_queue *q, struct bio *bio) struct mapped_device *md = q->queuedata; int cpu; + /* + * There is no use in forwarding any barrier request since we can't + * guarantee it is (or can be) handled by the targets correctly. + */ + if (unlikely(bio_barrier(bio))) { + bio_endio(bio, -EOPNOTSUPP); + return 0; + } + down_read(&md->io_lock); cpu = part_stat_lock(); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index b48397c0abbd..a31506d93e91 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -52,7 +52,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits); * To check the return value from dm_table_find_target(). */ #define dm_target_is_valid(t) ((t)->table) -int dm_table_barrier_ok(struct dm_table *t); /*----------------------------------------------------------------- * A registry of target types. -- cgit From df12ee996378a5917e9555169fe278ecca0612d4 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 9 Apr 2009 00:27:13 +0100 Subject: dm: rearrange dm_wq_work Refactor dm_wq_work() to make later patch more readable. 
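The goto-based loop becomes a plain while loop so that a later patch in
this series ("dm: rework queueing and suspension") can drop md->io_lock
around the processing of each bio. A condensed sketch of that eventual
shape (taken from the later hunk, not part of this patch):

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c) {
			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
			break;
		}

		/* Release the lock around the actual work so that
		 * dm_request() can queue new bios concurrently. */
		up_write(&md->io_lock);
		__split_and_process_bio(md, c);
		down_write(&md->io_lock);
	}
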
Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ab3b5d84df65..020a9e1993a7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1437,18 +1437,19 @@ static void dm_wq_work(struct work_struct *work) down_write(&md->io_lock); -next_bio: - spin_lock_irq(&md->deferred_lock); - c = bio_list_pop(&md->deferred); - spin_unlock_irq(&md->deferred_lock); + while (1) { + spin_lock_irq(&md->deferred_lock); + c = bio_list_pop(&md->deferred); + spin_unlock_irq(&md->deferred_lock); + + if (!c) { + clear_bit(DMF_BLOCK_IO, &md->flags); + break; + } - if (c) { __split_and_process_bio(md, c); - goto next_bio; } - clear_bit(DMF_BLOCK_IO, &md->flags); - up_write(&md->io_lock); } -- cgit From 1eb787ec183d1267cac95aae632e92dee05ed50a Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 9 Apr 2009 00:27:14 +0100 Subject: dm: split DMF_BLOCK_IO flag into two Split the DMF_BLOCK_IO flag into two. DMF_BLOCK_IO_FOR_SUSPEND is set when I/O must be blocked while suspending a device. DMF_QUEUE_IO_TO_THREAD is set when I/O must be queued to a worker thread for later processing. Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 020a9e1993a7..7cac7220937f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -89,12 +89,13 @@ union map_info *dm_get_mapinfo(struct bio *bio) /* * Bits for the md->flags field. */ -#define DMF_BLOCK_IO 0 +#define DMF_BLOCK_IO_FOR_SUSPEND 0 #define DMF_SUSPENDED 1 #define DMF_FROZEN 2 #define DMF_FREEING 3 #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 +#define DMF_QUEUE_IO_TO_THREAD 6 /* * Work processed by per-device workqueue. @@ -439,7 +440,7 @@ static int queue_io(struct mapped_device *md, struct bio *bio) { down_write(&md->io_lock); - if (!test_bit(DMF_BLOCK_IO, &md->flags)) { + if (!test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) { up_write(&md->io_lock); return 1; } @@ -950,10 +951,10 @@ static int dm_request(struct request_queue *q, struct bio *bio) part_stat_unlock(); /* - * If we're suspended we have to queue - * this io for later. + * If we're suspended or the thread is processing barriers + * we have to queue this io for later. */ - while (test_bit(DMF_BLOCK_IO, &md->flags)) { + while (test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) { up_read(&md->io_lock); if (bio_rw(bio) != READA) @@ -997,7 +998,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) struct mapped_device *md = congested_data; struct dm_table *map; - if (!test_bit(DMF_BLOCK_IO, &md->flags)) { + if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { map = dm_get_table(md); if (map) { r = dm_table_any_congested(map, bdi_bits); @@ -1443,7 +1444,8 @@ static void dm_wq_work(struct work_struct *work) spin_unlock_irq(&md->deferred_lock); if (!c) { - clear_bit(DMF_BLOCK_IO, &md->flags); + clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); break; } @@ -1574,10 +1576,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) } /* - * First we set the BLOCK_IO flag so no more ios will be mapped. + * First we set the DMF_QUEUE_IO_TO_THREAD flag so no more ios + * will be mapped. 
*/ down_write(&md->io_lock); - set_bit(DMF_BLOCK_IO, &md->flags); + set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); up_write(&md->io_lock); -- cgit From 54d9a1b4513b96cbd835ca6866c6a604d194b2ae Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 9 Apr 2009 00:27:14 +0100 Subject: dm: simplify dm_request loop Refactor the code in dm_request(). Require the new DMF_BLOCK_FOR_SUSPEND flag on readahead bios we will discard so we don't drop such bios while processing a barrier. Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7cac7220937f..bb97ec8d6644 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -929,7 +929,6 @@ out: */ static int dm_request(struct request_queue *q, struct bio *bio) { - int r = -EIO; int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; int cpu; @@ -957,11 +956,14 @@ static int dm_request(struct request_queue *q, struct bio *bio) while (test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) { up_read(&md->io_lock); - if (bio_rw(bio) != READA) - r = queue_io(md, bio); + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && + bio_rw(bio) == READA) { + bio_io_error(bio); + return 0; + } - if (r <= 0) - goto out_req; + if (!queue_io(md, bio)) + return 0; /* * We're in a while loop, because someone could suspend @@ -973,12 +975,6 @@ static int dm_request(struct request_queue *q, struct bio *bio) __split_and_process_bio(md, bio); up_read(&md->io_lock); return 0; - -out_req: - if (r < 0) - bio_io_error(bio); - - return 0; } static void dm_unplug_all(struct request_queue *q) -- cgit From 3b00b2036fac7a7667d0676a0f80eee575b8c32b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 9 Apr 2009 00:27:15 +0100 Subject: dm: rework queueing and suspension Rework shutting down on suspend and document the associated rules. Drop write lock in __split_and_process_bio to allow more processing concurrency. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index bb97ec8d6644..9746c1eb9ef7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1434,18 +1434,21 @@ static void dm_wq_work(struct work_struct *work) down_write(&md->io_lock); - while (1) { + while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { spin_lock_irq(&md->deferred_lock); c = bio_list_pop(&md->deferred); spin_unlock_irq(&md->deferred_lock); if (!c) { - clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); break; } + up_write(&md->io_lock); + __split_and_process_bio(md, c); + + down_write(&md->io_lock); } up_write(&md->io_lock); @@ -1453,8 +1456,9 @@ static void dm_wq_work(struct work_struct *work) static void dm_queue_flush(struct mapped_device *md) { + clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + smp_mb__after_clear_bit(); queue_work(md->wq, &md->work); - flush_workqueue(md->wq); } /* @@ -1572,22 +1576,36 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) } /* - * First we set the DMF_QUEUE_IO_TO_THREAD flag so no more ios - * will be mapped. + * Here we must make sure that no processes are submitting requests + * to target drivers i.e. no one may be executing + * __split_and_process_bio. 
This is called from dm_request and + * dm_wq_work. + * + * To get all processes out of __split_and_process_bio in dm_request, + * we take the write lock. To prevent any process from reentering + * __split_and_process_bio from dm_request, we set + * DMF_QUEUE_IO_TO_THREAD. + * + * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND + * and call flush_workqueue(md->wq). flush_workqueue will wait until + * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any + * further calls to __split_and_process_bio from dm_wq_work. */ down_write(&md->io_lock); set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); - up_write(&md->io_lock); + flush_workqueue(md->wq); + /* - * Wait for the already-mapped ios to complete. + * At this point no more requests are entering target request routines. + * We call dm_wait_for_completion to wait for all existing requests + * to finish. */ r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); down_write(&md->io_lock); - if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); up_write(&md->io_lock); @@ -1600,6 +1618,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) goto out; /* pushback list is already flushed, so skip flush */ } + /* + * If dm_wait_for_completion returned 0, the device is completely + * quiescent now. There is no request-processing activity. All new + * requests are being added to md->deferred list. + */ + dm_table_postsuspend_targets(map); set_bit(DMF_SUSPENDED, &md->flags); -- cgit From 92c639021ca6e962645114f02e356e7feb131d0b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 9 Apr 2009 00:27:15 +0100 Subject: dm: remove dm_request loop Remove queue_io return value and a loop in dm_request. IO may be submitted to a worker thread with queue_io(). queue_io() sets DMF_QUEUE_IO_TO_THREAD so that all further IO is queued for the thread. When the thread finishes its work, it clears DMF_QUEUE_IO_TO_THREAD and from this point on, requests are submitted from dm_request again. This will be used for processing barriers. Remove the loop in dm_request. queue_io() can submit I/Os to the worker thread even if DMF_QUEUE_IO_TO_THREAD was not set. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9746c1eb9ef7..db022e5f3912 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -436,21 +436,18 @@ static void end_io_acct(struct dm_io *io) /* * Add the bio to the list of deferred io. */ -static int queue_io(struct mapped_device *md, struct bio *bio) +static void queue_io(struct mapped_device *md, struct bio *bio) { down_write(&md->io_lock); - if (!test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) { - up_write(&md->io_lock); - return 1; - } - spin_lock_irq(&md->deferred_lock); bio_list_add(&md->deferred, bio); spin_unlock_irq(&md->deferred_lock); + if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) + queue_work(md->wq, &md->work); + up_write(&md->io_lock); - return 0; /* deferred successfully */ } /* @@ -953,7 +950,7 @@ static int dm_request(struct request_queue *q, struct bio *bio) * If we're suspended or the thread is processing barriers * we have to queue this io for later. 
*/ - while (test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) { + if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))) { up_read(&md->io_lock); if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && @@ -962,14 +959,9 @@ static int dm_request(struct request_queue *q, struct bio *bio) return 0; } - if (!queue_io(md, bio)) - return 0; + queue_io(md, bio); - /* - * We're in a while loop, because someone could suspend - * before we get to the following read lock. - */ - down_read(&md->io_lock); + return 0; } __split_and_process_bio(md, bio); -- cgit From af7e466a1acededbc10beaba9eec8531d561c566 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 9 Apr 2009 00:27:16 +0100 Subject: dm: implement basic barrier support Barriers are submitted to a worker thread that issues them in-order. The thread is modified so that when it sees a barrier request it waits for all pending IO before the request then submits the barrier and waits for it. (We must wait, otherwise it could be intermixed with following requests.) Errors from the barrier request are recorded in a per-device barrier_error variable. There may be only one barrier request in progress at once. For now, the barrier request is converted to a non-barrier request when sending it to the underlying device. This patch guarantees correct barrier behavior if the underlying device doesn't perform write-back caching. The same requirement existed before barriers were supported in dm. Bottom layer barrier support (sending barriers by target drivers) and handling devices with write-back caches will be done in further patches. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 88 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index db022e5f3912..8a994be035ba 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -124,6 +124,11 @@ struct mapped_device { struct bio_list deferred; spinlock_t deferred_lock; + /* + * An error from the barrier request currently being processed. + */ + int barrier_error; + /* * Processing queue (flush/barriers) */ @@ -425,6 +430,10 @@ static void end_io_acct(struct dm_io *io) part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); part_stat_unlock(); + /* + * After this is decremented the bio must not be touched if it is + * a barrier. + */ dm_disk(md)->part0.in_flight = pending = atomic_dec_return(&md->pending); @@ -531,25 +540,35 @@ static void dec_pending(struct dm_io *io, int error) */ spin_lock_irqsave(&md->deferred_lock, flags); if (__noflush_suspending(md)) - bio_list_add(&md->deferred, io->bio); + bio_list_add_head(&md->deferred, io->bio); else /* noflush suspend was interrupted. */ io->error = -EIO; spin_unlock_irqrestore(&md->deferred_lock, flags); } - end_io_acct(io); - io_error = io->error; bio = io->bio; - free_io(md, io); + if (bio_barrier(bio)) { + /* + * There can be just one barrier request so we use + * a per-device variable for error reporting. 
+ * Note that you can't touch the bio after end_io_acct + */ + md->barrier_error = io_error; + end_io_acct(io); + } else { + end_io_acct(io); - if (io_error != DM_ENDIO_REQUEUE) { - trace_block_bio_complete(md->queue, bio); + if (io_error != DM_ENDIO_REQUEUE) { + trace_block_bio_complete(md->queue, bio); - bio_endio(bio, io_error); + bio_endio(bio, io_error); + } } + + free_io(md, io); } } @@ -691,7 +710,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, clone->bi_sector = sector; clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw; + clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER); clone->bi_vcnt = 1; clone->bi_size = to_bytes(len); clone->bi_io_vec->bv_offset = offset; @@ -718,6 +737,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); __bio_clone(clone, bio); + clone->bi_rw &= ~(1 << BIO_RW_BARRIER); clone->bi_destructor = dm_bio_destructor; clone->bi_sector = sector; clone->bi_idx = idx; @@ -846,7 +866,10 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.map = dm_get_table(md); if (unlikely(!ci.map)) { - bio_io_error(bio); + if (!bio_barrier(bio)) + bio_io_error(bio); + else + md->barrier_error = -EIO; return; } @@ -930,15 +953,6 @@ static int dm_request(struct request_queue *q, struct bio *bio) struct mapped_device *md = q->queuedata; int cpu; - /* - * There is no use in forwarding any barrier request since we can't - * guarantee it is (or can be) handled by the targets correctly. - */ - if (unlikely(bio_barrier(bio))) { - bio_endio(bio, -EOPNOTSUPP); - return 0; - } - down_read(&md->io_lock); cpu = part_stat_lock(); @@ -950,7 +964,8 @@ static int dm_request(struct request_queue *q, struct bio *bio) * If we're suspended or the thread is processing barriers * we have to queue this io for later. */ - if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))) { + if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || + unlikely(bio_barrier(bio))) { up_read(&md->io_lock); if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && @@ -1415,6 +1430,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) return r; } +static int dm_flush(struct mapped_device *md) +{ + dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); + return 0; +} + +static void process_barrier(struct mapped_device *md, struct bio *bio) +{ + int error = dm_flush(md); + + if (unlikely(error)) { + bio_endio(bio, error); + return; + } + if (bio_empty_barrier(bio)) { + bio_endio(bio, 0); + return; + } + + __split_and_process_bio(md, bio); + + error = dm_flush(md); + + if (!error && md->barrier_error) + error = md->barrier_error; + + if (md->barrier_error != DM_ENDIO_REQUEUE) + bio_endio(bio, error); +} + /* * Process the deferred bios */ @@ -1438,7 +1483,10 @@ static void dm_wq_work(struct work_struct *work) up_write(&md->io_lock); - __split_and_process_bio(md, c); + if (bio_barrier(c)) + process_barrier(md, c); + else + __split_and_process_bio(md, c); down_write(&md->io_lock); } -- cgit From 73830857bca6f6c9dbd48e906daea50bea42d676 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 9 Apr 2009 00:27:16 +0100 Subject: dm kcopyd: prepare for callback race fix Use a variable in segment_complete() to point to the dm_kcopyd_client struct and only release job->pages in run_complete_job() if any are defined. These changes are needed by the next patch. 
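Why the next patch needs them (inferred from the surrounding kcopyd
code rather than stated there): segment_complete() will hand the
master job of a split copy over to the kcopyd thread, which may free
the job immediately, so the client pointer has to be cached before
that point; and such a master job never had pages allocated for it
(dm_kcopyd_copy() sets job->pages to NULL and only the sub-jobs
acquire pages), so run_complete_job() must tolerate job->pages being
NULL. Condensed sketch of segment_complete() once the next patch is
applied:

	static void segment_complete(int read_err, unsigned long write_err,
				     void *context)
	{
		struct kcopyd_job *job = (struct kcopyd_job *) context;
		/* Cache the client now: after the job is pushed to the
		 * worker below it may already have been freed. */
		struct dm_kcopyd_client *kc = job->kc;
		...
		else if (atomic_dec_and_test(&job->sub_jobs)) {
			/* Master job: no pages of its own, completed by
			 * the kcopyd thread via run_complete_job(). */
			push(&kc->complete_jobs, job);
			wake(kc);
		}
	}
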
Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-kcopyd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 0a225da21272..9d379070918b 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -297,7 +297,8 @@ static int run_complete_job(struct kcopyd_job *job) dm_kcopyd_notify_fn fn = job->fn; struct dm_kcopyd_client *kc = job->kc; - kcopyd_put_pages(kc, job->pages); + if (job->pages) + kcopyd_put_pages(kc, job->pages); mempool_free(job, kc->job_pool); fn(read_err, write_err, context); @@ -461,6 +462,7 @@ static void segment_complete(int read_err, unsigned long write_err, sector_t progress = 0; sector_t count = 0; struct kcopyd_job *job = (struct kcopyd_job *) context; + struct dm_kcopyd_client *kc = job->kc; mutex_lock(&job->lock); @@ -490,7 +492,7 @@ static void segment_complete(int read_err, unsigned long write_err, if (count) { int i; - struct kcopyd_job *sub_job = mempool_alloc(job->kc->job_pool, + struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool, GFP_NOIO); *sub_job = *job; -- cgit From 340cd44451fb0bfa542365e6b4b565bbd44836e2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 9 Apr 2009 00:27:17 +0100 Subject: dm kcopyd: fix callback race If the thread calling dm_kcopyd_copy is delayed due to scheduling inside split_job/segment_complete and the subjobs complete before the loop in split_job completes, the kcopyd callback could be invoked from the thread that called dm_kcopyd_copy instead of the kcopyd workqueue. dm_kcopyd_copy -> split_job -> segment_complete -> job->fn() Snapshots depend on the fact that callbacks are called from the singlethreaded kcopyd workqueue and expect that there is no racing between individual callbacks. The racing between callbacks can lead to corruption of exception store and it can also mean that exception store callbacks are called twice for the same exception - a likely reason for crashes reported inside pending_complete() / remove_exception(). This patch fixes two problems: 1. job->fn being called from the thread that submitted the job (see above). - Fix: hand over the completion callback to the kcopyd thread. 2. job->fn(read_err, write_err, job->context); in segment_complete reports the error of the last subjob, not the union of all errors. - Fix: pass job->write_err to the callback to report all error bits (it is done already in run_complete_job) Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-kcopyd.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 9d379070918b..3e3fc06cb861 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -511,13 +511,16 @@ static void segment_complete(int read_err, unsigned long write_err, } else if (atomic_dec_and_test(&job->sub_jobs)) { /* - * To avoid a race we must keep the job around - * until after the notify function has completed. - * Otherwise the client may try and stop the job - * after we've completed. + * Queue the completion callback to the kcopyd thread. + * + * Some callers assume that all the completions are called + * from a single thread and don't race with each other. + * + * We must not call the callback directly here because this + * code may not be executing in the thread. 
*/ - job->fn(read_err, write_err, job->context); - mempool_free(job, job->kc->job_pool); + push(&kc->complete_jobs, job); + wake(kc); } } @@ -530,6 +533,8 @@ static void split_job(struct kcopyd_job *job) { int i; + atomic_inc(&job->kc->nr_jobs); + atomic_set(&job->sub_jobs, SPLIT_COUNT); for (i = 0; i < SPLIT_COUNT; i++) segment_complete(0, 0u, job); -- cgit
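
A detail of the final hunk that the changelog leaves implicit (my
reading of the surrounding kcopyd code): because the master job now
reaches run_complete_job(), which drops a count with
atomic_dec_and_test(&kc->nr_jobs) and wakes dm_kcopyd_client_destroy()
when it hits zero, split_job() has to take the matching count up
front. Sketch of the function after this patch, with that reasoning
as a comment:

	static void split_job(struct kcopyd_job *job)
	{
		int i;

		/*
		 * The master job is now completed through
		 * kc->complete_jobs and run_complete_job(), which
		 * decrements kc->nr_jobs; balance that here so
		 * dm_kcopyd_client_destroy() keeps waiting until the
		 * whole split copy has finished.
		 */
		atomic_inc(&job->kc->nr_jobs);

		atomic_set(&job->sub_jobs, SPLIT_COUNT);
		for (i = 0; i < SPLIT_COUNT; i++)
			segment_complete(0, 0u, job);
	}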