From 3b8a44f8edfd695f3bf63e0371f5b622028e560a Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 29 Oct 2014 17:19:44 +0100
Subject: drbd: Remove pointless check

In drbd-8.4 there is always a single connection per resource,
and there is always exactly one peer_device for a device.
peer_device can not be NULL here.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e80cbefbc2b5..a1a01ccb7399 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1478,7 +1478,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	device = adm_ctx.device;
 	mutex_lock(&adm_ctx.resource->adm_mutex);
 	peer_device = first_peer_device(device);
-	connection = peer_device ? peer_device->connection : NULL;
+	connection = peer_device->connection;
 	conn_reconfig_start(connection);
 
 	/* if you want to reconfigure, please tear down first */
-- 
cgit 


From 2e9ffde6f0a6bbbd2975e0ec50578a5abae5a5a5 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@linbit.com>
Date: Fri, 8 Aug 2014 17:48:00 +0200
Subject: drbd: De-inline drbd_should_do_remote() and
 drbd_should_send_out_of_sync()

There is no need to have these two as inline functions.  In addition,
drbd_should_send_out_of_sync() is only used in a single place, anyway.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_req.c | 18 ++++++++++++++++++
 drivers/block/drbd/drbd_req.h | 17 +----------------
 2 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3ae2c0086563..55fca685fca5 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1095,6 +1095,24 @@ static bool do_remote_read(struct drbd_request *req)
 	return false;
 }
 
+bool drbd_should_do_remote(union drbd_dev_state s)
+{
+	return s.pdsk == D_UP_TO_DATE ||
+		(s.pdsk >= D_INCONSISTENT &&
+		 s.conn >= C_WF_BITMAP_T &&
+		 s.conn < C_AHEAD);
+	/* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
+	   That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
+	   states. */
+}
+
+static bool drbd_should_send_out_of_sync(union drbd_dev_state s)
+{
+	return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
+	/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
+	   since we enter state C_AHEAD only if proto >= 96 */
+}
+
 /* returns number of connections (== 1, for drbd 8.4)
  * expected to actually write this data,
  * which does NOT include those that we are L_AHEAD for. */
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 9f6a04080e9f..bb2ef78165e5 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -331,21 +331,6 @@ static inline int req_mod(struct drbd_request *req,
 	return rv;
 }
 
-static inline bool drbd_should_do_remote(union drbd_dev_state s)
-{
-	return s.pdsk == D_UP_TO_DATE ||
-		(s.pdsk >= D_INCONSISTENT &&
-		 s.conn >= C_WF_BITMAP_T &&
-		 s.conn < C_AHEAD);
-	/* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
-	   That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
-	   states. */
-}
-static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
-{
-	return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
-	/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
-	   since we enter state C_AHEAD only if proto >= 96 */
-}
+extern bool drbd_should_do_remote(union drbd_dev_state);
 
 #endif
-- 
cgit 


From 5dd2ca1912714a006075e1cb763a3610ef9b3212 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@linbit.com>
Date: Mon, 11 Aug 2014 16:59:23 +0200
Subject: drbd: Get rid of some first_peer_device() calls

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_receiver.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index b4b5680ac6ad..5bb71e50843a 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1380,7 +1380,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
 	if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
 		/* wait for all pending IO completions, before we start
 		 * zeroing things out. */
-		conn_wait_active_ee_empty(first_peer_device(device)->connection);
+		conn_wait_active_ee_empty(peer_req->peer_device->connection);
 		/* add it to the active list now,
 		 * so we can find it to present it in debugfs */
 		peer_req->submit_jif = jiffies;
@@ -1966,7 +1966,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
 	} else
 		D_ASSERT(device, drbd_interval_empty(&peer_req->i));
 
-	drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+	drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
 
 	return err;
 }
@@ -2098,7 +2098,7 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co
 		}
 
 		rcu_read_lock();
-		tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries;
+		tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
 		rcu_read_unlock();
 
 		if (!tp)
@@ -2364,7 +2364,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	if (dp_flags & DP_SEND_RECEIVE_ACK) {
 		/* I really don't like it that the receiver thread
 		 * sends on the msock, but anyways */
-		drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
+		drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
 	}
 
 	if (tp) {
-- 
cgit 


From f6ba86363908e3f4e3ef11f768be7ca2745b18cf Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@linbit.com>
Date: Wed, 13 Aug 2014 18:33:55 +0200
Subject: drbd: Move enum write_ordering_e to drbd.h

Also change the enum values to all-capital letters.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h      |  6 ------
 drivers/block/drbd/drbd_main.c     |  2 +-
 drivers/block/drbd/drbd_nl.c       |  4 ++--
 drivers/block/drbd/drbd_proc.c     |  6 +++---
 drivers/block/drbd/drbd_receiver.c | 28 ++++++++++++++--------------
 5 files changed, 20 insertions(+), 26 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e66d453a5f2b..47d4b02103b8 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -632,12 +632,6 @@ struct bm_io_work {
 	void (*done)(struct drbd_device *device, int rv);
 };
 
-enum write_ordering_e {
-	WO_none,
-	WO_drain_io,
-	WO_bdev_flush,
-};
-
 struct fifo_buffer {
 	unsigned int head_index;
 	unsigned int size;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 74d97f4bac34..3ee4a44cb225 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2590,7 +2590,7 @@ struct drbd_resource *drbd_create_resource(const char *name)
 	kref_init(&resource->kref);
 	idr_init(&resource->devices);
 	INIT_LIST_HEAD(&resource->connections);
-	resource->write_ordering = WO_bdev_flush;
+	resource->write_ordering = WO_BDEV_FLUSH;
 	list_add_tail_rcu(&resource->resources, &drbd_resources);
 	mutex_init(&resource->conf_update);
 	mutex_init(&resource->adm_mutex);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index a1a01ccb7399..dfc1799d0f83 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1418,7 +1418,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 		set_bit(MD_NO_FUA, &device->flags);
 
 	if (write_ordering_changed(old_disk_conf, new_disk_conf))
-		drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush);
+		drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
 
 	drbd_md_sync(device);
 
@@ -1727,7 +1727,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	new_disk_conf = NULL;
 	new_plan = NULL;
 
-	drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush);
+	drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH);
 
 	if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
 		set_bit(CRASHED_PRIMARY, &device->flags);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 3b10fa6cb039..6537b25db9c1 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -245,9 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 	char wp;
 
 	static char write_ordering_chars[] = {
-		[WO_none] = 'n',
-		[WO_drain_io] = 'd',
-		[WO_bdev_flush] = 'f',
+		[WO_NONE] = 'n',
+		[WO_DRAIN_IO] = 'd',
+		[WO_BDEV_FLUSH] = 'f',
 	};
 
 	seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 5bb71e50843a..bf38b957d9dd 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1178,7 +1178,7 @@ static void drbd_flush(struct drbd_connection *connection)
 	struct drbd_peer_device *peer_device;
 	int vnr;
 
-	if (connection->resource->write_ordering >= WO_bdev_flush) {
+	if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
 		rcu_read_lock();
 		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 			struct drbd_device *device = peer_device->device;
@@ -1203,7 +1203,7 @@ static void drbd_flush(struct drbd_connection *connection)
 				/* would rather check on EOPNOTSUPP, but that is not reliable.
 				 * don't try again for ANY return value != 0
 				 * if (rv == -EOPNOTSUPP) */
-				drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io);
+				drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
 			}
 			put_ldev(device);
 			kref_put(&device->kref, drbd_destroy_device);
@@ -1299,10 +1299,10 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
 
 	dc = rcu_dereference(bdev->disk_conf);
 
-	if (wo == WO_bdev_flush && !dc->disk_flushes)
-		wo = WO_drain_io;
-	if (wo == WO_drain_io && !dc->disk_drain)
-		wo = WO_none;
+	if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
+		wo = WO_DRAIN_IO;
+	if (wo == WO_DRAIN_IO && !dc->disk_drain)
+		wo = WO_NONE;
 
 	return wo;
 }
@@ -1319,13 +1319,13 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
 	enum write_ordering_e pwo;
 	int vnr;
 	static char *write_ordering_str[] = {
-		[WO_none] = "none",
-		[WO_drain_io] = "drain",
-		[WO_bdev_flush] = "flush",
+		[WO_NONE] = "none",
+		[WO_DRAIN_IO] = "drain",
+		[WO_BDEV_FLUSH] = "flush",
 	};
 
 	pwo = resource->write_ordering;
-	if (wo != WO_bdev_flush)
+	if (wo != WO_BDEV_FLUSH)
 		wo = min(pwo, wo);
 	rcu_read_lock();
 	idr_for_each_entry(&resource->devices, device, vnr) {
@@ -1343,7 +1343,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
 	rcu_read_unlock();
 
 	resource->write_ordering = wo;
-	if (pwo != resource->write_ordering || wo == WO_bdev_flush)
+	if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
 		drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
 }
 
@@ -1533,7 +1533,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 	 * Therefore we must send the barrier_ack after the barrier request was
 	 * completed. */
 	switch (connection->resource->write_ordering) {
-	case WO_none:
+	case WO_NONE:
 		if (rv == FE_RECYCLED)
 			return 0;
 
@@ -1546,8 +1546,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 			drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
 			/* Fall through */
 
-	case WO_bdev_flush:
-	case WO_drain_io:
+	case WO_BDEV_FLUSH:
+	case WO_DRAIN_IO:
 		conn_wait_active_ee_empty(connection);
 		drbd_flush(connection);
 
-- 
cgit 


From 1ec317d3d1f9b9ec19926fdf2fc59fa3ec8cd15d Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@linbit.com>
Date: Thu, 14 Aug 2014 16:17:58 +0200
Subject: drbd: drbd_adm_attach(): Add missing drbd_resync_after_changed()

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index dfc1799d0f83..94e380ff61ca 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1541,9 +1541,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 
 	write_lock_irq(&global_state_lock);
 	retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
-	write_unlock_irq(&global_state_lock);
 	if (retcode != NO_ERROR)
-		goto fail;
+		goto fail_unlock;
 
 	rcu_read_lock();
 	nc = rcu_dereference(connection->net_conf);
@@ -1551,7 +1550,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
 			rcu_read_unlock();
 			retcode = ERR_STONITH_AND_PROT_A;
-			goto fail;
+			goto fail_unlock;
 		}
 	}
 	rcu_read_unlock();
@@ -1562,7 +1561,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
 			PTR_ERR(bdev));
 		retcode = ERR_OPEN_DISK;
-		goto fail;
+		goto fail_unlock;
 	}
 	nbc->backing_bdev = bdev;
 
@@ -1582,7 +1581,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
 			PTR_ERR(bdev));
 		retcode = ERR_OPEN_MD_DISK;
-		goto fail;
+		goto fail_unlock;
 	}
 	nbc->md_bdev = bdev;
 
@@ -1590,7 +1589,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	    (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
 	     new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
 		retcode = ERR_MD_IDX_INVALID;
-		goto fail;
+		goto fail_unlock;
 	}
 
 	resync_lru = lc_create("resync", drbd_bm_ext_cache,
@@ -1598,14 +1597,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 			offsetof(struct bm_extent, lce));
 	if (!resync_lru) {
 		retcode = ERR_NOMEM;
-		goto fail;
+		goto fail_unlock;
 	}
 
 	/* Read our meta data super block early.
 	 * This also sets other on-disk offsets. */
 	retcode = drbd_md_read(device, nbc);
 	if (retcode != NO_ERROR)
-		goto fail;
+		goto fail_unlock;
 
 	if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
 		new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
@@ -1617,7 +1616,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 			(unsigned long long) drbd_get_max_capacity(nbc),
 			(unsigned long long) new_disk_conf->disk_size);
 		retcode = ERR_DISK_TOO_SMALL;
-		goto fail;
+		goto fail_unlock;
 	}
 
 	if (new_disk_conf->meta_dev_idx < 0) {
@@ -1634,7 +1633,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		drbd_warn(device, "refusing attach: md-device too small, "
 		     "at least %llu sectors needed for this meta-disk type\n",
 		     (unsigned long long) min_md_device_sectors);
-		goto fail;
+		goto fail_unlock;
 	}
 
 	/* Make sure the new disk is big enough
@@ -1642,7 +1641,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	if (drbd_get_max_capacity(nbc) <
 	    drbd_get_capacity(device->this_bdev)) {
 		retcode = ERR_DISK_TOO_SMALL;
-		goto fail;
+		goto fail_unlock;
 	}
 
 	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
@@ -1672,7 +1671,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	retcode = rv;  /* FIXME: Type mismatch. */
 	drbd_resume_io(device);
 	if (rv < SS_SUCCESS)
-		goto fail;
+		goto fail_unlock;
 
 	if (!get_ldev_if_state(device, D_ATTACHING))
 		goto force_diskless;
@@ -1727,6 +1726,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	new_disk_conf = NULL;
 	new_plan = NULL;
 
+	drbd_resync_after_changed(device);
 	drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH);
 
 	if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
@@ -1850,6 +1850,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	if (rv < SS_SUCCESS)
 		goto force_diskless_dec;
 
+	write_unlock(&global_state_lock);
+
 	mod_timer(&device->request_timer, jiffies + HZ);
 
 	if (device->state.role == R_PRIMARY)
@@ -1872,6 +1874,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
  force_diskless:
 	drbd_force_state(device, NS(disk, D_DISKLESS));
 	drbd_md_sync(device);
+ fail_unlock:
+	write_unlock_irq(&global_state_lock);
  fail:
 	conn_reconfig_done(connection);
 	if (nbc) {
-- 
cgit 


From 28bc3b8c71cda033a4c013131c635d1148889824 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@linbit.com>
Date: Thu, 14 Aug 2014 18:33:30 +0200
Subject: drbd: Fix locking across all resources

Instead of using a rwlock for synchronizing state changes across
resources, take the request locks of all resources for global state
changes.  Use resources_mutex to serialize global state changes.

This means that taking the request lock of a resource is now enough to
prevent changes of that resource.  (Previously, a read lock on the
global state lock was needed as well.)

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h    | 18 ++-------
 drivers/block/drbd/drbd_main.c   | 24 +++++++++++-
 drivers/block/drbd/drbd_nl.c     | 45 +++++++++++----------
 drivers/block/drbd/drbd_state.c  | 14 +++----
 drivers/block/drbd/drbd_state.h  |  6 +--
 drivers/block/drbd/drbd_worker.c | 85 ++++++++++++++++++----------------------
 6 files changed, 99 insertions(+), 93 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 47d4b02103b8..2c9ee223d548 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -292,6 +292,9 @@ struct drbd_device_work {
 
 extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *);
 
+extern void lock_all_resources(void);
+extern void unlock_all_resources(void);
+
 struct drbd_request {
 	struct drbd_work w;
 	struct drbd_device *device;
@@ -1418,7 +1421,7 @@ extern struct bio_set *drbd_md_io_bio_set;
 /* to allocate from that set */
 extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
 
-extern rwlock_t global_state_lock;
+extern struct mutex resources_mutex;
 
 extern int conn_lowest_minor(struct drbd_connection *connection);
 extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
@@ -1688,19 +1691,6 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r
 	return 0;
 }
 
-static inline enum drbd_state_rv
-_drbd_set_state(struct drbd_device *device, union drbd_state ns,
-		enum chg_state_flags flags, struct completion *done)
-{
-	enum drbd_state_rv rv;
-
-	read_lock(&global_state_lock);
-	rv = __drbd_set_state(device, ns, flags, done);
-	read_unlock(&global_state_lock);
-
-	return rv;
-}
-
 static inline union drbd_state drbd_read_state(struct drbd_device *device)
 {
 	struct drbd_resource *resource = device->resource;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 3ee4a44cb225..f66294db3b08 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -117,6 +117,7 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0
  */
 struct idr drbd_devices;
 struct list_head drbd_resources;
+struct mutex resources_mutex;
 
 struct kmem_cache *drbd_request_cache;
 struct kmem_cache *drbd_ee_cache;	/* peer requests */
@@ -2923,7 +2924,7 @@ static int __init drbd_init(void)
 	drbd_proc = NULL; /* play safe for drbd_cleanup */
 	idr_init(&drbd_devices);
 
-	rwlock_init(&global_state_lock);
+	mutex_init(&resources_mutex);
 	INIT_LIST_HEAD(&drbd_resources);
 
 	err = drbd_genl_register();
@@ -3746,6 +3747,27 @@ int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
 	return 0;
 }
 
+void lock_all_resources(void)
+{
+	struct drbd_resource *resource;
+	int __maybe_unused i = 0;
+
+	mutex_lock(&resources_mutex);
+	local_irq_disable();
+	for_each_resource(resource, &drbd_resources)
+		spin_lock_nested(&resource->req_lock, i++);
+}
+
+void unlock_all_resources(void)
+{
+	struct drbd_resource *resource;
+
+	for_each_resource(resource, &drbd_resources)
+		spin_unlock(&resource->req_lock);
+	local_irq_enable();
+	mutex_unlock(&resources_mutex);
+}
+
 #ifdef CONFIG_DRBD_FAULT_INJECTION
 /* Fault insertion support including random number generator shamelessly
  * stolen from kernel/rcutorture.c */
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 94e380ff61ca..d37c509e6a44 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1389,13 +1389,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 		goto fail_unlock;
 	}
 
-	write_lock_irq(&global_state_lock);
+	lock_all_resources();
 	retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
 	if (retcode == NO_ERROR) {
 		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
 		drbd_resync_after_changed(device);
 	}
-	write_unlock_irq(&global_state_lock);
+	unlock_all_resources();
 
 	if (retcode != NO_ERROR)
 		goto fail_unlock;
@@ -1539,18 +1539,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		goto fail;
 	}
 
-	write_lock_irq(&global_state_lock);
-	retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
-	if (retcode != NO_ERROR)
-		goto fail_unlock;
-
 	rcu_read_lock();
 	nc = rcu_dereference(connection->net_conf);
 	if (nc) {
 		if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
 			rcu_read_unlock();
 			retcode = ERR_STONITH_AND_PROT_A;
-			goto fail_unlock;
+			goto fail;
 		}
 	}
 	rcu_read_unlock();
@@ -1561,7 +1556,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
 			PTR_ERR(bdev));
 		retcode = ERR_OPEN_DISK;
-		goto fail_unlock;
+		goto fail;
 	}
 	nbc->backing_bdev = bdev;
 
@@ -1581,7 +1576,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
 			PTR_ERR(bdev));
 		retcode = ERR_OPEN_MD_DISK;
-		goto fail_unlock;
+		goto fail;
 	}
 	nbc->md_bdev = bdev;
 
@@ -1589,7 +1584,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	    (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
 	     new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
 		retcode = ERR_MD_IDX_INVALID;
-		goto fail_unlock;
+		goto fail;
 	}
 
 	resync_lru = lc_create("resync", drbd_bm_ext_cache,
@@ -1597,14 +1592,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 			offsetof(struct bm_extent, lce));
 	if (!resync_lru) {
 		retcode = ERR_NOMEM;
-		goto fail_unlock;
+		goto fail;
 	}
 
 	/* Read our meta data super block early.
 	 * This also sets other on-disk offsets. */
 	retcode = drbd_md_read(device, nbc);
 	if (retcode != NO_ERROR)
-		goto fail_unlock;
+		goto fail;
 
 	if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
 		new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
@@ -1616,7 +1611,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 			(unsigned long long) drbd_get_max_capacity(nbc),
 			(unsigned long long) new_disk_conf->disk_size);
 		retcode = ERR_DISK_TOO_SMALL;
-		goto fail_unlock;
+		goto fail;
 	}
 
 	if (new_disk_conf->meta_dev_idx < 0) {
@@ -1633,7 +1628,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		drbd_warn(device, "refusing attach: md-device too small, "
 		     "at least %llu sectors needed for this meta-disk type\n",
 		     (unsigned long long) min_md_device_sectors);
-		goto fail_unlock;
+		goto fail;
 	}
 
 	/* Make sure the new disk is big enough
@@ -1641,7 +1636,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	if (drbd_get_max_capacity(nbc) <
 	    drbd_get_capacity(device->this_bdev)) {
 		retcode = ERR_DISK_TOO_SMALL;
-		goto fail_unlock;
+		goto fail;
 	}
 
 	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
@@ -1671,7 +1666,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	retcode = rv;  /* FIXME: Type mismatch. */
 	drbd_resume_io(device);
 	if (rv < SS_SUCCESS)
-		goto fail_unlock;
+		goto fail;
 
 	if (!get_ldev_if_state(device, D_ATTACHING))
 		goto force_diskless;
@@ -1706,6 +1701,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		goto force_diskless_dec;
 	}
 
+	lock_all_resources();
+	retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+	if (retcode != NO_ERROR) {
+		unlock_all_resources();
+		goto force_diskless_dec;
+	}
+
 	/* Reset the "barriers don't work" bits here, then force meta data to
 	 * be written, to ensure we determine if barriers are supported. */
 	if (new_disk_conf->md_flushes)
@@ -1728,6 +1730,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 
 	drbd_resync_after_changed(device);
 	drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH);
+	unlock_all_resources();
 
 	if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
 		set_bit(CRASHED_PRIMARY, &device->flags);
@@ -1850,8 +1853,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	if (rv < SS_SUCCESS)
 		goto force_diskless_dec;
 
-	write_unlock(&global_state_lock);
-
 	mod_timer(&device->request_timer, jiffies + HZ);
 
 	if (device->state.role == R_PRIMARY)
@@ -1874,8 +1875,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
  force_diskless:
 	drbd_force_state(device, NS(disk, D_DISKLESS));
 	drbd_md_sync(device);
- fail_unlock:
-	write_unlock_irq(&global_state_lock);
  fail:
 	conn_reconfig_done(connection);
 	if (nbc) {
@@ -3453,8 +3452,10 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	/* not yet safe for genl_family.parallel_ops */
+	mutex_lock(&resources_mutex);
 	if (!conn_create(adm_ctx.resource_name, &res_opts))
 		retcode = ERR_NOMEM;
+	mutex_unlock(&resources_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -3545,7 +3546,9 @@ static int adm_del_resource(struct drbd_resource *resource)
 	if (!idr_is_empty(&resource->devices))
 		return ERR_RES_IN_USE;
 
+	mutex_lock(&resources_mutex);
 	list_del_rcu(&resource->resources);
+	mutex_unlock(&resources_mutex);
 	/* Make sure all threads have actually stopped: state handling only
 	 * does drbd_thread_stop_nowait(). */
 	list_for_each_entry(connection, &resource->connections, connections)
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 2d7dd269b6a8..535ae47f84c9 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -937,7 +937,7 @@ void drbd_resume_al(struct drbd_device *device)
 		drbd_info(device, "Resumed AL updates\n");
 }
 
-/* helper for __drbd_set_state */
+/* helper for _drbd_set_state */
 static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
 {
 	if (first_peer_device(device)->connection->agreed_pro_version < 90)
@@ -965,17 +965,17 @@ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
 }
 
 /**
- * __drbd_set_state() - Set a new DRBD state
+ * _drbd_set_state() - Set a new DRBD state
  * @device:	DRBD device.
  * @ns:		new state.
  * @flags:	Flags
  * @done:	Optional completion, that will get completed after the after_state_ch() finished
  *
- * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ * Caller needs to hold req_lock. Do not call directly.
  */
 enum drbd_state_rv
-__drbd_set_state(struct drbd_device *device, union drbd_state ns,
-	         enum chg_state_flags flags, struct completion *done)
+_drbd_set_state(struct drbd_device *device, union drbd_state ns,
+	        enum chg_state_flags flags, struct completion *done)
 {
 	struct drbd_peer_device *peer_device = first_peer_device(device);
 	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
@@ -1444,7 +1444,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
 		enum drbd_io_error_p eh = EP_PASS_ON;
 		int was_io_error = 0;
-		/* corresponding get_ldev was in __drbd_set_state, to serialize
+		/* corresponding get_ldev was in _drbd_set_state, to serialize
 		 * our cleanup here with the transition to D_DISKLESS.
 		 * But is is still not save to dreference ldev here, since
 		 * we might come from an failed Attach before ldev was set. */
@@ -1759,7 +1759,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
 		if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
 			ns.disk = os.disk;
 
-		rv = __drbd_set_state(device, ns, flags, NULL);
+		rv = _drbd_set_state(device, ns, flags, NULL);
 		if (rv < SS_SUCCESS)
 			BUG();
 
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
index 7f53c40823cd..bd989536f888 100644
--- a/drivers/block/drbd/drbd_state.h
+++ b/drivers/block/drbd/drbd_state.h
@@ -122,9 +122,9 @@ extern enum drbd_state_rv
 _drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state,
 					union drbd_state, enum chg_state_flags);
 
-extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state,
-					   enum chg_state_flags,
-					   struct completion *done);
+extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state,
+					  enum chg_state_flags,
+					  struct completion *done);
 extern void print_st_err(struct drbd_device *, union drbd_state,
 			union drbd_state, int);
 
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 5578c1477ba6..3b3d980ac8c7 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -55,13 +55,6 @@ static int make_resync_request(struct drbd_device *, int);
  *
  */
 
-
-/* About the global_state_lock
-   Each state transition on an device holds a read lock. In case we have
-   to evaluate the resync after dependencies, we grab a write lock, because
-   we need stable states on all devices for that.  */
-rwlock_t global_state_lock;
-
 /* used for synchronous meta data and bitmap IO
  * submitted by drbd_md_sync_page_io()
  */
@@ -1456,70 +1449,73 @@ static int _drbd_may_sync_now(struct drbd_device *device)
 }
 
 /**
- * _drbd_pause_after() - Pause resync on all devices that may not resync now
+ * drbd_pause_after() - Pause resync on all devices that may not resync now
  * @device:	DRBD device.
  *
  * Called from process context only (admin command and after_state_ch).
  */
-static int _drbd_pause_after(struct drbd_device *device)
+static bool drbd_pause_after(struct drbd_device *device)
 {
+	bool changed = false;
 	struct drbd_device *odev;
-	int i, rv = 0;
+	int i;
 
 	rcu_read_lock();
 	idr_for_each_entry(&drbd_devices, odev, i) {
 		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
 			continue;
-		if (!_drbd_may_sync_now(odev))
-			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
-			       != SS_NOTHING_TO_DO);
+		if (!_drbd_may_sync_now(odev) &&
+		    _drbd_set_state(_NS(odev, aftr_isp, 1),
+				    CS_HARD, NULL) != SS_NOTHING_TO_DO)
+			changed = true;
 	}
 	rcu_read_unlock();
 
-	return rv;
+	return changed;
 }
 
 /**
- * _drbd_resume_next() - Resume resync on all devices that may resync now
+ * drbd_resume_next() - Resume resync on all devices that may resync now
  * @device:	DRBD device.
  *
  * Called from process context only (admin command and worker).
  */
-static int _drbd_resume_next(struct drbd_device *device)
+static bool drbd_resume_next(struct drbd_device *device)
 {
+	bool changed = false;
 	struct drbd_device *odev;
-	int i, rv = 0;
+	int i;
 
 	rcu_read_lock();
 	idr_for_each_entry(&drbd_devices, odev, i) {
 		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
 			continue;
 		if (odev->state.aftr_isp) {
-			if (_drbd_may_sync_now(odev))
-				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
-							CS_HARD, NULL)
-				       != SS_NOTHING_TO_DO) ;
+			if (_drbd_may_sync_now(odev) &&
+			    _drbd_set_state(_NS(odev, aftr_isp, 0),
+					    CS_HARD, NULL) != SS_NOTHING_TO_DO)
+				changed = true;
 		}
 	}
 	rcu_read_unlock();
-	return rv;
+	return changed;
 }
 
 void resume_next_sg(struct drbd_device *device)
 {
-	write_lock_irq(&global_state_lock);
-	_drbd_resume_next(device);
-	write_unlock_irq(&global_state_lock);
+	lock_all_resources();
+	drbd_resume_next(device);
+	unlock_all_resources();
 }
 
 void suspend_other_sg(struct drbd_device *device)
 {
-	write_lock_irq(&global_state_lock);
-	_drbd_pause_after(device);
-	write_unlock_irq(&global_state_lock);
+	lock_all_resources();
+	drbd_pause_after(device);
+	unlock_all_resources();
 }
 
-/* caller must hold global_state_lock */
+/* caller must lock_all_resources() */
 enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
 {
 	struct drbd_device *odev;
@@ -1557,15 +1553,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_min
 	}
 }
 
-/* caller must hold global_state_lock */
+/* caller must lock_all_resources() */
 void drbd_resync_after_changed(struct drbd_device *device)
 {
-	int changes;
+	int changed;
 
 	do {
-		changes  = _drbd_pause_after(device);
-		changes |= _drbd_resume_next(device);
-	} while (changes);
+		changed  = drbd_pause_after(device);
+		changed |= drbd_resume_next(device);
+	} while (changed);
 }
 
 void drbd_rs_controller_reset(struct drbd_device *device)
@@ -1685,19 +1681,14 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 	} else {
 		mutex_lock(device->state_mutex);
 	}
-	clear_bit(B_RS_H_DONE, &device->flags);
 
-	/* req_lock: serialize with drbd_send_and_submit() and others
-	 * global_state_lock: for stable sync-after dependencies */
-	spin_lock_irq(&device->resource->req_lock);
-	write_lock(&global_state_lock);
+	lock_all_resources();
+	clear_bit(B_RS_H_DONE, &device->flags);
 	/* Did some connection breakage or IO error race with us? */
 	if (device->state.conn < C_CONNECTED
 	|| !get_ldev_if_state(device, D_NEGOTIATING)) {
-		write_unlock(&global_state_lock);
-		spin_unlock_irq(&device->resource->req_lock);
-		mutex_unlock(device->state_mutex);
-		return;
+		unlock_all_resources();
+		goto out;
 	}
 
 	ns = drbd_read_state(device);
@@ -1711,7 +1702,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 	else /* side == C_SYNC_SOURCE */
 		ns.pdsk = D_INCONSISTENT;
 
-	r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
+	r = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
 	ns = drbd_read_state(device);
 
 	if (ns.conn < C_CONNECTED)
@@ -1732,7 +1723,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 			device->rs_mark_left[i] = tw;
 			device->rs_mark_time[i] = now;
 		}
-		_drbd_pause_after(device);
+		drbd_pause_after(device);
 		/* Forget potentially stale cached per resync extent bit-counts.
 		 * Open coded drbd_rs_cancel_all(device), we already have IRQs
 		 * disabled, and know the disk state is ok. */
@@ -1742,8 +1733,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 		device->resync_wenr = LC_FREE;
 		spin_unlock(&device->al_lock);
 	}
-	write_unlock(&global_state_lock);
-	spin_unlock_irq(&device->resource->req_lock);
+	unlock_all_resources();
 
 	if (r == SS_SUCCESS) {
 		wake_up(&device->al_wait); /* for lc_reset() above */
@@ -1807,6 +1797,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 		drbd_md_sync(device);
 	}
 	put_ldev(device);
+out:
 	mutex_unlock(device->state_mutex);
 }
 
-- 
cgit 


From a29728463b254ce81ecefdf20c1a02e01d9361da Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@linbit.com>
Date: Thu, 31 Jul 2014 17:41:33 +0200
Subject: drbd: Backport the "events2" command

The events2 command originates from drbd-9 development. It features
more information but requires a incompatible change in output
format.
Therefore the previous events command continues to exist, the new
improved events2 command becomes available now.

This prepares the user-base for a later switch to the complete
drbd9 code base.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h          |  45 +++
 drivers/block/drbd/drbd_nl.c           | 625 ++++++++++++++++++++++++++++++++-
 drivers/block/drbd/drbd_receiver.c     |   6 -
 drivers/block/drbd/drbd_state.c        | 424 +++++++++++++++++++++-
 drivers/block/drbd/drbd_state_change.h |  63 ++++
 5 files changed, 1151 insertions(+), 12 deletions(-)
 create mode 100644 drivers/block/drbd/drbd_state_change.h

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 2c9ee223d548..965aae0ba492 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -667,6 +667,8 @@ enum {
 	DEVICE_WORK_PENDING,	/* tell worker that some device has pending work */
 };
 
+enum which_state { NOW, OLD = NOW, NEW };
+
 struct drbd_resource {
 	char *name;
 #ifdef CONFIG_DEBUG_FS
@@ -785,6 +787,17 @@ struct drbd_connection {
 	} send;
 };
 
+static inline bool has_net_conf(struct drbd_connection *connection)
+{
+	bool has_net_conf;
+
+	rcu_read_lock();
+	has_net_conf = rcu_dereference(connection->net_conf);
+	rcu_read_unlock();
+
+	return has_net_conf;
+}
+
 void __update_timing_details(
 		struct drbd_thread_timing_details *tdp,
 		unsigned int *cb_nr,
@@ -1017,6 +1030,12 @@ static inline struct drbd_peer_device *first_peer_device(struct drbd_device *dev
 	return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
 }
 
+static inline struct drbd_peer_device *
+conn_peer_device(struct drbd_connection *connection, int volume_number)
+{
+	return idr_find(&connection->peer_devices, volume_number);
+}
+
 #define for_each_resource(resource, _resources) \
 	list_for_each_entry(resource, _resources, resources)
 
@@ -1451,6 +1470,9 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t);
 
 
 /* drbd_nl.c */
+
+extern struct mutex notification_mutex;
+
 extern void drbd_suspend_io(struct drbd_device *device);
 extern void drbd_resume_io(struct drbd_device *device);
 extern char *ppsize(char *buf, unsigned long long size);
@@ -1665,6 +1687,29 @@ struct sib_info {
 };
 void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
 
+extern void notify_resource_state(struct sk_buff *,
+				  unsigned int,
+				  struct drbd_resource *,
+				  struct resource_info *,
+				  enum drbd_notification_type);
+extern void notify_device_state(struct sk_buff *,
+				unsigned int,
+				struct drbd_device *,
+				struct device_info *,
+				enum drbd_notification_type);
+extern void notify_connection_state(struct sk_buff *,
+				    unsigned int,
+				    struct drbd_connection *,
+				    struct connection_info *,
+				    enum drbd_notification_type);
+extern void notify_peer_device_state(struct sk_buff *,
+				     unsigned int,
+				     struct drbd_peer_device *,
+				     struct peer_device_info *,
+				     enum drbd_notification_type);
+extern void notify_helper(enum drbd_notification_type, struct drbd_device *,
+			  struct drbd_connection *, const char *, int);
+
 /*
  * inline helper functions
  *************************/
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index d37c509e6a44..aa805cdde769 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -36,6 +36,7 @@
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h"
+#include "drbd_state_change.h"
 #include <asm/unaligned.h>
 #include <linux/drbd_limits.h>
 #include <linux/kthread.h>
@@ -75,11 +76,17 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
 int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
 /* .dumpit */
 int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb);
 
 #include <linux/drbd_genl_api.h>
 #include "drbd_nla.h"
 #include <linux/genl_magic_func.h>
 
+static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
+static atomic_t notify_genl_seq = ATOMIC_INIT(2); /* two. */
+
+DEFINE_MUTEX(notification_mutex);
+
 /* used blkdev_get_by_path, to claim our meta data device(s) */
 static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
 
@@ -349,6 +356,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
 	sib.sib_reason = SIB_HELPER_PRE;
 	sib.helper_name = cmd;
 	drbd_bcast_event(device, &sib);
+	notify_helper(NOTIFY_CALL, device, connection, cmd, 0);
 	ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 	if (ret)
 		drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n",
@@ -361,6 +369,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
 	sib.sib_reason = SIB_HELPER_POST;
 	sib.helper_exit_code = ret;
 	drbd_bcast_event(device, &sib);
+	notify_helper(NOTIFY_RESPONSE, device, connection, cmd, ret);
 
 	if (current == connection->worker.task)
 		clear_bit(CALLBACK_PENDING, &connection->flags);
@@ -388,6 +397,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd)
 
 	drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name);
 	/* TODO: conn_bcast_event() ?? */
+	notify_helper(NOTIFY_CALL, NULL, connection, cmd, 0);
 
 	ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 	if (ret)
@@ -399,6 +409,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd)
 			  usermode_helper, cmd, resource_name,
 			  (ret >> 8) & 0xff, ret);
 	/* TODO: conn_bcast_event() ?? */
+	notify_helper(NOTIFY_RESPONSE, NULL, connection, cmd, ret);
 
 	if (ret < 0) /* Ignore any ERRNOs we got. */
 		ret = 0;
@@ -2248,8 +2259,31 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
+static void connection_to_info(struct connection_info *info,
+			       struct drbd_connection *connection)
+{
+	info->conn_connection_state = connection->cstate;
+	info->conn_role = conn_highest_peer(connection);
+}
+
+static void peer_device_to_info(struct peer_device_info *info,
+				struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+
+	info->peer_repl_state =
+		max_t(enum drbd_conns, C_WF_REPORT_PARAMS, device->state.conn);
+	info->peer_disk_state = device->state.pdsk;
+	info->peer_resync_susp_user = device->state.user_isp;
+	info->peer_resync_susp_peer = device->state.peer_isp;
+	info->peer_resync_susp_dependency = device->state.aftr_isp;
+}
+
 int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
 {
+	struct connection_info connection_info;
+	enum drbd_notification_type flags;
+	unsigned int peer_devices = 0;
 	struct drbd_config_context adm_ctx;
 	struct drbd_peer_device *peer_device;
 	struct net_conf *old_net_conf, *new_net_conf = NULL;
@@ -2350,6 +2384,22 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
 	connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
 	memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);
 
+	idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+		peer_devices++;
+	}
+
+	connection_to_info(&connection_info, connection);
+	flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+	mutex_lock(&notification_mutex);
+	notify_connection_state(NULL, 0, connection, &connection_info, NOTIFY_CREATE | flags);
+	idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+		struct peer_device_info peer_device_info;
+
+		peer_device_to_info(&peer_device_info, peer_device);
+		flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+		notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags);
+	}
+	mutex_unlock(&notification_mutex);
 	mutex_unlock(&adm_ctx.resource->conf_update);
 
 	rcu_read_lock();
@@ -2431,6 +2481,8 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection
 			drbd_err(connection,
 				"unexpected rv2=%d in conn_try_disconnect()\n",
 				rv2);
+		/* Unlike in DRBD 9, the state engine has generated
+		 * NOTIFY_DESTROY events before clearing connection->net_conf. */
 	}
 	return rv;
 }
@@ -3417,8 +3469,18 @@ drbd_check_resource_name(struct drbd_config_context *adm_ctx)
 	return NO_ERROR;
 }
 
+static void resource_to_info(struct resource_info *info,
+			     struct drbd_resource *resource)
+{
+	info->res_role = conn_highest_role(first_connection(resource));
+	info->res_susp = resource->susp;
+	info->res_susp_nod = resource->susp_nod;
+	info->res_susp_fen = resource->susp_fen;
+}
+
 int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_connection *connection;
 	struct drbd_config_context adm_ctx;
 	enum drbd_ret_code retcode;
 	struct res_opts res_opts;
@@ -3453,14 +3515,32 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
 
 	/* not yet safe for genl_family.parallel_ops */
 	mutex_lock(&resources_mutex);
-	if (!conn_create(adm_ctx.resource_name, &res_opts))
-		retcode = ERR_NOMEM;
+	connection = conn_create(adm_ctx.resource_name, &res_opts);
 	mutex_unlock(&resources_mutex);
+
+	if (connection) {
+		struct resource_info resource_info;
+
+		mutex_lock(&notification_mutex);
+		resource_to_info(&resource_info, connection->resource);
+		notify_resource_state(NULL, 0, connection->resource,
+				      &resource_info, NOTIFY_CREATE);
+		mutex_unlock(&notification_mutex);
+	} else
+		retcode = ERR_NOMEM;
+
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
+static void device_to_info(struct device_info *info,
+			   struct drbd_device *device)
+{
+	info->dev_disk_state = device->state.disk;
+}
+
+
 int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
 {
 	struct drbd_config_context adm_ctx;
@@ -3495,6 +3575,36 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
 
 	mutex_lock(&adm_ctx.resource->adm_mutex);
 	retcode = drbd_create_device(&adm_ctx, dh->minor);
+	if (retcode == NO_ERROR) {
+		struct drbd_device *device;
+		struct drbd_peer_device *peer_device;
+		struct device_info info;
+		unsigned int peer_devices = 0;
+		enum drbd_notification_type flags;
+
+		device = minor_to_device(dh->minor);
+		for_each_peer_device(peer_device, device) {
+			if (!has_net_conf(peer_device->connection))
+				continue;
+			peer_devices++;
+		}
+
+		device_to_info(&info, device);
+		mutex_lock(&notification_mutex);
+		flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+		notify_device_state(NULL, 0, device, &info, NOTIFY_CREATE | flags);
+		for_each_peer_device(peer_device, device) {
+			struct peer_device_info peer_device_info;
+
+			if (!has_net_conf(peer_device->connection))
+				continue;
+			peer_device_to_info(&peer_device_info, peer_device);
+			flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+			notify_peer_device_state(NULL, 0, peer_device, &peer_device_info,
+						 NOTIFY_CREATE | flags);
+		}
+		mutex_unlock(&notification_mutex);
+	}
 	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
@@ -3503,13 +3613,35 @@ out:
 
 static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
 {
+	struct drbd_peer_device *peer_device;
+
 	if (device->state.disk == D_DISKLESS &&
 	    /* no need to be device->state.conn == C_STANDALONE &&
 	     * we may want to delete a minor from a live replication group.
 	     */
 	    device->state.role == R_SECONDARY) {
+		struct drbd_connection *connection =
+			first_connection(device->resource);
+
 		_drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
 				    CS_VERBOSE + CS_WAIT_COMPLETE);
+
+		/* If the state engine hasn't stopped the sender thread yet, we
+		 * need to flush the sender work queue before generating the
+		 * DESTROY events here. */
+		if (get_t_state(&connection->worker) == RUNNING)
+			drbd_flush_workqueue(&connection->sender_work);
+
+		mutex_lock(&notification_mutex);
+		for_each_peer_device(peer_device, device) {
+			if (!has_net_conf(peer_device->connection))
+				continue;
+			notify_peer_device_state(NULL, 0, peer_device, NULL,
+						 NOTIFY_DESTROY | NOTIFY_CONTINUES);
+		}
+		notify_device_state(NULL, 0, device, NULL, NOTIFY_DESTROY);
+		mutex_unlock(&notification_mutex);
+
 		drbd_delete_device(device);
 		return NO_ERROR;
 	} else
@@ -3546,6 +3678,13 @@ static int adm_del_resource(struct drbd_resource *resource)
 	if (!idr_is_empty(&resource->devices))
 		return ERR_RES_IN_USE;
 
+	/* The state engine has stopped the sender thread, so we don't
+	 * need to flush the sender work queue before generating the
+	 * DESTROY event here. */
+	mutex_lock(&notification_mutex);
+	notify_resource_state(NULL, 0, resource, NULL, NOTIFY_DESTROY);
+	mutex_unlock(&notification_mutex);
+
 	mutex_lock(&resources_mutex);
 	list_del_rcu(&resource->resources);
 	mutex_unlock(&resources_mutex);
@@ -3644,7 +3783,6 @@ finish:
 
 void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
 {
-	static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
 	struct sk_buff *msg;
 	struct drbd_genlmsghdr *d_out;
 	unsigned seq;
@@ -3679,3 +3817,484 @@ failed:
 			"Event seq:%u sib_reason:%u\n",
 			err, seq, sib->sib_reason);
 }
+
+static void device_to_statistics(struct device_statistics *s,
+				 struct drbd_device *device)
+{
+	memset(s, 0, sizeof(*s));
+	s->dev_upper_blocked = !may_inc_ap_bio(device);
+	if (get_ldev(device)) {
+		struct drbd_md *md = &device->ldev->md;
+		u64 *history_uuids = (u64 *)s->history_uuids;
+		struct request_queue *q;
+		int n;
+
+		spin_lock_irq(&md->uuid_lock);
+		s->dev_current_uuid = md->uuid[UI_CURRENT];
+		BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1);
+		for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++)
+			history_uuids[n] = md->uuid[UI_HISTORY_START + n];
+		for (; n < HISTORY_UUIDS; n++)
+			history_uuids[n] = 0;
+		s->history_uuids_len = HISTORY_UUIDS;
+		spin_unlock_irq(&md->uuid_lock);
+
+		s->dev_disk_flags = md->flags;
+		q = bdev_get_queue(device->ldev->backing_bdev);
+		s->dev_lower_blocked =
+			bdi_congested(&q->backing_dev_info,
+				      (1 << WB_async_congested) |
+				      (1 << WB_sync_congested));
+		put_ldev(device);
+	}
+	s->dev_size = drbd_get_capacity(device->this_bdev);
+	s->dev_read = device->read_cnt;
+	s->dev_write = device->writ_cnt;
+	s->dev_al_writes = device->al_writ_cnt;
+	s->dev_bm_writes = device->bm_writ_cnt;
+	s->dev_upper_pending = atomic_read(&device->ap_bio_cnt);
+	s->dev_lower_pending = atomic_read(&device->local_cnt);
+	s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags);
+	s->dev_exposed_data_uuid = device->ed_uuid;
+}
+
+enum mdf_peer_flag {
+	MDF_PEER_CONNECTED =	1 << 0,
+	MDF_PEER_OUTDATED =	1 << 1,
+	MDF_PEER_FENCING =	1 << 2,
+	MDF_PEER_FULL_SYNC =	1 << 3,
+};
+
+static void peer_device_to_statistics(struct peer_device_statistics *s,
+				      struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+
+	memset(s, 0, sizeof(*s));
+	s->peer_dev_received = device->recv_cnt;
+	s->peer_dev_sent = device->send_cnt;
+	s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) +
+			      atomic_read(&device->rs_pending_cnt);
+	s->peer_dev_unacked = atomic_read(&device->unacked_cnt);
+	s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9);
+	s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9);
+	if (get_ldev(device)) {
+		struct drbd_md *md = &device->ldev->md;
+
+		spin_lock_irq(&md->uuid_lock);
+		s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP];
+		spin_unlock_irq(&md->uuid_lock);
+		s->peer_dev_flags =
+			(drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ?
+				MDF_PEER_CONNECTED : 0) +
+			(drbd_md_test_flag(device->ldev, MDF_CONSISTENT) &&
+			 !drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ?
+				MDF_PEER_OUTDATED : 0) +
+			/* FIXME: MDF_PEER_FENCING? */
+			(drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ?
+				MDF_PEER_FULL_SYNC : 0);
+		put_ldev(device);
+	}
+}
+
+static int nla_put_notification_header(struct sk_buff *msg,
+				       enum drbd_notification_type type)
+{
+	struct drbd_notification_header nh = {
+		.nh_type = type,
+	};
+
+	return drbd_notification_header_to_skb(msg, &nh, true);
+}
+
+void notify_resource_state(struct sk_buff *skb,
+			   unsigned int seq,
+			   struct drbd_resource *resource,
+			   struct resource_info *resource_info,
+			   enum drbd_notification_type type)
+{
+	struct resource_statistics resource_statistics;
+	struct drbd_genlmsghdr *dh;
+	bool multicast = false;
+	int err;
+
+	if (!skb) {
+		seq = atomic_inc_return(&notify_genl_seq);
+		skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+		err = -ENOMEM;
+		if (!skb)
+			goto failed;
+		multicast = true;
+	}
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_RESOURCE_STATE);
+	if (!dh)
+		goto nla_put_failure;
+	dh->minor = -1U;
+	dh->ret_code = NO_ERROR;
+	if (nla_put_drbd_cfg_context(skb, resource, NULL, NULL) ||
+	    nla_put_notification_header(skb, type) ||
+	    ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+	     resource_info_to_skb(skb, resource_info, true)))
+		goto nla_put_failure;
+	resource_statistics.res_stat_write_ordering = resource->write_ordering;
+	err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
+	if (err)
+		goto nla_put_failure;
+	genlmsg_end(skb, dh);
+	if (multicast) {
+		err = drbd_genl_multicast_events(skb, 0);
+		/* skb has been consumed or freed in netlink_broadcast() */
+		if (err && err != -ESRCH)
+			goto failed;
+	}
+	return;
+
+nla_put_failure:
+	nlmsg_free(skb);
+failed:
+	drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
+			err, seq);
+}
+
+void notify_device_state(struct sk_buff *skb,
+			 unsigned int seq,
+			 struct drbd_device *device,
+			 struct device_info *device_info,
+			 enum drbd_notification_type type)
+{
+	struct device_statistics device_statistics;
+	struct drbd_genlmsghdr *dh;
+	bool multicast = false;
+	int err;
+
+	if (!skb) {
+		seq = atomic_inc_return(&notify_genl_seq);
+		skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+		err = -ENOMEM;
+		if (!skb)
+			goto failed;
+		multicast = true;
+	}
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE);
+	if (!dh)
+		goto nla_put_failure;
+	dh->minor = device->minor;
+	dh->ret_code = NO_ERROR;
+	if (nla_put_drbd_cfg_context(skb, device->resource, NULL, device) ||
+	    nla_put_notification_header(skb, type) ||
+	    ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+	     device_info_to_skb(skb, device_info, true)))
+		goto nla_put_failure;
+	device_to_statistics(&device_statistics, device);
+	device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
+	genlmsg_end(skb, dh);
+	if (multicast) {
+		err = drbd_genl_multicast_events(skb, 0);
+		/* skb has been consumed or freed in netlink_broadcast() */
+		if (err && err != -ESRCH)
+			goto failed;
+	}
+	return;
+
+nla_put_failure:
+	nlmsg_free(skb);
+failed:
+	drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n",
+		 err, seq);
+}
+
+void notify_connection_state(struct sk_buff *skb,
+			     unsigned int seq,
+			     struct drbd_connection *connection,
+			     struct connection_info *connection_info,
+			     enum drbd_notification_type type)
+{
+	struct connection_statistics connection_statistics;
+	struct drbd_genlmsghdr *dh;
+	bool multicast = false;
+	int err;
+
+	if (!skb) {
+		seq = atomic_inc_return(&notify_genl_seq);
+		skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+		err = -ENOMEM;
+		if (!skb)
+			goto failed;
+		multicast = true;
+	}
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_CONNECTION_STATE);
+	if (!dh)
+		goto nla_put_failure;
+	dh->minor = -1U;
+	dh->ret_code = NO_ERROR;
+	if (nla_put_drbd_cfg_context(skb, connection->resource, connection, NULL) ||
+	    nla_put_notification_header(skb, type) ||
+	    ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+	     connection_info_to_skb(skb, connection_info, true)))
+		goto nla_put_failure;
+	connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
+	connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
+	genlmsg_end(skb, dh);
+	if (multicast) {
+		err = drbd_genl_multicast_events(skb, 0);
+		/* skb has been consumed or freed in netlink_broadcast() */
+		if (err && err != -ESRCH)
+			goto failed;
+	}
+	return;
+
+nla_put_failure:
+	nlmsg_free(skb);
+failed:
+	drbd_err(connection, "Error %d while broadcasting event. Event seq:%u\n",
+		 err, seq);
+}
+
+void notify_peer_device_state(struct sk_buff *skb,
+			      unsigned int seq,
+			      struct drbd_peer_device *peer_device,
+			      struct peer_device_info *peer_device_info,
+			      enum drbd_notification_type type)
+{
+	struct peer_device_statistics peer_device_statistics;
+	struct drbd_resource *resource = peer_device->device->resource;
+	struct drbd_genlmsghdr *dh;
+	bool multicast = false;
+	int err;
+
+	if (!skb) {
+		seq = atomic_inc_return(&notify_genl_seq);
+		skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+		err = -ENOMEM;
+		if (!skb)
+			goto failed;
+		multicast = true;
+	}
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_PEER_DEVICE_STATE);
+	if (!dh)
+		goto nla_put_failure;
+	dh->minor = -1U;
+	dh->ret_code = NO_ERROR;
+	if (nla_put_drbd_cfg_context(skb, resource, peer_device->connection, peer_device->device) ||
+	    nla_put_notification_header(skb, type) ||
+	    ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+	     peer_device_info_to_skb(skb, peer_device_info, true)))
+		goto nla_put_failure;
+	peer_device_to_statistics(&peer_device_statistics, peer_device);
+	peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
+	genlmsg_end(skb, dh);
+	if (multicast) {
+		err = drbd_genl_multicast_events(skb, 0);
+		/* skb has been consumed or freed in netlink_broadcast() */
+		if (err && err != -ESRCH)
+			goto failed;
+	}
+	return;
+
+nla_put_failure:
+	nlmsg_free(skb);
+failed:
+	drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n",
+		 err, seq);
+}
+
+void notify_helper(enum drbd_notification_type type,
+		   struct drbd_device *device, struct drbd_connection *connection,
+		   const char *name, int status)
+{
+	struct drbd_resource *resource = device ? device->resource : connection->resource;
+	struct drbd_helper_info helper_info;
+	unsigned int seq = atomic_inc_return(&notify_genl_seq);
+	struct sk_buff *skb = NULL;
+	struct drbd_genlmsghdr *dh;
+	int err;
+
+	strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name));
+	helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name));
+	helper_info.helper_status = status;
+
+	skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+	err = -ENOMEM;
+	if (!skb)
+		goto fail;
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_HELPER);
+	if (!dh)
+		goto fail;
+	dh->minor = device ? device->minor : -1;
+	dh->ret_code = NO_ERROR;
+	mutex_lock(&notification_mutex);
+	if (nla_put_drbd_cfg_context(skb, resource, connection, device) ||
+	    nla_put_notification_header(skb, type) ||
+	    drbd_helper_info_to_skb(skb, &helper_info, true))
+		goto unlock_fail;
+	genlmsg_end(skb, dh);
+	err = drbd_genl_multicast_events(skb, 0);
+	skb = NULL;
+	/* skb has been consumed or freed in netlink_broadcast() */
+	if (err && err != -ESRCH)
+		goto unlock_fail;
+	mutex_unlock(&notification_mutex);
+	return;
+
+unlock_fail:
+	mutex_unlock(&notification_mutex);
+fail:
+	nlmsg_free(skb);
+	drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
+		 err, seq);
+}
+
+static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq)
+{
+	struct drbd_genlmsghdr *dh;
+	int err;
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_INITIAL_STATE_DONE);
+	if (!dh)
+		goto nla_put_failure;
+	dh->minor = -1U;
+	dh->ret_code = NO_ERROR;
+	if (nla_put_notification_header(skb, NOTIFY_EXISTS))
+		goto nla_put_failure;
+	genlmsg_end(skb, dh);
+	return;
+
+nla_put_failure:
+	nlmsg_free(skb);
+	pr_err("Error %d sending event. Event seq:%u\n", err, seq);
+}
+
+static void free_state_changes(struct list_head *list)
+{
+	while (!list_empty(list)) {
+		struct drbd_state_change *state_change =
+			list_first_entry(list, struct drbd_state_change, list);
+		list_del(&state_change->list);
+		forget_state_change(state_change);
+	}
+}
+
+static unsigned int notifications_for_state_change(struct drbd_state_change *state_change)
+{
+	return 1 +
+	       state_change->n_connections +
+	       state_change->n_devices +
+	       state_change->n_devices * state_change->n_connections;
+}
+
+static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct drbd_state_change *state_change = (struct drbd_state_change *)cb->args[0];
+	unsigned int seq = cb->args[2];
+	unsigned int n;
+	enum drbd_notification_type flags = 0;
+
+	/* There is no need for taking notification_mutex here: it doesn't
+	   matter if the initial state events mix with later state chage
+	   events; we can always tell the events apart by the NOTIFY_EXISTS
+	   flag. */
+
+	cb->args[5]--;
+	if (cb->args[5] == 1) {
+		notify_initial_state_done(skb, seq);
+		goto out;
+	}
+	n = cb->args[4]++;
+	if (cb->args[4] < cb->args[3])
+		flags |= NOTIFY_CONTINUES;
+	if (n < 1) {
+		notify_resource_state_change(skb, seq, state_change->resource,
+					     NOTIFY_EXISTS | flags);
+		goto next;
+	}
+	n--;
+	if (n < state_change->n_connections) {
+		notify_connection_state_change(skb, seq, &state_change->connections[n],
+					       NOTIFY_EXISTS | flags);
+		goto next;
+	}
+	n -= state_change->n_connections;
+	if (n < state_change->n_devices) {
+		notify_device_state_change(skb, seq, &state_change->devices[n],
+					   NOTIFY_EXISTS | flags);
+		goto next;
+	}
+	n -= state_change->n_devices;
+	if (n < state_change->n_devices * state_change->n_connections) {
+		notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n],
+						NOTIFY_EXISTS | flags);
+		goto next;
+	}
+
+next:
+	if (cb->args[4] == cb->args[3]) {
+		struct drbd_state_change *next_state_change =
+			list_entry(state_change->list.next,
+				   struct drbd_state_change, list);
+		cb->args[0] = (long)next_state_change;
+		cb->args[3] = notifications_for_state_change(next_state_change);
+		cb->args[4] = 0;
+	}
+out:
+	return skb->len;
+}
+
+int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct drbd_resource *resource;
+	LIST_HEAD(head);
+
+	if (cb->args[5] >= 1) {
+		if (cb->args[5] > 1)
+			return get_initial_state(skb, cb);
+		if (cb->args[0]) {
+			struct drbd_state_change *state_change =
+				(struct drbd_state_change *)cb->args[0];
+
+			/* connect list to head */
+			list_add(&head, &state_change->list);
+			free_state_changes(&head);
+		}
+		return 0;
+	}
+
+	cb->args[5] = 2;  /* number of iterations */
+	mutex_lock(&resources_mutex);
+	for_each_resource(resource, &drbd_resources) {
+		struct drbd_state_change *state_change;
+
+		state_change = remember_old_state(resource, GFP_KERNEL);
+		if (!state_change) {
+			if (!list_empty(&head))
+				free_state_changes(&head);
+			mutex_unlock(&resources_mutex);
+			return -ENOMEM;
+		}
+		copy_old_to_new_state_change(state_change);
+		list_add_tail(&state_change->list, &head);
+		cb->args[5] += notifications_for_state_change(state_change);
+	}
+	mutex_unlock(&resources_mutex);
+
+	if (!list_empty(&head)) {
+		struct drbd_state_change *state_change =
+			list_entry(head.next, struct drbd_state_change, list);
+		cb->args[0] = (long)state_change;
+		cb->args[3] = notifications_for_state_change(state_change);
+		list_del(&head);  /* detach list from head */
+	}
+
+	cb->args[2] = cb->nlh->nlmsg_seq;
+	return get_initial_state(skb, cb);
+}
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index bf38b957d9dd..61b73c77a690 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1508,12 +1508,6 @@ static void conn_wait_active_ee_empty(struct drbd_connection *connection)
 	rcu_read_unlock();
 }
 
-static struct drbd_peer_device *
-conn_peer_device(struct drbd_connection *connection, int volume_number)
-{
-	return idr_find(&connection->peer_devices, volume_number);
-}
-
 static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
 {
 	int rv;
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 535ae47f84c9..bc4b45bf9ace 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -29,6 +29,7 @@
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h"
+#include "drbd_state_change.h"
 
 struct after_state_chg_work {
 	struct drbd_work w;
@@ -37,6 +38,7 @@ struct after_state_chg_work {
 	union drbd_state ns;
 	enum chg_state_flags flags;
 	struct completion *done;
+	struct drbd_state_change *state_change;
 };
 
 enum sanitize_state_warnings {
@@ -48,9 +50,266 @@ enum sanitize_state_warnings {
 	IMPLICITLY_UPGRADED_PDSK,
 };
 
+static void count_objects(struct drbd_resource *resource,
+			  unsigned int *n_devices,
+			  unsigned int *n_connections)
+{
+	struct drbd_device *device;
+	struct drbd_connection *connection;
+	int vnr;
+
+	*n_devices = 0;
+	*n_connections = 0;
+
+	idr_for_each_entry(&resource->devices, device, vnr)
+		(*n_devices)++;
+	for_each_connection(connection, resource) {
+		if (!has_net_conf(connection))
+			continue;
+		(*n_connections)++;
+	}
+}
+
+static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp)
+{
+	struct drbd_state_change *state_change;
+	unsigned int size, n;
+
+	size = sizeof(struct drbd_state_change) +
+	       n_devices * sizeof(struct drbd_device_state_change) +
+	       n_connections * sizeof(struct drbd_connection_state_change) +
+	       n_devices * n_connections * sizeof(struct drbd_peer_device_state_change);
+	state_change = kmalloc(size, gfp);
+	if (!state_change)
+		return NULL;
+	state_change->n_devices = n_devices;
+	state_change->n_connections = n_connections;
+	state_change->devices = (void *)(state_change + 1);
+	state_change->connections = (void *)&state_change->devices[n_devices];
+	state_change->peer_devices = (void *)&state_change->connections[n_connections];
+	state_change->resource->resource = NULL;
+	for (n = 0; n < n_devices; n++)
+		state_change->devices[n].device = NULL;
+	for (n = 0; n < n_connections; n++)
+		state_change->connections[n].connection = NULL;
+	return state_change;
+}
+
+struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp)
+{
+	struct drbd_state_change *state_change;
+	struct drbd_device *device;
+	unsigned int n_devices;
+	struct drbd_connection *connection;
+	unsigned int n_connections;
+	int vnr;
+
+	struct drbd_device_state_change *device_state_change;
+	struct drbd_peer_device_state_change *peer_device_state_change;
+	struct drbd_connection_state_change *connection_state_change;
+
+retry:
+	rcu_read_lock();
+	count_objects(resource, &n_devices, &n_connections);
+	rcu_read_unlock();
+	state_change = alloc_state_change(n_devices, n_connections, gfp);
+	if (!state_change)
+		return NULL;
+
+	rcu_read_lock();
+	count_objects(resource, &n_devices, &n_connections);
+	if (n_devices != state_change->n_devices ||
+	    n_connections != state_change->n_connections) {
+		kfree(state_change);
+		rcu_read_unlock();
+		goto retry;
+	}
+
+	kref_get(&resource->kref);
+	state_change->resource->resource = resource;
+	state_change->resource->role[OLD] =
+		conn_highest_role(first_connection(resource));
+	state_change->resource->susp[OLD] = resource->susp;
+	state_change->resource->susp_nod[OLD] = resource->susp_nod;
+	state_change->resource->susp_fen[OLD] = resource->susp_fen;
+
+	device_state_change = state_change->devices;
+	peer_device_state_change = state_change->peer_devices;
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		kref_get(&device->kref);
+		device_state_change->device = device;
+		device_state_change->disk_state[OLD] = device->state.disk;
+
+		/* The peer_devices for each device have to be enumerated in
+		   the order of the connections. We may not use for_each_peer_device() here. */
+		for_each_connection(connection, resource) {
+			struct drbd_peer_device *peer_device;
+
+			if (!has_net_conf(connection))
+				continue;
+			peer_device = conn_peer_device(connection, device->vnr);
+			peer_device_state_change->peer_device = peer_device;
+			peer_device_state_change->disk_state[OLD] =
+				device->state.pdsk;
+			peer_device_state_change->repl_state[OLD] =
+				max_t(enum drbd_conns,
+				      C_WF_REPORT_PARAMS, device->state.conn);
+			peer_device_state_change->resync_susp_user[OLD] =
+				device->state.user_isp;
+			peer_device_state_change->resync_susp_peer[OLD] =
+				device->state.peer_isp;
+			peer_device_state_change->resync_susp_dependency[OLD] =
+				device->state.aftr_isp;
+			peer_device_state_change++;
+		}
+		device_state_change++;
+	}
+
+	connection_state_change = state_change->connections;
+	for_each_connection(connection, resource) {
+		if (!has_net_conf(connection))
+			continue;
+		kref_get(&connection->kref);
+		connection_state_change->connection = connection;
+		connection_state_change->cstate[OLD] =
+			connection->cstate;
+		connection_state_change->peer_role[OLD] =
+			conn_highest_peer(connection);
+		connection_state_change++;
+	}
+	rcu_read_unlock();
+
+	return state_change;
+}
+
+static void remember_new_state(struct drbd_state_change *state_change)
+{
+	struct drbd_resource_state_change *resource_state_change;
+	struct drbd_resource *resource;
+	unsigned int n;
+
+	if (!state_change)
+		return;
+
+	resource_state_change = &state_change->resource[0];
+	resource = resource_state_change->resource;
+
+	resource_state_change->role[NEW] =
+		conn_highest_role(first_connection(resource));
+	resource_state_change->susp[NEW] = resource->susp;
+	resource_state_change->susp_nod[NEW] = resource->susp_nod;
+	resource_state_change->susp_fen[NEW] = resource->susp_fen;
+
+	for (n = 0; n < state_change->n_devices; n++) {
+		struct drbd_device_state_change *device_state_change =
+			&state_change->devices[n];
+		struct drbd_device *device = device_state_change->device;
+
+		device_state_change->disk_state[NEW] = device->state.disk;
+	}
+
+	for (n = 0; n < state_change->n_connections; n++) {
+		struct drbd_connection_state_change *connection_state_change =
+			&state_change->connections[n];
+		struct drbd_connection *connection =
+			connection_state_change->connection;
+
+		connection_state_change->cstate[NEW] = connection->cstate;
+		connection_state_change->peer_role[NEW] =
+			conn_highest_peer(connection);
+	}
+
+	for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) {
+		struct drbd_peer_device_state_change *peer_device_state_change =
+			&state_change->peer_devices[n];
+		struct drbd_device *device =
+			peer_device_state_change->peer_device->device;
+		union drbd_dev_state state = device->state;
+
+		peer_device_state_change->disk_state[NEW] = state.pdsk;
+		peer_device_state_change->repl_state[NEW] =
+			max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn);
+		peer_device_state_change->resync_susp_user[NEW] =
+			state.user_isp;
+		peer_device_state_change->resync_susp_peer[NEW] =
+			state.peer_isp;
+		peer_device_state_change->resync_susp_dependency[NEW] =
+			state.aftr_isp;
+	}
+}
+
+void copy_old_to_new_state_change(struct drbd_state_change *state_change)
+{
+	struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
+	unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
+
+#define OLD_TO_NEW(x) \
+	(x[NEW] = x[OLD])
+
+	OLD_TO_NEW(resource_state_change->role);
+	OLD_TO_NEW(resource_state_change->susp);
+	OLD_TO_NEW(resource_state_change->susp_nod);
+	OLD_TO_NEW(resource_state_change->susp_fen);
+
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change =
+				&state_change->connections[n_connection];
+
+		OLD_TO_NEW(connection_state_change->peer_role);
+		OLD_TO_NEW(connection_state_change->cstate);
+	}
+
+	for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+		struct drbd_device_state_change *device_state_change =
+			&state_change->devices[n_device];
+
+		OLD_TO_NEW(device_state_change->disk_state);
+	}
+
+	n_peer_devices = state_change->n_devices * state_change->n_connections;
+	for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
+		struct drbd_peer_device_state_change *p =
+			&state_change->peer_devices[n_peer_device];
+
+		OLD_TO_NEW(p->disk_state);
+		OLD_TO_NEW(p->repl_state);
+		OLD_TO_NEW(p->resync_susp_user);
+		OLD_TO_NEW(p->resync_susp_peer);
+		OLD_TO_NEW(p->resync_susp_dependency);
+	}
+
+#undef OLD_TO_NEW
+}
+
+void forget_state_change(struct drbd_state_change *state_change)
+{
+	unsigned int n;
+
+	if (!state_change)
+		return;
+
+	if (state_change->resource->resource)
+		kref_put(&state_change->resource->resource->kref, drbd_destroy_resource);
+	for (n = 0; n < state_change->n_devices; n++) {
+		struct drbd_device *device = state_change->devices[n].device;
+
+		if (device)
+			kref_put(&device->kref, drbd_destroy_device);
+	}
+	for (n = 0; n < state_change->n_connections; n++) {
+		struct drbd_connection *connection =
+			state_change->connections[n].connection;
+
+		if (connection)
+			kref_put(&connection->kref, drbd_destroy_connection);
+	}
+	kfree(state_change);
+}
+
 static int w_after_state_ch(struct drbd_work *w, int unused);
 static void after_state_ch(struct drbd_device *device, union drbd_state os,
-			   union drbd_state ns, enum chg_state_flags flags);
+			   union drbd_state ns, enum chg_state_flags flags,
+			   struct drbd_state_change *);
 static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
 static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
 static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
@@ -93,6 +352,7 @@ static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
 		return R_SECONDARY;
 	return R_UNKNOWN;
 }
+
 static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
 {
 	if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
@@ -983,6 +1243,7 @@ _drbd_set_state(struct drbd_device *device, union drbd_state ns,
 	enum drbd_state_rv rv = SS_SUCCESS;
 	enum sanitize_state_warnings ssw;
 	struct after_state_chg_work *ascw;
+	struct drbd_state_change *state_change;
 
 	os = drbd_read_state(device);
 
@@ -1037,6 +1298,9 @@ _drbd_set_state(struct drbd_device *device, union drbd_state ns,
 	if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
 		clear_bit(RS_DONE, &device->flags);
 
+	/* FIXME: Have any flags been set earlier in this function already? */
+	state_change = remember_old_state(device->resource, GFP_ATOMIC);
+
 	/* changes to local_cnt and device flags should be visible before
 	 * changes to state, which again should be visible before anything else
 	 * depending on that change happens. */
@@ -1047,6 +1311,8 @@ _drbd_set_state(struct drbd_device *device, union drbd_state ns,
 	device->resource->susp_fen = ns.susp_fen;
 	smp_wmb();
 
+	remember_new_state(state_change);
+
 	/* put replicated vs not-replicated requests in seperate epochs */
 	if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
 	    drbd_should_do_remote((union drbd_dev_state)ns.i))
@@ -1184,6 +1450,7 @@ _drbd_set_state(struct drbd_device *device, union drbd_state ns,
 		ascw->w.cb = w_after_state_ch;
 		ascw->device = device;
 		ascw->done = done;
+		ascw->state_change = state_change;
 		drbd_queue_work(&connection->sender_work,
 				&ascw->w);
 	} else {
@@ -1199,7 +1466,8 @@ static int w_after_state_ch(struct drbd_work *w, int unused)
 		container_of(w, struct after_state_chg_work, w);
 	struct drbd_device *device = ascw->device;
 
-	after_state_ch(device, ascw->os, ascw->ns, ascw->flags);
+	after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change);
+	forget_state_change(ascw->state_change);
 	if (ascw->flags & CS_WAIT_COMPLETE)
 		complete(ascw->done);
 	kfree(ascw);
@@ -1245,6 +1513,139 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
 	return rv;
 }
 
+void notify_resource_state_change(struct sk_buff *skb,
+				  unsigned int seq,
+				  struct drbd_resource_state_change *resource_state_change,
+				  enum drbd_notification_type type)
+{
+	struct drbd_resource *resource = resource_state_change->resource;
+	struct resource_info resource_info = {
+		.res_role = resource_state_change->role[NEW],
+		.res_susp = resource_state_change->susp[NEW],
+		.res_susp_nod = resource_state_change->susp_nod[NEW],
+		.res_susp_fen = resource_state_change->susp_fen[NEW],
+	};
+
+	notify_resource_state(skb, seq, resource, &resource_info, type);
+}
+
+void notify_connection_state_change(struct sk_buff *skb,
+				    unsigned int seq,
+				    struct drbd_connection_state_change *connection_state_change,
+				    enum drbd_notification_type type)
+{
+	struct drbd_connection *connection = connection_state_change->connection;
+	struct connection_info connection_info = {
+		.conn_connection_state = connection_state_change->cstate[NEW],
+		.conn_role = connection_state_change->peer_role[NEW],
+	};
+
+	notify_connection_state(skb, seq, connection, &connection_info, type);
+}
+
+void notify_device_state_change(struct sk_buff *skb,
+				unsigned int seq,
+				struct drbd_device_state_change *device_state_change,
+				enum drbd_notification_type type)
+{
+	struct drbd_device *device = device_state_change->device;
+	struct device_info device_info = {
+		.dev_disk_state = device_state_change->disk_state[NEW],
+	};
+
+	notify_device_state(skb, seq, device, &device_info, type);
+}
+
+void notify_peer_device_state_change(struct sk_buff *skb,
+				     unsigned int seq,
+				     struct drbd_peer_device_state_change *p,
+				     enum drbd_notification_type type)
+{
+	struct drbd_peer_device *peer_device = p->peer_device;
+	struct peer_device_info peer_device_info = {
+		.peer_repl_state = p->repl_state[NEW],
+		.peer_disk_state = p->disk_state[NEW],
+		.peer_resync_susp_user = p->resync_susp_user[NEW],
+		.peer_resync_susp_peer = p->resync_susp_peer[NEW],
+		.peer_resync_susp_dependency = p->resync_susp_dependency[NEW],
+	};
+
+	notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type);
+}
+
+static void broadcast_state_change(struct drbd_state_change *state_change)
+{
+	struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
+	bool resource_state_has_changed;
+	unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
+	void (*last_func)(struct sk_buff *, unsigned int, void *,
+			  enum drbd_notification_type) = NULL;
+	void *uninitialized_var(last_arg);
+
+#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
+#define FINAL_STATE_CHANGE(type) \
+	({ if (last_func) \
+		last_func(NULL, 0, last_arg, type); \
+	})
+#define REMEMBER_STATE_CHANGE(func, arg, type) \
+	({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
+	   last_func = (typeof(last_func))func; \
+	   last_arg = arg; \
+	 })
+
+	mutex_lock(&notification_mutex);
+
+	resource_state_has_changed =
+	    HAS_CHANGED(resource_state_change->role) ||
+	    HAS_CHANGED(resource_state_change->susp) ||
+	    HAS_CHANGED(resource_state_change->susp_nod) ||
+	    HAS_CHANGED(resource_state_change->susp_fen);
+
+	if (resource_state_has_changed)
+		REMEMBER_STATE_CHANGE(notify_resource_state_change,
+				      resource_state_change, NOTIFY_CHANGE);
+
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change =
+				&state_change->connections[n_connection];
+
+		if (HAS_CHANGED(connection_state_change->peer_role) ||
+		    HAS_CHANGED(connection_state_change->cstate))
+			REMEMBER_STATE_CHANGE(notify_connection_state_change,
+					      connection_state_change, NOTIFY_CHANGE);
+	}
+
+	for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+		struct drbd_device_state_change *device_state_change =
+			&state_change->devices[n_device];
+
+		if (HAS_CHANGED(device_state_change->disk_state))
+			REMEMBER_STATE_CHANGE(notify_device_state_change,
+					      device_state_change, NOTIFY_CHANGE);
+	}
+
+	n_peer_devices = state_change->n_devices * state_change->n_connections;
+	for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
+		struct drbd_peer_device_state_change *p =
+			&state_change->peer_devices[n_peer_device];
+
+		if (HAS_CHANGED(p->disk_state) ||
+		    HAS_CHANGED(p->repl_state) ||
+		    HAS_CHANGED(p->resync_susp_user) ||
+		    HAS_CHANGED(p->resync_susp_peer) ||
+		    HAS_CHANGED(p->resync_susp_dependency))
+			REMEMBER_STATE_CHANGE(notify_peer_device_state_change,
+					      p, NOTIFY_CHANGE);
+	}
+
+	FINAL_STATE_CHANGE(NOTIFY_CHANGE);
+	mutex_unlock(&notification_mutex);
+
+#undef HAS_CHANGED
+#undef FINAL_STATE_CHANGE
+#undef REMEMBER_STATE_CHANGE
+}
+
 /**
  * after_state_ch() - Perform after state change actions that may sleep
  * @device:	DRBD device.
@@ -1253,13 +1654,16 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
  * @flags:	Flags
  */
 static void after_state_ch(struct drbd_device *device, union drbd_state os,
-			   union drbd_state ns, enum chg_state_flags flags)
+			   union drbd_state ns, enum chg_state_flags flags,
+			   struct drbd_state_change *state_change)
 {
 	struct drbd_resource *resource = device->resource;
 	struct drbd_peer_device *peer_device = first_peer_device(device);
 	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
 	struct sib_info sib;
 
+	broadcast_state_change(state_change);
+
 	sib.sib_reason = SIB_STATE_CHANGE;
 	sib.os = os;
 	sib.ns = ns;
@@ -1572,6 +1976,7 @@ struct after_conn_state_chg_work {
 	union drbd_state ns_max; /* new, max state, over all devices */
 	enum chg_state_flags flags;
 	struct drbd_connection *connection;
+	struct drbd_state_change *state_change;
 };
 
 static int w_after_conn_state_ch(struct drbd_work *w, int unused)
@@ -1584,6 +1989,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
 	struct drbd_peer_device *peer_device;
 	int vnr;
 
+	broadcast_state_change(acscw->state_change);
+	forget_state_change(acscw->state_change);
 	kfree(acscw);
 
 	/* Upon network configuration, we need to start the receiver */
@@ -1593,6 +2000,13 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
 	if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
 		struct net_conf *old_conf;
 
+		mutex_lock(&notification_mutex);
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+			notify_peer_device_state(NULL, 0, peer_device, NULL,
+						 NOTIFY_DESTROY | NOTIFY_CONTINUES);
+		notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY);
+		mutex_unlock(&notification_mutex);
+
 		mutex_lock(&connection->resource->conf_update);
 		old_conf = connection->net_conf;
 		connection->my_addr_len = 0;
@@ -1823,6 +2237,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
 	enum drbd_conns oc = connection->cstate;
 	union drbd_state ns_max, ns_min, os;
 	bool have_mutex = false;
+	struct drbd_state_change *state_change;
 
 	if (mask.conn) {
 		rv = is_valid_conn_transition(oc, val.conn);
@@ -1868,10 +2283,12 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
 			goto abort;
 	}
 
+	state_change = remember_old_state(connection->resource, GFP_ATOMIC);
 	conn_old_common_state(connection, &os, &flags);
 	flags |= CS_DC_SUSP;
 	conn_set_state(connection, mask, val, &ns_min, &ns_max, flags);
 	conn_pr_state_change(connection, os, ns_max, flags);
+	remember_new_state(state_change);
 
 	acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
 	if (acscw) {
@@ -1882,6 +2299,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
 		acscw->w.cb = w_after_conn_state_ch;
 		kref_get(&connection->kref);
 		acscw->connection = connection;
+		acscw->state_change = state_change;
 		drbd_queue_work(&connection->sender_work, &acscw->w);
 	} else {
 		drbd_err(connection, "Could not kmalloc an acscw\n");
diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h
new file mode 100644
index 000000000000..9e503a1a0bfb
--- /dev/null
+++ b/drivers/block/drbd/drbd_state_change.h
@@ -0,0 +1,63 @@
+#ifndef DRBD_STATE_CHANGE_H
+#define DRBD_STATE_CHANGE_H
+
+struct drbd_resource_state_change {
+	struct drbd_resource *resource;
+	enum drbd_role role[2];
+	bool susp[2];
+	bool susp_nod[2];
+	bool susp_fen[2];
+};
+
+struct drbd_device_state_change {
+	struct drbd_device *device;
+	enum drbd_disk_state disk_state[2];
+};
+
+struct drbd_connection_state_change {
+	struct drbd_connection *connection;
+	enum drbd_conns cstate[2];  /* drbd9: enum drbd_conn_state */
+	enum drbd_role peer_role[2];
+};
+
+struct drbd_peer_device_state_change {
+	struct drbd_peer_device *peer_device;
+	enum drbd_disk_state disk_state[2];
+	enum drbd_conns repl_state[2];  /* drbd9: enum drbd_repl_state */
+	bool resync_susp_user[2];
+	bool resync_susp_peer[2];
+	bool resync_susp_dependency[2];
+};
+
+struct drbd_state_change {
+	struct list_head list;
+	unsigned int n_devices;
+	unsigned int n_connections;
+	struct drbd_resource_state_change resource[1];
+	struct drbd_device_state_change *devices;
+	struct drbd_connection_state_change *connections;
+	struct drbd_peer_device_state_change *peer_devices;
+};
+
+extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_t);
+extern void copy_old_to_new_state_change(struct drbd_state_change *);
+extern void forget_state_change(struct drbd_state_change *);
+
+extern void notify_resource_state_change(struct sk_buff *,
+					 unsigned int,
+					 struct drbd_resource_state_change *,
+					 enum drbd_notification_type type);
+extern void notify_connection_state_change(struct sk_buff *,
+					   unsigned int,
+					   struct drbd_connection_state_change *,
+					   enum drbd_notification_type type);
+extern void notify_device_state_change(struct sk_buff *,
+				       unsigned int,
+				       struct drbd_device_state_change *,
+				       enum drbd_notification_type type);
+extern void notify_peer_device_state_change(struct sk_buff *,
+					    unsigned int,
+					    struct drbd_peer_device_state_change *,
+					    enum drbd_notification_type type);
+
+#endif  /* DRBD_STATE_CHANGE_H */
-- 
cgit 


From a55bbd375d1802141f0f043e2cd08f85c23d6209 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@linbit.com>
Date: Thu, 28 Aug 2014 13:31:14 +0200
Subject: drbd: Backport the "status" command

The status command originates the drbd9 code base. While for now we
keep the status information in /proc/drbd available, this commit
allows the user base to gracefully migrate their monitoring
infrastructure to the new status reporting interface.

In drbd9 no status information is exposed through /proc/drbd.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 566 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 487 insertions(+), 79 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index aa805cdde769..1eb10e28ac19 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -76,6 +76,13 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
 int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
 /* .dumpit */
 int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_devices_done(struct netlink_callback *cb);
+int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_connections_done(struct netlink_callback *cb);
+int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb);
 int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb);
 
 #include <linux/drbd_genl_api.h>
@@ -2964,6 +2971,486 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+/*
+ * The generic netlink dump callbacks are called outside the genl_lock(), so
+ * they cannot use the simple attribute parsing code which uses global
+ * attribute tables.
+ */
+static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr)
+{
+	const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+	const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+	struct nlattr *nla;
+
+	nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen),
+		       DRBD_NLA_CFG_CONTEXT);
+	if (!nla)
+		return NULL;
+	return drbd_nla_find_nested(maxtype, nla, __nla_type(attr));
+}
+
+static void resource_to_info(struct resource_info *, struct drbd_resource *);
+
+int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct drbd_genlmsghdr *dh;
+	struct drbd_resource *resource;
+	struct resource_info resource_info;
+	struct resource_statistics resource_statistics;
+	int err;
+
+	rcu_read_lock();
+	if (cb->args[0]) {
+		for_each_resource_rcu(resource, &drbd_resources)
+			if (resource == (struct drbd_resource *)cb->args[0])
+				goto found_resource;
+		err = 0;  /* resource was probably deleted */
+		goto out;
+	}
+	resource = list_entry(&drbd_resources,
+			      struct drbd_resource, resources);
+
+found_resource:
+	list_for_each_entry_continue_rcu(resource, &drbd_resources, resources) {
+		goto put_result;
+	}
+	err = 0;
+	goto out;
+
+put_result:
+	dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			cb->nlh->nlmsg_seq, &drbd_genl_family,
+			NLM_F_MULTI, DRBD_ADM_GET_RESOURCES);
+	err = -ENOMEM;
+	if (!dh)
+		goto out;
+	dh->minor = -1U;
+	dh->ret_code = NO_ERROR;
+	err = nla_put_drbd_cfg_context(skb, resource, NULL, NULL);
+	if (err)
+		goto out;
+	err = res_opts_to_skb(skb, &resource->res_opts, !capable(CAP_SYS_ADMIN));
+	if (err)
+		goto out;
+	resource_to_info(&resource_info, resource);
+	err = resource_info_to_skb(skb, &resource_info, !capable(CAP_SYS_ADMIN));
+	if (err)
+		goto out;
+	resource_statistics.res_stat_write_ordering = resource->write_ordering;
+	err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
+	if (err)
+		goto out;
+	cb->args[0] = (long)resource;
+	genlmsg_end(skb, dh);
+	err = 0;
+
+out:
+	rcu_read_unlock();
+	if (err)
+		return err;
+	return skb->len;
+}
+
+static void device_to_statistics(struct device_statistics *s,
+				 struct drbd_device *device)
+{
+	memset(s, 0, sizeof(*s));
+	s->dev_upper_blocked = !may_inc_ap_bio(device);
+	if (get_ldev(device)) {
+		struct drbd_md *md = &device->ldev->md;
+		u64 *history_uuids = (u64 *)s->history_uuids;
+		struct request_queue *q;
+		int n;
+
+		spin_lock_irq(&md->uuid_lock);
+		s->dev_current_uuid = md->uuid[UI_CURRENT];
+		BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1);
+		for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++)
+			history_uuids[n] = md->uuid[UI_HISTORY_START + n];
+		for (; n < HISTORY_UUIDS; n++)
+			history_uuids[n] = 0;
+		s->history_uuids_len = HISTORY_UUIDS;
+		spin_unlock_irq(&md->uuid_lock);
+
+		s->dev_disk_flags = md->flags;
+		q = bdev_get_queue(device->ldev->backing_bdev);
+		s->dev_lower_blocked =
+			bdi_congested(&q->backing_dev_info,
+				      (1 << WB_async_congested) |
+				      (1 << WB_sync_congested));
+		put_ldev(device);
+	}
+	s->dev_size = drbd_get_capacity(device->this_bdev);
+	s->dev_read = device->read_cnt;
+	s->dev_write = device->writ_cnt;
+	s->dev_al_writes = device->al_writ_cnt;
+	s->dev_bm_writes = device->bm_writ_cnt;
+	s->dev_upper_pending = atomic_read(&device->ap_bio_cnt);
+	s->dev_lower_pending = atomic_read(&device->local_cnt);
+	s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags);
+	s->dev_exposed_data_uuid = device->ed_uuid;
+}
+
+static int put_resource_in_arg0(struct netlink_callback *cb, int holder_nr)
+{
+	if (cb->args[0]) {
+		struct drbd_resource *resource =
+			(struct drbd_resource *)cb->args[0];
+		kref_put(&resource->kref, drbd_destroy_resource);
+	}
+
+	return 0;
+}
+
+int drbd_adm_dump_devices_done(struct netlink_callback *cb) {
+	return put_resource_in_arg0(cb, 7);
+}
+
+static void device_to_info(struct device_info *, struct drbd_device *);
+
+int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *resource_filter;
+	struct drbd_resource *resource;
+	struct drbd_device *uninitialized_var(device);
+	int minor, err, retcode;
+	struct drbd_genlmsghdr *dh;
+	struct device_info device_info;
+	struct device_statistics device_statistics;
+	struct idr *idr_to_search;
+
+	resource = (struct drbd_resource *)cb->args[0];
+	if (!cb->args[0] && !cb->args[1]) {
+		resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
+		if (resource_filter) {
+			retcode = ERR_RES_NOT_KNOWN;
+			resource = drbd_find_resource(nla_data(resource_filter));
+			if (!resource)
+				goto put_result;
+			cb->args[0] = (long)resource;
+		}
+	}
+
+	rcu_read_lock();
+	minor = cb->args[1];
+	idr_to_search = resource ? &resource->devices : &drbd_devices;
+	device = idr_get_next(idr_to_search, &minor);
+	if (!device) {
+		err = 0;
+		goto out;
+	}
+	idr_for_each_entry_continue(idr_to_search, device, minor) {
+		retcode = NO_ERROR;
+		goto put_result;  /* only one iteration */
+	}
+	err = 0;
+	goto out;  /* no more devices */
+
+put_result:
+	dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			cb->nlh->nlmsg_seq, &drbd_genl_family,
+			NLM_F_MULTI, DRBD_ADM_GET_DEVICES);
+	err = -ENOMEM;
+	if (!dh)
+		goto out;
+	dh->ret_code = retcode;
+	dh->minor = -1U;
+	if (retcode == NO_ERROR) {
+		dh->minor = device->minor;
+		err = nla_put_drbd_cfg_context(skb, device->resource, NULL, device);
+		if (err)
+			goto out;
+		if (get_ldev(device)) {
+			struct disk_conf *disk_conf =
+				rcu_dereference(device->ldev->disk_conf);
+
+			err = disk_conf_to_skb(skb, disk_conf, !capable(CAP_SYS_ADMIN));
+			put_ldev(device);
+			if (err)
+				goto out;
+		}
+		device_to_info(&device_info, device);
+		err = device_info_to_skb(skb, &device_info, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+
+		device_to_statistics(&device_statistics, device);
+		err = device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+		cb->args[1] = minor + 1;
+	}
+	genlmsg_end(skb, dh);
+	err = 0;
+
+out:
+	rcu_read_unlock();
+	if (err)
+		return err;
+	return skb->len;
+}
+
+int drbd_adm_dump_connections_done(struct netlink_callback *cb)
+{
+	return put_resource_in_arg0(cb, 6);
+}
+
+enum { SINGLE_RESOURCE, ITERATE_RESOURCES };
+
+int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *resource_filter;
+	struct drbd_resource *resource = NULL, *next_resource;
+	struct drbd_connection *uninitialized_var(connection);
+	int err = 0, retcode;
+	struct drbd_genlmsghdr *dh;
+	struct connection_info connection_info;
+	struct connection_statistics connection_statistics;
+
+	rcu_read_lock();
+	resource = (struct drbd_resource *)cb->args[0];
+	if (!cb->args[0]) {
+		resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
+		if (resource_filter) {
+			retcode = ERR_RES_NOT_KNOWN;
+			resource = drbd_find_resource(nla_data(resource_filter));
+			if (!resource)
+				goto put_result;
+			cb->args[0] = (long)resource;
+			cb->args[1] = SINGLE_RESOURCE;
+		}
+	}
+	if (!resource) {
+		if (list_empty(&drbd_resources))
+			goto out;
+		resource = list_first_entry(&drbd_resources, struct drbd_resource, resources);
+		kref_get(&resource->kref);
+		cb->args[0] = (long)resource;
+		cb->args[1] = ITERATE_RESOURCES;
+	}
+
+    next_resource:
+	rcu_read_unlock();
+	mutex_lock(&resource->conf_update);
+	rcu_read_lock();
+	if (cb->args[2]) {
+		for_each_connection_rcu(connection, resource)
+			if (connection == (struct drbd_connection *)cb->args[2])
+				goto found_connection;
+		/* connection was probably deleted */
+		goto no_more_connections;
+	}
+	connection = list_entry(&resource->connections, struct drbd_connection, connections);
+
+found_connection:
+	list_for_each_entry_continue_rcu(connection, &resource->connections, connections) {
+		if (!has_net_conf(connection))
+			continue;
+		retcode = NO_ERROR;
+		goto put_result;  /* only one iteration */
+	}
+
+no_more_connections:
+	if (cb->args[1] == ITERATE_RESOURCES) {
+		for_each_resource_rcu(next_resource, &drbd_resources) {
+			if (next_resource == resource)
+				goto found_resource;
+		}
+		/* resource was probably deleted */
+	}
+	goto out;
+
+found_resource:
+	list_for_each_entry_continue_rcu(next_resource, &drbd_resources, resources) {
+		mutex_unlock(&resource->conf_update);
+		kref_put(&resource->kref, drbd_destroy_resource);
+		resource = next_resource;
+		kref_get(&resource->kref);
+		cb->args[0] = (long)resource;
+		cb->args[2] = 0;
+		goto next_resource;
+	}
+	goto out;  /* no more resources */
+
+put_result:
+	dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			cb->nlh->nlmsg_seq, &drbd_genl_family,
+			NLM_F_MULTI, DRBD_ADM_GET_CONNECTIONS);
+	err = -ENOMEM;
+	if (!dh)
+		goto out;
+	dh->ret_code = retcode;
+	dh->minor = -1U;
+	if (retcode == NO_ERROR) {
+		struct net_conf *net_conf;
+
+		err = nla_put_drbd_cfg_context(skb, resource, connection, NULL);
+		if (err)
+			goto out;
+		net_conf = rcu_dereference(connection->net_conf);
+		if (net_conf) {
+			err = net_conf_to_skb(skb, net_conf, !capable(CAP_SYS_ADMIN));
+			if (err)
+				goto out;
+		}
+		connection_to_info(&connection_info, connection);
+		err = connection_info_to_skb(skb, &connection_info, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+		connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
+		err = connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+		cb->args[2] = (long)connection;
+	}
+	genlmsg_end(skb, dh);
+	err = 0;
+
+out:
+	rcu_read_unlock();
+	if (resource)
+		mutex_unlock(&resource->conf_update);
+	if (err)
+		return err;
+	return skb->len;
+}
+
+enum mdf_peer_flag {
+	MDF_PEER_CONNECTED =	1 << 0,
+	MDF_PEER_OUTDATED =	1 << 1,
+	MDF_PEER_FENCING =	1 << 2,
+	MDF_PEER_FULL_SYNC =	1 << 3,
+};
+
+static void peer_device_to_statistics(struct peer_device_statistics *s,
+				      struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+
+	memset(s, 0, sizeof(*s));
+	s->peer_dev_received = device->recv_cnt;
+	s->peer_dev_sent = device->send_cnt;
+	s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) +
+			      atomic_read(&device->rs_pending_cnt);
+	s->peer_dev_unacked = atomic_read(&device->unacked_cnt);
+	s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9);
+	s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9);
+	if (get_ldev(device)) {
+		struct drbd_md *md = &device->ldev->md;
+
+		spin_lock_irq(&md->uuid_lock);
+		s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP];
+		spin_unlock_irq(&md->uuid_lock);
+		s->peer_dev_flags =
+			(drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ?
+				MDF_PEER_CONNECTED : 0) +
+			(drbd_md_test_flag(device->ldev, MDF_CONSISTENT) &&
+			 !drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ?
+				MDF_PEER_OUTDATED : 0) +
+			/* FIXME: MDF_PEER_FENCING? */
+			(drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ?
+				MDF_PEER_FULL_SYNC : 0);
+		put_ldev(device);
+	}
+}
+
+int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb)
+{
+	return put_resource_in_arg0(cb, 9);
+}
+
+int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *resource_filter;
+	struct drbd_resource *resource;
+	struct drbd_device *uninitialized_var(device);
+	struct drbd_peer_device *peer_device = NULL;
+	int minor, err, retcode;
+	struct drbd_genlmsghdr *dh;
+	struct idr *idr_to_search;
+
+	resource = (struct drbd_resource *)cb->args[0];
+	if (!cb->args[0] && !cb->args[1]) {
+		resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
+		if (resource_filter) {
+			retcode = ERR_RES_NOT_KNOWN;
+			resource = drbd_find_resource(nla_data(resource_filter));
+			if (!resource)
+				goto put_result;
+		}
+		cb->args[0] = (long)resource;
+	}
+
+	rcu_read_lock();
+	minor = cb->args[1];
+	idr_to_search = resource ? &resource->devices : &drbd_devices;
+	device = idr_find(idr_to_search, minor);
+	if (!device) {
+next_device:
+		minor++;
+		cb->args[2] = 0;
+		device = idr_get_next(idr_to_search, &minor);
+		if (!device) {
+			err = 0;
+			goto out;
+		}
+	}
+	if (cb->args[2]) {
+		for_each_peer_device(peer_device, device)
+			if (peer_device == (struct drbd_peer_device *)cb->args[2])
+				goto found_peer_device;
+		/* peer device was probably deleted */
+		goto next_device;
+	}
+	/* Make peer_device point to the list head (not the first entry). */
+	peer_device = list_entry(&device->peer_devices, struct drbd_peer_device, peer_devices);
+
+found_peer_device:
+	list_for_each_entry_continue_rcu(peer_device, &device->peer_devices, peer_devices) {
+		if (!has_net_conf(peer_device->connection))
+			continue;
+		retcode = NO_ERROR;
+		goto put_result;  /* only one iteration */
+	}
+	goto next_device;
+
+put_result:
+	dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			cb->nlh->nlmsg_seq, &drbd_genl_family,
+			NLM_F_MULTI, DRBD_ADM_GET_PEER_DEVICES);
+	err = -ENOMEM;
+	if (!dh)
+		goto out;
+	dh->ret_code = retcode;
+	dh->minor = -1U;
+	if (retcode == NO_ERROR) {
+		struct peer_device_info peer_device_info;
+		struct peer_device_statistics peer_device_statistics;
+
+		dh->minor = minor;
+		err = nla_put_drbd_cfg_context(skb, device->resource, peer_device->connection, device);
+		if (err)
+			goto out;
+		peer_device_to_info(&peer_device_info, peer_device);
+		err = peer_device_info_to_skb(skb, &peer_device_info, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+		peer_device_to_statistics(&peer_device_statistics, peer_device);
+		err = peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+		cb->args[1] = minor;
+		cb->args[2] = (long)peer_device;
+	}
+	genlmsg_end(skb, dh);
+	err = 0;
+
+out:
+	rcu_read_unlock();
+	if (err)
+		return err;
+	return skb->len;
+}
 /*
  * Return the connection of @resource if @resource has exactly one connection.
  */
@@ -3818,85 +4305,6 @@ failed:
 			err, seq, sib->sib_reason);
 }
 
-static void device_to_statistics(struct device_statistics *s,
-				 struct drbd_device *device)
-{
-	memset(s, 0, sizeof(*s));
-	s->dev_upper_blocked = !may_inc_ap_bio(device);
-	if (get_ldev(device)) {
-		struct drbd_md *md = &device->ldev->md;
-		u64 *history_uuids = (u64 *)s->history_uuids;
-		struct request_queue *q;
-		int n;
-
-		spin_lock_irq(&md->uuid_lock);
-		s->dev_current_uuid = md->uuid[UI_CURRENT];
-		BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1);
-		for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++)
-			history_uuids[n] = md->uuid[UI_HISTORY_START + n];
-		for (; n < HISTORY_UUIDS; n++)
-			history_uuids[n] = 0;
-		s->history_uuids_len = HISTORY_UUIDS;
-		spin_unlock_irq(&md->uuid_lock);
-
-		s->dev_disk_flags = md->flags;
-		q = bdev_get_queue(device->ldev->backing_bdev);
-		s->dev_lower_blocked =
-			bdi_congested(&q->backing_dev_info,
-				      (1 << WB_async_congested) |
-				      (1 << WB_sync_congested));
-		put_ldev(device);
-	}
-	s->dev_size = drbd_get_capacity(device->this_bdev);
-	s->dev_read = device->read_cnt;
-	s->dev_write = device->writ_cnt;
-	s->dev_al_writes = device->al_writ_cnt;
-	s->dev_bm_writes = device->bm_writ_cnt;
-	s->dev_upper_pending = atomic_read(&device->ap_bio_cnt);
-	s->dev_lower_pending = atomic_read(&device->local_cnt);
-	s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags);
-	s->dev_exposed_data_uuid = device->ed_uuid;
-}
-
-enum mdf_peer_flag {
-	MDF_PEER_CONNECTED =	1 << 0,
-	MDF_PEER_OUTDATED =	1 << 1,
-	MDF_PEER_FENCING =	1 << 2,
-	MDF_PEER_FULL_SYNC =	1 << 3,
-};
-
-static void peer_device_to_statistics(struct peer_device_statistics *s,
-				      struct drbd_peer_device *peer_device)
-{
-	struct drbd_device *device = peer_device->device;
-
-	memset(s, 0, sizeof(*s));
-	s->peer_dev_received = device->recv_cnt;
-	s->peer_dev_sent = device->send_cnt;
-	s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) +
-			      atomic_read(&device->rs_pending_cnt);
-	s->peer_dev_unacked = atomic_read(&device->unacked_cnt);
-	s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9);
-	s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9);
-	if (get_ldev(device)) {
-		struct drbd_md *md = &device->ldev->md;
-
-		spin_lock_irq(&md->uuid_lock);
-		s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP];
-		spin_unlock_irq(&md->uuid_lock);
-		s->peer_dev_flags =
-			(drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ?
-				MDF_PEER_CONNECTED : 0) +
-			(drbd_md_test_flag(device->ldev, MDF_CONSISTENT) &&
-			 !drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ?
-				MDF_PEER_OUTDATED : 0) +
-			/* FIXME: MDF_PEER_FENCING? */
-			(drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ?
-				MDF_PEER_FULL_SYNC : 0);
-		put_ldev(device);
-	}
-}
-
 static int nla_put_notification_header(struct sk_buff *msg,
 				       enum drbd_notification_type type)
 {
-- 
cgit 


From d01efceeea99d91022e14aa4169857b1256bcba5 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Wed, 19 Nov 2014 13:33:32 +0100
Subject: drbd: Deletion of an unnecessary check before the function call
 "lc_destroy"

The lc_destroy() function tests whether its argument is NULL and then
returns immediately. Thus the test around the call is not needed.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Roland Kammerer <roland.kammerer@linbit.com>

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 1eb10e28ac19..b87fb313660e 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1133,8 +1133,7 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
 		lc_destroy(n);
 		return -EBUSY;
 	} else {
-		if (t)
-			lc_destroy(t);
+		lc_destroy(t);
 	}
 	drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */
 	return 0;
-- 
cgit 


From d38f8612298d463cb5dbffdbac6db6e38c5dd22f Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 10 Dec 2014 15:26:57 +0100
Subject: drbd: Replace 0 with the more meaningful GFP_NOWAIT

GFP_NOWAIT has a value of 0. I.e. functionality not changed.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index b87fb313660e..af78f0906cb2 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -4289,7 +4289,7 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
 	if (nla_put_status_info(msg, device, sib))
 		goto nla_put_failure;
 	genlmsg_end(msg, d_out);
-	err = drbd_genl_multicast_events(msg, 0);
+	err = drbd_genl_multicast_events(msg, GFP_NOWAIT);
 	/* msg has been consumed or freed in netlink_broadcast() */
 	if (err && err != -ESRCH)
 		goto failed;
@@ -4351,7 +4351,7 @@ void notify_resource_state(struct sk_buff *skb,
 		goto nla_put_failure;
 	genlmsg_end(skb, dh);
 	if (multicast) {
-		err = drbd_genl_multicast_events(skb, 0);
+		err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
 		/* skb has been consumed or freed in netlink_broadcast() */
 		if (err && err != -ESRCH)
 			goto failed;
@@ -4400,7 +4400,7 @@ void notify_device_state(struct sk_buff *skb,
 	device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
 	genlmsg_end(skb, dh);
 	if (multicast) {
-		err = drbd_genl_multicast_events(skb, 0);
+		err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
 		/* skb has been consumed or freed in netlink_broadcast() */
 		if (err && err != -ESRCH)
 			goto failed;
@@ -4449,7 +4449,7 @@ void notify_connection_state(struct sk_buff *skb,
 	connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
 	genlmsg_end(skb, dh);
 	if (multicast) {
-		err = drbd_genl_multicast_events(skb, 0);
+		err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
 		/* skb has been consumed or freed in netlink_broadcast() */
 		if (err && err != -ESRCH)
 			goto failed;
@@ -4499,7 +4499,7 @@ void notify_peer_device_state(struct sk_buff *skb,
 	peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
 	genlmsg_end(skb, dh);
 	if (multicast) {
-		err = drbd_genl_multicast_events(skb, 0);
+		err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
 		/* skb has been consumed or freed in netlink_broadcast() */
 		if (err && err != -ESRCH)
 			goto failed;
@@ -4545,7 +4545,7 @@ void notify_helper(enum drbd_notification_type type,
 	    drbd_helper_info_to_skb(skb, &helper_info, true))
 		goto unlock_fail;
 	genlmsg_end(skb, dh);
-	err = drbd_genl_multicast_events(skb, 0);
+	err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
 	skb = NULL;
 	/* skb has been consumed or freed in netlink_broadcast() */
 	if (err && err != -ESRCH)
-- 
cgit 


From 05cbbb395f193a1d15e0f749eff8abe5c9c49b62 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 16 Jan 2015 17:41:55 +0100
Subject: drbd: Fix spurious disk-timeout

(You should not use disk-timeout anyways,
 see the man page for why...)

We add incoming requests to the tail of some ring list.
On local completion, requests are removed from that list.
The timer looks only at the head of that ring list,
so is supposed to only see the oldest request.
All protected by a spinlock.

The request object is created with timestamps zeroed out.
The timestamp was only filled in just before the actual submit.
But to actually submit the request, we need to give up the spinlock.

If you are unlucky, there is no older still pending request, the timer
looks at a new request with timestamp still zero (before it even was
submitted), and 0 + timeout is most likely older than "now".

Better assign the timestamp right when we put the
request object on said ring list.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_req.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 55fca685fca5..7660f6e749ff 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1167,7 +1167,6 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 	 * stable storage, and this is a WRITE, we may not even submit
 	 * this bio. */
 	if (get_ldev(device)) {
-		req->pre_submit_jif = jiffies;
 		if (drbd_insert_fault(device,
 				      rw == WRITE ? DRBD_FAULT_DT_WR
 				    : rw == READ  ? DRBD_FAULT_DT_RD
@@ -1311,6 +1310,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
 			&device->pending_master_completion[rw == WRITE]);
 	if (req->private_bio) {
 		/* needs to be marked within the same spinlock */
+		req->pre_submit_jif = jiffies;
 		list_add_tail(&req->req_pending_local,
 			&device->pending_completion[rw == WRITE]);
 		_req_mod(req, TO_BE_SUBMITTED);
-- 
cgit 


From 05a72772fcaae4ac88052b6c93aa0d116ff0a748 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 26 Jan 2015 11:35:38 +0100
Subject: drbd: drbdsetup detach of an unresponsive local disk should not block
 IO "forever"

When detaching, we make sure no application IO is in-flight
by internally suspending IO, then trigger the state change,
wait for the result, and finally internally resume IO again.

Once we triggered the stat change to "Failed",
we expect it to change from Failed to Diskless.
(To avoid races, we actually wait for it to leave "Failed").

On an unresponsive local IO backend, this may not happen, ever.
Don't have a "hung" detach block IO "forever", but resume IO
before waiting for the state change to Diskless.

We may well be able to continue IO to and from a healthy peer.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index af78f0906cb2..331b378b7d0b 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1929,9 +1929,9 @@ static int adm_detach(struct drbd_device *device, int force)
 	retcode = drbd_request_state(device, NS(disk, D_FAILED));
 	drbd_md_put_buffer(device);
 	/* D_FAILED will transition to DISKLESS. */
+	drbd_resume_io(device);
 	ret = wait_event_interruptible(device->misc_wait,
 			device->state.disk != D_FAILED);
-	drbd_resume_io(device);
 	if ((int)retcode == (int)SS_IS_DISKLESS)
 		retcode = SS_NOTHING_TO_DO;
 	if (ret)
-- 
cgit 


From 9bd2eb2c98239c465a51571b5c2a465a2ff7d70b Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 19 Feb 2015 14:01:50 +0100
Subject: drbd: also bump UUIDs if a diskless primary connects

If for some reason the primary lost its disk *and* the replication link
before it is able to communicate the disk loss, probably blocked IO,
then later is able to re-establish the connection, the peer needs to
bump its UUIDs just like it does when peer only loses the disk
and is able to communicate this in time.

Otherwise, a later re-attach of the disk on the primary may start a
resync in the "wrong" direction.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index bc4b45bf9ace..06afd4df1b7b 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1781,7 +1781,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	}
 
 	if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) {
-		if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+		if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY &&
 		    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
 			drbd_uuid_new_current(device);
 			drbd_send_uuids(peer_device);
-- 
cgit 


From dc99562a48f39db4be0f9195137ec6f5350baf19 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 19 Feb 2015 13:43:55 +0100
Subject: drbd: add comment why we want to first call local-io-error, then send
 state

Even though we really want to get the state information about our bad
disk to the peer as soon as possible, it is useful to first call the
local-io-error handler.

People may chose to hard-reset the box from there.
If that looks and behaves exactly like a "regular node crash", without
bumping the data generation UUIDs on the peer in between, it makes it
easier to deal with.

If you intend to return from the local-io-error handler, then better
return as quickly as possible to avoid triggering other timeouts.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_state.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 06afd4df1b7b..a4e4505fee36 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1859,6 +1859,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 
 			was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags);
 
+			/* Intentionally call this handler first, before drbd_send_state().
+			 * See: 2932204 drbd: call local-io-error handler early
+			 * People may chose to hard-reset the box from this handler.
+			 * It is useful if this looks like a "regular node crash". */
 			if (was_io_error && eh == EP_CALL_HELPER)
 				drbd_khelper(device, "local-io-error");
 
-- 
cgit 


From 142207f782dd7f174a1e1b954c3a9dd04316bd95 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 19 Feb 2015 13:48:59 +0100
Subject: drbd: drbd_panic_after_delayed_completion_of_aborted_request()

The only way to make DRBD intentionally call panic is to
set a disk timeout, have that trigger, "abort" some request and complete
to upper layers, then have the backend IO subsystem later complete these
requests successfully regardless.

As the attached IO pages have been recycled for other purposes
meanwhile, this will cause unexpected random memory changes.
To prevent corruption, we rather panic in that case.

Make it obvious from stack traces that this was the case by introducing
drbd_panic_after_delayed_completion_of_aborted_request().

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_worker.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 3b3d980ac8c7..9c89ebe21c6b 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -188,6 +188,12 @@ void drbd_peer_request_endio(struct bio *bio)
 	}
 }
 
+void drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device)
+{
+	panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n",
+		device->minor, device->resource->name, device->vnr);
+}
+
 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
  */
 void drbd_request_endio(struct bio *bio)
@@ -231,7 +237,7 @@ void drbd_request_endio(struct bio *bio)
 			drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
 
 		if (!bio->bi_error)
-			panic("possible random memory corruption caused by delayed completion of aborted local request\n");
+			drbd_panic_after_delayed_completion_of_aborted_request(device);
 	}
 
 	/* to avoid recursion in __req_mod */
-- 
cgit 


From 84d34f2f0724f26de04f9863704a7ca797c0fd62 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 19 Feb 2015 13:54:11 +0100
Subject: drbd: improve network timeout detection

Don't blame the peer for being unresponsive,
if we did not even ask the question yet.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h    |   2 +
 drivers/block/drbd/drbd_req.c    | 123 ++++++++++++++++++++++++++++++---------
 drivers/block/drbd/drbd_worker.c |   2 +
 3 files changed, 100 insertions(+), 27 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 965aae0ba492..1d00f2e061c5 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -773,6 +773,8 @@ struct drbd_connection {
 	struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
 
 	struct {
+		unsigned long last_sent_barrier_jif;
+
 		/* whether this sender thread
 		 * has processed a single write yet. */
 		bool seen_any_write_yet;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 7660f6e749ff..3add7c5e97e0 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1531,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
+static bool net_timeout_reached(struct drbd_request *net_req,
+		struct drbd_connection *connection,
+		unsigned long now, unsigned long ent,
+		unsigned int ko_count, unsigned int timeout)
+{
+	struct drbd_device *device = net_req->device;
+
+	if (!time_after(now, net_req->pre_send_jif + ent))
+		return false;
+
+	if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent))
+		return false;
+
+	if (net_req->rq_state & RQ_NET_PENDING) {
+		drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
+			jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
+		return true;
+	}
+
+	/* We received an ACK already (or are using protocol A),
+	 * but are waiting for the epoch closing barrier ack.
+	 * Check if we sent the barrier already.  We should not blame the peer
+	 * for being unresponsive, if we did not even ask it yet. */
+	if (net_req->epoch == connection->send.current_epoch_nr) {
+		drbd_warn(device,
+			"We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n",
+			jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
+		return false;
+	}
+
+	/* Worst case: we may have been blocked for whatever reason, then
+	 * suddenly are able to send a lot of requests (and epoch separating
+	 * barriers) in quick succession.
+	 * The timestamp of the net_req may be much too old and not correspond
+	 * to the sending time of the relevant unack'ed barrier packet, so
+	 * would trigger a spurious timeout.  The latest barrier packet may
+	 * have a too recent timestamp to trigger the timeout, potentially miss
+	 * a timeout.  Right now we don't have a place to conveniently store
+	 * these timestamps.
+	 * But in this particular situation, the application requests are still
+	 * completed to upper layers, DRBD should still "feel" responsive.
+	 * No need yet to kill this connection, it may still recover.
+	 * If not, eventually we will have queued enough into the network for
+	 * us to block. From that point of view, the timestamp of the last sent
+	 * barrier packet is relevant enough.
+	 */
+	if (time_after(now, connection->send.last_sent_barrier_jif + ent)) {
+		drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
+			connection->send.last_sent_barrier_jif, now,
+			jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout);
+		return true;
+	}
+	return false;
+}
+
+/* A request is considered timed out, if
+ * - we have some effective timeout from the configuration,
+ *   with some state restrictions applied,
+ * - the oldest request is waiting for a response from the network
+ *   resp. the local disk,
+ * - the oldest request is in fact older than the effective timeout,
+ * - the connection was established (resp. disk was attached)
+ *   for longer than the timeout already.
+ * Note that for 32bit jiffies and very stable connections/disks,
+ * we may have a wrap around, which is catched by
+ *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
+ *
+ * Side effect: once per 32bit wrap-around interval, which means every
+ * ~198 days with 250 HZ, we have a window where the timeout would need
+ * to expire twice (worst case) to become effective. Good enough.
+ */
+
 void request_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
@@ -1540,11 +1612,14 @@ void request_timer_fn(unsigned long data)
 	unsigned long oldest_submit_jif;
 	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
 	unsigned long now;
+	unsigned int ko_count = 0, timeout = 0;
 
 	rcu_read_lock();
 	nc = rcu_dereference(connection->net_conf);
-	if (nc && device->state.conn >= C_WF_REPORT_PARAMS)
-		ent = nc->timeout * HZ/10 * nc->ko_count;
+	if (nc && device->state.conn >= C_WF_REPORT_PARAMS) {
+		ko_count = nc->ko_count;
+		timeout = nc->timeout;
+	}
 
 	if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
 		dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
@@ -1552,6 +1627,8 @@ void request_timer_fn(unsigned long data)
 	}
 	rcu_read_unlock();
 
+
+	ent = timeout * HZ/10 * ko_count;
 	et = min_not_zero(dt, ent);
 
 	if (!et)
@@ -1563,11 +1640,22 @@ void request_timer_fn(unsigned long data)
 	spin_lock_irq(&device->resource->req_lock);
 	req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
 	req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
-	req_peer = connection->req_not_net_done;
+
 	/* maybe the oldest request waiting for the peer is in fact still
-	 * blocking in tcp sendmsg */
-	if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
-		req_peer = connection->req_next;
+	 * blocking in tcp sendmsg.  That's ok, though, that's handled via the
+	 * socket send timeout, requesting a ping, and bumping ko-count in
+	 * we_should_drop_the_connection().
+	 */
+
+	/* check the oldest request we did successfully sent,
+	 * but which is still waiting for an ACK. */
+	req_peer = connection->req_ack_pending;
+
+	/* if we don't have such request (e.g. protocoll A)
+	 * check the oldest requests which is still waiting on its epoch
+	 * closing barrier ack. */
+	if (!req_peer)
+		req_peer = connection->req_not_net_done;
 
 	/* evaluate the oldest peer request only in one timer! */
 	if (req_peer && req_peer->device != device)
@@ -1584,28 +1672,9 @@ void request_timer_fn(unsigned long data)
 		: req_write ? req_write->pre_submit_jif
 		: req_read ? req_read->pre_submit_jif : now;
 
-	/* The request is considered timed out, if
-	 * - we have some effective timeout from the configuration,
-	 *   with above state restrictions applied,
-	 * - the oldest request is waiting for a response from the network
-	 *   resp. the local disk,
-	 * - the oldest request is in fact older than the effective timeout,
-	 * - the connection was established (resp. disk was attached)
-	 *   for longer than the timeout already.
-	 * Note that for 32bit jiffies and very stable connections/disks,
-	 * we may have a wrap around, which is catched by
-	 *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
-	 *
-	 * Side effect: once per 32bit wrap-around interval, which means every
-	 * ~198 days with 250 HZ, we have a window where the timeout would need
-	 * to expire twice (worst case) to become effective. Good enough.
-	 */
-	if (ent && req_peer &&
-		 time_after(now, req_peer->pre_send_jif + ent) &&
-		!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
-		drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
+	if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout))
 		_conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD);
-	}
+
 	if (dt && oldest_submit_jif != now &&
 		 time_after(now, oldest_submit_jif + dt) &&
 		!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 9c89ebe21c6b..8bbabe37ef0d 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1290,6 +1290,7 @@ static int drbd_send_barrier(struct drbd_connection *connection)
 	p->barrier = connection->send.current_epoch_nr;
 	p->pad = 0;
 	connection->send.current_epoch_writes = 0;
+	connection->send.last_sent_barrier_jif = jiffies;
 
 	return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
 }
@@ -1314,6 +1315,7 @@ static void re_init_if_first_write(struct drbd_connection *connection, unsigned
 		connection->send.seen_any_write_yet = true;
 		connection->send.current_epoch_nr = epoch;
 		connection->send.current_epoch_writes = 0;
+		connection->send.last_sent_barrier_jif = jiffies;
 	}
 }
 
-- 
cgit 


From 2b479766ee79a821f3cd7fad92fbcf2ff16cacb3 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 25 Feb 2015 13:53:28 +0100
Subject: drbd: fix NULL deref in remember_new_state

The recent (not yet released) backport of the extended state broadcasts
to support the "events2" subcommand of drbdsetup had some glitches.

remember_old_state() would first count all connections with a
net_conf != NULL, then allocate a suitable array, then populate that
array with all connections found to have net_conf != NULL.

This races with the state change to C_STANDALONE,
and the NULL assignment there.

remember_new_state() then iterates over said connection array,
assuming that it would be fully populated.

But rcu_lock() just makes sure the thing some pointer points to,
if any, won't go away. It does not make the pointer itself immutable.

In fact there is no need to "filter" connections based on whether or not
they have a currently valid configuration.  Just record them always, if
they don't have a config, that's fine, there will be no change then.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_state.c | 46 +++++++++++++----------------------------
 1 file changed, 14 insertions(+), 32 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index a4e4505fee36..f022e99f9855 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -63,11 +63,8 @@ static void count_objects(struct drbd_resource *resource,
 
 	idr_for_each_entry(&resource->devices, device, vnr)
 		(*n_devices)++;
-	for_each_connection(connection, resource) {
-		if (!has_net_conf(connection))
-			continue;
+	for_each_connection(connection, resource)
 		(*n_connections)++;
-	}
 }
 
 static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp)
@@ -108,23 +105,13 @@ struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp
 	struct drbd_peer_device_state_change *peer_device_state_change;
 	struct drbd_connection_state_change *connection_state_change;
 
-retry:
-	rcu_read_lock();
+	/* Caller holds req_lock spinlock.
+	 * No state, no device IDR, no connections lists can change. */
 	count_objects(resource, &n_devices, &n_connections);
-	rcu_read_unlock();
 	state_change = alloc_state_change(n_devices, n_connections, gfp);
 	if (!state_change)
 		return NULL;
 
-	rcu_read_lock();
-	count_objects(resource, &n_devices, &n_connections);
-	if (n_devices != state_change->n_devices ||
-	    n_connections != state_change->n_connections) {
-		kfree(state_change);
-		rcu_read_unlock();
-		goto retry;
-	}
-
 	kref_get(&resource->kref);
 	state_change->resource->resource = resource;
 	state_change->resource->role[OLD] =
@@ -133,6 +120,17 @@ retry:
 	state_change->resource->susp_nod[OLD] = resource->susp_nod;
 	state_change->resource->susp_fen[OLD] = resource->susp_fen;
 
+	connection_state_change = state_change->connections;
+	for_each_connection(connection, resource) {
+		kref_get(&connection->kref);
+		connection_state_change->connection = connection;
+		connection_state_change->cstate[OLD] =
+			connection->cstate;
+		connection_state_change->peer_role[OLD] =
+			conn_highest_peer(connection);
+		connection_state_change++;
+	}
+
 	device_state_change = state_change->devices;
 	peer_device_state_change = state_change->peer_devices;
 	idr_for_each_entry(&resource->devices, device, vnr) {
@@ -145,8 +143,6 @@ retry:
 		for_each_connection(connection, resource) {
 			struct drbd_peer_device *peer_device;
 
-			if (!has_net_conf(connection))
-				continue;
 			peer_device = conn_peer_device(connection, device->vnr);
 			peer_device_state_change->peer_device = peer_device;
 			peer_device_state_change->disk_state[OLD] =
@@ -165,20 +161,6 @@ retry:
 		device_state_change++;
 	}
 
-	connection_state_change = state_change->connections;
-	for_each_connection(connection, resource) {
-		if (!has_net_conf(connection))
-			continue;
-		kref_get(&connection->kref);
-		connection_state_change->connection = connection;
-		connection_state_change->cstate[OLD] =
-			connection->cstate;
-		connection_state_change->peer_role[OLD] =
-			conn_highest_peer(connection);
-		connection_state_change++;
-	}
-	rcu_read_unlock();
-
 	return state_change;
 }
 
-- 
cgit 


From 6434f404b43afa0cfe54fec009760510431ca103 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 25 Feb 2015 19:37:28 +0100
Subject: drbd: fix refcount error during detach of an already failed disk

A D_FAILED disk transitions as quickly as possible to
D_DISKLESS. But in the "unresponsive local disk" case,
there remains a time window where a administrative detach command could
find the disk already failed, but some internal meta data IO against the
unresponsive local disk still pending.

In that case, drbd_md_get_buffer() will return NULL.
Don't unconditionally call drbd_md_put_buffer(), or it will cause
refcount imbalance, and prevent any further re-attach on this volume
(until it is deleted and re-created).

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 331b378b7d0b..79dc3d4f5aee 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1915,6 +1915,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 static int adm_detach(struct drbd_device *device, int force)
 {
 	enum drbd_state_rv retcode;
+	void *buffer;
 	int ret;
 
 	if (force) {
@@ -1925,9 +1926,12 @@ static int adm_detach(struct drbd_device *device, int force)
 	}
 
 	drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
-	drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
-	retcode = drbd_request_state(device, NS(disk, D_FAILED));
-	drbd_md_put_buffer(device);
+	buffer = drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
+	if (buffer) {
+		retcode = drbd_request_state(device, NS(disk, D_FAILED));
+		drbd_md_put_buffer(device);
+	} else /* already <= D_FAILED */
+		retcode = SS_NOTHING_TO_DO;
 	/* D_FAILED will transition to DISKLESS. */
 	drbd_resume_io(device);
 	ret = wait_event_interruptible(device->misc_wait,
-- 
cgit 


From 1c03e52083c8fa6e70a0b921d25d1916f68320fc Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 16 Mar 2015 15:01:00 +0100
Subject: drbd: Rename asender to ack_receiver

This prepares the next patch where the sending on the meta (or
control) socket is moved to a dedicated workqueue.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h      |  6 +++---
 drivers/block/drbd/drbd_main.c     | 10 +++++-----
 drivers/block/drbd/drbd_receiver.c |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 1d00f2e061c5..dee629797d0f 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -754,7 +754,7 @@ struct drbd_connection {
 	unsigned long last_reconnect_jif;
 	struct drbd_thread receiver;
 	struct drbd_thread worker;
-	struct drbd_thread asender;
+	struct drbd_thread ack_receiver;
 
 	/* cached pointers,
 	 * so we can look up the oldest pending requests more quickly.
@@ -1557,7 +1557,7 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
 
 /* drbd_receiver.c */
 extern int drbd_receiver(struct drbd_thread *thi);
-extern int drbd_asender(struct drbd_thread *thi);
+extern int drbd_ack_receiver(struct drbd_thread *thi);
 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
 extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
 		bool throttle_if_app_is_waiting);
@@ -1971,7 +1971,7 @@ extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
 static inline void wake_asender(struct drbd_connection *connection)
 {
 	if (test_bit(SIGNAL_ASENDER, &connection->flags))
-		force_sig(DRBD_SIG, connection->asender.task);
+		force_sig(DRBD_SIG, connection->ack_receiver.task);
 }
 
 static inline void request_ping(struct drbd_connection *connection)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index f66294db3b08..445f2c8bfa1b 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1436,8 +1436,8 @@ static int we_should_drop_the_connection(struct drbd_connection *connection, str
 	/* long elapsed = (long)(jiffies - device->last_received); */
 
 	drop_it =   connection->meta.socket == sock
-		|| !connection->asender.task
-		|| get_t_state(&connection->asender) != RUNNING
+		|| !connection->ack_receiver.task
+		|| get_t_state(&connection->ack_receiver) != RUNNING
 		|| connection->cstate < C_WF_REPORT_PARAMS;
 
 	if (drop_it)
@@ -2564,7 +2564,7 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
 		cpumask_copy(resource->cpu_mask, new_cpu_mask);
 		for_each_connection_rcu(connection, resource) {
 			connection->receiver.reset_cpu_mask = 1;
-			connection->asender.reset_cpu_mask = 1;
+			connection->ack_receiver.reset_cpu_mask = 1;
 			connection->worker.reset_cpu_mask = 1;
 		}
 	}
@@ -2653,8 +2653,8 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
 	connection->receiver.connection = connection;
 	drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
 	connection->worker.connection = connection;
-	drbd_thread_init(resource, &connection->asender, drbd_asender, "asender");
-	connection->asender.connection = connection;
+	drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv");
+	connection->ack_receiver.connection = connection;
 
 	kref_init(&connection->kref);
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 61b73c77a690..eed4ae9107b4 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1099,7 +1099,7 @@ randomize:
 		return 0;
 	}
 
-	drbd_thread_start(&connection->asender);
+	drbd_thread_start(&connection->ack_receiver);
 
 	mutex_lock(&connection->resource->conf_update);
 	/* The discard_my_data flag is a single-shot modifier to the next
@@ -4656,7 +4656,7 @@ static void conn_disconnect(struct drbd_connection *connection)
 	conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
 
 	/* asender does not clean up anything. it must not interfere, either */
-	drbd_thread_stop(&connection->asender);
+	drbd_thread_stop(&connection->ack_receiver);
 	drbd_free_sock(connection);
 
 	rcu_read_lock();
@@ -5487,7 +5487,7 @@ static struct asender_cmd asender_tbl[] = {
 	[P_RETRY_WRITE]	    = { sizeof(struct p_block_ack), got_BlockAck },
 };
 
-int drbd_asender(struct drbd_thread *thi)
+int drbd_ack_receiver(struct drbd_thread *thi)
 {
 	struct drbd_connection *connection = thi->connection;
 	struct asender_cmd *cmd = NULL;
-- 
cgit 


From 668700b40a7c8727bbd2b3fd4fd22e0ce3f1aeb6 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 16 Mar 2015 16:08:29 +0100
Subject: drbd: Create a dedicated workqueue for sending acks on the control
 connection

The intention is to reduce CPU utilization. Recent measurements
unveiled that the current performance bottleneck is CPU utilization
on the receiving node. The asender thread became CPU limited.

One of the main points is to eliminate the idr_for_each_entry() loop
from the sending acks code path.

One exception in that is sending back ping_acks. These stay
in the ack-receiver thread. Otherwise the logic becomes too
complicated for no added value.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h      |  27 ++---
 drivers/block/drbd/drbd_main.c     |  10 +-
 drivers/block/drbd/drbd_nl.c       |   4 +-
 drivers/block/drbd/drbd_protocol.h |   2 +-
 drivers/block/drbd/drbd_receiver.c | 203 +++++++++++++++++++++----------------
 drivers/block/drbd/drbd_req.c      |   2 +-
 drivers/block/drbd/drbd_worker.c   |   8 +-
 7 files changed, 141 insertions(+), 115 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index dee629797d0f..3efaf181438c 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -77,13 +77,6 @@ extern int fault_devs;
 extern char usermode_helper[];
 
 
-/* I don't remember why XCPU ...
- * This is used to wake the asender,
- * and to interrupt sending the sending task
- * on disconnect.
- */
-#define DRBD_SIG SIGXCPU
-
 /* This is used to stop/restart our threads.
  * Cannot use SIGTERM nor SIGKILL, since these
  * are sent out by init on runlevel changes
@@ -647,8 +640,7 @@ extern struct fifo_buffer *fifo_alloc(int fifo_size);
 enum {
 	NET_CONGESTED,		/* The data socket is congested */
 	RESOLVE_CONFLICTS,	/* Set on one node, cleared on the peer! */
-	SEND_PING,		/* whether asender should send a ping asap */
-	SIGNAL_ASENDER,		/* whether asender wants to be interrupted */
+	SEND_PING,
 	GOT_PING_ACK,		/* set when we receive a ping_ack packet, ping_wait gets woken */
 	CONN_WD_ST_CHG_REQ,	/* A cluster wide state change on the connection is active */
 	CONN_WD_ST_CHG_OKAY,
@@ -755,6 +747,7 @@ struct drbd_connection {
 	struct drbd_thread receiver;
 	struct drbd_thread worker;
 	struct drbd_thread ack_receiver;
+	struct workqueue_struct *ack_sender;
 
 	/* cached pointers,
 	 * so we can look up the oldest pending requests more quickly.
@@ -823,6 +816,7 @@ struct drbd_peer_device {
 	struct list_head peer_devices;
 	struct drbd_device *device;
 	struct drbd_connection *connection;
+	struct work_struct send_acks_work;
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debugfs_peer_dev;
 #endif
@@ -1558,6 +1552,8 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
 /* drbd_receiver.c */
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_ack_receiver(struct drbd_thread *thi);
+extern void drbd_send_ping_wf(struct work_struct *ws);
+extern void drbd_send_acks_wf(struct work_struct *ws);
 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
 extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
 		bool throttle_if_app_is_waiting);
@@ -1968,16 +1964,21 @@ drbd_device_post_work(struct drbd_device *device, int work_bit)
 
 extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
 
-static inline void wake_asender(struct drbd_connection *connection)
+/* To get the ack_receiver out of the blocking network stack,
+ * so it can change its sk_rcvtimeo from idle- to ping-timeout,
+ * and send a ping, we need to send a signal.
+ * Which signal we send is irrelevant. */
+static inline void wake_ack_receiver(struct drbd_connection *connection)
 {
-	if (test_bit(SIGNAL_ASENDER, &connection->flags))
-		force_sig(DRBD_SIG, connection->ack_receiver.task);
+	struct task_struct *task = connection->ack_receiver.task;
+	if (task && get_t_state(&connection->ack_receiver) == RUNNING)
+		force_sig(SIGXCPU, task);
 }
 
 static inline void request_ping(struct drbd_connection *connection)
 {
 	set_bit(SEND_PING, &connection->flags);
-	wake_asender(connection);
+	wake_ack_receiver(connection);
 }
 
 extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 445f2c8bfa1b..938bca2df027 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1794,15 +1794,6 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock,
 		drbd_update_congested(connection);
 	}
 	do {
-		/* STRANGE
-		 * tcp_sendmsg does _not_ use its size parameter at all ?
-		 *
-		 * -EAGAIN on timeout, -EINTR on signal.
-		 */
-/* THINK
- * do we need to block DRBD_SIG if sock == &meta.socket ??
- * otherwise wake_asender() might interrupt some send_*Ack !
- */
 		rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
 		if (rv == -EAGAIN) {
 			if (we_should_drop_the_connection(connection, sock))
@@ -2821,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 			goto out_idr_remove_from_resource;
 		}
 		kref_get(&connection->kref);
+		INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf);
 	}
 
 	if (init_submitter(device)) {
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 79dc3d4f5aee..f35cefb20e25 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1258,8 +1258,8 @@ static void conn_reconfig_done(struct drbd_connection *connection)
 		connection->cstate == C_STANDALONE;
 	spin_unlock_irq(&connection->resource->req_lock);
 	if (stop_threads) {
-		/* asender is implicitly stopped by receiver
-		 * in conn_disconnect() */
+		/* ack_receiver thread and ack_sender workqueue are implicitly
+		 * stopped by receiver in conn_disconnect() */
 		drbd_thread_stop(&connection->receiver);
 		drbd_thread_stop(&connection->worker);
 	}
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index 2da9104a3851..ef9245363dcc 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -23,7 +23,7 @@ enum drbd_packet {
 	P_AUTH_RESPONSE	      = 0x11,
 	P_STATE_CHG_REQ	      = 0x12,
 
-	/* asender (meta socket */
+	/* (meta socket) */
 	P_PING		      = 0x13,
 	P_PING_ACK	      = 0x14,
 	P_RECV_ACK	      = 0x15, /* Used in protocol B */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index eed4ae9107b4..ea54341df3bf 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -215,7 +215,7 @@ static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
 	}
 }
 
-static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
+static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
 {
 	LIST_HEAD(reclaimed);
 	struct drbd_peer_request *peer_req, *t;
@@ -223,11 +223,30 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
 	spin_lock_irq(&device->resource->req_lock);
 	reclaim_finished_net_peer_reqs(device, &reclaimed);
 	spin_unlock_irq(&device->resource->req_lock);
-
 	list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
 		drbd_free_net_peer_req(device, peer_req);
 }
 
+static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		if (!atomic_read(&device->pp_in_use_by_net))
+			continue;
+
+		kref_get(&device->kref);
+		rcu_read_unlock();
+		drbd_reclaim_net_peer_reqs(device);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+}
+
 /**
  * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
  * @device:	DRBD device.
@@ -265,10 +284,15 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
 	if (atomic_read(&device->pp_in_use) < mxb)
 		page = __drbd_alloc_pages(device, number);
 
+	/* Try to keep the fast path fast, but occasionally we need
+	 * to reclaim the pages we lended to the network stack. */
+	if (page && atomic_read(&device->pp_in_use_by_net) > 512)
+		drbd_reclaim_net_peer_reqs(device);
+
 	while (page == NULL) {
 		prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
 
-		drbd_kick_lo_and_reclaim_net(device);
+		drbd_reclaim_net_peer_reqs(device);
 
 		if (atomic_read(&device->pp_in_use) < mxb) {
 			page = __drbd_alloc_pages(device, number);
@@ -1100,6 +1124,11 @@ randomize:
 	}
 
 	drbd_thread_start(&connection->ack_receiver);
+	connection->ack_sender = create_singlethread_workqueue("drbd_ack_sender");
+	if (!connection->ack_sender) {
+		drbd_err(connection, "Failed to create workqueue ack_sender\n");
+		return 0;
+	}
 
 	mutex_lock(&connection->resource->conf_update);
 	/* The discard_my_data flag is a single-shot modifier to the next
@@ -1746,7 +1775,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req
 }
 
 /*
- * e_end_resync_block() is called in asender context via
+ * e_end_resync_block() is called in ack_sender context via
  * drbd_finish_peer_reqs().
  */
 static int e_end_resync_block(struct drbd_work *w, int unused)
@@ -1920,7 +1949,7 @@ static void restart_conflicting_writes(struct drbd_device *device,
 }
 
 /*
- * e_end_block() is called in asender context via drbd_finish_peer_reqs().
+ * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
  */
 static int e_end_block(struct drbd_work *w, int cancel)
 {
@@ -2211,7 +2240,7 @@ static int handle_write_conflicts(struct drbd_device *device,
 			peer_req->w.cb = superseded ? e_send_superseded :
 						   e_send_retry_write;
 			list_add_tail(&peer_req->w.list, &device->done_ee);
-			wake_asender(connection);
+			queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
 
 			err = -ENOENT;
 			goto out;
@@ -4050,7 +4079,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
 	os = ns = drbd_read_state(device);
 	spin_unlock_irq(&device->resource->req_lock);
 
-	/* If some other part of the code (asender thread, timeout)
+	/* If some other part of the code (ack_receiver thread, timeout)
 	 * already decided to close the connection again,
 	 * we must not "re-establish" it here. */
 	if (os.conn <= C_TEAR_DOWN)
@@ -4655,8 +4684,12 @@ static void conn_disconnect(struct drbd_connection *connection)
 	 */
 	conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
 
-	/* asender does not clean up anything. it must not interfere, either */
+	/* ack_receiver does not clean up anything. it must not interfere, either */
 	drbd_thread_stop(&connection->ack_receiver);
+	if (connection->ack_sender) {
+		destroy_workqueue(connection->ack_sender);
+		connection->ack_sender = NULL;
+	}
 	drbd_free_sock(connection);
 
 	rcu_read_lock();
@@ -5425,49 +5458,39 @@ static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
 	return 0;
 }
 
-static int connection_finish_peer_reqs(struct drbd_connection *connection)
+struct meta_sock_cmd {
+	size_t pkt_size;
+	int (*fn)(struct drbd_connection *connection, struct packet_info *);
+};
+
+static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
 {
-	struct drbd_peer_device *peer_device;
-	int vnr, not_empty = 0;
+	long t;
+	struct net_conf *nc;
 
-	do {
-		clear_bit(SIGNAL_ASENDER, &connection->flags);
-		flush_signals(current);
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	t = ping_timeout ? nc->ping_timeo : nc->ping_int;
+	rcu_read_unlock();
 
-		rcu_read_lock();
-		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-			struct drbd_device *device = peer_device->device;
-			kref_get(&device->kref);
-			rcu_read_unlock();
-			if (drbd_finish_peer_reqs(device)) {
-				kref_put(&device->kref, drbd_destroy_device);
-				return 1;
-			}
-			kref_put(&device->kref, drbd_destroy_device);
-			rcu_read_lock();
-		}
-		set_bit(SIGNAL_ASENDER, &connection->flags);
+	t *= HZ;
+	if (ping_timeout)
+		t /= 10;
 
-		spin_lock_irq(&connection->resource->req_lock);
-		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-			struct drbd_device *device = peer_device->device;
-			not_empty = !list_empty(&device->done_ee);
-			if (not_empty)
-				break;
-		}
-		spin_unlock_irq(&connection->resource->req_lock);
-		rcu_read_unlock();
-	} while (not_empty);
+	connection->meta.socket->sk->sk_rcvtimeo = t;
+}
 
-	return 0;
+static void set_ping_timeout(struct drbd_connection *connection)
+{
+	set_rcvtimeo(connection, 1);
 }
 
-struct asender_cmd {
-	size_t pkt_size;
-	int (*fn)(struct drbd_connection *connection, struct packet_info *);
-};
+static void set_idle_timeout(struct drbd_connection *connection)
+{
+	set_rcvtimeo(connection, 0);
+}
 
-static struct asender_cmd asender_tbl[] = {
+static struct meta_sock_cmd ack_receiver_tbl[] = {
 	[P_PING]	    = { 0, got_Ping },
 	[P_PING_ACK]	    = { 0, got_PingAck },
 	[P_RECV_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
@@ -5490,61 +5513,37 @@ static struct asender_cmd asender_tbl[] = {
 int drbd_ack_receiver(struct drbd_thread *thi)
 {
 	struct drbd_connection *connection = thi->connection;
-	struct asender_cmd *cmd = NULL;
+	struct meta_sock_cmd *cmd = NULL;
 	struct packet_info pi;
+	unsigned long pre_recv_jif;
 	int rv;
 	void *buf    = connection->meta.rbuf;
 	int received = 0;
 	unsigned int header_size = drbd_header_size(connection);
 	int expect   = header_size;
 	bool ping_timeout_active = false;
-	struct net_conf *nc;
-	int ping_timeo, tcp_cork, ping_int;
 	struct sched_param param = { .sched_priority = 2 };
 
 	rv = sched_setscheduler(current, SCHED_RR, &param);
 	if (rv < 0)
-		drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv);
+		drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv);
 
 	while (get_t_state(thi) == RUNNING) {
 		drbd_thread_current_set_cpu(thi);
 
-		rcu_read_lock();
-		nc = rcu_dereference(connection->net_conf);
-		ping_timeo = nc->ping_timeo;
-		tcp_cork = nc->tcp_cork;
-		ping_int = nc->ping_int;
-		rcu_read_unlock();
+		conn_reclaim_net_peer_reqs(connection);
 
 		if (test_and_clear_bit(SEND_PING, &connection->flags)) {
 			if (drbd_send_ping(connection)) {
 				drbd_err(connection, "drbd_send_ping has failed\n");
 				goto reconnect;
 			}
-			connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
+			set_ping_timeout(connection);
 			ping_timeout_active = true;
 		}
 
-		/* TODO: conditionally cork; it may hurt latency if we cork without
-		   much to send */
-		if (tcp_cork)
-			drbd_tcp_cork(connection->meta.socket);
-		if (connection_finish_peer_reqs(connection)) {
-			drbd_err(connection, "connection_finish_peer_reqs() failed\n");
-			goto reconnect;
-		}
-		/* but unconditionally uncork unless disabled */
-		if (tcp_cork)
-			drbd_tcp_uncork(connection->meta.socket);
-
-		/* short circuit, recv_msg would return EINTR anyways. */
-		if (signal_pending(current))
-			continue;
-
+		pre_recv_jif = jiffies;
 		rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
-		clear_bit(SIGNAL_ASENDER, &connection->flags);
-
-		flush_signals(current);
 
 		/* Note:
 		 * -EINTR	 (on meta) we got a signal
@@ -5556,7 +5555,6 @@ int drbd_ack_receiver(struct drbd_thread *thi)
 		 * rv <  expected: "woken" by signal during receive
 		 * rv == 0	 : "connection shut down by peer"
 		 */
-received_more:
 		if (likely(rv > 0)) {
 			received += rv;
 			buf	 += rv;
@@ -5578,8 +5576,7 @@ received_more:
 		} else if (rv == -EAGAIN) {
 			/* If the data socket received something meanwhile,
 			 * that is good enough: peer is still alive. */
-			if (time_after(connection->last_received,
-				jiffies - connection->meta.socket->sk->sk_rcvtimeo))
+			if (time_after(connection->last_received, pre_recv_jif))
 				continue;
 			if (ping_timeout_active) {
 				drbd_err(connection, "PingAck did not arrive in time.\n");
@@ -5588,6 +5585,10 @@ received_more:
 			set_bit(SEND_PING, &connection->flags);
 			continue;
 		} else if (rv == -EINTR) {
+			/* maybe drbd_thread_stop(): the while condition will notice.
+			 * maybe woken for send_ping: we'll send a ping above,
+			 * and change the rcvtimeo */
+			flush_signals(current);
 			continue;
 		} else {
 			drbd_err(connection, "sock_recvmsg returned %d\n", rv);
@@ -5597,8 +5598,8 @@ received_more:
 		if (received == expect && cmd == NULL) {
 			if (decode_header(connection, connection->meta.rbuf, &pi))
 				goto reconnect;
-			cmd = &asender_tbl[pi.cmd];
-			if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
+			cmd = &ack_receiver_tbl[pi.cmd];
+			if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
 				drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
 					 cmdname(pi.cmd), pi.cmd);
 				goto disconnect;
@@ -5621,9 +5622,8 @@ received_more:
 
 			connection->last_received = jiffies;
 
-			if (cmd == &asender_tbl[P_PING_ACK]) {
-				/* restore idle timeout */
-				connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
+			if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
+				set_idle_timeout(connection);
 				ping_timeout_active = false;
 			}
 
@@ -5632,11 +5632,6 @@ received_more:
 			expect	 = header_size;
 			cmd	 = NULL;
 		}
-		if (test_bit(SEND_PING, &connection->flags))
-			continue;
-		rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT);
-		if (rv > 0)
-			goto received_more;
 	}
 
 	if (0) {
@@ -5648,9 +5643,41 @@ reconnect:
 disconnect:
 		conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 	}
-	clear_bit(SIGNAL_ASENDER, &connection->flags);
 
-	drbd_info(connection, "asender terminated\n");
+	drbd_info(connection, "ack_receiver terminated\n");
 
 	return 0;
 }
+
+void drbd_send_acks_wf(struct work_struct *ws)
+{
+	struct drbd_peer_device *peer_device =
+		container_of(ws, struct drbd_peer_device, send_acks_work);
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+	struct net_conf *nc;
+	int tcp_cork, err;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	tcp_cork = nc->tcp_cork;
+	rcu_read_unlock();
+
+	if (tcp_cork)
+		drbd_tcp_cork(connection->meta.socket);
+
+	err = drbd_finish_peer_reqs(device);
+	kref_put(&device->kref, drbd_destroy_device);
+	/* get is in drbd_endio_write_sec_final(). That is necessary to keep the
+	   struct work_struct send_acks_work alive, which is in the peer_device object */
+
+	if (err) {
+		conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+		return;
+	}
+
+	if (tcp_cork)
+		drbd_tcp_uncork(connection->meta.socket);
+
+	return;
+}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3add7c5e97e0..7907fb562388 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -453,7 +453,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		kref_get(&req->kref); /* wait for the DONE */
 
 	if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
-		/* potentially already completed in the asender thread */
+		/* potentially already completed in the ack_receiver thread */
 		if (!(s & RQ_NET_DONE)) {
 			atomic_add(req->i.size >> 9, &device->ap_in_flight);
 			set_if_null_req_not_net_done(peer_device, req);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 8bbabe37ef0d..2f29bf3e4dba 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -113,6 +113,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
 	unsigned long flags = 0;
 	struct drbd_peer_device *peer_device = peer_req->peer_device;
 	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection = peer_device->connection;
 	struct drbd_interval i;
 	int do_wake;
 	u64 block_id;
@@ -145,6 +146,12 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
 	 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
 	if (peer_req->flags & EE_WAS_ERROR)
 		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+
+	if (connection->cstate >= C_WF_REPORT_PARAMS) {
+		kref_get(&device->kref); /* put is in drbd_send_acks_wf() */
+		if (!queue_work(connection->ack_sender, &peer_device->send_acks_work))
+			kref_put(&device->kref, drbd_destroy_device);
+	}
 	spin_unlock_irqrestore(&device->resource->req_lock, flags);
 
 	if (block_id == ID_SYNCER)
@@ -156,7 +163,6 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
 	if (do_al_complete_io)
 		drbd_al_complete_io(device, &i);
 
-	wake_asender(peer_device->connection);
 	put_ldev(device);
 }
 
-- 
cgit 


From 9fa48269197f7fcb8ccfbef03c3f3a8616872579 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 18 Mar 2015 17:13:26 +0100
Subject: drbd: prevent NULL pointer deref when resuming diskless primary

In a multiple error scenario, we may end up with a "frozen" Primary,
that has no access to any data (no local disk, no replication link).

If we then resume-io, we try to generate a new data generation id,
which will fail if there is no longer a local disk.

Double check for available local data,
which prevents the NULL pointer deref.

If we are diskless, turn the resume-io in this situation
into the first stage of a "force down", by bumping the "effective" data
gen id, which will prevent later attach or connect to the former data
set without first being demoted (deconfigured).

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index f35cefb20e25..5e4adff91d1d 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -2920,7 +2920,30 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
 	mutex_lock(&adm_ctx.resource->adm_mutex);
 	device = adm_ctx.device;
 	if (test_bit(NEW_CUR_UUID, &device->flags)) {
-		drbd_uuid_new_current(device);
+		if (get_ldev_if_state(device, D_ATTACHING)) {
+			drbd_uuid_new_current(device);
+			put_ldev(device);
+		} else {
+			/* This is effectively a multi-stage "forced down".
+			 * The NEW_CUR_UUID bit is supposedly only set, if we
+			 * lost the replication connection, and are configured
+			 * to freeze IO and wait for some fence-peer handler.
+			 * So we still don't have a replication connection.
+			 * And now we don't have a local disk either.  After
+			 * resume, we will fail all pending and new IO, because
+			 * we don't have any data anymore.  Which means we will
+			 * eventually be able to terminate all users of this
+			 * device, and then take it down.  By bumping the
+			 * "effective" data uuid, we make sure that you really
+			 * need to tear down before you reconfigure, we will
+			 * the refuse to re-connect or re-attach (because no
+			 * matching real data uuid exists).
+			 */
+			u64 val;
+			get_random_bytes(&val, sizeof(u64));
+			drbd_set_ed_uuid(device, val);
+			drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n");
+		}
 		clear_bit(NEW_CUR_UUID, &device->flags);
 	}
 	drbd_suspend_io(device);
-- 
cgit 


From f5ec0173b92ba88c7b38e32a7ee3dd3b1568ea50 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 18 Mar 2015 17:19:10 +0100
Subject: drbd: debugfs: expose ed_data_gen_id

The effective data generation ID may be interesting for debugging
purposes of scenarios involving diskless states.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_debugfs.c | 10 ++++++++++
 drivers/block/drbd/drbd_int.h     |  1 +
 2 files changed, 11 insertions(+)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 6b88a35fb048..96a0107a72ea 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -771,6 +771,13 @@ static int device_data_gen_id_show(struct seq_file *m, void *ignored)
 	return 0;
 }
 
+static int device_ed_gen_id_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+	seq_printf(m, "0x%016llX\n", (unsigned long long)device->ed_uuid);
+	return 0;
+}
+
 #define drbd_debugfs_device_attr(name)						\
 static int device_ ## name ## _open(struct inode *inode, struct file *file)	\
 {										\
@@ -796,6 +803,7 @@ drbd_debugfs_device_attr(oldest_requests)
 drbd_debugfs_device_attr(act_log_extents)
 drbd_debugfs_device_attr(resync_extents)
 drbd_debugfs_device_attr(data_gen_id)
+drbd_debugfs_device_attr(ed_gen_id)
 
 void drbd_debugfs_device_add(struct drbd_device *device)
 {
@@ -839,6 +847,7 @@ void drbd_debugfs_device_add(struct drbd_device *device)
 	DCF(act_log_extents);
 	DCF(resync_extents);
 	DCF(data_gen_id);
+	DCF(ed_gen_id);
 #undef DCF
 	return;
 
@@ -854,6 +863,7 @@ void drbd_debugfs_device_cleanup(struct drbd_device *device)
 	drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
 	drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
 	drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
+	drbd_debugfs_remove(&device->debugfs_vol_ed_gen_id);
 	drbd_debugfs_remove(&device->debugfs_vol);
 }
 
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 3efaf181438c..08f266eab3ee 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -835,6 +835,7 @@ struct drbd_device {
 	struct dentry *debugfs_vol_act_log_extents;
 	struct dentry *debugfs_vol_resync_extents;
 	struct dentry *debugfs_vol_data_gen_id;
+	struct dentry *debugfs_vol_ed_gen_id;
 #endif
 
 	unsigned int vnr;	/* volume number within the connection */
-- 
cgit 


From 39e91a60c823603d6377cc8fa0b0bf301d1966eb Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 24 Mar 2015 10:40:26 +0100
Subject: drbd: use resource name in workqueue

Since kernel 3.3, we can use snprintf-style arguments
to create a workqueue.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_main.c     | 4 ++--
 drivers/block/drbd/drbd_receiver.c | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 938bca2df027..3a9a0f112004 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2694,8 +2694,8 @@ static int init_submitter(struct drbd_device *device)
 {
 	/* opencoded create_singlethread_workqueue(),
 	 * to be able to say "drbd%d", ..., minor */
-	device->submit.wq = alloc_workqueue("drbd%u_submit",
-			WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor);
+	device->submit.wq =
+		alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor);
 	if (!device->submit.wq)
 		return -ENOMEM;
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index ea54341df3bf..1957fe8601dc 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1124,7 +1124,10 @@ randomize:
 	}
 
 	drbd_thread_start(&connection->ack_receiver);
-	connection->ack_sender = create_singlethread_workqueue("drbd_ack_sender");
+	/* opencoded create_singlethread_workqueue(),
+	 * to be able to use format string arguments */
+	connection->ack_sender =
+		alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
 	if (!connection->ack_sender) {
 		drbd_err(connection, "Failed to create workqueue ack_sender\n");
 		return 0;
-- 
cgit 


From 2630628b2dbc3fc320aafaf84836119e4e3d62f1 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 20 Mar 2015 15:47:22 +0100
Subject: drbd: avoid redefinition of BITS_PER_PAGE

Apparently we now implicitly get definitions for BITS_PER_PAGE and
BITS_PER_PAGE_MASK from the pid_namespace.h

Instead of renaming our defines, I chose to define only if not yet
defined, but to double check the value if already defined.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_bitmap.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 9462d2752850..8bdc34dbaedf 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -479,8 +479,14 @@ void drbd_bm_cleanup(struct drbd_device *device)
  * this masks out the remaining bits.
  * Returns the number of bits cleared.
  */
+#ifndef BITS_PER_PAGE
 #define BITS_PER_PAGE		(1UL << (PAGE_SHIFT + 3))
 #define BITS_PER_PAGE_MASK	(BITS_PER_PAGE - 1)
+#else
+# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3))
+#  error "ambiguous BITS_PER_PAGE"
+# endif
+#endif
 #define BITS_PER_LONG_MASK	(BITS_PER_LONG - 1)
 static int bm_clear_surplus(struct drbd_bitmap *b)
 {
-- 
cgit 


From 5fb3bc4ddcdda8d2a6b2185075d140b9009f99b5 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 20 Mar 2015 16:15:24 +0100
Subject: drbd: use bitmap_weight() helper, don't open code

Suggested by Akinobu Mita <akinobu.mita@gmail.com>

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_bitmap.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 8bdc34dbaedf..0dabc9b93725 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -24,7 +24,7 @@
 
 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
 
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
 #include <linux/vmalloc.h>
 #include <linux/string.h>
 #include <linux/drbd.h>
@@ -565,21 +565,19 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
 	unsigned long *p_addr;
 	unsigned long bits = 0;
 	unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
-	int idx, i, last_word;
+	int idx, last_word;
 
 	/* all but last page */
 	for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
 		p_addr = __bm_map_pidx(b, idx);
-		for (i = 0; i < LWPP; i++)
-			bits += hweight_long(p_addr[i]);
+		bits += bitmap_weight(p_addr, BITS_PER_PAGE);
 		__bm_unmap(p_addr);
 		cond_resched();
 	}
 	/* last (or only) page */
 	last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
 	p_addr = __bm_map_pidx(b, idx);
-	for (i = 0; i < last_word; i++)
-		bits += hweight_long(p_addr[i]);
+	bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG);
 	p_addr[last_word] &= cpu_to_lel(mask);
 	bits += hweight_long(p_addr[last_word]);
 	/* 32bit arch, may have an unused padding long */
@@ -1425,6 +1423,9 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
 	int bits;
 	int changed = 0;
 	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
+
+	/* I think it is more cache line friendly to hweight_long then set to ~0UL,
+	 * than to first bitmap_weight() all words, then bitmap_fill() all words */
 	for (i = first_word; i < last_word; i++) {
 		bits = hweight_long(paddr[i]);
 		paddr[i] = ~0UL;
@@ -1634,8 +1635,7 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
 		int n = e-s;
 		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
 		bm = p_addr + MLPP(s);
-		while (n--)
-			count += hweight_long(*bm++);
+		count += bitmap_weight(bm, n * BITS_PER_LONG);
 		bm_unmap(p_addr);
 	} else {
 		drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
-- 
cgit 


From edb5e5f63d80c368467e4a0628c5b6d30f1673eb Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 24 Mar 2015 21:35:26 +0100
Subject: drbd: fix spurious alert level printk

When accessing out meta data area on disk, we double check the
plausibility of the requested sector offsets, and are very noisy about
it if they look suspicious.

During initial read of our "superblock", for "external" meta data,
this triggered because the range estimate returned by
drbd_md_last_sector() was still wrong.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_main.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 3a9a0f112004..a4aa7eb5507d 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -3270,6 +3270,10 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
 	 * and read it. */
 	bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
 	bdev->md.md_offset = drbd_md_ss(bdev);
+	/* Even for (flexible or indexed) external meta data,
+	 * initially restrict us to the 4k superblock for now.
+	 * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */
+	bdev->md.md_size_sect = 8;
 
 	if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) {
 		/* NOTE: can't do normal error processing here as this is
-- 
cgit 


From 088b70526d5b4080010147e4a6ae1252c3fc2228 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 24 Mar 2015 21:46:29 +0100
Subject: drbd: fix queue limit setup for discard

We cannot possibly support SECDISCARD, even if all backend devices would
support it: if our peer is currently unreachable, some instance of the
data may obviously still be recoverable.

We did not set discard_granularity at all.  We don't really care (yet),
we only pass them on, so for now, set our granularity to one sector.
blkdev_stack_limits() takes care of the rest.

If we decide we cannot support discards,
not only clear the (not user visible) QUEUE_FLAG_DISCARD,
but set both (user visible) discard_granularity and max_discard_sectors
to zero, to avoid confusion with e.g. lsblk -D.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 5e4adff91d1d..4703f1ad7f92 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1168,21 +1168,20 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
 	if (b) {
 		struct drbd_connection *connection = first_peer_device(device)->connection;
 
+		blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS);
+
 		if (blk_queue_discard(b) &&
 		    (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
-			/* For now, don't allow more than one activity log extent worth of data
-			 * to be discarded in one go. We may need to rework drbd_al_begin_io()
-			 * to allow for even larger discard ranges */
-			blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS);
-
+			/* We don't care, stacking below should fix it for the local device.
+			 * Whether or not it is a suitable granularity on the remote device
+			 * is not our problem, really. If you care, you need to
+			 * use devices with similar topology on all peers. */
+			q->limits.discard_granularity = 512;
 			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
-			/* REALLY? Is stacking secdiscard "legal"? */
-			if (blk_queue_secdiscard(b))
-				queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
 		} else {
 			blk_queue_max_discard_sectors(q, 0);
 			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
-			queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q);
+			q->limits.discard_granularity = 0;
 		}
 
 		blk_queue_stack_limits(q, b);
@@ -1194,6 +1193,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
 			q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
 		}
 	}
+	/* To avoid confusion, if this queue does not support discard, clear
+	 * max_discard_sectors, which is what lsblk -D reports to the user.  */
+	if (!blk_queue_discard(q)) {
+		blk_queue_max_discard_sectors(q, 0);
+		q->limits.discard_granularity = 0;
+	}
 }
 
 void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
-- 
cgit 


From 63a7c8ad92af5f57d4a2c5be223d6ca424c3670b Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 26 Mar 2015 20:53:55 +0100
Subject: drbd: make drbd known to lsblk: use bd_link_disk_holder

lsblk should be able to pick up stacking device driver relations
involving DRBD conveniently.

Even though upstream kernel since 2011 says
	"DON'T USE THIS UNLESS YOU'RE ALREADY USING IT."
a new user has been added since (bcache),
which sets the precedences for us to use it as well.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h    |   2 +-
 drivers/block/drbd/drbd_main.c   |  16 +-----
 drivers/block/drbd/drbd_nl.c     | 121 ++++++++++++++++++++++++++++-----------
 drivers/block/drbd/drbd_worker.c |   2 +-
 4 files changed, 90 insertions(+), 51 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 08f266eab3ee..a26265375e1e 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1126,7 +1126,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
 extern int drbd_send_bitmap(struct drbd_device *device);
 extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
 extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
-extern void drbd_free_ldev(struct drbd_backing_dev *ldev);
+extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
 extern void drbd_device_cleanup(struct drbd_device *device);
 void drbd_print_uuids(struct drbd_device *device, const char *text);
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a4aa7eb5507d..136fa733a15e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1992,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device)
 		drbd_bm_cleanup(device);
 	}
 
-	drbd_free_ldev(device->ldev);
+	drbd_backing_dev_free(device, device->ldev);
 	device->ldev = NULL;
 
 	clear_bit(AL_SUSPENDED, &device->flags);
@@ -2171,7 +2171,7 @@ void drbd_destroy_device(struct kref *kref)
 	if (device->this_bdev)
 		bdput(device->this_bdev);
 
-	drbd_free_ldev(device->ldev);
+	drbd_backing_dev_free(device, device->ldev);
 	device->ldev = NULL;
 
 	drbd_release_all_peer_reqs(device);
@@ -2964,18 +2964,6 @@ fail:
 	return err;
 }
 
-void drbd_free_ldev(struct drbd_backing_dev *ldev)
-{
-	if (ldev == NULL)
-		return;
-
-	blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
-	blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
-
-	kfree(ldev->disk_conf);
-	kfree(ldev);
-}
-
 static void drbd_free_one_sock(struct drbd_socket *ds)
 {
 	struct socket *s;
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 4703f1ad7f92..ee34739ee9ff 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1471,6 +1471,88 @@ success:
 	return 0;
 }
 
+static struct block_device *open_backing_dev(struct drbd_device *device,
+		const char *bdev_path, void *claim_ptr, bool do_bd_link)
+{
+	struct block_device *bdev;
+	int err = 0;
+
+	bdev = blkdev_get_by_path(bdev_path,
+				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr);
+	if (IS_ERR(bdev)) {
+		drbd_err(device, "open(\"%s\") failed with %ld\n",
+				bdev_path, PTR_ERR(bdev));
+		return bdev;
+	}
+
+	if (!do_bd_link)
+		return bdev;
+
+	err = bd_link_disk_holder(bdev, device->vdisk);
+	if (err) {
+		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+		drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n",
+				bdev_path, err);
+		bdev = ERR_PTR(err);
+	}
+	return bdev;
+}
+
+static int open_backing_devices(struct drbd_device *device,
+		struct disk_conf *new_disk_conf,
+		struct drbd_backing_dev *nbc)
+{
+	struct block_device *bdev;
+
+	bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true);
+	if (IS_ERR(bdev))
+		return ERR_OPEN_DISK;
+	nbc->backing_bdev = bdev;
+
+	/*
+	 * meta_dev_idx >= 0: external fixed size, possibly multiple
+	 * drbd sharing one meta device.  TODO in that case, paranoia
+	 * check that [md_bdev, meta_dev_idx] is not yet used by some
+	 * other drbd minor!  (if you use drbd.conf + drbdadm, that
+	 * should check it for you already; but if you don't, or
+	 * someone fooled it, we need to double check here)
+	 */
+	bdev = open_backing_dev(device, new_disk_conf->meta_dev,
+		/* claim ptr: device, if claimed exclusively; shared drbd_m_holder,
+		 * if potentially shared with other drbd minors */
+			(new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder,
+		/* avoid double bd_claim_by_disk() for the same (source,target) tuple,
+		 * as would happen with internal metadata. */
+			(new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT &&
+			 new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL));
+	if (IS_ERR(bdev))
+		return ERR_OPEN_MD_DISK;
+	nbc->md_bdev = bdev;
+	return NO_ERROR;
+}
+
+static void close_backing_dev(struct drbd_device *device, struct block_device *bdev,
+	bool do_bd_unlink)
+{
+	if (!bdev)
+		return;
+	if (do_bd_unlink)
+		bd_unlink_disk_holder(bdev, device->vdisk);
+	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+}
+
+void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev)
+{
+	if (ldev == NULL)
+		return;
+
+	close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev);
+	close_backing_dev(device, ldev->backing_bdev, true);
+
+	kfree(ldev->disk_conf);
+	kfree(ldev);
+}
+
 int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 {
 	struct drbd_config_context adm_ctx;
@@ -1484,7 +1566,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	sector_t min_md_device_sectors;
 	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
 	struct disk_conf *new_disk_conf = NULL;
-	struct block_device *bdev;
 	struct lru_cache *resync_lru = NULL;
 	struct fifo_buffer *new_plan = NULL;
 	union drbd_state ns, os;
@@ -1572,35 +1653,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	}
 	rcu_read_unlock();
 
-	bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
-				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, device);
-	if (IS_ERR(bdev)) {
-		drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
-			PTR_ERR(bdev));
-		retcode = ERR_OPEN_DISK;
-		goto fail;
-	}
-	nbc->backing_bdev = bdev;
-
-	/*
-	 * meta_dev_idx >= 0: external fixed size, possibly multiple
-	 * drbd sharing one meta device.  TODO in that case, paranoia
-	 * check that [md_bdev, meta_dev_idx] is not yet used by some
-	 * other drbd minor!  (if you use drbd.conf + drbdadm, that
-	 * should check it for you already; but if you don't, or
-	 * someone fooled it, we need to double check here)
-	 */
-	bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
-				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
-				  (new_disk_conf->meta_dev_idx < 0) ?
-				  (void *)device : (void *)drbd_m_holder);
-	if (IS_ERR(bdev)) {
-		drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
-			PTR_ERR(bdev));
-		retcode = ERR_OPEN_MD_DISK;
+	retcode = open_backing_devices(device, new_disk_conf, nbc);
+	if (retcode != NO_ERROR)
 		goto fail;
-	}
-	nbc->md_bdev = bdev;
 
 	if ((nbc->backing_bdev == nbc->md_bdev) !=
 	    (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
@@ -1900,12 +1955,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
  fail:
 	conn_reconfig_done(connection);
 	if (nbc) {
-		if (nbc->backing_bdev)
-			blkdev_put(nbc->backing_bdev,
-				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
-		if (nbc->md_bdev)
-			blkdev_put(nbc->md_bdev,
-				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+		close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev);
+		close_backing_dev(device, nbc->backing_bdev, true);
 		kfree(nbc);
 	}
 	kfree(new_disk_conf);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 2f29bf3e4dba..eff716c27b1f 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1841,7 +1841,7 @@ static void drbd_ldev_destroy(struct drbd_device *device)
 	device->act_log = NULL;
 
 	__acquire(local);
-	drbd_free_ldev(device->ldev);
+	drbd_backing_dev_free(device, device->ldev);
 	device->ldev = NULL;
 	__release(local);
 
-- 
cgit 


From 5bded4effb601d9f92382db38fd501f98692eb2d Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 16 Apr 2015 16:51:34 +0200
Subject: drbd: don't block forever in disconnect during resync if
 fencing=r-a-stonith

Disconnect should wait for pending bitmap IO.
But if that bitmap IO is not happening, because it is waiting for
pending application IO, and there is no progress, because the fencing
policy suspended application IO because of the disconnect,
then we deadlock.

The bitmap writeout in this case does not care for concurrent
application IO, so there is no point waiting for it.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 136fa733a15e..5b43dfb79819 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -3563,7 +3563,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
 
 	spin_lock_irq(&device->resource->req_lock);
 	set_bit(BITMAP_IO, &device->flags);
-	if (atomic_read(&device->ap_bio_cnt) == 0) {
+	/* don't wait for pending application IO if the caller indicates that
+	 * application IO does not conflict anyways. */
+	if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) {
 		if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
 			drbd_queue_work(&first_peer_device(device)->connection->sender_work,
 					&device->bm_io_work.w);
-- 
cgit 


From 706447861bb210cf2fb6a58bc1d29a6636175987 Mon Sep 17 00:00:00 2001
From: Oleg Drokin <green@linuxhacker.ru>
Date: Sun, 26 Apr 2015 01:28:43 -0400
Subject: drbd: fix memory leak in drbd_adm_resize

new_disk_conf could be leaked if the follow on checks fail,
so make sure to free it on error if it was not assigned yet.

Found with smatch.

Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index ee34739ee9ff..613778994b23 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -2706,6 +2706,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 		mutex_unlock(&device->resource->conf_update);
 		synchronize_rcu();
 		kfree(old_disk_conf);
+		new_disk_conf = NULL;
 	}
 
 	ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
@@ -2739,6 +2740,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 
  fail_ldev:
 	put_ldev(device);
+	kfree(new_disk_conf);
 	goto fail;
 }
 
-- 
cgit 


From f85d9f2d02cdcd1b79e00fccd667b37b251ba3ac Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 18 May 2015 14:08:46 +0200
Subject: drbd: fix "endless" transfer log walk in protocol A

Don't remember a DRBD request as ack_pending, if it is not.

In protocol A, we usually clear RQ_NET_PENDING at the same time we set
RQ_NET_SENT, so when deciding to remember it as ack_pending,
mod_rq_state needs to look at the current request state,
not at the previous state before the current modification was applied.

This should prevent advance_conn_req_ack_pending() from walking the full
transfer log just to find NULL in protocol A, which would cause serious
performance degradation with many "in-flight" requests, e.g. when
working via DRBD-proxy, or with a huge bandwidth-delay product.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_req.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 7907fb562388..2255dcfebd2b 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -458,7 +458,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 			atomic_add(req->i.size >> 9, &device->ap_in_flight);
 			set_if_null_req_not_net_done(peer_device, req);
 		}
-		if (s & RQ_NET_PENDING)
+		if (req->rq_state & RQ_NET_PENDING)
 			set_if_null_req_ack_pending(peer_device, req);
 	}
 
-- 
cgit 


From 7dbb4386b90a13a7b0cab12aae184e5e04c536c3 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 28 Feb 2013 10:30:19 +0100
Subject: drbd: make suspend_io() / resume_io() must be thread and recursion
 safe

Avoid to prematurely resume application IO: don't set/clear a single
bit, but inc/dec an atomic counter.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h   | 4 ++--
 drivers/block/drbd/drbd_nl.c    | 8 +++++---
 drivers/block/drbd/drbd_state.c | 2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a26265375e1e..df3d89d5777a 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -500,7 +500,6 @@ enum {
 
 	MD_NO_FUA,		/* Users wants us to not use FUA/FLUSH on meta data dev */
 
-	SUSPEND_IO,		/* suspend application io */
 	BITMAP_IO,		/* suspend application io;
 				   once no more io in flight, start bitmap io */
 	BITMAP_IO_QUEUED,       /* Started bitmap IO */
@@ -880,6 +879,7 @@ struct drbd_device {
 	atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
 	atomic_t unacked_cnt;	 /* Need to send replies for */
 	atomic_t local_cnt;	 /* Waiting for local completion */
+	atomic_t suspend_cnt;
 
 	/* Interval tree of pending local requests */
 	struct rb_root read_requests;
@@ -2263,7 +2263,7 @@ static inline bool may_inc_ap_bio(struct drbd_device *device)
 
 	if (drbd_suspended(device))
 		return false;
-	if (test_bit(SUSPEND_IO, &device->flags))
+	if (atomic_read(&device->suspend_cnt))
 		return false;
 
 	/* to avoid potential deadlock or bitmap corruption,
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 613778994b23..c7cd3df8107e 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -865,9 +865,11 @@ char *ppsize(char *buf, unsigned long long size)
  * and can be long lived.
  * This changes an device->flag, is triggered by drbd internals,
  * and should be short-lived. */
+/* It needs to be a counter, since multiple threads might
+   independently suspend and resume IO. */
 void drbd_suspend_io(struct drbd_device *device)
 {
-	set_bit(SUSPEND_IO, &device->flags);
+	atomic_inc(&device->suspend_cnt);
 	if (drbd_suspended(device))
 		return;
 	wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
@@ -875,8 +877,8 @@ void drbd_suspend_io(struct drbd_device *device)
 
 void drbd_resume_io(struct drbd_device *device)
 {
-	clear_bit(SUSPEND_IO, &device->flags);
-	wake_up(&device->misc_wait);
+	if (atomic_dec_and_test(&device->suspend_cnt))
+		wake_up(&device->misc_wait);
 }
 
 /**
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index f022e99f9855..5a7ef7873b67 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1484,7 +1484,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
 	D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
 
 	/* open coded non-blocking drbd_suspend_io(device); */
-	set_bit(SUSPEND_IO, &device->flags);
+	atomic_inc(&device->suspend_cnt);
 
 	drbd_bm_lock(device, why, flags);
 	rv = io_fn(device);
-- 
cgit 


From 603ee2c8c78b2fb5a9dc14fb8b2bb2650ebcab1f Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 3 Jun 2015 13:13:34 +0200
Subject: drbd: separate out __al_write_transaction helper function

To be able to "force out" an activity log transaction,
even if there are no pending updates.

This will be used to relocate the on-disk activity log,
if the on-disk offsets have to be changed,
without the need to empty the activity log first.

While at it, move the definition,
so we can drop the forward declaration of a static helper.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_actlog.c | 304 ++++++++++++++++++++-------------------
 1 file changed, 156 insertions(+), 148 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index b3868e7a1ffd..4b484ac1d8cb 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -288,7 +288,162 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *
 	return need_transaction;
 }
 
-static int al_write_transaction(struct drbd_device *device);
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
+
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+	return al_enr >>
+		/* bit to page */
+		((PAGE_SHIFT + 3) -
+		/* al extent number to bit */
+		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
+{
+	const unsigned int stripes = device->ldev->md.al_stripes;
+	const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
+
+	/* transaction number, modulo on-disk ring buffer wrap around */
+	unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
+
+	/* ... to aligned 4k on disk block */
+	t = ((t % stripes) * stripe_size_4kB) + t/stripes;
+
+	/* ... to 512 byte sector in activity log */
+	t *= 8;
+
+	/* ... plus offset to the on disk position */
+	return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
+}
+
+static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer)
+{
+	struct lc_element *e;
+	sector_t sector;
+	int i, mx;
+	unsigned extent_nr;
+	unsigned crc = 0;
+	int err = 0;
+
+	memset(buffer, 0, sizeof(*buffer));
+	buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
+	buffer->tr_number = cpu_to_be32(device->al_tr_number);
+
+	i = 0;
+
+	/* Even though no one can start to change this list
+	 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
+	 * lc_try_lock_for_transaction() --, someone may still
+	 * be in the process of changing it. */
+	spin_lock_irq(&device->al_lock);
+	list_for_each_entry(e, &device->act_log->to_be_changed, list) {
+		if (i == AL_UPDATES_PER_TRANSACTION) {
+			i++;
+			break;
+		}
+		buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
+		buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
+		if (e->lc_number != LC_FREE)
+			drbd_bm_mark_for_writeout(device,
+					al_extent_to_bm_page(e->lc_number));
+		i++;
+	}
+	spin_unlock_irq(&device->al_lock);
+	BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
+
+	buffer->n_updates = cpu_to_be16(i);
+	for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
+		buffer->update_slot_nr[i] = cpu_to_be16(-1);
+		buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
+	}
+
+	buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
+	buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
+
+	mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
+		   device->act_log->nr_elements - device->al_tr_cycle);
+	for (i = 0; i < mx; i++) {
+		unsigned idx = device->al_tr_cycle + i;
+		extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
+		buffer->context[i] = cpu_to_be32(extent_nr);
+	}
+	for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
+		buffer->context[i] = cpu_to_be32(LC_FREE);
+
+	device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
+	if (device->al_tr_cycle >= device->act_log->nr_elements)
+		device->al_tr_cycle = 0;
+
+	sector = al_tr_number_to_on_disk_sector(device);
+
+	crc = crc32c(0, buffer, 4096);
+	buffer->crc32c = cpu_to_be32(crc);
+
+	if (drbd_bm_write_hinted(device))
+		err = -EIO;
+	else {
+		bool write_al_updates;
+		rcu_read_lock();
+		write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+		rcu_read_unlock();
+		if (write_al_updates) {
+			if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
+				err = -EIO;
+				drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+			} else {
+				device->al_tr_number++;
+				device->al_writ_cnt++;
+			}
+		}
+	}
+
+	return err;
+}
+
+static int al_write_transaction(struct drbd_device *device)
+{
+	struct al_transaction_on_disk *buffer;
+	int err;
+
+	if (!get_ldev(device)) {
+		drbd_err(device, "disk is %s, cannot start al transaction\n",
+			drbd_disk_str(device->state.disk));
+		return -EIO;
+	}
+
+	/* The bitmap write may have failed, causing a state change. */
+	if (device->state.disk < D_INCONSISTENT) {
+		drbd_err(device,
+			"disk is %s, cannot write al transaction\n",
+			drbd_disk_str(device->state.disk));
+		put_ldev(device);
+		return -EIO;
+	}
+
+	/* protects md_io_buffer, al_tr_cycle, ... */
+	buffer = drbd_md_get_buffer(device, __func__);
+	if (!buffer) {
+		drbd_err(device, "disk failed while waiting for md_io buffer\n");
+		put_ldev(device);
+		return -ENODEV;
+	}
+
+	err = __al_write_transaction(device, buffer);
+
+	drbd_md_put_buffer(device);
+	put_ldev(device);
+
+	return err;
+}
+
 
 void drbd_al_begin_io_commit(struct drbd_device *device)
 {
@@ -420,153 +575,6 @@ void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
 	wake_up(&device->al_wait);
 }
 
-#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
-/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
- * are still coupled, or assume too much about their relation.
- * Code below will not work if this is violated.
- * Will be cleaned up with some followup patch.
- */
-# error FIXME
-#endif
-
-static unsigned int al_extent_to_bm_page(unsigned int al_enr)
-{
-	return al_enr >>
-		/* bit to page */
-		((PAGE_SHIFT + 3) -
-		/* al extent number to bit */
-		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
-}
-
-static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
-{
-	const unsigned int stripes = device->ldev->md.al_stripes;
-	const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
-
-	/* transaction number, modulo on-disk ring buffer wrap around */
-	unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
-
-	/* ... to aligned 4k on disk block */
-	t = ((t % stripes) * stripe_size_4kB) + t/stripes;
-
-	/* ... to 512 byte sector in activity log */
-	t *= 8;
-
-	/* ... plus offset to the on disk position */
-	return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
-}
-
-int al_write_transaction(struct drbd_device *device)
-{
-	struct al_transaction_on_disk *buffer;
-	struct lc_element *e;
-	sector_t sector;
-	int i, mx;
-	unsigned extent_nr;
-	unsigned crc = 0;
-	int err = 0;
-
-	if (!get_ldev(device)) {
-		drbd_err(device, "disk is %s, cannot start al transaction\n",
-			drbd_disk_str(device->state.disk));
-		return -EIO;
-	}
-
-	/* The bitmap write may have failed, causing a state change. */
-	if (device->state.disk < D_INCONSISTENT) {
-		drbd_err(device,
-			"disk is %s, cannot write al transaction\n",
-			drbd_disk_str(device->state.disk));
-		put_ldev(device);
-		return -EIO;
-	}
-
-	/* protects md_io_buffer, al_tr_cycle, ... */
-	buffer = drbd_md_get_buffer(device, __func__);
-	if (!buffer) {
-		drbd_err(device, "disk failed while waiting for md_io buffer\n");
-		put_ldev(device);
-		return -ENODEV;
-	}
-
-	memset(buffer, 0, sizeof(*buffer));
-	buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
-	buffer->tr_number = cpu_to_be32(device->al_tr_number);
-
-	i = 0;
-
-	/* Even though no one can start to change this list
-	 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
-	 * lc_try_lock_for_transaction() --, someone may still
-	 * be in the process of changing it. */
-	spin_lock_irq(&device->al_lock);
-	list_for_each_entry(e, &device->act_log->to_be_changed, list) {
-		if (i == AL_UPDATES_PER_TRANSACTION) {
-			i++;
-			break;
-		}
-		buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
-		buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
-		if (e->lc_number != LC_FREE)
-			drbd_bm_mark_for_writeout(device,
-					al_extent_to_bm_page(e->lc_number));
-		i++;
-	}
-	spin_unlock_irq(&device->al_lock);
-	BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
-
-	buffer->n_updates = cpu_to_be16(i);
-	for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
-		buffer->update_slot_nr[i] = cpu_to_be16(-1);
-		buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
-	}
-
-	buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
-	buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
-
-	mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
-		   device->act_log->nr_elements - device->al_tr_cycle);
-	for (i = 0; i < mx; i++) {
-		unsigned idx = device->al_tr_cycle + i;
-		extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
-		buffer->context[i] = cpu_to_be32(extent_nr);
-	}
-	for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
-		buffer->context[i] = cpu_to_be32(LC_FREE);
-
-	device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
-	if (device->al_tr_cycle >= device->act_log->nr_elements)
-		device->al_tr_cycle = 0;
-
-	sector = al_tr_number_to_on_disk_sector(device);
-
-	crc = crc32c(0, buffer, 4096);
-	buffer->crc32c = cpu_to_be32(crc);
-
-	if (drbd_bm_write_hinted(device))
-		err = -EIO;
-	else {
-		bool write_al_updates;
-		rcu_read_lock();
-		write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
-		rcu_read_unlock();
-		if (write_al_updates) {
-			if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
-				err = -EIO;
-				drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
-			} else {
-				device->al_tr_number++;
-				device->al_writ_cnt++;
-			}
-		}
-	}
-
-	drbd_md_put_buffer(device);
-	put_ldev(device);
-
-	return err;
-}
-
 static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
 {
 	int rv;
-- 
cgit 


From 5f7c01249bea67c32a1a1551a8f2fe0b8b801ab4 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 8 Jun 2015 15:18:45 +0200
Subject: drbd: avoid potential deadlock during handshake

During handshake communication, we also reconsider our device size,
using drbd_determine_dev_size(). Just in case we need to change the
offsets or layout of our on-disk metadata, we lock out application
and other meta data IO, and wait for the activity log to be "idle"
(no more referenced extents).

If this handshake happens just after a connection loss, with a fencing
policy of "resource-and-stonith", we have frozen IO.

If, additionally, the activity log was "starving" (too many incoming
random writes at that point in time), it won't become idle, ever,
because of the frozen IO, and this would be a lockup of the receiver
thread, and consquentially of DRBD.

Previous logic (re-)initialized with a special "empty" transaction
block, which required the activity log to fully drain first.

Instead, write out some standard activity log transactions.
Using lc_try_lock_for_transaction() instead of lc_try_lock() does not
care about pending activity log references, avoiding the potential
deadlock.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_actlog.c | 19 +++++++++++--------
 drivers/block/drbd/drbd_int.h    |  2 +-
 drivers/block/drbd/drbd_nl.c     | 33 +++++++++++++++++++--------------
 3 files changed, 31 insertions(+), 23 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 4b484ac1d8cb..10459a145062 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -614,21 +614,24 @@ void drbd_al_shrink(struct drbd_device *device)
 	wake_up(&device->al_wait);
 }
 
-int drbd_initialize_al(struct drbd_device *device, void *buffer)
+int drbd_al_initialize(struct drbd_device *device, void *buffer)
 {
 	struct al_transaction_on_disk *al = buffer;
 	struct drbd_md *md = &device->ldev->md;
-	sector_t al_base = md->md_offset + md->al_offset;
 	int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
 	int i;
 
-	memset(al, 0, 4096);
-	al->magic = cpu_to_be32(DRBD_AL_MAGIC);
-	al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
-	al->crc32c = cpu_to_be32(crc32c(0, al, 4096));
+	__al_write_transaction(device, al);
+	/* There may or may not have been a pending transaction. */
+	spin_lock_irq(&device->al_lock);
+	lc_committed(device->act_log);
+	spin_unlock_irq(&device->al_lock);
 
-	for (i = 0; i < al_size_4k; i++) {
-		int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE);
+	/* The rest of the transactions will have an empty "updates" list, and
+	 * are written out only to provide the context, and to initialize the
+	 * on-disk ring buffer. */
+	for (i = 1; i < al_size_4k; i++) {
+		int err = __al_write_transaction(device, al);
 		if (err)
 			return err;
 	}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index df3d89d5777a..b6844feb9f9b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1667,7 +1667,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s
 #define drbd_rs_failed_io(device, sector, size) \
 	__drbd_change_sync(device, sector, size, RECORD_RS_FAILED)
 extern void drbd_al_shrink(struct drbd_device *device);
-extern int drbd_initialize_al(struct drbd_device *, void *);
+extern int drbd_al_initialize(struct drbd_device *, void *);
 
 /* drbd_nl.c */
 /* state info broadcast */
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index c7cd3df8107e..f4ca27359541 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -903,15 +903,14 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 	int md_moved, la_size_changed;
 	enum determine_dev_size rv = DS_UNCHANGED;
 
-	/* race:
-	 * application request passes inc_ap_bio,
-	 * but then cannot get an AL-reference.
-	 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
+	/* We may change the on-disk offsets of our meta data below.  Lock out
+	 * anything that may cause meta data IO, to avoid acting on incomplete
+	 * layout changes or scribbling over meta data that is in the process
+	 * of being moved.
 	 *
-	 * to avoid that:
-	 * Suspend IO right here.
-	 * still lock the act_log to not trigger ASSERTs there.
-	 */
+	 * Move is not exactly correct, btw, currently we have all our meta
+	 * data in core memory, to "move" it we just write it all out, there
+	 * are no reads. */
 	drbd_suspend_io(device);
 	buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
 	if (!buffer) {
@@ -919,9 +918,6 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 		return DS_ERROR;
 	}
 
-	/* no wait necessary anymore, actually we could assert that */
-	wait_event(device->al_wait, lc_try_lock(device->act_log));
-
 	prev_first_sect = drbd_md_first_sector(device->ldev);
 	prev_size = device->ldev->md.md_size_sect;
 	la_size_sect = device->ldev->md.la_size_sect;
@@ -997,20 +993,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 		 * Clear the timer, to avoid scary "timer expired!" messages,
 		 * "Superblock" is written out at least twice below, anyways. */
 		del_timer(&device->md_sync_timer);
-		drbd_al_shrink(device); /* All extents inactive. */
 
+		/* We won't change the "al-extents" setting, we just may need
+		 * to move the on-disk location of the activity log ringbuffer.
+		 * Lock for transaction is good enough, it may well be "dirty"
+		 * or even "starving". */
+		wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log));
+
+		/* mark current on-disk bitmap and activity log as unreliable */
 		prev_flags = md->flags;
-		md->flags &= ~MDF_PRIMARY_IND;
+		md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED;
 		drbd_md_write(device, buffer);
 
+		drbd_al_initialize(device, buffer);
+
 		drbd_info(device, "Writing the whole bitmap, %s\n",
 			 la_size_changed && md_moved ? "size changed and md moved" :
 			 la_size_changed ? "size changed" : "md moved");
 		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
 		drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
 			       "size changed", BM_LOCKED_MASK);
-		drbd_initialize_al(device, buffer);
 
+		/* on-disk bitmap and activity log is authoritative again
+		 * (unless there was an IO error meanwhile...) */
 		md->flags = prev_flags;
 		drbd_md_write(device, buffer);
 
-- 
cgit 


From 8011e2490907c267e8be02a549246082d537e082 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 8 Jun 2015 14:48:38 +0200
Subject: drbd: fix error path during resize

In case the lower level device size changed, but some other internal
details of the resize did not work out, drbd_determine_dev_size() would
try to restore the previous settings, trusting
drbd_md_set_sector_offsets() to "do the right thing", but overlooked
that this internally may set the meta data base offset based on device size.

This could end up with incomplete on-disk meta data layout change, and
ultimately lead to data corruption (if the failure was not noticed or
ignored by the operator, and other things go wrong as well).

Just remember all meta data related offsets/sizes,
and on error restore them all.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 68 +++++++++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 30 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index f4ca27359541..c055c5e12f24 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -891,12 +891,18 @@ void drbd_resume_io(struct drbd_device *device)
 enum determine_dev_size
 drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
 {
-	sector_t prev_first_sect, prev_size; /* previous meta location */
-	sector_t la_size_sect, u_size;
+	struct md_offsets_and_sizes {
+		u64 last_agreed_sect;
+		u64 md_offset;
+		s32 al_offset;
+		s32 bm_offset;
+		u32 md_size_sect;
+
+		u32 al_stripes;
+		u32 al_stripe_size_4k;
+	} prev;
+	sector_t u_size, size;
 	struct drbd_md *md = &device->ldev->md;
-	u32 prev_al_stripe_size_4k;
-	u32 prev_al_stripes;
-	sector_t size;
 	char ppb[10];
 	void *buffer;
 
@@ -918,16 +924,17 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 		return DS_ERROR;
 	}
 
-	prev_first_sect = drbd_md_first_sector(device->ldev);
-	prev_size = device->ldev->md.md_size_sect;
-	la_size_sect = device->ldev->md.la_size_sect;
+	/* remember current offset and sizes */
+	prev.last_agreed_sect = md->la_size_sect;
+	prev.md_offset = md->md_offset;
+	prev.al_offset = md->al_offset;
+	prev.bm_offset = md->bm_offset;
+	prev.md_size_sect = md->md_size_sect;
+	prev.al_stripes = md->al_stripes;
+	prev.al_stripe_size_4k = md->al_stripe_size_4k;
 
 	if (rs) {
 		/* rs is non NULL if we should change the AL layout only */
-
-		prev_al_stripes = md->al_stripes;
-		prev_al_stripe_size_4k = md->al_stripe_size_4k;
-
 		md->al_stripes = rs->al_stripes;
 		md->al_stripe_size_4k = rs->al_stripe_size / 4;
 		md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
@@ -940,7 +947,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 	rcu_read_unlock();
 	size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
 
-	if (size < la_size_sect) {
+	if (size < prev.last_agreed_sect) {
 		if (rs && u_size == 0) {
 			/* Remove "rs &&" later. This check should always be active, but
 			   right now the receiver expects the permissive behavior */
@@ -961,30 +968,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 		err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
 		if (unlikely(err)) {
 			/* currently there is only one error: ENOMEM! */
-			size = drbd_bm_capacity(device)>>1;
+			size = drbd_bm_capacity(device);
 			if (size == 0) {
 				drbd_err(device, "OUT OF MEMORY! "
 				    "Could not allocate bitmap!\n");
 			} else {
 				drbd_err(device, "BM resizing failed. "
-				    "Leaving size unchanged at size = %lu KB\n",
-				    (unsigned long)size);
+				    "Leaving size unchanged\n");
 			}
 			rv = DS_ERROR;
 		}
 		/* racy, see comments above. */
 		drbd_set_my_capacity(device, size);
-		device->ldev->md.la_size_sect = size;
+		md->la_size_sect = size;
 		drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
 		     (unsigned long long)size>>1);
 	}
 	if (rv <= DS_ERROR)
 		goto err_out;
 
-	la_size_changed = (la_size_sect != device->ldev->md.la_size_sect);
+	la_size_changed = (prev.last_agreed_sect != md->la_size_sect);
 
-	md_moved = prev_first_sect != drbd_md_first_sector(device->ldev)
-		|| prev_size	   != device->ldev->md.md_size_sect;
+	md_moved = prev.md_offset    != md->md_offset
+		|| prev.md_size_sect != md->md_size_sect;
 
 	if (la_size_changed || md_moved || rs) {
 		u32 prev_flags;
@@ -1024,20 +1030,22 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 				  md->al_stripes, md->al_stripe_size_4k * 4);
 	}
 
-	if (size > la_size_sect)
-		rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
-	if (size < la_size_sect)
+	if (size > prev.last_agreed_sect)
+		rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO;
+	if (size < prev.last_agreed_sect)
 		rv = DS_SHRUNK;
 
 	if (0) {
 	err_out:
-		if (rs) {
-			md->al_stripes = prev_al_stripes;
-			md->al_stripe_size_4k = prev_al_stripe_size_4k;
-			md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
-
-			drbd_md_set_sector_offsets(device, device->ldev);
-		}
+		/* restore previous offset and sizes */
+		md->la_size_sect = prev.last_agreed_sect;
+		md->md_offset = prev.md_offset;
+		md->al_offset = prev.al_offset;
+		md->bm_offset = prev.bm_offset;
+		md->md_size_sect = prev.md_size_sect;
+		md->al_stripes = prev.al_stripes;
+		md->al_stripe_size_4k = prev.al_stripe_size_4k;
+		md->al_size_4k = (u64)prev.al_stripes * prev.al_stripe_size_4k;
 	}
 	lc_unlock(device->act_log);
 	wake_up(&device->al_wait);
-- 
cgit 


From 8182503df1baa077d1504cd192753b23530eb03c Mon Sep 17 00:00:00 2001
From: Shraddha Barke <shraddha.6596@gmail.com>
Date: Tue, 22 Dec 2015 14:02:19 +0530
Subject: block: sx8.c: Replace timeval with ktime_t

32-bit systems using 'struct timeval' will break in the year 2038,
in order to avoid that replace the code with more appropriate types.
This patch replaces timeval with 64 bit ktime_t which is y2038 safe.
Since st->timestamp is only interested in seconds, directly using
time64_t here. Function ktime_get_seconds is used since it uses
monotonic instead of real time and thus will not cause overflow.

Signed-off-by: Shraddha Barke <shraddha.6596@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/sx8.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 59c91d49b14b..baadb777e035 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -23,7 +23,7 @@
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 #include <linux/delay.h>
-#include <linux/time.h>
+#include <linux/ktime.h>
 #include <linux/hdreg.h>
 #include <linux/dma-mapping.h>
 #include <linux/completion.h>
@@ -671,16 +671,15 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 static unsigned int carm_fill_sync_time(struct carm_host *host,
 					unsigned int idx, void *mem)
 {
-	struct timeval tv;
 	struct carm_msg_sync_time *st = mem;
 
-	do_gettimeofday(&tv);
+	time64_t tv = ktime_get_seconds();
 
 	memset(st, 0, sizeof(*st));
 	st->type	= CARM_MSG_MISC;
 	st->subtype	= MISC_SET_TIME;
 	st->handle	= cpu_to_le32(TAG_ENCODE(idx));
-	st->timestamp	= cpu_to_le32(tv.tv_sec);
+	st->timestamp	= cpu_to_le32(tv);
 
 	return sizeof(struct carm_msg_sync_time);
 }
-- 
cgit 


From 39fc8830eb96a559be8554b8df1e0c31f375feaf Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 23 Dec 2015 08:42:59 -0700
Subject: sx8: use real time for the command seconds

Commit 8182503df1ba used monotonic time, but if the adapter is
using the seconds for logging entries, then we'll get duplicate
entries if the system is rebooted. Use real time instead.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 8182503df1ba ("block: sx8.c: Replace timeval with ktime_t")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/sx8.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index baadb777e035..ba4bfe933276 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -673,7 +673,7 @@ static unsigned int carm_fill_sync_time(struct carm_host *host,
 {
 	struct carm_msg_sync_time *st = mem;
 
-	time64_t tv = ktime_get_seconds();
+	time64_t tv = ktime_get_real_seconds();
 
 	memset(st, 0, sizeof(*st));
 	st->type	= CARM_MSG_MISC;
-- 
cgit 


From 2ef9ccbfcb90cf84bdba320a571b18b05c41101b Mon Sep 17 00:00:00 2001
From: Zheng Liu <gnehzuil.liu@gmail.com>
Date: Sun, 29 Nov 2015 17:17:05 -0800
Subject: bcache: fix a livelock when we cause a huge number of cache misses

Subject :	[PATCH v2] bcache: fix a livelock in btree lock
Date :	Wed, 25 Feb 2015 20:32:09 +0800 (02/25/2015 04:32:09 AM)

This commit tries to fix a livelock in bcache.  This livelock might
happen when we causes a huge number of cache misses simultaneously.

When we get a cache miss, bcache will execute the following path.

->cached_dev_make_request()
  ->cached_dev_read()
    ->cached_lookup()
      ->bch->btree_map_keys()
        ->btree_root()  <------------------------
          ->bch_btree_map_keys_recurse()        |
            ->cache_lookup_fn()                 |
              ->cached_dev_cache_miss()         |
                ->bch_btree_insert_check_key() -|
                  [If btree->seq is not equal to seq + 1, we should return
                   EINTR and traverse btree again.]

In bch_btree_insert_check_key() function we first need to check upgrade
flag (op->lock == -1), and when this flag is true we need to release
read btree->lock and try to take write btree->lock.  During taking and
releasing this write lock, btree->seq will be monotone increased in
order to prevent other threads modify this in cache miss (see btree.h:74).
But if there are some cache misses caused by some requested, we could
meet a livelock because btree->seq is always changed by others.  Thus no
one can make progress.

This commit will try to take write btree->lock if it encounters a race
when we traverse btree.  Although it sacrifice the scalability but we
can ensure that only one can modify the btree.

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Tested-by: Joshua Schmid <jschmid@suse.com>
Tested-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Joshua Schmid <jschmid@suse.com>
Cc: Zhu Yanhai <zhu.yanhai@gmail.com>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/btree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 83392f856dfd..4a1179cbc52e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2162,8 +2162,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 		rw_lock(true, b, b->level);
 
 		if (b->key.ptr[0] != btree_ptr ||
-		    b->seq != seq + 1)
+                   b->seq != seq + 1) {
+                       op->lock = b->level;
 			goto out;
+               }
 	}
 
 	SET_KEY_PTRS(check_key, 1);
-- 
cgit 


From c5f1e5adf956e3ba82d204c7c141a75da9fa449a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Sun, 29 Nov 2015 17:18:33 -0800
Subject: bcache: Add a cond_resched() call to gc

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Tested-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/btree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 4a1179cbc52e..22b9e34ceb75 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c)
 	do {
 		ret = btree_root(gc_root, c, &op, &writes, &stats);
 		closure_sync(&writes);
+		cond_resched();
 
 		if (ret && ret != -EAGAIN)
 			pr_warn("gc failed!");
-- 
cgit 


From fecaee6f20ee122ad75402c53d8278f9bb142ddc Mon Sep 17 00:00:00 2001
From: Zheng Liu <wenqing.lz@taobao.com>
Date: Sun, 29 Nov 2015 17:19:32 -0800
Subject: bcache: clear BCACHE_DEV_UNLINK_DONE flag when attaching a backing
 device

This bug can be reproduced by the following script:

  #!/bin/bash

  bcache_sysfs="/sys/fs/bcache"

  function clear_cache()
  {
  	if [ ! -e $bcache_sysfs ]; then
  		echo "no bcache sysfs"
  		exit
  	fi

  	cset_uuid=$(ls -l $bcache_sysfs|head -n 2|tail -n 1|awk '{print $9}')
  	sudo sh -c "echo $cset_uuid > /sys/block/sdb/sdb1/bcache/detach"
  	sleep 5
  	sudo sh -c "echo $cset_uuid > /sys/block/sdb/sdb1/bcache/attach"
  }

  for ((i=0;i<10;i++)); do
  	clear_cache
  done

The warning messages look like below:
[  275.948611] ------------[ cut here ]------------
[  275.963840] WARNING: at fs/sysfs/dir.c:512 sysfs_add_one+0xb8/0xd0() (Tainted: P        W
---------------   )
[  275.979253] Hardware name: Tecal RH2285
[  275.994106] sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:09.0/0000:08:00.0/host4/target4:2:1/4:2:1:0/block/sdb/sdb1/bcache/cache'
[  276.024105] Modules linked in: bcache tcp_diag inet_diag ipmi_devintf ipmi_si ipmi_msghandler
bonding 8021q garp stp llc ipv6 ext3 jbd loop sg iomemory_vsl(P) bnx2 microcode serio_raw i2c_i801
i2c_core iTCO_wdt iTCO_vendor_support i7core_edac edac_core shpchp ext4 jbd2 mbcache megaraid_sas
pata_acpi ata_generic ata_piix dm_mod [last unloaded: scsi_wait_scan]
[  276.072643] Pid: 2765, comm: sh Tainted: P        W  ---------------    2.6.32 #1
[  276.089315] Call Trace:
[  276.105801]  [<ffffffff81070fe7>] ? warn_slowpath_common+0x87/0xc0
[  276.122650]  [<ffffffff810710d6>] ? warn_slowpath_fmt+0x46/0x50
[  276.139361]  [<ffffffff81205c08>] ? sysfs_add_one+0xb8/0xd0
[  276.156012]  [<ffffffff8120609b>] ? sysfs_do_create_link+0x12b/0x170
[  276.172682]  [<ffffffff81206113>] ? sysfs_create_link+0x13/0x20
[  276.189282]  [<ffffffffa03bda21>] ? bcache_device_link+0xc1/0x110 [bcache]
[  276.205993]  [<ffffffffa03bfa08>] ? bch_cached_dev_attach+0x478/0x4f0 [bcache]
[  276.222794]  [<ffffffffa03c4a17>] ? bch_cached_dev_store+0x627/0x780 [bcache]
[  276.239680]  [<ffffffff8116783a>] ? alloc_pages_current+0xaa/0x110
[  276.256594]  [<ffffffff81203b15>] ? sysfs_write_file+0xe5/0x170
[  276.273364]  [<ffffffff811887b8>] ? vfs_write+0xb8/0x1a0
[  276.290133]  [<ffffffff811890b1>] ? sys_write+0x51/0x90
[  276.306368]  [<ffffffff8100c072>] ? system_call_fastpath+0x16/0x1b
[  276.322301] ---[ end trace 9f5d4fcdd0c3edfb ]---
[  276.338241] ------------[ cut here ]------------
[  276.354109] WARNING: at /home/wenqing.lz/bcache/bcache/super.c:720
bcache_device_link+0xdf/0x110 [bcache]() (Tainted: P        W  ---------------   )
[  276.386017] Hardware name: Tecal RH2285
[  276.401430] Couldn't create device <-> cache set symlinks
[  276.401759] Modules linked in: bcache tcp_diag inet_diag ipmi_devintf ipmi_si ipmi_msghandler
bonding 8021q garp stp llc ipv6 ext3 jbd loop sg iomemory_vsl(P) bnx2 microcode serio_raw i2c_i801
i2c_core iTCO_wdt iTCO_vendor_support i7core_edac edac_core shpchp ext4 jbd2 mbcache megaraid_sas
pata_acpi ata_generic ata_piix dm_mod [last unloaded: scsi_wait_scan]
[  276.465477] Pid: 2765, comm: sh Tainted: P        W  ---------------    2.6.32 #1
[  276.482169] Call Trace:
[  276.498610]  [<ffffffff81070fe7>] ? warn_slowpath_common+0x87/0xc0
[  276.515405]  [<ffffffff810710d6>] ? warn_slowpath_fmt+0x46/0x50
[  276.532059]  [<ffffffffa03bda3f>] ? bcache_device_link+0xdf/0x110 [bcache]
[  276.548808]  [<ffffffffa03bfa08>] ? bch_cached_dev_attach+0x478/0x4f0 [bcache]
[  276.565569]  [<ffffffffa03c4a17>] ? bch_cached_dev_store+0x627/0x780 [bcache]
[  276.582418]  [<ffffffff8116783a>] ? alloc_pages_current+0xaa/0x110
[  276.599341]  [<ffffffff81203b15>] ? sysfs_write_file+0xe5/0x170
[  276.616142]  [<ffffffff811887b8>] ? vfs_write+0xb8/0x1a0
[  276.632607]  [<ffffffff811890b1>] ? sys_write+0x51/0x90
[  276.648671]  [<ffffffff8100c072>] ? system_call_fastpath+0x16/0x1b
[  276.664756] ---[ end trace 9f5d4fcdd0c3edfc ]---

We forget to clear BCACHE_DEV_UNLINK_DONE flag in bcache_device_attach()
function when we attach a backing device first time.  After detaching this
backing device, this flag will be true and sysfs_remove_link() isn't called in
bcache_device_unlink().  Then when we attach this backing device again,
sysfs_create_link() will return EEXIST error in bcache_device_link().

So the fix is trival and we clear this flag in bcache_device_link().

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Tested-by: Joshua Schmid <jschmid@suse.com>
Tested-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 679a093a3bf6..383f06028a0a 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
 	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
 	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
 	     "Couldn't create device <-> cache set symlinks");
+
+	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
 }
 
 static void bcache_device_detach(struct bcache_device *d)
-- 
cgit 


From 4d4d8573a8451acc9f01cbea24b7e55f04a252fe Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sun, 29 Nov 2015 17:20:59 -0800
Subject: bcache: fix a leak in bch_cached_dev_run()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Joshua Schmid <jschmid@suse.com>
Tested-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/super.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 383f06028a0a..43e911e4e8d0 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -849,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc)
 	buf[SB_LABEL_SIZE] = '\0';
 	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
 
-	if (atomic_xchg(&dc->running, 1))
+	if (atomic_xchg(&dc->running, 1)) {
+		kfree(env[1]);
+		kfree(env[2]);
 		return;
+	}
 
 	if (!d->c &&
 	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
-- 
cgit 


From 2ecf0cdb2b437402110ab57546e02abfa68a716b Mon Sep 17 00:00:00 2001
From: Zheng Liu <wenqing.lz@taobao.com>
Date: Sun, 29 Nov 2015 17:21:57 -0800
Subject: bcache: unregister reboot notifier if bcache fails to unregister
 device

In bcache_init() function it forgot to unregister reboot notifier if
bcache fails to unregister a block device.  This commit fixes this.

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Tested-by: Joshua Schmid <jschmid@suse.com>
Tested-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 43e911e4e8d0..18f14a246d70 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2071,8 +2071,10 @@ static int __init bcache_init(void)
 	closure_debug_init();
 
 	bcache_major = register_blkdev(0, "bcache");
-	if (bcache_major < 0)
+	if (bcache_major < 0) {
+		unregister_reboot_notifier(&reboot);
 		return bcache_major;
+	}
 
 	if (!(bcache_wq = create_workqueue("bcache")) ||
 	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
-- 
cgit 


From d7076f21629f8f329bca4a44dc408d94670f49e2 Mon Sep 17 00:00:00 2001
From: Gabriel de Perthuis <g2p.code@gmail.com>
Date: Sun, 29 Nov 2015 18:40:23 -0800
Subject: bcache: allows use of register in udev to avoid "device_busy" error.

Allows to use register, not register_quiet in udev to avoid "device_busy" error.
The initial patch proposed at https://lkml.org/lkml/2013/8/26/549 by Gabriel de Perthuis
<g2p.code@gmail.com> does not unlock the mutex and hangs the kernel.

See http://thread.gmane.org/gmane.linux.kernel.bcache.devel/2594 for the discussion.

Cc: Denis Bychkov <manover@gmail.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Gabriel de Perthuis <g2p.code@gmail.com>
Cc: stable@vger.kernel.org

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 18f14a246d70..8d0ead98eb6e 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1938,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 			else
 				err = "device busy";
 			mutex_unlock(&bch_register_lock);
+			if (attr == &ksysfs_register_quiet)
+				goto out;
 		}
 		goto err;
 	}
@@ -1976,8 +1978,7 @@ out:
 err_close:
 	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 err:
-	if (attr != &ksysfs_register_quiet)
-		pr_info("error opening %s: %s", path, err);
+	pr_info("error opening %s: %s", path, err);
 	ret = -EINVAL;
 	goto out;
 }
-- 
cgit 


From 8d16ce540c94c9d366eb36fc91b7154d92d6397b Mon Sep 17 00:00:00 2001
From: Stefan Bader <stefan.bader@canonical.com>
Date: Sun, 29 Nov 2015 18:44:49 -0800
Subject: bcache: prevent crash on changing writeback_running

Added a safeguard in the shutdown case. At least while not being
attached it is also possible to trigger a kernel bug by writing into
writeback_running. This change  adds the same check before trying to
wake up the thread for that case.

Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/writeback.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 0a9dab187b79..073a042aed24 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 
 static inline void bch_writeback_queue(struct cached_dev *dc)
 {
-	wake_up_process(dc->writeback_thread);
+	if (!IS_ERR_OR_NULL(dc->writeback_thread))
+		wake_up_process(dc->writeback_thread);
 }
 
 static inline void bch_writeback_add(struct cached_dev *dc)
-- 
cgit 


From 627ccd20b4ad3ba836472468208e2ac4dfadbf03 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Sun, 29 Nov 2015 18:47:01 -0800
Subject: bcache: Change refill_dirty() to always scan entire disk if necessary

Previously, it would only scan the entire disk if it was starting from
the very start of the disk - i.e. if the previous scan got to the end.

This was broken by refill_full_stripes(), which updates last_scanned so
that refill_dirty was never triggering the searched_from_start path.

But if we change refill_dirty() to always scan the entire disk if
necessary, regardless of what last_scanned was, the code gets cleaner
and we fix that bug too.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/writeback.c | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index b23f88d9f18c..b9346cd9cda1 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
 
 static bool dirty_pred(struct keybuf *buf, struct bkey *k)
 {
+	struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);
+
+	BUG_ON(KEY_INODE(k) != dc->disk.id);
+
 	return KEY_DIRTY(k);
 }
 
@@ -372,11 +376,24 @@ next:
 	}
 }
 
+/*
+ * Returns true if we scanned the entire disk
+ */
 static bool refill_dirty(struct cached_dev *dc)
 {
 	struct keybuf *buf = &dc->writeback_keys;
+	struct bkey start = KEY(dc->disk.id, 0, 0);
 	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
-	bool searched_from_start = false;
+	struct bkey start_pos;
+
+	/*
+	 * make sure keybuf pos is inside the range for this disk - at bringup
+	 * we might not be attached yet so this disk's inode nr isn't
+	 * initialized then
+	 */
+	if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
+	    bkey_cmp(&buf->last_scanned, &end) > 0)
+		buf->last_scanned = start;
 
 	if (dc->partial_stripes_expensive) {
 		refill_full_stripes(dc);
@@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc)
 			return false;
 	}
 
-	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
-		buf->last_scanned = KEY(dc->disk.id, 0, 0);
-		searched_from_start = true;
-	}
-
+	start_pos = buf->last_scanned;
 	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
 
-	return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
+	if (bkey_cmp(&buf->last_scanned, &end) < 0)
+		return false;
+
+	/*
+	 * If we get to the end start scanning again from the beginning, and
+	 * only scan up to where we initially started scanning from:
+	 */
+	buf->last_scanned = start;
+	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);
+
+	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
 }
 
 static int bch_writeback_thread(void *arg)
-- 
cgit 


From 81f351615772365d46ceeac3e50c9dd4e8f9dc89 Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Sat, 14 Nov 2015 11:12:11 +0800
Subject: xen/blkfront: separate per ring information out of device info

Split per ring information to a new structure "blkfront_ring_info".

A ring is the representation of a hardware queue, every vbd device can associate
with one or more rings depending on how many hardware queues/rings to be used.

This patch is a preparation for supporting real multi hardware queues/rings.

We also add a backpointer to 'struct blkfront_info' (dev_info) which
is not needed (we could use containers_of) but further patch
("xen/blkfront: pseudo support for multi hardware queues/rings")
will make allocation of 'blkfront_ring_info' dynamic.

Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 359 ++++++++++++++++++++++++-------------------
 1 file changed, 197 insertions(+), 162 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2fee2eef988d..0c3ad214a792 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -119,6 +119,23 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
  */
 #define RINGREF_NAME_LEN (20)
 
+/*
+ *  Per-ring info.
+ *  Every blkfront device can associate with one or more blkfront_ring_info,
+ *  depending on how many hardware queues/rings to be used.
+ */
+struct blkfront_ring_info {
+	struct blkif_front_ring ring;
+	unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
+	unsigned int evtchn, irq;
+	struct work_struct work;
+	struct gnttab_free_callback callback;
+	struct blk_shadow shadow[BLK_MAX_RING_SIZE];
+	struct list_head indirect_pages;
+	unsigned long shadow_free;
+	struct blkfront_info *dev_info;
+};
+
 /*
  * We have one of these per vbd, whether ide, scsi or 'other'.  They
  * hang in private_data off the gendisk structure. We may end up
@@ -133,18 +150,10 @@ struct blkfront_info
 	int vdevice;
 	blkif_vdev_t handle;
 	enum blkif_state connected;
-	int ring_ref[XENBUS_MAX_RING_GRANTS];
 	unsigned int nr_ring_pages;
-	struct blkif_front_ring ring;
-	unsigned int evtchn, irq;
 	struct request_queue *rq;
-	struct work_struct work;
-	struct gnttab_free_callback callback;
-	struct blk_shadow shadow[BLK_MAX_RING_SIZE];
 	struct list_head grants;
-	struct list_head indirect_pages;
 	unsigned int persistent_gnts_c;
-	unsigned long shadow_free;
 	unsigned int feature_flush;
 	unsigned int feature_discard:1;
 	unsigned int feature_secdiscard:1;
@@ -155,6 +164,7 @@ struct blkfront_info
 	unsigned int max_indirect_segments;
 	int is_ready;
 	struct blk_mq_tag_set tag_set;
+	struct blkfront_ring_info rinfo;
 };
 
 static unsigned int nr_minors;
@@ -198,33 +208,35 @@ static DEFINE_SPINLOCK(minor_lock);
 
 #define GREFS(_psegs)	((_psegs) * GRANTS_PER_PSEG)
 
-static int blkfront_setup_indirect(struct blkfront_info *info);
+static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
 static int blkfront_gather_backend_features(struct blkfront_info *info);
 
-static int get_id_from_freelist(struct blkfront_info *info)
+static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
 {
-	unsigned long free = info->shadow_free;
-	BUG_ON(free >= BLK_RING_SIZE(info));
-	info->shadow_free = info->shadow[free].req.u.rw.id;
-	info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
+	unsigned long free = rinfo->shadow_free;
+
+	BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info));
+	rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id;
+	rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
 	return free;
 }
 
-static int add_id_to_freelist(struct blkfront_info *info,
+static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
 			       unsigned long id)
 {
-	if (info->shadow[id].req.u.rw.id != id)
+	if (rinfo->shadow[id].req.u.rw.id != id)
 		return -EINVAL;
-	if (info->shadow[id].request == NULL)
+	if (rinfo->shadow[id].request == NULL)
 		return -EINVAL;
-	info->shadow[id].req.u.rw.id  = info->shadow_free;
-	info->shadow[id].request = NULL;
-	info->shadow_free = id;
+	rinfo->shadow[id].req.u.rw.id  = rinfo->shadow_free;
+	rinfo->shadow[id].request = NULL;
+	rinfo->shadow_free = id;
 	return 0;
 }
 
-static int fill_grant_buffer(struct blkfront_info *info, int num)
+static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
 {
+	struct blkfront_info *info = rinfo->dev_info;
 	struct page *granted_page;
 	struct grant *gnt_list_entry, *n;
 	int i = 0;
@@ -326,8 +338,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
 		struct page *indirect_page;
 
 		/* Fetch a pre-allocated page to use for indirect grefs */
-		BUG_ON(list_empty(&info->indirect_pages));
-		indirect_page = list_first_entry(&info->indirect_pages,
+		BUG_ON(list_empty(&info->rinfo.indirect_pages));
+		indirect_page = list_first_entry(&info->rinfo.indirect_pages,
 						 struct page, lru);
 		list_del(&indirect_page->lru);
 		gnt_list_entry->page = indirect_page;
@@ -403,8 +415,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr)
 
 static void blkif_restart_queue_callback(void *arg)
 {
-	struct blkfront_info *info = (struct blkfront_info *)arg;
-	schedule_work(&info->work);
+	struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg;
+	schedule_work(&rinfo->work);
 }
 
 static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
@@ -456,16 +468,16 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
 	return 0;
 }
 
-static int blkif_queue_discard_req(struct request *req)
+static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo)
 {
-	struct blkfront_info *info = req->rq_disk->private_data;
+	struct blkfront_info *info = rinfo->dev_info;
 	struct blkif_request *ring_req;
 	unsigned long id;
 
 	/* Fill out a communications ring structure. */
-	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
-	id = get_id_from_freelist(info);
-	info->shadow[id].request = req;
+	ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
+	id = get_id_from_freelist(rinfo);
+	rinfo->shadow[id].request = req;
 
 	ring_req->operation = BLKIF_OP_DISCARD;
 	ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
@@ -476,10 +488,10 @@ static int blkif_queue_discard_req(struct request *req)
 	else
 		ring_req->u.discard.flag = 0;
 
-	info->ring.req_prod_pvt++;
+	rinfo->ring.req_prod_pvt++;
 
 	/* Keep a private copy so we can reissue requests when recovering. */
-	info->shadow[id].req = *ring_req;
+	rinfo->shadow[id].req = *ring_req;
 
 	return 0;
 }
@@ -487,7 +499,7 @@ static int blkif_queue_discard_req(struct request *req)
 struct setup_rw_req {
 	unsigned int grant_idx;
 	struct blkif_request_segment *segments;
-	struct blkfront_info *info;
+	struct blkfront_ring_info *rinfo;
 	struct blkif_request *ring_req;
 	grant_ref_t gref_head;
 	unsigned int id;
@@ -507,8 +519,9 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 	/* Convenient aliases */
 	unsigned int grant_idx = setup->grant_idx;
 	struct blkif_request *ring_req = setup->ring_req;
-	struct blkfront_info *info = setup->info;
-	struct blk_shadow *shadow = &info->shadow[setup->id];
+	struct blkfront_ring_info *rinfo = setup->rinfo;
+	struct blkfront_info *info = rinfo->dev_info;
+	struct blk_shadow *shadow = &rinfo->shadow[setup->id];
 
 	if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
 	    (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
@@ -566,16 +579,16 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 	(setup->grant_idx)++;
 }
 
-static int blkif_queue_rw_req(struct request *req)
+static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
 {
-	struct blkfront_info *info = req->rq_disk->private_data;
+	struct blkfront_info *info = rinfo->dev_info;
 	struct blkif_request *ring_req;
 	unsigned long id;
 	int i;
 	struct setup_rw_req setup = {
 		.grant_idx = 0,
 		.segments = NULL,
-		.info = info,
+		.rinfo = rinfo,
 		.need_copy = rq_data_dir(req) && info->feature_persistent,
 	};
 
@@ -603,9 +616,9 @@ static int blkif_queue_rw_req(struct request *req)
 		    max_grefs - info->persistent_gnts_c,
 		    &setup.gref_head) < 0) {
 			gnttab_request_free_callback(
-				&info->callback,
+				&rinfo->callback,
 				blkif_restart_queue_callback,
-				info,
+				rinfo,
 				max_grefs);
 			return 1;
 		}
@@ -613,23 +626,23 @@ static int blkif_queue_rw_req(struct request *req)
 		new_persistent_gnts = 0;
 
 	/* Fill out a communications ring structure. */
-	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
-	id = get_id_from_freelist(info);
-	info->shadow[id].request = req;
+	ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
+	id = get_id_from_freelist(rinfo);
+	rinfo->shadow[id].request = req;
 
 	BUG_ON(info->max_indirect_segments == 0 &&
 	       GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
 	BUG_ON(info->max_indirect_segments &&
 	       GREFS(req->nr_phys_segments) > info->max_indirect_segments);
 
-	num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
+	num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
 	num_grant = 0;
 	/* Calculate the number of grant used */
-	for_each_sg(info->shadow[id].sg, sg, num_sg, i)
+	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
 	       num_grant += gnttab_count_grant(sg->offset, sg->length);
 
 	ring_req->u.rw.id = id;
-	info->shadow[id].num_sg = num_sg;
+	rinfo->shadow[id].num_sg = num_sg;
 	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
 		/*
 		 * The indirect operation can only be a BLKIF_OP_READ or
@@ -674,7 +687,7 @@ static int blkif_queue_rw_req(struct request *req)
 
 	setup.ring_req = ring_req;
 	setup.id = id;
-	for_each_sg(info->shadow[id].sg, sg, num_sg, i) {
+	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
 		BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
 		if (setup.need_copy) {
@@ -694,10 +707,10 @@ static int blkif_queue_rw_req(struct request *req)
 	if (setup.segments)
 		kunmap_atomic(setup.segments);
 
-	info->ring.req_prod_pvt++;
+	rinfo->ring.req_prod_pvt++;
 
 	/* Keep a private copy so we can reissue requests when recovering. */
-	info->shadow[id].req = *ring_req;
+	rinfo->shadow[id].req = *ring_req;
 
 	if (new_persistent_gnts)
 		gnttab_free_grant_references(setup.gref_head);
@@ -711,27 +724,25 @@ static int blkif_queue_rw_req(struct request *req)
  *
  * @req: a request struct
  */
-static int blkif_queue_request(struct request *req)
+static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo)
 {
-	struct blkfront_info *info = req->rq_disk->private_data;
-
-	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+	if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
 		return 1;
 
 	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE)))
-		return blkif_queue_discard_req(req);
+		return blkif_queue_discard_req(req, rinfo);
 	else
-		return blkif_queue_rw_req(req);
+		return blkif_queue_rw_req(req, rinfo);
 }
 
-static inline void flush_requests(struct blkfront_info *info)
+static inline void flush_requests(struct blkfront_ring_info *rinfo)
 {
 	int notify;
 
-	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify);
 
 	if (notify)
-		notify_remote_via_irq(info->irq);
+		notify_remote_via_irq(rinfo->irq);
 }
 
 static inline bool blkif_request_flush_invalid(struct request *req,
@@ -747,20 +758,21 @@ static inline bool blkif_request_flush_invalid(struct request *req,
 static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
 			   const struct blk_mq_queue_data *qd)
 {
-	struct blkfront_info *info = qd->rq->rq_disk->private_data;
+	struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data;
+	struct blkfront_info *info = rinfo->dev_info;
 
 	blk_mq_start_request(qd->rq);
 	spin_lock_irq(&info->io_lock);
-	if (RING_FULL(&info->ring))
+	if (RING_FULL(&rinfo->ring))
 		goto out_busy;
 
-	if (blkif_request_flush_invalid(qd->rq, info))
+	if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info))
 		goto out_err;
 
-	if (blkif_queue_request(qd->rq))
+	if (blkif_queue_request(qd->rq, rinfo))
 		goto out_busy;
 
-	flush_requests(info);
+	flush_requests(rinfo);
 	spin_unlock_irq(&info->io_lock);
 	return BLK_MQ_RQ_QUEUE_OK;
 
@@ -774,9 +786,19 @@ out_busy:
 	return BLK_MQ_RQ_QUEUE_BUSY;
 }
 
+static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+			    unsigned int index)
+{
+	struct blkfront_info *info = (struct blkfront_info *)data;
+
+	hctx->driver_data = &info->rinfo;
+	return 0;
+}
+
 static struct blk_mq_ops blkfront_mq_ops = {
 	.queue_rq = blkif_queue_rq,
 	.map_queue = blk_mq_map_queue,
+	.init_hctx = blk_mq_init_hctx,
 };
 
 static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
@@ -1029,6 +1051,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 static void xlvbd_release_gendisk(struct blkfront_info *info)
 {
 	unsigned int minor, nr_minors;
+	struct blkfront_ring_info *rinfo = &info->rinfo;
 
 	if (info->rq == NULL)
 		return;
@@ -1037,10 +1060,10 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
 	blk_mq_stop_hw_queues(info->rq);
 
 	/* No more gnttab callback work. */
-	gnttab_cancel_free_callback(&info->callback);
+	gnttab_cancel_free_callback(&rinfo->callback);
 
 	/* Flush gnttab callback work. Must be done with no locks held. */
-	flush_work(&info->work);
+	flush_work(&rinfo->work);
 
 	del_gendisk(info->gd);
 
@@ -1057,20 +1080,20 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
 }
 
 /* Must be called with io_lock holded */
-static void kick_pending_request_queues(struct blkfront_info *info)
+static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
 {
-	if (!RING_FULL(&info->ring))
-		blk_mq_start_stopped_hw_queues(info->rq, true);
+	if (!RING_FULL(&rinfo->ring))
+		blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
 }
 
 static void blkif_restart_queue(struct work_struct *work)
 {
-	struct blkfront_info *info = container_of(work, struct blkfront_info, work);
+	struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work);
 
-	spin_lock_irq(&info->io_lock);
-	if (info->connected == BLKIF_STATE_CONNECTED)
-		kick_pending_request_queues(info);
-	spin_unlock_irq(&info->io_lock);
+	spin_lock_irq(&rinfo->dev_info->io_lock);
+	if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
+		kick_pending_request_queues(rinfo);
+	spin_unlock_irq(&rinfo->dev_info->io_lock);
 }
 
 static void blkif_free(struct blkfront_info *info, int suspend)
@@ -1078,6 +1101,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 	struct grant *persistent_gnt;
 	struct grant *n;
 	int i, j, segs;
+	struct blkfront_ring_info *rinfo = &info->rinfo;
 
 	/* Prevent new requests being issued until we fix things up. */
 	spin_lock_irq(&info->io_lock);
@@ -1090,7 +1114,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 	/* Remove all persistent grants */
 	if (!list_empty(&info->grants)) {
 		list_for_each_entry_safe(persistent_gnt, n,
-		                         &info->grants, node) {
+					 &info->grants, node) {
 			list_del(&persistent_gnt->node);
 			if (persistent_gnt->gref != GRANT_INVALID_REF) {
 				gnttab_end_foreign_access(persistent_gnt->gref,
@@ -1108,11 +1132,11 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 	 * Remove indirect pages, this only happens when using indirect
 	 * descriptors but not persistent grants
 	 */
-	if (!list_empty(&info->indirect_pages)) {
+	if (!list_empty(&rinfo->indirect_pages)) {
 		struct page *indirect_page, *n;
 
 		BUG_ON(info->feature_persistent);
-		list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+		list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
 			list_del(&indirect_page->lru);
 			__free_page(indirect_page);
 		}
@@ -1123,21 +1147,21 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 		 * Clear persistent grants present in requests already
 		 * on the shared ring
 		 */
-		if (!info->shadow[i].request)
+		if (!rinfo->shadow[i].request)
 			goto free_shadow;
 
-		segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
-		       info->shadow[i].req.u.indirect.nr_segments :
-		       info->shadow[i].req.u.rw.nr_segments;
+		segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
+		       rinfo->shadow[i].req.u.indirect.nr_segments :
+		       rinfo->shadow[i].req.u.rw.nr_segments;
 		for (j = 0; j < segs; j++) {
-			persistent_gnt = info->shadow[i].grants_used[j];
+			persistent_gnt = rinfo->shadow[i].grants_used[j];
 			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
 			if (info->feature_persistent)
 				__free_page(persistent_gnt->page);
 			kfree(persistent_gnt);
 		}
 
-		if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
+		if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT)
 			/*
 			 * If this is not an indirect operation don't try to
 			 * free indirect segments
@@ -1145,41 +1169,41 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 			goto free_shadow;
 
 		for (j = 0; j < INDIRECT_GREFS(segs); j++) {
-			persistent_gnt = info->shadow[i].indirect_grants[j];
+			persistent_gnt = rinfo->shadow[i].indirect_grants[j];
 			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
 			__free_page(persistent_gnt->page);
 			kfree(persistent_gnt);
 		}
 
 free_shadow:
-		kfree(info->shadow[i].grants_used);
-		info->shadow[i].grants_used = NULL;
-		kfree(info->shadow[i].indirect_grants);
-		info->shadow[i].indirect_grants = NULL;
-		kfree(info->shadow[i].sg);
-		info->shadow[i].sg = NULL;
+		kfree(rinfo->shadow[i].grants_used);
+		rinfo->shadow[i].grants_used = NULL;
+		kfree(rinfo->shadow[i].indirect_grants);
+		rinfo->shadow[i].indirect_grants = NULL;
+		kfree(rinfo->shadow[i].sg);
+		rinfo->shadow[i].sg = NULL;
 	}
 
 	/* No more gnttab callback work. */
-	gnttab_cancel_free_callback(&info->callback);
+	gnttab_cancel_free_callback(&rinfo->callback);
 	spin_unlock_irq(&info->io_lock);
 
 	/* Flush gnttab callback work. Must be done with no locks held. */
-	flush_work(&info->work);
+	flush_work(&rinfo->work);
 
 	/* Free resources associated with old device channel. */
 	for (i = 0; i < info->nr_ring_pages; i++) {
-		if (info->ring_ref[i] != GRANT_INVALID_REF) {
-			gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
-			info->ring_ref[i] = GRANT_INVALID_REF;
+		if (rinfo->ring_ref[i] != GRANT_INVALID_REF) {
+			gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0);
+			rinfo->ring_ref[i] = GRANT_INVALID_REF;
 		}
 	}
-	free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
-	info->ring.sring = NULL;
+	free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
+	rinfo->ring.sring = NULL;
 
-	if (info->irq)
-		unbind_from_irqhandler(info->irq, info);
-	info->evtchn = info->irq = 0;
+	if (rinfo->irq)
+		unbind_from_irqhandler(rinfo->irq, rinfo);
+	rinfo->evtchn = rinfo->irq = 0;
 
 }
 
@@ -1209,12 +1233,13 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
 	kunmap_atomic(shared_data);
 }
 
-static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
+static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *rinfo,
 			     struct blkif_response *bret)
 {
 	int i = 0;
 	struct scatterlist *sg;
 	int num_sg, num_grant;
+	struct blkfront_info *info = rinfo->dev_info;
 	struct copy_from_grant data = {
 		.s = s,
 		.grant_idx = 0,
@@ -1284,7 +1309,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 				 */
 				if (!info->feature_persistent) {
 					indirect_page = s->indirect_grants[i]->page;
-					list_add(&indirect_page->lru, &info->indirect_pages);
+					list_add(&indirect_page->lru, &rinfo->indirect_pages);
 				}
 				s->indirect_grants[i]->gref = GRANT_INVALID_REF;
 				list_add_tail(&s->indirect_grants[i]->node, &info->grants);
@@ -1299,7 +1324,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 	struct blkif_response *bret;
 	RING_IDX i, rp;
 	unsigned long flags;
-	struct blkfront_info *info = (struct blkfront_info *)dev_id;
+	struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
+	struct blkfront_info *info = rinfo->dev_info;
 	int error;
 
 	spin_lock_irqsave(&info->io_lock, flags);
@@ -1310,13 +1336,13 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 	}
 
  again:
-	rp = info->ring.sring->rsp_prod;
+	rp = rinfo->ring.sring->rsp_prod;
 	rmb(); /* Ensure we see queued responses up to 'rp'. */
 
-	for (i = info->ring.rsp_cons; i != rp; i++) {
+	for (i = rinfo->ring.rsp_cons; i != rp; i++) {
 		unsigned long id;
 
-		bret = RING_GET_RESPONSE(&info->ring, i);
+		bret = RING_GET_RESPONSE(&rinfo->ring, i);
 		id   = bret->id;
 		/*
 		 * The backend has messed up and given us an id that we would
@@ -1330,12 +1356,12 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 			 * the id is busted. */
 			continue;
 		}
-		req  = info->shadow[id].request;
+		req  = rinfo->shadow[id].request;
 
 		if (bret->operation != BLKIF_OP_DISCARD)
-			blkif_completion(&info->shadow[id], info, bret);
+			blkif_completion(&rinfo->shadow[id], rinfo, bret);
 
-		if (add_id_to_freelist(info, id)) {
+		if (add_id_to_freelist(rinfo, id)) {
 			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
 			     info->gd->disk_name, op_name(bret->operation), id);
 			continue;
@@ -1364,7 +1390,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 				error = -EOPNOTSUPP;
 			}
 			if (unlikely(bret->status == BLKIF_RSP_ERROR &&
-				     info->shadow[id].req.u.rw.nr_segments == 0)) {
+				     rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
 				printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
 				       info->gd->disk_name, op_name(bret->operation));
 				error = -EOPNOTSUPP;
@@ -1389,17 +1415,17 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 		}
 	}
 
-	info->ring.rsp_cons = i;
+	rinfo->ring.rsp_cons = i;
 
-	if (i != info->ring.req_prod_pvt) {
+	if (i != rinfo->ring.req_prod_pvt) {
 		int more_to_do;
-		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+		RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do);
 		if (more_to_do)
 			goto again;
 	} else
-		info->ring.sring->rsp_event = i + 1;
+		rinfo->ring.sring->rsp_event = i + 1;
 
-	kick_pending_request_queues(info);
+	kick_pending_request_queues(rinfo);
 
 	spin_unlock_irqrestore(&info->io_lock, flags);
 
@@ -1408,15 +1434,16 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 
 
 static int setup_blkring(struct xenbus_device *dev,
-			 struct blkfront_info *info)
+			 struct blkfront_ring_info *rinfo)
 {
 	struct blkif_sring *sring;
 	int err, i;
+	struct blkfront_info *info = rinfo->dev_info;
 	unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
 	grant_ref_t gref[XENBUS_MAX_RING_GRANTS];
 
 	for (i = 0; i < info->nr_ring_pages; i++)
-		info->ring_ref[i] = GRANT_INVALID_REF;
+		rinfo->ring_ref[i] = GRANT_INVALID_REF;
 
 	sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
 						       get_order(ring_size));
@@ -1425,29 +1452,29 @@ static int setup_blkring(struct xenbus_device *dev,
 		return -ENOMEM;
 	}
 	SHARED_RING_INIT(sring);
-	FRONT_RING_INIT(&info->ring, sring, ring_size);
+	FRONT_RING_INIT(&rinfo->ring, sring, ring_size);
 
-	err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
+	err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref);
 	if (err < 0) {
 		free_pages((unsigned long)sring, get_order(ring_size));
-		info->ring.sring = NULL;
+		rinfo->ring.sring = NULL;
 		goto fail;
 	}
 	for (i = 0; i < info->nr_ring_pages; i++)
-		info->ring_ref[i] = gref[i];
+		rinfo->ring_ref[i] = gref[i];
 
-	err = xenbus_alloc_evtchn(dev, &info->evtchn);
+	err = xenbus_alloc_evtchn(dev, &rinfo->evtchn);
 	if (err)
 		goto fail;
 
-	err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0,
-					"blkif", info);
+	err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0,
+					"blkif", rinfo);
 	if (err <= 0) {
 		xenbus_dev_fatal(dev, err,
 				 "bind_evtchn_to_irqhandler failed");
 		goto fail;
 	}
-	info->irq = err;
+	rinfo->irq = err;
 
 	return 0;
 fail:
@@ -1465,6 +1492,7 @@ static int talk_to_blkback(struct xenbus_device *dev,
 	int err, i;
 	unsigned int max_page_order = 0;
 	unsigned int ring_page_order = 0;
+	struct blkfront_ring_info *rinfo = &info->rinfo;
 
 	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
 			   "max-ring-page-order", "%u", &max_page_order);
@@ -1476,7 +1504,7 @@ static int talk_to_blkback(struct xenbus_device *dev,
 	}
 
 	/* Create shared ring, alloc event channel. */
-	err = setup_blkring(dev, info);
+	err = setup_blkring(dev, rinfo);
 	if (err)
 		goto out;
 
@@ -1489,7 +1517,7 @@ again:
 
 	if (info->nr_ring_pages == 1) {
 		err = xenbus_printf(xbt, dev->nodename,
-				    "ring-ref", "%u", info->ring_ref[0]);
+				    "ring-ref", "%u", rinfo->ring_ref[0]);
 		if (err) {
 			message = "writing ring-ref";
 			goto abort_transaction;
@@ -1507,7 +1535,7 @@ again:
 
 			snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
 			err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
-					    "%u", info->ring_ref[i]);
+					    "%u", rinfo->ring_ref[i]);
 			if (err) {
 				message = "writing ring-ref";
 				goto abort_transaction;
@@ -1515,7 +1543,7 @@ again:
 		}
 	}
 	err = xenbus_printf(xbt, dev->nodename,
-			    "event-channel", "%u", info->evtchn);
+			    "event-channel", "%u", rinfo->evtchn);
 	if (err) {
 		message = "writing event-channel";
 		goto abort_transaction;
@@ -1541,8 +1569,8 @@ again:
 	}
 
 	for (i = 0; i < BLK_RING_SIZE(info); i++)
-		info->shadow[i].req.u.rw.id = i+1;
-	info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+		rinfo->shadow[i].req.u.rw.id = i+1;
+	rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
 	xenbus_switch_state(dev, XenbusStateInitialised);
 
 	return 0;
@@ -1568,6 +1596,7 @@ static int blkfront_probe(struct xenbus_device *dev,
 {
 	int err, vdevice;
 	struct blkfront_info *info;
+	struct blkfront_ring_info *rinfo;
 
 	/* FIXME: Use dynamic device id if this is not set. */
 	err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -1617,15 +1646,18 @@ static int blkfront_probe(struct xenbus_device *dev,
 		return -ENOMEM;
 	}
 
+	rinfo = &info->rinfo;
+	INIT_LIST_HEAD(&rinfo->indirect_pages);
+	rinfo->dev_info = info;
+	INIT_WORK(&rinfo->work, blkif_restart_queue);
+
 	mutex_init(&info->mutex);
 	spin_lock_init(&info->io_lock);
 	info->xbdev = dev;
 	info->vdevice = vdevice;
 	INIT_LIST_HEAD(&info->grants);
-	INIT_LIST_HEAD(&info->indirect_pages);
 	info->persistent_gnts_c = 0;
 	info->connected = BLKIF_STATE_DISCONNECTED;
-	INIT_WORK(&info->work, blkif_restart_queue);
 
 	/* Front end dir is a number, which is used as the id. */
 	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
@@ -1659,19 +1691,20 @@ static int blkif_recover(struct blkfront_info *info)
 	int pending, size;
 	struct split_bio *split_bio;
 	struct list_head requests;
+	struct blkfront_ring_info *rinfo = &info->rinfo;
 
 	/* Stage 1: Make a safe copy of the shadow state. */
-	copy = kmemdup(info->shadow, sizeof(info->shadow),
+	copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
 		       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
 	if (!copy)
 		return -ENOMEM;
 
 	/* Stage 2: Set up free list. */
-	memset(&info->shadow, 0, sizeof(info->shadow));
+	memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
 	for (i = 0; i < BLK_RING_SIZE(info); i++)
-		info->shadow[i].req.u.rw.id = i+1;
-	info->shadow_free = info->ring.req_prod_pvt;
-	info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+		rinfo->shadow[i].req.u.rw.id = i+1;
+	rinfo->shadow_free = rinfo->ring.req_prod_pvt;
+	rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
 
 	rc = blkfront_gather_backend_features(info);
 	if (rc) {
@@ -1717,7 +1750,7 @@ static int blkif_recover(struct blkfront_info *info)
 	info->connected = BLKIF_STATE_CONNECTED;
 
 	/* Kick any other new requests queued since we resumed */
-	kick_pending_request_queues(info);
+	kick_pending_request_queues(rinfo);
 
 	list_for_each_entry_safe(req, n, &requests, queuelist) {
 		/* Requeue pending requests (flush or discard) */
@@ -1851,10 +1884,11 @@ static void blkfront_setup_discard(struct blkfront_info *info)
 		info->feature_secdiscard = !!discard_secure;
 }
 
-static int blkfront_setup_indirect(struct blkfront_info *info)
+static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
 {
 	unsigned int psegs, grants;
 	int err, i;
+	struct blkfront_info *info = rinfo->dev_info;
 
 	if (info->max_indirect_segments == 0)
 		grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
@@ -1862,7 +1896,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
 		grants = info->max_indirect_segments;
 	psegs = grants / GRANTS_PER_PSEG;
 
-	err = fill_grant_buffer(info,
+	err = fill_grant_buffer(rinfo,
 				(grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
 	if (err)
 		goto out_of_memory;
@@ -1875,31 +1909,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
 		 */
 		int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
 
-		BUG_ON(!list_empty(&info->indirect_pages));
+		BUG_ON(!list_empty(&rinfo->indirect_pages));
 		for (i = 0; i < num; i++) {
 			struct page *indirect_page = alloc_page(GFP_NOIO);
 			if (!indirect_page)
 				goto out_of_memory;
-			list_add(&indirect_page->lru, &info->indirect_pages);
+			list_add(&indirect_page->lru, &rinfo->indirect_pages);
 		}
 	}
 
 	for (i = 0; i < BLK_RING_SIZE(info); i++) {
-		info->shadow[i].grants_used = kzalloc(
-			sizeof(info->shadow[i].grants_used[0]) * grants,
+		rinfo->shadow[i].grants_used = kzalloc(
+			sizeof(rinfo->shadow[i].grants_used[0]) * grants,
 			GFP_NOIO);
-		info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO);
+		rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO);
 		if (info->max_indirect_segments)
-			info->shadow[i].indirect_grants = kzalloc(
-				sizeof(info->shadow[i].indirect_grants[0]) *
+			rinfo->shadow[i].indirect_grants = kzalloc(
+				sizeof(rinfo->shadow[i].indirect_grants[0]) *
 				INDIRECT_GREFS(grants),
 				GFP_NOIO);
-		if ((info->shadow[i].grants_used == NULL) ||
-			(info->shadow[i].sg == NULL) ||
+		if ((rinfo->shadow[i].grants_used == NULL) ||
+			(rinfo->shadow[i].sg == NULL) ||
 		     (info->max_indirect_segments &&
-		     (info->shadow[i].indirect_grants == NULL)))
+		     (rinfo->shadow[i].indirect_grants == NULL)))
 			goto out_of_memory;
-		sg_init_table(info->shadow[i].sg, psegs);
+		sg_init_table(rinfo->shadow[i].sg, psegs);
 	}
 
 
@@ -1907,16 +1941,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
 
 out_of_memory:
 	for (i = 0; i < BLK_RING_SIZE(info); i++) {
-		kfree(info->shadow[i].grants_used);
-		info->shadow[i].grants_used = NULL;
-		kfree(info->shadow[i].sg);
-		info->shadow[i].sg = NULL;
-		kfree(info->shadow[i].indirect_grants);
-		info->shadow[i].indirect_grants = NULL;
-	}
-	if (!list_empty(&info->indirect_pages)) {
+		kfree(rinfo->shadow[i].grants_used);
+		rinfo->shadow[i].grants_used = NULL;
+		kfree(rinfo->shadow[i].sg);
+		rinfo->shadow[i].sg = NULL;
+		kfree(rinfo->shadow[i].indirect_grants);
+		rinfo->shadow[i].indirect_grants = NULL;
+	}
+	if (!list_empty(&rinfo->indirect_pages)) {
 		struct page *indirect_page, *n;
-		list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+		list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
 			list_del(&indirect_page->lru);
 			__free_page(indirect_page);
 		}
@@ -1983,7 +2017,7 @@ static int blkfront_gather_backend_features(struct blkfront_info *info)
 		info->max_indirect_segments = min(indirect_segments,
 						  xen_blkif_max_segments);
 
-	return blkfront_setup_indirect(info);
+	return blkfront_setup_indirect(&info->rinfo);
 }
 
 /*
@@ -1997,6 +2031,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	unsigned int physical_sector_size;
 	unsigned int binfo;
 	int err;
+	struct blkfront_ring_info *rinfo = &info->rinfo;
 
 	switch (info->connected) {
 	case BLKIF_STATE_CONNECTED:
@@ -2073,7 +2108,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	/* Kick pending requests. */
 	spin_lock_irq(&info->io_lock);
 	info->connected = BLKIF_STATE_CONNECTED;
-	kick_pending_request_queues(info);
+	kick_pending_request_queues(rinfo);
 	spin_unlock_irq(&info->io_lock);
 
 	add_disk(info->gd);
-- 
cgit 


From 3df0e5059908b8fdba351c4b5dd77caadd95a949 Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Sat, 14 Nov 2015 11:12:12 +0800
Subject: xen/blkfront: pseudo support for multi hardware queues/rings

Preparatory patch for multiple hardware queues (rings). The number of
rings is unconditionally set to 1, larger number will be enabled in
patch "xen/blkfront: negotiate number of queues/rings to be used with backend"
so as to make review easier.

Note that blkfront_gather_backend_features does not call
blkfront_setup_indirect anymore (as that needs to be done per ring).
That means that in blkif_recover/blkif_connect we have to do it in a loop
(bounded by nr_rings).

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 343 +++++++++++++++++++++++++------------------
 1 file changed, 198 insertions(+), 145 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0c3ad214a792..0638b1722a40 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -150,6 +150,7 @@ struct blkfront_info
 	int vdevice;
 	blkif_vdev_t handle;
 	enum blkif_state connected;
+	/* Number of pages per ring buffer. */
 	unsigned int nr_ring_pages;
 	struct request_queue *rq;
 	struct list_head grants;
@@ -164,7 +165,8 @@ struct blkfront_info
 	unsigned int max_indirect_segments;
 	int is_ready;
 	struct blk_mq_tag_set tag_set;
-	struct blkfront_ring_info rinfo;
+	struct blkfront_ring_info *rinfo;
+	unsigned int nr_rings;
 };
 
 static unsigned int nr_minors;
@@ -209,7 +211,7 @@ static DEFINE_SPINLOCK(minor_lock);
 #define GREFS(_psegs)	((_psegs) * GRANTS_PER_PSEG)
 
 static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
-static int blkfront_gather_backend_features(struct blkfront_info *info);
+static void blkfront_gather_backend_features(struct blkfront_info *info);
 
 static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
 {
@@ -338,8 +340,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
 		struct page *indirect_page;
 
 		/* Fetch a pre-allocated page to use for indirect grefs */
-		BUG_ON(list_empty(&info->rinfo.indirect_pages));
-		indirect_page = list_first_entry(&info->rinfo.indirect_pages,
+		BUG_ON(list_empty(&info->rinfo->indirect_pages));
+		indirect_page = list_first_entry(&info->rinfo->indirect_pages,
 						 struct page, lru);
 		list_del(&indirect_page->lru);
 		gnt_list_entry->page = indirect_page;
@@ -597,7 +599,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	 * existing persistent grants, or if we have to get new grants,
 	 * as there are not sufficiently many free.
 	 */
-	bool new_persistent_gnts;
 	struct scatterlist *sg;
 	int num_sg, max_grefs, num_grant;
 
@@ -609,12 +610,12 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 		 */
 		max_grefs += INDIRECT_GREFS(max_grefs);
 
-	/* Check if we have enough grants to allocate a requests */
-	if (info->persistent_gnts_c < max_grefs) {
-		new_persistent_gnts = 1;
-		if (gnttab_alloc_grant_references(
-		    max_grefs - info->persistent_gnts_c,
-		    &setup.gref_head) < 0) {
+	/*
+	 * We have to reserve 'max_grefs' grants because persistent
+	 * grants are shared by all rings.
+	 */
+	if (max_grefs > 0)
+		if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) {
 			gnttab_request_free_callback(
 				&rinfo->callback,
 				blkif_restart_queue_callback,
@@ -622,8 +623,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 				max_grefs);
 			return 1;
 		}
-	} else
-		new_persistent_gnts = 0;
 
 	/* Fill out a communications ring structure. */
 	ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
@@ -712,7 +711,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	/* Keep a private copy so we can reissue requests when recovering. */
 	rinfo->shadow[id].req = *ring_req;
 
-	if (new_persistent_gnts)
+	if (max_grefs > 0)
 		gnttab_free_grant_references(setup.gref_head);
 
 	return 0;
@@ -791,7 +790,8 @@ static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 {
 	struct blkfront_info *info = (struct blkfront_info *)data;
 
-	hctx->driver_data = &info->rinfo;
+	BUG_ON(info->nr_rings <= index);
+	hctx->driver_data = &info->rinfo[index];
 	return 0;
 }
 
@@ -1050,8 +1050,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 
 static void xlvbd_release_gendisk(struct blkfront_info *info)
 {
-	unsigned int minor, nr_minors;
-	struct blkfront_ring_info *rinfo = &info->rinfo;
+	unsigned int minor, nr_minors, i;
 
 	if (info->rq == NULL)
 		return;
@@ -1059,11 +1058,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
 	/* No more blkif_request(). */
 	blk_mq_stop_hw_queues(info->rq);
 
-	/* No more gnttab callback work. */
-	gnttab_cancel_free_callback(&rinfo->callback);
+	for (i = 0; i < info->nr_rings; i++) {
+		struct blkfront_ring_info *rinfo = &info->rinfo[i];
 
-	/* Flush gnttab callback work. Must be done with no locks held. */
-	flush_work(&rinfo->work);
+		/* No more gnttab callback work. */
+		gnttab_cancel_free_callback(&rinfo->callback);
+
+		/* Flush gnttab callback work. Must be done with no locks held. */
+		flush_work(&rinfo->work);
+	}
 
 	del_gendisk(info->gd);
 
@@ -1096,37 +1099,11 @@ static void blkif_restart_queue(struct work_struct *work)
 	spin_unlock_irq(&rinfo->dev_info->io_lock);
 }
 
-static void blkif_free(struct blkfront_info *info, int suspend)
+static void blkif_free_ring(struct blkfront_ring_info *rinfo)
 {
 	struct grant *persistent_gnt;
-	struct grant *n;
+	struct blkfront_info *info = rinfo->dev_info;
 	int i, j, segs;
-	struct blkfront_ring_info *rinfo = &info->rinfo;
-
-	/* Prevent new requests being issued until we fix things up. */
-	spin_lock_irq(&info->io_lock);
-	info->connected = suspend ?
-		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
-	/* No more blkif_request(). */
-	if (info->rq)
-		blk_mq_stop_hw_queues(info->rq);
-
-	/* Remove all persistent grants */
-	if (!list_empty(&info->grants)) {
-		list_for_each_entry_safe(persistent_gnt, n,
-					 &info->grants, node) {
-			list_del(&persistent_gnt->node);
-			if (persistent_gnt->gref != GRANT_INVALID_REF) {
-				gnttab_end_foreign_access(persistent_gnt->gref,
-				                          0, 0UL);
-				info->persistent_gnts_c--;
-			}
-			if (info->feature_persistent)
-				__free_page(persistent_gnt->page);
-			kfree(persistent_gnt);
-		}
-	}
-	BUG_ON(info->persistent_gnts_c != 0);
 
 	/*
 	 * Remove indirect pages, this only happens when using indirect
@@ -1186,7 +1163,6 @@ free_shadow:
 
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&rinfo->callback);
-	spin_unlock_irq(&info->io_lock);
 
 	/* Flush gnttab callback work. Must be done with no locks held. */
 	flush_work(&rinfo->work);
@@ -1204,7 +1180,45 @@ free_shadow:
 	if (rinfo->irq)
 		unbind_from_irqhandler(rinfo->irq, rinfo);
 	rinfo->evtchn = rinfo->irq = 0;
+}
 
+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+	struct grant *persistent_gnt, *n;
+	unsigned int i;
+
+	/* Prevent new requests being issued until we fix things up. */
+	spin_lock_irq(&info->io_lock);
+	info->connected = suspend ?
+		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+	/* No more blkif_request(). */
+	if (info->rq)
+		blk_mq_stop_hw_queues(info->rq);
+
+	/* Remove all persistent grants */
+	if (!list_empty(&info->grants)) {
+		list_for_each_entry_safe(persistent_gnt, n,
+					 &info->grants, node) {
+			list_del(&persistent_gnt->node);
+			if (persistent_gnt->gref != GRANT_INVALID_REF) {
+				gnttab_end_foreign_access(persistent_gnt->gref,
+							  0, 0UL);
+				info->persistent_gnts_c--;
+			}
+			if (info->feature_persistent)
+				__free_page(persistent_gnt->page);
+			kfree(persistent_gnt);
+		}
+	}
+	BUG_ON(info->persistent_gnts_c != 0);
+
+	for (i = 0; i < info->nr_rings; i++)
+		blkif_free_ring(&info->rinfo[i]);
+
+	kfree(info->rinfo);
+	info->rinfo = NULL;
+	info->nr_rings = 0;
+	spin_unlock_irq(&info->io_lock);
 }
 
 struct copy_from_grant {
@@ -1492,7 +1506,7 @@ static int talk_to_blkback(struct xenbus_device *dev,
 	int err, i;
 	unsigned int max_page_order = 0;
 	unsigned int ring_page_order = 0;
-	struct blkfront_ring_info *rinfo = &info->rinfo;
+	struct blkfront_ring_info *rinfo;
 
 	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
 			   "max-ring-page-order", "%u", &max_page_order);
@@ -1503,10 +1517,13 @@ static int talk_to_blkback(struct xenbus_device *dev,
 		info->nr_ring_pages = 1 << ring_page_order;
 	}
 
-	/* Create shared ring, alloc event channel. */
-	err = setup_blkring(dev, rinfo);
-	if (err)
-		goto out;
+	for (i = 0; i < info->nr_rings; i++) {
+		rinfo = &info->rinfo[i];
+		/* Create shared ring, alloc event channel. */
+		err = setup_blkring(dev, rinfo);
+		if (err)
+			goto destroy_blkring;
+	}
 
 again:
 	err = xenbus_transaction_start(&xbt);
@@ -1515,37 +1532,43 @@ again:
 		goto destroy_blkring;
 	}
 
-	if (info->nr_ring_pages == 1) {
-		err = xenbus_printf(xbt, dev->nodename,
-				    "ring-ref", "%u", rinfo->ring_ref[0]);
-		if (err) {
-			message = "writing ring-ref";
-			goto abort_transaction;
-		}
-	} else {
-		err = xenbus_printf(xbt, dev->nodename,
-				    "ring-page-order", "%u", ring_page_order);
-		if (err) {
-			message = "writing ring-page-order";
-			goto abort_transaction;
-		}
-
-		for (i = 0; i < info->nr_ring_pages; i++) {
-			char ring_ref_name[RINGREF_NAME_LEN];
-
-			snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
-			err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
-					    "%u", rinfo->ring_ref[i]);
+	if (info->nr_rings == 1) {
+		rinfo = &info->rinfo[0];
+		if (info->nr_ring_pages == 1) {
+			err = xenbus_printf(xbt, dev->nodename,
+					    "ring-ref", "%u", rinfo->ring_ref[0]);
 			if (err) {
 				message = "writing ring-ref";
 				goto abort_transaction;
 			}
+		} else {
+			err = xenbus_printf(xbt, dev->nodename,
+					    "ring-page-order", "%u", ring_page_order);
+			if (err) {
+				message = "writing ring-page-order";
+				goto abort_transaction;
+			}
+
+			for (i = 0; i < info->nr_ring_pages; i++) {
+				char ring_ref_name[RINGREF_NAME_LEN];
+
+				snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+				err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
+						    "%u", rinfo->ring_ref[i]);
+				if (err) {
+					message = "writing ring-ref";
+					goto abort_transaction;
+				}
+			}
 		}
-	}
-	err = xenbus_printf(xbt, dev->nodename,
-			    "event-channel", "%u", rinfo->evtchn);
-	if (err) {
-		message = "writing event-channel";
+		err = xenbus_printf(xbt, dev->nodename,
+				    "event-channel", "%u", rinfo->evtchn);
+		if (err) {
+			message = "writing event-channel";
+			goto abort_transaction;
+		}
+	} else {
+		/* Not supported at this stage. */
 		goto abort_transaction;
 	}
 	err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
@@ -1568,9 +1591,15 @@ again:
 		goto destroy_blkring;
 	}
 
-	for (i = 0; i < BLK_RING_SIZE(info); i++)
-		rinfo->shadow[i].req.u.rw.id = i+1;
-	rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+	for (i = 0; i < info->nr_rings; i++) {
+		unsigned int j;
+
+		rinfo = &info->rinfo[i];
+
+		for (j = 0; j < BLK_RING_SIZE(info); j++)
+			rinfo->shadow[j].req.u.rw.id = j + 1;
+		rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+	}
 	xenbus_switch_state(dev, XenbusStateInitialised);
 
 	return 0;
@@ -1581,7 +1610,7 @@ again:
 		xenbus_dev_fatal(dev, err, "%s", message);
  destroy_blkring:
 	blkif_free(info, 0);
- out:
+
 	return err;
 }
 
@@ -1595,8 +1624,8 @@ static int blkfront_probe(struct xenbus_device *dev,
 			  const struct xenbus_device_id *id)
 {
 	int err, vdevice;
+	unsigned int r_index;
 	struct blkfront_info *info;
-	struct blkfront_ring_info *rinfo;
 
 	/* FIXME: Use dynamic device id if this is not set. */
 	err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -1646,10 +1675,22 @@ static int blkfront_probe(struct xenbus_device *dev,
 		return -ENOMEM;
 	}
 
-	rinfo = &info->rinfo;
-	INIT_LIST_HEAD(&rinfo->indirect_pages);
-	rinfo->dev_info = info;
-	INIT_WORK(&rinfo->work, blkif_restart_queue);
+	info->nr_rings = 1;
+	info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
+	if (!info->rinfo) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure");
+		kfree(info);
+		return -ENOMEM;
+	}
+
+	for (r_index = 0; r_index < info->nr_rings; r_index++) {
+		struct blkfront_ring_info *rinfo;
+
+		rinfo = &info->rinfo[r_index];
+		INIT_LIST_HEAD(&rinfo->indirect_pages);
+		rinfo->dev_info = info;
+		INIT_WORK(&rinfo->work, blkif_restart_queue);
+	}
 
 	mutex_init(&info->mutex);
 	spin_lock_init(&info->io_lock);
@@ -1681,7 +1722,7 @@ static void split_bio_end(struct bio *bio)
 
 static int blkif_recover(struct blkfront_info *info)
 {
-	int i;
+	unsigned int i, r_index;
 	struct request *req, *n;
 	struct blk_shadow *copy;
 	int rc;
@@ -1691,57 +1732,62 @@ static int blkif_recover(struct blkfront_info *info)
 	int pending, size;
 	struct split_bio *split_bio;
 	struct list_head requests;
-	struct blkfront_ring_info *rinfo = &info->rinfo;
-
-	/* Stage 1: Make a safe copy of the shadow state. */
-	copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
-		       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
-	if (!copy)
-		return -ENOMEM;
-
-	/* Stage 2: Set up free list. */
-	memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
-	for (i = 0; i < BLK_RING_SIZE(info); i++)
-		rinfo->shadow[i].req.u.rw.id = i+1;
-	rinfo->shadow_free = rinfo->ring.req_prod_pvt;
-	rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
-
-	rc = blkfront_gather_backend_features(info);
-	if (rc) {
-		kfree(copy);
-		return rc;
-	}
 
+	blkfront_gather_backend_features(info);
 	segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
 	blk_queue_max_segments(info->rq, segs);
 	bio_list_init(&bio_list);
 	INIT_LIST_HEAD(&requests);
-	for (i = 0; i < BLK_RING_SIZE(info); i++) {
-		/* Not in use? */
-		if (!copy[i].request)
-			continue;
 
-		/*
-		 * Get the bios in the request so we can re-queue them.
-		 */
-		if (copy[i].request->cmd_flags &
-		    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+	for (r_index = 0; r_index < info->nr_rings; r_index++) {
+		struct blkfront_ring_info *rinfo;
+
+		rinfo = &info->rinfo[r_index];
+		/* Stage 1: Make a safe copy of the shadow state. */
+		copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
+			       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
+		if (!copy)
+			return -ENOMEM;
+
+		/* Stage 2: Set up free list. */
+		memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
+		for (i = 0; i < BLK_RING_SIZE(info); i++)
+			rinfo->shadow[i].req.u.rw.id = i+1;
+		rinfo->shadow_free = rinfo->ring.req_prod_pvt;
+		rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+
+		rc = blkfront_setup_indirect(rinfo);
+		if (rc) {
+			kfree(copy);
+			return rc;
+		}
+
+		for (i = 0; i < BLK_RING_SIZE(info); i++) {
+			/* Not in use? */
+			if (!copy[i].request)
+				continue;
+
 			/*
-			 * Flush operations don't contain bios, so
-			 * we need to requeue the whole request
+			 * Get the bios in the request so we can re-queue them.
 			 */
-			list_add(&copy[i].request->queuelist, &requests);
-			continue;
+			if (copy[i].request->cmd_flags &
+			    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+				/*
+				 * Flush operations don't contain bios, so
+				 * we need to requeue the whole request
+				 */
+				list_add(&copy[i].request->queuelist, &requests);
+				continue;
+			}
+			merge_bio.head = copy[i].request->bio;
+			merge_bio.tail = copy[i].request->biotail;
+			bio_list_merge(&bio_list, &merge_bio);
+			copy[i].request->bio = NULL;
+			blk_end_request_all(copy[i].request, 0);
 		}
-		merge_bio.head = copy[i].request->bio;
-		merge_bio.tail = copy[i].request->biotail;
-		bio_list_merge(&bio_list, &merge_bio);
-		copy[i].request->bio = NULL;
-		blk_end_request_all(copy[i].request, 0);
-	}
-
-	kfree(copy);
 
+		kfree(copy);
+	}
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
 	spin_lock_irq(&info->io_lock);
@@ -1749,8 +1795,13 @@ static int blkif_recover(struct blkfront_info *info)
 	/* Now safe for us to use the shared ring */
 	info->connected = BLKIF_STATE_CONNECTED;
 
-	/* Kick any other new requests queued since we resumed */
-	kick_pending_request_queues(rinfo);
+	for (r_index = 0; r_index < info->nr_rings; r_index++) {
+		struct blkfront_ring_info *rinfo;
+
+		rinfo = &info->rinfo[r_index];
+		/* Kick any other new requests queued since we resumed */
+		kick_pending_request_queues(rinfo);
+	}
 
 	list_for_each_entry_safe(req, n, &requests, queuelist) {
 		/* Requeue pending requests (flush or discard) */
@@ -1961,7 +2012,7 @@ out_of_memory:
 /*
  * Gather all backend feature-*
  */
-static int blkfront_gather_backend_features(struct blkfront_info *info)
+static void blkfront_gather_backend_features(struct blkfront_info *info)
 {
 	int err;
 	int barrier, flush, discard, persistent;
@@ -2016,8 +2067,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info)
 	else
 		info->max_indirect_segments = min(indirect_segments,
 						  xen_blkif_max_segments);
-
-	return blkfront_setup_indirect(&info->rinfo);
 }
 
 /*
@@ -2030,8 +2079,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	unsigned long sector_size;
 	unsigned int physical_sector_size;
 	unsigned int binfo;
-	int err;
-	struct blkfront_ring_info *rinfo = &info->rinfo;
+	int err, i;
 
 	switch (info->connected) {
 	case BLKIF_STATE_CONNECTED:
@@ -2088,11 +2136,15 @@ static void blkfront_connect(struct blkfront_info *info)
 	if (err != 1)
 		physical_sector_size = sector_size;
 
-	err = blkfront_gather_backend_features(info);
-	if (err) {
-		xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
-				 info->xbdev->otherend);
-		return;
+	blkfront_gather_backend_features(info);
+	for (i = 0; i < info->nr_rings; i++) {
+		err = blkfront_setup_indirect(&info->rinfo[i]);
+		if (err) {
+			xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+					 info->xbdev->otherend);
+			blkif_free(info, 0);
+			break;
+		}
 	}
 
 	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
@@ -2108,7 +2160,8 @@ static void blkfront_connect(struct blkfront_info *info)
 	/* Kick pending requests. */
 	spin_lock_irq(&info->io_lock);
 	info->connected = BLKIF_STATE_CONNECTED;
-	kick_pending_request_queues(rinfo);
+	for (i = 0; i < info->nr_rings; i++)
+		kick_pending_request_queues(&info->rinfo[i]);
 	spin_unlock_irq(&info->io_lock);
 
 	add_disk(info->gd);
-- 
cgit 


From 11659569f7202d0cb6553e81f9b8aa04dfeb94ce Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Sat, 14 Nov 2015 11:12:13 +0800
Subject: xen/blkfront: split per device io_lock

After patch "xen/blkfront: separate per ring information out of device
info", per-ring data is protected by a per-device lock ('io_lock').

This is not a good way and will effect the scalability, so introduce a
per-ring lock ('ring_lock').

The old 'io_lock' is renamed to 'dev_lock' which protects the ->grants list and
->persistent_gnts_c which are shared by all rings.

Note that in 'blkfront_probe' the 'blkfront_info' is setup via kzalloc
so setting ->persistent_gnts_c to zero is not needed.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 73 ++++++++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 26 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0638b1722a40..a9058bbdaa6b 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -125,6 +125,8 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
  *  depending on how many hardware queues/rings to be used.
  */
 struct blkfront_ring_info {
+	/* Lock to protect data in every ring buffer. */
+	spinlock_t ring_lock;
 	struct blkif_front_ring ring;
 	unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
 	unsigned int evtchn, irq;
@@ -143,7 +145,6 @@ struct blkfront_ring_info {
  */
 struct blkfront_info
 {
-	spinlock_t io_lock;
 	struct mutex mutex;
 	struct xenbus_device *xbdev;
 	struct gendisk *gd;
@@ -153,6 +154,11 @@ struct blkfront_info
 	/* Number of pages per ring buffer. */
 	unsigned int nr_ring_pages;
 	struct request_queue *rq;
+	/*
+	 * Lock to protect info->grants list and persistent_gnts_c shared by all
+	 * rings.
+	 */
+	spinlock_t dev_lock;
 	struct list_head grants;
 	unsigned int persistent_gnts_c;
 	unsigned int feature_flush;
@@ -258,7 +264,9 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
 		}
 
 		gnt_list_entry->gref = GRANT_INVALID_REF;
+		spin_lock_irq(&info->dev_lock);
 		list_add(&gnt_list_entry->node, &info->grants);
+		spin_unlock_irq(&info->dev_lock);
 		i++;
 	}
 
@@ -267,7 +275,9 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
 out_of_memory:
 	list_for_each_entry_safe(gnt_list_entry, n,
 	                         &info->grants, node) {
+		spin_lock_irq(&info->dev_lock);
 		list_del(&gnt_list_entry->node);
+		spin_unlock_irq(&info->dev_lock);
 		if (info->feature_persistent)
 			__free_page(gnt_list_entry->page);
 		kfree(gnt_list_entry);
@@ -280,7 +290,9 @@ out_of_memory:
 static struct grant *get_free_grant(struct blkfront_info *info)
 {
 	struct grant *gnt_list_entry;
+	unsigned long flags;
 
+	spin_lock_irqsave(&info->dev_lock, flags);
 	BUG_ON(list_empty(&info->grants));
 	gnt_list_entry = list_first_entry(&info->grants, struct grant,
 					  node);
@@ -288,6 +300,7 @@ static struct grant *get_free_grant(struct blkfront_info *info)
 
 	if (gnt_list_entry->gref != GRANT_INVALID_REF)
 		info->persistent_gnts_c--;
+	spin_unlock_irqrestore(&info->dev_lock, flags);
 
 	return gnt_list_entry;
 }
@@ -757,11 +770,11 @@ static inline bool blkif_request_flush_invalid(struct request *req,
 static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
 			   const struct blk_mq_queue_data *qd)
 {
+	unsigned long flags;
 	struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data;
-	struct blkfront_info *info = rinfo->dev_info;
 
 	blk_mq_start_request(qd->rq);
-	spin_lock_irq(&info->io_lock);
+	spin_lock_irqsave(&rinfo->ring_lock, flags);
 	if (RING_FULL(&rinfo->ring))
 		goto out_busy;
 
@@ -772,15 +785,15 @@ static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
 		goto out_busy;
 
 	flush_requests(rinfo);
-	spin_unlock_irq(&info->io_lock);
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
 	return BLK_MQ_RQ_QUEUE_OK;
 
 out_err:
-	spin_unlock_irq(&info->io_lock);
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
 	return BLK_MQ_RQ_QUEUE_ERROR;
 
 out_busy:
-	spin_unlock_irq(&info->io_lock);
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
 	blk_mq_stop_hw_queue(hctx);
 	return BLK_MQ_RQ_QUEUE_BUSY;
 }
@@ -1082,21 +1095,28 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
 	info->gd = NULL;
 }
 
-/* Must be called with io_lock holded */
-static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
+/* Already hold rinfo->ring_lock. */
+static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
 {
 	if (!RING_FULL(&rinfo->ring))
 		blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
 }
 
+static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rinfo->ring_lock, flags);
+	kick_pending_request_queues_locked(rinfo);
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+}
+
 static void blkif_restart_queue(struct work_struct *work)
 {
 	struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work);
 
-	spin_lock_irq(&rinfo->dev_info->io_lock);
 	if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
 		kick_pending_request_queues(rinfo);
-	spin_unlock_irq(&rinfo->dev_info->io_lock);
 }
 
 static void blkif_free_ring(struct blkfront_ring_info *rinfo)
@@ -1188,7 +1208,6 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 	unsigned int i;
 
 	/* Prevent new requests being issued until we fix things up. */
-	spin_lock_irq(&info->io_lock);
 	info->connected = suspend ?
 		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
 	/* No more blkif_request(). */
@@ -1196,6 +1215,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 		blk_mq_stop_hw_queues(info->rq);
 
 	/* Remove all persistent grants */
+	spin_lock_irq(&info->dev_lock);
 	if (!list_empty(&info->grants)) {
 		list_for_each_entry_safe(persistent_gnt, n,
 					 &info->grants, node) {
@@ -1211,6 +1231,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 		}
 	}
 	BUG_ON(info->persistent_gnts_c != 0);
+	spin_unlock_irq(&info->dev_lock);
 
 	for (i = 0; i < info->nr_rings; i++)
 		blkif_free_ring(&info->rinfo[i]);
@@ -1218,7 +1239,6 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 	kfree(info->rinfo);
 	info->rinfo = NULL;
 	info->nr_rings = 0;
-	spin_unlock_irq(&info->io_lock);
 }
 
 struct copy_from_grant {
@@ -1253,6 +1273,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 	int i = 0;
 	struct scatterlist *sg;
 	int num_sg, num_grant;
+	unsigned long flags;
 	struct blkfront_info *info = rinfo->dev_info;
 	struct copy_from_grant data = {
 		.s = s,
@@ -1291,8 +1312,10 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 			if (!info->feature_persistent)
 				pr_alert_ratelimited("backed has not unmapped grant: %u\n",
 						     s->grants_used[i]->gref);
+			spin_lock_irqsave(&info->dev_lock, flags);
 			list_add(&s->grants_used[i]->node, &info->grants);
 			info->persistent_gnts_c++;
+			spin_unlock_irqrestore(&info->dev_lock, flags);
 		} else {
 			/*
 			 * If the grant is not mapped by the backend we end the
@@ -1302,7 +1325,9 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 			 */
 			gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
 			s->grants_used[i]->gref = GRANT_INVALID_REF;
+			spin_lock_irqsave(&info->dev_lock, flags);
 			list_add_tail(&s->grants_used[i]->node, &info->grants);
+			spin_unlock_irqrestore(&info->dev_lock, flags);
 		}
 	}
 	if (s->req.operation == BLKIF_OP_INDIRECT) {
@@ -1311,8 +1336,10 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 				if (!info->feature_persistent)
 					pr_alert_ratelimited("backed has not unmapped grant: %u\n",
 							     s->indirect_grants[i]->gref);
+				spin_lock_irqsave(&info->dev_lock, flags);
 				list_add(&s->indirect_grants[i]->node, &info->grants);
 				info->persistent_gnts_c++;
+				spin_unlock_irqrestore(&info->dev_lock, flags);
 			} else {
 				struct page *indirect_page;
 
@@ -1326,7 +1353,9 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 					list_add(&indirect_page->lru, &rinfo->indirect_pages);
 				}
 				s->indirect_grants[i]->gref = GRANT_INVALID_REF;
+				spin_lock_irqsave(&info->dev_lock, flags);
 				list_add_tail(&s->indirect_grants[i]->node, &info->grants);
+				spin_unlock_irqrestore(&info->dev_lock, flags);
 			}
 		}
 	}
@@ -1342,13 +1371,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 	struct blkfront_info *info = rinfo->dev_info;
 	int error;
 
-	spin_lock_irqsave(&info->io_lock, flags);
-
-	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
-		spin_unlock_irqrestore(&info->io_lock, flags);
+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
 		return IRQ_HANDLED;
-	}
 
+	spin_lock_irqsave(&rinfo->ring_lock, flags);
  again:
 	rp = rinfo->ring.sring->rsp_prod;
 	rmb(); /* Ensure we see queued responses up to 'rp'. */
@@ -1439,9 +1465,9 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 	} else
 		rinfo->ring.sring->rsp_event = i + 1;
 
-	kick_pending_request_queues(rinfo);
+	kick_pending_request_queues_locked(rinfo);
 
-	spin_unlock_irqrestore(&info->io_lock, flags);
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
 
 	return IRQ_HANDLED;
 }
@@ -1690,14 +1716,14 @@ static int blkfront_probe(struct xenbus_device *dev,
 		INIT_LIST_HEAD(&rinfo->indirect_pages);
 		rinfo->dev_info = info;
 		INIT_WORK(&rinfo->work, blkif_restart_queue);
+		spin_lock_init(&rinfo->ring_lock);
 	}
 
 	mutex_init(&info->mutex);
-	spin_lock_init(&info->io_lock);
+	spin_lock_init(&info->dev_lock);
 	info->xbdev = dev;
 	info->vdevice = vdevice;
 	INIT_LIST_HEAD(&info->grants);
-	info->persistent_gnts_c = 0;
 	info->connected = BLKIF_STATE_DISCONNECTED;
 
 	/* Front end dir is a number, which is used as the id. */
@@ -1790,8 +1816,6 @@ static int blkif_recover(struct blkfront_info *info)
 	}
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
-	spin_lock_irq(&info->io_lock);
-
 	/* Now safe for us to use the shared ring */
 	info->connected = BLKIF_STATE_CONNECTED;
 
@@ -1809,7 +1833,6 @@ static int blkif_recover(struct blkfront_info *info)
 		BUG_ON(req->nr_phys_segments > segs);
 		blk_mq_requeue_request(req);
 	}
-	spin_unlock_irq(&info->io_lock);
 	blk_mq_kick_requeue_list(info->rq);
 
 	while ((bio = bio_list_pop(&bio_list)) != NULL) {
@@ -2158,11 +2181,9 @@ static void blkfront_connect(struct blkfront_info *info)
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
 	/* Kick pending requests. */
-	spin_lock_irq(&info->io_lock);
 	info->connected = BLKIF_STATE_CONNECTED;
 	for (i = 0; i < info->nr_rings; i++)
 		kick_pending_request_queues(&info->rinfo[i]);
-	spin_unlock_irq(&info->io_lock);
 
 	add_disk(info->gd);
 
-- 
cgit 


From 28d949bcc28bbc2d206f9c3f69b892575e81c040 Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Sat, 14 Nov 2015 11:12:14 +0800
Subject: xen/blkfront: negotiate number of queues/rings to be used with
 backend

The max number of hardware queues for xen/blkfront is set by parameter
'max_queues'(default 4), while it is also capped by the max value that the
xen/blkback exposes through XenStore key 'multi-queue-max-queues'.

The negotiated number is the smaller one and would be written back to xenstore
as "multi-queue-num-queues", blkback needs to read this negotiated number.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 160 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 119 insertions(+), 41 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index a9058bbdaa6b..81c87e69cbe9 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -99,6 +99,10 @@ static unsigned int xen_blkif_max_segments = 32;
 module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
 MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
 
+static unsigned int xen_blkif_max_queues = 4;
+module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO);
+MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
+
 /*
  * Maximum order of pages to be used for the shared ring between front and
  * backend, 4KB page granularity is used.
@@ -118,6 +122,10 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
  * characters are enough. Define to 20 to keep consist with backend.
  */
 #define RINGREF_NAME_LEN (20)
+/*
+ * queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
+ */
+#define QUEUE_NAME_LEN (17)
 
 /*
  *  Per-ring info.
@@ -823,7 +831,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
 
 	memset(&info->tag_set, 0, sizeof(info->tag_set));
 	info->tag_set.ops = &blkfront_mq_ops;
-	info->tag_set.nr_hw_queues = 1;
+	info->tag_set.nr_hw_queues = info->nr_rings;
 	info->tag_set.queue_depth =  BLK_RING_SIZE(info);
 	info->tag_set.numa_node = NUMA_NO_NODE;
 	info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
@@ -1522,6 +1530,53 @@ fail:
 	return err;
 }
 
+/*
+ * Write out per-ring/queue nodes including ring-ref and event-channel, and each
+ * ring buffer may have multi pages depending on ->nr_ring_pages.
+ */
+static int write_per_ring_nodes(struct xenbus_transaction xbt,
+				struct blkfront_ring_info *rinfo, const char *dir)
+{
+	int err;
+	unsigned int i;
+	const char *message = NULL;
+	struct blkfront_info *info = rinfo->dev_info;
+
+	if (info->nr_ring_pages == 1) {
+		err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]);
+		if (err) {
+			message = "writing ring-ref";
+			goto abort_transaction;
+		}
+	} else {
+		for (i = 0; i < info->nr_ring_pages; i++) {
+			char ring_ref_name[RINGREF_NAME_LEN];
+
+			snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+			err = xenbus_printf(xbt, dir, ring_ref_name,
+					    "%u", rinfo->ring_ref[i]);
+			if (err) {
+				message = "writing ring-ref";
+				goto abort_transaction;
+			}
+		}
+	}
+
+	err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn);
+	if (err) {
+		message = "writing event-channel";
+		goto abort_transaction;
+	}
+
+	return 0;
+
+abort_transaction:
+	xenbus_transaction_end(xbt, 1);
+	if (message)
+		xenbus_dev_fatal(info->xbdev, err, "%s", message);
+
+	return err;
+}
 
 /* Common code used when first setting up, and when resuming. */
 static int talk_to_blkback(struct xenbus_device *dev,
@@ -1529,10 +1584,9 @@ static int talk_to_blkback(struct xenbus_device *dev,
 {
 	const char *message = NULL;
 	struct xenbus_transaction xbt;
-	int err, i;
-	unsigned int max_page_order = 0;
+	int err;
+	unsigned int i, max_page_order = 0;
 	unsigned int ring_page_order = 0;
-	struct blkfront_ring_info *rinfo;
 
 	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
 			   "max-ring-page-order", "%u", &max_page_order);
@@ -1544,7 +1598,8 @@ static int talk_to_blkback(struct xenbus_device *dev,
 	}
 
 	for (i = 0; i < info->nr_rings; i++) {
-		rinfo = &info->rinfo[i];
+		struct blkfront_ring_info *rinfo = &info->rinfo[i];
+
 		/* Create shared ring, alloc event channel. */
 		err = setup_blkring(dev, rinfo);
 		if (err)
@@ -1558,44 +1613,49 @@ again:
 		goto destroy_blkring;
 	}
 
-	if (info->nr_rings == 1) {
-		rinfo = &info->rinfo[0];
-		if (info->nr_ring_pages == 1) {
-			err = xenbus_printf(xbt, dev->nodename,
-					    "ring-ref", "%u", rinfo->ring_ref[0]);
-			if (err) {
-				message = "writing ring-ref";
-				goto abort_transaction;
-			}
-		} else {
-			err = xenbus_printf(xbt, dev->nodename,
-					    "ring-page-order", "%u", ring_page_order);
-			if (err) {
-				message = "writing ring-page-order";
-				goto abort_transaction;
-			}
+	if (info->nr_ring_pages > 1) {
+		err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u",
+				    ring_page_order);
+		if (err) {
+			message = "writing ring-page-order";
+			goto abort_transaction;
+		}
+	}
 
-			for (i = 0; i < info->nr_ring_pages; i++) {
-				char ring_ref_name[RINGREF_NAME_LEN];
+	/* We already got the number of queues/rings in _probe */
+	if (info->nr_rings == 1) {
+		err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename);
+		if (err)
+			goto destroy_blkring;
+	} else {
+		char *path;
+		size_t pathsize;
 
-				snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
-				err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
-						    "%u", rinfo->ring_ref[i]);
-				if (err) {
-					message = "writing ring-ref";
-					goto abort_transaction;
-				}
-			}
-		}
-		err = xenbus_printf(xbt, dev->nodename,
-				    "event-channel", "%u", rinfo->evtchn);
+		err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u",
+				    info->nr_rings);
 		if (err) {
-			message = "writing event-channel";
+			message = "writing multi-queue-num-queues";
 			goto abort_transaction;
 		}
-	} else {
-		/* Not supported at this stage. */
-		goto abort_transaction;
+
+		pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN;
+		path = kmalloc(pathsize, GFP_KERNEL);
+		if (!path) {
+			err = -ENOMEM;
+			message = "ENOMEM while writing ring references";
+			goto abort_transaction;
+		}
+
+		for (i = 0; i < info->nr_rings; i++) {
+			memset(path, 0, pathsize);
+			snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i);
+			err = write_per_ring_nodes(xbt, &info->rinfo[i], path);
+			if (err) {
+				kfree(path);
+				goto destroy_blkring;
+			}
+		}
+		kfree(path);
 	}
 	err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
 			    XEN_IO_PROTO_ABI_NATIVE);
@@ -1619,8 +1679,7 @@ again:
 
 	for (i = 0; i < info->nr_rings; i++) {
 		unsigned int j;
-
-		rinfo = &info->rinfo[i];
+		struct blkfront_ring_info *rinfo = &info->rinfo[i];
 
 		for (j = 0; j < BLK_RING_SIZE(info); j++)
 			rinfo->shadow[j].req.u.rw.id = j + 1;
@@ -1652,6 +1711,7 @@ static int blkfront_probe(struct xenbus_device *dev,
 	int err, vdevice;
 	unsigned int r_index;
 	struct blkfront_info *info;
+	unsigned int backend_max_queues = 0;
 
 	/* FIXME: Use dynamic device id if this is not set. */
 	err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -1701,7 +1761,18 @@ static int blkfront_probe(struct xenbus_device *dev,
 		return -ENOMEM;
 	}
 
-	info->nr_rings = 1;
+	info->xbdev = dev;
+	/* Check if backend supports multiple queues. */
+	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+			   "multi-queue-max-queues", "%u", &backend_max_queues);
+	if (err < 0)
+		backend_max_queues = 1;
+
+	info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
+	/* We need at least one ring. */
+	if (!info->nr_rings)
+		info->nr_rings = 1;
+
 	info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
 	if (!info->rinfo) {
 		xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure");
@@ -2390,6 +2461,7 @@ static struct xenbus_driver blkfront_driver = {
 static int __init xlblk_init(void)
 {
 	int ret;
+	int nr_cpus = num_online_cpus();
 
 	if (!xen_domain())
 		return -ENODEV;
@@ -2400,6 +2472,12 @@ static int __init xlblk_init(void)
 		xen_blkif_max_ring_order = 0;
 	}
 
+	if (xen_blkif_max_queues > nr_cpus) {
+		pr_info("Invalid max_queues (%d), will use default max: %d.\n",
+			xen_blkif_max_queues, nr_cpus);
+		xen_blkif_max_queues = nr_cpus;
+	}
+
 	if (!xen_has_pv_disk_devices())
 		return -ENODEV;
 
-- 
cgit 


From 6f03a7ff89485f0a7a559bf5c7631d2986c4ecfa Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 16 Nov 2015 15:14:41 -0500
Subject: xen/blkfront: Cleanup of comments, fix unaligned variables, and
 syntax errors.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 81c87e69cbe9..87ab09fc7b5f 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -118,8 +118,8 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
 	__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)
 
 /*
- * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
- * characters are enough. Define to 20 to keep consist with backend.
+ * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
+ * characters are enough. Define to 20 to keep consistent with backend.
  */
 #define RINGREF_NAME_LEN (20)
 /*
@@ -238,7 +238,7 @@ static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
 }
 
 static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
-			       unsigned long id)
+			      unsigned long id)
 {
 	if (rinfo->shadow[id].req.u.rw.id != id)
 		return -EINVAL;
@@ -257,7 +257,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
 	struct grant *gnt_list_entry, *n;
 	int i = 0;
 
-	while(i < num) {
+	while (i < num) {
 		gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
 		if (!gnt_list_entry)
 			goto out_of_memory;
@@ -776,7 +776,7 @@ static inline bool blkif_request_flush_invalid(struct request *req,
 }
 
 static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
-			   const struct blk_mq_queue_data *qd)
+			  const struct blk_mq_queue_data *qd)
 {
 	unsigned long flags;
 	struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data;
@@ -1968,8 +1968,7 @@ static int blkfront_resume(struct xenbus_device *dev)
 	return err;
 }
 
-static void
-blkfront_closing(struct blkfront_info *info)
+static void blkfront_closing(struct blkfront_info *info)
 {
 	struct xenbus_device *xbdev = info->xbdev;
 	struct block_device *bdev = NULL;
-- 
cgit 


From 75f070b3967b0c3bf0e1bc43411b06bab6c2c2cd Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Mon, 16 Nov 2015 16:25:33 -0500
Subject: xen/blkfront: Remove duplicate setting of ->xbdev.

We do the same exact operations a bit earlier in the
function.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 87ab09fc7b5f..b7f06cf14788 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1792,7 +1792,6 @@ static int blkfront_probe(struct xenbus_device *dev,
 
 	mutex_init(&info->mutex);
 	spin_lock_init(&info->dev_lock);
-	info->xbdev = dev;
 	info->vdevice = vdevice;
 	INIT_LIST_HEAD(&info->grants);
 	info->connected = BLKIF_STATE_DISCONNECTED;
-- 
cgit 


From 73716df7da4f60dd2d59a9302227d0394f1b8fcc Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Mon, 16 Nov 2015 16:51:39 -0500
Subject: xen/blkfront: make persistent grants pool per-queue

Make persistent grants per-queue/ring instead of per-device, so that we can
drop the 'dev_lock' and get better scalability.

Test was done based on null_blk driver:
dom0: v4.2-rc8 16vcpus 10GB "modprobe null_blk"
domu: v4.2-rc8 16vcpus 10GB

[test]
rw=read
direct=1
ioengine=libaio
bs=4k
time_based
runtime=30
filename=/dev/xvdb
numjobs=16
iodepth=64
iodepth_batch=64
iodepth_batch_complete=64
group_reporting

Queues:			  1 	   4 	  	  8 	 	 16
Iops orig(k):		810 	1064 		780 		700
Iops patched(k):	810     1230(~20%)	1024(~20%)	850(~20%)

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 110 +++++++++++++++++--------------------------
 1 file changed, 43 insertions(+), 67 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index b7f06cf14788..9d46960b7d72 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -142,6 +142,8 @@ struct blkfront_ring_info {
 	struct gnttab_free_callback callback;
 	struct blk_shadow shadow[BLK_MAX_RING_SIZE];
 	struct list_head indirect_pages;
+	struct list_head grants;
+	unsigned int persistent_gnts_c;
 	unsigned long shadow_free;
 	struct blkfront_info *dev_info;
 };
@@ -162,13 +164,6 @@ struct blkfront_info
 	/* Number of pages per ring buffer. */
 	unsigned int nr_ring_pages;
 	struct request_queue *rq;
-	/*
-	 * Lock to protect info->grants list and persistent_gnts_c shared by all
-	 * rings.
-	 */
-	spinlock_t dev_lock;
-	struct list_head grants;
-	unsigned int persistent_gnts_c;
 	unsigned int feature_flush;
 	unsigned int feature_discard:1;
 	unsigned int feature_secdiscard:1;
@@ -272,9 +267,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
 		}
 
 		gnt_list_entry->gref = GRANT_INVALID_REF;
-		spin_lock_irq(&info->dev_lock);
-		list_add(&gnt_list_entry->node, &info->grants);
-		spin_unlock_irq(&info->dev_lock);
+		list_add(&gnt_list_entry->node, &rinfo->grants);
 		i++;
 	}
 
@@ -282,10 +275,8 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
 
 out_of_memory:
 	list_for_each_entry_safe(gnt_list_entry, n,
-	                         &info->grants, node) {
-		spin_lock_irq(&info->dev_lock);
+	                         &rinfo->grants, node) {
 		list_del(&gnt_list_entry->node);
-		spin_unlock_irq(&info->dev_lock);
 		if (info->feature_persistent)
 			__free_page(gnt_list_entry->page);
 		kfree(gnt_list_entry);
@@ -295,20 +286,17 @@ out_of_memory:
 	return -ENOMEM;
 }
 
-static struct grant *get_free_grant(struct blkfront_info *info)
+static struct grant *get_free_grant(struct blkfront_ring_info *rinfo)
 {
 	struct grant *gnt_list_entry;
-	unsigned long flags;
 
-	spin_lock_irqsave(&info->dev_lock, flags);
-	BUG_ON(list_empty(&info->grants));
-	gnt_list_entry = list_first_entry(&info->grants, struct grant,
+	BUG_ON(list_empty(&rinfo->grants));
+	gnt_list_entry = list_first_entry(&rinfo->grants, struct grant,
 					  node);
 	list_del(&gnt_list_entry->node);
 
 	if (gnt_list_entry->gref != GRANT_INVALID_REF)
-		info->persistent_gnts_c--;
-	spin_unlock_irqrestore(&info->dev_lock, flags);
+		rinfo->persistent_gnts_c--;
 
 	return gnt_list_entry;
 }
@@ -324,9 +312,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry,
 
 static struct grant *get_grant(grant_ref_t *gref_head,
 			       unsigned long gfn,
-			       struct blkfront_info *info)
+			       struct blkfront_ring_info *rinfo)
 {
-	struct grant *gnt_list_entry = get_free_grant(info);
+	struct grant *gnt_list_entry = get_free_grant(rinfo);
+	struct blkfront_info *info = rinfo->dev_info;
 
 	if (gnt_list_entry->gref != GRANT_INVALID_REF)
 		return gnt_list_entry;
@@ -347,9 +336,10 @@ static struct grant *get_grant(grant_ref_t *gref_head,
 }
 
 static struct grant *get_indirect_grant(grant_ref_t *gref_head,
-					struct blkfront_info *info)
+					struct blkfront_ring_info *rinfo)
 {
-	struct grant *gnt_list_entry = get_free_grant(info);
+	struct grant *gnt_list_entry = get_free_grant(rinfo);
+	struct blkfront_info *info = rinfo->dev_info;
 
 	if (gnt_list_entry->gref != GRANT_INVALID_REF)
 		return gnt_list_entry;
@@ -361,8 +351,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
 		struct page *indirect_page;
 
 		/* Fetch a pre-allocated page to use for indirect grefs */
-		BUG_ON(list_empty(&info->rinfo->indirect_pages));
-		indirect_page = list_first_entry(&info->rinfo->indirect_pages,
+		BUG_ON(list_empty(&rinfo->indirect_pages));
+		indirect_page = list_first_entry(&rinfo->indirect_pages,
 						 struct page, lru);
 		list_del(&indirect_page->lru);
 		gnt_list_entry->page = indirect_page;
@@ -543,7 +533,6 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 	unsigned int grant_idx = setup->grant_idx;
 	struct blkif_request *ring_req = setup->ring_req;
 	struct blkfront_ring_info *rinfo = setup->rinfo;
-	struct blkfront_info *info = rinfo->dev_info;
 	struct blk_shadow *shadow = &rinfo->shadow[setup->id];
 
 	if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
@@ -552,13 +541,13 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 			kunmap_atomic(setup->segments);
 
 		n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
-		gnt_list_entry = get_indirect_grant(&setup->gref_head, info);
+		gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo);
 		shadow->indirect_grants[n] = gnt_list_entry;
 		setup->segments = kmap_atomic(gnt_list_entry->page);
 		ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
 	}
 
-	gnt_list_entry = get_grant(&setup->gref_head, gfn, info);
+	gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
 	ref = gnt_list_entry->gref;
 	shadow->grants_used[grant_idx] = gnt_list_entry;
 
@@ -1129,7 +1118,7 @@ static void blkif_restart_queue(struct work_struct *work)
 
 static void blkif_free_ring(struct blkfront_ring_info *rinfo)
 {
-	struct grant *persistent_gnt;
+	struct grant *persistent_gnt, *n;
 	struct blkfront_info *info = rinfo->dev_info;
 	int i, j, segs;
 
@@ -1147,6 +1136,23 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo)
 		}
 	}
 
+	/* Remove all persistent grants. */
+	if (!list_empty(&rinfo->grants)) {
+		list_for_each_entry_safe(persistent_gnt, n,
+					 &rinfo->grants, node) {
+			list_del(&persistent_gnt->node);
+			if (persistent_gnt->gref != GRANT_INVALID_REF) {
+				gnttab_end_foreign_access(persistent_gnt->gref,
+							  0, 0UL);
+				rinfo->persistent_gnts_c--;
+			}
+			if (info->feature_persistent)
+				__free_page(persistent_gnt->page);
+			kfree(persistent_gnt);
+		}
+	}
+	BUG_ON(rinfo->persistent_gnts_c != 0);
+
 	for (i = 0; i < BLK_RING_SIZE(info); i++) {
 		/*
 		 * Clear persistent grants present in requests already
@@ -1212,7 +1218,6 @@ free_shadow:
 
 static void blkif_free(struct blkfront_info *info, int suspend)
 {
-	struct grant *persistent_gnt, *n;
 	unsigned int i;
 
 	/* Prevent new requests being issued until we fix things up. */
@@ -1222,25 +1227,6 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 	if (info->rq)
 		blk_mq_stop_hw_queues(info->rq);
 
-	/* Remove all persistent grants */
-	spin_lock_irq(&info->dev_lock);
-	if (!list_empty(&info->grants)) {
-		list_for_each_entry_safe(persistent_gnt, n,
-					 &info->grants, node) {
-			list_del(&persistent_gnt->node);
-			if (persistent_gnt->gref != GRANT_INVALID_REF) {
-				gnttab_end_foreign_access(persistent_gnt->gref,
-							  0, 0UL);
-				info->persistent_gnts_c--;
-			}
-			if (info->feature_persistent)
-				__free_page(persistent_gnt->page);
-			kfree(persistent_gnt);
-		}
-	}
-	BUG_ON(info->persistent_gnts_c != 0);
-	spin_unlock_irq(&info->dev_lock);
-
 	for (i = 0; i < info->nr_rings; i++)
 		blkif_free_ring(&info->rinfo[i]);
 
@@ -1281,7 +1267,6 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 	int i = 0;
 	struct scatterlist *sg;
 	int num_sg, num_grant;
-	unsigned long flags;
 	struct blkfront_info *info = rinfo->dev_info;
 	struct copy_from_grant data = {
 		.s = s,
@@ -1320,10 +1305,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 			if (!info->feature_persistent)
 				pr_alert_ratelimited("backed has not unmapped grant: %u\n",
 						     s->grants_used[i]->gref);
-			spin_lock_irqsave(&info->dev_lock, flags);
-			list_add(&s->grants_used[i]->node, &info->grants);
-			info->persistent_gnts_c++;
-			spin_unlock_irqrestore(&info->dev_lock, flags);
+			list_add(&s->grants_used[i]->node, &rinfo->grants);
+			rinfo->persistent_gnts_c++;
 		} else {
 			/*
 			 * If the grant is not mapped by the backend we end the
@@ -1333,9 +1316,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 			 */
 			gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
 			s->grants_used[i]->gref = GRANT_INVALID_REF;
-			spin_lock_irqsave(&info->dev_lock, flags);
-			list_add_tail(&s->grants_used[i]->node, &info->grants);
-			spin_unlock_irqrestore(&info->dev_lock, flags);
+			list_add_tail(&s->grants_used[i]->node, &rinfo->grants);
 		}
 	}
 	if (s->req.operation == BLKIF_OP_INDIRECT) {
@@ -1344,10 +1325,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 				if (!info->feature_persistent)
 					pr_alert_ratelimited("backed has not unmapped grant: %u\n",
 							     s->indirect_grants[i]->gref);
-				spin_lock_irqsave(&info->dev_lock, flags);
-				list_add(&s->indirect_grants[i]->node, &info->grants);
-				info->persistent_gnts_c++;
-				spin_unlock_irqrestore(&info->dev_lock, flags);
+				list_add(&s->indirect_grants[i]->node, &rinfo->grants);
+				rinfo->persistent_gnts_c++;
 			} else {
 				struct page *indirect_page;
 
@@ -1361,9 +1340,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 					list_add(&indirect_page->lru, &rinfo->indirect_pages);
 				}
 				s->indirect_grants[i]->gref = GRANT_INVALID_REF;
-				spin_lock_irqsave(&info->dev_lock, flags);
-				list_add_tail(&s->indirect_grants[i]->node, &info->grants);
-				spin_unlock_irqrestore(&info->dev_lock, flags);
+				list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants);
 			}
 		}
 	}
@@ -1785,15 +1762,14 @@ static int blkfront_probe(struct xenbus_device *dev,
 
 		rinfo = &info->rinfo[r_index];
 		INIT_LIST_HEAD(&rinfo->indirect_pages);
+		INIT_LIST_HEAD(&rinfo->grants);
 		rinfo->dev_info = info;
 		INIT_WORK(&rinfo->work, blkif_restart_queue);
 		spin_lock_init(&rinfo->ring_lock);
 	}
 
 	mutex_init(&info->mutex);
-	spin_lock_init(&info->dev_lock);
 	info->vdevice = vdevice;
-	INIT_LIST_HEAD(&info->grants);
 	info->connected = BLKIF_STATE_DISCONNECTED;
 
 	/* Front end dir is a number, which is used as the id. */
-- 
cgit 


From 45fc82642e54018740a25444d1165901501b601b Mon Sep 17 00:00:00 2001
From: Peng Fan <van.freenix@gmail.com>
Date: Wed, 25 Nov 2015 18:26:01 +0800
Subject: xen/blkfront: correct setting for xen_blkif_max_ring_order
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to this piece code:
"
     pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
              xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
"
if xen_blkif_max_ring_order is bigger that XENBUS_MAX_RING_GRANT_ORDER,
need to set xen_blkif_max_ring_order using XENBUS_MAX_RING_GRANT_ORDER,
but not 0.

Signed-off-by: Peng Fan <van.freenix@gmail.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: "Roger Pau Monné" <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 9d46960b7d72..8b0f3d92d8b4 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2443,7 +2443,7 @@ static int __init xlblk_init(void)
 	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
 		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
 			xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
-		xen_blkif_max_ring_order = 0;
+		xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
 	}
 
 	if (xen_blkif_max_queues > nr_cpus) {
-- 
cgit 


From 597957000ab5b1b38085c20868f3f7b9c305bae5 Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Sat, 14 Nov 2015 11:12:15 +0800
Subject: xen/blkback: separate ring information out of struct xen_blkif

Split per ring information to an new structure "xen_blkif_ring", so that one vbd
device can be associated with one or more rings/hardware queues.

Introduce 'pers_gnts_lock' to protect the pool of persistent grants since we
may have multi backend threads.

This patch is a preparation for supporting multi hardware queues/rings.

Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
v2: Align the variables in the structure.
---
 drivers/block/xen-blkback/blkback.c | 235 ++++++++++++++++++++----------------
 drivers/block/xen-blkback/common.h  |  56 +++++----
 drivers/block/xen-blkback/xenbus.c  |  96 +++++++--------
 3 files changed, 215 insertions(+), 172 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index f9099940c272..4fd8640d146c 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -173,11 +173,11 @@ static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
 
 #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
 
-static int do_block_io_op(struct xen_blkif *blkif);
-static int dispatch_rw_block_io(struct xen_blkif *blkif,
+static int do_block_io_op(struct xen_blkif_ring *ring);
+static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 				struct blkif_request *req,
 				struct pending_req *pending_req);
-static void make_response(struct xen_blkif *blkif, u64 id,
+static void make_response(struct xen_blkif_ring *ring, u64 id,
 			  unsigned short op, int st);
 
 #define foreach_grant_safe(pos, n, rbtree, node) \
@@ -189,14 +189,8 @@ static void make_response(struct xen_blkif *blkif, u64 id,
 
 
 /*
- * We don't need locking around the persistent grant helpers
- * because blkback uses a single-thread for each backed, so we
- * can be sure that this functions will never be called recursively.
- *
- * The only exception to that is put_persistent_grant, that can be called
- * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
- * bit operations to modify the flags of a persistent grant and to count
- * the number of used grants.
+ * pers_gnts_lock must be used around all the persistent grant helpers
+ * because blkback may use multi-thread/queue for each backend.
  */
 static int add_persistent_gnt(struct xen_blkif *blkif,
 			       struct persistent_gnt *persistent_gnt)
@@ -204,6 +198,7 @@ static int add_persistent_gnt(struct xen_blkif *blkif,
 	struct rb_node **new = NULL, *parent = NULL;
 	struct persistent_gnt *this;
 
+	BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock));
 	if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
 		if (!blkif->vbd.overflow_max_grants)
 			blkif->vbd.overflow_max_grants = 1;
@@ -241,6 +236,7 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
 	struct persistent_gnt *data;
 	struct rb_node *node = NULL;
 
+	BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock));
 	node = blkif->persistent_gnts.rb_node;
 	while (node) {
 		data = container_of(node, struct persistent_gnt, node);
@@ -265,6 +261,7 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
 static void put_persistent_gnt(struct xen_blkif *blkif,
                                struct persistent_gnt *persistent_gnt)
 {
+	BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock));
 	if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
 		pr_alert_ratelimited("freeing a grant already unused\n");
 	set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
@@ -286,6 +283,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
 	unmap_data.unmap_ops = unmap;
 	unmap_data.kunmap_ops = NULL;
 
+	BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock));
 	foreach_grant_safe(persistent_gnt, n, root, node) {
 		BUG_ON(persistent_gnt->handle ==
 			BLKBACK_INVALID_HANDLE);
@@ -322,11 +320,13 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
 	int segs_to_unmap = 0;
 	struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
 	struct gntab_unmap_queue_data unmap_data;
+	unsigned long flags;
 
 	unmap_data.pages = pages;
 	unmap_data.unmap_ops = unmap;
 	unmap_data.kunmap_ops = NULL;
 
+	spin_lock_irqsave(&blkif->pers_gnts_lock, flags);
 	while(!list_empty(&blkif->persistent_purge_list)) {
 		persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
 		                                  struct persistent_gnt,
@@ -348,6 +348,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
 		}
 		kfree(persistent_gnt);
 	}
+	spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags);
 	if (segs_to_unmap > 0) {
 		unmap_data.count = segs_to_unmap;
 		BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
@@ -362,16 +363,18 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
 	unsigned int num_clean, total;
 	bool scan_used = false, clean_used = false;
 	struct rb_root *root;
+	unsigned long flags;
 
+	spin_lock_irqsave(&blkif->pers_gnts_lock, flags);
 	if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
 	    (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
 	    !blkif->vbd.overflow_max_grants)) {
-		return;
+		goto out;
 	}
 
 	if (work_busy(&blkif->persistent_purge_work)) {
 		pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
-		return;
+		goto out;
 	}
 
 	num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
@@ -379,7 +382,7 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
 	num_clean = min(blkif->persistent_gnt_c, num_clean);
 	if ((num_clean == 0) ||
 	    (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
-		return;
+		goto out;
 
 	/*
 	 * At this point, we can assure that there will be no calls
@@ -436,29 +439,35 @@ finished:
 	}
 
 	blkif->persistent_gnt_c -= (total - num_clean);
+	spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags);
 	blkif->vbd.overflow_max_grants = 0;
 
 	/* We can defer this work */
 	schedule_work(&blkif->persistent_purge_work);
 	pr_debug("Purged %u/%u\n", (total - num_clean), total);
 	return;
+
+out:
+	spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags);
+
+	return;
 }
 
 /*
  * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
  */
-static struct pending_req *alloc_req(struct xen_blkif *blkif)
+static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
 {
 	struct pending_req *req = NULL;
 	unsigned long flags;
 
-	spin_lock_irqsave(&blkif->pending_free_lock, flags);
-	if (!list_empty(&blkif->pending_free)) {
-		req = list_entry(blkif->pending_free.next, struct pending_req,
+	spin_lock_irqsave(&ring->pending_free_lock, flags);
+	if (!list_empty(&ring->pending_free)) {
+		req = list_entry(ring->pending_free.next, struct pending_req,
 				 free_list);
 		list_del(&req->free_list);
 	}
-	spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
+	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
 	return req;
 }
 
@@ -466,17 +475,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif)
  * Return the 'pending_req' structure back to the freepool. We also
  * wake up the thread if it was waiting for a free page.
  */
-static void free_req(struct xen_blkif *blkif, struct pending_req *req)
+static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
 {
 	unsigned long flags;
 	int was_empty;
 
-	spin_lock_irqsave(&blkif->pending_free_lock, flags);
-	was_empty = list_empty(&blkif->pending_free);
-	list_add(&req->free_list, &blkif->pending_free);
-	spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
+	spin_lock_irqsave(&ring->pending_free_lock, flags);
+	was_empty = list_empty(&ring->pending_free);
+	list_add(&req->free_list, &ring->pending_free);
+	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
 	if (was_empty)
-		wake_up(&blkif->pending_free_wq);
+		wake_up(&ring->pending_free_wq);
 }
 
 /*
@@ -556,10 +565,10 @@ abort:
 /*
  * Notification from the guest OS.
  */
-static void blkif_notify_work(struct xen_blkif *blkif)
+static void blkif_notify_work(struct xen_blkif_ring *ring)
 {
-	blkif->waiting_reqs = 1;
-	wake_up(&blkif->wq);
+	ring->waiting_reqs = 1;
+	wake_up(&ring->wq);
 }
 
 irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
@@ -590,7 +599,8 @@ static void print_stats(struct xen_blkif *blkif)
 
 int xen_blkif_schedule(void *arg)
 {
-	struct xen_blkif *blkif = arg;
+	struct xen_blkif_ring *ring = arg;
+	struct xen_blkif *blkif = ring->blkif;
 	struct xen_vbd *vbd = &blkif->vbd;
 	unsigned long timeout;
 	int ret;
@@ -606,27 +616,27 @@ int xen_blkif_schedule(void *arg)
 		timeout = msecs_to_jiffies(LRU_INTERVAL);
 
 		timeout = wait_event_interruptible_timeout(
-			blkif->wq,
-			blkif->waiting_reqs || kthread_should_stop(),
+			ring->wq,
+			ring->waiting_reqs || kthread_should_stop(),
 			timeout);
 		if (timeout == 0)
 			goto purge_gnt_list;
 		timeout = wait_event_interruptible_timeout(
-			blkif->pending_free_wq,
-			!list_empty(&blkif->pending_free) ||
+			ring->pending_free_wq,
+			!list_empty(&ring->pending_free) ||
 			kthread_should_stop(),
 			timeout);
 		if (timeout == 0)
 			goto purge_gnt_list;
 
-		blkif->waiting_reqs = 0;
+		ring->waiting_reqs = 0;
 		smp_mb(); /* clear flag *before* checking for work */
 
-		ret = do_block_io_op(blkif);
+		ret = do_block_io_op(ring);
 		if (ret > 0)
-			blkif->waiting_reqs = 1;
+			ring->waiting_reqs = 1;
 		if (ret == -EACCES)
-			wait_event_interruptible(blkif->shutdown_wq,
+			wait_event_interruptible(ring->shutdown_wq,
 						 kthread_should_stop());
 
 purge_gnt_list:
@@ -649,7 +659,7 @@ purge_gnt_list:
 	if (log_stats)
 		print_stats(blkif);
 
-	blkif->xenblkd = NULL;
+	ring->xenblkd = NULL;
 	xen_blkif_put(blkif);
 
 	return 0;
@@ -658,32 +668,40 @@ purge_gnt_list:
 /*
  * Remove persistent grants and empty the pool of free pages
  */
-void xen_blkbk_free_caches(struct xen_blkif *blkif)
+void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
 {
+	struct xen_blkif *blkif = ring->blkif;
+	unsigned long flags;
+
 	/* Free all persistent grant pages */
+	spin_lock_irqsave(&blkif->pers_gnts_lock, flags);
 	if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
 		free_persistent_gnts(blkif, &blkif->persistent_gnts,
 			blkif->persistent_gnt_c);
 
 	BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
 	blkif->persistent_gnt_c = 0;
+	spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags);
 
 	/* Since we are shutting down remove all pages from the buffer */
 	shrink_free_pagepool(blkif, 0 /* All */);
 }
 
 static unsigned int xen_blkbk_unmap_prepare(
-	struct xen_blkif *blkif,
+	struct xen_blkif_ring *ring,
 	struct grant_page **pages,
 	unsigned int num,
 	struct gnttab_unmap_grant_ref *unmap_ops,
 	struct page **unmap_pages)
 {
 	unsigned int i, invcount = 0;
+	unsigned long flags;
 
 	for (i = 0; i < num; i++) {
 		if (pages[i]->persistent_gnt != NULL) {
-			put_persistent_gnt(blkif, pages[i]->persistent_gnt);
+			spin_lock_irqsave(&ring->blkif->pers_gnts_lock, flags);
+			put_persistent_gnt(ring->blkif, pages[i]->persistent_gnt);
+			spin_unlock_irqrestore(&ring->blkif->pers_gnts_lock, flags);
 			continue;
 		}
 		if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
@@ -700,17 +718,18 @@ static unsigned int xen_blkbk_unmap_prepare(
 
 static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
 {
-	struct pending_req* pending_req = (struct pending_req*) (data->data);
-	struct xen_blkif *blkif = pending_req->blkif;
+	struct pending_req *pending_req = (struct pending_req *)(data->data);
+	struct xen_blkif_ring *ring = pending_req->ring;
+	struct xen_blkif *blkif = ring->blkif;
 
 	/* BUG_ON used to reproduce existing behaviour,
 	   but is this the best way to deal with this? */
 	BUG_ON(result);
 
 	put_free_pages(blkif, data->pages, data->count);
-	make_response(blkif, pending_req->id,
+	make_response(ring, pending_req->id,
 		      pending_req->operation, pending_req->status);
-	free_req(blkif, pending_req);
+	free_req(ring, pending_req);
 	/*
 	 * Make sure the request is freed before releasing blkif,
 	 * or there could be a race between free_req and the
@@ -723,7 +742,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
 	 * pending_free_wq if there's a drain going on, but it has
 	 * to be taken into account if the current model is changed.
 	 */
-	if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) {
+	if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
 		complete(&blkif->drain_complete);
 	}
 	xen_blkif_put(blkif);
@@ -732,11 +751,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
 static void xen_blkbk_unmap_and_respond(struct pending_req *req)
 {
 	struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
-	struct xen_blkif *blkif = req->blkif;
+	struct xen_blkif_ring *ring = req->ring;
 	struct grant_page **pages = req->segments;
 	unsigned int invcount;
 
-	invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs,
+	invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
 					   req->unmap, req->unmap_pages);
 
 	work->data = req;
@@ -757,7 +776,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req)
  * of hypercalls, but since this is only used in error paths there's
  * no real need.
  */
-static void xen_blkbk_unmap(struct xen_blkif *blkif,
+static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
                             struct grant_page *pages[],
                             int num)
 {
@@ -768,20 +787,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif,
 
 	while (num) {
 		unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-		
-		invcount = xen_blkbk_unmap_prepare(blkif, pages, batch,
+
+		invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
 						   unmap, unmap_pages);
 		if (invcount) {
 			ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
 			BUG_ON(ret);
-			put_free_pages(blkif, unmap_pages, invcount);
+			put_free_pages(ring->blkif, unmap_pages, invcount);
 		}
 		pages += batch;
 		num -= batch;
 	}
 }
 
-static int xen_blkbk_map(struct xen_blkif *blkif,
+static int xen_blkbk_map(struct xen_blkif_ring *ring,
 			 struct grant_page *pages[],
 			 int num, bool ro)
 {
@@ -794,6 +813,8 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
 	int ret = 0;
 	int last_map = 0, map_until = 0;
 	int use_persistent_gnts;
+	struct xen_blkif *blkif = ring->blkif;
+	unsigned long irq_flags;
 
 	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
 
@@ -806,10 +827,13 @@ again:
 	for (i = map_until; i < num; i++) {
 		uint32_t flags;
 
-		if (use_persistent_gnts)
+		if (use_persistent_gnts) {
+			spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags);
 			persistent_gnt = get_persistent_gnt(
 				blkif,
 				pages[i]->gref);
+			spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags);
+		}
 
 		if (persistent_gnt) {
 			/*
@@ -880,8 +904,10 @@ again:
 			persistent_gnt->gnt = map[new_map_idx].ref;
 			persistent_gnt->handle = map[new_map_idx].handle;
 			persistent_gnt->page = pages[seg_idx]->page;
+			spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags);
 			if (add_persistent_gnt(blkif,
 			                       persistent_gnt)) {
+				spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags);
 				kfree(persistent_gnt);
 				persistent_gnt = NULL;
 				goto next;
@@ -890,6 +916,7 @@ again:
 			pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
 				 persistent_gnt->gnt, blkif->persistent_gnt_c,
 				 xen_blkif_max_pgrants);
+			spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags);
 			goto next;
 		}
 		if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
@@ -921,7 +948,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req)
 {
 	int rc;
 
-	rc = xen_blkbk_map(pending_req->blkif, pending_req->segments,
+	rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
 			   pending_req->nr_segs,
 	                   (pending_req->operation != BLKIF_OP_READ));
 
@@ -934,7 +961,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
 				    struct phys_req *preq)
 {
 	struct grant_page **pages = pending_req->indirect_pages;
-	struct xen_blkif *blkif = pending_req->blkif;
+	struct xen_blkif_ring *ring = pending_req->ring;
 	int indirect_grefs, rc, n, nseg, i;
 	struct blkif_request_segment *segments = NULL;
 
@@ -945,7 +972,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
 	for (i = 0; i < indirect_grefs; i++)
 		pages[i]->gref = req->u.indirect.indirect_grefs[i];
 
-	rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
+	rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
 	if (rc)
 		goto unmap;
 
@@ -972,15 +999,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
 unmap:
 	if (segments)
 		kunmap_atomic(segments);
-	xen_blkbk_unmap(blkif, pages, indirect_grefs);
+	xen_blkbk_unmap(ring, pages, indirect_grefs);
 	return rc;
 }
 
-static int dispatch_discard_io(struct xen_blkif *blkif,
+static int dispatch_discard_io(struct xen_blkif_ring *ring,
 				struct blkif_request *req)
 {
 	int err = 0;
 	int status = BLKIF_RSP_OKAY;
+	struct xen_blkif *blkif = ring->blkif;
 	struct block_device *bdev = blkif->vbd.bdev;
 	unsigned long secure;
 	struct phys_req preq;
@@ -1013,26 +1041,28 @@ fail_response:
 	} else if (err)
 		status = BLKIF_RSP_ERROR;
 
-	make_response(blkif, req->u.discard.id, req->operation, status);
+	make_response(ring, req->u.discard.id, req->operation, status);
 	xen_blkif_put(blkif);
 	return err;
 }
 
-static int dispatch_other_io(struct xen_blkif *blkif,
+static int dispatch_other_io(struct xen_blkif_ring *ring,
 			     struct blkif_request *req,
 			     struct pending_req *pending_req)
 {
-	free_req(blkif, pending_req);
-	make_response(blkif, req->u.other.id, req->operation,
+	free_req(ring, pending_req);
+	make_response(ring, req->u.other.id, req->operation,
 		      BLKIF_RSP_EOPNOTSUPP);
 	return -EIO;
 }
 
-static void xen_blk_drain_io(struct xen_blkif *blkif)
+static void xen_blk_drain_io(struct xen_blkif_ring *ring)
 {
+	struct xen_blkif *blkif = ring->blkif;
+
 	atomic_set(&blkif->drain, 1);
 	do {
-		if (atomic_read(&blkif->inflight) == 0)
+		if (atomic_read(&ring->inflight) == 0)
 			break;
 		wait_for_completion_interruptible_timeout(
 				&blkif->drain_complete, HZ);
@@ -1053,12 +1083,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
 	if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
 	    (error == -EOPNOTSUPP)) {
 		pr_debug("flush diskcache op failed, not supported\n");
-		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
+		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
 		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
 	} else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
 		    (error == -EOPNOTSUPP)) {
 		pr_debug("write barrier op failed, not supported\n");
-		xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
+		xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
 		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
 	} else if (error) {
 		pr_debug("Buffer not up-to-date at end of operation,"
@@ -1092,9 +1122,9 @@ static void end_block_io_op(struct bio *bio)
  * and transmute  it to the block API to hand it over to the proper block disk.
  */
 static int
-__do_block_io_op(struct xen_blkif *blkif)
+__do_block_io_op(struct xen_blkif_ring *ring)
 {
-	union blkif_back_rings *blk_rings = &blkif->blk_rings;
+	union blkif_back_rings *blk_rings = &ring->blk_rings;
 	struct blkif_request req;
 	struct pending_req *pending_req;
 	RING_IDX rc, rp;
@@ -1107,7 +1137,7 @@ __do_block_io_op(struct xen_blkif *blkif)
 	if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
 		rc = blk_rings->common.rsp_prod_pvt;
 		pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
-			rp, rc, rp - rc, blkif->vbd.pdevice);
+			rp, rc, rp - rc, ring->blkif->vbd.pdevice);
 		return -EACCES;
 	}
 	while (rc != rp) {
@@ -1120,14 +1150,14 @@ __do_block_io_op(struct xen_blkif *blkif)
 			break;
 		}
 
-		pending_req = alloc_req(blkif);
+		pending_req = alloc_req(ring);
 		if (NULL == pending_req) {
-			blkif->st_oo_req++;
+			ring->blkif->st_oo_req++;
 			more_to_do = 1;
 			break;
 		}
 
-		switch (blkif->blk_protocol) {
+		switch (ring->blkif->blk_protocol) {
 		case BLKIF_PROTOCOL_NATIVE:
 			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
 			break;
@@ -1151,16 +1181,16 @@ __do_block_io_op(struct xen_blkif *blkif)
 		case BLKIF_OP_WRITE_BARRIER:
 		case BLKIF_OP_FLUSH_DISKCACHE:
 		case BLKIF_OP_INDIRECT:
-			if (dispatch_rw_block_io(blkif, &req, pending_req))
+			if (dispatch_rw_block_io(ring, &req, pending_req))
 				goto done;
 			break;
 		case BLKIF_OP_DISCARD:
-			free_req(blkif, pending_req);
-			if (dispatch_discard_io(blkif, &req))
+			free_req(ring, pending_req);
+			if (dispatch_discard_io(ring, &req))
 				goto done;
 			break;
 		default:
-			if (dispatch_other_io(blkif, &req, pending_req))
+			if (dispatch_other_io(ring, &req, pending_req))
 				goto done;
 			break;
 		}
@@ -1173,13 +1203,13 @@ done:
 }
 
 static int
-do_block_io_op(struct xen_blkif *blkif)
+do_block_io_op(struct xen_blkif_ring *ring)
 {
-	union blkif_back_rings *blk_rings = &blkif->blk_rings;
+	union blkif_back_rings *blk_rings = &ring->blk_rings;
 	int more_to_do;
 
 	do {
-		more_to_do = __do_block_io_op(blkif);
+		more_to_do = __do_block_io_op(ring);
 		if (more_to_do)
 			break;
 
@@ -1192,7 +1222,7 @@ do_block_io_op(struct xen_blkif *blkif)
  * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
  * and call the 'submit_bio' to pass it to the underlying storage.
  */
-static int dispatch_rw_block_io(struct xen_blkif *blkif,
+static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 				struct blkif_request *req,
 				struct pending_req *pending_req)
 {
@@ -1220,17 +1250,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 
 	switch (req_operation) {
 	case BLKIF_OP_READ:
-		blkif->st_rd_req++;
+		ring->blkif->st_rd_req++;
 		operation = READ;
 		break;
 	case BLKIF_OP_WRITE:
-		blkif->st_wr_req++;
+		ring->blkif->st_wr_req++;
 		operation = WRITE_ODIRECT;
 		break;
 	case BLKIF_OP_WRITE_BARRIER:
 		drain = true;
 	case BLKIF_OP_FLUSH_DISKCACHE:
-		blkif->st_f_req++;
+		ring->blkif->st_f_req++;
 		operation = WRITE_FLUSH;
 		break;
 	default:
@@ -1255,7 +1285,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 
 	preq.nr_sects      = 0;
 
-	pending_req->blkif     = blkif;
+	pending_req->ring      = ring;
 	pending_req->id        = req->u.rw.id;
 	pending_req->operation = req_operation;
 	pending_req->status    = BLKIF_RSP_OKAY;
@@ -1282,12 +1312,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 			goto fail_response;
 	}
 
-	if (xen_vbd_translate(&preq, blkif, operation) != 0) {
+	if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
 		pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
 			 operation == READ ? "read" : "write",
 			 preq.sector_number,
 			 preq.sector_number + preq.nr_sects,
-			 blkif->vbd.pdevice);
+			 ring->blkif->vbd.pdevice);
 		goto fail_response;
 	}
 
@@ -1299,7 +1329,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 		if (((int)preq.sector_number|(int)seg[i].nsec) &
 		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
 			pr_debug("Misaligned I/O request from domain %d\n",
-				 blkif->domid);
+				 ring->blkif->domid);
 			goto fail_response;
 		}
 	}
@@ -1308,7 +1338,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	 * issue the WRITE_FLUSH.
 	 */
 	if (drain)
-		xen_blk_drain_io(pending_req->blkif);
+		xen_blk_drain_io(pending_req->ring);
 
 	/*
 	 * If we have failed at this point, we need to undo the M2P override,
@@ -1323,8 +1353,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	 * This corresponding xen_blkif_put is done in __end_block_io_op, or
 	 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
 	 */
-	xen_blkif_get(blkif);
-	atomic_inc(&blkif->inflight);
+	xen_blkif_get(ring->blkif);
+	atomic_inc(&ring->inflight);
 
 	for (i = 0; i < nseg; i++) {
 		while ((bio == NULL) ||
@@ -1372,19 +1402,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	blk_finish_plug(&plug);
 
 	if (operation == READ)
-		blkif->st_rd_sect += preq.nr_sects;
+		ring->blkif->st_rd_sect += preq.nr_sects;
 	else if (operation & WRITE)
-		blkif->st_wr_sect += preq.nr_sects;
+		ring->blkif->st_wr_sect += preq.nr_sects;
 
 	return 0;
 
  fail_flush:
-	xen_blkbk_unmap(blkif, pending_req->segments,
+	xen_blkbk_unmap(ring, pending_req->segments,
 	                pending_req->nr_segs);
  fail_response:
 	/* Haven't submitted any bio's yet. */
-	make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
-	free_req(blkif, pending_req);
+	make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
+	free_req(ring, pending_req);
 	msleep(1); /* back off a bit */
 	return -EIO;
 
@@ -1402,21 +1432,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 /*
  * Put a response on the ring on how the operation fared.
  */
-static void make_response(struct xen_blkif *blkif, u64 id,
+static void make_response(struct xen_blkif_ring *ring, u64 id,
 			  unsigned short op, int st)
 {
 	struct blkif_response  resp;
 	unsigned long     flags;
-	union blkif_back_rings *blk_rings = &blkif->blk_rings;
+	union blkif_back_rings *blk_rings;
 	int notify;
 
 	resp.id        = id;
 	resp.operation = op;
 	resp.status    = st;
 
-	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+	spin_lock_irqsave(&ring->blk_ring_lock, flags);
+	blk_rings = &ring->blk_rings;
 	/* Place on the response ring for the relevant domain. */
-	switch (blkif->blk_protocol) {
+	switch (ring->blkif->blk_protocol) {
 	case BLKIF_PROTOCOL_NATIVE:
 		memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
 		       &resp, sizeof(resp));
@@ -1434,9 +1465,9 @@ static void make_response(struct xen_blkif *blkif, u64 id,
 	}
 	blk_rings->common.rsp_prod_pvt++;
 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
-	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+	spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
 	if (notify)
-		notify_remote_via_irq(blkif->irq);
+		notify_remote_via_irq(ring->irq);
 }
 
 static int __init xen_blkif_init(void)
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 68e87a037b99..dbdf4164c83f 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -269,34 +269,50 @@ struct persistent_gnt {
 	struct list_head remove_node;
 };
 
+/* Per-ring information. */
+struct xen_blkif_ring {
+	/* Physical parameters of the comms window. */
+	unsigned int		irq;
+	union blkif_back_rings	blk_rings;
+	void			*blk_ring;
+	/* Private fields. */
+	spinlock_t		blk_ring_lock;
+
+	wait_queue_head_t	wq;
+	atomic_t		inflight;
+	/* One thread per blkif ring. */
+	struct task_struct	*xenblkd;
+	unsigned int		waiting_reqs;
+
+	/* List of all 'pending_req' available */
+	struct list_head	pending_free;
+	/* And its spinlock. */
+	spinlock_t		pending_free_lock;
+	wait_queue_head_t	pending_free_wq;
+
+	struct work_struct	free_work;
+	/* Thread shutdown wait queue. */
+	wait_queue_head_t	shutdown_wq;
+	struct xen_blkif 	*blkif;
+};
+
 struct xen_blkif {
 	/* Unique identifier for this interface. */
 	domid_t			domid;
 	unsigned int		handle;
-	/* Physical parameters of the comms window. */
-	unsigned int		irq;
 	/* Comms information. */
 	enum blkif_protocol	blk_protocol;
-	union blkif_back_rings	blk_rings;
-	void			*blk_ring;
 	/* The VBD attached to this interface. */
 	struct xen_vbd		vbd;
 	/* Back pointer to the backend_info. */
 	struct backend_info	*be;
-	/* Private fields. */
-	spinlock_t		blk_ring_lock;
 	atomic_t		refcnt;
-
-	wait_queue_head_t	wq;
 	/* for barrier (drain) requests */
 	struct completion	drain_complete;
 	atomic_t		drain;
-	atomic_t		inflight;
-	/* One thread per one blkif. */
-	struct task_struct	*xenblkd;
-	unsigned int		waiting_reqs;
 
 	/* tree to store persistent grants */
+	spinlock_t		pers_gnts_lock;
 	struct rb_root		persistent_gnts;
 	unsigned int		persistent_gnt_c;
 	atomic_t		persistent_gnt_in_use;
@@ -311,12 +327,6 @@ struct xen_blkif {
 	int			free_pages_num;
 	struct list_head	free_pages;
 
-	/* List of all 'pending_req' available */
-	struct list_head	pending_free;
-	/* And its spinlock. */
-	spinlock_t		pending_free_lock;
-	wait_queue_head_t	pending_free_wq;
-
 	/* statistics */
 	unsigned long		st_print;
 	unsigned long long			st_rd_req;
@@ -328,9 +338,9 @@ struct xen_blkif {
 	unsigned long long			st_wr_sect;
 
 	struct work_struct	free_work;
-	/* Thread shutdown wait queue. */
-	wait_queue_head_t	shutdown_wq;
-	unsigned int nr_ring_pages;
+	unsigned int 		nr_ring_pages;
+	/* All rings for this device. */
+	struct xen_blkif_ring 	ring;
 };
 
 struct seg_buf {
@@ -352,7 +362,7 @@ struct grant_page {
  * response queued for it, with the saved 'id' passed back.
  */
 struct pending_req {
-	struct xen_blkif	*blkif;
+	struct xen_blkif_ring   *ring;
 	u64			id;
 	int			nr_segs;
 	atomic_t		pendcnt;
@@ -394,7 +404,7 @@ int xen_blkif_xenbus_init(void);
 irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
 int xen_blkif_schedule(void *arg);
 int xen_blkif_purge_persistent(void *arg);
-void xen_blkbk_free_caches(struct xen_blkif *blkif);
+void xen_blkbk_free_caches(struct xen_blkif_ring *ring);
 
 int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
 			      struct backend_info *be, int state);
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index f53cff42f8da..e4bfc928035d 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -88,7 +88,7 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 	char name[BLKBACK_NAME_LEN];
 
 	/* Not ready to connect? */
-	if (!blkif->irq || !blkif->vbd.bdev)
+	if (!blkif->ring.irq || !blkif->vbd.bdev)
 		return;
 
 	/* Already connected? */
@@ -113,10 +113,10 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 	}
 	invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
 
-	blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name);
-	if (IS_ERR(blkif->xenblkd)) {
-		err = PTR_ERR(blkif->xenblkd);
-		blkif->xenblkd = NULL;
+	blkif->ring.xenblkd = kthread_run(xen_blkif_schedule, &blkif->ring, "%s", name);
+	if (IS_ERR(blkif->ring.xenblkd)) {
+		err = PTR_ERR(blkif->ring.xenblkd);
+		blkif->ring.xenblkd = NULL;
 		xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
 		return;
 	}
@@ -125,6 +125,7 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 {
 	struct xen_blkif *blkif;
+	struct xen_blkif_ring *ring;
 
 	BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
 
@@ -133,41 +134,40 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 		return ERR_PTR(-ENOMEM);
 
 	blkif->domid = domid;
-	spin_lock_init(&blkif->blk_ring_lock);
 	atomic_set(&blkif->refcnt, 1);
-	init_waitqueue_head(&blkif->wq);
 	init_completion(&blkif->drain_complete);
-	atomic_set(&blkif->drain, 0);
-	blkif->st_print = jiffies;
-	blkif->persistent_gnts.rb_node = NULL;
+	INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
 	spin_lock_init(&blkif->free_pages_lock);
 	INIT_LIST_HEAD(&blkif->free_pages);
 	INIT_LIST_HEAD(&blkif->persistent_purge_list);
-	blkif->free_pages_num = 0;
-	atomic_set(&blkif->persistent_gnt_in_use, 0);
-	atomic_set(&blkif->inflight, 0);
+	blkif->st_print = jiffies;
 	INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
 
-	INIT_LIST_HEAD(&blkif->pending_free);
-	INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
-	spin_lock_init(&blkif->pending_free_lock);
-	init_waitqueue_head(&blkif->pending_free_wq);
-	init_waitqueue_head(&blkif->shutdown_wq);
+	ring = &blkif->ring;
+	ring->blkif = blkif;
+	spin_lock_init(&ring->blk_ring_lock);
+	init_waitqueue_head(&ring->wq);
+
+	INIT_LIST_HEAD(&ring->pending_free);
+	spin_lock_init(&ring->pending_free_lock);
+	init_waitqueue_head(&ring->pending_free_wq);
+	init_waitqueue_head(&ring->shutdown_wq);
 
 	return blkif;
 }
 
-static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
+static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
 			 unsigned int nr_grefs, unsigned int evtchn)
 {
 	int err;
+	struct xen_blkif *blkif = ring->blkif;
 
 	/* Already connected through? */
-	if (blkif->irq)
+	if (ring->irq)
 		return 0;
 
 	err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
-				     &blkif->blk_ring);
+				     &ring->blk_ring);
 	if (err < 0)
 		return err;
 
@@ -175,24 +175,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
 	case BLKIF_PROTOCOL_NATIVE:
 	{
 		struct blkif_sring *sring;
-		sring = (struct blkif_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.native, sring,
+		sring = (struct blkif_sring *)ring->blk_ring;
+		BACK_RING_INIT(&ring->blk_rings.native, sring,
 			       XEN_PAGE_SIZE * nr_grefs);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_32:
 	{
 		struct blkif_x86_32_sring *sring_x86_32;
-		sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32,
+		sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring;
+		BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32,
 			       XEN_PAGE_SIZE * nr_grefs);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_64:
 	{
 		struct blkif_x86_64_sring *sring_x86_64;
-		sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64,
+		sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring;
+		BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64,
 			       XEN_PAGE_SIZE * nr_grefs);
 		break;
 	}
@@ -202,13 +202,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
 
 	err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
 						    xen_blkif_be_int, 0,
-						    "blkif-backend", blkif);
+						    "blkif-backend", ring);
 	if (err < 0) {
-		xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
-		blkif->blk_rings.common.sring = NULL;
+		xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
+		ring->blk_rings.common.sring = NULL;
 		return err;
 	}
-	blkif->irq = err;
+	ring->irq = err;
 
 	return 0;
 }
@@ -217,35 +217,36 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 {
 	struct pending_req *req, *n;
 	int i = 0, j;
+	struct xen_blkif_ring *ring = &blkif->ring;
 
-	if (blkif->xenblkd) {
-		kthread_stop(blkif->xenblkd);
-		wake_up(&blkif->shutdown_wq);
-		blkif->xenblkd = NULL;
+	if (ring->xenblkd) {
+		kthread_stop(ring->xenblkd);
+		wake_up(&ring->shutdown_wq);
+		ring->xenblkd = NULL;
 	}
 
 	/* The above kthread_stop() guarantees that at this point we
 	 * don't have any discard_io or other_io requests. So, checking
 	 * for inflight IO is enough.
 	 */
-	if (atomic_read(&blkif->inflight) > 0)
+	if (atomic_read(&ring->inflight) > 0)
 		return -EBUSY;
 
-	if (blkif->irq) {
-		unbind_from_irqhandler(blkif->irq, blkif);
-		blkif->irq = 0;
+	if (ring->irq) {
+		unbind_from_irqhandler(ring->irq, ring);
+		ring->irq = 0;
 	}
 
-	if (blkif->blk_rings.common.sring) {
-		xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
-		blkif->blk_rings.common.sring = NULL;
+	if (ring->blk_rings.common.sring) {
+		xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
+		ring->blk_rings.common.sring = NULL;
 	}
 
 	/* Remove all persistent grants and the cache of ballooned pages. */
-	xen_blkbk_free_caches(blkif);
+	xen_blkbk_free_caches(ring);
 
 	/* Check that there is no request in use */
-	list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
+	list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
 		list_del(&req->free_list);
 
 		for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
@@ -835,6 +836,7 @@ static int connect_ring(struct backend_info *be)
 	char protocol[64] = "";
 	struct pending_req *req, *n;
 	int err, i, j;
+	struct xen_blkif_ring *ring = &be->blkif->ring;
 
 	pr_debug("%s %s\n", __func__, dev->otherend);
 
@@ -923,7 +925,7 @@ static int connect_ring(struct backend_info *be)
 		req = kzalloc(sizeof(*req), GFP_KERNEL);
 		if (!req)
 			goto fail;
-		list_add_tail(&req->free_list, &be->blkif->pending_free);
+		list_add_tail(&req->free_list, &ring->pending_free);
 		for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
 			req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL);
 			if (!req->segments[j])
@@ -938,7 +940,7 @@ static int connect_ring(struct backend_info *be)
 	}
 
 	/* Map the shared frame, irq etc. */
-	err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
+	err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn);
 	if (err) {
 		xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
 		return err;
@@ -947,7 +949,7 @@ static int connect_ring(struct backend_info *be)
 	return 0;
 
 fail:
-	list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) {
+	list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
 		list_del(&req->free_list);
 		for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
 			if (!req->segments[j])
-- 
cgit 


From 2fb1ef4f1226ea6d6d3481036cabe01a4415b68c Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 11 Dec 2015 12:08:48 -0500
Subject: xen/blkback: pseudo support for multi hardware queues/rings

Preparatory patch for multiple hardware queues (rings). The number of
rings is unconditionally set to 1, larger number will be enabled in
"xen/blkback: get the number of hardware queues/rings from blkfront".

Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
v2: Align variables in the structures.
---
 drivers/block/xen-blkback/common.h |   3 +-
 drivers/block/xen-blkback/xenbus.c | 277 +++++++++++++++++++++++--------------
 2 files changed, 175 insertions(+), 105 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index dbdf4164c83f..310eff3cf43f 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -340,7 +340,8 @@ struct xen_blkif {
 	struct work_struct	free_work;
 	unsigned int 		nr_ring_pages;
 	/* All rings for this device. */
-	struct xen_blkif_ring 	ring;
+	struct xen_blkif_ring	*rings;
+	unsigned int		nr_rings;
 };
 
 struct seg_buf {
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index e4bfc928035d..f5bfedd0e948 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 {
 	int err;
 	char name[BLKBACK_NAME_LEN];
+	struct xen_blkif_ring *ring;
+	int i;
 
 	/* Not ready to connect? */
-	if (!blkif->ring.irq || !blkif->vbd.bdev)
+	if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev)
 		return;
 
 	/* Already connected? */
@@ -113,19 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 	}
 	invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
 
-	blkif->ring.xenblkd = kthread_run(xen_blkif_schedule, &blkif->ring, "%s", name);
-	if (IS_ERR(blkif->ring.xenblkd)) {
-		err = PTR_ERR(blkif->ring.xenblkd);
-		blkif->ring.xenblkd = NULL;
-		xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
-		return;
+	for (i = 0; i < blkif->nr_rings; i++) {
+		ring = &blkif->rings[i];
+		ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i);
+		if (IS_ERR(ring->xenblkd)) {
+			err = PTR_ERR(ring->xenblkd);
+			ring->xenblkd = NULL;
+			xenbus_dev_fatal(blkif->be->dev, err,
+					"start %s-%d xenblkd", name, i);
+			goto out;
+		}
+	}
+	return;
+
+out:
+	while (--i >= 0) {
+		ring = &blkif->rings[i];
+		kthread_stop(ring->xenblkd);
 	}
+	return;
+}
+
+static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
+{
+	unsigned int r;
+
+	blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL);
+	if (!blkif->rings)
+		return -ENOMEM;
+
+	for (r = 0; r < blkif->nr_rings; r++) {
+		struct xen_blkif_ring *ring = &blkif->rings[r];
+
+		spin_lock_init(&ring->blk_ring_lock);
+		init_waitqueue_head(&ring->wq);
+		INIT_LIST_HEAD(&ring->pending_free);
+
+		spin_lock_init(&ring->pending_free_lock);
+		init_waitqueue_head(&ring->pending_free_wq);
+		init_waitqueue_head(&ring->shutdown_wq);
+		ring->blkif = blkif;
+		xen_blkif_get(blkif);
+	}
+
+	return 0;
 }
 
 static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 {
 	struct xen_blkif *blkif;
-	struct xen_blkif_ring *ring;
 
 	BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
 
@@ -143,15 +181,11 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	blkif->st_print = jiffies;
 	INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
 
-	ring = &blkif->ring;
-	ring->blkif = blkif;
-	spin_lock_init(&ring->blk_ring_lock);
-	init_waitqueue_head(&ring->wq);
-
-	INIT_LIST_HEAD(&ring->pending_free);
-	spin_lock_init(&ring->pending_free_lock);
-	init_waitqueue_head(&ring->pending_free_wq);
-	init_waitqueue_head(&ring->shutdown_wq);
+	blkif->nr_rings = 1;
+	if (xen_blkif_alloc_rings(blkif)) {
+		kmem_cache_free(xen_blkif_cachep, blkif);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	return blkif;
 }
@@ -216,50 +250,54 @@ static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
 static int xen_blkif_disconnect(struct xen_blkif *blkif)
 {
 	struct pending_req *req, *n;
-	int i = 0, j;
-	struct xen_blkif_ring *ring = &blkif->ring;
+	unsigned int j, r;
 
-	if (ring->xenblkd) {
-		kthread_stop(ring->xenblkd);
-		wake_up(&ring->shutdown_wq);
-		ring->xenblkd = NULL;
-	}
+	for (r = 0; r < blkif->nr_rings; r++) {
+		struct xen_blkif_ring *ring = &blkif->rings[r];
+		unsigned int i = 0;
 
-	/* The above kthread_stop() guarantees that at this point we
-	 * don't have any discard_io or other_io requests. So, checking
-	 * for inflight IO is enough.
-	 */
-	if (atomic_read(&ring->inflight) > 0)
-		return -EBUSY;
+		if (ring->xenblkd) {
+			kthread_stop(ring->xenblkd);
+			wake_up(&ring->shutdown_wq);
+			ring->xenblkd = NULL;
+		}
 
-	if (ring->irq) {
-		unbind_from_irqhandler(ring->irq, ring);
-		ring->irq = 0;
-	}
+		/* The above kthread_stop() guarantees that at this point we
+		 * don't have any discard_io or other_io requests. So, checking
+		 * for inflight IO is enough.
+		 */
+		if (atomic_read(&ring->inflight) > 0)
+			return -EBUSY;
 
-	if (ring->blk_rings.common.sring) {
-		xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
-		ring->blk_rings.common.sring = NULL;
-	}
+		if (ring->irq) {
+			unbind_from_irqhandler(ring->irq, ring);
+			ring->irq = 0;
+		}
 
-	/* Remove all persistent grants and the cache of ballooned pages. */
-	xen_blkbk_free_caches(ring);
+		if (ring->blk_rings.common.sring) {
+			xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
+			ring->blk_rings.common.sring = NULL;
+		}
 
-	/* Check that there is no request in use */
-	list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
-		list_del(&req->free_list);
+		/* Remove all persistent grants and the cache of ballooned pages. */
+		xen_blkbk_free_caches(ring);
 
-		for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
-			kfree(req->segments[j]);
+		/* Check that there is no request in use */
+		list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
+			list_del(&req->free_list);
 
-		for (j = 0; j < MAX_INDIRECT_PAGES; j++)
-			kfree(req->indirect_pages[j]);
+			for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
+				kfree(req->segments[j]);
 
-		kfree(req);
-		i++;
-	}
+			for (j = 0; j < MAX_INDIRECT_PAGES; j++)
+				kfree(req->indirect_pages[j]);
 
-	WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
+			kfree(req);
+			i++;
+		}
+
+		WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
+	}
 	blkif->nr_ring_pages = 0;
 
 	return 0;
@@ -279,6 +317,7 @@ static void xen_blkif_free(struct xen_blkif *blkif)
 	BUG_ON(!list_empty(&blkif->free_pages));
 	BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
 
+	kfree(blkif->rings);
 	kmem_cache_free(xen_blkif_cachep, blkif);
 }
 
@@ -427,6 +466,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
 static int xen_blkbk_remove(struct xenbus_device *dev)
 {
 	struct backend_info *be = dev_get_drvdata(&dev->dev);
+	unsigned int i;
 
 	pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id);
 
@@ -443,7 +483,8 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
 
 	if (be->blkif) {
 		xen_blkif_disconnect(be->blkif);
-		xen_blkif_put(be->blkif);
+		for (i = 0; i < be->blkif->nr_rings; i++)
+			xen_blkif_put(be->blkif);
 	}
 
 	kfree(be->mode);
@@ -826,51 +867,43 @@ again:
 	xenbus_transaction_end(xbt, 1);
 }
 
-
-static int connect_ring(struct backend_info *be)
+/*
+ * Each ring may have multi pages, depends on "ring-page-order".
+ */
+static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
 {
-	struct xenbus_device *dev = be->dev;
 	unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
-	unsigned int evtchn, nr_grefs, ring_page_order;
-	unsigned int pers_grants;
-	char protocol[64] = "";
 	struct pending_req *req, *n;
 	int err, i, j;
-	struct xen_blkif_ring *ring = &be->blkif->ring;
-
-	pr_debug("%s %s\n", __func__, dev->otherend);
+	struct xen_blkif *blkif = ring->blkif;
+	struct xenbus_device *dev = blkif->be->dev;
+	unsigned int ring_page_order, nr_grefs, evtchn;
 
-	err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
+	err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
 			  &evtchn);
 	if (err != 1) {
 		err = -EINVAL;
-		xenbus_dev_fatal(dev, err, "reading %s/event-channel",
-				 dev->otherend);
+		xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir);
 		return err;
 	}
-	pr_info("event-channel %u\n", evtchn);
 
 	err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
 			  &ring_page_order);
 	if (err != 1) {
-		err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
-				  "%u", &ring_ref[0]);
+		err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]);
 		if (err != 1) {
 			err = -EINVAL;
-			xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
-					 dev->otherend);
+			xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
 			return err;
 		}
 		nr_grefs = 1;
-		pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
-			ring_ref[0]);
 	} else {
 		unsigned int i;
 
 		if (ring_page_order > xen_blkif_max_ring_order) {
 			err = -EINVAL;
 			xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
-					 dev->otherend, ring_page_order,
+					 dir, ring_page_order,
 					 xen_blkif_max_ring_order);
 			return err;
 		}
@@ -880,46 +913,17 @@ static int connect_ring(struct backend_info *be)
 			char ring_ref_name[RINGREF_NAME_LEN];
 
 			snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
-			err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name,
+			err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
 					   "%u", &ring_ref[i]);
 			if (err != 1) {
 				err = -EINVAL;
 				xenbus_dev_fatal(dev, err, "reading %s/%s",
-						 dev->otherend, ring_ref_name);
+						 dir, ring_ref_name);
 				return err;
 			}
-			pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
 		}
 	}
-
-	be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
-	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
-			    "%63s", protocol, NULL);
-	if (err)
-		strcpy(protocol, "unspecified, assuming default");
-	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
-		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
-	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
-		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
-	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
-		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
-	else {
-		xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
-		return -1;
-	}
-	err = xenbus_gather(XBT_NIL, dev->otherend,
-			    "feature-persistent", "%u",
-			    &pers_grants, NULL);
-	if (err)
-		pers_grants = 0;
-
-	be->blkif->vbd.feature_gnt_persistent = pers_grants;
-	be->blkif->vbd.overflow_max_grants = 0;
-	be->blkif->nr_ring_pages = nr_grefs;
-
-	pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
-		nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
-		pers_grants ? "persistent grants" : "");
+	blkif->nr_ring_pages = nr_grefs;
 
 	for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
 		req = kzalloc(sizeof(*req), GFP_KERNEL);
@@ -964,6 +968,71 @@ fail:
 		kfree(req);
 	}
 	return -ENOMEM;
+
+}
+
+static int connect_ring(struct backend_info *be)
+{
+	struct xenbus_device *dev = be->dev;
+	unsigned int pers_grants;
+	char protocol[64] = "";
+	int err, i;
+	char *xspath;
+	size_t xspathsize;
+	const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */
+
+	pr_debug("%s %s\n", __func__, dev->otherend);
+
+	be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
+	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+			    "%63s", protocol, NULL);
+	if (err)
+		strcpy(protocol, "unspecified, assuming default");
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+	else {
+		xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+		return -1;
+	}
+	err = xenbus_gather(XBT_NIL, dev->otherend,
+			    "feature-persistent", "%u",
+			    &pers_grants, NULL);
+	if (err)
+		pers_grants = 0;
+
+	be->blkif->vbd.feature_gnt_persistent = pers_grants;
+	be->blkif->vbd.overflow_max_grants = 0;
+
+	pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
+		 be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
+		 pers_grants ? "persistent grants" : "");
+
+	if (be->blkif->nr_rings == 1)
+		return read_per_ring_refs(&be->blkif->rings[0], dev->otherend);
+	else {
+		xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
+		xspath = kmalloc(xspathsize, GFP_KERNEL);
+		if (!xspath) {
+			xenbus_dev_fatal(dev, -ENOMEM, "reading ring references");
+			return -ENOMEM;
+		}
+
+		for (i = 0; i < be->blkif->nr_rings; i++) {
+			memset(xspath, 0, xspathsize);
+			snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i);
+			err = read_per_ring_refs(&be->blkif->rings[i], xspath);
+			if (err) {
+				kfree(xspath);
+				return err;
+			}
+		}
+		kfree(xspath);
+	}
+	return 0;
 }
 
 static const struct xenbus_device_id xen_blkbk_ids[] = {
-- 
cgit 


From d62d86000316d7ef38e1c2e9602c3ce6d1cb57bd Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Sat, 14 Nov 2015 11:12:17 +0800
Subject: xen/blkback: get the number of hardware queues/rings from blkfront

Backend advertises "multi-queue-max-queues" to front, also get the negotiated
number from "multi-queue-num-queues" written by blkfront.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/blkback.c | 13 +++++++++++++
 drivers/block/xen-blkback/common.h  |  1 +
 drivers/block/xen-blkback/xenbus.c  | 34 ++++++++++++++++++++++++++++------
 3 files changed, 42 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 4fd8640d146c..18b27770d80b 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -83,6 +83,16 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
 MODULE_PARM_DESC(max_persistent_grants,
                  "Maximum number of grants to map persistently");
 
+/*
+ * Maximum number of rings/queues blkback supports, allow as many queues as there
+ * are CPUs if user has not specified a value.
+ */
+unsigned int xenblk_max_queues;
+module_param_named(max_queues, xenblk_max_queues, uint, 0644);
+MODULE_PARM_DESC(max_queues,
+		 "Maximum number of hardware queues per virtual disk." \
+		 "By default it is the number of online CPUs.");
+
 /*
  * Maximum order of pages to be used for the shared ring between front and
  * backend, 4KB page granularity is used.
@@ -1483,6 +1493,9 @@ static int __init xen_blkif_init(void)
 		xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
 	}
 
+	if (xenblk_max_queues == 0)
+		xenblk_max_queues = num_online_cpus();
+
 	rc = xen_blkif_interface_init();
 	if (rc)
 		goto failed_init;
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 310eff3cf43f..847444dc1df4 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -46,6 +46,7 @@
 #include <xen/interface/io/protocols.h>
 
 extern unsigned int xen_blkif_max_ring_order;
+extern unsigned int xenblk_max_queues;
 /*
  * This is the maximum number of segments that would be allowed in indirect
  * requests. This value will also be passed to the frontend.
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index f5bfedd0e948..0d6bb9383a68 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -181,12 +181,6 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	blkif->st_print = jiffies;
 	INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
 
-	blkif->nr_rings = 1;
-	if (xen_blkif_alloc_rings(blkif)) {
-		kmem_cache_free(xen_blkif_cachep, blkif);
-		return ERR_PTR(-ENOMEM);
-	}
-
 	return blkif;
 }
 
@@ -595,6 +589,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
 		goto fail;
 	}
 
+	/* Multi-queue: advertise how many queues are supported by us.*/
+	err = xenbus_printf(XBT_NIL, dev->nodename,
+			    "multi-queue-max-queues", "%u", xenblk_max_queues);
+	if (err)
+		pr_warn("Error writing multi-queue-max-queues\n");
+
 	/* setup back pointer */
 	be->blkif->be = be;
 
@@ -980,6 +980,7 @@ static int connect_ring(struct backend_info *be)
 	char *xspath;
 	size_t xspathsize;
 	const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */
+	unsigned int requested_num_queues = 0;
 
 	pr_debug("%s %s\n", __func__, dev->otherend);
 
@@ -1007,6 +1008,27 @@ static int connect_ring(struct backend_info *be)
 	be->blkif->vbd.feature_gnt_persistent = pers_grants;
 	be->blkif->vbd.overflow_max_grants = 0;
 
+	/*
+	 * Read the number of hardware queues from frontend.
+	 */
+	err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues",
+			   "%u", &requested_num_queues);
+	if (err < 0) {
+		requested_num_queues = 1;
+	} else {
+		if (requested_num_queues > xenblk_max_queues
+		    || requested_num_queues == 0) {
+			/* Buggy or malicious guest. */
+			xenbus_dev_fatal(dev, err,
+					"guest requested %u queues, exceeding the maximum of %u.",
+					requested_num_queues, xenblk_max_queues);
+			return -ENOSYS;
+		}
+	}
+	be->blkif->nr_rings = requested_num_queues;
+	if (xen_blkif_alloc_rings(be->blkif))
+		return -ENOMEM;
+
 	pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
 		 be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
 		 pers_grants ? "persistent grants" : "");
-- 
cgit 


From d4bf0065b7251afb723a29b2fd58f7c38f8ce297 Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Sat, 14 Nov 2015 11:12:19 +0800
Subject: xen/blkback: make pool of persistent grants and free pages per-queue

Make pool of persistent grants and free pages per-queue/ring instead of
per-device to get better scalability.

Test was done based on null_blk driver:
dom0: v4.2-rc8 16vcpus 10GB "modprobe null_blk"
domu: v4.2-rc8 16vcpus 10GB

[test]
rw=read
direct=1
ioengine=libaio
bs=4k
time_based
runtime=30
filename=/dev/xvdb
numjobs=16
iodepth=64
iodepth_batch=64
iodepth_batch_complete=64
group_reporting

Results:
iops1: After patch "xen/blkfront: make persistent grants per-queue".
iops2: After this patch.

Queues:			  1 	   4 	  	  8 	 	 16
Iops orig(k):		810 	1064 		780 		700
Iops1(k):		810     1230(~20%)	1024(~20%)	850(~20%)
Iops2(k):		810     1410(~35%)	1354(~75%)      1440(~100%)

With 4 queues after this commit we can get ~75% increase in IOPS, and
performance won't drop if increasing queue numbers.

Please find the respective chart in this link:
https://www.dropbox.com/s/agrcy2pbzbsvmwv/iops.png?dl=0

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/blkback.c | 202 ++++++++++++++++--------------------
 drivers/block/xen-blkback/common.h  |  32 +++---
 drivers/block/xen-blkback/xenbus.c  |  21 ++--
 3 files changed, 118 insertions(+), 137 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 18b27770d80b..a00d6c6c2880 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -123,60 +123,60 @@ module_param(log_stats, int, 0644);
 /* Number of free pages to remove on each call to gnttab_free_pages */
 #define NUM_BATCH_FREE_PAGES 10
 
-static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
+static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&blkif->free_pages_lock, flags);
-	if (list_empty(&blkif->free_pages)) {
-		BUG_ON(blkif->free_pages_num != 0);
-		spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+	spin_lock_irqsave(&ring->free_pages_lock, flags);
+	if (list_empty(&ring->free_pages)) {
+		BUG_ON(ring->free_pages_num != 0);
+		spin_unlock_irqrestore(&ring->free_pages_lock, flags);
 		return gnttab_alloc_pages(1, page);
 	}
-	BUG_ON(blkif->free_pages_num == 0);
-	page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
+	BUG_ON(ring->free_pages_num == 0);
+	page[0] = list_first_entry(&ring->free_pages, struct page, lru);
 	list_del(&page[0]->lru);
-	blkif->free_pages_num--;
-	spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+	ring->free_pages_num--;
+	spin_unlock_irqrestore(&ring->free_pages_lock, flags);
 
 	return 0;
 }
 
-static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
+static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page,
                                   int num)
 {
 	unsigned long flags;
 	int i;
 
-	spin_lock_irqsave(&blkif->free_pages_lock, flags);
+	spin_lock_irqsave(&ring->free_pages_lock, flags);
 	for (i = 0; i < num; i++)
-		list_add(&page[i]->lru, &blkif->free_pages);
-	blkif->free_pages_num += num;
-	spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+		list_add(&page[i]->lru, &ring->free_pages);
+	ring->free_pages_num += num;
+	spin_unlock_irqrestore(&ring->free_pages_lock, flags);
 }
 
-static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
+static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
 {
 	/* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
 	struct page *page[NUM_BATCH_FREE_PAGES];
 	unsigned int num_pages = 0;
 	unsigned long flags;
 
-	spin_lock_irqsave(&blkif->free_pages_lock, flags);
-	while (blkif->free_pages_num > num) {
-		BUG_ON(list_empty(&blkif->free_pages));
-		page[num_pages] = list_first_entry(&blkif->free_pages,
+	spin_lock_irqsave(&ring->free_pages_lock, flags);
+	while (ring->free_pages_num > num) {
+		BUG_ON(list_empty(&ring->free_pages));
+		page[num_pages] = list_first_entry(&ring->free_pages,
 		                                   struct page, lru);
 		list_del(&page[num_pages]->lru);
-		blkif->free_pages_num--;
+		ring->free_pages_num--;
 		if (++num_pages == NUM_BATCH_FREE_PAGES) {
-			spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+			spin_unlock_irqrestore(&ring->free_pages_lock, flags);
 			gnttab_free_pages(num_pages, page);
-			spin_lock_irqsave(&blkif->free_pages_lock, flags);
+			spin_lock_irqsave(&ring->free_pages_lock, flags);
 			num_pages = 0;
 		}
 	}
-	spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+	spin_unlock_irqrestore(&ring->free_pages_lock, flags);
 	if (num_pages != 0)
 		gnttab_free_pages(num_pages, page);
 }
@@ -199,23 +199,29 @@ static void make_response(struct xen_blkif_ring *ring, u64 id,
 
 
 /*
- * pers_gnts_lock must be used around all the persistent grant helpers
- * because blkback may use multi-thread/queue for each backend.
+ * We don't need locking around the persistent grant helpers
+ * because blkback uses a single-thread for each backend, so we
+ * can be sure that this functions will never be called recursively.
+ *
+ * The only exception to that is put_persistent_grant, that can be called
+ * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
+ * bit operations to modify the flags of a persistent grant and to count
+ * the number of used grants.
  */
-static int add_persistent_gnt(struct xen_blkif *blkif,
+static int add_persistent_gnt(struct xen_blkif_ring *ring,
 			       struct persistent_gnt *persistent_gnt)
 {
 	struct rb_node **new = NULL, *parent = NULL;
 	struct persistent_gnt *this;
+	struct xen_blkif *blkif = ring->blkif;
 
-	BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock));
-	if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
+	if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) {
 		if (!blkif->vbd.overflow_max_grants)
 			blkif->vbd.overflow_max_grants = 1;
 		return -EBUSY;
 	}
 	/* Figure out where to put new node */
-	new = &blkif->persistent_gnts.rb_node;
+	new = &ring->persistent_gnts.rb_node;
 	while (*new) {
 		this = container_of(*new, struct persistent_gnt, node);
 
@@ -234,20 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif,
 	set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
 	/* Add new node and rebalance tree. */
 	rb_link_node(&(persistent_gnt->node), parent, new);
-	rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
-	blkif->persistent_gnt_c++;
-	atomic_inc(&blkif->persistent_gnt_in_use);
+	rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
+	ring->persistent_gnt_c++;
+	atomic_inc(&ring->persistent_gnt_in_use);
 	return 0;
 }
 
-static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
+static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
 						 grant_ref_t gref)
 {
 	struct persistent_gnt *data;
 	struct rb_node *node = NULL;
 
-	BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock));
-	node = blkif->persistent_gnts.rb_node;
+	node = ring->persistent_gnts.rb_node;
 	while (node) {
 		data = container_of(node, struct persistent_gnt, node);
 
@@ -261,25 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
 				return NULL;
 			}
 			set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
-			atomic_inc(&blkif->persistent_gnt_in_use);
+			atomic_inc(&ring->persistent_gnt_in_use);
 			return data;
 		}
 	}
 	return NULL;
 }
 
-static void put_persistent_gnt(struct xen_blkif *blkif,
+static void put_persistent_gnt(struct xen_blkif_ring *ring,
                                struct persistent_gnt *persistent_gnt)
 {
-	BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock));
 	if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
 		pr_alert_ratelimited("freeing a grant already unused\n");
 	set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
 	clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
-	atomic_dec(&blkif->persistent_gnt_in_use);
+	atomic_dec(&ring->persistent_gnt_in_use);
 }
 
-static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
+static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
                                  unsigned int num)
 {
 	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -293,7 +297,6 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
 	unmap_data.unmap_ops = unmap;
 	unmap_data.kunmap_ops = NULL;
 
-	BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock));
 	foreach_grant_safe(persistent_gnt, n, root, node) {
 		BUG_ON(persistent_gnt->handle ==
 			BLKBACK_INVALID_HANDLE);
@@ -311,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
 			unmap_data.count = segs_to_unmap;
 			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
 
-			put_free_pages(blkif, pages, segs_to_unmap);
+			put_free_pages(ring, pages, segs_to_unmap);
 			segs_to_unmap = 0;
 		}
 
@@ -328,17 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
 	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	struct persistent_gnt *persistent_gnt;
 	int segs_to_unmap = 0;
-	struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
+	struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
 	struct gntab_unmap_queue_data unmap_data;
-	unsigned long flags;
 
 	unmap_data.pages = pages;
 	unmap_data.unmap_ops = unmap;
 	unmap_data.kunmap_ops = NULL;
 
-	spin_lock_irqsave(&blkif->pers_gnts_lock, flags);
-	while(!list_empty(&blkif->persistent_purge_list)) {
-		persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
+	while(!list_empty(&ring->persistent_purge_list)) {
+		persistent_gnt = list_first_entry(&ring->persistent_purge_list,
 		                                  struct persistent_gnt,
 		                                  remove_node);
 		list_del(&persistent_gnt->remove_node);
@@ -353,45 +354,42 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
 		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
 			unmap_data.count = segs_to_unmap;
 			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
-			put_free_pages(blkif, pages, segs_to_unmap);
+			put_free_pages(ring, pages, segs_to_unmap);
 			segs_to_unmap = 0;
 		}
 		kfree(persistent_gnt);
 	}
-	spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags);
 	if (segs_to_unmap > 0) {
 		unmap_data.count = segs_to_unmap;
 		BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
-		put_free_pages(blkif, pages, segs_to_unmap);
+		put_free_pages(ring, pages, segs_to_unmap);
 	}
 }
 
-static void purge_persistent_gnt(struct xen_blkif *blkif)
+static void purge_persistent_gnt(struct xen_blkif_ring *ring)
 {
 	struct persistent_gnt *persistent_gnt;
 	struct rb_node *n;
 	unsigned int num_clean, total;
 	bool scan_used = false, clean_used = false;
 	struct rb_root *root;
-	unsigned long flags;
 
-	spin_lock_irqsave(&blkif->pers_gnts_lock, flags);
-	if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
-	    (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
-	    !blkif->vbd.overflow_max_grants)) {
+	if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
+	    (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
+	    !ring->blkif->vbd.overflow_max_grants)) {
 		goto out;
 	}
 
-	if (work_busy(&blkif->persistent_purge_work)) {
+	if (work_busy(&ring->persistent_purge_work)) {
 		pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
 		goto out;
 	}
 
 	num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
-	num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
-	num_clean = min(blkif->persistent_gnt_c, num_clean);
+	num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
+	num_clean = min(ring->persistent_gnt_c, num_clean);
 	if ((num_clean == 0) ||
-	    (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
+	    (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use))))
 		goto out;
 
 	/*
@@ -407,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
 
 	pr_debug("Going to purge %u persistent grants\n", num_clean);
 
-	BUG_ON(!list_empty(&blkif->persistent_purge_list));
-	root = &blkif->persistent_gnts;
+	BUG_ON(!list_empty(&ring->persistent_purge_list));
+	root = &ring->persistent_gnts;
 purge_list:
 	foreach_grant_safe(persistent_gnt, n, root, node) {
 		BUG_ON(persistent_gnt->handle ==
@@ -427,7 +425,7 @@ purge_list:
 
 		rb_erase(&persistent_gnt->node, root);
 		list_add(&persistent_gnt->remove_node,
-		         &blkif->persistent_purge_list);
+			 &ring->persistent_purge_list);
 		if (--num_clean == 0)
 			goto finished;
 	}
@@ -448,18 +446,14 @@ finished:
 		goto purge_list;
 	}
 
-	blkif->persistent_gnt_c -= (total - num_clean);
-	spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags);
-	blkif->vbd.overflow_max_grants = 0;
+	ring->persistent_gnt_c -= (total - num_clean);
+	ring->blkif->vbd.overflow_max_grants = 0;
 
 	/* We can defer this work */
-	schedule_work(&blkif->persistent_purge_work);
+	schedule_work(&ring->persistent_purge_work);
 	pr_debug("Purged %u/%u\n", (total - num_clean), total);
-	return;
 
 out:
-	spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags);
-
 	return;
 }
 
@@ -591,14 +585,16 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
  * SCHEDULER FUNCTIONS
  */
 
-static void print_stats(struct xen_blkif *blkif)
+static void print_stats(struct xen_blkif_ring *ring)
 {
+	struct xen_blkif *blkif = ring->blkif;
+
 	pr_info("(%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
 		 "  |  ds %4llu | pg: %4u/%4d\n",
 		 current->comm, blkif->st_oo_req,
 		 blkif->st_rd_req, blkif->st_wr_req,
 		 blkif->st_f_req, blkif->st_ds_req,
-		 blkif->persistent_gnt_c,
+		 ring->persistent_gnt_c,
 		 xen_blkif_max_pgrants);
 	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
 	blkif->st_rd_req = 0;
@@ -651,23 +647,23 @@ int xen_blkif_schedule(void *arg)
 
 purge_gnt_list:
 		if (blkif->vbd.feature_gnt_persistent &&
-		    time_after(jiffies, blkif->next_lru)) {
-			purge_persistent_gnt(blkif);
-			blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
+		    time_after(jiffies, ring->next_lru)) {
+			purge_persistent_gnt(ring);
+			ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
 		}
 
 		/* Shrink if we have more than xen_blkif_max_buffer_pages */
-		shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
+		shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
 
-		if (log_stats && time_after(jiffies, blkif->st_print))
-			print_stats(blkif);
+		if (log_stats && time_after(jiffies, ring->blkif->st_print))
+			print_stats(ring);
 	}
 
 	/* Drain pending purge work */
-	flush_work(&blkif->persistent_purge_work);
+	flush_work(&ring->persistent_purge_work);
 
 	if (log_stats)
-		print_stats(blkif);
+		print_stats(ring);
 
 	ring->xenblkd = NULL;
 	xen_blkif_put(blkif);
@@ -680,21 +676,16 @@ purge_gnt_list:
  */
 void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
 {
-	struct xen_blkif *blkif = ring->blkif;
-	unsigned long flags;
-
 	/* Free all persistent grant pages */
-	spin_lock_irqsave(&blkif->pers_gnts_lock, flags);
-	if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
-		free_persistent_gnts(blkif, &blkif->persistent_gnts,
-			blkif->persistent_gnt_c);
+	if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
+		free_persistent_gnts(ring, &ring->persistent_gnts,
+			ring->persistent_gnt_c);
 
-	BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
-	blkif->persistent_gnt_c = 0;
-	spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags);
+	BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
+	ring->persistent_gnt_c = 0;
 
 	/* Since we are shutting down remove all pages from the buffer */
-	shrink_free_pagepool(blkif, 0 /* All */);
+	shrink_free_pagepool(ring, 0 /* All */);
 }
 
 static unsigned int xen_blkbk_unmap_prepare(
@@ -705,13 +696,10 @@ static unsigned int xen_blkbk_unmap_prepare(
 	struct page **unmap_pages)
 {
 	unsigned int i, invcount = 0;
-	unsigned long flags;
 
 	for (i = 0; i < num; i++) {
 		if (pages[i]->persistent_gnt != NULL) {
-			spin_lock_irqsave(&ring->blkif->pers_gnts_lock, flags);
-			put_persistent_gnt(ring->blkif, pages[i]->persistent_gnt);
-			spin_unlock_irqrestore(&ring->blkif->pers_gnts_lock, flags);
+			put_persistent_gnt(ring, pages[i]->persistent_gnt);
 			continue;
 		}
 		if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
@@ -736,7 +724,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
 	   but is this the best way to deal with this? */
 	BUG_ON(result);
 
-	put_free_pages(blkif, data->pages, data->count);
+	put_free_pages(ring, data->pages, data->count);
 	make_response(ring, pending_req->id,
 		      pending_req->operation, pending_req->status);
 	free_req(ring, pending_req);
@@ -803,7 +791,7 @@ static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
 		if (invcount) {
 			ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
 			BUG_ON(ret);
-			put_free_pages(ring->blkif, unmap_pages, invcount);
+			put_free_pages(ring, unmap_pages, invcount);
 		}
 		pages += batch;
 		num -= batch;
@@ -824,7 +812,6 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring,
 	int last_map = 0, map_until = 0;
 	int use_persistent_gnts;
 	struct xen_blkif *blkif = ring->blkif;
-	unsigned long irq_flags;
 
 	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
 
@@ -838,11 +825,9 @@ again:
 		uint32_t flags;
 
 		if (use_persistent_gnts) {
-			spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags);
 			persistent_gnt = get_persistent_gnt(
-				blkif,
+				ring,
 				pages[i]->gref);
-			spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags);
 		}
 
 		if (persistent_gnt) {
@@ -853,7 +838,7 @@ again:
 			pages[i]->page = persistent_gnt->page;
 			pages[i]->persistent_gnt = persistent_gnt;
 		} else {
-			if (get_free_page(blkif, &pages[i]->page))
+			if (get_free_page(ring, &pages[i]->page))
 				goto out_of_memory;
 			addr = vaddr(pages[i]->page);
 			pages_to_gnt[segs_to_map] = pages[i]->page;
@@ -886,7 +871,7 @@ again:
 			BUG_ON(new_map_idx >= segs_to_map);
 			if (unlikely(map[new_map_idx].status != 0)) {
 				pr_debug("invalid buffer -- could not remap it\n");
-				put_free_pages(blkif, &pages[seg_idx]->page, 1);
+				put_free_pages(ring, &pages[seg_idx]->page, 1);
 				pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
 				ret |= 1;
 				goto next;
@@ -896,7 +881,7 @@ again:
 			continue;
 		}
 		if (use_persistent_gnts &&
-		    blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
+		    ring->persistent_gnt_c < xen_blkif_max_pgrants) {
 			/*
 			 * We are using persistent grants, the grant is
 			 * not mapped but we might have room for it.
@@ -914,19 +899,16 @@ again:
 			persistent_gnt->gnt = map[new_map_idx].ref;
 			persistent_gnt->handle = map[new_map_idx].handle;
 			persistent_gnt->page = pages[seg_idx]->page;
-			spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags);
-			if (add_persistent_gnt(blkif,
+			if (add_persistent_gnt(ring,
 			                       persistent_gnt)) {
-				spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags);
 				kfree(persistent_gnt);
 				persistent_gnt = NULL;
 				goto next;
 			}
 			pages[seg_idx]->persistent_gnt = persistent_gnt;
 			pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
-				 persistent_gnt->gnt, blkif->persistent_gnt_c,
+				 persistent_gnt->gnt, ring->persistent_gnt_c,
 				 xen_blkif_max_pgrants);
-			spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags);
 			goto next;
 		}
 		if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
@@ -950,7 +932,7 @@ next:
 
 out_of_memory:
 	pr_alert("%s: out of memory\n", __func__);
-	put_free_pages(blkif, pages_to_gnt, segs_to_map);
+	put_free_pages(ring, pages_to_gnt, segs_to_map);
 	return -ENOMEM;
 }
 
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 847444dc1df4..3c244ecf22a4 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -291,6 +291,22 @@ struct xen_blkif_ring {
 	spinlock_t		pending_free_lock;
 	wait_queue_head_t	pending_free_wq;
 
+	/* Tree to store persistent grants. */
+	spinlock_t		pers_gnts_lock;
+	struct rb_root		persistent_gnts;
+	unsigned int		persistent_gnt_c;
+	atomic_t		persistent_gnt_in_use;
+	unsigned long           next_lru;
+
+	/* Used by the kworker that offload work from the persistent purge. */
+	struct list_head	persistent_purge_list;
+	struct work_struct	persistent_purge_work;
+
+	/* Buffer of free pages to map grant refs. */
+	spinlock_t		free_pages_lock;
+	int			free_pages_num;
+	struct list_head	free_pages;
+
 	struct work_struct	free_work;
 	/* Thread shutdown wait queue. */
 	wait_queue_head_t	shutdown_wq;
@@ -312,22 +328,6 @@ struct xen_blkif {
 	struct completion	drain_complete;
 	atomic_t		drain;
 
-	/* tree to store persistent grants */
-	spinlock_t		pers_gnts_lock;
-	struct rb_root		persistent_gnts;
-	unsigned int		persistent_gnt_c;
-	atomic_t		persistent_gnt_in_use;
-	unsigned long           next_lru;
-
-	/* used by the kworker that offload work from the persistent purge */
-	struct list_head	persistent_purge_list;
-	struct work_struct	persistent_purge_work;
-
-	/* buffer of free pages to map grant refs */
-	spinlock_t		free_pages_lock;
-	int			free_pages_num;
-	struct list_head	free_pages;
-
 	/* statistics */
 	unsigned long		st_print;
 	unsigned long long			st_rd_req;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 0d6bb9383a68..2b8650a9a6a9 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -150,6 +150,10 @@ static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
 		spin_lock_init(&ring->blk_ring_lock);
 		init_waitqueue_head(&ring->wq);
 		INIT_LIST_HEAD(&ring->pending_free);
+		INIT_LIST_HEAD(&ring->persistent_purge_list);
+		INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants);
+		spin_lock_init(&ring->free_pages_lock);
+		INIT_LIST_HEAD(&ring->free_pages);
 
 		spin_lock_init(&ring->pending_free_lock);
 		init_waitqueue_head(&ring->pending_free_wq);
@@ -175,11 +179,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	atomic_set(&blkif->refcnt, 1);
 	init_completion(&blkif->drain_complete);
 	INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
-	spin_lock_init(&blkif->free_pages_lock);
-	INIT_LIST_HEAD(&blkif->free_pages);
-	INIT_LIST_HEAD(&blkif->persistent_purge_list);
 	blkif->st_print = jiffies;
-	INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
 
 	return blkif;
 }
@@ -290,6 +290,12 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 			i++;
 		}
 
+		BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0);
+		BUG_ON(!list_empty(&ring->persistent_purge_list));
+		BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
+		BUG_ON(!list_empty(&ring->free_pages));
+		BUG_ON(ring->free_pages_num != 0);
+		BUG_ON(ring->persistent_gnt_c != 0);
 		WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
 	}
 	blkif->nr_ring_pages = 0;
@@ -304,13 +310,6 @@ static void xen_blkif_free(struct xen_blkif *blkif)
 	xen_vbd_free(&blkif->vbd);
 
 	/* Make sure everything is drained before shutting down */
-	BUG_ON(blkif->persistent_gnt_c != 0);
-	BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
-	BUG_ON(blkif->free_pages_num != 0);
-	BUG_ON(!list_empty(&blkif->persistent_purge_list));
-	BUG_ON(!list_empty(&blkif->free_pages));
-	BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
-
 	kfree(blkif->rings);
 	kmem_cache_free(xen_blkif_cachep, blkif);
 }
-- 
cgit 


From bde21f73b9be146fda0c689f2724cda9d7737565 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Nov 2015 13:07:39 -0500
Subject: xen/blocks: Return -EXX instead of -1

Lets return sensible values instead of -1.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/xenbus.c | 2 +-
 drivers/block/xen-blkfront.c       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 2b8650a9a6a9..ca3a414de11c 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -996,7 +996,7 @@ static int connect_ring(struct backend_info *be)
 		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
 	else {
 		xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
-		return -1;
+		return -ENOSYS;
 	}
 	err = xenbus_gather(XBT_NIL, dev->otherend,
 			    "feature-persistent", "%u",
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8b0f3d92d8b4..ef5ce43e307a 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -828,11 +828,11 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
 	info->tag_set.driver_data = info;
 
 	if (blk_mq_alloc_tag_set(&info->tag_set))
-		return -1;
+		return -EINVAL;
 	rq = blk_mq_init_queue(&info->tag_set);
 	if (IS_ERR(rq)) {
 		blk_mq_free_tag_set(&info->tag_set);
-		return -1;
+		return PTR_ERR(rq);
 	}
 
 	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
-- 
cgit 


From 2d0382fac17cef20d507a0211b82e0942b2ab271 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Nov 2015 13:20:14 -0500
Subject: xen/blkback: Free resources if connect_ring failed.

With the multi-queue support we could fail at setting up
some of the rings and fail the connection. That meant that
all resources tied to rings[0..n-1] (where n is the ring
that failed to be setup). Eventually the frontend will switch
to the states and we will call xen_blkif_disconnect.

However we do not want to be at the mercy of the frontend
deciding when to change states. This allows us to do the
cleanup right away and freeing resources.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/xenbus.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index ca3a414de11c..c92b35882720 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -749,8 +749,14 @@ static void frontend_changed(struct xenbus_device *dev,
 		}
 
 		err = connect_ring(be);
-		if (err)
+		if (err) {
+			/*
+			 * Clean up so that memory resources can be used by
+			 * other devices. connect_ring reported already error.
+			 */
+			xen_blkif_disconnect(be->blkif);
 			break;
+		}
 		xen_update_blkif_status(be->blkif);
 		break;
 
-- 
cgit 


From a6e7af1288eeb7fca8361356998d31a92a291531 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Mon, 26 Oct 2015 14:47:21 +0900
Subject: xen-blkback: clear PF_NOFREEZE for xen_blkif_schedule()

xen_blkif_schedule() kthread calls try_to_freeze() at the beginning of
every attempt to purge the LRU. This operation can't ever succeed though,
as the kthread hasn't marked itself as freezable.

Before (hopefully eventually) kthread freezing gets converted to fileystem
freezing, we'd rather mark xen_blkif_schedule() freezable (as it can
generate I/O during suspend).

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/blkback.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index a00d6c6c2880..99b479f330af 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -613,6 +613,7 @@ int xen_blkif_schedule(void *arg)
 
 	xen_blkif_get(blkif);
 
+	set_freezable();
 	while (!kthread_should_stop()) {
 		if (try_to_freeze())
 			continue;
-- 
cgit 


From 2e073969d57f60fc0b863985779657624cbd4886 Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@citrix.com>
Date: Thu, 13 Aug 2015 19:23:10 +0100
Subject: xen-blkfront: Introduce blkif_ring_get_request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The code to get a request is always the same. Therefore we can factorize
it in a single function.

Signed-off-by: Julien Grall <julien.grall@citrix.com>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index ef5ce43e307a..0b32c90ffc3f 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -481,6 +481,23 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
 	return 0;
 }
 
+static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
+					    struct request *req,
+					    struct blkif_request **ring_req)
+{
+	unsigned long id;
+
+	*ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
+	rinfo->ring.req_prod_pvt++;
+
+	id = get_id_from_freelist(rinfo);
+	rinfo->shadow[id].request = req;
+
+	(*ring_req)->u.rw.id = id;
+
+	return id;
+}
+
 static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo)
 {
 	struct blkfront_info *info = rinfo->dev_info;
@@ -488,9 +505,7 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf
 	unsigned long id;
 
 	/* Fill out a communications ring structure. */
-	ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
-	id = get_id_from_freelist(rinfo);
-	rinfo->shadow[id].request = req;
+	id = blkif_ring_get_request(rinfo, req, &ring_req);
 
 	ring_req->operation = BLKIF_OP_DISCARD;
 	ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
@@ -501,8 +516,6 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf
 	else
 		ring_req->u.discard.flag = 0;
 
-	rinfo->ring.req_prod_pvt++;
-
 	/* Keep a private copy so we can reissue requests when recovering. */
 	rinfo->shadow[id].req = *ring_req;
 
@@ -635,9 +648,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 		}
 
 	/* Fill out a communications ring structure. */
-	ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
-	id = get_id_from_freelist(rinfo);
-	rinfo->shadow[id].request = req;
+	id = blkif_ring_get_request(rinfo, req, &ring_req);
 
 	BUG_ON(info->max_indirect_segments == 0 &&
 	       GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
@@ -650,7 +661,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
 	       num_grant += gnttab_count_grant(sg->offset, sg->length);
 
-	ring_req->u.rw.id = id;
 	rinfo->shadow[id].num_sg = num_sg;
 	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
 		/*
@@ -716,8 +726,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	if (setup.segments)
 		kunmap_atomic(setup.segments);
 
-	rinfo->ring.req_prod_pvt++;
-
 	/* Keep a private copy so we can reissue requests when recovering. */
 	rinfo->shadow[id].req = *ring_req;
 
-- 
cgit 


From 6cc5683390472c450fd69975d1283db79202667f Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@citrix.com>
Date: Thu, 13 Aug 2015 13:13:35 +0100
Subject: xen/blkfront: Handle non-indirect grant with 64KB pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The minimal size of request in the block framework is always PAGE_SIZE.
It means that when 64KB guest is support, the request will at least be
64KB.

Although, if the backend doesn't support indirect descriptor (such as QDISK
in QEMU), a ring request is only able to accommodate 11 segments of 4KB
(i.e 44KB).

The current frontend is assuming that an I/O request will always fit in
a ring request. This is not true any more when using 64KB page
granularity and will therefore crash during boot.

On ARM64, the ABI is completely neutral to the page granularity used by
the domU. The guest has the choice between different page granularity
supported by the processors (for instance on ARM64: 4KB, 16KB, 64KB).
This can't be enforced by the hypervisor and therefore it's possible to
run guests using different page granularity.

So we can't mandate the block backend to support indirect descriptor
when the frontend is using 64KB page granularity and have to fix it
properly in the frontend.

The solution exposed below is based on modifying directly the frontend
guest rather than asking the block framework to support smaller size
(i.e < PAGE_SIZE). This is because the change is the block framework are
not trivial as everything seems to relying on a struct *page (see [1]).
Although, it may be possible that someone succeed to do it in the future
and we would therefore be able to use it.

Given that a block request may not fit in a single ring request, a
second request is introduced for the data that cannot fit in the first
one. This means that the second ring request should never be used on
Linux if the page size is smaller than 44KB.

To achieve the support of the extra ring request, the block queue size
is divided by two. Therefore, the ring will always contain enough space
to accommodate 2 ring requests. While this will reduce the overall
performance, it will make the implementation more contained. The way
forward to get better performance is to implement in the backend either
indirect descriptor or multiple grants ring.

Note that the parameters blk_queue_max_* helpers haven't been updated.
The block code will set the mimimum size supported and we may be able
to support directly any change in the block framework that lower down
the minimal size of a request.

[1] http://lists.xen.org/archives/html/xen-devel/2015-08/msg02200.html

Signed-off-by: Julien Grall <julien.grall@citrix.com>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 228 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 212 insertions(+), 16 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0b32c90ffc3f..f3d0d4758641 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -60,6 +60,20 @@
 
 #include <asm/xen/hypervisor.h>
 
+/*
+ * The minimal size of segment supported by the block framework is PAGE_SIZE.
+ * When Linux is using a different page size than Xen, it may not be possible
+ * to put all the data in a single segment.
+ * This can happen when the backend doesn't support indirect descriptor and
+ * therefore the maximum amount of data that a request can carry is
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
+ *
+ * Note that we only support one extra request. So the Linux page size
+ * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
+ * 88KB.
+ */
+#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
+
 enum blkif_state {
 	BLKIF_STATE_DISCONNECTED,
 	BLKIF_STATE_CONNECTED,
@@ -72,6 +86,13 @@ struct grant {
 	struct list_head node;
 };
 
+enum blk_req_status {
+	REQ_WAITING,
+	REQ_DONE,
+	REQ_ERROR,
+	REQ_EOPNOTSUPP,
+};
+
 struct blk_shadow {
 	struct blkif_request req;
 	struct request *request;
@@ -79,6 +100,14 @@ struct blk_shadow {
 	struct grant **indirect_grants;
 	struct scatterlist *sg;
 	unsigned int num_sg;
+	enum blk_req_status status;
+
+	#define NO_ASSOCIATED_ID ~0UL
+	/*
+	 * Id of the sibling if we ever need 2 requests when handling a
+	 * block I/O request
+	 */
+	unsigned long associated_id;
 };
 
 struct split_bio {
@@ -492,6 +521,8 @@ static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
 
 	id = get_id_from_freelist(rinfo);
 	rinfo->shadow[id].request = req;
+	rinfo->shadow[id].status = REQ_WAITING;
+	rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
 
 	(*ring_req)->u.rw.id = id;
 
@@ -533,6 +564,9 @@ struct setup_rw_req {
 	bool need_copy;
 	unsigned int bvec_off;
 	char *bvec_data;
+
+	bool require_extra_req;
+	struct blkif_request *extra_ring_req;
 };
 
 static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
@@ -546,8 +580,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 	unsigned int grant_idx = setup->grant_idx;
 	struct blkif_request *ring_req = setup->ring_req;
 	struct blkfront_ring_info *rinfo = setup->rinfo;
+	/*
+	 * We always use the shadow of the first request to store the list
+	 * of grant associated to the block I/O request. This made the
+	 * completion more easy to handle even if the block I/O request is
+	 * split.
+	 */
 	struct blk_shadow *shadow = &rinfo->shadow[setup->id];
 
+	if (unlikely(setup->require_extra_req &&
+		     grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+		/*
+		 * We are using the second request, setup grant_idx
+		 * to be the index of the segment array.
+		 */
+		grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
+		ring_req = setup->extra_ring_req;
+	}
+
 	if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
 	    (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
 		if (setup->segments)
@@ -562,7 +612,11 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 
 	gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
 	ref = gnt_list_entry->gref;
-	shadow->grants_used[grant_idx] = gnt_list_entry;
+	/*
+	 * All the grants are stored in the shadow of the first
+	 * request. Therefore we have to use the global index.
+	 */
+	shadow->grants_used[setup->grant_idx] = gnt_list_entry;
 
 	if (setup->need_copy) {
 		void *shared_data;
@@ -604,11 +658,31 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 	(setup->grant_idx)++;
 }
 
+static void blkif_setup_extra_req(struct blkif_request *first,
+				  struct blkif_request *second)
+{
+	uint16_t nr_segments = first->u.rw.nr_segments;
+
+	/*
+	 * The second request is only present when the first request uses
+	 * all its segments. It's always the continuity of the first one.
+	 */
+	first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	second->u.rw.sector_number = first->u.rw.sector_number +
+		(BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
+
+	second->u.rw.handle = first->u.rw.handle;
+	second->operation = first->operation;
+}
+
 static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
 {
 	struct blkfront_info *info = rinfo->dev_info;
-	struct blkif_request *ring_req;
-	unsigned long id;
+	struct blkif_request *ring_req, *extra_ring_req = NULL;
+	unsigned long id, extra_id = NO_ASSOCIATED_ID;
+	bool require_extra_req = false;
 	int i;
 	struct setup_rw_req setup = {
 		.grant_idx = 0,
@@ -650,19 +724,19 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	/* Fill out a communications ring structure. */
 	id = blkif_ring_get_request(rinfo, req, &ring_req);
 
-	BUG_ON(info->max_indirect_segments == 0 &&
-	       GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-	BUG_ON(info->max_indirect_segments &&
-	       GREFS(req->nr_phys_segments) > info->max_indirect_segments);
-
 	num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
 	num_grant = 0;
 	/* Calculate the number of grant used */
 	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
 	       num_grant += gnttab_count_grant(sg->offset, sg->length);
 
+	require_extra_req = info->max_indirect_segments == 0 &&
+		num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
+
 	rinfo->shadow[id].num_sg = num_sg;
-	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
+	    likely(!require_extra_req)) {
 		/*
 		 * The indirect operation can only be a BLKIF_OP_READ or
 		 * BLKIF_OP_WRITE
@@ -702,10 +776,30 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 			}
 		}
 		ring_req->u.rw.nr_segments = num_grant;
+		if (unlikely(require_extra_req)) {
+			extra_id = blkif_ring_get_request(rinfo, req,
+							  &extra_ring_req);
+			/*
+			 * Only the first request contains the scatter-gather
+			 * list.
+			 */
+			rinfo->shadow[extra_id].num_sg = 0;
+
+			blkif_setup_extra_req(ring_req, extra_ring_req);
+
+			/* Link the 2 requests together */
+			rinfo->shadow[extra_id].associated_id = id;
+			rinfo->shadow[id].associated_id = extra_id;
+		}
 	}
 
 	setup.ring_req = ring_req;
 	setup.id = id;
+
+	setup.require_extra_req = require_extra_req;
+	if (unlikely(require_extra_req))
+		setup.extra_ring_req = extra_ring_req;
+
 	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
 		BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
@@ -728,6 +822,8 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 
 	/* Keep a private copy so we can reissue requests when recovering. */
 	rinfo->shadow[id].req = *ring_req;
+	if (unlikely(require_extra_req))
+		rinfo->shadow[extra_id].req = *extra_ring_req;
 
 	if (max_grefs > 0)
 		gnttab_free_grant_references(setup.gref_head);
@@ -829,7 +925,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
 	memset(&info->tag_set, 0, sizeof(info->tag_set));
 	info->tag_set.ops = &blkfront_mq_ops;
 	info->tag_set.nr_hw_queues = info->nr_rings;
-	info->tag_set.queue_depth =  BLK_RING_SIZE(info);
+	if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+		/*
+		 * When indirect descriptior is not supported, the I/O request
+		 * will be split between multiple request in the ring.
+		 * To avoid problems when sending the request, divide by
+		 * 2 the depth of the queue.
+		 */
+		info->tag_set.queue_depth =  BLK_RING_SIZE(info) / 2;
+	} else
+		info->tag_set.queue_depth = BLK_RING_SIZE(info);
 	info->tag_set.numa_node = NUMA_NO_NODE;
 	info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
 	info->tag_set.cmd_size = 0;
@@ -1269,20 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
 	kunmap_atomic(shared_data);
 }
 
-static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *rinfo,
+static enum blk_req_status blkif_rsp_to_req_status(int rsp)
+{
+	switch (rsp)
+	{
+	case BLKIF_RSP_OKAY:
+		return REQ_DONE;
+	case BLKIF_RSP_EOPNOTSUPP:
+		return REQ_EOPNOTSUPP;
+	case BLKIF_RSP_ERROR:
+		/* Fallthrough. */
+	default:
+		return REQ_ERROR;
+	}
+}
+
+/*
+ * Get the final status of the block request based on two ring response
+ */
+static int blkif_get_final_status(enum blk_req_status s1,
+				  enum blk_req_status s2)
+{
+	BUG_ON(s1 == REQ_WAITING);
+	BUG_ON(s2 == REQ_WAITING);
+
+	if (s1 == REQ_ERROR || s2 == REQ_ERROR)
+		return BLKIF_RSP_ERROR;
+	else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
+		return BLKIF_RSP_EOPNOTSUPP;
+	return BLKIF_RSP_OKAY;
+}
+
+static bool blkif_completion(unsigned long *id,
+			     struct blkfront_ring_info *rinfo,
 			     struct blkif_response *bret)
 {
 	int i = 0;
 	struct scatterlist *sg;
 	int num_sg, num_grant;
 	struct blkfront_info *info = rinfo->dev_info;
+	struct blk_shadow *s = &rinfo->shadow[*id];
 	struct copy_from_grant data = {
-		.s = s,
 		.grant_idx = 0,
 	};
 
 	num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
 		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+
+	/* The I/O request may be split in two. */
+	if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
+		struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
+
+		/* Keep the status of the current response in shadow. */
+		s->status = blkif_rsp_to_req_status(bret->status);
+
+		/* Wait the second response if not yet here. */
+		if (s2->status == REQ_WAITING)
+			return 0;
+
+		bret->status = blkif_get_final_status(s->status,
+						      s2->status);
+
+		/*
+		 * All the grants is stored in the first shadow in order
+		 * to make the completion code simpler.
+		 */
+		num_grant += s2->req.u.rw.nr_segments;
+
+		/*
+		 * The two responses may not come in order. Only the
+		 * first request will store the scatter-gather list.
+		 */
+		if (s2->num_sg != 0) {
+			/* Update "id" with the ID of the first response. */
+			*id = s->associated_id;
+			s = s2;
+		}
+
+		/*
+		 * We don't need anymore the second request, so recycling
+		 * it now.
+		 */
+		if (add_id_to_freelist(rinfo, s->associated_id))
+			WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
+			     info->gd->disk_name, s->associated_id);
+	}
+
+	data.s = s;
 	num_sg = s->num_sg;
 
 	if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
@@ -1352,6 +1530,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 			}
 		}
 	}
+
+	return 1;
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1391,8 +1571,14 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 		}
 		req  = rinfo->shadow[id].request;
 
-		if (bret->operation != BLKIF_OP_DISCARD)
-			blkif_completion(&rinfo->shadow[id], rinfo, bret);
+		if (bret->operation != BLKIF_OP_DISCARD) {
+			/*
+			 * We may need to wait for an extra response if the
+			 * I/O request is split in 2
+			 */
+			if (!blkif_completion(&id, rinfo, bret))
+				continue;
+		}
 
 		if (add_id_to_freelist(rinfo, id)) {
 			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
@@ -2017,8 +2203,18 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
 	int err, i;
 	struct blkfront_info *info = rinfo->dev_info;
 
-	if (info->max_indirect_segments == 0)
-		grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	if (info->max_indirect_segments == 0) {
+		if (!HAS_EXTRA_REQ)
+			grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+		else {
+			/*
+			 * When an extra req is required, the maximum
+			 * grants supported is related to the size of the
+			 * Linux block segment.
+			 */
+			grants = GRANTS_PER_PSEG;
+		}
+	}
 	else
 		grants = info->max_indirect_segments;
 	psegs = grants / GRANTS_PER_PSEG;
-- 
cgit 


From db6fbc106786f26d95889c50c18b1f28aa543a17 Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Wed, 9 Dec 2015 07:44:02 +0800
Subject: xen/blkback: make st_ statistics per ring

Make st_* statistics per ring and the VBD sysfs would iterate over all the
rings.

Note: xenvbd_sysfs_delif() is called in xen_blkbk_remove() before all rings
are torn down, so it's safe.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
v2: Aligned the variables on the same column.
---
 drivers/block/xen-blkback/blkback.c | 34 +++++++++++++---------------
 drivers/block/xen-blkback/common.h  | 20 ++++++++---------
 drivers/block/xen-blkback/xenbus.c  | 45 ++++++++++++++++++++++++++++---------
 3 files changed, 61 insertions(+), 38 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 99b479f330af..148930c8c121 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -587,20 +587,18 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
 
 static void print_stats(struct xen_blkif_ring *ring)
 {
-	struct xen_blkif *blkif = ring->blkif;
-
 	pr_info("(%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
 		 "  |  ds %4llu | pg: %4u/%4d\n",
-		 current->comm, blkif->st_oo_req,
-		 blkif->st_rd_req, blkif->st_wr_req,
-		 blkif->st_f_req, blkif->st_ds_req,
+		 current->comm, ring->st_oo_req,
+		 ring->st_rd_req, ring->st_wr_req,
+		 ring->st_f_req, ring->st_ds_req,
 		 ring->persistent_gnt_c,
 		 xen_blkif_max_pgrants);
-	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
-	blkif->st_rd_req = 0;
-	blkif->st_wr_req = 0;
-	blkif->st_oo_req = 0;
-	blkif->st_ds_req = 0;
+	ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+	ring->st_rd_req = 0;
+	ring->st_wr_req = 0;
+	ring->st_oo_req = 0;
+	ring->st_ds_req = 0;
 }
 
 int xen_blkif_schedule(void *arg)
@@ -656,7 +654,7 @@ purge_gnt_list:
 		/* Shrink if we have more than xen_blkif_max_buffer_pages */
 		shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
 
-		if (log_stats && time_after(jiffies, ring->blkif->st_print))
+		if (log_stats && time_after(jiffies, ring->st_print))
 			print_stats(ring);
 	}
 
@@ -1018,7 +1016,7 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring,
 			preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
 		goto fail_response;
 	}
-	blkif->st_ds_req++;
+	ring->st_ds_req++;
 
 	secure = (blkif->vbd.discard_secure &&
 		 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
@@ -1145,7 +1143,7 @@ __do_block_io_op(struct xen_blkif_ring *ring)
 
 		pending_req = alloc_req(ring);
 		if (NULL == pending_req) {
-			ring->blkif->st_oo_req++;
+			ring->st_oo_req++;
 			more_to_do = 1;
 			break;
 		}
@@ -1243,17 +1241,17 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 
 	switch (req_operation) {
 	case BLKIF_OP_READ:
-		ring->blkif->st_rd_req++;
+		ring->st_rd_req++;
 		operation = READ;
 		break;
 	case BLKIF_OP_WRITE:
-		ring->blkif->st_wr_req++;
+		ring->st_wr_req++;
 		operation = WRITE_ODIRECT;
 		break;
 	case BLKIF_OP_WRITE_BARRIER:
 		drain = true;
 	case BLKIF_OP_FLUSH_DISKCACHE:
-		ring->blkif->st_f_req++;
+		ring->st_f_req++;
 		operation = WRITE_FLUSH;
 		break;
 	default:
@@ -1395,9 +1393,9 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 	blk_finish_plug(&plug);
 
 	if (operation == READ)
-		ring->blkif->st_rd_sect += preq.nr_sects;
+		ring->st_rd_sect += preq.nr_sects;
 	else if (operation & WRITE)
-		ring->blkif->st_wr_sect += preq.nr_sects;
+		ring->st_wr_sect += preq.nr_sects;
 
 	return 0;
 
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 3c244ecf22a4..b27c5ba15600 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -298,6 +298,16 @@ struct xen_blkif_ring {
 	atomic_t		persistent_gnt_in_use;
 	unsigned long           next_lru;
 
+	/* Statistics. */
+	unsigned long		st_print;
+	unsigned long long	st_rd_req;
+	unsigned long long	st_wr_req;
+	unsigned long long	st_oo_req;
+	unsigned long long	st_f_req;
+	unsigned long long	st_ds_req;
+	unsigned long long	st_rd_sect;
+	unsigned long long	st_wr_sect;
+
 	/* Used by the kworker that offload work from the persistent purge. */
 	struct list_head	persistent_purge_list;
 	struct work_struct	persistent_purge_work;
@@ -328,16 +338,6 @@ struct xen_blkif {
 	struct completion	drain_complete;
 	atomic_t		drain;
 
-	/* statistics */
-	unsigned long		st_print;
-	unsigned long long			st_rd_req;
-	unsigned long long			st_wr_req;
-	unsigned long long			st_oo_req;
-	unsigned long long			st_f_req;
-	unsigned long long			st_ds_req;
-	unsigned long long			st_rd_sect;
-	unsigned long long			st_wr_sect;
-
 	struct work_struct	free_work;
 	unsigned int 		nr_ring_pages;
 	/* All rings for this device. */
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index c92b35882720..44396b8a0cb2 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -159,6 +159,7 @@ static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
 		init_waitqueue_head(&ring->pending_free_wq);
 		init_waitqueue_head(&ring->shutdown_wq);
 		ring->blkif = blkif;
+		ring->st_print = jiffies;
 		xen_blkif_get(blkif);
 	}
 
@@ -179,7 +180,6 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	atomic_set(&blkif->refcnt, 1);
 	init_completion(&blkif->drain_complete);
 	INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
-	blkif->st_print = jiffies;
 
 	return blkif;
 }
@@ -329,25 +329,38 @@ int __init xen_blkif_interface_init(void)
  *  sysfs interface for VBD I/O requests
  */
 
-#define VBD_SHOW(name, format, args...)					\
+#define VBD_SHOW_ALLRING(name, format)					\
 	static ssize_t show_##name(struct device *_dev,			\
 				   struct device_attribute *attr,	\
 				   char *buf)				\
 	{								\
 		struct xenbus_device *dev = to_xenbus_device(_dev);	\
 		struct backend_info *be = dev_get_drvdata(&dev->dev);	\
+		struct xen_blkif *blkif = be->blkif;			\
+		unsigned int i;						\
+		unsigned long long result = 0;				\
 									\
-		return sprintf(buf, format, ##args);			\
+		if (!blkif->rings)				\
+			goto out;					\
+									\
+		for (i = 0; i < blkif->nr_rings; i++) {		\
+			struct xen_blkif_ring *ring = &blkif->rings[i];	\
+									\
+			result += ring->st_##name;			\
+		}							\
+									\
+out:									\
+		return sprintf(buf, format, result);			\
 	}								\
 	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
 
-VBD_SHOW(oo_req,  "%llu\n", be->blkif->st_oo_req);
-VBD_SHOW(rd_req,  "%llu\n", be->blkif->st_rd_req);
-VBD_SHOW(wr_req,  "%llu\n", be->blkif->st_wr_req);
-VBD_SHOW(f_req,  "%llu\n", be->blkif->st_f_req);
-VBD_SHOW(ds_req,  "%llu\n", be->blkif->st_ds_req);
-VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect);
-VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect);
+VBD_SHOW_ALLRING(oo_req,  "%llu\n");
+VBD_SHOW_ALLRING(rd_req,  "%llu\n");
+VBD_SHOW_ALLRING(wr_req,  "%llu\n");
+VBD_SHOW_ALLRING(f_req,  "%llu\n");
+VBD_SHOW_ALLRING(ds_req,  "%llu\n");
+VBD_SHOW_ALLRING(rd_sect, "%llu\n");
+VBD_SHOW_ALLRING(wr_sect, "%llu\n");
 
 static struct attribute *xen_vbdstat_attrs[] = {
 	&dev_attr_oo_req.attr,
@@ -365,6 +378,18 @@ static struct attribute_group xen_vbdstat_group = {
 	.attrs = xen_vbdstat_attrs,
 };
 
+#define VBD_SHOW(name, format, args...)					\
+	static ssize_t show_##name(struct device *_dev,			\
+				   struct device_attribute *attr,	\
+				   char *buf)				\
+	{								\
+		struct xenbus_device *dev = to_xenbus_device(_dev);	\
+		struct backend_info *be = dev_get_drvdata(&dev->dev);	\
+									\
+		return sprintf(buf, format, ##args);			\
+	}								\
+	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
 VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
 VBD_SHOW(mode, "%s\n", be->mode);
 
-- 
cgit 


From 93bb277f97a6d319361766bde228717faf4abdeb Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Thu, 10 Dec 2015 09:16:48 +0800
Subject: xen/blkback: Fix two memory leaks.

This patch fixs two memleaks:
  backtrace:
    [<ffffffff817ba5e8>] kmemleak_alloc+0x28/0x50
    [<ffffffff81205e3b>] kmem_cache_alloc+0xbb/0x1d0
    [<ffffffff81534028>] xen_blkbk_probe+0x58/0x230
    [<ffffffff8146adb6>] xenbus_dev_probe+0x76/0x130
    [<ffffffff81511716>] driver_probe_device+0x166/0x2c0
    [<ffffffff815119bc>] __device_attach_driver+0xac/0xb0
    [<ffffffff8150fa57>] bus_for_each_drv+0x67/0x90
    [<ffffffff81511ab7>] __device_attach+0xc7/0x120
    [<ffffffff81511b23>] device_initial_probe+0x13/0x20
    [<ffffffff8151059a>] bus_probe_device+0x9a/0xb0
    [<ffffffff8150f0a1>] device_add+0x3b1/0x5c0
    [<ffffffff8150f47e>] device_register+0x1e/0x30
    [<ffffffff8146a9e8>] xenbus_probe_node+0x158/0x170
    [<ffffffff8146abaf>] xenbus_dev_changed+0x1af/0x1c0
    [<ffffffff8146b1bb>] backend_changed+0x1b/0x20
    [<ffffffff81468ca6>] xenwatch_thread+0xb6/0x160
unreferenced object 0xffff880007ba8ef8 (size 224):

  backtrace:
    [<ffffffff817ba5e8>] kmemleak_alloc+0x28/0x50
    [<ffffffff81205c73>] __kmalloc+0xd3/0x1e0
    [<ffffffff81534d87>] frontend_changed+0x2c7/0x580
    [<ffffffff8146af12>] xenbus_otherend_changed+0xa2/0xb0
    [<ffffffff8146b2c0>] frontend_changed+0x10/0x20
    [<ffffffff81468ca6>] xenwatch_thread+0xb6/0x160
    [<ffffffff810d3e97>] kthread+0xd7/0xf0
    [<ffffffff817c4a9f>] ret_from_fork+0x3f/0x70
    [<ffffffffffffffff>] 0xffffffffffffffff
unreferenced object 0xffff8800048dcd38 (size 224):

The first leak is caused by not put() the be->blkif reference
which we had gotten in xen_blkif_alloc(), while the second is
us not freeing blkif->rings in the right place.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Reported-and-Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/xenbus.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 44396b8a0cb2..876763f7f13e 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -297,8 +297,16 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 		BUG_ON(ring->free_pages_num != 0);
 		BUG_ON(ring->persistent_gnt_c != 0);
 		WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
+		xen_blkif_put(blkif);
 	}
 	blkif->nr_ring_pages = 0;
+	/*
+	 * blkif->rings was allocated in connect_ring, so we should free it in
+	 * here.
+	 */
+	kfree(blkif->rings);
+	blkif->rings = NULL;
+	blkif->nr_rings = 0;
 
 	return 0;
 }
@@ -310,7 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif)
 	xen_vbd_free(&blkif->vbd);
 
 	/* Make sure everything is drained before shutting down */
-	kfree(blkif->rings);
 	kmem_cache_free(xen_blkif_cachep, blkif);
 }
 
@@ -484,7 +491,6 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
 static int xen_blkbk_remove(struct xenbus_device *dev)
 {
 	struct backend_info *be = dev_get_drvdata(&dev->dev);
-	unsigned int i;
 
 	pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id);
 
@@ -499,12 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
 
 	dev_set_drvdata(&dev->dev, NULL);
 
-	if (be->blkif) {
+	if (be->blkif)
 		xen_blkif_disconnect(be->blkif);
-		for (i = 0; i < be->blkif->nr_rings; i++)
-			xen_blkif_put(be->blkif);
-	}
 
+	/* Put the reference we set in xen_blkif_alloc(). */
+	xen_blkif_put(be->blkif);
 	kfree(be->mode);
 	kfree(be);
 	return 0;
-- 
cgit 


From c31ecf6c126dbc7f30234eaf6c4a079649a38de7 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 18 Dec 2015 16:28:53 -0500
Subject: xen/blkfront: Fix crash if backend doesn't follow the right states.

We have split the setting up of all the resources in two steps:
1) talk_to_blkback  - which figures out the num_ring_pages (from
   the default value of zero), sets up shadow and so
2) blkfront_connect - does the real part of filling out the
   internal structures.

The problem is if we bypass the 1) step and go straight to 2)
and call blkfront_setup_indirect where we use the macro
BLK_RING_SIZE - which returns an negative value (because
sz is zero  - since num_ring_pages is zero - since it has never
been set).

We can fix this by making sure that we always have called
talk_to_blkback before going to blkfront_connect.

Or we could set in blkfront_probe info->nr_ring_pages = 1
to have a default value. But that looks odd - as we haven't
actually negotiated any ring size.

This patch changes XenbusStateConnected state to detect if
we haven't done the initial handshake - and if so continue
on as if were in XenbusStateInitWait state.

We also roll the error recovery (freeing the structure) into
talk_to_blkback error path - which is safe since that function
is only called from blkback_changed.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index f3d0d4758641..8a8dc91c39f7 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1867,6 +1867,9 @@ again:
  destroy_blkring:
 	blkif_free(info, 0);
 
+	kfree(info);
+	dev_set_drvdata(&dev->dev, NULL);
+
 	return err;
 }
 
@@ -2453,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev,
 	case XenbusStateInitWait:
 		if (dev->state != XenbusStateInitialising)
 			break;
-		if (talk_to_blkback(dev, info)) {
-			kfree(info);
-			dev_set_drvdata(&dev->dev, NULL);
+		if (talk_to_blkback(dev, info))
 			break;
-		}
 	case XenbusStateInitialising:
 	case XenbusStateInitialised:
 	case XenbusStateReconfiguring:
@@ -2466,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev,
 		break;
 
 	case XenbusStateConnected:
+		if (dev->state != XenbusStateInitialised) {
+			if (talk_to_blkback(dev, info))
+				break;
+		}
 		blkfront_connect(info);
 		break;
 
-- 
cgit 


From 9e35fdcb9cd54e381135310aae3d9bbb23cecda3 Mon Sep 17 00:00:00 2001
From: Zhu Yanjun <zyjzyj2000@gmail.com>
Date: Tue, 5 Jan 2016 18:39:04 +0800
Subject: mtip32xx: restrict variables visible in current code module

The modified variables are only used in the file mtip32xx.c.
As such, the static keyword is inserted to define that object
to be only visible to the current code module during compilation.

Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 3457ac8c03e2..2641a98f2246 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -104,9 +104,9 @@
 /* Device instance number, incremented each time a device is probed. */
 static int instance;
 
-struct list_head online_list;
-struct list_head removing_list;
-spinlock_t dev_lock;
+static struct list_head online_list;
+static struct list_head removing_list;
+static spinlock_t dev_lock;
 
 /*
  * Global variable used to hold the major block device number
-- 
cgit 


From e93d12ae3be91d18b2a46deebb90a3f516db3d3c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 13 Jan 2016 23:04:08 +0100
Subject: null_blk: use sector_div instead of do_div

Dividing a sector_t number should be done using sector_div rather than do_div
to optimize the 32-bit sector_t case, and with the latest do_div optimizations,
we now get a compile-time warning for this:

arch/arm/include/asm/div64.h:32:95: note: expected 'uint64_t * {aka long long unsigned int *}' but argument is of type 'sector_t * {aka long unsigned int *}'
drivers/block/null_blk.c:521:81: warning: comparison of distinct pointer types lacks a cast

This changes the newly added code to use sector_div. It is a simplified version
of the original patch, as Linus Torvalds pointed out that we should not be using
an expensive division function in the first place.

This version was suggested by Matias Bjorling.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Matias Bjorling <m@bjorling.me>
Fixes: b2b7e00148a2 ("null_blk: register as a LightNVM device")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 5c8ba5484d86..a96ce7ec1ca5 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -510,17 +510,17 @@ static int null_lnvm_id(struct request_queue *q, struct nvm_id *id)
 	id->ppaf.ch_offset = 56;
 	id->ppaf.ch_len = 8;
 
-	do_div(size, bs); /* convert size to pages */
-	do_div(size, 256); /* concert size to pgs pr blk */
+	sector_div(size, bs); /* convert size to pages */
+	size >>= 8; /* concert size to pgs pr blk */
 	grp = &id->groups[0];
 	grp->mtype = 0;
 	grp->fmtype = 0;
 	grp->num_ch = 1;
 	grp->num_pg = 256;
 	blksize = size;
-	do_div(size, (1 << 16));
+	size >>= 16;
 	grp->num_lun = size + 1;
-	do_div(blksize, grp->num_lun);
+	sector_div(blksize, grp->num_lun);
 	grp->num_blk = blksize;
 	grp->num_pln = 1;
 
-- 
cgit