From ea2c18e1044e9ed8f780c965c50432060ab0e355 Mon Sep 17 00:00:00 2001 From: Masato Suzuki Date: Tue, 30 Oct 2018 16:14:05 +0900 Subject: null_blk: Add conventional zone configuration for zoned support Allow the creation of conventional zones by adding the zone_nr_conv configuration attribute. This new attribute is used only for zoned devices and indicates the number of conventional zones to create. The default value is 0. Since host-managed zoned block devices must always have at least one sequential zone, if the value of zone_nr_conv is larger than or equal to the total number of zones of the device nr_zones, zone_nr_conv is automatically changed to nr_zones - 1. Reviewed-by: Matias Bjorling Reviewed-by: Damien Le Moal Signed-off-by: Masato Suzuki Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 1 + drivers/block/null_blk_main.c | 7 +++++++ drivers/block/null_blk_zoned.c | 27 ++++++++++++++++++++++++++- 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 7685df43f1ef..b3df2793e7cd 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -49,6 +49,7 @@ struct nullb_device { unsigned long completion_nsec; /* time in ns to complete a request */ unsigned long cache_size; /* disk cache size in MB */ unsigned long zone_size; /* zone size in MB if device is zoned */ + unsigned int zone_nr_conv; /* number of conventional zones */ unsigned int submit_queues; /* number of submission queues */ unsigned int home_node; /* home node for the device */ unsigned int queue_mode; /* block interface */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 09339203dfba..9c045bee0985 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -188,6 +188,10 @@ static unsigned long g_zone_size = 256; module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); +static unsigned int g_zone_nr_conv; +module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); +MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. 
Default: 0"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -293,6 +297,7 @@ NULLB_DEVICE_ATTR(mbps, uint); NULLB_DEVICE_ATTR(cache_size, ulong); NULLB_DEVICE_ATTR(zoned, bool); NULLB_DEVICE_ATTR(zone_size, ulong); +NULLB_DEVICE_ATTR(zone_nr_conv, uint); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -407,6 +412,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_badblocks, &nullb_device_attr_zoned, &nullb_device_attr_zone_size, + &nullb_device_attr_zone_nr_conv, NULL, }; @@ -520,6 +526,7 @@ static struct nullb_device *null_alloc_dev(void) dev->use_per_node_hctx = g_use_per_node_hctx; dev->zoned = g_zoned; dev->zone_size = g_zone_size; + dev->zone_nr_conv = g_zone_nr_conv; return dev; } diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index c0b0e4a3fa8f..5d1c261a2cfd 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -29,7 +29,25 @@ int null_zone_init(struct nullb_device *dev) if (!dev->zones) return -ENOMEM; - for (i = 0; i < dev->nr_zones; i++) { + if (dev->zone_nr_conv >= dev->nr_zones) { + dev->zone_nr_conv = dev->nr_zones - 1; + pr_info("null_blk: changed the number of conventional zones to %u", + dev->zone_nr_conv); + } + + for (i = 0; i < dev->zone_nr_conv; i++) { + struct blk_zone *zone = &dev->zones[i]; + + zone->start = sector; + zone->len = dev->zone_size_sects; + zone->wp = zone->start + zone->len; + zone->type = BLK_ZONE_TYPE_CONVENTIONAL; + zone->cond = BLK_ZONE_COND_NOT_WP; + + sector += dev->zone_size_sects; + } + + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { struct blk_zone *zone = &dev->zones[i]; zone->start = zone->wp = sector; @@ -98,6 +116,8 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector, if (zone->wp == zone->start + zone->len) zone->cond = BLK_ZONE_COND_FULL; break; + case BLK_ZONE_COND_NOT_WP: + break; default: /* Invalid zone condition */ cmd->error = BLK_STS_IOERR; @@ -111,6 +131,11 @@ void null_zone_reset(struct nullb_cmd *cmd, sector_t sector) unsigned int zno = null_zone_no(dev, sector); struct blk_zone *zone = &dev->zones[zno]; + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { + cmd->error = BLK_STS_IOERR; + return; + } + zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; } -- cgit From fa182a1fa97dff56cda742f22d17d666420cd27f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Oct 2018 08:54:23 -0600 Subject: sunvdc: convert to blk-mq Convert from the old request_fn style driver to blk-mq. 
Cc: David Miller Reviewed-by: Hannes Reinecke Tested-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/sunvdc.c | 149 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 98 insertions(+), 51 deletions(-) (limited to 'drivers') diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index b54fa6726303..95cb4ea8e402 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -66,9 +66,10 @@ struct vdc_port { u64 max_xfer_size; u32 vdisk_block_size; + u32 drain; u64 ldc_timeout; - struct timer_list ldc_reset_timer; + struct delayed_work ldc_reset_timer_work; struct work_struct ldc_reset_work; /* The server fills these in for us in the disk attribute @@ -80,12 +81,14 @@ struct vdc_port { u8 vdisk_mtype; u32 vdisk_phys_blksz; + struct blk_mq_tag_set tag_set; + char disk_name[32]; }; static void vdc_ldc_reset(struct vdc_port *port); static void vdc_ldc_reset_work(struct work_struct *work); -static void vdc_ldc_reset_timer(struct timer_list *t); +static void vdc_ldc_reset_timer_work(struct work_struct *work); static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio) { @@ -175,11 +178,8 @@ static void vdc_blk_queue_start(struct vdc_port *port) * handshake completes, so check for initial handshake before we've * allocated a disk. */ - if (port->disk && blk_queue_stopped(port->disk->queue) && - vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) { - blk_start_queue(port->disk->queue); - } - + if (port->disk && vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) + blk_mq_start_hw_queues(port->disk->queue); } static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for) @@ -197,7 +197,7 @@ static void vdc_handshake_complete(struct vio_driver_state *vio) { struct vdc_port *port = to_vdc_port(vio); - del_timer(&port->ldc_reset_timer); + cancel_delayed_work(&port->ldc_reset_timer_work); vdc_finish(vio, 0, WAITING_FOR_LINK_UP); vdc_blk_queue_start(port); } @@ -320,7 +320,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, rqe->req = NULL; - __blk_end_request(req, (desc->status ? BLK_STS_IOERR : 0), desc->size); + blk_mq_end_request(req, desc->status ? BLK_STS_IOERR : 0); vdc_blk_queue_start(port); } @@ -525,29 +525,40 @@ static int __send_request(struct request *req) return err; } -static void do_vdc_request(struct request_queue *rq) +static blk_status_t vdc_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct request *req; + struct vdc_port *port = hctx->queue->queuedata; + struct vio_dring_state *dr; + unsigned long flags; - while ((req = blk_peek_request(rq)) != NULL) { - struct vdc_port *port; - struct vio_dring_state *dr; + dr = &port->vio.drings[VIO_DRIVER_TX_RING]; - port = req->rq_disk->private_data; - dr = &port->vio.drings[VIO_DRIVER_TX_RING]; - if (unlikely(vdc_tx_dring_avail(dr) < 1)) - goto wait; + blk_mq_start_request(bd->rq); - blk_start_request(req); + spin_lock_irqsave(&port->vio.lock, flags); - if (__send_request(req) < 0) { - blk_requeue_request(rq, req); -wait: - /* Avoid pointless unplugs. 
*/ - blk_stop_queue(rq); - break; - } + /* + * Doing drain, just end the request in error + */ + if (unlikely(port->drain)) { + spin_unlock_irqrestore(&port->vio.lock, flags); + return BLK_STS_IOERR; } + + if (unlikely(vdc_tx_dring_avail(dr) < 1)) { + spin_unlock_irqrestore(&port->vio.lock, flags); + blk_mq_stop_hw_queue(hctx); + return BLK_STS_DEV_RESOURCE; + } + + if (__send_request(bd->rq) < 0) { + spin_unlock_irqrestore(&port->vio.lock, flags); + return BLK_STS_IOERR; + } + + spin_unlock_irqrestore(&port->vio.lock, flags); + return BLK_STS_OK; } static int generic_request(struct vdc_port *port, u8 op, void *buf, int len) @@ -759,6 +770,32 @@ static void vdc_port_down(struct vdc_port *port) vio_ldc_free(&port->vio); } +static const struct blk_mq_ops vdc_mq_ops = { + .queue_rq = vdc_queue_rq, +}; + +static void cleanup_queue(struct request_queue *q) +{ + struct vdc_port *port = q->queuedata; + + blk_cleanup_queue(q); + blk_mq_free_tag_set(&port->tag_set); +} + +static struct request_queue *init_queue(struct vdc_port *port) +{ + struct request_queue *q; + int ret; + + q = blk_mq_init_sq_queue(&port->tag_set, &vdc_mq_ops, VDC_TX_RING_SIZE, + BLK_MQ_F_SHOULD_MERGE); + if (IS_ERR(q)) + return q; + + q->queuedata = port; + return q; +} + static int probe_disk(struct vdc_port *port) { struct request_queue *q; @@ -796,17 +833,17 @@ static int probe_disk(struct vdc_port *port) (u64)geom.num_sec); } - q = blk_init_queue(do_vdc_request, &port->vio.lock); - if (!q) { + q = init_queue(port); + if (IS_ERR(q)) { printk(KERN_ERR PFX "%s: Could not allocate queue.\n", port->vio.name); - return -ENOMEM; + return PTR_ERR(q); } g = alloc_disk(1 << PARTITION_SHIFT); if (!g) { printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", port->vio.name); - blk_cleanup_queue(q); + cleanup_queue(q); return -ENOMEM; } @@ -981,7 +1018,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) */ ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL); port->ldc_timeout = ldc_timeout ? 
*ldc_timeout : 0; - timer_setup(&port->ldc_reset_timer, vdc_ldc_reset_timer, 0); + INIT_DELAYED_WORK(&port->ldc_reset_timer_work, vdc_ldc_reset_timer_work); INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work); err = vio_driver_init(&port->vio, vdev, VDEV_DISK, @@ -1034,18 +1071,14 @@ static int vdc_port_remove(struct vio_dev *vdev) struct vdc_port *port = dev_get_drvdata(&vdev->dev); if (port) { - unsigned long flags; - - spin_lock_irqsave(&port->vio.lock, flags); - blk_stop_queue(port->disk->queue); - spin_unlock_irqrestore(&port->vio.lock, flags); + blk_mq_stop_hw_queues(port->disk->queue); flush_work(&port->ldc_reset_work); - del_timer_sync(&port->ldc_reset_timer); + cancel_delayed_work_sync(&port->ldc_reset_timer_work); del_timer_sync(&port->vio.timer); del_gendisk(port->disk); - blk_cleanup_queue(port->disk->queue); + cleanup_queue(port->disk->queue); put_disk(port->disk); port->disk = NULL; @@ -1080,32 +1113,46 @@ static void vdc_requeue_inflight(struct vdc_port *port) } rqe->req = NULL; - blk_requeue_request(port->disk->queue, req); + blk_mq_requeue_request(req, false); } } static void vdc_queue_drain(struct vdc_port *port) { - struct request *req; + struct request_queue *q = port->disk->queue; - while ((req = blk_fetch_request(port->disk->queue)) != NULL) - __blk_end_request_all(req, BLK_STS_IOERR); + /* + * Mark the queue as draining, then freeze/quiesce to ensure + * that all existing requests are seen in ->queue_rq() and killed + */ + port->drain = 1; + spin_unlock_irq(&port->vio.lock); + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + spin_lock_irq(&port->vio.lock); + port->drain = 0; + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); } -static void vdc_ldc_reset_timer(struct timer_list *t) +static void vdc_ldc_reset_timer_work(struct work_struct *work) { - struct vdc_port *port = from_timer(port, t, ldc_reset_timer); - struct vio_driver_state *vio = &port->vio; - unsigned long flags; + struct vdc_port *port; + struct vio_driver_state *vio; - spin_lock_irqsave(&vio->lock, flags); + port = container_of(work, struct vdc_port, ldc_reset_timer_work.work); + vio = &port->vio; + + spin_lock_irq(&vio->lock); if (!(port->vio.hs_state & VIO_HS_COMPLETE)) { pr_warn(PFX "%s ldc down %llu seconds, draining queue\n", port->disk_name, port->ldc_timeout); vdc_queue_drain(port); vdc_blk_queue_start(port); } - spin_unlock_irqrestore(&vio->lock, flags); + spin_unlock_irq(&vio->lock); } static void vdc_ldc_reset_work(struct work_struct *work) @@ -1129,7 +1176,7 @@ static void vdc_ldc_reset(struct vdc_port *port) assert_spin_locked(&port->vio.lock); pr_warn(PFX "%s ldc link reset\n", port->disk_name); - blk_stop_queue(port->disk->queue); + blk_mq_stop_hw_queues(port->disk->queue); vdc_requeue_inflight(port); vdc_port_down(port); @@ -1146,7 +1193,7 @@ static void vdc_ldc_reset(struct vdc_port *port) } if (port->ldc_timeout) - mod_timer(&port->ldc_reset_timer, + mod_delayed_work(system_wq, &port->ldc_reset_timer_work, round_jiffies(jiffies + HZ * port->ldc_timeout)); mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ)); return; -- cgit From db1142a83b4caf73b6510d0efd6ef3ab7a508105 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Oct 2018 08:58:05 -0600 Subject: ms_block: convert to blk-mq Straight forward conversion, room for optimization in how everything is punted to a work queue. Also looks plenty racy all over the map, with the state changes. I fixed a bunch of them up while doing the conversion, but there are surely more. 
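Since the memstick state machine must run in process context, the
->queue_rq() handler does not drive the hardware directly: it parks the
single in-flight request in msb->req under the queue lock and punts the
actual I/O to the driver's work queue, returning BLK_STS_DEV_RESOURCE
while a request is already pending. A minimal sketch of that shape
(simplified: in the real msb_queue_rq() below, queuedata is the
memstick_dev and msb is fetched via memstick_get_drvdata()):

    static blk_status_t sketch_queue_rq(struct blk_mq_hw_ctx *hctx,
                                        const struct blk_mq_queue_data *bd)
    {
        struct msb_data *msb = hctx->queue->queuedata; /* simplified */

        spin_lock_irq(&msb->q_lock);
        if (msb->req) {
            /* one request in flight at a time; blk-mq retries later */
            spin_unlock_irq(&msb->q_lock);
            return BLK_STS_DEV_RESOURCE;
        }
        blk_mq_start_request(bd->rq);
        msb->req = bd->rq;
        if (!msb->io_queue_stopped)
            queue_work(msb->io_queue, &msb->io_work);
        spin_unlock_irq(&msb->q_lock);
        return BLK_STS_OK;
    }

The work handler (msb_io_work()) then completes the request with
blk_update_request()/__blk_mq_end_request() and clears msb->req.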
Cc: Maxim Levitsky Reviewed-by: Hannes Reinecke Tested-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/memstick/core/ms_block.c | 110 ++++++++++++++++++++++----------------- drivers/memstick/core/ms_block.h | 1 + 2 files changed, 62 insertions(+), 49 deletions(-) (limited to 'drivers') diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 8a02f11076f9..ee0be66a9a03 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -15,7 +15,7 @@ #define pr_fmt(fmt) DRIVER_NAME ": " fmt #include -#include +#include #include #include #include @@ -1873,69 +1873,65 @@ static void msb_io_work(struct work_struct *work) struct msb_data *msb = container_of(work, struct msb_data, io_work); int page, error, len; sector_t lba; - unsigned long flags; struct scatterlist *sg = msb->prealloc_sg; + struct request *req; dbg_verbose("IO: work started"); while (1) { - spin_lock_irqsave(&msb->q_lock, flags); + spin_lock_irq(&msb->q_lock); if (msb->need_flush_cache) { msb->need_flush_cache = false; - spin_unlock_irqrestore(&msb->q_lock, flags); + spin_unlock_irq(&msb->q_lock); msb_cache_flush(msb); continue; } - if (!msb->req) { - msb->req = blk_fetch_request(msb->queue); - if (!msb->req) { - dbg_verbose("IO: no more requests exiting"); - spin_unlock_irqrestore(&msb->q_lock, flags); - return; - } + req = msb->req; + if (!req) { + dbg_verbose("IO: no more requests exiting"); + spin_unlock_irq(&msb->q_lock); + return; } - spin_unlock_irqrestore(&msb->q_lock, flags); - - /* If card was removed meanwhile */ - if (!msb->req) - return; + spin_unlock_irq(&msb->q_lock); /* process the request */ dbg_verbose("IO: processing new request"); - blk_rq_map_sg(msb->queue, msb->req, sg); + blk_rq_map_sg(msb->queue, req, sg); - lba = blk_rq_pos(msb->req); + lba = blk_rq_pos(req); sector_div(lba, msb->page_size / 512); page = sector_div(lba, msb->pages_in_block); if (rq_data_dir(msb->req) == READ) error = msb_do_read_request(msb, lba, page, sg, - blk_rq_bytes(msb->req), &len); + blk_rq_bytes(req), &len); else error = msb_do_write_request(msb, lba, page, sg, - blk_rq_bytes(msb->req), &len); - - spin_lock_irqsave(&msb->q_lock, flags); + blk_rq_bytes(req), &len); - if (len) - if (!__blk_end_request(msb->req, BLK_STS_OK, len)) - msb->req = NULL; + if (len && !blk_update_request(req, BLK_STS_OK, len)) { + __blk_mq_end_request(req, BLK_STS_OK); + spin_lock_irq(&msb->q_lock); + msb->req = NULL; + spin_unlock_irq(&msb->q_lock); + } if (error && msb->req) { blk_status_t ret = errno_to_blk_status(error); + dbg_verbose("IO: ending one sector of the request with error"); - if (!__blk_end_request(msb->req, ret, msb->page_size)) - msb->req = NULL; + blk_mq_end_request(req, ret); + spin_lock_irq(&msb->q_lock); + msb->req = NULL; + spin_unlock_irq(&msb->q_lock); } if (msb->req) dbg_verbose("IO: request still pending"); - - spin_unlock_irqrestore(&msb->q_lock, flags); } } @@ -2002,29 +1998,40 @@ static int msb_bd_getgeo(struct block_device *bdev, return 0; } -static void msb_submit_req(struct request_queue *q) +static blk_status_t msb_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct memstick_dev *card = q->queuedata; + struct memstick_dev *card = hctx->queue->queuedata; struct msb_data *msb = memstick_get_drvdata(card); - struct request *req = NULL; + struct request *req = bd->rq; dbg_verbose("Submit request"); + spin_lock_irq(&msb->q_lock); + if (msb->card_dead) { dbg("Refusing requests on removed card"); 
WARN_ON(!msb->io_queue_stopped); - while ((req = blk_fetch_request(q)) != NULL) - __blk_end_request_all(req, BLK_STS_IOERR); - return; + spin_unlock_irq(&msb->q_lock); + blk_mq_start_request(req); + return BLK_STS_IOERR; } - if (msb->req) - return; + if (msb->req) { + spin_unlock_irq(&msb->q_lock); + return BLK_STS_DEV_RESOURCE; + } + + blk_mq_start_request(req); + msb->req = req; if (!msb->io_queue_stopped) queue_work(msb->io_queue, &msb->io_work); + + spin_unlock_irq(&msb->q_lock); + return BLK_STS_OK; } static int msb_check_card(struct memstick_dev *card) @@ -2040,21 +2047,20 @@ static void msb_stop(struct memstick_dev *card) dbg("Stopping all msblock IO"); + blk_mq_stop_hw_queues(msb->queue); spin_lock_irqsave(&msb->q_lock, flags); - blk_stop_queue(msb->queue); msb->io_queue_stopped = true; spin_unlock_irqrestore(&msb->q_lock, flags); del_timer_sync(&msb->cache_flush_timer); flush_workqueue(msb->io_queue); + spin_lock_irqsave(&msb->q_lock, flags); if (msb->req) { - spin_lock_irqsave(&msb->q_lock, flags); - blk_requeue_request(msb->queue, msb->req); + blk_mq_requeue_request(msb->req, false); msb->req = NULL; - spin_unlock_irqrestore(&msb->q_lock, flags); } - + spin_unlock_irqrestore(&msb->q_lock, flags); } static void msb_start(struct memstick_dev *card) @@ -2077,9 +2083,7 @@ static void msb_start(struct memstick_dev *card) msb->need_flush_cache = true; msb->io_queue_stopped = false; - spin_lock_irqsave(&msb->q_lock, flags); - blk_start_queue(msb->queue); - spin_unlock_irqrestore(&msb->q_lock, flags); + blk_mq_start_hw_queues(msb->queue); queue_work(msb->io_queue, &msb->io_work); @@ -2092,10 +2096,15 @@ static const struct block_device_operations msb_bdops = { .owner = THIS_MODULE }; +static const struct blk_mq_ops msb_mq_ops = { + .queue_rq = msb_queue_rq, +}; + /* Registers the block device */ static int msb_init_disk(struct memstick_dev *card) { struct msb_data *msb = memstick_get_drvdata(card); + struct blk_mq_tag_set *set = NULL; int rc; unsigned long capacity; @@ -2112,9 +2121,11 @@ static int msb_init_disk(struct memstick_dev *card) goto out_release_id; } - msb->queue = blk_init_queue(msb_submit_req, &msb->q_lock); - if (!msb->queue) { - rc = -ENOMEM; + msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &msb_mq_ops, 2, + BLK_MQ_F_SHOULD_MERGE); + if (IS_ERR(msb->queue)) { + rc = PTR_ERR(msb->queue); + msb->queue = NULL; goto out_put_disk; } @@ -2202,12 +2213,13 @@ static void msb_remove(struct memstick_dev *card) /* Take care of unhandled + new requests from now on */ spin_lock_irqsave(&msb->q_lock, flags); msb->card_dead = true; - blk_start_queue(msb->queue); spin_unlock_irqrestore(&msb->q_lock, flags); + blk_mq_start_hw_queues(msb->queue); /* Remove the disk */ del_gendisk(msb->disk); blk_cleanup_queue(msb->queue); + blk_mq_free_tag_set(&msb->tag_set); msb->queue = NULL; mutex_lock(&msb_disk_lock); diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h index 53962c3b21df..9ba84e0ced63 100644 --- a/drivers/memstick/core/ms_block.h +++ b/drivers/memstick/core/ms_block.h @@ -152,6 +152,7 @@ struct msb_data { struct gendisk *disk; struct request_queue *queue; spinlock_t q_lock; + struct blk_mq_tag_set tag_set; struct hd_geometry geometry; struct attribute_group attr_group; struct request *req; -- cgit From d0be12274dad242271fb2055275d10b67a0d7649 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Oct 2018 09:00:02 -0600 Subject: mspro_block: convert to blk-mq Straight forward conversion, there's room for improvement. 
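The main mechanical change is in partial completion: the legacy
__blk_end_request(req, err, bytes) both accounted the completed bytes
and, once nothing was left, finished the request. Under blk-mq the two
steps become explicit, roughly:

    /* returns true while the request still has bytes outstanding */
    chunk = blk_update_request(msb->block_req,
                               errno_to_blk_status(error), t_len);
    if (!chunk) {
        __blk_mq_end_request(msb->block_req,
                             errno_to_blk_status(error));
        msb->block_req = NULL;
    }

With that, msb->block_req doubles as the "request in flight" flag that
the old has_request bit used to carry.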
Reviewed-by: Hannes Reinecke Tested-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/memstick/core/mspro_block.c | 121 ++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 55 deletions(-) (limited to 'drivers') diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 0cd30dcb6801..aba50ec98b4d 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -12,7 +12,7 @@ * */ -#include +#include #include #include #include @@ -142,6 +142,7 @@ struct mspro_block_data { struct gendisk *disk; struct request_queue *queue; struct request *block_req; + struct blk_mq_tag_set tag_set; spinlock_t q_lock; unsigned short page_size; @@ -152,7 +153,6 @@ struct mspro_block_data { unsigned char system; unsigned char read_only:1, eject:1, - has_request:1, data_dir:1, active:1; unsigned char transfer_cmd; @@ -694,13 +694,12 @@ static void h_mspro_block_setup_cmd(struct memstick_dev *card, u64 offset, /*** Data transfer ***/ -static int mspro_block_issue_req(struct memstick_dev *card, int chunk) +static int mspro_block_issue_req(struct memstick_dev *card, bool chunk) { struct mspro_block_data *msb = memstick_get_drvdata(card); u64 t_off; unsigned int count; -try_again: while (chunk) { msb->current_page = 0; msb->current_seg = 0; @@ -709,9 +708,17 @@ try_again: msb->req_sg); if (!msb->seg_count) { - chunk = __blk_end_request_cur(msb->block_req, - BLK_STS_RESOURCE); - continue; + unsigned int bytes = blk_rq_cur_bytes(msb->block_req); + + chunk = blk_update_request(msb->block_req, + BLK_STS_RESOURCE, + bytes); + if (chunk) + continue; + __blk_mq_end_request(msb->block_req, + BLK_STS_RESOURCE); + msb->block_req = NULL; + break; } t_off = blk_rq_pos(msb->block_req); @@ -729,30 +736,22 @@ try_again: return 0; } - dev_dbg(&card->dev, "blk_fetch\n"); - msb->block_req = blk_fetch_request(msb->queue); - if (!msb->block_req) { - dev_dbg(&card->dev, "issue end\n"); - return -EAGAIN; - } - - dev_dbg(&card->dev, "trying again\n"); - chunk = 1; - goto try_again; + return 1; } static int mspro_block_complete_req(struct memstick_dev *card, int error) { struct mspro_block_data *msb = memstick_get_drvdata(card); - int chunk, cnt; + int cnt; + bool chunk; unsigned int t_len = 0; unsigned long flags; spin_lock_irqsave(&msb->q_lock, flags); - dev_dbg(&card->dev, "complete %d, %d\n", msb->has_request ? 1 : 0, + dev_dbg(&card->dev, "complete %d, %d\n", msb->block_req ? 
1 : 0, error); - if (msb->has_request) { + if (msb->block_req) { /* Nothing to do - not really an error */ if (error == -EAGAIN) error = 0; @@ -777,15 +776,17 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error) if (error && !t_len) t_len = blk_rq_cur_bytes(msb->block_req); - chunk = __blk_end_request(msb->block_req, + chunk = blk_update_request(msb->block_req, errno_to_blk_status(error), t_len); - - error = mspro_block_issue_req(card, chunk); - - if (!error) - goto out; - else - msb->has_request = 0; + if (chunk) { + error = mspro_block_issue_req(card, chunk); + if (!error) + goto out; + } else { + __blk_mq_end_request(msb->block_req, + errno_to_blk_status(error)); + msb->block_req = NULL; + } } else { if (!error) error = -EAGAIN; @@ -806,8 +807,8 @@ static void mspro_block_stop(struct memstick_dev *card) while (1) { spin_lock_irqsave(&msb->q_lock, flags); - if (!msb->has_request) { - blk_stop_queue(msb->queue); + if (!msb->block_req) { + blk_mq_stop_hw_queues(msb->queue); rc = 1; } spin_unlock_irqrestore(&msb->q_lock, flags); @@ -822,32 +823,37 @@ static void mspro_block_stop(struct memstick_dev *card) static void mspro_block_start(struct memstick_dev *card) { struct mspro_block_data *msb = memstick_get_drvdata(card); - unsigned long flags; - spin_lock_irqsave(&msb->q_lock, flags); - blk_start_queue(msb->queue); - spin_unlock_irqrestore(&msb->q_lock, flags); + blk_mq_start_hw_queues(msb->queue); } -static void mspro_block_submit_req(struct request_queue *q) +static blk_status_t mspro_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct memstick_dev *card = q->queuedata; + struct memstick_dev *card = hctx->queue->queuedata; struct mspro_block_data *msb = memstick_get_drvdata(card); - struct request *req = NULL; - if (msb->has_request) - return; + spin_lock_irq(&msb->q_lock); - if (msb->eject) { - while ((req = blk_fetch_request(q)) != NULL) - __blk_end_request_all(req, BLK_STS_IOERR); + if (msb->block_req) { + spin_unlock_irq(&msb->q_lock); + return BLK_STS_DEV_RESOURCE; + } - return; + if (msb->eject) { + spin_unlock_irq(&msb->q_lock); + blk_mq_start_request(bd->rq); + return BLK_STS_IOERR; } - msb->has_request = 1; - if (mspro_block_issue_req(card, 0)) - msb->has_request = 0; + msb->block_req = bd->rq; + blk_mq_start_request(bd->rq); + + if (mspro_block_issue_req(card, true)) + msb->block_req = NULL; + + spin_unlock_irq(&msb->q_lock); + return BLK_STS_OK; } /*** Initialization ***/ @@ -1167,6 +1173,10 @@ static int mspro_block_init_card(struct memstick_dev *card) } +static const struct blk_mq_ops mspro_mq_ops = { + .queue_rq = mspro_queue_rq, +}; + static int mspro_block_init_disk(struct memstick_dev *card) { struct mspro_block_data *msb = memstick_get_drvdata(card); @@ -1206,9 +1216,11 @@ static int mspro_block_init_disk(struct memstick_dev *card) goto out_release_id; } - msb->queue = blk_init_queue(mspro_block_submit_req, &msb->q_lock); - if (!msb->queue) { - rc = -ENOMEM; + msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &mspro_mq_ops, 2, + BLK_MQ_F_SHOULD_MERGE); + if (IS_ERR(msb->queue)) { + rc = PTR_ERR(msb->queue); + msb->queue = NULL; goto out_put_disk; } @@ -1318,13 +1330,14 @@ static void mspro_block_remove(struct memstick_dev *card) spin_lock_irqsave(&msb->q_lock, flags); msb->eject = 1; - blk_start_queue(msb->queue); spin_unlock_irqrestore(&msb->q_lock, flags); + blk_mq_start_hw_queues(msb->queue); del_gendisk(msb->disk); dev_dbg(&card->dev, "mspro block remove\n"); blk_cleanup_queue(msb->queue); + 
blk_mq_free_tag_set(&msb->tag_set); msb->queue = NULL; sysfs_remove_group(&card->dev.kobj, &msb->attr_group); @@ -1344,8 +1357,9 @@ static int mspro_block_suspend(struct memstick_dev *card, pm_message_t state) struct mspro_block_data *msb = memstick_get_drvdata(card); unsigned long flags; + blk_mq_stop_hw_queues(msb->queue); + spin_lock_irqsave(&msb->q_lock, flags); - blk_stop_queue(msb->queue); msb->active = 0; spin_unlock_irqrestore(&msb->q_lock, flags); @@ -1355,7 +1369,6 @@ static int mspro_block_suspend(struct memstick_dev *card, pm_message_t state) static int mspro_block_resume(struct memstick_dev *card) { struct mspro_block_data *msb = memstick_get_drvdata(card); - unsigned long flags; int rc = 0; #ifdef CONFIG_MEMSTICK_UNSAFE_RESUME @@ -1401,9 +1414,7 @@ out_unlock: #endif /* CONFIG_MEMSTICK_UNSAFE_RESUME */ - spin_lock_irqsave(&msb->q_lock, flags); - blk_start_queue(msb->queue); - spin_unlock_irqrestore(&msb->q_lock, flags); + blk_mq_start_hw_queues(msb->queue); return rc; } -- cgit From 600335205b8d162891b5ef2e32343f5b8020efd8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 26 Oct 2018 09:53:52 -0600 Subject: ide: convert to blk-mq ide-disk and ide-cd tested as working just fine, ide-tape and ide-floppy haven't. But the latter don't require changes, so they should work without issue. Add helper function to insert a request from a work queue, since we cannot invoke the blk-mq request insertion from IRQ context. Cc: David Miller Reviewed-by: Hannes Reinecke Tested-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/ide/ide-atapi.c | 25 ++++--- drivers/ide/ide-cd.c | 175 ++++++++++++++++++++++++++---------------------- drivers/ide/ide-disk.c | 5 +- drivers/ide/ide-io.c | 100 +++++++++++++++------------ drivers/ide/ide-park.c | 4 +- drivers/ide/ide-pm.c | 28 ++------ drivers/ide/ide-probe.c | 68 ++++++++++++++----- 7 files changed, 228 insertions(+), 177 deletions(-) (limited to 'drivers') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 8b2b72b93885..33210bc67618 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -172,8 +172,8 @@ EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd); void ide_prep_sense(ide_drive_t *drive, struct request *rq) { struct request_sense *sense = &drive->sense_data; - struct request *sense_rq = drive->sense_rq; - struct scsi_request *req = scsi_req(sense_rq); + struct request *sense_rq; + struct scsi_request *req; unsigned int cmd_len, sense_len; int err; @@ -196,9 +196,16 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) if (ata_sense_request(rq) || drive->sense_rq_armed) return; + sense_rq = drive->sense_rq; + if (!sense_rq) { + sense_rq = blk_mq_alloc_request(drive->queue, REQ_OP_DRV_IN, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + drive->sense_rq = sense_rq; + } + req = scsi_req(sense_rq); + memset(sense, 0, sizeof(*sense)); - blk_rq_init(rq->q, sense_rq); scsi_req_init(req); err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len, @@ -207,6 +214,8 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) if (printk_ratelimit()) printk(KERN_WARNING PFX "%s: failed to map sense " "buffer\n", drive->name); + blk_mq_free_request(sense_rq); + drive->sense_rq = NULL; return; } @@ -226,6 +235,8 @@ EXPORT_SYMBOL_GPL(ide_prep_sense); int ide_queue_sense_rq(ide_drive_t *drive, void *special) { + struct request *sense_rq = drive->sense_rq; + /* deferred failure from ide_prep_sense() */ if (!drive->sense_rq_armed) { printk(KERN_WARNING PFX "%s: error queuing a sense 
request\n", @@ -233,12 +244,12 @@ int ide_queue_sense_rq(ide_drive_t *drive, void *special) return -ENOMEM; } - drive->sense_rq->special = special; + sense_rq->special = special; drive->sense_rq_armed = false; drive->hwif->rq = NULL; - elv_add_request(drive->queue, drive->sense_rq, ELEVATOR_INSERT_FRONT); + ide_insert_request_head(drive, sense_rq); return 0; } EXPORT_SYMBOL_GPL(ide_queue_sense_rq); @@ -270,10 +281,8 @@ void ide_retry_pc(ide_drive_t *drive) */ drive->hwif->rq = NULL; ide_requeue_and_plug(drive, failed_rq); - if (ide_queue_sense_rq(drive, pc)) { - blk_start_request(failed_rq); + if (ide_queue_sense_rq(drive, pc)) ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(failed_rq)); - } } EXPORT_SYMBOL_GPL(ide_retry_pc); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index f9b59d41813f..4ecaf2ace4cb 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -258,11 +258,22 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq) /* * take a breather */ - blk_delay_queue(drive->queue, 1); + blk_mq_requeue_request(rq, false); + blk_mq_delay_kick_requeue_list(drive->queue, 1); return 1; } } +static void ide_cd_free_sense(ide_drive_t *drive) +{ + if (!drive->sense_rq) + return; + + blk_mq_free_request(drive->sense_rq); + drive->sense_rq = NULL; + drive->sense_rq_armed = false; +} + /** * Returns: * 0: if the request should be continued. @@ -516,6 +527,82 @@ static bool ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) return false; } +/* standard prep_rq_fn that builds 10 byte cmds */ +static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) +{ + int hard_sect = queue_logical_block_size(q); + long block = (long)blk_rq_pos(rq) / (hard_sect >> 9); + unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9); + struct scsi_request *req = scsi_req(rq); + + if (rq_data_dir(rq) == READ) + req->cmd[0] = GPCMD_READ_10; + else + req->cmd[0] = GPCMD_WRITE_10; + + /* + * fill in lba + */ + req->cmd[2] = (block >> 24) & 0xff; + req->cmd[3] = (block >> 16) & 0xff; + req->cmd[4] = (block >> 8) & 0xff; + req->cmd[5] = block & 0xff; + + /* + * and transfer length + */ + req->cmd[7] = (blocks >> 8) & 0xff; + req->cmd[8] = blocks & 0xff; + req->cmd_len = 10; + return BLKPREP_OK; +} + +/* + * Most of the SCSI commands are supported directly by ATAPI devices. + * This transform handles the few exceptions. 
+ */ +static int ide_cdrom_prep_pc(struct request *rq) +{ + u8 *c = scsi_req(rq)->cmd; + + /* transform 6-byte read/write commands to the 10-byte version */ + if (c[0] == READ_6 || c[0] == WRITE_6) { + c[8] = c[4]; + c[5] = c[3]; + c[4] = c[2]; + c[3] = c[1] & 0x1f; + c[2] = 0; + c[1] &= 0xe0; + c[0] += (READ_10 - READ_6); + scsi_req(rq)->cmd_len = 10; + return BLKPREP_OK; + } + + /* + * it's silly to pretend we understand 6-byte sense commands, just + * reject with ILLEGAL_REQUEST and the caller should take the + * appropriate action + */ + if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) { + scsi_req(rq)->result = ILLEGAL_REQUEST; + return BLKPREP_KILL; + } + + return BLKPREP_OK; +} + +static int ide_cdrom_prep_fn(ide_drive_t *drive, struct request *rq) +{ + if (!blk_rq_is_passthrough(rq)) { + scsi_req_init(scsi_req(rq)); + + return ide_cdrom_prep_fs(drive->queue, rq); + } else if (blk_rq_is_scsi(rq)) + return ide_cdrom_prep_pc(rq); + + return 0; +} + static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; @@ -675,7 +762,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) out_end: if (blk_rq_is_scsi(rq) && rc == 0) { scsi_req(rq)->resid_len = 0; - blk_end_request_all(rq, BLK_STS_OK); + blk_mq_end_request(rq, BLK_STS_OK); hwif->rq = NULL; } else { if (sense && uptodate) @@ -705,6 +792,8 @@ out_end: if (sense && rc == 2) ide_error(drive, "request sense failure", stat); } + + ide_cd_free_sense(drive); return ide_stopped; } @@ -729,7 +818,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) * We may be retrying this request after an error. Fix up any * weirdness which might be present in the request packet. */ - q->prep_rq_fn(q, rq); + ide_cdrom_prep_fn(drive, rq); } /* fs requests *must* be hardware frame aligned */ @@ -1323,82 +1412,6 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive) return nslots; } -/* standard prep_rq_fn that builds 10 byte cmds */ -static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) -{ - int hard_sect = queue_logical_block_size(q); - long block = (long)blk_rq_pos(rq) / (hard_sect >> 9); - unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9); - struct scsi_request *req = scsi_req(rq); - - q->initialize_rq_fn(rq); - - if (rq_data_dir(rq) == READ) - req->cmd[0] = GPCMD_READ_10; - else - req->cmd[0] = GPCMD_WRITE_10; - - /* - * fill in lba - */ - req->cmd[2] = (block >> 24) & 0xff; - req->cmd[3] = (block >> 16) & 0xff; - req->cmd[4] = (block >> 8) & 0xff; - req->cmd[5] = block & 0xff; - - /* - * and transfer length - */ - req->cmd[7] = (blocks >> 8) & 0xff; - req->cmd[8] = blocks & 0xff; - req->cmd_len = 10; - return BLKPREP_OK; -} - -/* - * Most of the SCSI commands are supported directly by ATAPI devices. - * This transform handles the few exceptions. 
- */ -static int ide_cdrom_prep_pc(struct request *rq) -{ - u8 *c = scsi_req(rq)->cmd; - - /* transform 6-byte read/write commands to the 10-byte version */ - if (c[0] == READ_6 || c[0] == WRITE_6) { - c[8] = c[4]; - c[5] = c[3]; - c[4] = c[2]; - c[3] = c[1] & 0x1f; - c[2] = 0; - c[1] &= 0xe0; - c[0] += (READ_10 - READ_6); - scsi_req(rq)->cmd_len = 10; - return BLKPREP_OK; - } - - /* - * it's silly to pretend we understand 6-byte sense commands, just - * reject with ILLEGAL_REQUEST and the caller should take the - * appropriate action - */ - if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) { - scsi_req(rq)->result = ILLEGAL_REQUEST; - return BLKPREP_KILL; - } - - return BLKPREP_OK; -} - -static int ide_cdrom_prep_fn(struct request_queue *q, struct request *rq) -{ - if (!blk_rq_is_passthrough(rq)) - return ide_cdrom_prep_fs(q, rq); - else if (blk_rq_is_scsi(rq)) - return ide_cdrom_prep_pc(rq); - - return 0; -} - struct cd_list_entry { const char *id_model; const char *id_firmware; @@ -1508,7 +1521,7 @@ static int ide_cdrom_setup(ide_drive_t *drive) ide_debug_log(IDE_DBG_PROBE, "enter"); - blk_queue_prep_rq(q, ide_cdrom_prep_fn); + drive->prep_rq = ide_cdrom_prep_fn; blk_queue_dma_alignment(q, 31); blk_queue_update_dma_pad(q, 15); @@ -1569,7 +1582,7 @@ static void ide_cd_release(struct device *dev) if (devinfo->handle == drive) unregister_cdrom(devinfo); drive->driver_data = NULL; - blk_queue_prep_rq(drive->queue, NULL); + drive->prep_rq = NULL; g->private_data = NULL; put_disk(g); kfree(info); diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index e3b4e659082d..f8567c8c9dd1 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -427,9 +427,8 @@ static void ide_disk_unlock_native_capacity(ide_drive_t *drive) drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */ } -static int idedisk_prep_fn(struct request_queue *q, struct request *rq) +static int idedisk_prep_fn(ide_drive_t *drive, struct request *rq) { - ide_drive_t *drive = q->queuedata; struct ide_cmd *cmd; if (req_op(rq) != REQ_OP_FLUSH) @@ -548,7 +547,7 @@ static void update_flush(ide_drive_t *drive) if (barrier) { wc = true; - blk_queue_prep_rq(drive->queue, idedisk_prep_fn); + drive->prep_rq = idedisk_prep_fn; } } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 0d93e0cfbeaf..5093c605c91c 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -67,7 +67,15 @@ int ide_end_rq(ide_drive_t *drive, struct request *rq, blk_status_t error, ide_dma_on(drive); } - return blk_end_request(rq, error, nr_bytes); + if (!blk_update_request(rq, error, nr_bytes)) { + if (rq == drive->sense_rq) + drive->sense_rq = NULL; + + __blk_mq_end_request(rq, error); + return 0; + } + + return 1; } EXPORT_SYMBOL_GPL(ide_end_rq); @@ -307,8 +315,6 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) { ide_startstop_t startstop; - BUG_ON(!(rq->rq_flags & RQF_STARTED)); - #ifdef DEBUG printk("%s: start_request: current=0x%08lx\n", drive->hwif->name, (unsigned long) rq); @@ -320,6 +326,9 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) goto kill_rq; } + if (drive->prep_rq && drive->prep_rq(drive, rq)) + return ide_stopped; + if (ata_pm_request(rq)) ide_check_pm_state(drive, rq); @@ -430,44 +439,38 @@ static inline void ide_unlock_host(struct ide_host *host) } } -static void __ide_requeue_and_plug(struct request_queue *q, struct request *rq) -{ - if (rq) - blk_requeue_request(q, rq); - if (rq || blk_peek_request(q)) { - /* Use 3ms as that was the old plug 
delay */ - blk_delay_queue(q, 3); - } -} - void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; - unsigned long flags; - spin_lock_irqsave(q->queue_lock, flags); - __ide_requeue_and_plug(q, rq); - spin_unlock_irqrestore(q->queue_lock, flags); + /* Use 3ms as that was the old plug delay */ + if (rq) { + blk_mq_requeue_request(rq, false); + blk_mq_delay_kick_requeue_list(q, 3); + } else + blk_mq_delay_run_hw_queue(q->queue_hw_ctx[0], 3); } /* * Issue a new request to a device. */ -void do_ide_request(struct request_queue *q) +blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - ide_drive_t *drive = q->queuedata; + ide_drive_t *drive = hctx->queue->queuedata; ide_hwif_t *hwif = drive->hwif; struct ide_host *host = hwif->host; struct request *rq = NULL; ide_startstop_t startstop; - spin_unlock_irq(q->queue_lock); - /* HLD do_request() callback might sleep, make sure it's okay */ might_sleep(); if (ide_lock_host(host, hwif)) - goto plug_device_2; + return BLK_STS_DEV_RESOURCE; + + rq = bd->rq; + blk_mq_start_request(rq); spin_lock_irq(&hwif->lock); @@ -503,21 +506,16 @@ repeat: hwif->cur_dev = drive; drive->dev_flags &= ~(IDE_DFLAG_SLEEPING | IDE_DFLAG_PARKED); - spin_unlock_irq(&hwif->lock); - spin_lock_irq(q->queue_lock); /* * we know that the queue isn't empty, but this can happen * if the q->prep_rq_fn() decides to kill a request */ - if (!rq) - rq = blk_fetch_request(drive->queue); - - spin_unlock_irq(q->queue_lock); - spin_lock_irq(&hwif->lock); - if (!rq) { - ide_unlock_port(hwif); - goto out; + rq = bd->rq; + if (!rq) { + ide_unlock_port(hwif); + goto out; + } } /* @@ -551,23 +549,24 @@ repeat: if (startstop == ide_stopped) { rq = hwif->rq; hwif->rq = NULL; - goto repeat; + if (rq) + goto repeat; + ide_unlock_port(hwif); + goto out; } - } else - goto plug_device; + } else { +plug_device: + spin_unlock_irq(&hwif->lock); + ide_unlock_host(host); + ide_requeue_and_plug(drive, rq); + return BLK_STS_OK; + } + out: spin_unlock_irq(&hwif->lock); if (rq == NULL) ide_unlock_host(host); - spin_lock_irq(q->queue_lock); - return; - -plug_device: - spin_unlock_irq(&hwif->lock); - ide_unlock_host(host); -plug_device_2: - spin_lock_irq(q->queue_lock); - __ide_requeue_and_plug(q, rq); + return BLK_STS_OK; } static int drive_is_ready(ide_drive_t *drive) @@ -887,3 +886,16 @@ void ide_pad_transfer(ide_drive_t *drive, int write, int len) } } EXPORT_SYMBOL_GPL(ide_pad_transfer); + +void ide_insert_request_head(ide_drive_t *drive, struct request *rq) +{ + ide_hwif_t *hwif = drive->hwif; + unsigned long flags; + + spin_lock_irqsave(&hwif->lock, flags); + list_add_tail(&rq->queuelist, &drive->rq_list); + spin_unlock_irqrestore(&hwif->lock, flags); + + kblockd_schedule_work(&drive->rq_work); +} +EXPORT_SYMBOL_GPL(ide_insert_request_head); diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 622f0edb3945..de9e85cf74d1 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -27,7 +27,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) spin_unlock_irq(&hwif->lock); if (start_queue) - blk_run_queue(q); + blk_mq_run_hw_queues(q, true); return; } spin_unlock_irq(&hwif->lock); @@ -54,7 +54,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) scsi_req(rq)->cmd[0] = REQ_UNPARK_HEADS; scsi_req(rq)->cmd_len = 1; ide_req(rq)->type = ATA_PRIV_MISC; - elv_add_request(q, rq, ELEVATOR_INSERT_FRONT); + ide_insert_request_head(drive, rq); out: return; diff --git 
a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 59217aa1d1fb..ea10507e5190 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -40,32 +40,20 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) return ret; } -static void ide_end_sync_rq(struct request *rq, blk_status_t error) -{ - complete(rq->end_io_data); -} - static int ide_pm_execute_rq(struct request *rq) { struct request_queue *q = rq->q; - DECLARE_COMPLETION_ONSTACK(wait); - - rq->end_io_data = &wait; - rq->end_io = ide_end_sync_rq; spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { rq->rq_flags |= RQF_QUIET; scsi_req(rq)->result = -ENXIO; - __blk_end_request_all(rq, BLK_STS_OK); spin_unlock_irq(q->queue_lock); + blk_mq_end_request(rq, BLK_STS_OK); return -ENXIO; } - __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT); - __blk_run_queue_uncond(q); spin_unlock_irq(q->queue_lock); - - wait_for_completion_io(&wait); + blk_execute_rq(q, NULL, rq, true); return scsi_req(rq)->result ? -EIO : 0; } @@ -79,6 +67,8 @@ int generic_ide_resume(struct device *dev) struct ide_pm_state rqpm; int err; + blk_mq_start_stopped_hw_queues(drive->queue, true); + if (ide_port_acpi(hwif)) { /* call ACPI _PS0 / _STM only once */ if ((drive->dn & 1) == 0 || pair == NULL) { @@ -226,15 +216,14 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) #endif spin_lock_irqsave(q->queue_lock, flags); if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) - blk_stop_queue(q); + blk_mq_stop_hw_queues(q); else drive->dev_flags &= ~IDE_DFLAG_BLOCKED; spin_unlock_irqrestore(q->queue_lock, flags); drive->hwif->rq = NULL; - if (blk_end_request(rq, BLK_STS_OK, 0)) - BUG(); + blk_mq_end_request(rq, BLK_STS_OK); } void ide_check_pm_state(ide_drive_t *drive, struct request *rq) @@ -260,7 +249,6 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq) ide_hwif_t *hwif = drive->hwif; const struct ide_tp_ops *tp_ops = hwif->tp_ops; struct request_queue *q = drive->queue; - unsigned long flags; int rc; #ifdef DEBUG_PM printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name); @@ -274,8 +262,6 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq) if (rc) printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); - spin_lock_irqsave(q->queue_lock, flags); - blk_start_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); + blk_mq_start_hw_queues(q); } } diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 3b75a7b7a284..40384838e439 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -750,6 +750,11 @@ static void ide_initialize_rq(struct request *rq) req->sreq.sense = req->sense; } +static const struct blk_mq_ops ide_mq_ops = { + .queue_rq = ide_queue_rq, + .initialize_rq_fn = ide_initialize_rq, +}; + /* * init request queue */ @@ -759,6 +764,7 @@ static int ide_init_queue(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; int max_sectors = 256; int max_sg_entries = PRD_ENTRIES; + struct blk_mq_tag_set *set; /* * Our default set up assumes the normal IDE case, @@ -767,19 +773,26 @@ static int ide_init_queue(ide_drive_t *drive) * limits and LBA48 we could raise it but as yet * do not. 
*/ - q = blk_alloc_queue_node(GFP_KERNEL, hwif_to_node(hwif), NULL); - if (!q) + + set = &drive->tag_set; + set->ops = &ide_mq_ops; + set->nr_hw_queues = 1; + set->queue_depth = 32; + set->reserved_tags = 1; + set->cmd_size = sizeof(struct ide_request); + set->numa_node = hwif_to_node(hwif); + set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; + if (blk_mq_alloc_tag_set(set)) return 1; - q->request_fn = do_ide_request; - q->initialize_rq_fn = ide_initialize_rq; - q->cmd_size = sizeof(struct ide_request); - blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q); - if (blk_init_allocated_queue(q) < 0) { - blk_cleanup_queue(q); + q = blk_mq_init_queue(set); + if (IS_ERR(q)) { + blk_mq_free_tag_set(set); return 1; } + blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q); + q->queuedata = drive; blk_queue_segment_boundary(q, 0xffff); @@ -965,8 +978,12 @@ static void drive_release_dev (struct device *dev) ide_proc_unregister_device(drive); + if (drive->sense_rq) + blk_mq_free_request(drive->sense_rq); + blk_cleanup_queue(drive->queue); drive->queue = NULL; + blk_mq_free_tag_set(&drive->tag_set); drive->dev_flags &= ~IDE_DFLAG_PRESENT; @@ -1133,6 +1150,28 @@ static void ide_port_cable_detect(ide_hwif_t *hwif) } } +/* + * Deferred request list insertion handler + */ +static void drive_rq_insert_work(struct work_struct *work) +{ + ide_drive_t *drive = container_of(work, ide_drive_t, rq_work); + ide_hwif_t *hwif = drive->hwif; + struct request *rq; + LIST_HEAD(list); + + spin_lock_irq(&hwif->lock); + if (!list_empty(&drive->rq_list)) + list_splice_init(&drive->rq_list, &list); + spin_unlock_irq(&hwif->lock); + + while (!list_empty(&list)) { + rq = list_first_entry(&list, struct request, queuelist); + list_del_init(&rq->queuelist); + blk_execute_rq_nowait(drive->queue, rq->rq_disk, rq, true, NULL); + } +} + static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR, IDE2_MAJOR, IDE3_MAJOR, IDE4_MAJOR, IDE5_MAJOR, IDE6_MAJOR, IDE7_MAJOR, IDE8_MAJOR, IDE9_MAJOR }; @@ -1145,12 +1184,10 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif) ide_port_for_each_dev(i, drive, hwif) { u8 j = (hwif->index * MAX_DRIVES) + i; u16 *saved_id = drive->id; - struct request *saved_sense_rq = drive->sense_rq; memset(drive, 0, sizeof(*drive)); memset(saved_id, 0, SECTOR_SIZE); drive->id = saved_id; - drive->sense_rq = saved_sense_rq; drive->media = ide_disk; drive->select = (i << 4) | ATA_DEVICE_OBS; @@ -1166,6 +1203,9 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif) INIT_LIST_HEAD(&drive->list); init_completion(&drive->gendev_rel_comp); + + INIT_WORK(&drive->rq_work, drive_rq_insert_work); + INIT_LIST_HEAD(&drive->rq_list); } } @@ -1255,7 +1295,6 @@ static void ide_port_free_devices(ide_hwif_t *hwif) int i; ide_port_for_each_dev(i, drive, hwif) { - kfree(drive->sense_rq); kfree(drive->id); kfree(drive); } @@ -1283,17 +1322,10 @@ static int ide_port_alloc_devices(ide_hwif_t *hwif, int node) if (drive->id == NULL) goto out_free_drive; - drive->sense_rq = kmalloc(sizeof(struct request) + - sizeof(struct ide_request), GFP_KERNEL); - if (!drive->sense_rq) - goto out_free_id; - hwif->devices[i] = drive; } return 0; -out_free_id: - kfree(drive->id); out_free_drive: kfree(drive); out_nomem: -- cgit From 3a7ea2c483a53fc89e336f69c6ee1d7defe00811 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 29 Oct 2018 10:17:28 -0600 Subject: scsi: provide mq_ops->busy() hook Only the SCSI legacy path provides a way to check if target is currently busy, provide the same for the MQ path. 
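The hook mirrors the existing scsi_lld_busy() check: blk-mq gains a
bool (*busy)(struct request_queue *) operation that callers such as
blk_lld_busy() (used by dm-mpath to decide whether to push back I/O)
can route through. On the block side this presumably reduces to
something like the following sketch, not the verbatim block-layer
change:

    bool blk_lld_busy(struct request_queue *q)
    {
        if (q->mq_ops && q->mq_ops->busy)
            return q->mq_ops->busy(q);
        return false;   /* legacy ->lld_busy_fn path omitted here */
    }

SCSI then simply wires the new operation to its existing helper, as
done below.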
Cc: linux-scsi@vger.kernel.org Reviewed-by: Hannes Reinecke Tested-by: Ming Lei Reviewed-by: Omar Sandoval Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index c7fccbb8f554..8b0345924a92 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1675,6 +1675,11 @@ static int scsi_lld_busy(struct request_queue *q) return 0; } +static bool scsi_mq_lld_busy(struct request_queue *q) +{ + return scsi_lld_busy(q); +} + /* * Kill a request for a dead device */ @@ -2326,6 +2331,7 @@ static const struct blk_mq_ops scsi_mq_ops = { .init_request = scsi_mq_init_request, .exit_request = scsi_mq_exit_request, .initialize_rq_fn = scsi_initialize_rq, + .busy = scsi_mq_lld_busy, .map_queues = scsi_map_queues, }; -- cgit From f664a3cc17b7d0a2bc3b3ab96181e1029b0ec0e6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Nov 2018 16:36:27 -0600 Subject: scsi: kill off the legacy IO path This removes the legacy (non-mq) IO path for SCSI. Cc: linux-scsi@vger.kernel.org Acked-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Tested-by: Ming Lei Reviewed-by: Omar Sandoval Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/scsi/Kconfig | 12 - drivers/scsi/cxlflash/main.c | 6 - drivers/scsi/hosts.c | 29 +- drivers/scsi/lpfc/lpfc_scsi.c | 2 +- drivers/scsi/qedi/qedi_main.c | 3 +- drivers/scsi/qla2xxx/qla_os.c | 30 +-- drivers/scsi/scsi.c | 5 +- drivers/scsi/scsi_debug.c | 3 +- drivers/scsi/scsi_error.c | 2 +- drivers/scsi/scsi_lib.c | 603 +++--------------------------------------- drivers/scsi/scsi_priv.h | 1 - drivers/scsi/scsi_scan.c | 10 +- drivers/scsi/scsi_sysfs.c | 8 +- drivers/scsi/ufs/ufshcd.c | 6 - 14 files changed, 70 insertions(+), 650 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index f07444d30b21..dfdc6940de2f 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -50,18 +50,6 @@ config SCSI_NETLINK default n depends on NET -config SCSI_MQ_DEFAULT - bool "SCSI: use blk-mq I/O path by default" - default y - depends on SCSI - ---help--- - This option enables the blk-mq based I/O path for SCSI devices by - default. With this option the scsi_mod.use_blk_mq module/boot - option defaults to Y, without it to N, but it can still be - overridden either way. - - If unsure say Y. 
- config SCSI_PROC_FS bool "legacy /proc/scsi/ support" depends on SCSI && PROC_FS diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 6637116529aa..abdc9eac4173 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -3088,12 +3088,6 @@ static ssize_t hwq_mode_store(struct device *dev, return -EINVAL; } - if ((mode == HWQ_MODE_TAG) && !shost_use_blk_mq(shost)) { - dev_info(cfgdev, "SCSI-MQ is not enabled, use a different " - "HWQ steering mode.\n"); - return -EINVAL; - } - afu->hwq_mode = mode; return count; diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index ea4b0bb0c1cd..cc71136ba300 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -222,18 +222,9 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev, if (error) goto fail; - if (shost_use_blk_mq(shost)) { - error = scsi_mq_setup_tags(shost); - if (error) - goto fail; - } else { - shost->bqt = blk_init_tags(shost->can_queue, - shost->hostt->tag_alloc_policy); - if (!shost->bqt) { - error = -ENOMEM; - goto fail; - } - } + error = scsi_mq_setup_tags(shost); + if (error) + goto fail; if (!shost->shost_gendev.parent) shost->shost_gendev.parent = dev ? dev : &platform_bus; @@ -309,8 +300,7 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev, pm_runtime_disable(&shost->shost_gendev); pm_runtime_set_suspended(&shost->shost_gendev); pm_runtime_put_noidle(&shost->shost_gendev); - if (shost_use_blk_mq(shost)) - scsi_mq_destroy_tags(shost); + scsi_mq_destroy_tags(shost); fail: return error; } @@ -344,13 +334,8 @@ static void scsi_host_dev_release(struct device *dev) kfree(dev_name(&shost->shost_dev)); } - if (shost_use_blk_mq(shost)) { - if (shost->tag_set.tags) - scsi_mq_destroy_tags(shost); - } else { - if (shost->bqt) - blk_free_tags(shost->bqt); - } + if (shost->tag_set.tags) + scsi_mq_destroy_tags(shost); kfree(shost->shost_data); @@ -472,8 +457,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) else shost->dma_boundary = 0xffffffff; - shost->use_blk_mq = scsi_use_blk_mq || shost->hostt->force_blk_mq; - device_initialize(&shost->shost_gendev); dev_set_name(&shost->shost_gendev, "host%d", shost->host_no); shost->shost_gendev.bus = &scsi_bus_type; diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 4fa6703a9ec9..baed2b891efb 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -3914,7 +3914,7 @@ int lpfc_sli4_scmd_to_wqidx_distr(struct lpfc_hba *phba, uint32_t tag; uint16_t hwq; - if (cmnd && shost_use_blk_mq(cmnd->device->host)) { + if (cmnd) { tag = blk_mq_unique_tag(cmnd->request); hwq = blk_mq_unique_tag_to_hwq(tag); diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index 105b0e4d7818..311eb22068e1 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -644,8 +644,7 @@ static struct qedi_ctx *qedi_host_alloc(struct pci_dev *pdev) qedi->max_active_conns = ISCSI_MAX_SESS_PER_HBA; qedi->max_sqes = QEDI_SQ_SIZE; - if (shost_use_blk_mq(shost)) - shost->nr_hw_queues = MIN_NUM_CPUS_MSIX(qedi); + shost->nr_hw_queues = MIN_NUM_CPUS_MSIX(qedi); pci_set_drvdata(pdev, qedi); diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 518f15141170..4ea9f2b4e04f 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -857,13 +857,9 @@ qla2xxx_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) } if (ha->mqenable) { - if 
(shost_use_blk_mq(vha->host)) { - tag = blk_mq_unique_tag(cmd->request); - hwq = blk_mq_unique_tag_to_hwq(tag); - qpair = ha->queue_pair_map[hwq]; - } else if (vha->vp_idx && vha->qpair) { - qpair = vha->qpair; - } + tag = blk_mq_unique_tag(cmd->request); + hwq = blk_mq_unique_tag_to_hwq(tag); + qpair = ha->queue_pair_map[hwq]; if (qpair) return qla2xxx_mqueuecommand(host, cmd, qpair); @@ -3153,7 +3149,7 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) goto probe_failed; } - if (ha->mqenable && shost_use_blk_mq(host)) { + if (ha->mqenable) { /* number of hardware queues supported by blk/scsi-mq*/ host->nr_hw_queues = ha->max_qpairs; @@ -3265,25 +3261,17 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) base_vha->mgmt_svr_loop_id, host->sg_tablesize); if (ha->mqenable) { - bool mq = false; bool startit = false; - if (QLA_TGT_MODE_ENABLED()) { - mq = true; + if (QLA_TGT_MODE_ENABLED()) startit = false; - } - if ((ql2x_ini_mode == QLA2XXX_INI_MODE_ENABLED) && - shost_use_blk_mq(host)) { - mq = true; + if (ql2x_ini_mode == QLA2XXX_INI_MODE_ENABLED) startit = true; - } - if (mq) { - /* Create start of day qpairs for Block MQ */ - for (i = 0; i < ha->max_qpairs; i++) - qla2xxx_create_qpair(base_vha, 5, 0, startit); - } + /* Create start of day qpairs for Block MQ */ + for (i = 0; i < ha->max_qpairs; i++) + qla2xxx_create_qpair(base_vha, 5, 0, startit); } if (ha->flags.running_gold_fw) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index fc1356d101b0..7675ff0ca2ea 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -780,11 +780,8 @@ MODULE_LICENSE("GPL"); module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels"); -#ifdef CONFIG_SCSI_MQ_DEFAULT +/* This should go away in the future, it doesn't do anything anymore */ bool scsi_use_blk_mq = true; -#else -bool scsi_use_blk_mq = false; -#endif module_param_named(use_blk_mq, scsi_use_blk_mq, bool, S_IWUSR | S_IRUGO); static int __init init_scsi(void) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 60bcc6df97a9..4740f1e9dd17 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -5881,8 +5881,7 @@ static int sdebug_driver_probe(struct device *dev) } /* Decide whether to tell scsi subsystem that we want mq */ /* Following should give the same answer for each host */ - if (shost_use_blk_mq(hpnt)) - hpnt->nr_hw_queues = submit_queues; + hpnt->nr_hw_queues = submit_queues; sdbg_host->shost = hpnt; *((struct sdebug_host_info **)hpnt->hostdata) = sdbg_host; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index c736d61b1648..fff128aa9ec2 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -308,7 +308,7 @@ enum blk_eh_timer_return scsi_times_out(struct request *req) * error handler. In that case we can return immediately as no * further action is required. 
*/ - if (req->q->mq_ops && !blk_mq_mark_complete(req)) + if (!blk_mq_mark_complete(req)) return rtn; if (scsi_abort_command(scmd) != SUCCESS) { set_host_byte(scmd, DID_TIME_OUT); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 8b0345924a92..651be30ba96a 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -168,8 +168,6 @@ static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd) static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy) { struct scsi_device *device = cmd->device; - struct request_queue *q = device->request_queue; - unsigned long flags; SCSI_LOG_MLQUEUE(1, scmd_printk(KERN_INFO, cmd, "Inserting command %p into mlqueue\n", cmd)); @@ -190,26 +188,20 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy) * before blk_cleanup_queue() finishes. */ cmd->result = 0; - if (q->mq_ops) { - /* - * Before a SCSI command is dispatched, - * get_device(&sdev->sdev_gendev) is called and the host, - * target and device busy counters are increased. Since - * requeuing a request causes these actions to be repeated and - * since scsi_device_unbusy() has already been called, - * put_device(&device->sdev_gendev) must still be called. Call - * put_device() after blk_mq_requeue_request() to avoid that - * removal of the SCSI device can start before requeueing has - * happened. - */ - blk_mq_requeue_request(cmd->request, true); - put_device(&device->sdev_gendev); - return; - } - spin_lock_irqsave(q->queue_lock, flags); - blk_requeue_request(q, cmd->request); - kblockd_schedule_work(&device->requeue_work); - spin_unlock_irqrestore(q->queue_lock, flags); + + /* + * Before a SCSI command is dispatched, + * get_device(&sdev->sdev_gendev) is called and the host, + * target and device busy counters are increased. Since + * requeuing a request causes these actions to be repeated and + * since scsi_device_unbusy() has already been called, + * put_device(&device->sdev_gendev) must still be called. Call + * put_device() after blk_mq_requeue_request() to avoid that + * removal of the SCSI device can start before requeueing has + * happened. + */ + blk_mq_requeue_request(cmd->request, true); + put_device(&device->sdev_gendev); } /* @@ -370,10 +362,7 @@ void scsi_device_unbusy(struct scsi_device *sdev) static void scsi_kick_queue(struct request_queue *q) { - if (q->mq_ops) - blk_mq_run_hw_queues(q, false); - else - blk_run_queue(q); + blk_mq_run_hw_queues(q, false); } /* @@ -534,10 +523,7 @@ static void scsi_run_queue(struct request_queue *q) if (!list_empty(&sdev->host->starved_list)) scsi_starved_list_run(sdev->host); - if (q->mq_ops) - blk_mq_run_hw_queues(q, false); - else - blk_run_queue(q); + blk_mq_run_hw_queues(q, false); } void scsi_requeue_run_queue(struct work_struct *work) @@ -550,42 +536,6 @@ void scsi_requeue_run_queue(struct work_struct *work) scsi_run_queue(q); } -/* - * Function: scsi_requeue_command() - * - * Purpose: Handle post-processing of completed commands. - * - * Arguments: q - queue to operate on - * cmd - command that may need to be requeued. - * - * Returns: Nothing - * - * Notes: After command completion, there may be blocks left - * over which weren't finished by the previous command - * this can be for a number of reasons - the main one is - * I/O errors in the middle of the request, in which case - * we need to request the blocks that come after the bad - * sector. - * Notes: Upon return, cmd is a stale pointer. 
- */ -static void scsi_requeue_command(struct request_queue *q, struct scsi_cmnd *cmd) -{ - struct scsi_device *sdev = cmd->device; - struct request *req = cmd->request; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - blk_unprep_request(req); - req->special = NULL; - scsi_put_command(cmd); - blk_requeue_request(q, req); - spin_unlock_irqrestore(q->queue_lock, flags); - - scsi_run_queue(q); - - put_device(&sdev->sdev_gendev); -} - void scsi_run_host_queues(struct Scsi_Host *shost) { struct scsi_device *sdev; @@ -626,42 +576,6 @@ static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd) scsi_del_cmd_from_list(cmd); } -/* - * Function: scsi_release_buffers() - * - * Purpose: Free resources allocate for a scsi_command. - * - * Arguments: cmd - command that we are bailing. - * - * Lock status: Assumed that no lock is held upon entry. - * - * Returns: Nothing - * - * Notes: In the event that an upper level driver rejects a - * command, we must release resources allocated during - * the __init_io() function. Primarily this would involve - * the scatter-gather table. - */ -static void scsi_release_buffers(struct scsi_cmnd *cmd) -{ - if (cmd->sdb.table.nents) - sg_free_table_chained(&cmd->sdb.table, false); - - memset(&cmd->sdb, 0, sizeof(cmd->sdb)); - - if (scsi_prot_sg_count(cmd)) - sg_free_table_chained(&cmd->prot_sdb->table, false); -} - -static void scsi_release_bidi_buffers(struct scsi_cmnd *cmd) -{ - struct scsi_data_buffer *bidi_sdb = cmd->request->next_rq->special; - - sg_free_table_chained(&bidi_sdb->table, false); - kmem_cache_free(scsi_sdb_cache, bidi_sdb); - cmd->request->next_rq->special = NULL; -} - /* Returns false when no more bytes to process, true if there are more */ static bool scsi_end_request(struct request *req, blk_status_t error, unsigned int bytes, unsigned int bidi_bytes) @@ -687,37 +601,22 @@ static bool scsi_end_request(struct request *req, blk_status_t error, destroy_rcu_head(&cmd->rcu); } - if (req->mq_ctx) { - /* - * In the MQ case the command gets freed by __blk_mq_end_request, - * so we have to do all cleanup that depends on it earlier. - * - * We also can't kick the queues from irq context, so we - * will have to defer it to a workqueue. - */ - scsi_mq_uninit_cmd(cmd); - - __blk_mq_end_request(req, error); - - if (scsi_target(sdev)->single_lun || - !list_empty(&sdev->host->starved_list)) - kblockd_schedule_work(&sdev->requeue_work); - else - blk_mq_run_hw_queues(q, true); - } else { - unsigned long flags; - - if (bidi_bytes) - scsi_release_bidi_buffers(cmd); - scsi_release_buffers(cmd); - scsi_put_command(cmd); + /* + * In the MQ case the command gets freed by __blk_mq_end_request, + * so we have to do all cleanup that depends on it earlier. + * + * We also can't kick the queues from irq context, so we + * will have to defer it to a workqueue. + */ + scsi_mq_uninit_cmd(cmd); - spin_lock_irqsave(q->queue_lock, flags); - blk_finish_request(req, error); - spin_unlock_irqrestore(q->queue_lock, flags); + __blk_mq_end_request(req, error); - scsi_run_queue(q); - } + if (scsi_target(sdev)->single_lun || + !list_empty(&sdev->host->starved_list)) + kblockd_schedule_work(&sdev->requeue_work); + else + blk_mq_run_hw_queues(q, true); put_device(&sdev->sdev_gendev); return false; @@ -766,13 +665,7 @@ static void scsi_io_completion_reprep(struct scsi_cmnd *cmd, struct request_queue *q) { /* A new command will be prepared and issued. */ - if (q->mq_ops) { - scsi_mq_requeue_cmd(cmd); - } else { - /* Unprep request and put it back at head of the queue. 
*/ - scsi_release_buffers(cmd); - scsi_requeue_command(q, cmd); - } + scsi_mq_requeue_cmd(cmd); } /* Helper for scsi_io_completion() when special action required. */ @@ -1147,9 +1040,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) */ int scsi_init_io(struct scsi_cmnd *cmd) { - struct scsi_device *sdev = cmd->device; struct request *rq = cmd->request; - bool is_mq = (rq->mq_ctx != NULL); int error = BLKPREP_KILL; if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq))) @@ -1160,17 +1051,6 @@ int scsi_init_io(struct scsi_cmnd *cmd) goto err_exit; if (blk_bidi_rq(rq)) { - if (!rq->q->mq_ops) { - struct scsi_data_buffer *bidi_sdb = - kmem_cache_zalloc(scsi_sdb_cache, GFP_ATOMIC); - if (!bidi_sdb) { - error = BLKPREP_DEFER; - goto err_exit; - } - - rq->next_rq->special = bidi_sdb; - } - error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special); if (error) goto err_exit; @@ -1210,14 +1090,7 @@ int scsi_init_io(struct scsi_cmnd *cmd) return BLKPREP_OK; err_exit: - if (is_mq) { - scsi_mq_free_sgtables(cmd); - } else { - scsi_release_buffers(cmd); - cmd->request->special = NULL; - scsi_put_command(cmd); - put_device(&sdev->sdev_gendev); - } + scsi_mq_free_sgtables(cmd); return error; } EXPORT_SYMBOL(scsi_init_io); @@ -1423,75 +1296,6 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) return ret; } -static int -scsi_prep_return(struct request_queue *q, struct request *req, int ret) -{ - struct scsi_device *sdev = q->queuedata; - - switch (ret) { - case BLKPREP_KILL: - case BLKPREP_INVALID: - scsi_req(req)->result = DID_NO_CONNECT << 16; - /* release the command and kill it */ - if (req->special) { - struct scsi_cmnd *cmd = req->special; - scsi_release_buffers(cmd); - scsi_put_command(cmd); - put_device(&sdev->sdev_gendev); - req->special = NULL; - } - break; - case BLKPREP_DEFER: - /* - * If we defer, the blk_peek_request() returns NULL, but the - * queue must be restarted, so we schedule a callback to happen - * shortly. - */ - if (atomic_read(&sdev->device_busy) == 0) - blk_delay_queue(q, SCSI_QUEUE_DELAY); - break; - default: - req->rq_flags |= RQF_DONTPREP; - } - - return ret; -} - -static int scsi_prep_fn(struct request_queue *q, struct request *req) -{ - struct scsi_device *sdev = q->queuedata; - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); - int ret; - - ret = scsi_prep_state_check(sdev, req); - if (ret != BLKPREP_OK) - goto out; - - if (!req->special) { - /* Bail if we can't get a reference to the device */ - if (unlikely(!get_device(&sdev->sdev_gendev))) { - ret = BLKPREP_DEFER; - goto out; - } - - scsi_init_command(sdev, cmd); - req->special = cmd; - } - - cmd->tag = req->tag; - cmd->request = req; - cmd->prot_op = SCSI_PROT_NORMAL; - - ret = scsi_setup_cmnd(sdev, req); -out: - return scsi_prep_return(q, req, ret); -} - -static void scsi_unprep_fn(struct request_queue *q, struct request *req) -{ - scsi_uninit_cmd(blk_mq_rq_to_pdu(req)); -} - /* * scsi_dev_queue_ready: if we can send requests to sdev, return 1 else * return 0. @@ -1511,14 +1315,8 @@ static inline int scsi_dev_queue_ready(struct request_queue *q, /* * unblock after device_blocked iterates to zero */ - if (atomic_dec_return(&sdev->device_blocked) > 0) { - /* - * For the MQ case we take care of this in the caller. 
- */ - if (!q->mq_ops) - blk_delay_queue(q, SCSI_QUEUE_DELAY); + if (atomic_dec_return(&sdev->device_blocked) > 0) goto out_dec; - } SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev, "unblocking device at zero depth\n")); } @@ -1653,13 +1451,13 @@ out_dec: * needs to return 'not busy'. Otherwise, request stacking drivers * may hold requests forever. */ -static int scsi_lld_busy(struct request_queue *q) +static bool scsi_mq_lld_busy(struct request_queue *q) { struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost; if (blk_queue_dying(q)) - return 0; + return false; shost = sdev->host; @@ -1670,48 +1468,9 @@ static int scsi_lld_busy(struct request_queue *q) * in SCSI layer. */ if (scsi_host_in_recovery(shost) || scsi_device_is_busy(sdev)) - return 1; - - return 0; -} - -static bool scsi_mq_lld_busy(struct request_queue *q) -{ - return scsi_lld_busy(q); -} - -/* - * Kill a request for a dead device - */ -static void scsi_kill_request(struct request *req, struct request_queue *q) -{ - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); - struct scsi_device *sdev; - struct scsi_target *starget; - struct Scsi_Host *shost; - - blk_start_request(req); - - scmd_printk(KERN_INFO, cmd, "killing request\n"); - - sdev = cmd->device; - starget = scsi_target(sdev); - shost = sdev->host; - scsi_init_cmd_errh(cmd); - cmd->result = DID_NO_CONNECT << 16; - atomic_inc(&cmd->device->iorequest_cnt); - - /* - * SCSI request completion path will do scsi_device_unbusy(), - * bump busy counts. To bump the counters, we need to dance - * with the locks as normal issue path does. - */ - atomic_inc(&sdev->device_busy); - atomic_inc(&shost->host_busy); - if (starget->can_queue > 0) - atomic_inc(&starget->target_busy); + return true; - blk_complete_request(req); + return false; } static void scsi_softirq_done(struct request *rq) @@ -1834,158 +1593,6 @@ static int scsi_dispatch_cmd(struct scsi_cmnd *cmd) return 0; } -/** - * scsi_done - Invoke completion on finished SCSI command. - * @cmd: The SCSI Command for which a low-level device driver (LLDD) gives - * ownership back to SCSI Core -- i.e. the LLDD has finished with it. - * - * Description: This function is the mid-level's (SCSI Core) interrupt routine, - * which regains ownership of the SCSI command (de facto) from a LLDD, and - * calls blk_complete_request() for further processing. - * - * This function is interrupt context safe. - */ -static void scsi_done(struct scsi_cmnd *cmd) -{ - trace_scsi_dispatch_cmd_done(cmd); - blk_complete_request(cmd->request); -} - -/* - * Function: scsi_request_fn() - * - * Purpose: Main strategy routine for SCSI. - * - * Arguments: q - Pointer to actual queue. - * - * Returns: Nothing - * - * Lock status: request queue lock assumed to be held when called. - * - * Note: See sd_zbc.c sd_zbc_write_lock_zone() for write order - * protection for ZBC disks. - */ -static void scsi_request_fn(struct request_queue *q) - __releases(q->queue_lock) - __acquires(q->queue_lock) -{ - struct scsi_device *sdev = q->queuedata; - struct Scsi_Host *shost; - struct scsi_cmnd *cmd; - struct request *req; - - /* - * To start with, we keep looping until the queue is empty, or until - * the host is no longer able to accept any more requests. - */ - shost = sdev->host; - for (;;) { - int rtn; - /* - * get next queueable request. We do this early to make sure - * that the request is fully prepared even if we cannot - * accept it. 
- */ - req = blk_peek_request(q); - if (!req) - break; - - if (unlikely(!scsi_device_online(sdev))) { - sdev_printk(KERN_ERR, sdev, - "rejecting I/O to offline device\n"); - scsi_kill_request(req, q); - continue; - } - - if (!scsi_dev_queue_ready(q, sdev)) - break; - - /* - * Remove the request from the request list. - */ - if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req))) - blk_start_request(req); - - spin_unlock_irq(q->queue_lock); - cmd = blk_mq_rq_to_pdu(req); - if (cmd != req->special) { - printk(KERN_CRIT "impossible request in %s.\n" - "please mail a stack trace to " - "linux-scsi@vger.kernel.org\n", - __func__); - blk_dump_rq_flags(req, "foo"); - BUG(); - } - - /* - * We hit this when the driver is using a host wide - * tag map. For device level tag maps the queue_depth check - * in the device ready fn would prevent us from trying - * to allocate a tag. Since the map is a shared host resource - * we add the dev to the starved list so it eventually gets - * a run when a tag is freed. - */ - if (blk_queue_tagged(q) && !(req->rq_flags & RQF_QUEUED)) { - spin_lock_irq(shost->host_lock); - if (list_empty(&sdev->starved_entry)) - list_add_tail(&sdev->starved_entry, - &shost->starved_list); - spin_unlock_irq(shost->host_lock); - goto not_ready; - } - - if (!scsi_target_queue_ready(shost, sdev)) - goto not_ready; - - if (!scsi_host_queue_ready(q, shost, sdev)) - goto host_not_ready; - - if (sdev->simple_tags) - cmd->flags |= SCMD_TAGGED; - else - cmd->flags &= ~SCMD_TAGGED; - - /* - * Finally, initialize any error handling parameters, and set up - * the timers for timeouts. - */ - scsi_init_cmd_errh(cmd); - - /* - * Dispatch the command to the low-level driver. - */ - cmd->scsi_done = scsi_done; - rtn = scsi_dispatch_cmd(cmd); - if (rtn) { - scsi_queue_insert(cmd, rtn); - spin_lock_irq(q->queue_lock); - goto out_delay; - } - spin_lock_irq(q->queue_lock); - } - - return; - - host_not_ready: - if (scsi_target(sdev)->can_queue > 0) - atomic_dec(&scsi_target(sdev)->target_busy); - not_ready: - /* - * lock q, handle tag, requeue req, and decrement device_busy. We - * must return with queue_lock held. - * - * Decrementing device_busy without checking it is OK, as all such - * cases (host limits or settings) should run the queue at some - * later time. 
- */ - spin_lock_irq(q->queue_lock); - blk_requeue_request(q, req); - atomic_dec(&sdev->device_busy); -out_delay: - if (!atomic_read(&sdev->device_busy) && !scsi_device_blocked(sdev)) - blk_delay_queue(q, SCSI_QUEUE_DELAY); -} - static inline blk_status_t prep_to_mq(int ret) { switch (ret) { @@ -2248,77 +1855,6 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q) } EXPORT_SYMBOL_GPL(__scsi_init_queue); -static int scsi_old_init_rq(struct request_queue *q, struct request *rq, - gfp_t gfp) -{ - struct Scsi_Host *shost = q->rq_alloc_data; - const bool unchecked_isa_dma = shost->unchecked_isa_dma; - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); - - memset(cmd, 0, sizeof(*cmd)); - - if (unchecked_isa_dma) - cmd->flags |= SCMD_UNCHECKED_ISA_DMA; - cmd->sense_buffer = scsi_alloc_sense_buffer(unchecked_isa_dma, gfp, - NUMA_NO_NODE); - if (!cmd->sense_buffer) - goto fail; - cmd->req.sense = cmd->sense_buffer; - - if (scsi_host_get_prot(shost) >= SHOST_DIX_TYPE0_PROTECTION) { - cmd->prot_sdb = kmem_cache_zalloc(scsi_sdb_cache, gfp); - if (!cmd->prot_sdb) - goto fail_free_sense; - } - - return 0; - -fail_free_sense: - scsi_free_sense_buffer(unchecked_isa_dma, cmd->sense_buffer); -fail: - return -ENOMEM; -} - -static void scsi_old_exit_rq(struct request_queue *q, struct request *rq) -{ - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); - - if (cmd->prot_sdb) - kmem_cache_free(scsi_sdb_cache, cmd->prot_sdb); - scsi_free_sense_buffer(cmd->flags & SCMD_UNCHECKED_ISA_DMA, - cmd->sense_buffer); -} - -struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev) -{ - struct Scsi_Host *shost = sdev->host; - struct request_queue *q; - - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); - if (!q) - return NULL; - q->cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size; - q->rq_alloc_data = shost; - q->request_fn = scsi_request_fn; - q->init_rq_fn = scsi_old_init_rq; - q->exit_rq_fn = scsi_old_exit_rq; - q->initialize_rq_fn = scsi_initialize_rq; - - if (blk_init_allocated_queue(q) < 0) { - blk_cleanup_queue(q); - return NULL; - } - - __scsi_init_queue(shost, q); - blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q); - blk_queue_prep_rq(q, scsi_prep_fn); - blk_queue_unprep_rq(q, scsi_unprep_fn); - blk_queue_softirq_done(q, scsi_softirq_done); - blk_queue_rq_timed_out(q, scsi_times_out); - blk_queue_lld_busy(q, scsi_lld_busy); - return q; -} - static const struct blk_mq_ops scsi_mq_ops = { .get_budget = scsi_mq_get_budget, .put_budget = scsi_mq_put_budget, @@ -2386,10 +1922,7 @@ struct scsi_device *scsi_device_from_queue(struct request_queue *q) { struct scsi_device *sdev = NULL; - if (q->mq_ops) { - if (q->mq_ops == &scsi_mq_ops) - sdev = q->queuedata; - } else if (q->request_fn == scsi_request_fn) + if (q->mq_ops == &scsi_mq_ops) sdev = q->queuedata; if (!sdev || !get_device(&sdev->sdev_gendev)) sdev = NULL; @@ -2992,39 +2525,6 @@ void sdev_evt_send_simple(struct scsi_device *sdev, } EXPORT_SYMBOL_GPL(sdev_evt_send_simple); -/** - * scsi_request_fn_active() - number of kernel threads inside scsi_request_fn() - * @sdev: SCSI device to count the number of scsi_request_fn() callers for. 
- */ -static int scsi_request_fn_active(struct scsi_device *sdev) -{ - struct request_queue *q = sdev->request_queue; - int request_fn_active; - - WARN_ON_ONCE(sdev->host->use_blk_mq); - - spin_lock_irq(q->queue_lock); - request_fn_active = q->request_fn_active; - spin_unlock_irq(q->queue_lock); - - return request_fn_active; -} - -/** - * scsi_wait_for_queuecommand() - wait for ongoing queuecommand() calls - * @sdev: SCSI device pointer. - * - * Wait until the ongoing shost->hostt->queuecommand() calls that are - * invoked from scsi_request_fn() have finished. - */ -static void scsi_wait_for_queuecommand(struct scsi_device *sdev) -{ - WARN_ON_ONCE(sdev->host->use_blk_mq); - - while (scsi_request_fn_active(sdev)) - msleep(20); -} - /** * scsi_device_quiesce - Block user issued commands. * @sdev: scsi device to quiesce. @@ -3148,7 +2648,6 @@ EXPORT_SYMBOL(scsi_target_resume); int scsi_internal_device_block_nowait(struct scsi_device *sdev) { struct request_queue *q = sdev->request_queue; - unsigned long flags; int err = 0; err = scsi_device_set_state(sdev, SDEV_BLOCK); @@ -3164,14 +2663,7 @@ int scsi_internal_device_block_nowait(struct scsi_device *sdev) * block layer from calling the midlayer with this device's * request queue. */ - if (q->mq_ops) { - blk_mq_quiesce_queue_nowait(q); - } else { - spin_lock_irqsave(q->queue_lock, flags); - blk_stop_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); - } - + blk_mq_quiesce_queue_nowait(q); return 0; } EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); @@ -3202,12 +2694,8 @@ static int scsi_internal_device_block(struct scsi_device *sdev) mutex_lock(&sdev->state_mutex); err = scsi_internal_device_block_nowait(sdev); - if (err == 0) { - if (q->mq_ops) - blk_mq_quiesce_queue(q); - else - scsi_wait_for_queuecommand(sdev); - } + if (err == 0) + blk_mq_quiesce_queue(q); mutex_unlock(&sdev->state_mutex); return err; @@ -3216,15 +2704,8 @@ static int scsi_internal_device_block(struct scsi_device *sdev) void scsi_start_queue(struct scsi_device *sdev) { struct request_queue *q = sdev->request_queue; - unsigned long flags; - if (q->mq_ops) { - blk_mq_unquiesce_queue(q); - } else { - spin_lock_irqsave(q->queue_lock, flags); - blk_start_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); - } + blk_mq_unquiesce_queue(q); } /** diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h index 99f1db5e467e..5f21547b2ad2 100644 --- a/drivers/scsi/scsi_priv.h +++ b/drivers/scsi/scsi_priv.h @@ -92,7 +92,6 @@ extern void scsi_queue_insert(struct scsi_cmnd *cmd, int reason); extern void scsi_io_completion(struct scsi_cmnd *, unsigned int); extern void scsi_run_host_queues(struct Scsi_Host *shost); extern void scsi_requeue_run_queue(struct work_struct *work); -extern struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev); extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev); extern void scsi_start_queue(struct scsi_device *sdev); extern int scsi_mq_setup_tags(struct Scsi_Host *shost); diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 78ca63dfba4a..dd0d516f65e2 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -266,10 +266,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, */ sdev->borken = 1; - if (shost_use_blk_mq(shost)) - sdev->request_queue = scsi_mq_alloc_queue(sdev); - else - sdev->request_queue = scsi_old_alloc_queue(sdev); + sdev->request_queue = scsi_mq_alloc_queue(sdev); if (!sdev->request_queue) { /* release fn is set up in 
scsi_sysfs_device_initialise, so * have to free and put manually here */ @@ -280,11 +277,6 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, WARN_ON_ONCE(!blk_get_queue(sdev->request_queue)); sdev->request_queue->queuedata = sdev; - if (!shost_use_blk_mq(sdev->host)) { - blk_queue_init_tags(sdev->request_queue, - sdev->host->cmd_per_lun, shost->bqt, - shost->hostt->tag_alloc_policy); - } scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun ? sdev->host->cmd_per_lun : 1); diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 3aee9464a7bf..6a9040faed00 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -367,7 +367,6 @@ store_shost_eh_deadline(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(eh_deadline, S_IRUGO | S_IWUSR, show_shost_eh_deadline, store_shost_eh_deadline); -shost_rd_attr(use_blk_mq, "%d\n"); shost_rd_attr(unique_id, "%u\n"); shost_rd_attr(cmd_per_lun, "%hd\n"); shost_rd_attr(can_queue, "%hd\n"); @@ -386,6 +385,13 @@ show_host_busy(struct device *dev, struct device_attribute *attr, char *buf) } static DEVICE_ATTR(host_busy, S_IRUGO, show_host_busy, NULL); +static ssize_t +show_use_blk_mq(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "1\n"); +} +static DEVICE_ATTR(use_blk_mq, S_IRUGO, show_use_blk_mq, NULL); + static struct attribute *scsi_sysfs_shost_attrs[] = { &dev_attr_use_blk_mq.attr, &dev_attr_unique_id.attr, diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 23d7cca36ff0..fb308ea8e9a5 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8100,12 +8100,6 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle) goto out_error; } - /* - * Do not use blk-mq at this time because blk-mq does not support - * runtime pm. - */ - host->use_blk_mq = false; - hba = shost_priv(host); hba->host = host; hba->dev = dev; -- cgit From 583d6535cb9dadfd2678acfad27231076eeccf8e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Oct 2018 11:07:57 +0200 Subject: dasd: remove dead code Since e443343e509a we haven't had a request_fn attached to this driver, hence any code inside an if (q->request_fn) is unreachable. Fixes: e443343e509a ("s390/dasd: blk-mq conversion") [sth: Keep and fix the dasd_info->chanq_len counter.] 
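For illustration, the shape of the code this removes (a minimal sketch, not the driver source): once a queue is allocated by blk-mq, its request_fn is always NULL, so a guard like

	if (block->request_queue->request_fn) {
		/* legacy request-list walking; never taken under blk-mq */
	}

can never fire, and the chanq_len accounting it protected has to be hoisted out of it, as the diff below does.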
Reviewed-by: Hannes Reinecke Reviewed-by: Jan Hoeppner Signed-off-by: Stefan Haberland Tested-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_ioctl.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 2016e0ed5865..8e26001dc11c 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -412,6 +412,7 @@ static int dasd_ioctl_information(struct dasd_block *block, struct ccw_dev_id dev_id; struct dasd_device *base; struct ccw_device *cdev; + struct list_head *l; unsigned long flags; int rc; @@ -462,23 +463,10 @@ static int dasd_ioctl_information(struct dasd_block *block, memcpy(dasd_info->type, base->discipline->name, 4); - if (block->request_queue->request_fn) { - struct list_head *l; -#ifdef DASD_EXTENDED_PROFILING - { - struct list_head *l; - spin_lock_irqsave(&block->lock, flags); - list_for_each(l, &block->request_queue->queue_head) - dasd_info->req_queue_len++; - spin_unlock_irqrestore(&block->lock, flags); - } -#endif /* DASD_EXTENDED_PROFILING */ - spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags); - list_for_each(l, &base->ccw_queue) - dasd_info->chanq_len++; - spin_unlock_irqrestore(get_ccwdev_lock(base->cdev), - flags); - } + spin_lock_irqsave(&block->queue_lock, flags); + list_for_each(l, &base->ccw_queue) + dasd_info->chanq_len++; + spin_unlock_irqrestore(&block->queue_lock, flags); rc = 0; if (copy_to_user(argp, dasd_info, -- cgit From aae3b069d5ce865ca5ef2902c2a22cef7ab4f3a2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 26 Oct 2018 11:26:25 -0600 Subject: bsg: pass in desired timeout handler This will ease in the conversion to blk-mq, where we can't set a timeout handler after queue init. 
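With the handler passed at setup time, a caller looks roughly like this (hypothetical names, error unwinding trimmed):

	q = bsg_setup_queue(dev, "example_bsg", example_bsg_dispatch,
			    example_bsg_timeout, 0);
	if (IS_ERR(q))
		return PTR_ERR(q);

Callers that need no special timeout handling simply pass NULL, as the iSCSI and SAS transports do below.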
Cc: Johannes Thumshirn Cc: linux-scsi@vger.kernel.org Reviewed-by: Hannes Reinecke Tested-by: Benjamin Block Tested-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/scsi/scsi_transport_fc.c | 7 +++---- drivers/scsi/scsi_transport_iscsi.c | 2 +- drivers/scsi/scsi_transport_sas.c | 4 ++-- drivers/scsi/ufs/ufs_bsg.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 381668fa135d..98aaffb4c715 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -3780,7 +3780,8 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host) snprintf(bsg_name, sizeof(bsg_name), "fc_host%d", shost->host_no); - q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size); + q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, fc_bsg_job_timeout, + i->f->dd_bsg_size); if (IS_ERR(q)) { dev_err(dev, "fc_host%d: bsg interface failed to initialize - setup queue\n", @@ -3788,7 +3789,6 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host) return PTR_ERR(q); } __scsi_init_queue(shost, q); - blk_queue_rq_timed_out(q, fc_bsg_job_timeout); blk_queue_rq_timeout(q, FC_DEFAULT_BSG_TIMEOUT); fc_host->rqst_q = q; return 0; @@ -3826,14 +3826,13 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport) return -ENOTSUPP; q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch, - i->f->dd_bsg_size); + fc_bsg_job_timeout, i->f->dd_bsg_size); if (IS_ERR(q)) { dev_err(dev, "failed to setup bsg queue\n"); return PTR_ERR(q); } __scsi_init_queue(shost, q); blk_queue_prep_rq(q, fc_bsg_rport_prep); - blk_queue_rq_timed_out(q, fc_bsg_job_timeout); blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); rport->rqst_q = q; return 0; diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 6fd2fe210fc3..26b11a775be9 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1542,7 +1542,7 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost) return -ENOTSUPP; snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no); - q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0); + q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, NULL, 0); if (IS_ERR(q)) { shost_printk(KERN_ERR, shost, "bsg interface failed to " "initialize - no request queue\n"); diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index 0a165b2b3e81..cf6d47891d77 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -198,7 +198,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) if (rphy) { q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev), - sas_smp_dispatch, 0); + sas_smp_dispatch, NULL, 0); if (IS_ERR(q)) return PTR_ERR(q); rphy->q = q; @@ -207,7 +207,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) snprintf(name, sizeof(name), "sas_host%d", shost->host_no); q = bsg_setup_queue(&shost->shost_gendev, name, - sas_smp_dispatch, 0); + sas_smp_dispatch, NULL, 0); if (IS_ERR(q)) return PTR_ERR(q); to_sas_host_attrs(shost)->q = q; diff --git a/drivers/scsi/ufs/ufs_bsg.c b/drivers/scsi/ufs/ufs_bsg.c index e5f8e54bf644..dd0e9700a74c 100644 --- a/drivers/scsi/ufs/ufs_bsg.c +++ b/drivers/scsi/ufs/ufs_bsg.c @@ -193,7 +193,7 @@ int ufs_bsg_probe(struct ufs_hba *hba) if (ret) goto out; - q = bsg_setup_queue(bsg_dev, 
dev_name(bsg_dev), ufs_bsg_request, 0); + q = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), ufs_bsg_request, NULL, 0); if (IS_ERR(q)) { ret = PTR_ERR(q); goto out; -- cgit From 5e28b8d8a1b03ce86f33d38a64a4983d2b5c7679 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 26 Oct 2018 11:27:02 -0600 Subject: bsg: provide bsg_remove_queue() helper All drivers do unregister + cleanup, provide a helper for that. Cc: linux-scsi@vger.kernel.org Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Tested-by: Benjamin Block Tested-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/scsi/scsi_transport_fc.c | 5 +---- drivers/scsi/scsi_transport_iscsi.c | 5 +---- drivers/scsi/scsi_transport_sas.c | 6 +----- drivers/scsi/ufs/ufs_bsg.c | 2 +- 4 files changed, 4 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 98aaffb4c715..638f83ab04b2 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -3851,10 +3851,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport) static void fc_bsg_remove(struct request_queue *q) { - if (q) { - bsg_unregister_queue(q); - blk_cleanup_queue(q); - } + bsg_remove_queue(q); } diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 26b11a775be9..ff123023e5a5 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1576,10 +1576,7 @@ static int iscsi_remove_host(struct transport_container *tc, struct Scsi_Host *shost = dev_to_shost(dev); struct iscsi_cls_host *ihost = shost->shost_data; - if (ihost->bsg_q) { - bsg_unregister_queue(ihost->bsg_q); - blk_cleanup_queue(ihost->bsg_q); - } + bsg_remove_queue(ihost->bsg_q); return 0; } diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index cf6d47891d77..692b46937e52 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -246,11 +246,7 @@ static int sas_host_remove(struct transport_container *tc, struct device *dev, struct Scsi_Host *shost = dev_to_shost(dev); struct request_queue *q = to_sas_host_attrs(shost)->q; - if (q) { - bsg_unregister_queue(q); - blk_cleanup_queue(q); - } - + bsg_remove_queue(q); return 0; } diff --git a/drivers/scsi/ufs/ufs_bsg.c b/drivers/scsi/ufs/ufs_bsg.c index dd0e9700a74c..775bb4e5e36e 100644 --- a/drivers/scsi/ufs/ufs_bsg.c +++ b/drivers/scsi/ufs/ufs_bsg.c @@ -157,7 +157,7 @@ void ufs_bsg_remove(struct ufs_hba *hba) if (!hba->bsg_queue) return; - bsg_unregister_queue(hba->bsg_queue); + bsg_remove_queue(hba->bsg_queue); device_del(bsg_dev); put_device(bsg_dev); -- cgit From cd2f076f1d7ac20a93029ab38646b303f1c1468e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Oct 2018 07:11:39 -0600 Subject: bsg: convert to use blk-mq Requires a few changes to the FC transport class as well. 
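The legacy prep_rq hook has no blk-mq equivalent, so the rport readiness check moves into the dispatch path. The convention the FC change below leans on is roughly:

	switch (fc_bsg_rport_prep(rport)) {
	case BLK_STS_OK:	/* port online: dispatch the job */
		break;
	case BLK_STS_RESOURCE:	/* port blocked: back off and retry */
		return -EAGAIN;
	default:		/* port gone: fail the job */
		return -EIO;
	}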
Cc: linux-scsi@vger.kernel.org Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Tested-by: Benjamin Block Tested-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/scsi/scsi_transport_fc.c | 59 ++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 24 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 638f83ab04b2..d7035270d274 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -3592,7 +3592,7 @@ fc_bsg_job_timeout(struct request *req) /* the blk_end_sync_io() doesn't check the error */ if (inflight) - __blk_complete_request(req); + blk_mq_end_request(req, BLK_STS_IOERR); return BLK_EH_DONE; } @@ -3684,14 +3684,9 @@ static void fc_bsg_goose_queue(struct fc_rport *rport) { struct request_queue *q = rport->rqst_q; - unsigned long flags; - - if (!q) - return; - spin_lock_irqsave(q->queue_lock, flags); - blk_run_queue_async(q); - spin_unlock_irqrestore(q->queue_lock, flags); + if (q) + blk_mq_run_hw_queues(q, true); } /** @@ -3759,6 +3754,37 @@ static int fc_bsg_dispatch(struct bsg_job *job) return fc_bsg_host_dispatch(shost, job); } +static blk_status_t fc_bsg_rport_prep(struct fc_rport *rport) +{ + if (rport->port_state == FC_PORTSTATE_BLOCKED && + !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT)) + return BLK_STS_RESOURCE; + + if (rport->port_state != FC_PORTSTATE_ONLINE) + return BLK_STS_IOERR; + + return BLK_STS_OK; +} + + +static int fc_bsg_dispatch_prep(struct bsg_job *job) +{ + struct fc_rport *rport = fc_bsg_to_rport(job); + blk_status_t ret; + + ret = fc_bsg_rport_prep(rport); + switch (ret) { + case BLK_STS_OK: + break; + case BLK_STS_RESOURCE: + return -EAGAIN; + default: + return -EIO; + } + + return fc_bsg_dispatch(job); +} + /** * fc_bsg_hostadd - Create and add the bsg hooks so we can receive requests * @shost: shost for fc_host @@ -3794,20 +3820,6 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host) return 0; } -static int fc_bsg_rport_prep(struct request_queue *q, struct request *req) -{ - struct fc_rport *rport = dev_to_rport(q->queuedata); - - if (rport->port_state == FC_PORTSTATE_BLOCKED && - !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT)) - return BLKPREP_DEFER; - - if (rport->port_state != FC_PORTSTATE_ONLINE) - return BLKPREP_KILL; - - return BLKPREP_OK; -} - /** * fc_bsg_rportadd - Create and add the bsg hooks so we can receive requests * @shost: shost that rport is attached to @@ -3825,14 +3837,13 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport) if (!i->f->bsg_request) return -ENOTSUPP; - q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch, + q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch_prep, fc_bsg_job_timeout, i->f->dd_bsg_size); if (IS_ERR(q)) { dev_err(dev, "failed to setup bsg queue\n"); return PTR_ERR(q); } __scsi_init_queue(shost, q); - blk_queue_prep_rq(q, fc_bsg_rport_prep); blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); rport->rqst_q = q; return 0; -- cgit From 92bc5a24844ada9b010f03c49a493e3edeadaa54 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Oct 2018 13:52:28 -0600 Subject: block: remove __blk_put_request() Now there's no difference between blk_put_request() and __blk_put_request() anymore, get rid of the underscore version and convert the few callers. 
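The conversion is mechanical, since the queue argument carried no information the request does not already hold:

	blk_put_request(req);	/* was: __blk_put_request(req->q, req); */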
Reviewed-by: Hannes Reinecke Tested-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/scsi/osd/osd_initiator.c | 4 ++-- drivers/scsi/osst.c | 2 +- drivers/scsi/scsi_error.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/scsi/st.c | 2 +- drivers/target/target_core_pscsi.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index e19fa883376f..60cf7c5eb880 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -506,11 +506,11 @@ static void osd_request_async_done(struct request *req, blk_status_t error) _set_error_resid(or, req, error); if (req->next_rq) { - __blk_put_request(req->q, req->next_rq); + blk_put_request(req->next_rq); req->next_rq = NULL; } - __blk_put_request(req->q, req); + blk_put_request(req); or->request = NULL; or->in.req = NULL; or->out.req = NULL; diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 7a1a1edde35d..664c1238a87f 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -341,7 +341,7 @@ static void osst_end_async(struct request *req, blk_status_t status) blk_rq_unmap_user(SRpnt->bio); } - __blk_put_request(req->q, req); + blk_put_request(req); } /* osst_request memory management */ diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index fff128aa9ec2..dd338a8cd275 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1932,7 +1932,7 @@ maybe_retry: static void eh_lock_door_done(struct request *req, blk_status_t status) { - __blk_put_request(req->q, req); + blk_put_request(req); } /** diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index c6ad00703c5b..4e27460ec926 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1390,7 +1390,7 @@ sg_rq_end_io(struct request *rq, blk_status_t status) */ srp->rq = NULL; scsi_req_free_cmd(scsi_req(rq)); - __blk_put_request(rq->q, rq); + blk_put_request(rq); write_lock_irqsave(&sfp->rq_list_lock, iflags); if (unlikely(srp->orphan)) { diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 307df2fa39a3..7ff22d3f03e3 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -530,7 +530,7 @@ static void st_scsi_execute_end(struct request *req, blk_status_t status) complete(SRpnt->waiting); blk_rq_unmap_user(tmp); - __blk_put_request(req->q, req); + blk_put_request(req); } static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 47d76c862014..c062d363dce3 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1094,7 +1094,7 @@ static void pscsi_req_done(struct request *req, blk_status_t status) break; } - __blk_put_request(req->q, req); + blk_put_request(req); kfree(pt); } -- cgit From 9cf2bab6307659b940da65d16dcc8f82c69f3a97 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 31 Oct 2018 17:01:22 -0600 Subject: block: kill request ->cpu member This was used for completion placement for the legacy path, but for mq we have rq->mq_ctx->cpu for that. Add a helper to get the request CPU assignment, as the mq_ctx type is private to blk-mq. 
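The helper itself lands on the block core side (outside this 'drivers' slice); given that rq->mq_ctx->cpu holds the assignment, its shape is roughly:

	int blk_mq_rq_cpu(struct request *rq)
	{
		return rq->mq_ctx->cpu;
	}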
Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/scsi/bnx2i/bnx2i_hwi.c | 8 +------- drivers/scsi/csiostor/csio_scsi.c | 8 +------- drivers/scsi/qla2xxx/qla_os.c | 2 +- 3 files changed, 3 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c index e9e669a6c2bc..6bad2689edd4 100644 --- a/drivers/scsi/bnx2i/bnx2i_hwi.c +++ b/drivers/scsi/bnx2i/bnx2i_hwi.c @@ -1906,7 +1906,6 @@ static int bnx2i_queue_scsi_cmd_resp(struct iscsi_session *session, struct iscsi_task *task; struct scsi_cmnd *sc; int rc = 0; - int cpu; spin_lock(&session->back_lock); task = iscsi_itt_to_task(bnx2i_conn->cls_conn->dd_data, @@ -1917,14 +1916,9 @@ static int bnx2i_queue_scsi_cmd_resp(struct iscsi_session *session, } sc = task->sc; - if (!blk_rq_cpu_valid(sc->request)) - cpu = smp_processor_id(); - else - cpu = sc->request->cpu; - spin_unlock(&session->back_lock); - p = &per_cpu(bnx2i_percpu, cpu); + p = &per_cpu(bnx2i_percpu, blk_mq_rq_cpu(sc->request)); spin_lock(&p->p_work_lock); if (unlikely(!p->iothread)) { rc = -EINVAL; diff --git a/drivers/scsi/csiostor/csio_scsi.c b/drivers/scsi/csiostor/csio_scsi.c index 8c15b7acb4b7..a95debbea0e4 100644 --- a/drivers/scsi/csiostor/csio_scsi.c +++ b/drivers/scsi/csiostor/csio_scsi.c @@ -1780,16 +1780,10 @@ csio_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmnd) int nsge = 0; int rv = SCSI_MLQUEUE_HOST_BUSY, nr; int retval; - int cpu; struct csio_scsi_qset *sqset; struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device)); - if (!blk_rq_cpu_valid(cmnd->request)) - cpu = smp_processor_id(); - else - cpu = cmnd->request->cpu; - - sqset = &hw->sqset[ln->portid][cpu]; + sqset = &hw->sqset[ln->portid][blk_mq_rq_cpu(cmnd->request)]; nr = fc_remote_port_chkready(rport); if (nr) { diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 4ea9f2b4e04f..29dfd1bd164d 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1460,7 +1460,7 @@ __qla2xxx_eh_generic_reset(char *name, enum nexus_wait_type type, goto eh_reset_failed; } err = 2; - if (do_reset(fcport, cmd->device->lun, cmd->request->cpu + 1) + if (do_reset(fcport, cmd->device->lun, blk_mq_rq_cpu(cmd->request) + 1) != QLA_SUCCESS) { ql_log(ql_log_warn, vha, 0x800c, "do_reset failed for cmd=%p.\n", cmd); -- cgit From ed76e329d74a4b15ac0f5fd3adbd52ec0178a134 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 29 Oct 2018 13:06:14 -0600 Subject: blk-mq: abstract out queue map This is in preparation for allowing multiple sets of maps per queue, if so desired. 
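Concretely, map_queues implementations stop passing the whole tag_set and instead hand blk_mq_pci_map_queues() and friends a single blk_mq_queue_map, so a converted driver reads roughly:

	struct blk_mq_queue_map *qmap = &shost->tag_set.map[0];

	return blk_mq_pci_map_queues(qmap, pdev, 0);
	/* was: blk_mq_pci_map_queues(&shost->tag_set, pdev, 0) */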
Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 2 +- drivers/nvme/host/pci.c | 2 +- drivers/scsi/qla2xxx/qla_os.c | 5 +++-- drivers/scsi/scsi_lib.c | 2 +- drivers/scsi/smartpqi/smartpqi_init.c | 3 ++- drivers/scsi/virtio_scsi.c | 3 ++- 6 files changed, 10 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 086c6bb12baa..6e869d05f91e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -624,7 +624,7 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set) { struct virtio_blk *vblk = set->driver_data; - return blk_mq_virtio_map_queues(set, vblk->vdev, 0); + return blk_mq_virtio_map_queues(&set->map[0], vblk->vdev, 0); } #ifdef CONFIG_VIRTIO_BLK_SCSI diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c33bb201b884..49ad854d1b91 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -435,7 +435,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) { struct nvme_dev *dev = set->driver_data; - return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), + return blk_mq_pci_map_queues(&set->map[0], to_pci_dev(dev->dev), dev->num_vecs > 1 ? 1 /* admin queue */ : 0); } diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 29dfd1bd164d..fdf3e52ee908 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -6934,11 +6934,12 @@ static int qla2xxx_map_queues(struct Scsi_Host *shost) { int rc; scsi_qla_host_t *vha = (scsi_qla_host_t *)shost->hostdata; + struct blk_mq_queue_map *qmap = &shost->tag_set.map[0]; if (USER_CTRL_IRQ(vha->hw)) - rc = blk_mq_map_queues(&shost->tag_set); + rc = blk_mq_map_queues(qmap); else - rc = blk_mq_pci_map_queues(&shost->tag_set, vha->hw->pdev, 0); + rc = blk_mq_pci_map_queues(qmap, vha->hw->pdev, 0); return rc; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 651be30ba96a..ed81b8e74cfe 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1812,7 +1812,7 @@ static int scsi_map_queues(struct blk_mq_tag_set *set) if (shost->hostt->map_queues) return shost->hostt->map_queues(shost); - return blk_mq_map_queues(set); + return blk_mq_map_queues(&set->map[0]); } void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q) diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index a25a07a0b7f0..bac084260d80 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -5319,7 +5319,8 @@ static int pqi_map_queues(struct Scsi_Host *shost) { struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); - return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev, 0); + return blk_mq_pci_map_queues(&shost->tag_set.map[0], + ctrl_info->pci_dev, 0); } static int pqi_getpciinfo_ioctl(struct pqi_ctrl_info *ctrl_info, diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 1c72db94270e..c3c95b314286 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -719,8 +719,9 @@ static void virtscsi_target_destroy(struct scsi_target *starget) static int virtscsi_map_queues(struct Scsi_Host *shost) { struct virtio_scsi *vscsi = shost_priv(shost); + struct blk_mq_queue_map *qmap = &shost->tag_set.map[0]; - return blk_mq_virtio_map_queues(&shost->tag_set, vscsi->vdev, 2); + return blk_mq_virtio_map_queues(qmap, vscsi->vdev, 2); } /* -- cgit From 
3b6592f70ad7b4c24dd3eb2ac9bbe3353d02c992 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 31 Oct 2018 08:36:31 -0600 Subject: nvme: utilize two queue maps, one for reads and one for writes NVMe does round-robin between queues by default, which means that sharing a queue map for both reads and writes can be problematic in terms of read servicing. It's much easier to flood the queue with writes and reduce the read servicing. Implement two queue maps, one for reads and one for writes. The write queue count is configurable through the 'write_queues' parameter. By default, we retain the previous behavior of having a single queue set, shared between reads and writes. Setting 'write_queues' to a non-zero value will create two queue sets, one for reads and one for writes, the latter using the configurable number of queues (hardware queue counts permitting). Reviewed-by: Hannes Reinecke Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 200 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 181 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 49ad854d1b91..1987df13b73e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -74,11 +74,29 @@ static int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); +static int queue_count_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops queue_count_ops = { + .set = queue_count_set, + .get = param_get_int, +}; + +static int write_queues; +module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644); +MODULE_PARM_DESC(write_queues, + "Number of queues to use for writes. If not set, reads and writes " + "will share a queue set."); + struct nvme_dev; struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); +enum { + NVMEQ_TYPE_READ, + NVMEQ_TYPE_WRITE, + NVMEQ_TYPE_NR, +}; + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. 
*/ @@ -92,6 +110,7 @@ struct nvme_dev { struct dma_pool *prp_small_pool; unsigned online_queues; unsigned max_qid; + unsigned io_queues[NVMEQ_TYPE_NR]; unsigned int num_vecs; int q_depth; u32 db_stride; @@ -134,6 +153,17 @@ static int io_queue_depth_set(const char *val, const struct kernel_param *kp) return param_set_int(val, kp); } +static int queue_count_set(const char *val, const struct kernel_param *kp) +{ + int n = 0, ret; + + ret = kstrtoint(val, 10, &n); + if (n > num_possible_cpus()) + n = num_possible_cpus(); + + return param_set_int(val, kp); +} + static inline unsigned int sq_idx(unsigned int qid, u32 stride) { return qid * 2 * stride; @@ -218,9 +248,20 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); } +static unsigned int max_io_queues(void) +{ + return num_possible_cpus() + write_queues; +} + +static unsigned int max_queue_count(void) +{ + /* IO queues + admin queue */ + return 1 + max_io_queues(); +} + static inline unsigned int nvme_dbbuf_size(u32 stride) { - return ((num_possible_cpus() + 1) * 8 * stride); + return (max_queue_count() * 8 * stride); } static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) @@ -431,12 +472,41 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, return 0; } +static int queue_irq_offset(struct nvme_dev *dev) +{ + /* if we have more than 1 vec, admin queue offsets us by 1 */ + if (dev->num_vecs > 1) + return 1; + + return 0; +} + static int nvme_pci_map_queues(struct blk_mq_tag_set *set) { struct nvme_dev *dev = set->driver_data; + int i, qoff, offset; + + offset = queue_irq_offset(dev); + for (i = 0, qoff = 0; i < set->nr_maps; i++) { + struct blk_mq_queue_map *map = &set->map[i]; + + map->nr_queues = dev->io_queues[i]; + if (!map->nr_queues) { + BUG_ON(i == NVMEQ_TYPE_READ); - return blk_mq_pci_map_queues(&set->map[0], to_pci_dev(dev->dev), - dev->num_vecs > 1 ? 
1 /* admin queue */ : 0); + /* shared set, resuse read set parameters */ + map->nr_queues = dev->io_queues[NVMEQ_TYPE_READ]; + qoff = 0; + offset = queue_irq_offset(dev); + } + + map->queue_offset = qoff; + blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); + qoff += map->nr_queues; + offset += map->nr_queues; + } + + return 0; } /** @@ -849,6 +919,14 @@ out_free_cmd: return ret; } +static int nvme_rq_flags_to_type(struct request_queue *q, unsigned int flags) +{ + if ((flags & REQ_OP_MASK) == REQ_OP_READ) + return NVMEQ_TYPE_READ; + + return NVMEQ_TYPE_WRITE; +} + static void nvme_pci_complete_rq(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -1475,13 +1553,14 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { }; static const struct blk_mq_ops nvme_mq_ops = { - .queue_rq = nvme_queue_rq, - .complete = nvme_pci_complete_rq, - .init_hctx = nvme_init_hctx, - .init_request = nvme_init_request, - .map_queues = nvme_pci_map_queues, - .timeout = nvme_timeout, - .poll = nvme_poll, + .queue_rq = nvme_queue_rq, + .rq_flags_to_type = nvme_rq_flags_to_type, + .complete = nvme_pci_complete_rq, + .init_hctx = nvme_init_hctx, + .init_request = nvme_init_request, + .map_queues = nvme_pci_map_queues, + .timeout = nvme_timeout, + .poll = nvme_poll, }; static void nvme_dev_remove_admin(struct nvme_dev *dev) @@ -1891,6 +1970,87 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) return ret; } +static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues) +{ + unsigned int this_w_queues = write_queues; + + /* + * Setup read/write queue split + */ + if (nr_io_queues == 1) { + dev->io_queues[NVMEQ_TYPE_READ] = 1; + dev->io_queues[NVMEQ_TYPE_WRITE] = 0; + return; + } + + /* + * If 'write_queues' is set, ensure it leaves room for at least + * one read queue + */ + if (this_w_queues >= nr_io_queues) + this_w_queues = nr_io_queues - 1; + + /* + * If 'write_queues' is set to zero, reads and writes will share + * a queue set. + */ + if (!this_w_queues) { + dev->io_queues[NVMEQ_TYPE_WRITE] = 0; + dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues; + } else { + dev->io_queues[NVMEQ_TYPE_WRITE] = this_w_queues; + dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues - this_w_queues; + } +} + +static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + int irq_sets[2]; + struct irq_affinity affd = { + .pre_vectors = 1, + .nr_sets = ARRAY_SIZE(irq_sets), + .sets = irq_sets, + }; + int result; + + /* + * For irq sets, we have to ask for minvec == maxvec. This passes + * any reduction back to us, so we can adjust our queue counts and + * IRQ vector needs. 
+ */ + do { + nvme_calc_io_queues(dev, nr_io_queues); + irq_sets[0] = dev->io_queues[NVMEQ_TYPE_READ]; + irq_sets[1] = dev->io_queues[NVMEQ_TYPE_WRITE]; + if (!irq_sets[1]) + affd.nr_sets = 1; + + /* + * Need IRQs for read+write queues, and one for the admin queue + */ + nr_io_queues = irq_sets[0] + irq_sets[1] + 1; + + result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues, + nr_io_queues, + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); + + /* + * Need to reduce our vec counts + */ + if (result == -ENOSPC) { + nr_io_queues--; + if (!nr_io_queues) + return result; + continue; + } else if (result <= 0) + return -EIO; + break; + } while (1); + + return result; +} + static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = &dev->queues[0]; @@ -1898,11 +2058,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) int result, nr_io_queues; unsigned long size; - struct irq_affinity affd = { - .pre_vectors = 1 - }; - - nr_io_queues = num_possible_cpus(); + nr_io_queues = max_io_queues(); result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; @@ -1937,13 +2093,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) * setting up the full range we need. */ pci_free_irq_vectors(pdev); - result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1, - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); + + result = nvme_setup_irqs(dev, nr_io_queues); if (result <= 0) return -EIO; + dev->num_vecs = result; dev->max_qid = max(result - 1, 1); + dev_info(dev->ctrl.device, "%d/%d read/write queues\n", + dev->io_queues[NVMEQ_TYPE_READ], + dev->io_queues[NVMEQ_TYPE_WRITE]); + /* * Should investigate if there's a performance win from allocating * more queues than interrupt vectors; it might allow the submission @@ -2045,6 +2206,7 @@ static int nvme_dev_add(struct nvme_dev *dev) if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.nr_maps = NVMEQ_TYPE_NR; dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.numa_node = dev_to_node(dev->dev); dev->tagset.queue_depth = @@ -2491,8 +2653,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!dev) return -ENOMEM; - dev->queues = kcalloc_node(num_possible_cpus() + 1, - sizeof(struct nvme_queue), GFP_KERNEL, node); + dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue), + GFP_KERNEL, node); if (!dev->queues) goto free; -- cgit From 4b04cc6a8f86c4842314def22332de1f15de8523 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 5 Nov 2018 12:44:33 -0700 Subject: nvme: add separate poll queue map Adds support for defining a variable number of poll queues, currently configurable with the 'poll_queues' module parameter. Defaults to a single poll queue. And now we finally have poll support without triggering interrupts! Reviewed-by: Hannes Reinecke Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 97 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1987df13b73e..6aa86dfcb32c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -86,6 +86,10 @@ MODULE_PARM_DESC(write_queues, "Number of queues to use for writes. 
If not set, reads and writes " "will share a queue set."); +static int poll_queues = 1; +module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644); +MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); + struct nvme_dev; struct nvme_queue; @@ -94,6 +98,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); enum { NVMEQ_TYPE_READ, NVMEQ_TYPE_WRITE, + NVMEQ_TYPE_POLL, NVMEQ_TYPE_NR, }; @@ -202,6 +207,7 @@ struct nvme_queue { u16 last_cq_head; u16 qid; u8 cq_phase; + u8 polled; u32 *dbbuf_sq_db; u32 *dbbuf_cq_db; u32 *dbbuf_sq_ei; @@ -250,7 +256,7 @@ static inline void _nvme_check_size(void) static unsigned int max_io_queues(void) { - return num_possible_cpus() + write_queues; + return num_possible_cpus() + write_queues + poll_queues; } static unsigned int max_queue_count(void) @@ -500,8 +506,15 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) offset = queue_irq_offset(dev); } + /* + * The poll queue(s) doesn't have an IRQ (and hence IRQ + * affinity), so use the regular blk-mq cpu mapping + */ map->queue_offset = qoff; - blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); + if (i != NVMEQ_TYPE_POLL) + blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); + else + blk_mq_map_queues(map); qoff += map->nr_queues; offset += map->nr_queues; } @@ -892,7 +905,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, * We should not need to do this, but we're still using this to * ensure we can drain requests on a dying queue. */ - if (unlikely(nvmeq->cq_vector < 0)) + if (unlikely(nvmeq->cq_vector < 0 && !nvmeq->polled)) return BLK_STS_IOERR; ret = nvme_setup_cmd(ns, req, &cmnd); @@ -921,6 +934,8 @@ out_free_cmd: static int nvme_rq_flags_to_type(struct request_queue *q, unsigned int flags) { + if ((flags & REQ_HIPRI) && test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return NVMEQ_TYPE_POLL; if ((flags & REQ_OP_MASK) == REQ_OP_READ) return NVMEQ_TYPE_READ; @@ -1094,7 +1109,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq, s16 vector) { struct nvme_command c; - int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + int flags = NVME_QUEUE_PHYS_CONTIG; + + if (vector != -1) + flags |= NVME_CQ_IRQ_ENABLED; /* * Note: we (ab)use the fact that the prp fields survive if no data @@ -1106,7 +1124,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cqid = cpu_to_le16(qid); c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); c.create_cq.cq_flags = cpu_to_le16(flags); - c.create_cq.irq_vector = cpu_to_le16(vector); + if (vector != -1) + c.create_cq.irq_vector = cpu_to_le16(vector); + else + c.create_cq.irq_vector = 0; return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } @@ -1348,13 +1369,14 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) int vector; spin_lock_irq(&nvmeq->cq_lock); - if (nvmeq->cq_vector == -1) { + if (nvmeq->cq_vector == -1 && !nvmeq->polled) { spin_unlock_irq(&nvmeq->cq_lock); return 1; } vector = nvmeq->cq_vector; nvmeq->dev->online_queues--; nvmeq->cq_vector = -1; + nvmeq->polled = false; spin_unlock_irq(&nvmeq->cq_lock); /* @@ -1366,7 +1388,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); - pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); + if (vector != -1) + pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); return 0; } @@ -1500,7 +1523,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) 
spin_unlock_irq(&nvmeq->cq_lock); } -static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) { struct nvme_dev *dev = nvmeq->dev; int result; @@ -1510,7 +1533,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) * A queue's vector matches the queue identifier unless the controller * has only one vector available. */ - vector = dev->num_vecs == 1 ? 0 : qid; + if (!polled) + vector = dev->num_vecs == 1 ? 0 : qid; + else + vector = -1; + result = adapter_alloc_cq(dev, qid, nvmeq, vector); if (result) return result; @@ -1527,15 +1554,20 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) * xxx' warning if the create CQ/SQ command times out. */ nvmeq->cq_vector = vector; + nvmeq->polled = polled; nvme_init_queue(nvmeq, qid); - result = queue_request_irq(nvmeq); - if (result < 0) - goto release_sq; + + if (vector != -1) { + result = queue_request_irq(nvmeq); + if (result < 0) + goto release_sq; + } return result; release_sq: nvmeq->cq_vector = -1; + nvmeq->polled = false; dev->online_queues--; adapter_delete_sq(dev, qid); release_cq: @@ -1686,7 +1718,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) static int nvme_create_io_queues(struct nvme_dev *dev) { - unsigned i, max; + unsigned i, max, rw_queues; int ret = 0; for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { @@ -1697,8 +1729,17 @@ static int nvme_create_io_queues(struct nvme_dev *dev) } max = min(dev->max_qid, dev->ctrl.queue_count - 1); + if (max != 1 && dev->io_queues[NVMEQ_TYPE_POLL]) { + rw_queues = dev->io_queues[NVMEQ_TYPE_READ] + + dev->io_queues[NVMEQ_TYPE_WRITE]; + } else { + rw_queues = max; + } + for (i = dev->online_queues; i <= max; i++) { - ret = nvme_create_queue(&dev->queues[i], i); + bool polled = i > rw_queues; + + ret = nvme_create_queue(&dev->queues[i], i, polled); if (ret) break; } @@ -1973,6 +2014,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues) { unsigned int this_w_queues = write_queues; + unsigned int this_p_queues = poll_queues; /* * Setup read/write queue split @@ -1980,9 +2022,28 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues) if (nr_io_queues == 1) { dev->io_queues[NVMEQ_TYPE_READ] = 1; dev->io_queues[NVMEQ_TYPE_WRITE] = 0; + dev->io_queues[NVMEQ_TYPE_POLL] = 0; return; } + /* + * Configure number of poll queues, if set + */ + if (this_p_queues) { + /* + * We need at least one queue left. With just one queue, we'll + * have a single shared read/write set. 
+ */ + if (this_p_queues >= nr_io_queues) { + this_w_queues = 0; + this_p_queues = nr_io_queues - 1; + } + + dev->io_queues[NVMEQ_TYPE_POLL] = this_p_queues; + nr_io_queues -= this_p_queues; + } else + dev->io_queues[NVMEQ_TYPE_POLL] = 0; + /* * If 'write_queues' is set, ensure it leaves room for at least * one read queue @@ -2099,11 +2160,13 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return -EIO; dev->num_vecs = result; - dev->max_qid = max(result - 1, 1); + result = max(result - 1, 1); + dev->max_qid = result + dev->io_queues[NVMEQ_TYPE_POLL]; - dev_info(dev->ctrl.device, "%d/%d read/write queues\n", + dev_info(dev->ctrl.device, "%d/%d/%d read/write/poll queues\n", dev->io_queues[NVMEQ_TYPE_READ], - dev->io_queues[NVMEQ_TYPE_WRITE]); + dev->io_queues[NVMEQ_TYPE_WRITE], + dev->io_queues[NVMEQ_TYPE_POLL]); /* * Should investigate if there's a performance win from allocating -- cgit From dbef5257737b989d4785b62939f1e9cba3948689 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 Nov 2018 21:17:57 -0700 Subject: sunvdc: fix compiler warning Stephen reports: After merging the block tree, today's linux-next build (sparc64 defconfig) produced this warning: /home/sfr/next/next/drivers/block/sunvdc.c: In function 'init_queue': /home/sfr/next/next/drivers/block/sunvdc.c:788:6: warning: unused variable 'ret' [-Wunused-variable] int ret; ^~~ Kill the unused variable. Fixes: fa182a1fa97d ("sunvdc: convert to blk-mq") Reported-by: Stephen Rothwell Signed-off-by: Jens Axboe --- drivers/block/sunvdc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 95cb4ea8e402..175a64619602 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -785,7 +785,6 @@ static void cleanup_queue(struct request_queue *q) static struct request_queue *init_queue(struct vdc_port *port) { struct request_queue *q; - int ret; q = blk_mq_init_sq_queue(&port->tag_set, &vdc_mq_ops, VDC_TX_RING_SIZE, BLK_MQ_F_SHOULD_MERGE); -- cgit From e051bd0ddfdd6696197ad8c12d7485fc68c39f7b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 8 Nov 2018 11:08:09 +0000 Subject: ms_block: remove unused pointer 'set' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pointer 'set' is declared but not used, remove it. Cleans up warning: warning: unused variable ‘set’ [-Wunused-variable] Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- drivers/memstick/core/ms_block.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index ee0be66a9a03..82daccc9ea62 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2104,7 +2104,6 @@ static const struct blk_mq_ops msb_mq_ops = { static int msb_init_disk(struct memstick_dev *card) { struct msb_data *msb = memstick_get_drvdata(card); - struct blk_mq_tag_set *set = NULL; int rc; unsigned long capacity; -- cgit From b1ab5fa309e6c49e4e06270ec67dd7b3e9971d04 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 8 Nov 2018 14:01:01 +0100 Subject: block/loop: Don't grab "struct file" for vfs_getattr() operation. vfs_getattr() needs "struct path" rather than "struct file". Let's use path_get()/path_put() rather than get_file()/fput(). 
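For reference, the resulting call pattern looks like this (a simplified sketch of the loop_get_status() hunk in the diff below, not a complete function):

        struct path path;
        struct kstat stat;
        int ret;

        /* Pin only the path; the full "struct file" is not needed. */
        path = lo->lo_backing_file->f_path;
        path_get(&path);
        mutex_unlock(&lo->lo_ctl_mutex);

        ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);

        path_put(&path);

Only the path has to stay pinned across the call into the filesystem; holding a reference to the whole file was never required.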
Signed-off-by: Tetsuo Handa Reviewed-by: Jan Kara Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index cb0cc8685076..a29ef169f360 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1204,7 +1204,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) static int loop_get_status(struct loop_device *lo, struct loop_info64 *info) { - struct file *file; + struct path path; struct kstat stat; int ret; @@ -1229,16 +1229,16 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) } /* Drop lo_ctl_mutex while we call into the filesystem. */ - file = get_file(lo->lo_backing_file); + path = lo->lo_backing_file->f_path; + path_get(&path); mutex_unlock(&lo->lo_ctl_mutex); - ret = vfs_getattr(&file->f_path, &stat, STATX_INO, - AT_STATX_SYNC_AS_STAT); + ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->lo_device = huge_encode_dev(stat.dev); info->lo_inode = stat.ino; info->lo_rdevice = huge_encode_dev(stat.rdev); } - fput(file); + path_put(&path); return ret; } -- cgit From 310ca162d779efee8a2dc3731439680f3e9c1e86 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 8 Nov 2018 14:01:02 +0100 Subject: block/loop: Use global lock for ioctl() operation. syzbot is reporting a NULL pointer dereference [1] caused by a race between ioctl(loop_fd, LOOP_CLR_FD, 0) and ioctl(other_loop_fd, LOOP_SET_FD, loop_fd), due to traversing other loop devices in loop_validate_file() without holding the corresponding lo->lo_ctl_mutex locks. Since ioctl() requests on loop devices are not frequent operations, we don't need fine-grained locking. Let's use a global lock in order to allow safe traversal in loop_validate_file(). Note that syzbot is also reporting a circular locking dependency between bdev->bd_mutex and lo->lo_ctl_mutex [2], caused by calling blkdev_reread_part() with the lock held. This patch does not address it. [1] https://syzkaller.appspot.com/bug?id=f3cfe26e785d85f9ee259f385515291d21bd80a3 [2] https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d15889 Signed-off-by: Tetsuo Handa Reported-by: syzbot Reviewed-by: Jan Kara Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 58 ++++++++++++++++++++++++++++++++++++++-------------------------- drivers/block/loop.h | 1 - 2 files changed, 29 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a29ef169f360..63008e879771 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -84,6 +84,7 @@ static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_index_mutex); +static DEFINE_MUTEX(loop_ctl_mutex); static int max_part; static int part_shift; @@ -1046,7 +1047,7 @@ static int loop_clr_fd(struct loop_device *lo) */ if (atomic_read(&lo->lo_refcnt) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return 0; } @@ -1099,12 +1100,12 @@ static int loop_clr_fd(struct loop_device *lo) if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; loop_unprepare_queue(lo); - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); /* - * Need not hold lo_ctl_mutex to fput backing file. - * Calling fput holding lo_ctl_mutex triggers a circular
+ * Calling fput holding loop_ctl_mutex triggers a circular * lock dependency possibility warning as fput can take - * bd_mutex which is usually taken before lo_ctl_mutex. + * bd_mutex which is usually taken before loop_ctl_mutex. */ fput(filp); return 0; @@ -1209,7 +1210,7 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) int ret; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return -ENXIO; } @@ -1228,10 +1229,10 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) lo->lo_encrypt_key_size); } - /* Drop lo_ctl_mutex while we call into the filesystem. */ + /* Drop loop_ctl_mutex while we call into the filesystem. */ path = lo->lo_backing_file->f_path; path_get(&path); - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->lo_device = huge_encode_dev(stat.dev); @@ -1323,7 +1324,7 @@ loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { int err; if (!arg) { - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return -EINVAL; } err = loop_get_status(lo, &info64); @@ -1341,7 +1342,7 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { int err; if (!arg) { - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return -EINVAL; } err = loop_get_status(lo, &info64); @@ -1399,7 +1400,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, struct loop_device *lo = bdev->bd_disk->private_data; int err; - err = mutex_lock_killable_nested(&lo->lo_ctl_mutex, 1); + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); if (err) goto out_unlocked; @@ -1411,7 +1412,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, err = loop_change_fd(lo, bdev, arg); break; case LOOP_CLR_FD: - /* loop_clr_fd would have unlocked lo_ctl_mutex on success */ + /* loop_clr_fd would have unlocked loop_ctl_mutex on success */ err = loop_clr_fd(lo); if (!err) goto out_unlocked; @@ -1424,7 +1425,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, break; case LOOP_GET_STATUS: err = loop_get_status_old(lo, (struct loop_info __user *) arg); - /* loop_get_status() unlocks lo_ctl_mutex */ + /* loop_get_status() unlocks loop_ctl_mutex */ goto out_unlocked; case LOOP_SET_STATUS64: err = -EPERM; @@ -1434,7 +1435,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, break; case LOOP_GET_STATUS64: err = loop_get_status64(lo, (struct loop_info64 __user *) arg); - /* loop_get_status() unlocks lo_ctl_mutex */ + /* loop_get_status() unlocks loop_ctl_mutex */ goto out_unlocked; case LOOP_SET_CAPACITY: err = -EPERM; @@ -1454,7 +1455,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, default: err = lo->ioctl ? 
lo->ioctl(lo, cmd, arg) : -EINVAL; } - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); out_unlocked: return err; @@ -1571,7 +1572,7 @@ loop_get_status_compat(struct loop_device *lo, int err; if (!arg) { - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return -EINVAL; } err = loop_get_status(lo, &info64); @@ -1588,19 +1589,19 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, switch(cmd) { case LOOP_SET_STATUS: - err = mutex_lock_killable(&lo->lo_ctl_mutex); + err = mutex_lock_killable(&loop_ctl_mutex); if (!err) { err = loop_set_status_compat(lo, (const struct compat_loop_info __user *)arg); - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); } break; case LOOP_GET_STATUS: - err = mutex_lock_killable(&lo->lo_ctl_mutex); + err = mutex_lock_killable(&loop_ctl_mutex); if (!err) { err = loop_get_status_compat(lo, (struct compat_loop_info __user *)arg); - /* loop_get_status() unlocks lo_ctl_mutex */ + /* loop_get_status() unlocks loop_ctl_mutex */ } break; case LOOP_SET_CAPACITY: @@ -1647,7 +1648,7 @@ static void __lo_release(struct loop_device *lo) if (atomic_dec_return(&lo->lo_refcnt)) return; - mutex_lock(&lo->lo_ctl_mutex); + mutex_lock(&loop_ctl_mutex); if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { /* * In autoclear mode, stop the loop thread @@ -1665,7 +1666,7 @@ static void __lo_release(struct loop_device *lo) blk_mq_unfreeze_queue(lo->lo_queue); } - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); } static void lo_release(struct gendisk *disk, fmode_t mode) @@ -1711,10 +1712,10 @@ static int unregister_transfer_cb(int id, void *ptr, void *data) struct loop_device *lo = ptr; struct loop_func_table *xfer = data; - mutex_lock(&lo->lo_ctl_mutex); + mutex_lock(&loop_ctl_mutex); if (lo->lo_encryption == xfer) loop_release_xfer(lo); - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return 0; } @@ -1895,7 +1896,6 @@ static int loop_add(struct loop_device **l, int i) if (!part_shift) disk->flags |= GENHD_FL_NO_PART_SCAN; disk->flags |= GENHD_FL_EXT_DEVT; - mutex_init(&lo->lo_ctl_mutex); atomic_set(&lo->lo_refcnt, 0); lo->lo_number = i; spin_lock_init(&lo->lo_lock); @@ -2008,21 +2008,21 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, ret = loop_lookup(&lo, parm); if (ret < 0) break; - ret = mutex_lock_killable(&lo->lo_ctl_mutex); + ret = mutex_lock_killable(&loop_ctl_mutex); if (ret) break; if (lo->lo_state != Lo_unbound) { ret = -EBUSY; - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); break; } if (atomic_read(&lo->lo_refcnt) > 0) { ret = -EBUSY; - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); break; } lo->lo_disk->private_data = NULL; - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); loop_remove(lo); break; diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 4d42c7af7de7..af75a5ee4094 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -54,7 +54,6 @@ struct loop_device { spinlock_t lo_lock; int lo_state; - struct mutex lo_ctl_mutex; struct kthread_worker worker; struct task_struct *worker_task; bool use_dio; -- cgit From 967d1dc144b50ad005e5eecdfadfbcfb399ffff6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:03 +0100 Subject: loop: Fold __loop_release into loop_release __loop_release() has a single call site. Fold it there. This is currently not a huge win but it will make following replacement of loop_index_mutex more obvious. 
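Folded, the early returns of __lo_release() become jumps to a label at the end of the combined function, roughly (abridged sketch of the result shown in the diff below):

        static void lo_release(struct gendisk *disk, fmode_t mode)
        {
                struct loop_device *lo;

                mutex_lock(&loop_index_mutex);
                lo = disk->private_data;
                if (atomic_dec_return(&lo->lo_refcnt))
                        goto unlock_index;
                /* ... autoclear / config teardown ... */
        unlock_index:
                mutex_unlock(&loop_index_mutex);
        }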
Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 63008e879771..3de2cd94225a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1641,12 +1641,15 @@ out: return err; } -static void __lo_release(struct loop_device *lo) +static void lo_release(struct gendisk *disk, fmode_t mode) { + struct loop_device *lo; int err; + mutex_lock(&loop_index_mutex); + lo = disk->private_data; if (atomic_dec_return(&lo->lo_refcnt)) - return; + goto unlock_index; mutex_lock(&loop_ctl_mutex); if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { @@ -1656,7 +1659,7 @@ static void __lo_release(struct loop_device *lo) */ err = loop_clr_fd(lo); if (!err) - return; + goto unlock_index; } else if (lo->lo_state == Lo_bound) { /* * Otherwise keep thread (if running) and config, @@ -1667,12 +1670,7 @@ static void __lo_release(struct loop_device *lo) } mutex_unlock(&loop_ctl_mutex); -} - -static void lo_release(struct gendisk *disk, fmode_t mode) -{ - mutex_lock(&loop_index_mutex); - __lo_release(disk->private_data); +unlock_index: mutex_unlock(&loop_index_mutex); } -- cgit From 0a42e99b58a208839626465af194cfe640ef9493 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:04 +0100 Subject: loop: Get rid of loop_index_mutex Now that loop_ctl_mutex is global, just get rid of loop_index_mutex as there is no good reason to keep these two separate and it just complicates the locking. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 3de2cd94225a..dcdc96f4d2d4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -83,7 +83,6 @@ #include static DEFINE_IDR(loop_index_idr); -static DEFINE_MUTEX(loop_index_mutex); static DEFINE_MUTEX(loop_ctl_mutex); static int max_part; @@ -1626,9 +1625,11 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, static int lo_open(struct block_device *bdev, fmode_t mode) { struct loop_device *lo; - int err = 0; + int err; - mutex_lock(&loop_index_mutex); + err = mutex_lock_killable(&loop_ctl_mutex); + if (err) + return err; lo = bdev->bd_disk->private_data; if (!lo) { err = -ENXIO; @@ -1637,7 +1638,7 @@ static int lo_open(struct block_device *bdev, fmode_t mode) atomic_inc(&lo->lo_refcnt); out: - mutex_unlock(&loop_index_mutex); + mutex_unlock(&loop_ctl_mutex); return err; } @@ -1646,12 +1647,11 @@ static void lo_release(struct gendisk *disk, fmode_t mode) struct loop_device *lo; int err; - mutex_lock(&loop_index_mutex); + mutex_lock(&loop_ctl_mutex); lo = disk->private_data; if (atomic_dec_return(&lo->lo_refcnt)) - goto unlock_index; + goto out_unlock; - mutex_lock(&loop_ctl_mutex); if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { /* * In autoclear mode, stop the loop thread @@ -1659,7 +1659,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode) */ err = loop_clr_fd(lo); if (!err) - goto unlock_index; + return; } else if (lo->lo_state == Lo_bound) { /* * Otherwise keep thread (if running) and config, @@ -1669,9 +1669,8 @@ static void lo_release(struct gendisk *disk, fmode_t mode) blk_mq_unfreeze_queue(lo->lo_queue); } +out_unlock: mutex_unlock(&loop_ctl_mutex); -unlock_index: - mutex_unlock(&loop_index_mutex); } static const struct block_device_operations lo_fops = { @@ -1972,7 
+1971,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data) struct kobject *kobj; int err; - mutex_lock(&loop_index_mutex); + mutex_lock(&loop_ctl_mutex); err = loop_lookup(&lo, MINOR(dev) >> part_shift); if (err < 0) err = loop_add(&lo, MINOR(dev) >> part_shift); @@ -1980,7 +1979,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data) kobj = NULL; else kobj = get_disk_and_module(lo->lo_disk); - mutex_unlock(&loop_index_mutex); + mutex_unlock(&loop_ctl_mutex); *part = 0; return kobj; @@ -1990,9 +1989,13 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, unsigned long parm) { struct loop_device *lo; - int ret = -ENOSYS; + int ret; - mutex_lock(&loop_index_mutex); + ret = mutex_lock_killable(&loop_ctl_mutex); + if (ret) + return ret; + + ret = -ENOSYS; switch (cmd) { case LOOP_CTL_ADD: ret = loop_lookup(&lo, parm); @@ -2006,9 +2009,6 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, ret = loop_lookup(&lo, parm); if (ret < 0) break; - ret = mutex_lock_killable(&loop_ctl_mutex); - if (ret) - break; if (lo->lo_state != Lo_unbound) { ret = -EBUSY; mutex_unlock(&loop_ctl_mutex); @@ -2020,7 +2020,6 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, break; } lo->lo_disk->private_data = NULL; - mutex_unlock(&loop_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); loop_remove(lo); break; @@ -2030,7 +2029,7 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, break; ret = loop_add(&lo, -1); } - mutex_unlock(&loop_index_mutex); + mutex_unlock(&loop_ctl_mutex); return ret; } @@ -2114,10 +2113,10 @@ static int __init loop_init(void) THIS_MODULE, loop_probe, NULL, NULL); /* pre-create number of devices given by config or max_loop */ - mutex_lock(&loop_index_mutex); + mutex_lock(&loop_ctl_mutex); for (i = 0; i < nr; i++) loop_add(&lo, i); - mutex_unlock(&loop_index_mutex); + mutex_unlock(&loop_ctl_mutex); printk(KERN_INFO "loop: module loaded\n"); return 0; -- cgit From a13165441d58b216adbd50252a9cc829d78a6bce Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:05 +0100 Subject: loop: Push lo_ctl_mutex down into individual ioctls Push acquisition of lo_ctl_mutex down into individual ioctl handling branches. This is a preparatory step for pushing the lock down into individual ioctl handling functions so that they can release the lock as they need it. We also factor out some simple ioctl handlers that will not need any special handling to reduce unnecessary code duplication. 
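The factored-out helper for the simple commands takes the following shape (abridged from the diff below):

        static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
                                   unsigned long arg)
        {
                int err;

                err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
                if (err)
                        return err;
                switch (cmd) {
                case LOOP_SET_CAPACITY:
                        err = loop_set_capacity(lo);
                        break;
                /* ... LOOP_SET_DIRECT_IO, LOOP_SET_BLOCK_SIZE ... */
                default:
                        err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
                }
                mutex_unlock(&loop_ctl_mutex);
                return err;
        }

Each branch of lo_ioctl() then either takes the mutex itself or delegates to this helper, so no caller is left holding a lock it did not acquire.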
Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 88 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 63 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index dcdc96f4d2d4..4c37578989c4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1393,70 +1393,108 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) return 0; } -static int lo_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, + unsigned long arg) { - struct loop_device *lo = bdev->bd_disk->private_data; int err; err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); if (err) - goto out_unlocked; + return err; + switch (cmd) { + case LOOP_SET_CAPACITY: + err = loop_set_capacity(lo); + break; + case LOOP_SET_DIRECT_IO: + err = loop_set_dio(lo, arg); + break; + case LOOP_SET_BLOCK_SIZE: + err = loop_set_block_size(lo, arg); + break; + default: + err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; + } + mutex_unlock(&loop_ctl_mutex); + return err; +} + +static int lo_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct loop_device *lo = bdev->bd_disk->private_data; + int err; switch (cmd) { case LOOP_SET_FD: + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (err) + return err; err = loop_set_fd(lo, mode, bdev, arg); + mutex_unlock(&loop_ctl_mutex); break; case LOOP_CHANGE_FD: + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (err) + return err; err = loop_change_fd(lo, bdev, arg); + mutex_unlock(&loop_ctl_mutex); break; case LOOP_CLR_FD: + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (err) + return err; /* loop_clr_fd would have unlocked loop_ctl_mutex on success */ err = loop_clr_fd(lo); - if (!err) - goto out_unlocked; + if (err) + mutex_unlock(&loop_ctl_mutex); break; case LOOP_SET_STATUS: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (err) + return err; err = loop_set_status_old(lo, (struct loop_info __user *)arg); + mutex_unlock(&loop_ctl_mutex); + } break; case LOOP_GET_STATUS: + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (err) + return err; err = loop_get_status_old(lo, (struct loop_info __user *) arg); /* loop_get_status() unlocks loop_ctl_mutex */ - goto out_unlocked; + break; case LOOP_SET_STATUS64: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (err) + return err; err = loop_set_status64(lo, (struct loop_info64 __user *) arg); + mutex_unlock(&loop_ctl_mutex); + } break; case LOOP_GET_STATUS64: + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (err) + return err; err = loop_get_status64(lo, (struct loop_info64 __user *) arg); /* loop_get_status() unlocks loop_ctl_mutex */ - goto out_unlocked; - case LOOP_SET_CAPACITY: - err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) - err = loop_set_capacity(lo); break; + case LOOP_SET_CAPACITY: case LOOP_SET_DIRECT_IO: - err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) - err = loop_set_dio(lo, arg); - break; case LOOP_SET_BLOCK_SIZE: - err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) - err = 
loop_set_block_size(lo, arg); - break; + if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Fall through */ default: - err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; + err = lo_simple_ioctl(lo, cmd, arg); + break; } - mutex_unlock(&loop_ctl_mutex); -out_unlocked: return err; } -- cgit From a2505b799a496b7b84d9a4a14ec870ff9e42e11b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:06 +0100 Subject: loop: Split setting of lo_state from loop_clr_fd Move setting of lo_state to Lo_rundown out into the callers. That will allow us to unlock loop_ctl_mutex while the loop device is protected from other changes by its special state. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 52 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 4c37578989c4..eb01a685da4e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -975,7 +975,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, loop_reread_partitions(lo, bdev); /* Grab the block_device to prevent its destruction after we - * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). + * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). */ bdgrab(bdev); return 0; @@ -1025,31 +1025,15 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, return err; } -static int loop_clr_fd(struct loop_device *lo) +static int __loop_clr_fd(struct loop_device *lo) { struct file *filp = lo->lo_backing_file; gfp_t gfp = lo->old_gfp_mask; struct block_device *bdev = lo->lo_device; - if (lo->lo_state != Lo_bound) + if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) return -ENXIO; - /* - * If we've explicitly asked to tear down the loop device, - * and it has an elevated reference count, set it for auto-teardown when - * the last reference goes away. This stops $!~#$@ udev from - * preventing teardown because it decided that it needs to run blkid on - * the loopback device whenever they appear. xfstests is notorious for - * failing tests because blkid via udev races with a losetup - * /do something like mkfs/losetup -d causing the losetup -d - * command to fail with EBUSY. - */ - if (atomic_read(&lo->lo_refcnt) > 1) { - lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&loop_ctl_mutex); - return 0; - } - if (filp == NULL) return -EINVAL; @@ -1057,7 +1041,6 @@ static int loop_clr_fd(struct loop_device *lo) blk_mq_freeze_queue(lo->lo_queue); spin_lock_irq(&lo->lo_lock); - lo->lo_state = Lo_rundown; lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); @@ -1110,6 +1093,30 @@ static int loop_clr_fd(struct loop_device *lo) return 0; } +static int loop_clr_fd(struct loop_device *lo) +{ + if (lo->lo_state != Lo_bound) + return -ENXIO; + /* + * If we've explicitly asked to tear down the loop device, + * and it has an elevated reference count, set it for auto-teardown when + * the last reference goes away. This stops $!~#$@ udev from + * preventing teardown because it decided that it needs to run blkid on + * the loopback device whenever they appear. xfstests is notorious for + * failing tests because blkid via udev races with a losetup + * /do something like mkfs/losetup -d causing the losetup -d + * command to fail with EBUSY. 
+ */ + if (atomic_read(&lo->lo_refcnt) > 1) { + lo->lo_flags |= LO_FLAGS_AUTOCLEAR; + mutex_unlock(&loop_ctl_mutex); + return 0; + } + lo->lo_state = Lo_rundown; + + return __loop_clr_fd(lo); +} + static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { @@ -1691,11 +1698,14 @@ static void lo_release(struct gendisk *disk, fmode_t mode) goto out_unlock; if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { + if (lo->lo_state != Lo_bound) + goto out_unlock; + lo->lo_state = Lo_rundown; /* * In autoclear mode, stop the loop thread * and remove configuration after last close. */ - err = loop_clr_fd(lo); + err = __loop_clr_fd(lo); if (!err) return; } else if (lo->lo_state == Lo_bound) { /* * Otherwise keep thread (if running) and config, -- cgit From 7ccd0791d98531df7cd59e92d55e4f063d48a070 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:07 +0100 Subject: loop: Push loop_ctl_mutex down into loop_clr_fd() loop_clr_fd() has a weird locking convention that expects loop_ctl_mutex held, releases it on success and keeps it on failure. Untangle the mess by moving locking of loop_ctl_mutex into loop_clr_fd(). Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index eb01a685da4e..d8a7b5da881b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1027,15 +1027,22 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, static int __loop_clr_fd(struct loop_device *lo) { - struct file *filp = lo->lo_backing_file; + struct file *filp = NULL; gfp_t gfp = lo->old_gfp_mask; struct block_device *bdev = lo->lo_device; + int err = 0; - if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) - return -ENXIO; + mutex_lock(&loop_ctl_mutex); + if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) { + err = -ENXIO; + goto out_unlock; + } - if (filp == NULL) - return -EINVAL; + filp = lo->lo_backing_file; + if (filp == NULL) { + err = -EINVAL; + goto out_unlock; + } /* freeze request queue during the transition */ blk_mq_freeze_queue(lo->lo_queue); @@ -1082,6 +1089,7 @@ static int __loop_clr_fd(struct loop_device *lo) if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; loop_unprepare_queue(lo); +out_unlock: mutex_unlock(&loop_ctl_mutex); /* * Need not hold loop_ctl_mutex to fput backing file. @@ -1089,14 +1097,22 @@ static int __loop_clr_fd(struct loop_device *lo) * lock dependency possibility warning as fput can take * bd_mutex which is usually taken before loop_ctl_mutex.
*/ - fput(filp); - return 0; + if (filp) + fput(filp); + return err; } static int loop_clr_fd(struct loop_device *lo) { - if (lo->lo_state != Lo_bound) + int err; + + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (err) + return err; + if (lo->lo_state != Lo_bound) { + mutex_unlock(&loop_ctl_mutex); return -ENXIO; + } /* * If we've explicitly asked to tear down the loop device, * and it has an elevated reference count, set it for auto-teardown when @@ -1113,6 +1129,7 @@ static int loop_clr_fd(struct loop_device *lo) return 0; } lo->lo_state = Lo_rundown; + mutex_unlock(&loop_ctl_mutex); return __loop_clr_fd(lo); } @@ -1447,14 +1464,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, mutex_unlock(&loop_ctl_mutex); break; case LOOP_CLR_FD: - err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); - if (err) - return err; - /* loop_clr_fd would have unlocked loop_ctl_mutex on success */ - err = loop_clr_fd(lo); - if (err) - mutex_unlock(&loop_ctl_mutex); - break; + return loop_clr_fd(lo); case LOOP_SET_STATUS: err = -EPERM; if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { @@ -1690,7 +1700,6 @@ out: static void lo_release(struct gendisk *disk, fmode_t mode) { struct loop_device *lo; - int err; mutex_lock(&loop_ctl_mutex); lo = disk->private_data; @@ -1701,13 +1710,13 @@ static void lo_release(struct gendisk *disk, fmode_t mode) if (lo->lo_state != Lo_bound) goto out_unlock; lo->lo_state = Lo_rundown; + mutex_unlock(&loop_ctl_mutex); /* * In autoclear mode, stop the loop thread * and remove configuration after last close. */ - err = __loop_clr_fd(lo); - if (!err) - return; + __loop_clr_fd(lo); + return; } else if (lo->lo_state == Lo_bound) { /* * Otherwise keep thread (if running) and config, -- cgit From 4a5ce9ba5877e4640200d84a735361306ad1a1b8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:08 +0100 Subject: loop: Push loop_ctl_mutex down to loop_get_status() Push loop_ctl_mutex down to loop_get_status() to avoid the unusual convention that the function gets called with loop_ctl_mutex held and releases it. 
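After this patch the function is self-contained with respect to locking; both lock and unlock live inside it (abridged from the diff below):

        static int
        loop_get_status(struct loop_device *lo, struct loop_info64 *info)
        {
                int ret;

                ret = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
                if (ret)
                        return ret;
                if (lo->lo_state != Lo_bound) {
                        mutex_unlock(&loop_ctl_mutex);
                        return -ENXIO;
                }
                /* ... fill *info; the mutex is still dropped before the
                 * vfs_getattr() call, as introduced earlier in the series ... */
                return ret;
        }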
Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d8a7b5da881b..2e814f8af4df 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1232,6 +1232,9 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) struct kstat stat; int ret; + ret = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (ret) + return ret; if (lo->lo_state != Lo_bound) { mutex_unlock(&loop_ctl_mutex); return -ENXIO; @@ -1346,10 +1349,8 @@ loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { struct loop_info64 info64; int err; - if (!arg) { - mutex_unlock(&loop_ctl_mutex); + if (!arg) return -EINVAL; - } err = loop_get_status(lo, &info64); if (!err) err = loop_info64_to_old(&info64, &info); @@ -1364,10 +1365,8 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { struct loop_info64 info64; int err; - if (!arg) { - mutex_unlock(&loop_ctl_mutex); + if (!arg) return -EINVAL; - } err = loop_get_status(lo, &info64); if (!err && copy_to_user(arg, &info64, sizeof(info64))) err = -EFAULT; @@ -1477,12 +1476,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, } break; case LOOP_GET_STATUS: - err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); - if (err) - return err; - err = loop_get_status_old(lo, (struct loop_info __user *) arg); - /* loop_get_status() unlocks loop_ctl_mutex */ - break; + return loop_get_status_old(lo, (struct loop_info __user *) arg); case LOOP_SET_STATUS64: err = -EPERM; if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { @@ -1495,12 +1489,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, } break; case LOOP_GET_STATUS64: - err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); - if (err) - return err; - err = loop_get_status64(lo, (struct loop_info64 __user *) arg); - /* loop_get_status() unlocks loop_ctl_mutex */ - break; + return loop_get_status64(lo, (struct loop_info64 __user *) arg); case LOOP_SET_CAPACITY: case LOOP_SET_DIRECT_IO: case LOOP_SET_BLOCK_SIZE: @@ -1625,10 +1614,8 @@ loop_get_status_compat(struct loop_device *lo, struct loop_info64 info64; int err; - if (!arg) { - mutex_unlock(&loop_ctl_mutex); + if (!arg) return -EINVAL; - } err = loop_get_status(lo, &info64); if (!err) err = loop_info64_to_compat(&info64, arg); @@ -1651,12 +1638,8 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, } break; case LOOP_GET_STATUS: - err = mutex_lock_killable(&loop_ctl_mutex); - if (!err) { - err = loop_get_status_compat(lo, - (struct compat_loop_info __user *)arg); - /* loop_get_status() unlocks loop_ctl_mutex */ - } + err = loop_get_status_compat(lo, + (struct compat_loop_info __user *)arg); break; case LOOP_SET_CAPACITY: case LOOP_CLR_FD: -- cgit From 550df5fdacff94229cde0ed9b8085155654c1696 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:09 +0100 Subject: loop: Push loop_ctl_mutex down to loop_set_status() Push loop_ctl_mutex down to loop_set_status(). We will need this to be able to call loop_reread_partitions() without loop_ctl_mutex. 
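The error handling inside loop_set_status() is reorganized around unwind labels so that every exit path releases exactly what it acquired (abridged sketch from the diff below):

        err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
        if (err)
                return err;
        /* ... permission and state checks may 'goto out_unlock' ... */
        blk_mq_freeze_queue(lo->lo_queue);
        /* ... transfer setup may 'goto out_unfreeze' ... */
out_unfreeze:
        blk_mq_unfreeze_queue(lo->lo_queue);
out_unlock:
        mutex_unlock(&loop_ctl_mutex);
        return err;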
Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2e814f8af4df..af79a59732b7 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1141,46 +1141,55 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) struct loop_func_table *xfer; kuid_t uid = current_uid(); + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (err) + return err; if (lo->lo_encrypt_key_size && !uid_eq(lo->lo_key_owner, uid) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (lo->lo_state != Lo_bound) - return -ENXIO; - if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) - return -EINVAL; + !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out_unlock; + } + if (lo->lo_state != Lo_bound) { + err = -ENXIO; + goto out_unlock; + } + if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) { + err = -EINVAL; + goto out_unlock; + } /* I/O need to be drained during transfer transition */ blk_mq_freeze_queue(lo->lo_queue); err = loop_release_xfer(lo); if (err) - goto exit; + goto out_unfreeze; if (info->lo_encrypt_type) { unsigned int type = info->lo_encrypt_type; if (type >= MAX_LO_CRYPT) { err = -EINVAL; - goto exit; + goto out_unfreeze; } xfer = xfer_funcs[type]; if (xfer == NULL) { err = -EINVAL; - goto exit; + goto out_unfreeze; } } else xfer = NULL; err = loop_init_xfer(lo, xfer, info); if (err) - goto exit; + goto out_unfreeze; if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) { if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { err = -EFBIG; - goto exit; + goto out_unfreeze; } } @@ -1212,7 +1221,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) /* update dio if lo_offset or transfer is changed */ __loop_update_dio(lo, lo->use_dio); - exit: +out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) && @@ -1221,6 +1230,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; loop_reread_partitions(lo, lo->lo_device); } +out_unlock: + mutex_unlock(&loop_ctl_mutex); return err; } @@ -1467,12 +1478,8 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_SET_STATUS: err = -EPERM; if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { - err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); - if (err) - return err; err = loop_set_status_old(lo, (struct loop_info __user *)arg); - mutex_unlock(&loop_ctl_mutex); } break; case LOOP_GET_STATUS: @@ -1480,12 +1487,8 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_SET_STATUS64: err = -EPERM; if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { - err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); - if (err) - return err; err = loop_set_status64(lo, (struct loop_info64 __user *) arg); - mutex_unlock(&loop_ctl_mutex); } break; case LOOP_GET_STATUS64: @@ -1630,12 +1633,8 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, switch(cmd) { case LOOP_SET_STATUS: - err = mutex_lock_killable(&loop_ctl_mutex); - if (!err) { - err = loop_set_status_compat(lo, - (const struct compat_loop_info __user *)arg); - mutex_unlock(&loop_ctl_mutex); - } + err = loop_set_status_compat(lo, + (const struct compat_loop_info __user *)arg); break; case LOOP_GET_STATUS: err = loop_get_status_compat(lo, -- cgit 
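Common to the ioctl paths above and below: the global lock is always taken killably, so a task stuck waiting on a wedged loop device can still be killed. A minimal sketch of the idiom:

        err = mutex_lock_killable(&loop_ctl_mutex);
        if (err)        /* -EINTR: a fatal signal arrived while sleeping */
                return err;
        /* ... critical section ... */
        mutex_unlock(&loop_ctl_mutex);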
From 757ecf40b7e029529768eb5f9562d5eeb3002106 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:10 +0100 Subject: loop: Push loop_ctl_mutex down to loop_set_fd() Push loop_ctl_mutex down to loop_set_fd(). We will need this to be able to call loop_reread_partitions() without loop_ctl_mutex. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index af79a59732b7..161e2a08f2e8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -918,13 +918,17 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (!file) goto out; + error = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (error) + goto out_putf; + error = -EBUSY; if (lo->lo_state != Lo_unbound) - goto out_putf; + goto out_unlock; error = loop_validate_file(file, bdev); if (error) - goto out_putf; + goto out_unlock; mapping = file->f_mapping; inode = mapping->host; @@ -936,10 +940,10 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, error = -EFBIG; size = get_loop_size(lo, file); if ((loff_t)(sector_t)size != size) - goto out_putf; + goto out_unlock; error = loop_prepare_queue(lo); if (error) - goto out_putf; + goto out_unlock; error = 0; @@ -978,11 +982,14 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). */ bdgrab(bdev); + mutex_unlock(&loop_ctl_mutex); return 0; - out_putf: +out_unlock: + mutex_unlock(&loop_ctl_mutex); +out_putf: fput(file); - out: +out: /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return error; @@ -1460,12 +1467,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case LOOP_SET_FD: - err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); - if (err) - return err; - err = loop_set_fd(lo, mode, bdev, arg); - mutex_unlock(&loop_ctl_mutex); - break; + return loop_set_fd(lo, mode, bdev, arg); case LOOP_CHANGE_FD: err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); if (err) -- cgit From c371077000f4138ee3c15fbed50101ff24bdc91d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:11 +0100 Subject: loop: Push loop_ctl_mutex down to loop_change_fd() Push loop_ctl_mutex down to loop_change_fd(). We will need this to be able to call loop_reread_partitions() without loop_ctl_mutex.
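Note the error-path ordering this introduces (see the diff below): out_putf still runs fput(file) before out_unlock drops the mutex, i.e. the file reference is dropped under loop_ctl_mutex. A later patch in this series, "loop: Avoid circular locking dependency between loop_ctl_mutex and bd_mutex", moves the fput() outside the lock:

        out_putf:
                fput(file);             /* file reference dropped here ... */
        out_unlock:
                mutex_unlock(&loop_ctl_mutex);  /* ... before the lock is released */
                return error;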
Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 161e2a08f2e8..ea5e313908b1 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -691,19 +691,22 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, struct file *file, *old_file; int error; + error = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + if (error) + return error; error = -ENXIO; if (lo->lo_state != Lo_bound) - goto out; + goto out_unlock; /* the loop device has to be read-only */ error = -EINVAL; if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) - goto out; + goto out_unlock; error = -EBADF; file = fget(arg); if (!file) - goto out; + goto out_unlock; error = loop_validate_file(file, bdev); if (error) @@ -730,11 +733,13 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, fput(old_file); if (lo->lo_flags & LO_FLAGS_PARTSCAN) loop_reread_partitions(lo, bdev); + mutex_unlock(&loop_ctl_mutex); return 0; - out_putf: +out_putf: fput(file); - out: +out_unlock: + mutex_unlock(&loop_ctl_mutex); return error; } @@ -1469,12 +1474,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_SET_FD: return loop_set_fd(lo, mode, bdev, arg); case LOOP_CHANGE_FD: - err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); - if (err) - return err; - err = loop_change_fd(lo, bdev, arg); - mutex_unlock(&loop_ctl_mutex); - break; + return loop_change_fd(lo, bdev, arg); case LOOP_CLR_FD: return loop_clr_fd(lo); case LOOP_SET_STATUS: -- cgit From d57f3374ba4817f7c8d26fae8a13d20ac8d31b92 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:12 +0100 Subject: loop: Move special partition reread handling in loop_clr_fd() The call of __blkdev_reread_part() from loop_reread_partitions() happens only when we need to invalidate partitions from loop_release(). Thus move the detection for this into loop_clr_fd() and simplify loop_reread_partitions(). This makes loop_reread_partitions() safe to use without loop_ctl_mutex because we use only lo->lo_number and lo->lo_file_name in case of error for reporting purposes (thus possibly reporting outdated information is not a big deal) and we are safe from 'lo' going away under us by elevated lo->lo_refcnt. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ea5e313908b1..f1d7a4fe30fc 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -630,18 +630,7 @@ static void loop_reread_partitions(struct loop_device *lo, { int rc; - /* - * bd_mutex has been held already in release path, so don't - * acquire it if this function is called in such case. - * - * If the reread partition isn't from release path, lo_refcnt - * must be at least one and it can only become zero when the - * current holder is released.
- */ - if (!atomic_read(&lo->lo_refcnt)) rc = __blkdev_reread_part(bdev); - else - rc = blkdev_reread_part(bdev); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", __func__, lo->lo_number, lo->lo_file_name, rc); @@ -1095,8 +1084,24 @@ static int __loop_clr_fd(struct loop_device *lo) module_put(THIS_MODULE); blk_mq_unfreeze_queue(lo->lo_queue); - if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) - loop_reread_partitions(lo, bdev); + if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) { + /* + * bd_mutex has been held already in release path, so don't + * acquire it if this function is called in such case. + * + * If the reread partition isn't from release path, lo_refcnt + * must be at least one and it can only become zero when the + * current holder is released. + */ + if (!atomic_read(&lo->lo_refcnt)) + err = __blkdev_reread_part(bdev); + else + err = blkdev_reread_part(bdev); + pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", + __func__, lo->lo_number, err); + /* Device is gone, no point in returning error */ + err = 0; + } lo->lo_flags = 0; if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; -- cgit From 85b0a54a82e4fbceeb1aebb7cb6909edd1a24668 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:13 +0100 Subject: loop: Move loop_reread_partitions() out of loop_ctl_mutex Calling loop_reread_partitions() under loop_ctl_mutex causes lockdep to complain about circular lock dependency between bdev->bd_mutex and lo->lo_ctl_mutex. The problem is that on loop device open or close lo_open() and lo_release() get called with bdev->bd_mutex held and they need to acquire loop_ctl_mutex. OTOH when loop_reread_partitions() is called with loop_ctl_mutex held, it will call blkdev_reread_part() which acquires bdev->bd_mutex. See syzbot report for details [1]. Move all calls of loop_reread_partitions() out of loop_ctl_mutex to avoid the lockdep warning and fix the possible deadlock. [1] https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d15889 Reported-by: syzbot Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f1d7a4fe30fc..cce5d4e8e863 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -679,6 +679,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, { struct file *file, *old_file; int error; + bool partscan; error = mutex_lock_killable_nested(&loop_ctl_mutex, 1); if (error) @@ -720,9 +721,10 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, blk_mq_unfreeze_queue(lo->lo_queue); fput(old_file); - if (lo->lo_flags & LO_FLAGS_PARTSCAN) - loop_reread_partitions(lo, bdev); + partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; mutex_unlock(&loop_ctl_mutex); + if (partscan) + loop_reread_partitions(lo, bdev); return 0; out_putf: @@ -903,6 +905,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, int lo_flags = 0; int error; loff_t size; + bool partscan; /* This is safe, since we have a reference from open().
*/ __module_get(THIS_MODULE); @@ -969,14 +972,15 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->lo_state = Lo_bound; if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; - if (lo->lo_flags & LO_FLAGS_PARTSCAN) - loop_reread_partitions(lo, bdev); + partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; /* Grab the block_device to prevent its destruction after we * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). */ bdgrab(bdev); mutex_unlock(&loop_ctl_mutex); + if (partscan) + loop_reread_partitions(lo, bdev); return 0; out_unlock: @@ -1157,6 +1161,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) int err; struct loop_func_table *xfer; kuid_t uid = current_uid(); + struct block_device *bdev; + bool partscan = false; err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); if (err) @@ -1245,10 +1251,13 @@ out_unfreeze: !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { lo->lo_flags |= LO_FLAGS_PARTSCAN; lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - loop_reread_partitions(lo, lo->lo_device); + bdev = lo->lo_device; + partscan = true; } out_unlock: mutex_unlock(&loop_ctl_mutex); + if (partscan) + loop_reread_partitions(lo, bdev); return err; } -- cgit From 0da03cab87e6323ff2e05b14bc7d5c6fcc531efd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:14 +0100 Subject: loop: Fix deadlock when calling blkdev_reread_part() Calling blkdev_reread_part() under loop_ctl_mutex causes lockdep to complain about circular lock dependency between bdev->bd_mutex and lo->lo_ctl_mutex. The problem is that on loop device open or close lo_open() and lo_release() get called with bdev->bd_mutex held and they need to acquire loop_ctl_mutex. OTOH when loop_reread_partitions() is called with loop_ctl_mutex held, it will call blkdev_reread_part() which acquires bdev->bd_mutex. See syzbot report for details [1]. Move the call to blkdev_reread_part() in __loop_clr_fd() from under loop_ctl_mutex to finish fixing the lockdep warning and the possible deadlock. [1] https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d15889 Reported-by: syzbot Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index cce5d4e8e863..b3f981ac8ef1 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1030,12 +1030,14 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, return err; } -static int __loop_clr_fd(struct loop_device *lo) +static int __loop_clr_fd(struct loop_device *lo, bool release) { struct file *filp = NULL; gfp_t gfp = lo->old_gfp_mask; struct block_device *bdev = lo->lo_device; int err = 0; + bool partscan = false; + int lo_number; mutex_lock(&loop_ctl_mutex); if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) { @@ -1088,7 +1090,15 @@ static int __loop_clr_fd(struct loop_device *lo) module_put(THIS_MODULE); blk_mq_unfreeze_queue(lo->lo_queue); - if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) { + partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; + lo_number = lo->lo_number; + lo->lo_flags = 0; + if (!part_shift) + lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; + loop_unprepare_queue(lo); +out_unlock: + mutex_unlock(&loop_ctl_mutex); + if (partscan) { /* * bd_mutex has been held already in release path, so don't * acquire it if this function is called in such case.
@@ -1097,21 +1107,15 @@ static int __loop_clr_fd(struct loop_device *lo) * must be at least one and it can only become zero when the * current holder is released. */ - if (!atomic_read(&lo->lo_refcnt)) + if (release) err = __blkdev_reread_part(bdev); else err = blkdev_reread_part(bdev); pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", - __func__, lo->lo_number, err); + __func__, lo_number, err); /* Device is gone, no point in returning error */ err = 0; } - lo->lo_flags = 0; - if (!part_shift) - lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; - loop_unprepare_queue(lo); -out_unlock: - mutex_unlock(&loop_ctl_mutex); /* * Need not hold loop_ctl_mutex to fput backing file. * Calling fput holding loop_ctl_mutex triggers a circular @@ -1152,7 +1156,7 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_state = Lo_rundown; mutex_unlock(&loop_ctl_mutex); - return __loop_clr_fd(lo); + return __loop_clr_fd(lo, false); } static int @@ -1713,7 +1717,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode) * In autoclear mode, stop the loop thread * and remove configuration after last close. */ - __loop_clr_fd(lo); + __loop_clr_fd(lo, true); return; } else if (lo->lo_state == Lo_bound) { /* -- cgit From 1dded9acf6dc9a34cd27fcf8815507e4e65b3c4f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:15 +0100 Subject: loop: Avoid circular locking dependency between loop_ctl_mutex and bd_mutex Code in loop_change_fd() drops reference to the old file (and also the new file in a failure case) under loop_ctl_mutex. Similarly to a situation in loop_set_fd() this can create a circular locking dependency if this was the last reference holding the file open. Delay dropping of the file reference until we have released loop_ctl_mutex. Reported-by: Tetsuo Handa Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index b3f981ac8ef1..112afc9bc604 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -677,7 +677,7 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, unsigned int arg) { - struct file *file, *old_file; + struct file *file = NULL, *old_file; int error; bool partscan; @@ -686,21 +686,21 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, return error; error = -ENXIO; if (lo->lo_state != Lo_bound) - goto out_unlock; + goto out_err; /* the loop device has to be read-only */ error = -EINVAL; if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) - goto out_unlock; + goto out_err; error = -EBADF; file = fget(arg); if (!file) - goto out_unlock; + goto out_err; error = loop_validate_file(file, bdev); if (error) - goto out_putf; + goto out_err; old_file = lo->lo_backing_file; @@ -708,7 +708,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, /* size of the new backing store needs to be the same */ if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) - goto out_putf; + goto out_err; /* and ... 
switch */ blk_mq_freeze_queue(lo->lo_queue); @@ -719,18 +719,22 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); - - fput(old_file); partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; mutex_unlock(&loop_ctl_mutex); + /* + * We must drop file reference outside of loop_ctl_mutex as dropping + * the file ref can take bd_mutex which creates circular locking + * dependency. + */ + fput(old_file); if (partscan) loop_reread_partitions(lo, bdev); return 0; -out_putf: - fput(file); -out_unlock: +out_err: mutex_unlock(&loop_ctl_mutex); + if (file) + fput(file); return error; } -- cgit From c28445fa06a3a54e06938559b9514c5a7f01c90f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Nov 2018 14:01:16 +0100 Subject: loop: Get rid of 'nested' acquisition of loop_ctl_mutex The nested acquisition of loop_ctl_mutex (->lo_ctl_mutex back then) was introduced by commit f028f3b2f987e "loop: fix circular locking in loop_clr_fd()" to fix lockdep complaints about bd_mutex being acquired after lo_ctl_mutex during partition rereading. Now that these are properly fixed, let's stop fooling lockdep. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 112afc9bc604..bf6bc35aaf88 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -681,7 +681,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, int error; bool partscan; - error = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + error = mutex_lock_killable(&loop_ctl_mutex); if (error) return error; error = -ENXIO; @@ -919,7 +919,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (!file) goto out; - error = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + error = mutex_lock_killable(&loop_ctl_mutex); if (error) goto out_putf; @@ -1135,7 +1135,7 @@ static int loop_clr_fd(struct loop_device *lo) { int err; - err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + err = mutex_lock_killable(&loop_ctl_mutex); if (err) return err; if (lo->lo_state != Lo_bound) { @@ -1172,7 +1172,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) struct block_device *bdev; bool partscan = false; - err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + err = mutex_lock_killable(&loop_ctl_mutex); if (err) return err; if (lo->lo_encrypt_key_size && @@ -1277,7 +1277,7 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) struct kstat stat; int ret; - ret = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + ret = mutex_lock_killable(&loop_ctl_mutex); if (ret) return ret; if (lo->lo_state != Lo_bound) { @@ -1466,7 +1466,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, { int err; - err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); + err = mutex_lock_killable(&loop_ctl_mutex); if (err) return err; switch (cmd) { -- cgit From 7baa85727d0406ffd2b2303cd803a145aa35c505 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 Nov 2018 10:24:07 -0700 Subject: blk-mq-tag: change busy_iter_fn to return whether to continue or not We have this functionality in sbitmap, but we don't export it in blk-mq for users of the tags busy iteration. This can be useful for stopping the iteration, if the caller doesn't need to find more requests.
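With the bool return a callback can now cut the walk short. A hypothetical callback (illustrative only, not part of this patch) that grabs the first busy request and stops, suitable for passing to e.g. blk_mq_tagset_busy_iter():

        static bool find_first_rq(struct request *rq, void *data, bool reserved)
        {
                struct request **found = data;

                *found = rq;    /* remember the request ... */
                return false;   /* ... and tell the iterator to stop */
        }

All existing callbacks in the diff below are converted to return true, preserving the old iterate-everything behaviour.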
Reviewed-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 9 ++++++--- drivers/block/nbd.c | 3 ++- drivers/block/skd_main.c | 8 +++++--- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/fc.c | 3 ++- drivers/nvme/host/nvme.h | 2 +- 6 files changed, 18 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index a7daa8acbab3..947aa10107a6 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2720,7 +2720,7 @@ static void mtip_softirq_done_fn(struct request *rq) blk_mq_end_request(rq, cmd->status); } -static void mtip_abort_cmd(struct request *req, void *data, bool reserved) +static bool mtip_abort_cmd(struct request *req, void *data, bool reserved) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); struct driver_data *dd = data; @@ -2730,14 +2730,16 @@ static void mtip_abort_cmd(struct request *req, void *data, bool reserved) clear_bit(req->tag, dd->port->cmds_to_issue); cmd->status = BLK_STS_IOERR; mtip_softirq_done_fn(req); + return true; } -static void mtip_queue_cmd(struct request *req, void *data, bool reserved) +static bool mtip_queue_cmd(struct request *req, void *data, bool reserved) { struct driver_data *dd = data; set_bit(req->tag, dd->port->cmds_to_issue); blk_abort_request(req); + return true; } /* @@ -3920,12 +3922,13 @@ protocol_init_error: return rv; } -static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) +static bool mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); cmd->status = BLK_STS_IOERR; blk_mq_complete_request(rq); + return true; } /* diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4d4d6129ff66..08696f5f00bb 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -734,12 +734,13 @@ static void recv_work(struct work_struct *work) kfree(args); } -static void nbd_clear_req(struct request *req, void *data, bool reserved) +static bool nbd_clear_req(struct request *req, void *data, bool reserved) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); cmd->status = BLK_STS_IOERR; blk_mq_complete_request(req); + return true; } static void nbd_clear_que(struct nbd_device *nbd) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 2459dcc04b1c..a0196477165f 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -382,11 +382,12 @@ static void skd_log_skreq(struct skd_device *skdev, * READ/WRITE REQUESTS ***************************************************************************** */ -static void skd_inc_in_flight(struct request *rq, void *data, bool reserved) +static bool skd_inc_in_flight(struct request *rq, void *data, bool reserved) { int *count = data; count++; + return true; } static int skd_in_flight(struct skd_device *skdev) @@ -1887,13 +1888,13 @@ static void skd_isr_fwstate(struct skd_device *skdev) skd_skdev_state_to_str(skdev->state), skdev->state); } -static void skd_recover_request(struct request *req, void *data, bool reserved) +static bool skd_recover_request(struct request *req, void *data, bool reserved) { struct skd_device *const skdev = data; struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); if (skreq->state != SKD_REQ_STATE_BUSY) - return; + return true; skd_log_skreq(skdev, skreq, "recover"); @@ -1904,6 +1905,7 @@ static void skd_recover_request(struct request *req, void *data, bool reserved) skreq->state = SKD_REQ_STATE_IDLE; skreq->status = BLK_STS_IOERR; 
blk_mq_complete_request(req); + return true; } static void skd_recover_requests(struct skd_device *skdev) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2e65be8b1387..f172d63db2b5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -268,14 +268,14 @@ void nvme_complete_rq(struct request *req) } EXPORT_SYMBOL_GPL(nvme_complete_rq); -void nvme_cancel_request(struct request *req, void *data, bool reserved) +bool nvme_cancel_request(struct request *req, void *data, bool reserved) { dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag); nvme_req(req)->status = NVME_SC_ABORT_REQ; blk_mq_complete_request(req); - + return true; } EXPORT_SYMBOL_GPL(nvme_cancel_request); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 0b70c8bab045..98c3c77f48f6 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2386,7 +2386,7 @@ nvme_fc_complete_rq(struct request *rq) * status. The done path will return the io request back to the block * layer with an error status. */ -static void +static bool nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) { struct nvme_ctrl *nctrl = data; @@ -2394,6 +2394,7 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); __nvme_fc_abort_op(ctrl, op); + return true; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cee79cb388af..32a1f1cfdfb4 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -408,7 +408,7 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) } void nvme_complete_rq(struct request *req); -void nvme_cancel_request(struct request *req, void *data, bool reserved); +bool nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); -- cgit From cd94c9ed59bad76e68d1c8910fc564351ce2162c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:51:09 +0100 Subject: sx8: cleanup queue and disk allocation / freeing Make the disk/queue alloc and free helpers per-port by moving the trivial loops into the callers. 
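One detail the split relies on (the sketch restates the probe-path loop from the hunks below; it is not new code): carm_free_disk() returns early when a port never got its disk allocated, so the error path can unconditionally free all ports after a partial initialization:

    for (i = 0; i < CARM_MAX_PORTS; i++) {
        rc = carm_init_disk(host, i);
        if (rc)
            goto err_out_blkdev_disks;
    }
    ...
    err_out_blkdev_disks:
        /* Safe for ports that were never set up: carm_free_disk() checks port->disk for NULL. */
        for (i = 0; i < CARM_MAX_PORTS; i++)
            carm_free_disk(host, i);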
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/sx8.c | 107 +++++++++++++++++++++++----------------------------- 1 file changed, 48 insertions(+), 59 deletions(-) (limited to 'drivers') diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 064b8c5c7a32..1f371b9c0954 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -1499,70 +1499,54 @@ static const struct blk_mq_ops carm_mq_ops = { .queue_rq = carm_queue_rq, }; -static int carm_init_disks(struct carm_host *host) +static int carm_init_disk(struct carm_host *host, unsigned int port_no) { - unsigned int i; - int rc = 0; - - for (i = 0; i < CARM_MAX_PORTS; i++) { - struct gendisk *disk; - struct request_queue *q; - struct carm_port *port; - - port = &host->port[i]; - port->host = host; - port->port_no = i; - - disk = alloc_disk(CARM_MINORS_PER_MAJOR); - if (!disk) { - rc = -ENOMEM; - break; - } + struct carm_port *port = &host->port[port_no]; + struct gendisk *disk; + struct request_queue *q; - port->disk = disk; - sprintf(disk->disk_name, DRV_NAME "/%u", - (unsigned int) (host->id * CARM_MAX_PORTS) + i); - disk->major = host->major; - disk->first_minor = i * CARM_MINORS_PER_MAJOR; - disk->fops = &carm_bd_ops; - disk->private_data = port; - - q = blk_mq_init_sq_queue(&port->tag_set, &carm_mq_ops, - max_queue, BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(q)) { - rc = PTR_ERR(q); - break; - } - disk->queue = q; - blk_queue_max_segments(q, CARM_MAX_REQ_SG); - blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); + port->host = host; + port->port_no = port_no; - q->queuedata = port; - } + disk = alloc_disk(CARM_MINORS_PER_MAJOR); + if (!disk) + return -ENOMEM; - return rc; + port->disk = disk; + sprintf(disk->disk_name, DRV_NAME "/%u", + (unsigned int)host->id * CARM_MAX_PORTS + port_no); + disk->major = host->major; + disk->first_minor = port_no * CARM_MINORS_PER_MAJOR; + disk->fops = &carm_bd_ops; + disk->private_data = port; + + q = blk_mq_init_sq_queue(&port->tag_set, &carm_mq_ops, + max_queue, BLK_MQ_F_SHOULD_MERGE); + if (IS_ERR(q)) + return PTR_ERR(q); + disk->queue = q; + blk_queue_max_segments(q, CARM_MAX_REQ_SG); + blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); + + q->queuedata = port; + return 0; } -static void carm_free_disks(struct carm_host *host) +static void carm_free_disk(struct carm_host *host, unsigned int port_no) { - unsigned int i; + struct carm_port *port = &host->port[port_no]; + struct gendisk *disk = port->disk; - for (i = 0; i < CARM_MAX_PORTS; i++) { - struct carm_port *port = &host->port[i]; - struct gendisk *disk = port->disk; - - if (disk) { - struct request_queue *q = disk->queue; + if (!disk) + return; - if (disk->flags & GENHD_FL_UP) - del_gendisk(disk); - if (q) { - blk_mq_free_tag_set(&port->tag_set); - blk_cleanup_queue(q); - } - put_disk(disk); - } + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + if (disk->queue) { + blk_mq_free_tag_set(&port->tag_set); + blk_cleanup_queue(disk->queue); } + put_disk(disk); } static int carm_init_shm(struct carm_host *host) @@ -1667,9 +1651,11 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) if (host->flags & FL_DYN_MAJOR) host->major = rc; - rc = carm_init_disks(host); - if (rc) - goto err_out_blkdev_disks; + for (i = 0; i < CARM_MAX_PORTS; i++) { + rc = carm_init_disk(host, i); + if (rc) + goto err_out_blkdev_disks; + } pci_set_master(pdev); @@ -1699,7 +1685,8 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) err_out_free_irq: free_irq(pdev->irq, host); 
err_out_blkdev_disks: - carm_free_disks(host); + for (i = 0; i < CARM_MAX_PORTS; i++) + carm_free_disk(host, i); unregister_blkdev(host->major, host->name); err_out_free_majors: if (host->major == 160) @@ -1724,6 +1711,7 @@ err_out: static void carm_remove_one (struct pci_dev *pdev) { struct carm_host *host = pci_get_drvdata(pdev); + unsigned int i; if (!host) { printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n", @@ -1732,7 +1720,8 @@ static void carm_remove_one (struct pci_dev *pdev) } free_irq(pdev->irq, host); - carm_free_disks(host); + for (i = 0; i < CARM_MAX_PORTS; i++) + carm_free_disk(host, i); unregister_blkdev(host->major, host->name); if (host->major == 160) clear_bit(0, &carm_major_alloc); -- cgit From 72d7ce8eb2bca8f77eebdd3271e59f6a5998dc4a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:51:10 +0100 Subject: sx8: use a per-host tag_set The current sx8 code spends a lot of effort dealing with the fact that tags are per-host, but there might be multiple queues. Now that the driver has been converted to blk-mq it can take care of the blk-mq tag_set concept that has been designed just for that. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/sx8.c | 343 +++++++++++++++------------------------------------- 1 file changed, 95 insertions(+), 248 deletions(-) (limited to 'drivers') diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 1f371b9c0954..4478eb7efee0 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -243,7 +243,6 @@ struct carm_port { unsigned int port_no; struct gendisk *disk; struct carm_host *host; - struct blk_mq_tag_set tag_set; /* attached device characteristics */ u64 capacity; @@ -254,13 +253,10 @@ struct carm_port { }; struct carm_request { - unsigned int tag; int n_elem; unsigned int msg_type; unsigned int msg_subtype; unsigned int msg_bucket; - struct request *rq; - struct carm_port *port; struct scatterlist sg[CARM_MAX_REQ_SG]; }; @@ -291,9 +287,6 @@ struct carm_host { unsigned int wait_q_cons; struct request_queue *wait_q[CARM_MAX_WAIT_Q]; - unsigned int n_msgs; - u64 msg_alloc; - struct carm_request req[CARM_MAX_REQ]; void *msg_base; dma_addr_t msg_dma; @@ -478,10 +471,10 @@ static inline dma_addr_t carm_ref_msg_dma(struct carm_host *host, } static int carm_send_msg(struct carm_host *host, - struct carm_request *crq) + struct carm_request *crq, unsigned tag) { void __iomem *mmio = host->mmio; - u32 msg = (u32) carm_ref_msg_dma(host, crq->tag); + u32 msg = (u32) carm_ref_msg_dma(host, tag); u32 cm_bucket = crq->msg_bucket; u32 tmp; int rc = 0; @@ -506,99 +499,24 @@ static int carm_send_msg(struct carm_host *host, return rc; } -static struct carm_request *carm_get_request(struct carm_host *host) -{ - unsigned int i; - - /* obey global hardware limit on S/G entries */ - if (host->hw_sg_used >= (CARM_MAX_HOST_SG - CARM_MAX_REQ_SG)) - return NULL; - - for (i = 0; i < max_queue; i++) - if ((host->msg_alloc & (1ULL << i)) == 0) { - struct carm_request *crq = &host->req[i]; - crq->port = NULL; - crq->n_elem = 0; - - host->msg_alloc |= (1ULL << i); - host->n_msgs++; - - assert(host->n_msgs <= CARM_MAX_REQ); - sg_init_table(crq->sg, CARM_MAX_REQ_SG); - return crq; - } - - DPRINTK("no request available, returning NULL\n"); - return NULL; -} - -static int carm_put_request(struct carm_host *host, struct carm_request *crq) -{ - assert(crq->tag < max_queue); - - if (unlikely((host->msg_alloc & (1ULL << crq->tag)) == 0)) - return -EINVAL; /* tried to clear a tag that was not active */ - - 
assert(host->hw_sg_used >= crq->n_elem); - - host->msg_alloc &= ~(1ULL << crq->tag); - host->hw_sg_used -= crq->n_elem; - host->n_msgs--; - - return 0; -} - -static struct carm_request *carm_get_special(struct carm_host *host) -{ - unsigned long flags; - struct carm_request *crq = NULL; - struct request *rq; - int tries = 5000; - - while (tries-- > 0) { - spin_lock_irqsave(&host->lock, flags); - crq = carm_get_request(host); - spin_unlock_irqrestore(&host->lock, flags); - - if (crq) - break; - msleep(10); - } - - if (!crq) - return NULL; - - rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, 0); - if (IS_ERR(rq)) { - spin_lock_irqsave(&host->lock, flags); - carm_put_request(host, crq); - spin_unlock_irqrestore(&host->lock, flags); - return NULL; - } - - crq->rq = rq; - return crq; -} - static int carm_array_info (struct carm_host *host, unsigned int array_idx) { struct carm_msg_ioctl *ioc; - unsigned int idx; u32 msg_data; dma_addr_t msg_dma; struct carm_request *crq; + struct request *rq; int rc; - crq = carm_get_special(host); - if (!crq) { + rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0); + if (IS_ERR(rq)) { rc = -ENOMEM; goto err_out; } + crq = blk_mq_rq_to_pdu(rq); - idx = crq->tag; - - ioc = carm_ref_msg(host, idx); - msg_dma = carm_ref_msg_dma(host, idx); + ioc = carm_ref_msg(host, rq->tag); + msg_dma = carm_ref_msg_dma(host, rq->tag); msg_data = (u32) (msg_dma + sizeof(struct carm_array_info)); crq->msg_type = CARM_MSG_ARRAY; @@ -612,7 +530,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) ioc->type = CARM_MSG_ARRAY; ioc->subtype = CARM_ARRAY_INFO; ioc->array_id = (u8) array_idx; - ioc->handle = cpu_to_le32(TAG_ENCODE(idx)); + ioc->handle = cpu_to_le32(TAG_ENCODE(rq->tag)); ioc->data_addr = cpu_to_le32(msg_data); spin_lock_irq(&host->lock); @@ -620,9 +538,8 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) host->state == HST_DEV_SCAN); spin_unlock_irq(&host->lock); - DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->special = crq; - blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); + DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); + blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL); return 0; @@ -637,21 +554,21 @@ typedef unsigned int (*carm_sspc_t)(struct carm_host *, unsigned int, void *); static int carm_send_special (struct carm_host *host, carm_sspc_t func) { + struct request *rq; struct carm_request *crq; struct carm_msg_ioctl *ioc; void *mem; - unsigned int idx, msg_size; + unsigned int msg_size; int rc; - crq = carm_get_special(host); - if (!crq) + rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0); + if (IS_ERR(rq)) return -ENOMEM; + crq = blk_mq_rq_to_pdu(rq); - idx = crq->tag; + mem = carm_ref_msg(host, rq->tag); - mem = carm_ref_msg(host, idx); - - msg_size = func(host, idx, mem); + msg_size = func(host, rq->tag, mem); ioc = mem; crq->msg_type = ioc->type; @@ -660,9 +577,8 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) BUG_ON(rc < 0); crq->msg_bucket = (u32) rc; - DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->special = crq; - blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); + DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); + blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL); return 0; } @@ -744,19 +660,6 @@ static unsigned int carm_fill_get_fw_ver(struct carm_host *host, sizeof(struct carm_fw_ver); } -static inline void carm_end_request_queued(struct carm_host *host, - struct 
carm_request *crq, - blk_status_t error) -{ - struct request *req = crq->rq; - int rc; - - blk_mq_end_request(req, error); - - rc = carm_put_request(host, crq); - assert(rc == 0); -} - static inline void carm_push_q (struct carm_host *host, struct request_queue *q) { unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q; @@ -791,101 +694,50 @@ static inline void carm_round_robin(struct carm_host *host) } } -static inline void carm_end_rq(struct carm_host *host, struct carm_request *crq, - blk_status_t error) -{ - carm_end_request_queued(host, crq, error); - if (max_queue == 1) - carm_round_robin(host); - else if ((host->n_msgs <= CARM_MSG_LOW_WATER) && - (host->hw_sg_used <= CARM_SG_LOW_WATER)) { - carm_round_robin(host); - } -} - -static blk_status_t carm_oob_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +static inline enum dma_data_direction carm_rq_dir(struct request *rq) { - struct request_queue *q = hctx->queue; - struct carm_host *host = q->queuedata; - struct carm_request *crq; - int rc; - - blk_mq_start_request(bd->rq); - - spin_lock_irq(&host->lock); - - crq = bd->rq->special; - assert(crq != NULL); - assert(crq->rq == bd->rq); - - crq->n_elem = 0; - - DPRINTK("send req\n"); - rc = carm_send_msg(host, crq); - if (rc) { - carm_push_q(host, q); - spin_unlock_irq(&host->lock); - return BLK_STS_DEV_RESOURCE; - } - - spin_unlock_irq(&host->lock); - return BLK_STS_OK; + return op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; } static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request_queue *q = hctx->queue; + struct request *rq = bd->rq; struct carm_port *port = q->queuedata; struct carm_host *host = port->host; + struct carm_request *crq = blk_mq_rq_to_pdu(rq); struct carm_msg_rw *msg; - struct carm_request *crq; - struct request *rq = bd->rq; struct scatterlist *sg; - int writing = 0, pci_dir, i, n_elem, rc; - u32 tmp; + int i, n_elem = 0, rc; unsigned int msg_size; + u32 tmp; + + crq->n_elem = 0; + sg_init_table(crq->sg, CARM_MAX_REQ_SG); blk_mq_start_request(rq); spin_lock_irq(&host->lock); - - crq = carm_get_request(host); - if (!crq) { - carm_push_q(host, q); - spin_unlock_irq(&host->lock); - return BLK_STS_DEV_RESOURCE; - } - crq->rq = rq; - - if (rq_data_dir(rq) == WRITE) { - writing = 1; - pci_dir = DMA_TO_DEVICE; - } else { - pci_dir = DMA_FROM_DEVICE; - } + if (req_op(rq) == REQ_OP_DRV_OUT) + goto send_msg; /* get scatterlist from block layer */ sg = &crq->sg[0]; n_elem = blk_rq_map_sg(q, rq, sg); - if (n_elem <= 0) { - /* request with no s/g entries? */ - carm_end_rq(host, crq, BLK_STS_IOERR); - spin_unlock_irq(&host->lock); - return BLK_STS_IOERR; - } + if (n_elem <= 0) + goto out_ioerr; /* map scatterlist to PCI bus addresses */ - n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, pci_dir); - if (n_elem <= 0) { - /* request with no s/g entries? 
*/ - carm_end_rq(host, crq, BLK_STS_IOERR); - spin_unlock_irq(&host->lock); - return BLK_STS_IOERR; - } + n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, carm_rq_dir(rq)); + if (n_elem <= 0) + goto out_ioerr; + + /* obey global hardware limit on S/G entries */ + if (host->hw_sg_used >= CARM_MAX_HOST_SG - n_elem) + goto out_resource; + crq->n_elem = n_elem; - crq->port = port; host->hw_sg_used += n_elem; /* @@ -893,9 +745,9 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx, */ VPRINTK("build msg\n"); - msg = (struct carm_msg_rw *) carm_ref_msg(host, crq->tag); + msg = (struct carm_msg_rw *) carm_ref_msg(host, rq->tag); - if (writing) { + if (rq_data_dir(rq) == WRITE) { msg->type = CARM_MSG_WRITE; crq->msg_type = CARM_MSG_WRITE; } else { @@ -906,7 +758,7 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx, msg->id = port->port_no; msg->sg_count = n_elem; msg->sg_type = SGT_32BIT; - msg->handle = cpu_to_le32(TAG_ENCODE(crq->tag)); + msg->handle = cpu_to_le32(TAG_ENCODE(rq->tag)); msg->lba = cpu_to_le32(blk_rq_pos(rq) & 0xffffffff); tmp = (blk_rq_pos(rq) >> 16) >> 16; msg->lba_high = cpu_to_le16( (u16) tmp ); @@ -923,22 +775,28 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx, rc = carm_lookup_bucket(msg_size); BUG_ON(rc < 0); crq->msg_bucket = (u32) rc; - +send_msg: /* * queue read/write message to hardware */ - - VPRINTK("send msg, tag == %u\n", crq->tag); - rc = carm_send_msg(host, crq); + VPRINTK("send msg, tag == %u\n", rq->tag); + rc = carm_send_msg(host, crq, rq->tag); if (rc) { - carm_put_request(host, crq); - carm_push_q(host, q); - spin_unlock_irq(&host->lock); - return BLK_STS_DEV_RESOURCE; + host->hw_sg_used -= n_elem; + goto out_resource; } spin_unlock_irq(&host->lock); return BLK_STS_OK; +out_resource: + dma_unmap_sg(&host->pdev->dev, &crq->sg[0], n_elem, carm_rq_dir(rq)); + carm_push_q(host, q); + spin_unlock_irq(&host->lock); + return BLK_STS_DEV_RESOURCE; +out_ioerr: + carm_round_robin(host); + spin_unlock_irq(&host->lock); + return BLK_STS_IOERR; } static void carm_handle_array_info(struct carm_host *host, @@ -954,8 +812,6 @@ static void carm_handle_array_info(struct carm_host *host, DPRINTK("ENTER\n"); - carm_end_rq(host, crq, error); - if (error) goto out; if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST) @@ -1011,8 +867,6 @@ static void carm_handle_scan_chan(struct carm_host *host, DPRINTK("ENTER\n"); - carm_end_rq(host, crq, error); - if (error) { new_state = HST_ERROR; goto out; @@ -1040,8 +894,6 @@ static void carm_handle_generic(struct carm_host *host, { DPRINTK("ENTER\n"); - carm_end_rq(host, crq, error); - assert(host->state == cur_state); if (error) host->state = HST_ERROR; @@ -1050,28 +902,12 @@ static void carm_handle_generic(struct carm_host *host, schedule_work(&host->fsm_task); } -static inline void carm_handle_rw(struct carm_host *host, - struct carm_request *crq, blk_status_t error) -{ - int pci_dir; - - VPRINTK("ENTER\n"); - - if (rq_data_dir(crq->rq) == WRITE) - pci_dir = DMA_TO_DEVICE; - else - pci_dir = DMA_FROM_DEVICE; - - dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem, pci_dir); - - carm_end_rq(host, crq, error); -} - static inline void carm_handle_resp(struct carm_host *host, __le32 ret_handle_le, u32 status) { u32 handle = le32_to_cpu(ret_handle_le); unsigned int msg_idx; + struct request *rq; struct carm_request *crq; blk_status_t error = (status == RMSG_OK) ? 
0 : BLK_STS_IOERR; u8 *mem; @@ -1087,13 +923,15 @@ static inline void carm_handle_resp(struct carm_host *host, msg_idx = TAG_DECODE(handle); VPRINTK("tag == %u\n", msg_idx); - crq = &host->req[msg_idx]; + rq = blk_mq_tag_to_rq(host->tag_set.tags[0], msg_idx); + crq = blk_mq_rq_to_pdu(rq); /* fast path */ if (likely(crq->msg_type == CARM_MSG_READ || crq->msg_type == CARM_MSG_WRITE)) { - carm_handle_rw(host, crq, error); - return; + dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem, + carm_rq_dir(rq)); + goto done; } mem = carm_ref_msg(host, msg_idx); @@ -1103,7 +941,7 @@ static inline void carm_handle_resp(struct carm_host *host, switch (crq->msg_subtype) { case CARM_IOC_SCAN_CHAN: carm_handle_scan_chan(host, crq, mem, error); - break; + goto done; default: /* unknown / invalid response */ goto err_out; @@ -1116,11 +954,11 @@ static inline void carm_handle_resp(struct carm_host *host, case MISC_ALLOC_MEM: carm_handle_generic(host, crq, error, HST_ALLOC_BUF, HST_SYNC_TIME); - break; + goto done; case MISC_SET_TIME: carm_handle_generic(host, crq, error, HST_SYNC_TIME, HST_GET_FW_VER); - break; + goto done; case MISC_GET_FW_VER: { struct carm_fw_ver *ver = (struct carm_fw_ver *) (mem + sizeof(struct carm_msg_get_fw_ver)); @@ -1130,7 +968,7 @@ static inline void carm_handle_resp(struct carm_host *host, } carm_handle_generic(host, crq, error, HST_GET_FW_VER, HST_PORT_SCAN); - break; + goto done; } default: /* unknown / invalid response */ @@ -1161,7 +999,13 @@ static inline void carm_handle_resp(struct carm_host *host, err_out: printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n", pci_name(host->pdev), crq->msg_type, crq->msg_subtype); - carm_end_rq(host, crq, BLK_STS_IOERR); + error = BLK_STS_IOERR; +done: + host->hw_sg_used -= crq->n_elem; + blk_mq_end_request(blk_mq_rq_from_pdu(crq), error); + + if (host->hw_sg_used <= CARM_SG_LOW_WATER) + carm_round_robin(host); } static inline void carm_handle_responses(struct carm_host *host) @@ -1491,10 +1335,6 @@ static int carm_init_host(struct carm_host *host) return 0; } -static const struct blk_mq_ops carm_oob_mq_ops = { - .queue_rq = carm_oob_queue_rq, -}; - static const struct blk_mq_ops carm_mq_ops = { .queue_rq = carm_queue_rq, }; @@ -1520,15 +1360,15 @@ static int carm_init_disk(struct carm_host *host, unsigned int port_no) disk->fops = &carm_bd_ops; disk->private_data = port; - q = blk_mq_init_sq_queue(&port->tag_set, &carm_mq_ops, - max_queue, BLK_MQ_F_SHOULD_MERGE); + q = blk_mq_init_queue(&host->tag_set); if (IS_ERR(q)) return PTR_ERR(q); - disk->queue = q; + blk_queue_max_segments(q, CARM_MAX_REQ_SG); blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); q->queuedata = port; + disk->queue = q; return 0; } @@ -1542,10 +1382,8 @@ static void carm_free_disk(struct carm_host *host, unsigned int port_no) if (disk->flags & GENHD_FL_UP) del_gendisk(disk); - if (disk->queue) { - blk_mq_free_tag_set(&port->tag_set); + if (disk->queue) blk_cleanup_queue(disk->queue); - } put_disk(disk); } @@ -1602,9 +1440,6 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&host->fsm_task, carm_fsm_task); init_completion(&host->probe_comp); - for (i = 0; i < ARRAY_SIZE(host->req); i++) - host->req[i].tag = i; - host->mmio = ioremap(pci_resource_start(pdev, 0), pci_resource_len(pdev, 0)); if (!host->mmio) { @@ -1621,14 +1456,26 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) goto err_out_iounmap; } - q = blk_mq_init_sq_queue(&host->tag_set, &carm_oob_mq_ops, 1, - 
BLK_MQ_F_NO_SCHED); + memset(&host->tag_set, 0, sizeof(host->tag_set)); + host->tag_set.ops = &carm_mq_ops; + host->tag_set.cmd_size = sizeof(struct carm_request); + host->tag_set.nr_hw_queues = 1; + host->tag_set.nr_maps = 1; + host->tag_set.queue_depth = max_queue; + host->tag_set.numa_node = NUMA_NO_NODE; + host->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + + rc = blk_mq_alloc_tag_set(&host->tag_set); + if (rc) + goto err_out_dma_free; + + q = blk_mq_init_queue(&host->tag_set); if (IS_ERR(q)) { - printk(KERN_ERR DRV_NAME "(%s): OOB queue alloc failure\n", - pci_name(pdev)); rc = PTR_ERR(q); + blk_mq_free_tag_set(&host->tag_set); goto err_out_dma_free; } + host->oob_q = q; q->queuedata = host; -- cgit From b5fa0e9ec997031083a76972d2ee0e8db671420a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:48:54 +0100 Subject: mtip32xx: move the blk_rq_map_sg call to mtip_hw_submit_io We have all arguments at hand in mtip_hw_submit_io, so keep the rq to sg mapping close to the dma_map_sg call. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 947aa10107a6..1964af8a187b 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2171,7 +2171,6 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * @dd Pointer to the driver data structure. * @start First sector to read. * @nsect Number of sectors to read. - * @nents Number of entries in scatter list for the read command. * @tag The tag of this read command. * @callback Pointer to the function that should be called * when the read completes. @@ -2183,7 +2182,7 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * None */ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, - struct mtip_cmd *command, int nents, + struct mtip_cmd *command, struct blk_mq_hw_ctx *hctx) { struct host_to_dev_fis *fis; @@ -2191,8 +2190,10 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE; u64 start = blk_rq_pos(rq); unsigned int nsect = blk_rq_sectors(rq); + unsigned int nents; /* Map the scatter list for DMA access */ + nents = blk_rq_map_sg(hctx->queue, rq, command->sg); nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); prefetch(&port->flags); @@ -3548,7 +3549,6 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct driver_data *dd = hctx->queue->queuedata; struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - unsigned int nents; if (is_se_active(dd)) return -ENODATA; @@ -3579,11 +3579,8 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) return 0; } - /* Create the scatter list for this request. */ - nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg); - /* Issue the read/write. */ - mtip_hw_submit_io(dd, rq, cmd, nents, hctx); + mtip_hw_submit_io(dd, rq, cmd, hctx); return 0; } -- cgit From 10966fa13855c5c64444bb01a97b86158e63267c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:48:55 +0100 Subject: mtip32xx: merge mtip_submit_request into mtip_queue_rq Factor out a new is_stopped helper that matches the existing is_se_active helper, and merge the trivial amount of remaining code into the only caller. 
This also allows better error handling by returning a BLK_STS_* directly instead of explicitly calling blk_mq_end_request, and moving blk_mq_start_request closer to the actual issue to hardware. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 78 ++++++++++++++------------------------- 1 file changed, 28 insertions(+), 50 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 1964af8a187b..75a4ca0210f2 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3534,54 +3534,24 @@ static inline bool is_se_active(struct driver_data *dd) return false; } -/* - * Block layer make request function. - * - * This function is called by the kernel to process a BIO for - * the P320 device. - * - * @queue Pointer to the request queue. Unused other than to obtain - * the driver data structure. - * @rq Pointer to the request. - * - */ -static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +static inline bool is_stopped(struct driver_data *dd, struct request *rq) { - struct driver_data *dd = hctx->queue->queuedata; - struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - - if (is_se_active(dd)) - return -ENODATA; - - if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { - if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, - &dd->dd_flag))) { - return -ENXIO; - } - if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) { - return -ENODATA; - } - if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT, - &dd->dd_flag) && - rq_data_dir(rq))) { - return -ENODATA; - } - if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag) || - test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))) - return -ENODATA; - } - - if (req_op(rq) == REQ_OP_DISCARD) { - int err; + if (likely(!(dd->dd_flag & MTIP_DDF_STOP_IO))) + return false; - err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); - blk_mq_end_request(rq, err ? BLK_STS_IOERR : BLK_STS_OK); - return 0; - } + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) + return true; + if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag)) + return true; + if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag) && + rq_data_dir(rq)) + return true; + if (test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) + return true; + if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) + return true; - /* Issue the read/write. 
*/ - mtip_hw_submit_io(dd, rq, cmd, hctx); - return 0; + return false; } static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, @@ -3647,8 +3617,9 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { + struct driver_data *dd = hctx->queue->queuedata; struct request *rq = bd->rq; - int ret; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); mtip_init_cmd_header(rq); @@ -3658,12 +3629,19 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(mtip_check_unal_depth(hctx, rq))) return BLK_STS_RESOURCE; + if (is_se_active(dd) || is_stopped(dd, rq)) + return BLK_STS_IOERR; + blk_mq_start_request(rq); - ret = mtip_submit_request(hctx, rq); - if (likely(!ret)) - return BLK_STS_OK; - return BLK_STS_IOERR; + if (req_op(rq) == REQ_OP_DISCARD) { + if (mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)) < 0) + return BLK_STS_IOERR; + } else { + mtip_hw_submit_io(dd, rq, cmd, hctx); + } + + return BLK_STS_OK; } static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq, -- cgit From 81e66174ab0aad0bfb3d69abb334b0402fb25df4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:48:56 +0100 Subject: mtip32xx: return a blk_status_t from mtip_send_trim This allows for better error propagation and simpler code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 75a4ca0210f2..d5277bba0c02 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1423,23 +1423,19 @@ static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, * @dd pointer to driver_data structure * @lba starting lba * @len # of 512b sectors to trim - * - * return value - * -ENOMEM Out of dma memory - * -EINVAL Invalid parameters passed in, trim not supported - * -EIO Error submitting trim request to hw */ -static int mtip_send_trim(struct driver_data *dd, unsigned int lba, - unsigned int len) +static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba, + unsigned int len) { - int i, rv = 0; u64 tlba, tlen, sect_left; struct mtip_trim_entry *buf; dma_addr_t dma_addr; struct host_to_dev_fis fis; + blk_status_t ret = BLK_STS_OK; + int i; if (!len || dd->trim_supp == false) - return -EINVAL; + return BLK_STS_IOERR; /* Trim request too big */ WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES)); @@ -1454,7 +1450,7 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba, buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr, GFP_KERNEL); if (!buf) - return -ENOMEM; + return BLK_STS_RESOURCE; memset(buf, 0, ATA_SECT_SIZE); for (i = 0, sect_left = len, tlba = lba; @@ -1486,10 +1482,10 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba, ATA_SECT_SIZE, 0, MTIP_TRIM_TIMEOUT_MS) < 0) - rv = -EIO; + ret = BLK_STS_IOERR; dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr); - return rv; + return ret; } /* @@ -3634,13 +3630,9 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); - if (req_op(rq) == REQ_OP_DISCARD) { - if (mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)) < 0) - return BLK_STS_IOERR; - } else { - mtip_hw_submit_io(dd, rq, cmd, hctx); - } - + if 
(req_op(rq) == REQ_OP_DISCARD) + return mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); + mtip_hw_submit_io(dd, rq, cmd, hctx); return BLK_STS_OK; } -- cgit From 449a15d9e49a566a38481e51a21360c7d91a9c77 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:48:57 +0100 Subject: mtip32xx: remove __force_bit2int There is no good excuse not to use proper __le16/32 types. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 33 +++++++++++++-------------------- drivers/block/mtip32xx/mtip32xx.h | 28 +++++++++++++--------------- 2 files changed, 26 insertions(+), 35 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index d5277bba0c02..9245c325fe3c 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -181,9 +181,9 @@ static void mtip_init_cmd_header(struct request *rq) (sizeof(struct mtip_cmd_hdr) * rq->tag); if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags)) - cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16); + cmd->command_header->ctbau = cpu_to_le32((cmd->command_dma >> 16) >> 16); - cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); + cmd->command_header->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); } static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) @@ -1459,8 +1459,8 @@ static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba, tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ? MTIP_MAX_TRIM_ENTRY_LEN : sect_left); - buf[i].lba = __force_bit2int cpu_to_le32(tlba); - buf[i].range = __force_bit2int cpu_to_le16(tlen); + buf[i].lba = cpu_to_le32(tlba); + buf[i].range = cpu_to_le16(tlen); tlba += tlen; sect_left -= tlen; } @@ -1590,11 +1590,9 @@ static inline void fill_command_sg(struct driver_data *dd, if (dma_len > 0x400000) dev_err(&dd->pdev->dev, "DMA segment length truncated\n"); - command_sg->info = __force_bit2int - cpu_to_le32((dma_len-1) & 0x3FFFFF); - command_sg->dba = __force_bit2int - cpu_to_le32(sg_dma_address(sg)); - command_sg->dba_upper = __force_bit2int + command_sg->info = cpu_to_le32((dma_len-1) & 0x3FFFFF); + command_sg->dba = cpu_to_le32(sg_dma_address(sg)); + command_sg->dba_upper = cpu_to_le32((sg_dma_address(sg) >> 16) >> 16); command_sg++; sg++; @@ -2231,8 +2229,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, /* Populate the command header */ command->command_header->opts = - __force_bit2int cpu_to_le32( - (nents << 16) | 5 | AHCI_CMD_PREFETCH); + cpu_to_le32((nents << 16) | 5 | AHCI_CMD_PREFETCH); command->command_header->byte_count = 0; command->direction = dma_dir; @@ -3586,20 +3583,16 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, return BLK_STS_RESOURCE; /* Populate the SG list */ - cmd->command_header->opts = - __force_bit2int cpu_to_le32(icmd->opts | icmd->fis_len); + cmd->command_header->opts = cpu_to_le32(icmd->opts | icmd->fis_len); if (icmd->buf_len) { command_sg = cmd->command + AHCI_CMD_TBL_HDR_SZ; - command_sg->info = - __force_bit2int cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF); - command_sg->dba = - __force_bit2int cpu_to_le32(icmd->buffer & 0xFFFFFFFF); + command_sg->info = cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF); + command_sg->dba = cpu_to_le32(icmd->buffer & 0xFFFFFFFF); command_sg->dba_upper = - __force_bit2int cpu_to_le32((icmd->buffer >> 16) >> 16); + cpu_to_le32((icmd->buffer >> 16) >> 16); - 
cmd->command_header->opts |= - __force_bit2int cpu_to_le32((1 << 16)); + cmd->command_header->opts |= cpu_to_le32((1 << 16)); } /* Populate the command header */ diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index e20e55dab443..0aa1ea210822 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -126,8 +126,6 @@ #define MTIP_DFS_MAX_BUF_SIZE 1024 -#define __force_bit2int (unsigned int __force) - enum { /* below are bit numbers in 'flags' defined in mtip_port */ MTIP_PF_IC_ACTIVE_BIT = 0, /* pio/ioctl */ @@ -200,9 +198,9 @@ struct mtip_work { #define MTIP_MAX_TRIM_ENTRY_LEN 0xfff8 struct mtip_trim_entry { - u32 lba; /* starting lba of region */ - u16 rsvd; /* unused */ - u16 range; /* # of 512b blocks to trim */ + __le32 lba; /* starting lba of region */ + __le16 rsvd; /* unused */ + __le16 range; /* # of 512b blocks to trim */ } __packed; struct mtip_trim { @@ -278,24 +276,24 @@ struct mtip_cmd_hdr { * - Bit 5 Unused in this implementation. * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes). */ - unsigned int opts; + __le32 opts; /* This field is unsed when using NCQ. */ union { - unsigned int byte_count; - unsigned int status; + __le32 byte_count; + __le32 status; }; /* * Lower 32 bits of the command table address associated with this * header. The command table addresses must be 128 byte aligned. */ - unsigned int ctba; + __le32 ctba; /* * If 64 bit addressing is used this field is the upper 32 bits * of the command table address associated with this command. */ - unsigned int ctbau; + __le32 ctbau; /* Reserved and unused. */ - unsigned int res[4]; + u32 res[4]; }; /* Command scatter gather structure (PRD). */ @@ -305,21 +303,21 @@ struct mtip_cmd_sg { * address must be 8 byte aligned signified by bits 2:0 being * set to 0. */ - unsigned int dba; + __le32 dba; /* * When 64 bit addressing is used this field is the upper * 32 bits of the data buffer address. */ - unsigned int dba_upper; + __le32 dba_upper; /* Unused. */ - unsigned int reserved; + __le32 reserved; /* * Bit 31: interrupt when this data block has been transferred. * Bits 30..22: reserved * Bits 21..0: byte count (minus 1). For P320 the byte count must be * 8 byte aligned signified by bits 2:0 being set to 1. */ - unsigned int info; + __le32 info; }; struct mtip_port; -- cgit From 643b5f68d0f9b5a269ebbcc9f0e6c658b41b864e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:48:58 +0100 Subject: mtip32xx: add missing endianness annotations on struct smart_attr Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 0aa1ea210822..e8b4b3d5365a 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -172,10 +172,10 @@ enum { struct smart_attr { u8 attr_id; - u16 flags; + __le16 flags; u8 cur; u8 worst; - u32 data; + __le32 data; u8 res[3]; } __packed; -- cgit From 7bbf118f3b15682f79eee52c3eb96a13a71372c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:48:59 +0100 Subject: mtip32xx: remove mtip_init_cmd_header There isn't much need for this helper - we can just calculate the offset for the command header once late in the submission path and fill out the ctba and ctbau fields there.
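An idiom that recurs throughout this series deserves a brief note: the upper half of a 64-bit DMA address is extracted as (addr >> 16) >> 16 rather than addr >> 32, which keeps the expression well-defined when dma_addr_t is a 32-bit type (shifting a value by its full bit width is undefined behavior in C). A minimal sketch of the pattern as applied to the ctba/ctbau pair:

    hdr->ctba  = cpu_to_le32(dma & 0xFFFFFFFF);  /* low 32 bits of the table address */
    hdr->ctbau = cpu_to_le32((dma >> 16) >> 16); /* high 32 bits; safe even for 32-bit dma_addr_t */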
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 44 +++++++++++++-------------------------- drivers/block/mtip32xx/mtip32xx.h | 5 ----- 2 files changed, 15 insertions(+), 34 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 9245c325fe3c..6006f7baa1eb 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -168,24 +168,6 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev) return false; /* device present */ } -/* we have to use runtime tag to setup command header */ -static void mtip_init_cmd_header(struct request *rq) -{ - struct driver_data *dd = rq->q->queuedata; - struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - - /* Point the command headers at the command tables. */ - cmd->command_header = dd->port->command_list + - (sizeof(struct mtip_cmd_hdr) * rq->tag); - cmd->command_header_dma = dd->port->command_list_dma + - (sizeof(struct mtip_cmd_hdr) * rq->tag); - - if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags)) - cmd->command_header->ctbau = cpu_to_le32((cmd->command_dma >> 16) >> 16); - - cmd->command_header->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); -} - static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { struct request *rq; @@ -197,9 +179,6 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) if (IS_ERR(rq)) return NULL; - /* Internal cmd isn't submitted via .queue_rq */ - mtip_init_cmd_header(rq); - return blk_mq_rq_to_pdu(rq); } @@ -2179,6 +2158,8 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, struct mtip_cmd *command, struct blk_mq_hw_ctx *hctx) { + struct mtip_cmd_hdr *hdr = + dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag; struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; int dma_dir = rq_data_dir(rq) == READ ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; @@ -2228,9 +2209,11 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, fis->device |= 1 << 7; /* Populate the command header */ - command->command_header->opts = - cpu_to_le32((nents << 16) | 5 | AHCI_CMD_PREFETCH); - command->command_header->byte_count = 0; + hdr->ctba = cpu_to_le32(command->command_dma & 0xFFFFFFFF); + if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags)) + hdr->ctbau = cpu_to_le32((command->command_dma >> 16) >> 16); + hdr->opts = cpu_to_le32((nents << 16) | 5 | AHCI_CMD_PREFETCH); + hdr->byte_count = 0; command->direction = dma_dir; @@ -3577,13 +3560,18 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, struct driver_data *dd = hctx->queue->queuedata; struct mtip_int_cmd *icmd = rq->special; struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct mtip_cmd_hdr *hdr = + dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag; struct mtip_cmd_sg *command_sg; if (mtip_commands_active(dd->port)) return BLK_STS_RESOURCE; + hdr->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); + if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags)) + hdr->ctbau = cpu_to_le32((cmd->command_dma >> 16) >> 16); /* Populate the SG list */ - cmd->command_header->opts = cpu_to_le32(icmd->opts | icmd->fis_len); + hdr->opts = cpu_to_le32(icmd->opts | icmd->fis_len); if (icmd->buf_len) { command_sg = cmd->command + AHCI_CMD_TBL_HDR_SZ; @@ -3592,11 +3580,11 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, command_sg->dba_upper = cpu_to_le32((icmd->buffer >> 16) >> 16); - cmd->command_header->opts |= cpu_to_le32((1 << 16)); + hdr->opts |= cpu_to_le32((1 << 16)); } /* Populate the command header */ - cmd->command_header->byte_count = 0; + hdr->byte_count = 0; blk_mq_start_request(rq); mtip_issue_non_ncq_command(dd->port, rq->tag); @@ -3610,8 +3598,6 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - mtip_init_cmd_header(rq); - if (blk_rq_is_passthrough(rq)) return mtip_issue_reserved_cmd(hctx, rq); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index e8b4b3d5365a..63414928f07c 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -323,11 +323,6 @@ struct mtip_port; /* Structure used to describe a command. */ struct mtip_cmd { - - struct mtip_cmd_hdr *command_header; /* ptr to command header entry */ - - dma_addr_t command_header_dma; /* corresponding physical address */ - void *command; /* ptr to command table entry */ dma_addr_t command_dma; /* corresponding physical address */ -- cgit From 55c7bc37e05b5f7593b76d1c74e254b996b73d1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:49:00 +0100 Subject: mtip32xx: remove mtip_get_int_command Merging this function into the only callers makes the code flow easier. 
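The open-coded replacement is the standard blk-mq idiom for driver-internal commands: allocate a reserved passthrough request, then reach the driver's per-request data. A sketch (the hunk below returns -EFAULT to suit its callers rather than PTR_ERR):

    rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED);
    if (IS_ERR(rq))
        return PTR_ERR(rq);
    int_cmd = blk_mq_rq_to_pdu(rq);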
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 6006f7baa1eb..5ae86224b2f5 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -168,20 +168,6 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev) return false; /* device present */ } -static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) -{ - struct request *rq; - - if (mtip_check_surprise_removal(dd->pdev)) - return NULL; - - rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED); - if (IS_ERR(rq)) - return NULL; - - return blk_mq_rq_to_pdu(rq); -} - static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd, unsigned int tag) { @@ -1002,12 +988,15 @@ static int mtip_exec_internal_command(struct mtip_port *port, return -EFAULT; } - int_cmd = mtip_get_int_command(dd); - if (!int_cmd) { + if (mtip_check_surprise_removal(dd->pdev)) + return -EFAULT; + + rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED); + if (IS_ERR(rq)) { dbg_printk(MTIP_DRV_NAME "Unable to allocate tag for PIO cmd\n"); return -EFAULT; } - rq = blk_mq_rq_from_pdu(int_cmd); + rq->special = &icmd; set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); @@ -1029,6 +1018,7 @@ } /* Copy the command to the command table */ + int_cmd = blk_mq_rq_to_pdu(rq); memcpy(int_cmd->command, fis, fis_len*4); rq->timeout = timeout; -- cgit From d85cb20453bc98da47ab4393a1a05afcafb39a0f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:49:01 +0100 Subject: mtip32xx: don't use req->special Instead add a pointer to the icmd to struct mtip_cmd, which can be unioned with the scatterlist used for the normal I/O path.
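The union is safe because the two members are never live at the same time: an internal passthrough command carries an icmd pointer and never maps a scatterlist, while a normal read or write maps sg[] and never touches icmd. The resulting layout, restating the header hunk below:

    union {
        struct scatterlist sg[MTIP_MAX_SG]; /* normal I/O path */
        struct mtip_int_cmd *icmd;          /* internal (passthrough) commands */
    };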
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 5 ++--- drivers/block/mtip32xx/mtip32xx.h | 7 ++++++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 5ae86224b2f5..e99db2c9118f 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -997,8 +997,6 @@ static int mtip_exec_internal_command(struct mtip_port *port, return -EFAULT; } - rq->special = &icmd; - set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); if (fis->command == ATA_CMD_SEC_ERASE_PREP) @@ -1019,6 +1017,7 @@ /* Copy the command to the command table */ int_cmd = blk_mq_rq_to_pdu(rq); + int_cmd->icmd = &icmd; memcpy(int_cmd->command, fis, fis_len*4); rq->timeout = timeout; @@ -3548,8 +3547,8 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct driver_data *dd = hctx->queue->queuedata; - struct mtip_int_cmd *icmd = rq->special; struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct mtip_int_cmd *icmd = cmd->icmd; struct mtip_cmd_hdr *hdr = dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag; struct mtip_cmd_sg *command_sg; diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 63414928f07c..c33f8c3d9fb4 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -321,6 +321,8 @@ struct mtip_cmd_sg { }; struct mtip_port; +struct mtip_int_cmd; + /* Structure used to describe a command. */ struct mtip_cmd { void *command; /* ptr to command table entry */ @@ -331,7 +333,10 @@ struct mtip_cmd { int unaligned; /* command is unaligned on 4k boundary */ - struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ + union { + struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ + struct mtip_int_cmd *icmd; + }; int retries; /* The number of retries left for this command. */ -- cgit From 27d420bc475e68c85d567d96caf215999d76fd16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:49:02 +0100 Subject: mtip32xx: use for_each_sg Use the proper helper instead of manually iterating the scatterlist, which is broken in the presence of chained S/G lists.
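The failure mode being fixed: a chained scatterlist is not one flat array, so a bare sg++ walks off the end of a chunk instead of following the chain pointer, while for_each_sg() advances via sg_next(), which understands chaining. A minimal sketch of the corrected walk (PRD programming elided):

    struct scatterlist *sg;
    int n;

    for_each_sg(command->sg, sg, nents, n) {
        dma_addr_t dba = sg_dma_address(sg);    /* bus address of this segment */
        unsigned int dma_len = sg_dma_len(sg);  /* length of this segment */
        /* ... program one PRD entry from dba/dma_len ... */
    }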
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index e99db2c9118f..a4c44db097e0 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1549,11 +1549,11 @@ static inline void fill_command_sg(struct driver_data *dd, int n; unsigned int dma_len; struct mtip_cmd_sg *command_sg; - struct scatterlist *sg = command->sg; + struct scatterlist *sg; command_sg = command->command + AHCI_CMD_TBL_HDR_SZ; - for (n = 0; n < nents; n++) { + for_each_sg(command->sg, sg, nents, n) { dma_len = sg_dma_len(sg); if (dma_len > 0x400000) dev_err(&dd->pdev->dev, @@ -1563,7 +1563,6 @@ static inline void fill_command_sg(struct driver_data *dd, command_sg->dba_upper = cpu_to_le32((sg_dma_address(sg) >> 16) >> 16); command_sg++; - sg++; } } -- cgit From 535ac5d3fe63b9ea1dda379f606f9d0d377d7184 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:42:35 +0100 Subject: ide: cleanup ->prep_rq calling convention The return value is just used as a binary yes/no decision, so switch it to a bool instead of the old BLKPREP_* values returned as an int. Also clean up a few related comments. Reviewed-by: Bart Van Assche Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/ide/ide-cd.c | 22 +++++++++++----------- drivers/ide/ide-disk.c | 8 ++++---- drivers/ide/ide-io.c | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 4ecaf2ace4cb..69c1aede5f93 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -527,8 +527,8 @@ static bool ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) return false; } -/* standard prep_rq_fn that builds 10 byte cmds */ -static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) +/* standard prep_rq that builds 10 byte cmds */ +static bool ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) { int hard_sect = queue_logical_block_size(q); long block = (long)blk_rq_pos(rq) / (hard_sect >> 9); @@ -554,14 +554,14 @@ static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) req->cmd[7] = (blocks >> 8) & 0xff; req->cmd[8] = blocks & 0xff; req->cmd_len = 10; - return BLKPREP_OK; + return true; } /* * Most of the SCSI commands are supported directly by ATAPI devices. * This transform handles the few exceptions. 
*/ -static int ide_cdrom_prep_pc(struct request *rq) +static bool ide_cdrom_prep_pc(struct request *rq) { u8 *c = scsi_req(rq)->cmd; @@ -575,7 +575,7 @@ static int ide_cdrom_prep_pc(struct request *rq) c[1] &= 0xe0; c[0] += (READ_10 - READ_6); scsi_req(rq)->cmd_len = 10; - return BLKPREP_OK; + return true; } /* @@ -585,13 +585,13 @@ static int ide_cdrom_prep_pc(struct request *rq) */ if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) { scsi_req(rq)->result = ILLEGAL_REQUEST; - return BLKPREP_KILL; + return false; } - return BLKPREP_OK; + return true; } -static int ide_cdrom_prep_fn(ide_drive_t *drive, struct request *rq) +static bool ide_cdrom_prep_rq(ide_drive_t *drive, struct request *rq) { if (!blk_rq_is_passthrough(rq)) { scsi_req_init(scsi_req(rq)); @@ -600,7 +600,7 @@ static int ide_cdrom_prep_fn(ide_drive_t *drive, struct request *rq) } else if (blk_rq_is_scsi(rq)) return ide_cdrom_prep_pc(rq); - return 0; + return true; } static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) @@ -818,7 +818,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) * We may be retrying this request after an error. Fix up any * weirdness which might be present in the request packet. */ - ide_cdrom_prep_fn(drive, rq); + ide_cdrom_prep_rq(drive, rq); } /* fs requests *must* be hardware frame aligned */ @@ -1521,7 +1521,7 @@ static int ide_cdrom_setup(ide_drive_t *drive) ide_debug_log(IDE_DBG_PROBE, "enter"); - drive->prep_rq = ide_cdrom_prep_fn; + drive->prep_rq = ide_cdrom_prep_rq; blk_queue_dma_alignment(q, 31); blk_queue_update_dma_pad(q, 15); diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index f8567c8c9dd1..724db9af0d82 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -427,12 +427,12 @@ static void ide_disk_unlock_native_capacity(ide_drive_t *drive) drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */ } -static int idedisk_prep_fn(ide_drive_t *drive, struct request *rq) +static bool idedisk_prep_rq(ide_drive_t *drive, struct request *rq) { struct ide_cmd *cmd; if (req_op(rq) != REQ_OP_FLUSH) - return BLKPREP_OK; + return true; if (rq->special) { cmd = rq->special; @@ -458,7 +458,7 @@ static int idedisk_prep_fn(ide_drive_t *drive, struct request *rq) rq->special = cmd; cmd->rq = rq; - return BLKPREP_OK; + return true; } ide_devset_get(multcount, mult_count); @@ -547,7 +547,7 @@ static void update_flush(ide_drive_t *drive) if (barrier) { wc = true; - drive->prep_rq = idedisk_prep_fn; + drive->prep_rq = idedisk_prep_rq; } } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 5093c605c91c..64e72640acf8 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -326,7 +326,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) goto kill_rq; } - if (drive->prep_rq && drive->prep_rq(drive, rq)) + if (drive->prep_rq && !drive->prep_rq(drive, rq)) return ide_stopped; if (ata_pm_request(rq)) @@ -508,7 +508,7 @@ repeat: /* * we know that the queue isn't empty, but this can happen - * if the q->prep_rq_fn() decides to kill a request + * if ->prep_rq() decides to kill a request */ if (!rq) { rq = bd->rq; -- cgit From c092d4ec53c9c7e690ad517b414078db4da6870b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:42:36 +0100 Subject: scsi: simplify scsi_prep_state_check Return a blk_status_t directly, and make the code a little more compact by handling the fast path in the caller. 
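For orientation, the old BLKPREP_* values correspond to blk_status_t codes as follows (this mirrors the prep_to_mq() translation that appears later in the series; BLKPREP_INVALID folds into the default case):

    BLKPREP_OK      -> BLK_STS_OK        /* proceed with dispatch */
    BLKPREP_DEFER   -> BLK_STS_RESOURCE  /* back off, retry later */
    BLKPREP_KILL    -> BLK_STS_IOERR     /* fail the request */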
Reviewed-by: Bart Van Assche Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 102 +++++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 54 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ed81b8e74cfe..5ecabb3e77b7 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1240,60 +1240,48 @@ static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req) return scsi_setup_fs_cmnd(sdev, req); } -static int +static blk_status_t scsi_prep_state_check(struct scsi_device *sdev, struct request *req) { - int ret = BLKPREP_OK; - - /* - * If the device is not in running state we will reject some - * or all commands. - */ - if (unlikely(sdev->sdev_state != SDEV_RUNNING)) { - switch (sdev->sdev_state) { - case SDEV_OFFLINE: - case SDEV_TRANSPORT_OFFLINE: - /* - * If the device is offline we refuse to process any - * commands. The device must be brought online - * before trying any recovery commands. - */ - sdev_printk(KERN_ERR, sdev, - "rejecting I/O to offline device\n"); - ret = BLKPREP_KILL; - break; - case SDEV_DEL: - /* - * If the device is fully deleted, we refuse to - * process any commands as well. - */ - sdev_printk(KERN_ERR, sdev, - "rejecting I/O to dead device\n"); - ret = BLKPREP_KILL; - break; - case SDEV_BLOCK: - case SDEV_CREATED_BLOCK: - ret = BLKPREP_DEFER; - break; - case SDEV_QUIESCE: - /* - * If the devices is blocked we defer normal commands. - */ - if (req && !(req->rq_flags & RQF_PREEMPT)) - ret = BLKPREP_DEFER; - break; - default: - /* - * For any other not fully online state we only allow - * special commands. In particular any user initiated - * command is not allowed. - */ - if (req && !(req->rq_flags & RQF_PREEMPT)) - ret = BLKPREP_KILL; - break; - } + switch (sdev->sdev_state) { + case SDEV_OFFLINE: + case SDEV_TRANSPORT_OFFLINE: + /* + * If the device is offline we refuse to process any + * commands. The device must be brought online + * before trying any recovery commands. + */ + sdev_printk(KERN_ERR, sdev, + "rejecting I/O to offline device\n"); + return BLK_STS_IOERR; + case SDEV_DEL: + /* + * If the device is fully deleted, we refuse to + * process any commands as well. + */ + sdev_printk(KERN_ERR, sdev, + "rejecting I/O to dead device\n"); + return BLK_STS_IOERR; + case SDEV_BLOCK: + case SDEV_CREATED_BLOCK: + return BLK_STS_RESOURCE; + case SDEV_QUIESCE: + /* + * If the devices is blocked we defer normal commands. + */ + if (req && !(req->rq_flags & RQF_PREEMPT)) + return BLK_STS_RESOURCE; + return BLK_STS_OK; + default: + /* + * For any other not fully online state we only allow + * special commands. In particular any user initiated + * command is not allowed. + */ + if (req && !(req->rq_flags & RQF_PREEMPT)) + return BLK_STS_IOERR; + return BLK_STS_OK; } - return ret; } /* @@ -1700,9 +1688,15 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, blk_status_t ret; int reason; - ret = prep_to_mq(scsi_prep_state_check(sdev, req)); - if (ret != BLK_STS_OK) - goto out_put_budget; + /* + * If the device is not in running state we will reject some or all + * commands. 
+ */ + if (unlikely(sdev->sdev_state != SDEV_RUNNING)) { + ret = scsi_prep_state_check(sdev, req); + if (ret != BLK_STS_OK) + goto out_put_budget; + } ret = BLK_STS_RESOURCE; if (!scsi_target_queue_ready(shost, sdev)) -- cgit From 785ba83b4f3e4fde236f03205dd1cd98fd6a5255 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:42:37 +0100 Subject: scsi: push blk_status_t up into scsi_setup_{fs,scsi}_cmnd This just moves the prep_to_mq calls up in preparation of further removal of BLKPREP_* usage. Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 5ecabb3e77b7..e665c25da144 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1177,7 +1177,20 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) scsi_add_cmd_to_list(cmd); } -static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) +static inline blk_status_t prep_to_mq(int ret) +{ + switch (ret) { + case BLKPREP_OK: + return BLK_STS_OK; + case BLKPREP_DEFER: + return BLK_STS_RESOURCE; + default: + return BLK_STS_IOERR; + } +} + +static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, + struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); @@ -1190,7 +1203,7 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) if (req->bio) { int ret = scsi_init_io(cmd); if (unlikely(ret)) - return ret; + return prep_to_mq(ret); } else { BUG_ON(blk_rq_bytes(req)); @@ -1201,29 +1214,31 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) cmd->cmnd = scsi_req(req)->cmd; cmd->transfersize = blk_rq_bytes(req); cmd->allowed = scsi_req(req)->retries; - return BLKPREP_OK; + return BLK_STS_OK; } /* * Setup a normal block command. These are simple request from filesystems * that still need to be translated to SCSI CDBs from the ULD. */ -static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req) +static blk_status_t scsi_setup_fs_cmnd(struct scsi_device *sdev, + struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); if (unlikely(sdev->handler && sdev->handler->prep_fn)) { int ret = sdev->handler->prep_fn(sdev, req); if (ret != BLKPREP_OK) - return ret; + return prep_to_mq(ret); } cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd; memset(cmd->cmnd, 0, BLK_MAX_CDB); - return scsi_cmd_to_driver(cmd)->init_command(cmd); + return prep_to_mq(scsi_cmd_to_driver(cmd)->init_command(cmd)); } -static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req) +static blk_status_t scsi_setup_cmnd(struct scsi_device *sdev, + struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); @@ -1581,18 +1596,6 @@ static int scsi_dispatch_cmd(struct scsi_cmnd *cmd) return 0; } -static inline blk_status_t prep_to_mq(int ret) -{ - switch (ret) { - case BLKPREP_OK: - return BLK_STS_OK; - case BLKPREP_DEFER: - return BLK_STS_RESOURCE; - default: - return BLK_STS_IOERR; - } -} - /* Size in bytes of the sg-list stored in the scsi-mq command-private data. 
*/ static unsigned int scsi_mq_sgl_size(struct Scsi_Host *shost) { @@ -1600,7 +1603,7 @@ static unsigned int scsi_mq_sgl_size(struct Scsi_Host *shost) sizeof(struct scatterlist); } -static int scsi_mq_prep_fn(struct request *req) +static blk_status_t scsi_mq_prep_fn(struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); struct scsi_device *sdev = req->q->queuedata; @@ -1705,7 +1708,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_dec_target_busy; if (!(req->rq_flags & RQF_DONTPREP)) { - ret = prep_to_mq(scsi_mq_prep_fn(req)); + ret = scsi_mq_prep_fn(req); if (ret != BLK_STS_OK) goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; -- cgit From 14784565f740e862adae4b1d7c91f51b4038c4f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:42:38 +0100 Subject: scsi: clean up error handling in scsi_init_io There is no need to call scsi_mq_free_sgtables until we have actually allocated sgtables. Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index e665c25da144..1f84e2cec57b 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1044,31 +1044,30 @@ int scsi_init_io(struct scsi_cmnd *cmd) int error = BLKPREP_KILL; if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq))) - goto err_exit; + return BLKPREP_KILL; error = scsi_init_sgtable(rq, &cmd->sdb); if (error) - goto err_exit; + return error; if (blk_bidi_rq(rq)) { error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special); if (error) - goto err_exit; + goto out_free_sgtables; } if (blk_integrity_rq(rq)) { struct scsi_data_buffer *prot_sdb = cmd->prot_sdb; int ivecs, count; - if (prot_sdb == NULL) { + if (WARN_ON_ONCE(!prot_sdb)) { /* * This can happen if someone (e.g. multipath) * queues a command to a device on an adapter * that does not support DIX. */ - WARN_ON_ONCE(1); error = BLKPREP_KILL; - goto err_exit; + goto out_free_sgtables; } ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio); @@ -1076,7 +1075,7 @@ int scsi_init_io(struct scsi_cmnd *cmd) if (sg_alloc_table_chained(&prot_sdb->table, ivecs, prot_sdb->table.sgl)) { error = BLKPREP_DEFER; - goto err_exit; + goto out_free_sgtables; } count = blk_rq_map_integrity_sg(rq->q, rq->bio, @@ -1089,7 +1088,7 @@ int scsi_init_io(struct scsi_cmnd *cmd) } return BLKPREP_OK; -err_exit: +out_free_sgtables: scsi_mq_free_sgtables(cmd); return error; } -- cgit From 159b2cbf59f44f2a0c005c1f323f8f05fb0a19f8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:42:39 +0100 Subject: scsi: return blk_status_t from scsi_init_io and ->init_command Replace the old BLKPREP_* values with the BLK_STS_ ones that they are converted to later anyway. 
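The prep_to_mq() helper being pushed around here is small enough to verify in isolation. A standalone sketch (constants redefined locally with assumed values, since the kernel headers are not usable from userspace) shows that the entire legacy BLKPREP_* value space collapses onto three blk-mq statuses, which is why the conversion can move up the call chain now and can be deleted outright once every callee returns blk_status_t itself:

#include <assert.h>

/* Assumed local stand-ins for the kernel's constants. */
enum { BLKPREP_OK, BLKPREP_KILL, BLKPREP_DEFER, BLKPREP_INVALID };
typedef unsigned char blk_status_t;
#define BLK_STS_OK		((blk_status_t)0)
#define BLK_STS_RESOURCE	((blk_status_t)1)
#define BLK_STS_IOERR		((blk_status_t)2)

/* The translation shim the series is eliminating call by call. */
static blk_status_t prep_to_mq(int ret)
{
	switch (ret) {
	case BLKPREP_OK:
		return BLK_STS_OK;
	case BLKPREP_DEFER:
		return BLK_STS_RESOURCE;
	default:	/* BLKPREP_KILL, BLKPREP_INVALID, ... */
		return BLK_STS_IOERR;
	}
}

int main(void)
{
	assert(prep_to_mq(BLKPREP_OK) == BLK_STS_OK);
	assert(prep_to_mq(BLKPREP_DEFER) == BLK_STS_RESOURCE);
	assert(prep_to_mq(BLKPREP_KILL) == BLK_STS_IOERR);
	assert(prep_to_mq(BLKPREP_INVALID) == BLK_STS_IOERR);
	return 0;
}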
Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 45 +++++++++++++------------- drivers/scsi/sd.c | 85 ++++++++++++++++++++++--------------------------- drivers/scsi/sd.h | 6 ++-- drivers/scsi/sd_zbc.c | 10 +++--- drivers/scsi/sr.c | 12 +++---- 5 files changed, 75 insertions(+), 83 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 1f84e2cec57b..3e3bdeee8f14 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1005,7 +1005,8 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_io_completion_action(cmd, result); } -static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) +static blk_status_t scsi_init_sgtable(struct request *req, + struct scsi_data_buffer *sdb) { int count; @@ -1014,7 +1015,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) */ if (unlikely(sg_alloc_table_chained(&sdb->table, blk_rq_nr_phys_segments(req), sdb->table.sgl))) - return BLKPREP_DEFER; + return BLK_STS_RESOURCE; /* * Next, walk the list, and fill in the addresses and sizes of @@ -1024,7 +1025,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) BUG_ON(count > sdb->table.nents); sdb->table.nents = count; sdb->length = blk_rq_payload_bytes(req); - return BLKPREP_OK; + return BLK_STS_OK; } /* @@ -1034,25 +1035,25 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) * * Arguments: cmd - Command descriptor we wish to initialize * - * Returns: 0 on success - * BLKPREP_DEFER if the failure is retryable - * BLKPREP_KILL if the failure is fatal + * Returns: BLK_STS_OK on success + * BLK_STS_RESOURCE if the failure is retryable + * BLK_STS_IOERR if the failure is fatal */ -int scsi_init_io(struct scsi_cmnd *cmd) +blk_status_t scsi_init_io(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; - int error = BLKPREP_KILL; + blk_status_t ret; if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq))) - return BLKPREP_KILL; + return BLK_STS_IOERR; - error = scsi_init_sgtable(rq, &cmd->sdb); - if (error) - return error; + ret = scsi_init_sgtable(rq, &cmd->sdb); + if (ret) + return ret; if (blk_bidi_rq(rq)) { - error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special); - if (error) + ret = scsi_init_sgtable(rq->next_rq, rq->next_rq->special); + if (ret) goto out_free_sgtables; } @@ -1066,7 +1067,7 @@ int scsi_init_io(struct scsi_cmnd *cmd) * queues a command to a device on an adapter * that does not support DIX. */ - error = BLKPREP_KILL; + ret = BLK_STS_IOERR; goto out_free_sgtables; } @@ -1074,7 +1075,7 @@ int scsi_init_io(struct scsi_cmnd *cmd) if (sg_alloc_table_chained(&prot_sdb->table, ivecs, prot_sdb->table.sgl)) { - error = BLKPREP_DEFER; + ret = BLK_STS_RESOURCE; goto out_free_sgtables; } @@ -1087,10 +1088,10 @@ int scsi_init_io(struct scsi_cmnd *cmd) cmd->prot_sdb->table.nents = count; } - return BLKPREP_OK; + return BLK_STS_OK; out_free_sgtables: scsi_mq_free_sgtables(cmd); - return error; + return ret; } EXPORT_SYMBOL(scsi_init_io); @@ -1200,9 +1201,9 @@ static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, * submit a request without an attached bio. 
*/ if (req->bio) { - int ret = scsi_init_io(cmd); - if (unlikely(ret)) - return prep_to_mq(ret); + blk_status_t ret = scsi_init_io(cmd); + if (unlikely(ret != BLK_STS_OK)) + return ret; } else { BUG_ON(blk_rq_bytes(req)); @@ -1233,7 +1234,7 @@ static blk_status_t scsi_setup_fs_cmnd(struct scsi_device *sdev, cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd; memset(cmd->cmnd, 0, BLK_MAX_CDB); - return prep_to_mq(scsi_cmd_to_driver(cmd)->init_command(cmd)); + return scsi_cmd_to_driver(cmd)->init_command(cmd); } static blk_status_t scsi_setup_cmnd(struct scsi_device *sdev, diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 3bb2b3351e35..4a6ed2fc8c71 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -114,7 +114,7 @@ static int sd_suspend_system(struct device *); static int sd_suspend_runtime(struct device *); static int sd_resume(struct device *); static void sd_rescan(struct device *); -static int sd_init_command(struct scsi_cmnd *SCpnt); +static blk_status_t sd_init_command(struct scsi_cmnd *SCpnt); static void sd_uninit_command(struct scsi_cmnd *SCpnt); static int sd_done(struct scsi_cmnd *); static void sd_eh_reset(struct scsi_cmnd *); @@ -750,7 +750,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } -static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) +static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; struct request *rq = cmd->request; @@ -761,7 +761,7 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO); if (!rq->special_vec.bv_page) - return BLKPREP_DEFER; + return BLK_STS_RESOURCE; rq->special_vec.bv_offset = 0; rq->special_vec.bv_len = data_len; rq->rq_flags |= RQF_SPECIAL_PAYLOAD; @@ -784,7 +784,8 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) return scsi_init_io(cmd); } -static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) +static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, + bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = cmd->request; @@ -794,7 +795,7 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO); if (!rq->special_vec.bv_page) - return BLKPREP_DEFER; + return BLK_STS_RESOURCE; rq->special_vec.bv_offset = 0; rq->special_vec.bv_len = data_len; rq->rq_flags |= RQF_SPECIAL_PAYLOAD; @@ -814,7 +815,8 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) return scsi_init_io(cmd); } -static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) +static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, + bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = cmd->request; @@ -824,7 +826,7 @@ static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO); if (!rq->special_vec.bv_page) - return BLKPREP_DEFER; + return BLK_STS_RESOURCE; rq->special_vec.bv_offset = 0; rq->special_vec.bv_len = data_len; rq->rq_flags |= RQF_SPECIAL_PAYLOAD; @@ -844,7 +846,7 @@ static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) return scsi_init_io(cmd); } -static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) +static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; struct scsi_device *sdp = cmd->device; @@ -862,7 +864,7 @@ 
static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) } if (sdp->no_write_same) - return BLKPREP_INVALID; + return BLK_STS_TARGET; if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff) return sd_setup_write_same16_cmnd(cmd, false); @@ -939,7 +941,7 @@ out: * Will set up either WRITE SAME(10) or WRITE SAME(16) depending on * the preference indicated by the target device. **/ -static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) +static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; struct scsi_device *sdp = cmd->device; @@ -948,10 +950,10 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) sector_t sector = blk_rq_pos(rq); unsigned int nr_sectors = blk_rq_sectors(rq); unsigned int nr_bytes = blk_rq_bytes(rq); - int ret; + blk_status_t ret; if (sdkp->device->no_write_same) - return BLKPREP_INVALID; + return BLK_STS_TARGET; BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size); @@ -992,7 +994,7 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) return ret; } -static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd) +static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; @@ -1005,10 +1007,10 @@ static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd) cmd->allowed = SD_MAX_RETRIES; rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; - return BLKPREP_OK; + return BLK_STS_OK; } -static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) +static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) { struct request *rq = SCpnt->request; struct scsi_device *sdp = SCpnt->device; @@ -1018,18 +1020,14 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) sector_t threshold; unsigned int this_count = blk_rq_sectors(rq); unsigned int dif, dix; - int ret; unsigned char protect; + blk_status_t ret; ret = scsi_init_io(SCpnt); - if (ret != BLKPREP_OK) + if (ret != BLK_STS_OK) return ret; WARN_ON_ONCE(SCpnt != rq->special); - /* from here on until we're complete, any goto out - * is used for a killable error condition */ - ret = BLKPREP_KILL; - SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt, "%s: block=%llu, count=%d\n", @@ -1042,7 +1040,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) blk_rq_sectors(rq))); SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, "Retry with 0x%p\n", SCpnt)); - goto out; + return BLK_STS_IOERR; } if (sdp->changed) { @@ -1051,7 +1049,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) * the changed bit has been reset */ /* printk("SCSI disk has been changed or is not present. 
Prohibiting further I/O.\n"); */ - goto out; + return BLK_STS_IOERR; } /* @@ -1089,31 +1087,28 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) if ((block & 1) || (blk_rq_sectors(rq) & 1)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); - goto out; - } else { - block = block >> 1; - this_count = this_count >> 1; + return BLK_STS_IOERR; } + block = block >> 1; + this_count = this_count >> 1; } if (sdp->sector_size == 2048) { if ((block & 3) || (blk_rq_sectors(rq) & 3)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); - goto out; - } else { - block = block >> 2; - this_count = this_count >> 2; + return BLK_STS_IOERR; } + block = block >> 2; + this_count = this_count >> 2; } if (sdp->sector_size == 4096) { if ((block & 7) || (blk_rq_sectors(rq) & 7)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); - goto out; - } else { - block = block >> 3; - this_count = this_count >> 3; + return BLK_STS_IOERR; } + block = block >> 3; + this_count = this_count >> 3; } if (rq_data_dir(rq) == WRITE) { SCpnt->cmnd[0] = WRITE_6; @@ -1125,7 +1120,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) SCpnt->cmnd[0] = READ_6; } else { scmd_printk(KERN_ERR, SCpnt, "Unknown command %d\n", req_op(rq)); - goto out; + return BLK_STS_IOERR; } SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, @@ -1145,10 +1140,8 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC); - if (unlikely(SCpnt->cmnd == NULL)) { - ret = BLKPREP_DEFER; - goto out; - } + if (unlikely(!SCpnt->cmnd)) + return BLK_STS_RESOURCE; SCpnt->cmd_len = SD_EXT_CDB_SIZE; memset(SCpnt->cmnd, 0, SCpnt->cmd_len); @@ -1216,7 +1209,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) */ scmd_printk(KERN_ERR, SCpnt, "FUA write on READ/WRITE(6) drive\n"); - goto out; + return BLK_STS_IOERR; } SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f); @@ -1240,12 +1233,10 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) * This indicates that the command is ready from our end to be * queued. 
*/ - ret = BLKPREP_OK; - out: - return ret; + return BLK_STS_OK; } -static int sd_init_command(struct scsi_cmnd *cmd) +static blk_status_t sd_init_command(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; @@ -1261,7 +1252,7 @@ static int sd_init_command(struct scsi_cmnd *cmd) case SD_LBP_ZERO: return sd_setup_write_same10_cmnd(cmd, false); default: - return BLKPREP_INVALID; + return BLK_STS_TARGET; } case REQ_OP_WRITE_ZEROES: return sd_setup_write_zeroes_cmnd(cmd); @@ -1276,7 +1267,7 @@ static int sd_init_command(struct scsi_cmnd *cmd) return sd_zbc_setup_reset_cmnd(cmd); default: WARN_ON_ONCE(1); - return BLKPREP_KILL; + return BLK_STS_NOTSUPP; } } diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 1d63f3a23ffb..7f43e6839bce 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -271,7 +271,7 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp) extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer); extern void sd_zbc_print_zones(struct scsi_disk *sdkp); -extern int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd); +extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd); extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr); extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, @@ -288,9 +288,9 @@ static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, static inline void sd_zbc_print_zones(struct scsi_disk *sdkp) {} -static inline int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) +static inline blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) { - return BLKPREP_INVALID; + return BLK_STS_TARGET; } static inline void sd_zbc_complete(struct scsi_cmnd *cmd, diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index e06c48c866e4..83365b29a4d8 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -185,7 +185,7 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp) * * Called from sd_init_command() for a REQ_OP_ZONE_RESET request. 
*/ -int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) +blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); @@ -194,14 +194,14 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) if (!sd_is_zoned(sdkp)) /* Not a zoned device */ - return BLKPREP_KILL; + return BLK_STS_IOERR; if (sdkp->device->changed) - return BLKPREP_KILL; + return BLK_STS_IOERR; if (sector & (sd_zbc_zone_sectors(sdkp) - 1)) /* Unaligned request */ - return BLKPREP_KILL; + return BLK_STS_IOERR; cmd->cmd_len = 16; memset(cmd->cmnd, 0, cmd->cmd_len); @@ -214,7 +214,7 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) cmd->transfersize = 0; cmd->allowed = 0; - return BLKPREP_OK; + return BLK_STS_OK; } /** diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 54dd70ae9731..38ddbbfe5f3c 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -80,7 +80,7 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_WORM); static DEFINE_MUTEX(sr_mutex); static int sr_probe(struct device *); static int sr_remove(struct device *); -static int sr_init_command(struct scsi_cmnd *SCpnt); +static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt); static int sr_done(struct scsi_cmnd *); static int sr_runtime_suspend(struct device *dev); @@ -384,22 +384,22 @@ static int sr_done(struct scsi_cmnd *SCpnt) return good_bytes; } -static int sr_init_command(struct scsi_cmnd *SCpnt) +static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) { int block = 0, this_count, s_size; struct scsi_cd *cd; struct request *rq = SCpnt->request; - int ret; + blk_status_t ret; ret = scsi_init_io(SCpnt); - if (ret != BLKPREP_OK) + if (ret != BLK_STS_OK) goto out; WARN_ON_ONCE(SCpnt != rq->special); cd = scsi_cd(rq->rq_disk); /* from here on until we're complete, any goto out * is used for a killable error condition */ - ret = BLKPREP_KILL; + ret = BLK_STS_IOERR; SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt, "Doing sr request, block = %d\n", block)); @@ -516,7 +516,7 @@ static int sr_init_command(struct scsi_cmnd *SCpnt) * This indicates that the command is ready from our end to be * queued. */ - ret = BLKPREP_OK; + ret = BLK_STS_OK; out: return ret; } -- cgit From 4c1cb67c03511a4a404aaaeb206222d5bebdc4ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Nov 2018 14:42:40 +0100 Subject: scsi: return blk_status_t from device handler ->prep_fn Remove the last use of the old BLKPREP_* values, which get converted to BLK_STS_* later anyway. Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/device_handler/scsi_dh_alua.c | 21 +++++++++++---------- drivers/scsi/device_handler/scsi_dh_emc.c | 8 ++++---- drivers/scsi/device_handler/scsi_dh_hp_sw.c | 7 +++---- drivers/scsi/device_handler/scsi_dh_rdac.c | 7 +++---- drivers/scsi/scsi_lib.c | 18 +++--------------- 5 files changed, 24 insertions(+), 37 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index 12dc7100bb4c..d7ac498ba35a 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -1071,28 +1071,29 @@ static void alua_check(struct scsi_device *sdev, bool force) * Fail I/O to all paths not in state * active/optimized or active/non-optimized. 
*/ -static int alua_prep_fn(struct scsi_device *sdev, struct request *req) +static blk_status_t alua_prep_fn(struct scsi_device *sdev, struct request *req) { struct alua_dh_data *h = sdev->handler_data; struct alua_port_group *pg; unsigned char state = SCSI_ACCESS_STATE_OPTIMAL; - int ret = BLKPREP_OK; rcu_read_lock(); pg = rcu_dereference(h->pg); if (pg) state = pg->state; rcu_read_unlock(); - if (state == SCSI_ACCESS_STATE_TRANSITIONING) - ret = BLKPREP_DEFER; - else if (state != SCSI_ACCESS_STATE_OPTIMAL && - state != SCSI_ACCESS_STATE_ACTIVE && - state != SCSI_ACCESS_STATE_LBA) { - ret = BLKPREP_KILL; + + switch (state) { + case SCSI_ACCESS_STATE_OPTIMAL: + case SCSI_ACCESS_STATE_ACTIVE: + case SCSI_ACCESS_STATE_LBA: + return BLK_STS_OK; + case SCSI_ACCESS_STATE_TRANSITIONING: + return BLK_STS_RESOURCE; + default: req->rq_flags |= RQF_QUIET; + return BLK_STS_IOERR; } - return ret; - } static void alua_rescan(struct scsi_device *sdev) diff --git a/drivers/scsi/device_handler/scsi_dh_emc.c b/drivers/scsi/device_handler/scsi_dh_emc.c index 95c47909a58f..bea8e13febb6 100644 --- a/drivers/scsi/device_handler/scsi_dh_emc.c +++ b/drivers/scsi/device_handler/scsi_dh_emc.c @@ -341,17 +341,17 @@ static int clariion_check_sense(struct scsi_device *sdev, return SCSI_RETURN_NOT_HANDLED; } -static int clariion_prep_fn(struct scsi_device *sdev, struct request *req) +static blk_status_t clariion_prep_fn(struct scsi_device *sdev, + struct request *req) { struct clariion_dh_data *h = sdev->handler_data; - int ret = BLKPREP_OK; if (h->lun_state != CLARIION_LUN_OWNED) { - ret = BLKPREP_KILL; req->rq_flags |= RQF_QUIET; + return BLK_STS_IOERR; } - return ret; + return BLK_STS_OK; } static int clariion_std_inquiry(struct scsi_device *sdev, diff --git a/drivers/scsi/device_handler/scsi_dh_hp_sw.c b/drivers/scsi/device_handler/scsi_dh_hp_sw.c index e65a0ebb4b54..80129b033855 100644 --- a/drivers/scsi/device_handler/scsi_dh_hp_sw.c +++ b/drivers/scsi/device_handler/scsi_dh_hp_sw.c @@ -172,17 +172,16 @@ retry: return rc; } -static int hp_sw_prep_fn(struct scsi_device *sdev, struct request *req) +static blk_status_t hp_sw_prep_fn(struct scsi_device *sdev, struct request *req) { struct hp_sw_dh_data *h = sdev->handler_data; - int ret = BLKPREP_OK; if (h->path_state != HP_SW_PATH_ACTIVE) { - ret = BLKPREP_KILL; req->rq_flags |= RQF_QUIET; + return BLK_STS_IOERR; } - return ret; + return BLK_STS_OK; } /* diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c index d27fabae8ddd..65f1fe343c64 100644 --- a/drivers/scsi/device_handler/scsi_dh_rdac.c +++ b/drivers/scsi/device_handler/scsi_dh_rdac.c @@ -642,17 +642,16 @@ done: return 0; } -static int rdac_prep_fn(struct scsi_device *sdev, struct request *req) +static blk_status_t rdac_prep_fn(struct scsi_device *sdev, struct request *req) { struct rdac_dh_data *h = sdev->handler_data; - int ret = BLKPREP_OK; if (h->state != RDAC_STATE_ACTIVE) { - ret = BLKPREP_KILL; req->rq_flags |= RQF_QUIET; + return BLK_STS_IOERR; } - return ret; + return BLK_STS_OK; } static int rdac_check_sense(struct scsi_device *sdev, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 3e3bdeee8f14..5d83a162d03b 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1177,18 +1177,6 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) scsi_add_cmd_to_list(cmd); } -static inline blk_status_t prep_to_mq(int ret) -{ - switch (ret) { - case BLKPREP_OK: - return BLK_STS_OK; - case BLKPREP_DEFER: - return 
BLK_STS_RESOURCE; - default: - return BLK_STS_IOERR; - } -} - static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) { @@ -1227,9 +1215,9 @@ static blk_status_t scsi_setup_fs_cmnd(struct scsi_device *sdev, struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); if (unlikely(sdev->handler && sdev->handler->prep_fn)) { - int ret = sdev->handler->prep_fn(sdev, req); - if (ret != BLKPREP_OK) - return prep_to_mq(ret); + blk_status_t ret = sdev->handler->prep_fn(sdev, req); + if (ret != BLK_STS_OK) + return ret; } cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd; -- cgit From 511c49fe1804671800947b69281e07719fad25e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 10 Nov 2018 09:30:44 +0100 Subject: fnic: fix fnic_scsi_host_{start,end}_tag The way these functions abuse ->special to try to store the dummy request looks completely broken, given that it actually stores the original scsi command. Instead switch to ->host_scribble and store the actual dummy command. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/fnic/fnic_scsi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c index 96acfcecd540..cafbcfb85bfa 100644 --- a/drivers/scsi/fnic/fnic_scsi.c +++ b/drivers/scsi/fnic/fnic_scsi.c @@ -2274,7 +2274,7 @@ fnic_scsi_host_start_tag(struct fnic *fnic, struct scsi_cmnd *sc) return SCSI_NO_TAG; sc->tag = sc->request->tag = dummy->tag; - sc->request->special = sc; + sc->host_scribble = (unsigned char *)dummy; return dummy->tag; } @@ -2286,7 +2286,7 @@ fnic_scsi_host_start_tag(struct fnic *fnic, struct scsi_cmnd *sc) static inline void fnic_scsi_host_end_tag(struct fnic *fnic, struct scsi_cmnd *sc) { - struct request *dummy = sc->request->special; + struct request *dummy = (struct request *)sc->host_scribble; blk_mq_free_request(dummy); } -- cgit From 49f6613632f9504ae59ccbbb92a5b07f474d75b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 10 Nov 2018 09:30:45 +0100 Subject: nullb: remove leftover legacy request code null_softirq_done_fn is only used for the blk-mq path, so remove the other branch. Also rename the function to better match the method name. 
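The reason the blk-mq path needs no ->special is that the driver's per-request payload is allocated back-to-back with the request itself, so blk_mq_rq_to_pdu() is pure pointer arithmetic rather than a lookup. A minimal userspace model of that layout (structs reduced to one field each, allocation done by hand):

#include <stdio.h>
#include <stdlib.h>

struct request { int tag; };
struct nullb_cmd { int error; };

/* In the kernel this is literally rq + 1: the PDU sits right
 * behind struct request in the same allocation. */
static void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}

int main(void)
{
	struct request *rq = calloc(1, sizeof(struct request) +
				       sizeof(struct nullb_cmd));
	struct nullb_cmd *cmd;

	if (!rq)
		return 1;
	cmd = blk_mq_rq_to_pdu(rq);
	cmd->error = 0;		/* completion only touches the PDU */
	printf("rq=%p pdu=%p\n", (void *)rq, (void *)cmd);
	free(rq);
	return 0;
}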
Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 9c045bee0985..16ba3db2a015 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -642,14 +642,11 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd) hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); } -static void null_softirq_done_fn(struct request *rq) +static void null_complete_rq(struct request *rq) { struct nullb *nullb = rq->q->queuedata; - if (nullb->dev->queue_mode == NULL_Q_MQ) - end_cmd(blk_mq_rq_to_pdu(rq)); - else - end_cmd(rq->special); + end_cmd(blk_mq_rq_to_pdu(rq)); } static struct nullb_page *null_alloc_page(gfp_t gfp_flags) @@ -1357,7 +1354,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, static const struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, - .complete = null_softirq_done_fn, + .complete = null_complete_rq, .timeout = null_timeout_rq, }; -- cgit From 1bee42438f32de405987b4a41b93624b184ae481 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 10 Nov 2018 09:30:46 +0100 Subject: skd_main: don't use req->special Add a retries field to the internal request structure instead, which gets set to zero on the first submission. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a0196477165f..a10d5736d8f7 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -181,6 +181,7 @@ struct skd_request_context { struct fit_completion_entry_v1 completion; struct fit_comp_error_info err_info; + int retries; blk_status_t status; }; @@ -495,6 +496,11 @@ static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(skdev->state != SKD_DRVR_STATE_ONLINE)) return skd_fail_all(q) ? BLK_STS_IOERR : BLK_STS_RESOURCE; + if (!(req->rq_flags & RQF_DONTPREP)) { + skreq->retries = 0; + req->rq_flags |= RQF_DONTPREP; + } + blk_mq_start_request(req); WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n", @@ -1426,7 +1432,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev, break; case SKD_CHECK_STATUS_REQUEUE_REQUEST: - if ((unsigned long) ++req->special < SKD_MAX_RETRIES) { + if (++skreq->retries < SKD_MAX_RETRIES) { skd_log_skreq(skdev, skreq, "retry"); blk_mq_requeue_request(req, true); break; -- cgit From 61e7712e25bbe964c9537bb1171bac4df7afa593 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 10 Nov 2018 09:30:47 +0100 Subject: aoe: replace ->special use with private data in the request Makes the code a whole lot easier to read. 
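What used to be an unsigned long smuggled through rq->special is now plain arithmetic on a typed field in the PDU. A compact model of the bookkeeping (the __rq_for_each_bio() walk spelled out as a loop, types cut down to the minimum):

#include <stdio.h>

struct bio { struct bio *bi_next; };
struct aoe_req { unsigned long nr_bios; };

/* Count the bios once when the request starts... */
static void start_request(struct aoe_req *req, struct bio *first)
{
	struct bio *bio;

	req->nr_bios = 0;
	for (bio = first; bio; bio = bio->bi_next)
		req->nr_bios++;
}

/* ...then decrement per completed bio; zero means the request
 * as a whole can be ended. */
static int end_one_bio(struct aoe_req *req)
{
	return --req->nr_bios == 0;
}

int main(void)
{
	struct bio b2 = { NULL }, b1 = { &b2 };
	struct aoe_req req;

	start_request(&req, &b1);
	printf("bios=%lu\n", req.nr_bios);		   /* 2 */
	printf("done after 1st? %d\n", end_one_bio(&req)); /* 0 */
	printf("done after 2nd? %d\n", end_one_bio(&req)); /* 1 */
	return 0;
}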
Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/aoe/aoe.h | 4 ++++ drivers/block/aoe/aoeblk.c | 1 + drivers/block/aoe/aoecmd.c | 27 +++++++++------------------ drivers/block/aoe/aoedev.c | 11 ++++++----- 4 files changed, 20 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 7ca76ed2e71a..84d0fcebd6af 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -100,6 +100,10 @@ enum { MAX_TAINT = 1000, /* cap on aoetgt taint */ }; +struct aoe_req { + unsigned long nr_bios; +}; + struct buf { ulong nframesout; struct bio *bio; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index ed26b7287256..e2c6aae2d636 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -387,6 +387,7 @@ aoeblk_gdalloc(void *vp) set = &d->tag_set; set->ops = &aoeblk_mq_ops; + set->cmd_size = sizeof(struct aoe_req); set->nr_hw_queues = 1; set->queue_depth = 128; set->numa_node = NUMA_NO_NODE; diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index bb2fba651bd2..3cf9bc5d8d95 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -822,17 +822,6 @@ out: spin_unlock_irqrestore(&d->lock, flags); } -static unsigned long -rqbiocnt(struct request *r) -{ - struct bio *bio; - unsigned long n = 0; - - __rq_for_each_bio(bio, r) - n++; - return n; -} - static void bufinit(struct buf *buf, struct request *rq, struct bio *bio) { @@ -847,6 +836,7 @@ nextbuf(struct aoedev *d) { struct request *rq; struct request_queue *q; + struct aoe_req *req; struct buf *buf; struct bio *bio; @@ -865,7 +855,11 @@ nextbuf(struct aoedev *d) blk_mq_start_request(rq); d->ip.rq = rq; d->ip.nxbio = rq->bio; - rq->special = (void *) rqbiocnt(rq); + + req = blk_mq_rq_to_pdu(rq); + req->nr_bios = 0; + __rq_for_each_bio(bio, rq) + req->nr_bios++; } buf = mempool_alloc(d->bufpool, GFP_ATOMIC); if (buf == NULL) { @@ -1069,16 +1063,13 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail) static void aoe_end_buf(struct aoedev *d, struct buf *buf) { - struct request *rq; - unsigned long n; + struct request *rq = buf->rq; + struct aoe_req *req = blk_mq_rq_to_pdu(rq); if (buf == d->ip.buf) d->ip.buf = NULL; - rq = buf->rq; mempool_free(buf, d->bufpool); - n = (unsigned long) rq->special; - rq->special = (void *) --n; - if (n == 0) + if (--req->nr_bios == 0) aoe_end_request(d, rq, 0); } diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 9063f8efbd3b..5b49f1b33ebe 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -160,21 +160,22 @@ static void aoe_failip(struct aoedev *d) { struct request *rq; + struct aoe_req *req; struct bio *bio; - unsigned long n; aoe_failbuf(d, d->ip.buf); - rq = d->ip.rq; if (rq == NULL) return; + + req = blk_mq_rq_to_pdu(rq); while ((bio = d->ip.nxbio)) { bio->bi_status = BLK_STS_IOERR; d->ip.nxbio = bio->bi_next; - n = (unsigned long) rq->special; - rq->special = (void *) --n; + req->nr_bios--; } - if ((unsigned long) rq->special == 0) + + if (!req->nr_bios) aoe_end_request(d, rq, 0); } -- cgit From 289d088b66182076d33b5579417d429371cf9dfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 10 Nov 2018 09:30:48 +0100 Subject: pd: replace ->special use with private data in the request Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/paride/pd.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 
insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index ae4971e5d9a8..0ff9b12d0e35 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -242,6 +242,11 @@ struct pd_unit { static struct pd_unit pd[PD_UNITS]; +struct pd_req { + /* for REQ_OP_DRV_IN: */ + enum action (*func)(struct pd_unit *disk); +}; + static char pd_scratch[512]; /* scratch block buffer */ static char *pd_errs[17] = { "ERR", "INDEX", "ECC", "DRQ", "SEEK", "WRERR", @@ -502,8 +507,9 @@ static enum action do_pd_io_start(void) static enum action pd_special(void) { - enum action (*func)(struct pd_unit *) = pd_req->special; - return func(pd_current); + struct pd_req *req = blk_mq_rq_to_pdu(pd_req); + + return req->func(pd_current); } static int pd_next_buf(void) @@ -767,12 +773,14 @@ static int pd_special_command(struct pd_unit *disk, enum action (*func)(struct pd_unit *disk)) { struct request *rq; + struct pd_req *req; rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); + req = blk_mq_rq_to_pdu(rq); - rq->special = func; + req->func = func; blk_execute_rq(disk->gd->queue, disk->gd, rq, 0); blk_put_request(rq); return 0; @@ -892,9 +900,21 @@ static void pd_probe_drive(struct pd_unit *disk) disk->gd = p; p->private_data = disk; - p->queue = blk_mq_init_sq_queue(&disk->tag_set, &pd_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING); + memset(&disk->tag_set, 0, sizeof(disk->tag_set)); + disk->tag_set.ops = &pd_mq_ops; + disk->tag_set.cmd_size = sizeof(struct pd_req); + disk->tag_set.nr_hw_queues = 1; + disk->tag_set.nr_maps = 1; + disk->tag_set.queue_depth = 2; + disk->tag_set.numa_node = NUMA_NO_NODE; + disk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; + + if (blk_mq_alloc_tag_set(&disk->tag_set)) + return; + + p->queue = blk_mq_init_queue(&disk->tag_set); if (IS_ERR(p->queue)) { + blk_mq_free_tag_set(&disk->tag_set); p->queue = NULL; return; } -- cgit From 22ce0a7ccf23d55d1fdaa2974002f8b5ae765665 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 10 Nov 2018 09:30:49 +0100 Subject: ide: don't use req->special Just replace it with a field of the same name in struct ide_req. 
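This conversion and the pd one just above follow the same recipe: declare a typed field in the driver's per-request PDU and go through that instead of the untyped rq->special pointer. Modeled on the pd case (function and type names taken from the patch, everything around them simplified), the payoff is that the compiler now checks the dispatch:

#include <stdio.h>

enum action { ACT_FAIL, ACT_OK };

struct pd_unit { const char *name; };

struct pd_req {
	/* for REQ_OP_DRV_IN-style internal commands */
	enum action (*func)(struct pd_unit *disk);
};

static enum action pd_identify(struct pd_unit *disk)
{
	printf("identify %s\n", disk->name);
	return ACT_OK;
}

/* The equivalent of pd_special(): dispatch via the typed PDU
 * field rather than a cast of rq->special. */
static enum action pd_special(struct pd_req *req, struct pd_unit *disk)
{
	return req->func(disk);
}

int main(void)
{
	struct pd_unit disk = { "pda" };
	struct pd_req req = { .func = pd_identify };

	return pd_special(&req, &disk) == ACT_OK ? 0 : 1;
}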
Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/ide/ide-atapi.c | 4 ++-- drivers/ide/ide-cd.c | 4 ++-- drivers/ide/ide-devsets.c | 4 ++-- drivers/ide/ide-disk.c | 6 +++--- drivers/ide/ide-eh.c | 2 +- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-io.c | 14 +++++++++----- drivers/ide/ide-park.c | 4 ++-- drivers/ide/ide-pm.c | 12 ++++++------ drivers/ide/ide-tape.c | 2 +- drivers/ide/ide-taskfile.c | 2 +- 11 files changed, 30 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 33210bc67618..da58020a144e 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -94,7 +94,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_MISC; - rq->special = (char *)pc; + ide_req(rq)->special = pc; if (buf && bufflen) { error = blk_rq_map_kern(drive->queue, rq, buf, bufflen, @@ -244,7 +244,7 @@ int ide_queue_sense_rq(ide_drive_t *drive, void *special) return -ENOMEM; } - sense_rq->special = special; + ide_req(sense_rq)->special = special; drive->sense_rq_armed = false; drive->hwif->rq = NULL; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 69c1aede5f93..1f03884a6808 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -211,12 +211,12 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive, static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* - * For ATA_PRIV_SENSE, "rq->special" points to the original + * For ATA_PRIV_SENSE, "ide_req(rq)->special" points to the original * failed request. Also, the sense data should be read * directly from rq which might be different from the original * sense buffer if it got copied during mapping. 
*/ - struct request *failed = (struct request *)rq->special; + struct request *failed = ide_req(rq)->special; void *sense = bio_data(rq->bio); if (failed) { diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index f4f8afdf8bbe..f2f93ed40356 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -171,7 +171,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, scsi_req(rq)->cmd_len = 5; scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC; *(int *)&scsi_req(rq)->cmd[1] = arg; - rq->special = setting->set; + ide_req(rq)->special = setting->set; blk_execute_rq(q, NULL, rq, 0); ret = scsi_req(rq)->result; @@ -182,7 +182,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq) { - int err, (*setfunc)(ide_drive_t *, int) = rq->special; + int err, (*setfunc)(ide_drive_t *, int) = ide_req(rq)->special; err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]); if (err) diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 724db9af0d82..197912af5c2f 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -434,8 +434,8 @@ static bool idedisk_prep_rq(ide_drive_t *drive, struct request *rq) if (req_op(rq) != REQ_OP_FLUSH) return true; - if (rq->special) { - cmd = rq->special; + if (ide_req(rq)->special) { + cmd = ide_req(rq)->special; memset(cmd, 0, sizeof(*cmd)); } else { cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC); @@ -455,7 +455,7 @@ static bool idedisk_prep_rq(ide_drive_t *drive, struct request *rq) rq->cmd_flags &= ~REQ_OP_MASK; rq->cmd_flags |= REQ_OP_DRV_OUT; ide_req(rq)->type = ATA_PRIV_TASKFILE; - rq->special = cmd; + ide_req(rq)->special = cmd; cmd->rq = rq; return true; diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 47d5f3379748..e1323e058454 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -125,7 +125,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) /* retry only "normal" I/O: */ if (blk_rq_is_passthrough(rq)) { if (ata_taskfile_request(rq)) { - struct ide_cmd *cmd = rq->special; + struct ide_cmd *cmd = ide_req(rq)->special; if (cmd) ide_complete_cmd(drive, cmd, stat, err); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index a8df300f949c..780d33ccc5d8 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -276,7 +276,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, switch (ide_req(rq)->type) { case ATA_PRIV_MISC: case ATA_PRIV_SENSE: - pc = (struct ide_atapi_pc *)rq->special; + pc = (struct ide_atapi_pc *)ide_req(rq)->special; break; default: BUG(); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 64e72640acf8..94e9c79c41cf 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -111,7 +111,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) } if (rq && ata_taskfile_request(rq)) { - struct ide_cmd *orig_cmd = rq->special; + struct ide_cmd *orig_cmd = ide_req(rq)->special; if (cmd->tf_flags & IDE_TFLAG_DYN) kfree(orig_cmd); @@ -261,7 +261,7 @@ EXPORT_SYMBOL_GPL(ide_init_sg_cmd); static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, struct request *rq) { - struct ide_cmd *cmd = rq->special; + struct ide_cmd *cmd = ide_req(rq)->special; if (cmd) { if (cmd->protocol == ATA_PROT_PIO) { @@ -352,7 +352,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) if (ata_taskfile_request(rq)) return execute_drive_cmd(drive, rq); else if 
(ata_pm_request(rq)) { - struct ide_pm_state *pm = rq->special; + struct ide_pm_state *pm = ide_req(rq)->special; #ifdef DEBUG_PM printk("%s: start_power_step(step: %d)\n", drive->name, pm->pm_step); @@ -460,16 +460,20 @@ blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx, ide_drive_t *drive = hctx->queue->queuedata; ide_hwif_t *hwif = drive->hwif; struct ide_host *host = hwif->host; - struct request *rq = NULL; + struct request *rq = bd->rq; ide_startstop_t startstop; + if (!(rq->rq_flags & RQF_DONTPREP)) { + rq->rq_flags |= RQF_DONTPREP; + ide_req(rq)->special = NULL; + } + /* HLD do_request() callback might sleep, make sure it's okay */ might_sleep(); if (ide_lock_host(host, hwif)) return BLK_STS_DEV_RESOURCE; - rq = bd->rq; blk_mq_start_request(rq); spin_lock_irq(&hwif->lock); diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index de9e85cf74d1..102aa3bc3e7f 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -36,7 +36,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) scsi_req(rq)->cmd[0] = REQ_PARK_HEADS; scsi_req(rq)->cmd_len = 1; ide_req(rq)->type = ATA_PRIV_MISC; - rq->special = &timeout; + ide_req(rq)->special = &timeout; blk_execute_rq(q, NULL, rq, 1); rc = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); @@ -67,7 +67,7 @@ ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq) memset(&cmd, 0, sizeof(cmd)); if (scsi_req(rq)->cmd[0] == REQ_PARK_HEADS) { - drive->sleep = *(unsigned long *)rq->special; + drive->sleep = *(unsigned long *)ide_req(rq)->special; drive->dev_flags |= IDE_DFLAG_SLEEPING; tf->command = ATA_CMD_IDLEIMMEDIATE; tf->feature = 0x44; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index ea10507e5190..a8c53c98252d 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -21,7 +21,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_PM_SUSPEND; - rq->special = &rqpm; + ide_req(rq)->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; if (mesg.event == PM_EVENT_PRETHAW) mesg.event = PM_EVENT_FREEZE; @@ -82,7 +82,7 @@ int generic_ide_resume(struct device *dev) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT); ide_req(rq)->type = ATA_PRIV_PM_RESUME; - rq->special = &rqpm; + ide_req(rq)->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; @@ -101,7 +101,7 @@ int generic_ide_resume(struct device *dev) void ide_complete_power_step(ide_drive_t *drive, struct request *rq) { - struct ide_pm_state *pm = rq->special; + struct ide_pm_state *pm = ide_req(rq)->special; #ifdef DEBUG_PM printk(KERN_INFO "%s: complete_power_step(step: %d)\n", @@ -131,7 +131,7 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq) ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) { - struct ide_pm_state *pm = rq->special; + struct ide_pm_state *pm = ide_req(rq)->special; struct ide_cmd cmd = { }; switch (pm->pm_step) { @@ -203,7 +203,7 @@ out_do_tf: void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; - struct ide_pm_state *pm = rq->special; + struct ide_pm_state *pm = ide_req(rq)->special; unsigned long flags; ide_complete_power_step(drive, rq); @@ -228,7 +228,7 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) void ide_check_pm_state(ide_drive_t *drive, struct request *rq) { - struct 
ide_pm_state *pm = rq->special; + struct ide_pm_state *pm = ide_req(rq)->special; if (blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_PM_SUSPEND && diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 34c1165226a4..db1a65f4b490 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -639,7 +639,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, goto out; } if (req->cmd[13] & REQ_IDETAPE_PC1) { - pc = (struct ide_atapi_pc *)rq->special; + pc = (struct ide_atapi_pc *)ide_req(rq)->special; req->cmd[13] &= ~(REQ_IDETAPE_PC1); req->cmd[13] |= REQ_IDETAPE_PC2; goto out; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index c21d5c50ae3a..17b2e379e872 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -440,7 +440,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, goto put_req; } - rq->special = cmd; + ide_req(rq)->special = cmd; cmd->rq = rq; blk_execute_rq(drive->queue, NULL, rq, 0); -- cgit From 8e18ebef4dd4c83cea65730821da423553ed6019 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 10 Nov 2018 13:03:52 -0700 Subject: null_blk: remove unused nullb device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compiler rightfully complains: drivers/block/null_blk_main.c: In function ‘null_complete_rq’: drivers/block/null_blk_main.c:647:16: warning: unused variable ‘nullb’ [-Wunused-variable] struct nullb *nullb = rq->q->queuedata; ^~~~~ Fixes: 49f6613632f9 ("nullb: remove leftover legacy request code") Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 16ba3db2a015..63c23fcfc4df 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -644,8 +644,6 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd) static void null_complete_rq(struct request *rq) { - struct nullb *nullb = rq->q->queuedata; - end_cmd(blk_mq_rq_to_pdu(rq)); } -- cgit From 628bd85947091830a8c4872adfd5ed1d515a9cf2 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 12 Nov 2018 08:42:14 -0700 Subject: loop: Fix double mutex_unlock(&loop_ctl_mutex) in loop_control_ioctl() Commit 0a42e99b58a20883 ("loop: Get rid of loop_index_mutex") forgot to remove mutex_unlock(&loop_ctl_mutex) from loop_control_ioctl() when replacing loop_index_mutex with loop_ctl_mutex. Fixes: 0a42e99b58a20883 ("loop: Get rid of loop_index_mutex") Reported-by: syzbot Reviewed-by: Ming Lei Reviewed-by: Jan Kara Signed-off-by: Tetsuo Handa Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bf6bc35aaf88..176ab1f28eca 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2074,12 +2074,10 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, break; if (lo->lo_state != Lo_unbound) { ret = -EBUSY; - mutex_unlock(&loop_ctl_mutex); break; } if (atomic_read(&lo->lo_refcnt) > 0) { ret = -EBUSY; - mutex_unlock(&loop_ctl_mutex); break; } lo->lo_disk->private_data = NULL; -- cgit From d16a67667c611f00b5ec0017ad2b18f473af13d2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Nov 2018 17:19:32 -0700 Subject: ide: don't clear special on ide_queue_rq() entry We can't use RQF_DONTPREP to see if we should clear ->special, as someone could have set that while inserting the request. 
Make sure we clear it in our ->initialize_rq_fn() helper instead. Fixes: 22ce0a7ccf23 ("ide: don't use req->special") Signed-off-by: Jens Axboe --- drivers/ide/ide-io.c | 5 ----- drivers/ide/ide-probe.c | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 94e9c79c41cf..c0dd0fad16a3 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -463,11 +463,6 @@ blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; ide_startstop_t startstop; - if (!(rq->rq_flags & RQF_DONTPREP)) { - rq->rq_flags |= RQF_DONTPREP; - ide_req(rq)->special = NULL; - } - /* HLD do_request() callback might sleep, make sure it's okay */ might_sleep(); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 40384838e439..63627be0811a 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -746,6 +746,7 @@ static void ide_initialize_rq(struct request *rq) { struct ide_request *req = blk_mq_rq_to_pdu(rq); + req->special = NULL; scsi_req_init(&req->sreq); req->sreq.sense = req->sense; } -- cgit From 30e066286e232772cad72c87008a958e23e40a33 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Nov 2018 10:13:50 -0700 Subject: nvme: fix boot hang with only being able to get one IRQ vector NVMe always asks for io_queues + 1 worth of IRQ vectors, which means that even when we scale all the way down, we still ask for 2 vectors and get -ENOSPC in return if the system can't support more than 1. Getting just 1 vector is fine, it just means that we'll have 1 IO queue and 1 admin queue, with a shared vector between them. Check for this case and don't add our + 1 if it happens. Fixes: 3b6592f70ad7 ("nvme: utilize two queue maps, one for reads and one for writes") Reported-by: Guenter Roeck Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6aa86dfcb32c..ffbab5b01df4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2073,7 +2073,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues) .nr_sets = ARRAY_SIZE(irq_sets), .sets = irq_sets, }; - int result; + int result = 0; /* * For irq sets, we have to ask for minvec == maxvec. This passes @@ -2088,9 +2088,16 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues) affd.nr_sets = 1; /* - * Need IRQs for read+write queues, and one for the admin queue + * Need IRQs for read+write queues, and one for the admin queue. + * If we can't get more than one vector, we have to share the + * admin queue and IO queue vector. For that case, don't add + * an extra vector for the admin queue, or we'll continue + * asking for 2 and get -ENOSPC in return. */ - nr_io_queues = irq_sets[0] + irq_sets[1] + 1; + if (result == -ENOSPC && nr_io_queues == 1) + nr_io_queues = 1; + else + nr_io_queues = irq_sets[0] + irq_sets[1] + 1; result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues, nr_io_queues, -- cgit From 39795d6534c6e698c4f9c065e0a5f4a2e5af7543 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Nov 2018 17:02:06 +0100 Subject: block: don't hold the queue_lock over blk_abort_request There is nothing it could synchronize against, so don't go through the pains of acquiring the lock. 
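The "nothing to synchronize against" argument can be made concrete with a toy model: the abort path serializes on state of its own (in the real code, the blk-mq timeout machinery), so a caller-held queue lock around the call adds contention without adding safety. This sketches the invariant only, not blk_abort_request() itself:

#include <pthread.h>
#include <stdio.h>

/* Hypothetical request whose abort path takes its own lock. */
struct request {
	pthread_mutex_t lock;
	int aborted;
};

static void abort_request(struct request *rq)
{
	pthread_mutex_lock(&rq->lock);	/* internal synchronization */
	rq->aborted = 1;
	pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
	struct request rq = { PTHREAD_MUTEX_INITIALIZER, 0 };

	/* As after the patch: no outer queue_lock taken here. */
	abort_request(&rq);
	printf("aborted=%d\n", rq.aborted);
	return 0;
}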
Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/ata/libata-eh.c | 4 ---- drivers/block/mtip32xx/mtip32xx.c | 5 +---- drivers/scsi/libsas/sas_ata.c | 5 ----- drivers/scsi/libsas/sas_scsi_host.c | 10 ++-------- 4 files changed, 3 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 01306c018398..938ed513b070 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -919,8 +919,6 @@ static void ata_eh_set_pending(struct ata_port *ap, int fastdrain) void ata_qc_schedule_eh(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; - struct request_queue *q = qc->scsicmd->device->request_queue; - unsigned long flags; WARN_ON(!ap->ops->error_handler); @@ -932,9 +930,7 @@ void ata_qc_schedule_eh(struct ata_queued_cmd *qc) * Note that ATA_QCFLAG_FAILED is unconditionally set after * this function completes. */ - spin_lock_irqsave(q->queue_lock, flags); blk_abort_request(qc->scsicmd->request); - spin_unlock_irqrestore(q->queue_lock, flags); } /** diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index a4c44db097e0..2b0ac9d01e51 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2770,10 +2770,7 @@ restart_eh: blk_mq_quiesce_queue(dd->queue); - spin_lock(dd->queue->queue_lock); - blk_mq_tagset_busy_iter(&dd->tags, - mtip_queue_cmd, dd); - spin_unlock(dd->queue->queue_lock); + blk_mq_tagset_busy_iter(&dd->tags, mtip_queue_cmd, dd); set_bit(MTIP_PF_ISSUE_CMDS_BIT, &dd->port->flags); diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 4f6cdf53e913..c90b278cc28c 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -601,12 +601,7 @@ void sas_ata_task_abort(struct sas_task *task) /* Bounce SCSI-initiated commands to the SCSI EH */ if (qc->scsicmd) { - struct request_queue *q = qc->scsicmd->device->request_queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); blk_abort_request(qc->scsicmd->request); - spin_unlock_irqrestore(q->queue_lock, flags); return; } diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 33229348dcb6..af085432c5fe 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -930,16 +930,10 @@ void sas_task_abort(struct sas_task *task) return; } - if (dev_is_sata(task->dev)) { + if (dev_is_sata(task->dev)) sas_ata_task_abort(task); - } else { - struct request_queue *q = sc->device->request_queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); + else blk_abort_request(sc->request); - spin_unlock_irqrestore(q->queue_lock, flags); - } } void sas_target_destroy(struct scsi_target *starget) -- cgit From 8295a69bdc3cb8707e645f9b2de6f3019a521882 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Nov 2018 17:02:14 +0100 Subject: drbd: don't override the queue_lock The DRBD req_lock and block layer queue_lock are used for entirely different resources. Stop using the req_lock as the block layer queue_lock. 
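For reference, a minimal sketch of the allocation call after this change (error handling trimmed; a later patch in this series removes the lock argument altogether):

	/* Sketch: pass NULL so the queue keeps its own internal lock
	 * instead of the DRBD req_lock. */
	struct request_queue *q;

	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
	if (!q)
		goto out_no_q;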
Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index fa8204214ac0..b66c59ce6260 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2792,7 +2792,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, &resource->req_lock); + q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); if (!q) goto out_no_q; device->rq_queue = q; -- cgit From 68fc68f2ff620852ee43ee7a2831bc5eeb9472d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Nov 2018 17:02:15 +0100 Subject: umem: don't override the queue_lock The umem card->lock and the block layer queue_lock are used for entirely different resources. Stop using card->lock as the block layer queue_lock. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/umem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/umem.c b/drivers/block/umem.c index be3e3ab79950..8a27b5adc2b3 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -888,8 +888,7 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) card->biotail = &card->bio; spin_lock_init(&card->lock); - card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, - &card->lock); + card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); if (!card->queue) goto failed_alloc; -- cgit From b061b326287d45aeaf313f7dddd02e88e31db14b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Nov 2018 17:02:16 +0100 Subject: mmc: simplify queue initialization Merge three functions initializing the queue into a single one, and drop an unused argument for it. Reviewed-by: Ulf Hansson Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/mmc/core/block.c | 2 +- drivers/mmc/core/queue.c | 85 +++++++++++++++++------------------------------- drivers/mmc/core/queue.h | 3 +- 3 files changed, 31 insertions(+), 59 deletions(-) (limited to 'drivers') diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index c35b5b08bb33..27606e1382e5 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2334,7 +2334,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, INIT_LIST_HEAD(&md->rpmbs); md->usage = 1; - ret = mmc_init_queue(&md->queue, card, &md->lock, subname); + ret = mmc_init_queue(&md->queue, card, &md->lock); if (ret) goto err_putdisk; diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 6edffeed9953..26ad1f571677 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -378,14 +378,37 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) init_waitqueue_head(&mq->wait); } -static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth, - const struct blk_mq_ops *mq_ops, spinlock_t *lock) +/* Set queue depth to get a reasonable value for q->nr_requests */ +#define MMC_QUEUE_DEPTH 64 + +/** + * mmc_init_queue - initialise a queue structure. + * @mq: mmc queue + * @card: mmc card to attach this queue + * @lock: queue lock + * + * Initialise a MMC card request queue. 
+ */ +int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, + spinlock_t *lock) { + struct mmc_host *host = card->host; int ret; + mq->card = card; + mq->use_cqe = host->cqe_enabled; + memset(&mq->tag_set, 0, sizeof(mq->tag_set)); - mq->tag_set.ops = mq_ops; - mq->tag_set.queue_depth = q_depth; + mq->tag_set.ops = &mmc_mq_ops; + /* + * The queue depth for CQE must match the hardware because the request + * tag is used to index the hardware queue. + */ + if (mq->use_cqe) + mq->tag_set.queue_depth = + min_t(int, card->ext_csd.cmdq_depth, host->cqe_qdepth); + else + mq->tag_set.queue_depth = MMC_QUEUE_DEPTH; mq->tag_set.numa_node = NUMA_NO_NODE; mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING; @@ -405,66 +428,16 @@ static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth, mq->queue->queue_lock = lock; mq->queue->queuedata = mq; + blk_queue_rq_timeout(mq->queue, 60 * HZ); + mmc_setup_queue(mq, card); return 0; free_tag_set: blk_mq_free_tag_set(&mq->tag_set); - return ret; } -/* Set queue depth to get a reasonable value for q->nr_requests */ -#define MMC_QUEUE_DEPTH 64 - -static int mmc_mq_init(struct mmc_queue *mq, struct mmc_card *card, - spinlock_t *lock) -{ - struct mmc_host *host = card->host; - int q_depth; - int ret; - - /* - * The queue depth for CQE must match the hardware because the request - * tag is used to index the hardware queue. - */ - if (mq->use_cqe) - q_depth = min_t(int, card->ext_csd.cmdq_depth, host->cqe_qdepth); - else - q_depth = MMC_QUEUE_DEPTH; - - ret = mmc_mq_init_queue(mq, q_depth, &mmc_mq_ops, lock); - if (ret) - return ret; - - blk_queue_rq_timeout(mq->queue, 60 * HZ); - - mmc_setup_queue(mq, card); - - return 0; -} - -/** - * mmc_init_queue - initialise a queue structure. - * @mq: mmc queue - * @card: mmc card to attach this queue - * @lock: queue lock - * @subname: partition subname - * - * Initialise a MMC card request queue. - */ -int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, - spinlock_t *lock, const char *subname) -{ - struct mmc_host *host = card->host; - - mq->card = card; - - mq->use_cqe = host->cqe_enabled; - - return mmc_mq_init(mq, card, lock); -} - void mmc_queue_suspend(struct mmc_queue *mq) { blk_mq_quiesce_queue(mq->queue); diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h index 9bf3c9245075..29218e12900d 100644 --- a/drivers/mmc/core/queue.h +++ b/drivers/mmc/core/queue.h @@ -95,8 +95,7 @@ struct mmc_queue { struct work_struct complete_work; }; -extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *, - const char *); +extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *); extern void mmc_cleanup_queue(struct mmc_queue *); extern void mmc_queue_suspend(struct mmc_queue *); extern void mmc_queue_resume(struct mmc_queue *); -- cgit From 310df020cdd7570e1a8ee43bd58999a743686eda Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Nov 2018 17:02:17 +0100 Subject: mmc: stop abusing the request queue_lock pointer mmc uses the block layer struct request_queue queue_lock pointer to indirect its own lock to the mmc_queue structure, given that the original lock isn't reachable outside of block.c. Add a lock pointer to struct mmc_queue instead and stop overriding the block layer lock, which protects fields entirely separate from the mmc use.
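A hedged sketch of the resulting locking pattern (the helper is invented; mq->lock, mq->busy and mq->recovery_needed are the fields touched by the diff below):

	/* Sketch: mmc-private state is guarded through the lock pointer
	 * in struct mmc_queue, never through q->queue_lock. */
	static bool sketch_mmc_queue_blocked(struct mmc_queue *mq)	/* hypothetical */
	{
		unsigned long flags;
		bool blocked;

		spin_lock_irqsave(mq->lock, flags);
		blocked = mq->recovery_needed || mq->busy;
		spin_unlock_irqrestore(mq->lock, flags);

		return blocked;
	}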
Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/mmc/core/block.c | 22 ++++++++++------------ drivers/mmc/core/queue.c | 26 +++++++++++++------------- drivers/mmc/core/queue.h | 1 + 3 files changed, 24 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 27606e1382e5..70ec465beb69 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1483,7 +1483,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req) blk_mq_end_request(req, BLK_STS_OK); } - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(mq->lock, flags); mq->in_flight[mmc_issue_type(mq, req)] -= 1; @@ -1491,7 +1491,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req) mmc_cqe_check_busy(mq); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(mq->lock, flags); if (!mq->cqe_busy) blk_mq_run_hw_queues(q, true); @@ -1988,17 +1988,16 @@ static void mmc_blk_mq_poll_completion(struct mmc_queue *mq, static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req) { - struct request_queue *q = req->q; unsigned long flags; bool put_card; - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(mq->lock, flags); mq->in_flight[mmc_issue_type(mq, req)] -= 1; put_card = (mmc_tot_in_flight(mq) == 0); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(mq->lock, flags); if (put_card) mmc_put_card(mq->card, &mq->ctx); @@ -2094,11 +2093,11 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) * request does not need to wait (although it does need to * complete complete_req first). */ - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(mq->lock, flags); mq->complete_req = req; mq->rw_wait = false; waiting = mq->waiting; - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(mq->lock, flags); /* * If 'waiting' then the waiting task will complete this @@ -2117,10 +2116,10 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) /* Take the recovery path for errors or urgent background operations */ if (mmc_blk_rq_error(&mqrq->brq) || mmc_blk_urgent_bkops_needed(mq, mqrq)) { - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(mq->lock, flags); mq->recovery_needed = true; mq->recovery_req = req; - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(mq->lock, flags); wake_up(&mq->wait); schedule_work(&mq->recovery_work); return; @@ -2136,7 +2135,6 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) { - struct request_queue *q = mq->queue; unsigned long flags; bool done; @@ -2144,7 +2142,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) * Wait while there is another request in progress, but not if recovery * is needed. Also indicate whether there is a request waiting to start. 
*/ - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(mq->lock, flags); if (mq->recovery_needed) { *err = -EBUSY; done = true; @@ -2152,7 +2150,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) done = !mq->rw_wait; } mq->waiting = !done; - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(mq->lock, flags); return done; } diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 26ad1f571677..4485cf12218c 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -89,9 +89,9 @@ void mmc_cqe_recovery_notifier(struct mmc_request *mrq) struct mmc_queue *mq = q->queuedata; unsigned long flags; - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(mq->lock, flags); __mmc_cqe_recovery_notifier(mq); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(mq->lock, flags); } static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req) @@ -128,14 +128,14 @@ static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req, unsigned long flags; int ret; - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(mq->lock, flags); if (mq->recovery_needed || !mq->use_cqe) ret = BLK_EH_RESET_TIMER; else ret = mmc_cqe_timed_out(req); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(mq->lock, flags); return ret; } @@ -157,9 +157,9 @@ static void mmc_mq_recovery_handler(struct work_struct *work) mq->in_recovery = false; - spin_lock_irq(q->queue_lock); + spin_lock_irq(mq->lock); mq->recovery_needed = false; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(mq->lock); mmc_put_card(mq->card, &mq->ctx); @@ -258,10 +258,10 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, issue_type = mmc_issue_type(mq, req); - spin_lock_irq(q->queue_lock); + spin_lock_irq(mq->lock); if (mq->recovery_needed || mq->busy) { - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(mq->lock); return BLK_STS_RESOURCE; } @@ -269,7 +269,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, case MMC_ISSUE_DCMD: if (mmc_cqe_dcmd_busy(mq)) { mq->cqe_busy |= MMC_CQE_DCMD_BUSY; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(mq->lock); return BLK_STS_RESOURCE; } break; @@ -294,7 +294,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, get_card = (mmc_tot_in_flight(mq) == 1); cqe_retune_ok = (mmc_cqe_qcnt(mq) == 1); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(mq->lock); if (!(req->rq_flags & RQF_DONTPREP)) { req_to_mmc_queue_req(req)->retries = 0; @@ -328,12 +328,12 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, if (issued != MMC_REQ_STARTED) { bool put_card = false; - spin_lock_irq(q->queue_lock); + spin_lock_irq(mq->lock); mq->in_flight[issue_type] -= 1; if (mmc_tot_in_flight(mq) == 0) put_card = true; mq->busy = false; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(mq->lock); if (put_card) mmc_put_card(card, &mq->ctx); } else { @@ -396,6 +396,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, int ret; mq->card = card; + mq->lock = lock; mq->use_cqe = host->cqe_enabled; memset(&mq->tag_set, 0, sizeof(mq->tag_set)); @@ -426,7 +427,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, goto free_tag_set; } - mq->queue->queue_lock = lock; mq->queue->queuedata = mq; blk_queue_rq_timeout(mq->queue, 60 * HZ); diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h index 29218e12900d..5421f1542e71 100644 --- a/drivers/mmc/core/queue.h +++ b/drivers/mmc/core/queue.h @@ -73,6 +73,7 
@@ struct mmc_queue_req { struct mmc_queue { struct mmc_card *card; + spinlock_t *lock; struct mmc_ctx ctx; struct blk_mq_tag_set tag_set; struct mmc_blk_data *blkdata; -- cgit From 6d46964230d182c4b6097379738849a809d791dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Nov 2018 17:02:18 +0100 Subject: block: remove the lock argument to blk_alloc_queue_node With the legacy request path gone there is no real need to override the queue_lock. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/null_blk_main.c | 3 +-- drivers/block/umem.c | 2 +- drivers/lightnvm/core.c | 2 +- drivers/md/dm.c | 2 +- drivers/nvdimm/pmem.c | 2 +- drivers/nvme/host/multipath.c | 2 +- 7 files changed, 7 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index b66c59ce6260..f973a2a845c8 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2792,7 +2792,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); + q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); if (!q) goto out_no_q; device->rq_queue = q; diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 63c23fcfc4df..62c9654b9ce8 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1659,8 +1659,7 @@ static int null_add_dev(struct nullb_device *dev) } null_init_queues(nullb); } else if (dev->queue_mode == NULL_Q_BIO) { - nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node, - NULL); + nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node); if (!nullb->q) { rv = -ENOMEM; goto out_cleanup_queues; diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 8a27b5adc2b3..aa035cf8a51d 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -888,7 +888,7 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) card->biotail = &card->bio; spin_lock_init(&card->lock); - card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); + card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); if (!card->queue) goto failed_alloc; diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index efb976a863d2..60ab11fcc81c 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -389,7 +389,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) goto err_dev; } - tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node, NULL); + tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); if (!tqueue) { ret = -ENOMEM; goto err_disk; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c510179a7f84..a733e4c920af 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1896,7 +1896,7 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL); + md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); if (!md->queue) goto bad; md->queue->queuedata = md; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 0e39e3d1846f..f7019294740c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -393,7 +393,7 @@ static int pmem_attach_disk(struct device *dev, return -EBUSY; } - q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev), NULL); + q = 
blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev)); if (!q) return -ENOMEM; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5e3cc8c59a39..b82b0d3ca39a 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -276,7 +276,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) return 0; - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); + q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); if (!q) goto out; q->queuedata = head; -- cgit From 0d945c1f966b2bcb67bb12be749da0a7fb00201b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Nov 2018 12:17:28 -0700 Subject: block: remove the queue_lock indirection With the legacy request path gone there is no good reason to keep queue_lock as a pointer, we can always use the embedded lock now. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Fixed floppy and blk-cgroup missing conversions and half done edits. Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 8 ++++---- drivers/block/pktcdvd.c | 4 ++-- drivers/ide/ide-pm.c | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index a8cfa011c284..eeb4be8d000b 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2255,9 +2255,9 @@ static void request_done(int uptodate) DRS->maxtrack = 1; /* unlock chained buffers */ - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&q->queue_lock, flags); floppy_end_request(req, 0); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&q->queue_lock, flags); } else { if (rq_data_dir(req) == WRITE) { /* record write error information */ @@ -2269,9 +2269,9 @@ static void request_done(int uptodate) DRWE->last_error_sector = blk_rq_pos(req); DRWE->last_error_generation = DRS->generation; } - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&q->queue_lock, flags); floppy_end_request(req, BLK_STS_IOERR); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&q->queue_lock, flags); } } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 9381f4e3b221..4adf4c8861cd 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2203,9 +2203,9 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) * Some CDRW drives can not handle writes larger than one packet, * even if the size is a multiple of the packet size. */ - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); blk_queue_max_hw_sectors(q, pd->settings.size); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); set_bit(PACKET_WRITABLE, &pd->flags); } else { pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index a8c53c98252d..51fe10ac02fa 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -44,15 +44,15 @@ static int ide_pm_execute_rq(struct request *rq) { struct request_queue *q = rq->q; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (unlikely(blk_queue_dying(q))) { rq->rq_flags |= RQF_QUIET; scsi_req(rq)->result = -ENXIO; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); blk_mq_end_request(rq, BLK_STS_OK); return -ENXIO; } - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); blk_execute_rq(q, NULL, rq, true); return scsi_req(rq)->result ? 
-EIO : 0; @@ -214,12 +214,12 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) printk("%s: completing PM request, %s\n", drive->name, (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) ? "suspend" : "resume"); #endif - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&q->queue_lock, flags); if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) blk_mq_stop_hw_queues(q); else drive->dev_flags &= ~IDE_DFLAG_BLOCKED; - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&q->queue_lock, flags); drive->hwif->rq = NULL; -- cgit From db29eb059cdc571f9d75cec4a41b9884b3b8286a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Nov 2018 16:05:02 -0700 Subject: nvme: fix handling of EINVAL on pci_alloc_irq_vectors_affinity() At least on SPARC, if MSI/MSI-X isn't supported, we get EINVAL if we ask for more than one vector. This isn't covered by our ENOSPC check. If we get EINVAL, decrease our ask to just one vector, instead of bailing out in error. Fixes: 3b6592f70ad7 ("nvme: utilize two queue maps, one for reads and one for writes") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ffbab5b01df4..41730190d932 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2088,15 +2088,11 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues) affd.nr_sets = 1; /* - * Need IRQs for read+write queues, and one for the admin queue. - * If we can't get more than one vector, we have to share the - * admin queue and IO queue vector. For that case, don't add - * an extra vector for the admin queue, or we'll continue - * asking for 2 and get -ENOSPC in return. + * If we got a failure and we're down to asking for just + * 1 + 1 queues, just ask for a single vector. We'll share + * that between the single IO queue and the admin queue. */ - if (result == -ENOSPC && nr_io_queues == 1) - nr_io_queues = 1; - else + if (!(result < 0 && nr_io_queues == 1)) nr_io_queues = irq_sets[0] + irq_sets[1] + 1; result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues, @@ -2104,13 +2100,19 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues) PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); /* - * Need to reduce our vec counts + * Need to reduce our vec counts. If we get ENOSPC, the + * platform should support multiple vecs, we just need + * to decrease our ask. If we get EINVAL, the platform + * likely does not. Back down to ask for just one vector. */ if (result == -ENOSPC) { nr_io_queues--; if (!nr_io_queues) return result; continue; + } else if (result == -EINVAL) { + nr_io_queues = 1; + continue; } else if (result <= 0) return -EIO; break; -- cgit From 9334ae5e6f9972110c2be136178ca2591c072b62 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Nov 2018 19:42:07 -0700 Subject: ide: clear ide_req()->special for non-passthrough requests The initial patch cleared this for all requests, which is wrong since internal uses can't have this cleared as that's what they are using to pass data. The fix moved the initialization to the mq_ops->initialize_rq_fn(), but that's only a partial fix since it only catches uses from blk_get_request(), not requests coming from the file system. Keep the non-fs initialization, and add the IDE entry clear IFF RQF_DONTPREP isn't set and it's not a passthrough request.
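Expressed as a standalone sketch (the helper name is made up; the real check sits at the top of ide_queue_rq(), see the diff below):

	/* Sketch: fs requests get ->special cleared exactly once;
	 * passthrough requests keep it, since internal users rely on
	 * it to pass data. */
	static void sketch_ide_clear_special(struct request *rq)	/* hypothetical */
	{
		if (blk_rq_is_passthrough(rq))
			return;		/* internal user owns ->special */

		if (!(rq->rq_flags & RQF_DONTPREP)) {
			rq->rq_flags |= RQF_DONTPREP;
			ide_req(rq)->special = NULL;
		}
	}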
Fixes: d16a67667c61 ("ide: don't clear special on ide_queue_rq() entry") Fixes: 22ce0a7ccf23 ("ide: don't use req->special") Reported-by: Guenter Roeck Signed-off-by: Jens Axboe --- drivers/ide/ide-io.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index c0dd0fad16a3..8445b484ae69 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -463,6 +463,11 @@ blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; ide_startstop_t startstop; + if (!blk_rq_is_passthrough(rq) && !(rq->rq_flags & RQF_DONTPREP)) { + rq->rq_flags |= RQF_DONTPREP; + ide_req(rq)->special = NULL; + } + /* HLD do_request() callback might sleep, make sure it's okay */ might_sleep(); -- cgit From dabcefab45d36ecb5a22f16577bb0f298876a22d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Nov 2018 09:38:28 -0700 Subject: nvme: provide optimized poll function for separate poll queues If we have separate poll queues, we know that they aren't using interrupts. Hence we don't need to disable interrupts around finding completions. Provide a separate set of blk_mq_ops for such devices. Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 45 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 41730190d932..89874e23e422 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1082,6 +1082,23 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return __nvme_poll(nvmeq, tag); } +static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx, unsigned int tag) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + u16 start, end; + bool found; + + if (!nvme_cqe_pending(nvmeq)) + return 0; + + spin_lock(&nvmeq->cq_lock); + found = nvme_process_cq(nvmeq, &start, &end, tag); + spin_unlock(&nvmeq->cq_lock); + + nvme_complete_cqes(nvmeq, start, end); + return found; +} + static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); @@ -1584,17 +1601,25 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { .timeout = nvme_timeout, }; +#define NVME_SHARED_MQ_OPS \ + .queue_rq = nvme_queue_rq, \ + .rq_flags_to_type = nvme_rq_flags_to_type, \ + .complete = nvme_pci_complete_rq, \ + .init_hctx = nvme_init_hctx, \ + .init_request = nvme_init_request, \ + .map_queues = nvme_pci_map_queues, \ + .timeout = nvme_timeout \ + static const struct blk_mq_ops nvme_mq_ops = { - .queue_rq = nvme_queue_rq, - .rq_flags_to_type = nvme_rq_flags_to_type, - .complete = nvme_pci_complete_rq, - .init_hctx = nvme_init_hctx, - .init_request = nvme_init_request, - .map_queues = nvme_pci_map_queues, - .timeout = nvme_timeout, + NVME_SHARED_MQ_OPS, .poll = nvme_poll, }; +static const struct blk_mq_ops nvme_mq_poll_noirq_ops = { + NVME_SHARED_MQ_OPS, + .poll = nvme_poll_noirq, +}; + static void nvme_dev_remove_admin(struct nvme_dev *dev) { if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { @@ -2276,7 +2301,11 @@ static int nvme_dev_add(struct nvme_dev *dev) int ret; if (!dev->ctrl.tagset) { - dev->tagset.ops = &nvme_mq_ops; + if (!dev->io_queues[NVMEQ_TYPE_POLL]) + dev->tagset.ops = &nvme_mq_ops; + else + dev->tagset.ops = &nvme_mq_poll_noirq_ops; + dev->tagset.nr_hw_queues = dev->online_queues - 1; dev->tagset.nr_maps = NVMEQ_TYPE_NR; dev->tagset.timeout = NVME_IO_TIMEOUT; -- cgit From 344e9ffcbd1898e1dc04085564a6e05c30ea8199 Mon Sep 17 
00:00:00 2001 From: Jens Axboe Date: Thu, 15 Nov 2018 12:22:51 -0700 Subject: block: add queue_is_mq() helper Various spots check for q->mq_ops being non-NULL, but provide a helper to do this instead. Where the ->mq_ops != NULL check is redundant, remove it. Since mq == rq-based now that legacy is gone, get rid of the queue_is_rq_based() and just use queue_is_mq() everywhere. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/dm-rq.c | 2 +- drivers/md/dm-table.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 7cd36e4d1310..1f1fe9a618ea 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -43,7 +43,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return queue_is_rq_based(md->queue); + return queue_is_mq(md->queue); } void dm_start_queue(struct request_queue *q) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 9038c302d5c2..844f7d0f2ef8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -919,12 +919,12 @@ static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, struct request_queue *q = bdev_get_queue(dev->bdev); struct verify_rq_based_data *v = data; - if (q->mq_ops) + if (queue_is_mq(q)) v->mq_count++; else v->sq_count++; - return queue_is_rq_based(q); + return queue_is_mq(q); } static int dm_table_determine_type(struct dm_table *t) -- cgit From 503f620f0cb8a9da9c2bba72f2141aaa4b0e3962 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Nov 2018 09:10:02 +0100 Subject: floppy: remove queue_lock around floppy_end_request There is nothing the queue_lock could protect inside floppy_end_request, so remove it. Reviewed-by: Omar Sandoval Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index eeb4be8d000b..218099dd8e44 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2254,10 +2254,7 @@ static void request_done(int uptodate) if (block > _floppy->sect) DRS->maxtrack = 1; - /* unlock chained buffers */ - spin_lock_irqsave(&q->queue_lock, flags); floppy_end_request(req, 0); - spin_unlock_irqrestore(&q->queue_lock, flags); } else { if (rq_data_dir(req) == WRITE) { /* record write error information */ @@ -2269,9 +2266,7 @@ static void request_done(int uptodate) DRWE->last_error_sector = blk_rq_pos(req); DRWE->last_error_generation = DRS->generation; } - spin_lock_irqsave(&q->queue_lock, flags); floppy_end_request(req, BLK_STS_IOERR); - spin_unlock_irqrestore(&q->queue_lock, flags); } } -- cgit From a50f9aec1ac713dd25f1b1b443d97aa2fc69f740 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Nov 2018 09:10:03 +0100 Subject: pktcdvd: remove queue_lock around blk_queue_max_hw_sectors blk_queue_max_hw_sectors can't do anything with queue_lock protection so don't hold it. 
Reviewed-by: Omar Sandoval Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 4adf4c8861cd..f5a71023f76c 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2203,9 +2203,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) * Some CDRW drives can not handle writes larger than one packet, * even if the size is a multiple of the packet size. */ - spin_lock_irq(&q->queue_lock); blk_queue_max_hw_sectors(q, pd->settings.size); - spin_unlock_irq(&q->queue_lock); set_bit(PACKET_WRITABLE, &pd->flags); } else { pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); -- cgit From b2101f655f8f6bd510154364e24d1cdba037133f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Nov 2018 09:10:04 +0100 Subject: ide: don't acquire queue lock in ide_pm_execute_rq There is nothing we can synchronize against over a call to blk_queue_dying. Reviewed-by: Omar Sandoval Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/ide/ide-pm.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers') diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 51fe10ac02fa..56690f523100 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -44,15 +44,12 @@ static int ide_pm_execute_rq(struct request *rq) { struct request_queue *q = rq->q; - spin_lock_irq(&q->queue_lock); if (unlikely(blk_queue_dying(q))) { rq->rq_flags |= RQF_QUIET; scsi_req(rq)->result = -ENXIO; - spin_unlock_irq(&q->queue_lock); blk_mq_end_request(rq, BLK_STS_OK); return -ENXIO; } - spin_unlock_irq(&q->queue_lock); blk_execute_rq(q, NULL, rq, true); return scsi_req(rq)->result ? -EIO : 0; -- cgit From f04842734c7a68381ab3cb45628ffb3ff1029490 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Nov 2018 09:10:05 +0100 Subject: ide: don't acquire queue_lock in ide_complete_pm_rq blk_mq_stop_hw_queues doesn't need any locking, and the ide dev_flags field isn't protected by it either. Reviewed-by: Omar Sandoval Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/ide/ide-pm.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers') diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 56690f523100..192e6c65d34e 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -201,7 +201,6 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; struct ide_pm_state *pm = ide_req(rq)->special; - unsigned long flags; ide_complete_power_step(drive, rq); if (pm->pm_step != IDE_PM_COMPLETED) @@ -211,12 +210,10 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) printk("%s: completing PM request, %s\n", drive->name, (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) ? "suspend" : "resume"); #endif - spin_lock_irqsave(&q->queue_lock, flags); if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) blk_mq_stop_hw_queues(q); else drive->dev_flags &= ~IDE_DFLAG_BLOCKED; - spin_unlock_irqrestore(&q->queue_lock, flags); drive->hwif->rq = NULL; -- cgit From f5d72c5c55bc392523cbdcdedd575c280203d31c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Nov 2018 09:10:06 +0100 Subject: mmc: stop abusing the request queue_lock pointer Replace the lock in mmc_blk_data that is only used through a pointer in struct mmc_queue and to protect fields in that structure with an actual lock in struct mmc_queue. 
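The shape of the change, as an illustrative before/after sketch (a subset of the real structure, with invented names so both variants can coexist):

	/* Before: the lock lived in mmc_blk_data and was reached
	 * through a pointer. */
	struct sketch_mmc_queue_old {	/* illustrative subset */
		spinlock_t *lock;	/* points at mmc_blk_data's lock */
	};

	/* After: the lock is embedded and set up in mmc_init_queue()
	 * via spin_lock_init(&mq->lock); no caller-supplied lock. */
	struct sketch_mmc_queue_new {	/* illustrative subset */
		spinlock_t lock;
	};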
Suggested-by: Ulf Hansson Reviewed-by: Ulf Hansson Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/mmc/core/block.c | 24 +++++++++++------------- drivers/mmc/core/queue.c | 31 +++++++++++++++---------------- drivers/mmc/core/queue.h | 4 ++-- 3 files changed, 28 insertions(+), 31 deletions(-) (limited to 'drivers') diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 70ec465beb69..2c329a3e3fdb 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -100,7 +100,6 @@ static DEFINE_IDA(mmc_rpmb_ida); * There is one mmc_blk_data per slot. */ struct mmc_blk_data { - spinlock_t lock; struct device *parent; struct gendisk *disk; struct mmc_queue queue; @@ -1483,7 +1482,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req) blk_mq_end_request(req, BLK_STS_OK); } - spin_lock_irqsave(mq->lock, flags); + spin_lock_irqsave(&mq->lock, flags); mq->in_flight[mmc_issue_type(mq, req)] -= 1; @@ -1491,7 +1490,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req) mmc_cqe_check_busy(mq); - spin_unlock_irqrestore(mq->lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); if (!mq->cqe_busy) blk_mq_run_hw_queues(q, true); @@ -1991,13 +1990,13 @@ static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req) unsigned long flags; bool put_card; - spin_lock_irqsave(mq->lock, flags); + spin_lock_irqsave(&mq->lock, flags); mq->in_flight[mmc_issue_type(mq, req)] -= 1; put_card = (mmc_tot_in_flight(mq) == 0); - spin_unlock_irqrestore(mq->lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); if (put_card) mmc_put_card(mq->card, &mq->ctx); @@ -2093,11 +2092,11 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) * request does not need to wait (although it does need to * complete complete_req first). */ - spin_lock_irqsave(mq->lock, flags); + spin_lock_irqsave(&mq->lock, flags); mq->complete_req = req; mq->rw_wait = false; waiting = mq->waiting; - spin_unlock_irqrestore(mq->lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); /* * If 'waiting' then the waiting task will complete this @@ -2116,10 +2115,10 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) /* Take the recovery path for errors or urgent background operations */ if (mmc_blk_rq_error(&mqrq->brq) || mmc_blk_urgent_bkops_needed(mq, mqrq)) { - spin_lock_irqsave(mq->lock, flags); + spin_lock_irqsave(&mq->lock, flags); mq->recovery_needed = true; mq->recovery_req = req; - spin_unlock_irqrestore(mq->lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); wake_up(&mq->wait); schedule_work(&mq->recovery_work); return; @@ -2142,7 +2141,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) * Wait while there is another request in progress, but not if recovery * is needed. Also indicate whether there is a request waiting to start. 
*/ - spin_lock_irqsave(mq->lock, flags); + spin_lock_irqsave(&mq->lock, flags); if (mq->recovery_needed) { *err = -EBUSY; done = true; @@ -2150,7 +2149,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) done = !mq->rw_wait; } mq->waiting = !done; - spin_unlock_irqrestore(mq->lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); return done; } @@ -2327,12 +2326,11 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, goto err_kfree; } - spin_lock_init(&md->lock); INIT_LIST_HEAD(&md->part); INIT_LIST_HEAD(&md->rpmbs); md->usage = 1; - ret = mmc_init_queue(&md->queue, card, &md->lock); + ret = mmc_init_queue(&md->queue, card); if (ret) goto err_putdisk; diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 4485cf12218c..35cc138b096d 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -89,9 +89,9 @@ void mmc_cqe_recovery_notifier(struct mmc_request *mrq) struct mmc_queue *mq = q->queuedata; unsigned long flags; - spin_lock_irqsave(mq->lock, flags); + spin_lock_irqsave(&mq->lock, flags); __mmc_cqe_recovery_notifier(mq); - spin_unlock_irqrestore(mq->lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); } static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req) @@ -128,14 +128,14 @@ static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req, unsigned long flags; int ret; - spin_lock_irqsave(mq->lock, flags); + spin_lock_irqsave(&mq->lock, flags); if (mq->recovery_needed || !mq->use_cqe) ret = BLK_EH_RESET_TIMER; else ret = mmc_cqe_timed_out(req); - spin_unlock_irqrestore(mq->lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); return ret; } @@ -157,9 +157,9 @@ static void mmc_mq_recovery_handler(struct work_struct *work) mq->in_recovery = false; - spin_lock_irq(mq->lock); + spin_lock_irq(&mq->lock); mq->recovery_needed = false; - spin_unlock_irq(mq->lock); + spin_unlock_irq(&mq->lock); mmc_put_card(mq->card, &mq->ctx); @@ -258,10 +258,10 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, issue_type = mmc_issue_type(mq, req); - spin_lock_irq(mq->lock); + spin_lock_irq(&mq->lock); if (mq->recovery_needed || mq->busy) { - spin_unlock_irq(mq->lock); + spin_unlock_irq(&mq->lock); return BLK_STS_RESOURCE; } @@ -269,7 +269,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, case MMC_ISSUE_DCMD: if (mmc_cqe_dcmd_busy(mq)) { mq->cqe_busy |= MMC_CQE_DCMD_BUSY; - spin_unlock_irq(mq->lock); + spin_unlock_irq(&mq->lock); return BLK_STS_RESOURCE; } break; @@ -294,7 +294,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, get_card = (mmc_tot_in_flight(mq) == 1); cqe_retune_ok = (mmc_cqe_qcnt(mq) == 1); - spin_unlock_irq(mq->lock); + spin_unlock_irq(&mq->lock); if (!(req->rq_flags & RQF_DONTPREP)) { req_to_mmc_queue_req(req)->retries = 0; @@ -328,12 +328,12 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, if (issued != MMC_REQ_STARTED) { bool put_card = false; - spin_lock_irq(mq->lock); + spin_lock_irq(&mq->lock); mq->in_flight[issue_type] -= 1; if (mmc_tot_in_flight(mq) == 0) put_card = true; mq->busy = false; - spin_unlock_irq(mq->lock); + spin_unlock_irq(&mq->lock); if (put_card) mmc_put_card(card, &mq->ctx); } else { @@ -385,19 +385,18 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) * mmc_init_queue - initialise a queue structure. * @mq: mmc queue * @card: mmc card to attach this queue - * @lock: queue lock * * Initialise a MMC card request queue. 
*/ -int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, - spinlock_t *lock) +int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card) { struct mmc_host *host = card->host; int ret; mq->card = card; - mq->lock = lock; mq->use_cqe = host->cqe_enabled; + + spin_lock_init(&mq->lock); memset(&mq->tag_set, 0, sizeof(mq->tag_set)); mq->tag_set.ops = &mmc_mq_ops; diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h index 5421f1542e71..fd11491ced9f 100644 --- a/drivers/mmc/core/queue.h +++ b/drivers/mmc/core/queue.h @@ -73,11 +73,11 @@ struct mmc_queue_req { struct mmc_queue { struct mmc_card *card; - spinlock_t *lock; struct mmc_ctx ctx; struct blk_mq_tag_set tag_set; struct mmc_blk_data *blkdata; struct request_queue *queue; + spinlock_t lock; int in_flight[MMC_ISSUE_MAX]; unsigned int cqe_busy; #define MMC_CQE_DCMD_BUSY BIT(0) @@ -96,7 +96,7 @@ struct mmc_queue { struct work_struct complete_work; }; -extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *); +extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *); extern void mmc_cleanup_queue(struct mmc_queue *); extern void mmc_queue_suspend(struct mmc_queue *); extern void mmc_queue_resume(struct mmc_queue *); -- cgit From fce15a609f8f30cfacfaf684729add9582be780b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 18 Nov 2018 15:42:48 -0700 Subject: floppy: remove now unused 'flags' variable With the locking removed, it's unused. Kill it. Fixes: 503f620f0cb8 ("floppy: remove queue_lock around floppy_end_request") Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 218099dd8e44..cad339ec9c19 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2231,7 +2231,6 @@ static void request_done(int uptodate) { struct request *req = current_req; struct request_queue *q; - unsigned long flags; int block; char msg[sizeof("request done ") + sizeof(int) * 3]; -- cgit From a4668d9ba4be1ca9f4a39798ba3419fdfef0750d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Nov 2018 08:18:24 -0700 Subject: nvme: default to 0 poll queues We need a better way of configuring this, and given that polling is (still) a bit niche, let's default to using 0 poll queues. That way we'll have the same read/write/poll behavior as 4.20, and users that want to test/use polling are required to do manual configuration of the number of poll queues. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 89874e23e422..57e790391b82 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -86,7 +86,7 @@ MODULE_PARM_DESC(write_queues, "Number of queues to use for writes. If not set, reads and writes " "will share a queue set."); -static int poll_queues = 1; +static int poll_queues = 0; module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644); MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); -- cgit From 85f4d4b65fdd67f1d6dc9eeb1d91923cef07eb6a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 6 Nov 2018 13:30:55 -0700 Subject: block: have ->poll_fn() return number of entries polled We currently only really support sync poll, ie poll with 1 IO in flight. This prepares us for supporting async poll. Note that the returned value isn't necessarily 100% accurate. 
If poll races with IRQ completion, we assume that the fact that the task is now runnable means we found at least one entry. In reality it could be more than 1, or not even 1. This is fine, the caller will just need to take this into account. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/multipath.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 8b841f39734c..f9eeb3b58632 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -220,11 +220,11 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, return ret; } -static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) +static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) { struct nvme_ns_head *head = q->queuedata; struct nvme_ns *ns; - bool found = false; + int found = 0; int srcu_idx; srcu_idx = srcu_read_lock(&head->srcu); -- cgit From 92f806d678e5136e4777b21e5ed5368482ac9ea9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Nov 2018 11:37:31 -0700 Subject: nvme-fc: remove ->poll implementation It's specifically looking for a given request, which we will not be supporting going forward. Also kill the qla2xxx poll implementation as that's the only user of the nvme-fc poll, and the now unused ->poll_queue() hook. Reviewed-by: Christoph Hellwig Reviewed-by: James Smart Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 33 --------------------------------- drivers/scsi/qla2xxx/qla_nvme.c | 12 ------------ 2 files changed, 45 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 98c3c77f48f6..de797c641265 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2302,38 +2302,6 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir); } -static struct blk_mq_tags * -nvme_fc_tagset(struct nvme_fc_queue *queue) -{ - if (queue->qnum == 0) - return queue->ctrl->admin_tag_set.tags[queue->qnum]; - - return queue->ctrl->tag_set.tags[queue->qnum - 1]; -} - -static int -nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) - -{ - struct nvme_fc_queue *queue = hctx->driver_data; - struct nvme_fc_ctrl *ctrl = queue->ctrl; - struct request *req; - struct nvme_fc_fcp_op *op; - - req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag); - if (!req) - return 0; - - op = blk_mq_rq_to_pdu(req); - - if ((atomic_read(&op->state) == FCPOP_STATE_ACTIVE) && - (ctrl->lport->ops->poll_queue)) - ctrl->lport->ops->poll_queue(&ctrl->lport->localport, - queue->lldd_handle); - - return ((atomic_read(&op->state) != FCPOP_STATE_ACTIVE)); -} - static void nvme_fc_submit_async_event(struct nvme_ctrl *arg) { @@ -2404,7 +2372,6 @@ static const struct blk_mq_ops nvme_fc_mq_ops = { .init_request = nvme_fc_init_request, .exit_request = nvme_fc_exit_request, .init_hctx = nvme_fc_init_hctx, - .poll = nvme_fc_poll, .timeout = nvme_fc_timeout, }; diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 7e78e7eff783..fccc733145fc 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -272,17 +272,6 @@ static void qla_nvme_fcp_abort(struct nvme_fc_local_port *lport, schedule_work(&priv->abort_work); } -static void qla_nvme_poll(struct nvme_fc_local_port *lport, void *hw_queue_handle) -{ - struct qla_qpair *qpair = hw_queue_handle; - unsigned long flags; - struct scsi_qla_host *vha = lport->private; - 
- spin_lock_irqsave(&qpair->qp_lock, flags); - qla24xx_process_response_queue(vha, qpair->rsp); - spin_unlock_irqrestore(&qpair->qp_lock, flags); -} - static inline int qla2x00_start_nvme_mq(srb_t *sp) { unsigned long flags; @@ -578,7 +567,6 @@ static struct nvme_fc_port_template qla_nvme_fc_transport = { .ls_abort = qla_nvme_ls_abort, .fcp_io = qla_nvme_post_cmd, .fcp_abort = qla_nvme_fcp_abort, - .poll_queue = qla_nvme_poll, .max_hw_queues = 8, .max_sgl_segments = 128, .max_dif_sgl_segments = 64, -- cgit From 1052b8ac5282daf35df331edcbdb645839d17e6a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Nov 2018 08:21:49 -0700 Subject: blk-mq: when polling for IO, look for any completion If we want to support async IO polling, then we have to allow finding completions that aren't just for the one we are looking for. Always pass in -1 to the mq_ops->poll() helper, and have that return how many events were found in this poll loop. Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 14 +++++++------- drivers/nvme/host/rdma.c | 39 +++++++++++++++------------------------ 2 files changed, 22 insertions(+), 31 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 57e790391b82..de50d80ecc84 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1012,15 +1012,15 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) } } -static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, - u16 *end, int tag) +static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, + u16 *end, unsigned int tag) { - bool found = false; + int found = 0; *start = nvmeq->cq_head; - while (!found && nvme_cqe_pending(nvmeq)) { - if (nvmeq->cqes[nvmeq->cq_head].command_id == tag) - found = true; + while (nvme_cqe_pending(nvmeq)) { + if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag) + found++; nvme_update_cq_head(nvmeq); } *end = nvmeq->cq_head; @@ -1062,7 +1062,7 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) { u16 start, end; - bool found; + int found; if (!nvme_cqe_pending(nvmeq)) return 0; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index d181cafedc58..c2c3e1a5b7af 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1409,12 +1409,11 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) WARN_ON_ONCE(ret); } -static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, - struct nvme_completion *cqe, struct ib_wc *wc, int tag) +static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, + struct nvme_completion *cqe, struct ib_wc *wc) { struct request *rq; struct nvme_rdma_request *req; - int ret = 0; rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); if (!rq) { @@ -1422,7 +1421,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, "tag 0x%x on QP %#x not found\n", cqe->command_id, queue->qp->qp_num); nvme_rdma_error_recovery(queue->ctrl); - return ret; + return; } req = blk_mq_rq_to_pdu(rq); @@ -1437,6 +1436,8 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, nvme_rdma_error_recovery(queue->ctrl); } } else if (req->mr) { + int ret; + ret = nvme_rdma_inv_rkey(queue, req); if (unlikely(ret < 0)) { dev_err(queue->ctrl->ctrl.device, @@ -1445,19 +1446,14 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, nvme_rdma_error_recovery(queue->ctrl); } /* the local invalidation completion 
will end the request */ - return 0; + return; } - if (refcount_dec_and_test(&req->ref)) { - if (rq->tag == tag) - ret = 1; + if (refcount_dec_and_test(&req->ref)) nvme_end_request(rq, req->status, req->result); - } - - return ret; } -static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvme_rdma_qe *qe = container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); @@ -1465,11 +1461,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) struct ib_device *ibdev = queue->device->dev; struct nvme_completion *cqe = qe->data; const size_t len = sizeof(struct nvme_completion); - int ret = 0; if (unlikely(wc->status != IB_WC_SUCCESS)) { nvme_rdma_wr_error(cq, wc, "RECV"); - return 0; + return; } ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); @@ -1484,16 +1479,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else - ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag); + nvme_rdma_process_nvme_rsp(queue, cqe, wc); ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE); nvme_rdma_post_recv(queue, qe); - return ret; -} - -static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) -{ - __nvme_rdma_recv_done(cq, wc, -1); } static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue) @@ -1758,10 +1747,12 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) struct ib_cqe *cqe = wc.wr_cqe; if (cqe) { - if (cqe->done == nvme_rdma_recv_done) - found |= __nvme_rdma_recv_done(cq, &wc, tag); - else + if (cqe->done == nvme_rdma_recv_done) { + nvme_rdma_recv_done(cq, &wc); + found++; + } else { cqe->done(cq, &wc); + } } } -- cgit From 9743139c5d11ab170f70a308dcb88c342390adfb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Nov 2018 09:48:21 -0700 Subject: blk-mq: remove 'tag' parameter from mq_ops->poll() We always pass in -1 now and none of the callers use the tag value, remove the parameter. 
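After this patch a poll implementation has the following shape; a hedged driver-side sketch (names invented, completion reaping elided):

	/* Sketch: poll takes only the hardware context and returns the
	 * number of completions it found, 0 if none. */
	static int sketch_poll(struct blk_mq_hw_ctx *hctx)	/* hypothetical */
	{
		int found = 0;

		/* reap completion entries for hctx->driver_data here,
		 * bumping 'found' once per completed request */
		return found;
	}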
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 8 ++++---- drivers/nvme/host/rdma.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index de50d80ecc84..73effe586e5f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1075,14 +1075,14 @@ static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) return found; } -static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +static int nvme_poll(struct blk_mq_hw_ctx *hctx) { struct nvme_queue *nvmeq = hctx->driver_data; - return __nvme_poll(nvmeq, tag); + return __nvme_poll(nvmeq, -1); } -static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx, unsigned int tag) +static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx) { struct nvme_queue *nvmeq = hctx->driver_data; u16 start, end; @@ -1092,7 +1092,7 @@ static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; spin_lock(&nvmeq->cq_lock); - found = nvme_process_cq(nvmeq, &start, &end, tag); + found = nvme_process_cq(nvmeq, &start, &end, -1); spin_unlock(&nvmeq->cq_lock); nvme_complete_cqes(nvmeq, start, end); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index c2c3e1a5b7af..ccfde6c7c0a5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1736,7 +1736,7 @@ err: return BLK_STS_IOERR; } -static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) { struct nvme_rdma_queue *queue = hctx->driver_data; struct ib_cq *cq = queue->ib_cq; -- cgit From e7d943910719b44738e86f91a26a64e3b61ae419 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Nov 2018 08:29:33 -0700 Subject: nvme: remove opportunistic polling from bdev target It doesn't set HIPRI on the bio, so polling for it is pretty silly. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd-bdev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index c1ec3475a140..c1cb2ed5531c 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -115,8 +115,6 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) } cookie = submit_bio(bio); - - blk_poll(bdev_get_queue(req->ns->bdev), cookie); } static void nvmet_bdev_execute_flush(struct nvmet_req *req) -- cgit From 0a1b8b87d064a47fad9ec475316002da28559207 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Nov 2018 08:24:43 -0700 Subject: block: make blk_poll() take a parameter on whether to spin or not blk_poll() has always kept spinning until it found an IO. This is fine for SYNC polling, since we need to find one request we have pending, but in preparation for ASYNC polling it can be beneficial to just check if we have any entries available or not. Existing callers are converted to pass in 'spin == true', to retain the old behavior. 
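A hedged sketch of a ->poll_fn() under the new convention (driver details elided; the signature matches the multipath diff below):

	/* Sketch: spin == false means one quick check for completions;
	 * spin == true means keep polling until something is found.
	 * A real implementation also breaks out on need_resched() or
	 * pending signals rather than looping forever. */
	static int sketch_poll_fn(struct request_queue *q, blk_qc_t cookie,
				  bool spin)
	{
		int found;

		do {
			found = 0;
			/* reap completions for cookie's queue, bump 'found' */
		} while (!found && spin);

		return found;
	}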
Signed-off-by: Jens Axboe --- drivers/nvme/host/multipath.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index f9eeb3b58632..ffebdd0ae34b 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -220,7 +220,7 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, return ret; } -static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) +static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc, bool spin) { struct nvme_ns_head *head = q->queuedata; struct nvme_ns *ns; @@ -230,7 +230,7 @@ static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) srcu_idx = srcu_read_lock(&head->srcu); ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu); if (likely(ns && nvme_path_is_optimized(ns))) - found = ns->queue->poll_fn(q, qc); + found = ns->queue->poll_fn(q, qc, spin); srcu_read_unlock(&head->srcu, srcu_idx); return found; } -- cgit From f1342709d18af97b0e71449d5696b8873d1a456c Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 26 Nov 2018 09:54:29 -0700 Subject: scsi: Do not rely on blk-mq for double completions The scsi timeout error handling had been directly updating the block layer's request state to prevent an error handling path and a natural completion from completing the same request twice. Fix this layering violation by having scsi control the fate of its commands with scsi-owned flags rather than blk-mq's. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/scsi/scsi_error.c | 22 +++++++++++----------- drivers/scsi/scsi_lib.c | 13 ++++++++++++- 2 files changed, 23 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index dd338a8cd275..16eef068e9e9 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -297,19 +297,19 @@ enum blk_eh_timer_return scsi_times_out(struct request *req) if (rtn == BLK_EH_DONE) { /* - * For blk-mq, we must set the request state to complete now - * before sending the request to the scsi error handler. This - * will prevent a use-after-free in the event the LLD manages - * to complete the request before the error handler finishes - * processing this timed out request. + * Set the command to complete first in order to prevent a real + * completion from releasing the command while error handling + * is using it. If the command was already completed, then the + * lower level driver beat the timeout handler, and it is safe + * to return without escalating error recovery. * - * If the request was already completed, then the LLD beat the - * time out handler from transferring the request to the scsi - * error handler. In that case we can return immediately as no - * further action is required. + * If timeout handling lost the race to a real completion, the + * block layer may ignore that due to a fake timeout injection, + * so return RESET_TIMER to allow error handling another shot + * at this command.
*/ - if (!blk_mq_mark_complete(req)) - return rtn; + if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state)) + return BLK_EH_RESET_TIMER; if (scsi_abort_command(scmd) != SUCCESS) { set_host_byte(scmd, DID_TIME_OUT); scsi_eh_scmd_add(scmd); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 0df15cb738d2..0dbf25512778 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1642,8 +1642,18 @@ static blk_status_t scsi_mq_prep_fn(struct request *req) static void scsi_mq_done(struct scsi_cmnd *cmd) { + if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state))) + return; trace_scsi_dispatch_cmd_done(cmd); - blk_mq_complete_request(cmd->request); + + /* + * If the block layer didn't complete the request due to a timeout + * injection, scsi must clear its internal completed state so that the + * timeout handler will see it needs to escalate its own error + * recovery. + */ + if (unlikely(!blk_mq_complete_request(cmd->request))) + clear_bit(SCMD_STATE_COMPLETE, &cmd->state); } static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) @@ -1702,6 +1712,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (!scsi_host_queue_ready(q, shost, sdev)) goto out_dec_target_busy; + clear_bit(SCMD_STATE_COMPLETE, &cmd->state); if (!(req->rq_flags & RQF_DONTPREP)) { ret = scsi_mq_prep_fn(req); if (ret != BLK_STS_OK) -- cgit From a11f6ca9aef989b56cd31ff4ee2af4fb31a172ec Mon Sep 17 00:00:00 2001 From: Young Xiao Date: Wed, 28 Nov 2018 12:36:39 +0000 Subject: sunvdc: Do not spin in an infinite loop when vio_ldc_send() returns EAGAIN __vdc_tx_trigger should only loop on EAGAIN a finite number of times. See commit adddc32d6fde ("sunvnet: Do not spin in an infinite loop when vio_ldc_send() returns EAGAIN") for details. Signed-off-by: Young Xiao Signed-off-by: Jens Axboe --- drivers/block/sunvdc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers') diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 175a64619602..9c0553dd13e7 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -45,6 +45,8 @@ MODULE_VERSION(DRV_MODULE_VERSION); #define WAITING_FOR_GEN_CMD 0x04 #define WAITING_FOR_ANY -1 +#define VDC_MAX_RETRIES 10 + static struct workqueue_struct *sunvdc_wq; struct vdc_req_entry { @@ -431,6 +433,7 @@ static int __vdc_tx_trigger(struct vdc_port *port) .end_idx = dr->prod, }; int err, delay; + int retries = 0; hdr.seq = dr->snd_nxt; delay = 1; @@ -443,6 +446,8 @@ static int __vdc_tx_trigger(struct vdc_port *port) udelay(delay); if ((delay <<= 1) > 128) delay = 128; + if (retries++ > VDC_MAX_RETRIES) + break; } while (err == -EAGAIN); if (err == -ENOTCONN) -- cgit From 49379e6d1e9370d1e5dc09ca52aff29ae07c8ba6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 29 Nov 2018 13:55:19 +0300 Subject: ataflop: fix error handling in atari_floppy_init() Smatch complains that there is an off-by-one if the allocation fails in: DMABuffer = atari_stram_alloc(BUFFER_SIZE+512, "ataflop"); In that situation, "i" would point to one element beyond the end of the unit[] array. There is a second bug because the error handling calls blk_mq_free_tag_set(&unit[i].tag_set); regardless of whether "disk->queue" is NULL or non-NULL. So if blk_mq_init_sq_queue() fails, then that means unit[i].tag_set->tags is NULL and it leads to an Oops. It's easiest to call put_disk() before the goto to clean up the partial iteration.
Then the earlier unit[] elements are fully allocated so we can remove the checks for whether "disk->queue" is NULL, and the code is simpler. Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- drivers/block/ataflop.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index f88b4c26d422..313064b1005c 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1982,6 +1982,7 @@ static int __init atari_floppy_init (void) &ataflop_mq_ops, 2, BLK_MQ_F_SHOULD_MERGE); if (IS_ERR(unit[i].disk->queue)) { + put_disk(unit[i].disk); ret = PTR_ERR(unit[i].disk->queue); unit[i].disk->queue = NULL; goto err; } @@ -2033,18 +2034,13 @@ static int __init atari_floppy_init (void) return 0; err: - do { + while (--i >= 0) { struct gendisk *disk = unit[i].disk; - if (disk) { - if (disk->queue) { - blk_cleanup_queue(disk->queue); - disk->queue = NULL; - } - blk_mq_free_tag_set(&unit[i].tag_set); - put_disk(unit[i].disk); - } - } while (i--); + blk_cleanup_queue(disk->queue); + blk_mq_free_tag_set(&unit[i].tag_set); + put_disk(unit[i].disk); + } unregister_blkdev(FLOPPY_MAJOR, "fd"); return ret; -- cgit From 04f3eafda6e05adc56afed4d3ae6e24aaa429058 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Nov 2018 10:02:29 -0700 Subject: nvme: implement mq_ops->commit_rqs() hook Split the command submission and the SQ doorbell ring, and add the doorbell ring as our ->commit_rqs() hook. This allows a list of requests to be issued, with nvme only writing the SQ update when it's necessary. This is more efficient if we have lists of requests to issue, particularly on virtualized hardware, where writing the SQ doorbell is more expensive than on real hardware. For those cases, performance increases of 2-3x have been observed. The use case for this is plugged IO, where blk-mq flushes a batch of requests at a time. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 47 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 73effe586e5f..527907aa6903 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -203,6 +203,7 @@ struct nvme_queue { u16 q_depth; s16 cq_vector; u16 sq_tail; + u16 last_sq_tail; u16 cq_head; u16 last_cq_head; u16 qid; @@ -522,22 +523,50 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) return 0; } +/* + * Write sq tail if we are asked to, or if the next command would wrap.
+ */ +static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq) +{ + if (!write_sq) { + u16 next_tail = nvmeq->sq_tail + 1; + + if (next_tail == nvmeq->q_depth) + next_tail = 0; + if (next_tail != nvmeq->last_sq_tail) + return; + } + + if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, + nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) + writel(nvmeq->sq_tail, nvmeq->q_db); + nvmeq->last_sq_tail = nvmeq->sq_tail; +} + /** * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use * @cmd: The command to send + * @write_sq: whether to write to the SQ doorbell */ -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, + bool write_sq) { spin_lock(&nvmeq->sq_lock); - memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); - if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; - if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, - nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) - writel(nvmeq->sq_tail, nvmeq->q_db); + nvme_write_sq_db(nvmeq, write_sq); + spin_unlock(&nvmeq->sq_lock); +} + +static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + spin_lock(&nvmeq->sq_lock); + if (nvmeq->sq_tail != nvmeq->last_sq_tail) + nvme_write_sq_db(nvmeq, true); spin_unlock(&nvmeq->sq_lock); } @@ -923,7 +952,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } blk_mq_start_request(req); - nvme_submit_cmd(nvmeq, &cmnd); + nvme_submit_cmd(nvmeq, &cmnd, bd->last); return BLK_STS_OK; out_cleanup_iod: nvme_free_iod(dev, req); @@ -1108,7 +1137,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; - nvme_submit_cmd(nvmeq, &c); + nvme_submit_cmd(nvmeq, &c, true); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1531,6 +1560,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) spin_lock_irq(&nvmeq->cq_lock); nvmeq->sq_tail = 0; + nvmeq->last_sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; @@ -1603,6 +1633,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { #define NVME_SHARED_MQ_OPS \ .queue_rq = nvme_queue_rq, \ + .commit_rqs = nvme_commit_rqs, \ .rq_flags_to_type = nvme_rq_flags_to_type, \ .complete = nvme_pci_complete_rq, \ .init_hctx = nvme_init_hctx, \ -- cgit From 944e7c87967c820a0f34a935b1f2799944099750 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Nov 2018 11:00:12 -0700 Subject: virtio_blk: implement mq_ops->commit_rqs() hook We need this for blk-mq to kick things into gear, if we told it that we had more IO coming, but then failed to deliver on that promise. Reviewed-by: Omar Sandoval Acked-by: Michael S. 
Tsirkin Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'drivers') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6e869d05f91e..912c4265e592 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -214,6 +214,20 @@ static void virtblk_done(struct virtqueue *vq) spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } +static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct virtio_blk *vblk = hctx->queue->queuedata; + struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num]; + bool kick; + + spin_lock_irq(&vq->lock); + kick = virtqueue_kick_prepare(vq->vq); + spin_unlock_irq(&vq->lock); + + if (kick) + virtqueue_notify(vq->vq); +} + static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -638,6 +652,7 @@ static void virtblk_initialize_rq(struct request *req) static const struct blk_mq_ops virtio_mq_ops = { .queue_rq = virtio_queue_rq, + .commit_rqs = virtio_commit_rqs, .complete = virtblk_request_done, .init_request = virtblk_init_request, #ifdef CONFIG_VIRTIO_BLK_SCSI -- cgit From 80ff2040ac3dd6d9d68d59159b0a6dffd8adc5ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Nov 2018 06:09:07 -0700 Subject: ataflop: implement mq_ops->commit_rqs() hook We need this for blk-mq to kick things into gear, if we told it that we had more IO coming, but then failed to deliver on that promise. Reviewed-by: Omar Sandoval Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/ataflop.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers') diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 313064b1005c..b0dbbdfeb33e 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1471,6 +1471,15 @@ static void setup_req_params( int drive ) ReqTrack, ReqSector, (unsigned long)ReqData )); } +static void ataflop_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + spin_lock_irq(&ataflop_lock); + atari_disable_irq(IRQ_MFP_FDC); + finish_fdc(); + atari_enable_irq(IRQ_MFP_FDC); + spin_unlock_irq(&ataflop_lock); +} + static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1947,6 +1956,7 @@ static const struct block_device_operations floppy_fops = { static const struct blk_mq_ops ataflop_mq_ops = { .queue_rq = ataflop_queue_rq, + .commit_rqs = ataflop_commit_rqs, }; static struct kobject *floppy_find(dev_t dev, int *part, void *data) -- cgit From 5d2ee7122c73be6a3b6bfe90d237e8aed737cfaa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Nov 2018 17:36:41 -0700 Subject: sbitmap: optimize wakeup check Even if we have no waiters on any of the sbitmap_queue wait states, we still have to loop every entry to check. We do this for every IO, so the cost adds up. Shift a bit of the cost to the slow path, when we actually have waiters. Wrap prepare_to_wait_exclusive() and finish_wait(), so we can maintain an internal count of how many are currently active. Then we can simply check this count in sbq_wake_ptr() and not have to loop if we don't have any sleepers. Convert the two users of sbitmap with waiting, blk-mq-tag and iSCSI. 
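The waiter-count idea can be sketched outside the kernel as well. The following pthreads analogue (tag_pool, active_waiters and the helpers are invented names, and it only approximates what sbitmap_prepare_to_wait()/sbitmap_finish_wait() do internally) shows how the slow path pays for the bookkeeping so the per-IO fast path can skip the wakeup with a single load:

#include <pthread.h>
#include <stdatomic.h>

struct tag_pool {
        int free_tags;                  /* protected by lock */
        atomic_int active_waiters;      /* sleepers, readable without the lock */
        pthread_mutex_t lock;
        pthread_cond_t cond;
};

/* Slow path: pays for the waiter bookkeeping around the actual sleep. */
static int tag_pool_get(struct tag_pool *p)
{
        pthread_mutex_lock(&p->lock);
        while (p->free_tags == 0) {
                atomic_fetch_add(&p->active_waiters, 1);
                pthread_cond_wait(&p->cond, &p->lock);
                atomic_fetch_sub(&p->active_waiters, 1);
        }
        p->free_tags--;
        pthread_mutex_unlock(&p->lock);
        return 0;
}

/* Fast path: one load lets the per-IO free skip the wakeup entirely. */
static void tag_pool_put(struct tag_pool *p)
{
        pthread_mutex_lock(&p->lock);
        p->free_tags++;
        pthread_mutex_unlock(&p->lock);

        if (atomic_load(&p->active_waiters) == 0)
                return;                 /* no sleepers: nothing to wake */
        pthread_cond_signal(&p->cond);
}

static struct tag_pool pool = {
        .free_tags = 1,
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .cond = PTHREAD_COND_INITIALIZER,
};

int main(void)
{
        tag_pool_get(&pool);    /* takes the single tag, no sleep */
        tag_pool_put(&pool);    /* active_waiters == 0: wakeup skipped */
        return 0;
}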
Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/target/iscsi/iscsi_target_util.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c index 36b742932c72..86987da86dd6 100644 --- a/drivers/target/iscsi/iscsi_target_util.c +++ b/drivers/target/iscsi/iscsi_target_util.c @@ -150,24 +150,26 @@ void iscsit_free_r2ts_from_list(struct iscsi_cmd *cmd) static int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup) { int tag = -1; - DEFINE_WAIT(wait); + DEFINE_SBQ_WAIT(wait); struct sbq_wait_state *ws; + struct sbitmap_queue *sbq; if (state == TASK_RUNNING) return tag; - ws = &se_sess->sess_tag_pool.ws[0]; + sbq = &se_sess->sess_tag_pool; + ws = &sbq->ws[0]; for (;;) { - prepare_to_wait_exclusive(&ws->wait, &wait, state); + sbitmap_prepare_to_wait(sbq, ws, &wait, state); if (signal_pending_state(state, current)) break; - tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup); + tag = sbitmap_queue_get(sbq, cpup); if (tag >= 0) break; schedule(); } - finish_wait(&ws->wait, &wait); + sbitmap_finish_wait(sbq, ws, &wait); return tag; } -- cgit From e20ba6e1da029136ded295f33076483d65ddf50a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Dec 2018 17:46:16 +0100 Subject: block: move queue types to the block layer Having another indirect call in the fast path doesn't really help in our post-spectre world. Also having too many queue types is just going to create confusion, so I'd rather manage them centrally. Note that the queue type naming and ordering changes a bit - the first index now is the default queue for everything not explicitly marked, the optional ones are read and poll queues. Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 68 ++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 43 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 527907aa6903..a1bb4bb92e7f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -95,13 +95,6 @@ struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); -enum { - NVMEQ_TYPE_READ, - NVMEQ_TYPE_WRITE, - NVMEQ_TYPE_POLL, - NVMEQ_TYPE_NR, -}; - /* * Represents an NVM Express device. Each nvme_dev is a PCI function.
*/ @@ -115,7 +108,7 @@ struct nvme_dev { struct dma_pool *prp_small_pool; unsigned online_queues; unsigned max_qid; - unsigned io_queues[NVMEQ_TYPE_NR]; + unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; int q_depth; u32 db_stride; @@ -499,10 +492,10 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) map->nr_queues = dev->io_queues[i]; if (!map->nr_queues) { - BUG_ON(i == NVMEQ_TYPE_READ); + BUG_ON(i == HCTX_TYPE_DEFAULT); /* shared set, resuse read set parameters */ - map->nr_queues = dev->io_queues[NVMEQ_TYPE_READ]; + map->nr_queues = dev->io_queues[HCTX_TYPE_DEFAULT]; qoff = 0; offset = queue_irq_offset(dev); } @@ -512,7 +505,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) * affinity), so use the regular blk-mq cpu mapping */ map->queue_offset = qoff; - if (i != NVMEQ_TYPE_POLL) + if (i != HCTX_TYPE_POLL) blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); else blk_mq_map_queues(map); @@ -961,16 +954,6 @@ out_free_cmd: return ret; } -static int nvme_rq_flags_to_type(struct request_queue *q, unsigned int flags) -{ - if ((flags & REQ_HIPRI) && test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return NVMEQ_TYPE_POLL; - if ((flags & REQ_OP_MASK) == REQ_OP_READ) - return NVMEQ_TYPE_READ; - - return NVMEQ_TYPE_WRITE; -} - static void nvme_pci_complete_rq(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -1634,7 +1617,6 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { #define NVME_SHARED_MQ_OPS \ .queue_rq = nvme_queue_rq, \ .commit_rqs = nvme_commit_rqs, \ - .rq_flags_to_type = nvme_rq_flags_to_type, \ .complete = nvme_pci_complete_rq, \ .init_hctx = nvme_init_hctx, \ .init_request = nvme_init_request, \ @@ -1785,9 +1767,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev) } max = min(dev->max_qid, dev->ctrl.queue_count - 1); - if (max != 1 && dev->io_queues[NVMEQ_TYPE_POLL]) { - rw_queues = dev->io_queues[NVMEQ_TYPE_READ] + - dev->io_queues[NVMEQ_TYPE_WRITE]; + if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { + rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + + dev->io_queues[HCTX_TYPE_READ]; } else { rw_queues = max; } @@ -2076,9 +2058,9 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues) * Setup read/write queue split */ if (nr_io_queues == 1) { - dev->io_queues[NVMEQ_TYPE_READ] = 1; - dev->io_queues[NVMEQ_TYPE_WRITE] = 0; - dev->io_queues[NVMEQ_TYPE_POLL] = 0; + dev->io_queues[HCTX_TYPE_DEFAULT] = 1; + dev->io_queues[HCTX_TYPE_READ] = 0; + dev->io_queues[HCTX_TYPE_POLL] = 0; return; } @@ -2095,10 +2077,10 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues) this_p_queues = nr_io_queues - 1; } - dev->io_queues[NVMEQ_TYPE_POLL] = this_p_queues; + dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; nr_io_queues -= this_p_queues; } else - dev->io_queues[NVMEQ_TYPE_POLL] = 0; + dev->io_queues[HCTX_TYPE_POLL] = 0; /* * If 'write_queues' is set, ensure it leaves room for at least @@ -2112,11 +2094,11 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues) * a queue set. 
*/ if (!this_w_queues) { - dev->io_queues[NVMEQ_TYPE_WRITE] = 0; - dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues; + dev->io_queues[HCTX_TYPE_DEFAULT] = nr_io_queues; + dev->io_queues[HCTX_TYPE_READ] = 0; } else { - dev->io_queues[NVMEQ_TYPE_WRITE] = this_w_queues; - dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues - this_w_queues; + dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues; + dev->io_queues[HCTX_TYPE_READ] = nr_io_queues - this_w_queues; } } @@ -2138,8 +2120,8 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues) */ do { nvme_calc_io_queues(dev, nr_io_queues); - irq_sets[0] = dev->io_queues[NVMEQ_TYPE_READ]; - irq_sets[1] = dev->io_queues[NVMEQ_TYPE_WRITE]; + irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT]; + irq_sets[1] = dev->io_queues[HCTX_TYPE_READ]; if (!irq_sets[1]) affd.nr_sets = 1; @@ -2226,12 +2208,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->num_vecs = result; result = max(result - 1, 1); - dev->max_qid = result + dev->io_queues[NVMEQ_TYPE_POLL]; + dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; - dev_info(dev->ctrl.device, "%d/%d/%d read/write/poll queues\n", - dev->io_queues[NVMEQ_TYPE_READ], - dev->io_queues[NVMEQ_TYPE_WRITE], - dev->io_queues[NVMEQ_TYPE_POLL]); + dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", + dev->io_queues[HCTX_TYPE_DEFAULT], + dev->io_queues[HCTX_TYPE_READ], + dev->io_queues[HCTX_TYPE_POLL]); /* * Should investigate if there's a performance win from allocating @@ -2332,13 +2314,13 @@ static int nvme_dev_add(struct nvme_dev *dev) int ret; if (!dev->ctrl.tagset) { - if (!dev->io_queues[NVMEQ_TYPE_POLL]) + if (!dev->io_queues[HCTX_TYPE_POLL]) dev->tagset.ops = &nvme_mq_ops; else dev->tagset.ops = &nvme_mq_poll_noirq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; - dev->tagset.nr_maps = NVMEQ_TYPE_NR; + dev->tagset.nr_maps = HCTX_MAX_TYPES; dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.numa_node = dev_to_node(dev->dev); dev->tagset.queue_depth = -- cgit From 4e224106673f1e8679249a9cac75712b896861b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Dec 2018 17:46:17 +0100 Subject: nvme-pci: use atomic bitops to mark a queue enabled This gets rid of all the messing with cq_vector and the ->polled field by using an atomic bitop to mark the queue enabled or not. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 43 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a1bb4bb92e7f..022395a319f4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -201,7 +201,8 @@ struct nvme_queue { u16 last_cq_head; u16 qid; u8 cq_phase; - u8 polled; + unsigned long flags; +#define NVMEQ_ENABLED 0 u32 *dbbuf_sq_db; u32 *dbbuf_cq_db; u32 *dbbuf_sq_ei; @@ -927,7 +928,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, * We should not need to do this, but we're still using this to * ensure we can drain requests on a dying queue. 
*/ - if (unlikely(nvmeq->cq_vector < 0 && !nvmeq->polled)) + if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR; ret = nvme_setup_cmd(ns, req, &cmnd); @@ -1395,31 +1396,19 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) */ static int nvme_suspend_queue(struct nvme_queue *nvmeq) { - int vector; - - spin_lock_irq(&nvmeq->cq_lock); - if (nvmeq->cq_vector == -1 && !nvmeq->polled) { - spin_unlock_irq(&nvmeq->cq_lock); + if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags)) return 1; - } - vector = nvmeq->cq_vector; - nvmeq->dev->online_queues--; - nvmeq->cq_vector = -1; - nvmeq->polled = false; - spin_unlock_irq(&nvmeq->cq_lock); - /* - * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without - * having to grab the lock. - */ + /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */ mb(); + nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); - - if (vector != -1) - pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); - + if (nvmeq->cq_vector == -1) + return 0; + pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); + nvmeq->cq_vector = -1; return 0; } @@ -1578,13 +1567,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) else if (result) goto release_cq; - /* - * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will - * invoke free_irq for it and cause a 'Trying to free already-free IRQ - * xxx' warning if the create CQ/SQ command times out. - */ nvmeq->cq_vector = vector; - nvmeq->polled = polled; nvme_init_queue(nvmeq, qid); if (vector != -1) { @@ -1593,11 +1576,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) goto release_sq; } + set_bit(NVMEQ_ENABLED, &nvmeq->flags); return result; release_sq: nvmeq->cq_vector = -1; - nvmeq->polled = false; dev->online_queues--; adapter_delete_sq(dev, qid); release_cq: @@ -1751,6 +1734,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) return result; } + set_bit(NVMEQ_ENABLED, &nvmeq->flags); return result; } @@ -2173,6 +2157,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (nr_io_queues == 0) return 0; + + clear_bit(NVMEQ_ENABLED, &adminq->flags); if (dev->cmb_use_sqes) { result = nvme_cmb_qdepth(dev, nr_io_queues, @@ -2227,6 +2213,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) adminq->cq_vector = -1; return result; } + set_bit(NVMEQ_ENABLED, &adminq->flags); return nvme_create_io_queues(dev); } -- cgit From 6322307809649cba6f545640563f95d686ecf404 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Dec 2018 17:46:18 +0100 Subject: nvme-pci: cleanup SQ allocation a bit Use a bit flag to mark if the SQ was allocated from the CMB, and clean up the surrounding code a bit. 
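As a sketch of the pattern only (not the nvme code itself), the bool-to-flag-bit cleanup looks like this; Q_SQ_SPECIAL and the two malloc() calls stand in for NVMEQ_SQ_CMB and the pci_alloc_p2pmem()/dma_alloc_coherent() pair, and both free paths coincide in this toy version:

#include <stdlib.h>

#define Q_SQ_SPECIAL    (1UL << 0)      /* stands in for NVMEQ_SQ_CMB */

struct queue {
        void *sq_cmds;
        unsigned long flags;
};

/* Try the special pool first; remember in a flag bit which pool won. */
static int queue_alloc_sq(struct queue *q, int want_special)
{
        if (want_special) {
                q->sq_cmds = malloc(64);        /* pretend: CMB/p2p allocation */
                if (q->sq_cmds) {
                        q->flags |= Q_SQ_SPECIAL;
                        return 0;
                }
        }
        q->sq_cmds = malloc(64);                /* fallback: coherent DMA pool */
        return q->sq_cmds ? 0 : -1;
}

/* Teardown consults the flag instead of a separate bool field. */
static void queue_free_sq(struct queue *q)
{
        if (!q->sq_cmds)
                return;
        if (q->flags & Q_SQ_SPECIAL)
                free(q->sq_cmds);       /* would be pci_free_p2pmem() */
        else
                free(q->sq_cmds);       /* would be dma_free_coherent() */
        q->sq_cmds = NULL;
}

int main(void)
{
        struct queue q = { 0 };

        if (queue_alloc_sq(&q, 1) == 0)
                queue_free_sq(&q);
        return 0;
}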
Reviewed-by: Sagi Grimberg Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 022395a319f4..b820dd0351cb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -186,7 +186,6 @@ struct nvme_queue { struct nvme_dev *dev; spinlock_t sq_lock; struct nvme_command *sq_cmds; - bool sq_cmds_is_io; spinlock_t cq_lock ____cacheline_aligned_in_smp; volatile struct nvme_completion *cqes; struct blk_mq_tags **tags; @@ -203,6 +202,7 @@ struct nvme_queue { u8 cq_phase; unsigned long flags; #define NVMEQ_ENABLED 0 +#define NVMEQ_SQ_CMB 1 u32 *dbbuf_sq_db; u32 *dbbuf_cq_db; u32 *dbbuf_sq_ei; @@ -1366,17 +1366,15 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) { dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + if (!nvmeq->sq_cmds) + return; - if (nvmeq->sq_cmds) { - if (nvmeq->sq_cmds_is_io) - pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev), - nvmeq->sq_cmds, - SQ_SIZE(nvmeq->q_depth)); - else - dma_free_coherent(nvmeq->q_dmadev, - SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, - nvmeq->sq_dma_addr); + if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) { + pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev), + nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth)); + } else { + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); } } @@ -1462,15 +1460,14 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, nvmeq->sq_cmds); - nvmeq->sq_cmds_is_io = true; - } - - if (!nvmeq->sq_cmds) { - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), - &nvmeq->sq_dma_addr, GFP_KERNEL); - nvmeq->sq_cmds_is_io = false; + if (nvmeq->sq_dma_addr) { + set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); + return 0; + } } + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); if (!nvmeq->sq_cmds) return -ENOMEM; return 0; -- cgit From c6d962aebaf8ec5d867aac09ee33e3f528c2539d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Dec 2018 17:46:19 +0100 Subject: nvme-pci: only allow polling with separate poll queues This will allow us to simplify both the regular NVMe interrupt handler and the upcoming aio poll code. In addition to that the separate queues are generally a good idea for performance reasons. 
Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b820dd0351cb..d42bb76e5e78 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1089,13 +1089,6 @@ static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) } static int nvme_poll(struct blk_mq_hw_ctx *hctx) -{ - struct nvme_queue *nvmeq = hctx->driver_data; - - return __nvme_poll(nvmeq, -1); -} - -static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx) { struct nvme_queue *nvmeq = hctx->driver_data; u16 start, end; @@ -1605,12 +1598,11 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { static const struct blk_mq_ops nvme_mq_ops = { NVME_SHARED_MQ_OPS, - .poll = nvme_poll, }; -static const struct blk_mq_ops nvme_mq_poll_noirq_ops = { +static const struct blk_mq_ops nvme_mq_poll_ops = { NVME_SHARED_MQ_OPS, - .poll = nvme_poll_noirq, + .poll = nvme_poll, }; static void nvme_dev_remove_admin(struct nvme_dev *dev) @@ -2298,10 +2290,10 @@ static int nvme_dev_add(struct nvme_dev *dev) int ret; if (!dev->ctrl.tagset) { - if (!dev->io_queues[HCTX_TYPE_POLL]) - dev->tagset.ops = &nvme_mq_ops; + if (dev->io_queues[HCTX_TYPE_POLL]) + dev->tagset.ops = &nvme_mq_poll_ops; else - dev->tagset.ops = &nvme_mq_poll_noirq_ops; + dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; dev->tagset.nr_maps = HCTX_MAX_TYPES; -- cgit From 0b2a8a9f4b564c7d923597828d93cd1f69ce40e0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Dec 2018 17:46:20 +0100 Subject: nvme-pci: consolidate code for polling non-dedicated queues We have three places that can poll for I/O completions on a normal interrupt-enabled queue. All of them are in slow path code, so consolidate them to a single helper that uses spin_lock_irqsave and removes the fast path cqe_pending check. Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d42bb76e5e78..10c26a2e355a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1072,17 +1072,19 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_NONE; } -static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) +/* + * Poll for completions any queue, including those not dedicated to polling. + * Can be called from any context. + */ +static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) { + unsigned long flags; u16 start, end; int found; - if (!nvme_cqe_pending(nvmeq)) - return 0; - - spin_lock_irq(&nvmeq->cq_lock); + spin_lock_irqsave(&nvmeq->cq_lock, flags); found = nvme_process_cq(nvmeq, &start, &end, tag); - spin_unlock_irq(&nvmeq->cq_lock); + spin_unlock_irqrestore(&nvmeq->cq_lock, flags); nvme_complete_cqes(nvmeq, start, end); return found; @@ -1279,7 +1281,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) /* * Did we miss an interrupt? 
*/ - if (__nvme_poll(nvmeq, req->tag)) { + if (nvme_poll_irqdisable(nvmeq, req->tag)) { dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, completion polled\n", req->tag, nvmeq->qid); @@ -1406,18 +1408,13 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { struct nvme_queue *nvmeq = &dev->queues[0]; - u16 start, end; if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); else nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); - spin_lock_irq(&nvmeq->cq_lock); - nvme_process_cq(nvmeq, &start, &end, -1); - spin_unlock_irq(&nvmeq->cq_lock); - - nvme_complete_cqes(nvmeq, start, end); + nvme_poll_irqdisable(nvmeq, -1); } static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, @@ -2217,17 +2214,9 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error) static void nvme_del_cq_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; - u16 start, end; - if (!error) { - unsigned long flags; - - spin_lock_irqsave(&nvmeq->cq_lock, flags); - nvme_process_cq(nvmeq, &start, &end, -1); - spin_unlock_irqrestore(&nvmeq->cq_lock, flags); - - nvme_complete_cqes(nvmeq, start, end); - } + if (!error) + nvme_poll_irqdisable(nvmeq, -1); nvme_del_queue_end(req, error); } -- cgit From 5271edd41dd895773d3105f5065bbf5ded0dee7c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Dec 2018 17:46:21 +0100 Subject: nvme-pci: refactor nvme_disable_io_queues Pass the opcode for the delete SQ/CQ command as an argument instead of the somewhat confusing pass loop. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 10c26a2e355a..9ceba9900ca3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2244,31 +2244,29 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) return 0; } -static void nvme_disable_io_queues(struct nvme_dev *dev) +static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) { - int pass, queues = dev->online_queues - 1; + int nr_queues = dev->online_queues - 1, sent = 0; unsigned long timeout; - u8 opcode = nvme_admin_delete_sq; - for (pass = 0; pass < 2; pass++) { - int sent = 0, i = queues; - - reinit_completion(&dev->ioq_wait); + reinit_completion(&dev->ioq_wait); retry: - timeout = ADMIN_TIMEOUT; - for (; i > 0; i--, sent++) - if (nvme_delete_queue(&dev->queues[i], opcode)) - break; - - while (sent--) { - timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); - if (timeout == 0) - return; - if (i) - goto retry; - } - opcode = nvme_admin_delete_cq; + timeout = ADMIN_TIMEOUT; + while (nr_queues > 0) { + if (nvme_delete_queue(&dev->queues[nr_queues], opcode)) + break; + nr_queues--; + sent++; } + while (sent--) { + timeout = wait_for_completion_io_timeout(&dev->ioq_wait, + timeout); + if (timeout == 0) + return false; + if (nr_queues) + goto retry; + } + return true; } /* @@ -2428,7 +2426,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_stop_queues(&dev->ctrl); if (!dead && dev->ctrl.queue_count > 0) { - nvme_disable_io_queues(dev); + if (nvme_disable_io_queues(dev, nvme_admin_delete_sq)) + nvme_disable_io_queues(dev, nvme_admin_delete_cq); nvme_disable_admin_queue(dev, shutdown); } for (i = dev->ctrl.queue_count - 1; i >= 0; i--) -- cgit From 
d1ed6aa14bc418531220478604c7b12c5e98fdca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Dec 2018 17:46:22 +0100 Subject: nvme-pci: don't poll from irq context when deleting queues This is the last place outside of nvme_irq that handles CQEs from interrupt context, and thus is in the way of removing the cq_lock for normal queues, and avoiding lockdep warnings on the poll queues, for which we already take it without IRQ disabling. Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9ceba9900ca3..2d5a468c35b1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -122,7 +122,6 @@ struct nvme_dev { u32 cmbsz; u32 cmbloc; struct nvme_ctrl ctrl; - struct completion ioq_wait; mempool_t *iod_mempool; @@ -203,10 +202,12 @@ struct nvme_queue { unsigned long flags; #define NVMEQ_ENABLED 0 #define NVMEQ_SQ_CMB 1 +#define NVMEQ_DELETE_ERROR 2 u32 *dbbuf_sq_db; u32 *dbbuf_cq_db; u32 *dbbuf_sq_ei; u32 *dbbuf_cq_ei; + struct completion delete_done; }; /* @@ -1535,6 +1536,8 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) int result; s16 vector; + clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); + /* * A queue's vector matches the queue identifier unless the controller * has only one vector available. @@ -2208,15 +2211,15 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error) struct nvme_queue *nvmeq = req->end_io_data; blk_mq_free_request(req); - complete(&nvmeq->dev->ioq_wait); + complete(&nvmeq->delete_done); } static void nvme_del_cq_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; - if (!error) - nvme_poll_irqdisable(nvmeq, -1); + if (error) + set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); nvme_del_queue_end(req, error); } @@ -2238,6 +2241,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) req->timeout = ADMIN_TIMEOUT; req->end_io_data = nvmeq; + init_completion(&nvmeq->delete_done); blk_execute_rq_nowait(q, NULL, req, false, opcode == nvme_admin_delete_cq ? 
nvme_del_cq_end : nvme_del_queue_end); @@ -2249,7 +2253,6 @@ static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) int nr_queues = dev->online_queues - 1, sent = 0; unsigned long timeout; - reinit_completion(&dev->ioq_wait); retry: timeout = ADMIN_TIMEOUT; while (nr_queues > 0) { @@ -2258,11 +2261,20 @@ static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) nr_queues--; sent++; } - while (sent--) { - timeout = wait_for_completion_io_timeout(&dev->ioq_wait, + while (sent) { + struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent]; + + timeout = wait_for_completion_io_timeout(&nvmeq->delete_done, timeout); if (timeout == 0) return false; + + /* handle any remaining CQEs */ + if (opcode == nvme_admin_delete_cq && + !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags)) + nvme_poll_irqdisable(nvmeq, -1); + + sent--; if (nr_queues) goto retry; } @@ -2746,7 +2758,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); mutex_init(&dev->shutdown_lock); - init_completion(&dev->ioq_wait); result = nvme_setup_prp_pools(dev); if (result) -- cgit From 3a7afd8ee42a68d4f24ab9c947a4ef82d4d52375 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Dec 2018 17:46:23 +0100 Subject: nvme-pci: remove the CQ lock for interrupt driven queues Now that we can't poll regular, interrupt driven I/O queues there is almost nothing that can race with an interrupt. The only possible other contexts polling a CQ are the error handler and queue shutdown, and both are so far off in the slow path that we can simply use the big hammer of disabling interrupts. With that we can stop taking the cq_lock for normal queues. Reviewed-by: Sagi Grimberg Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2d5a468c35b1..4ccb4ea22ac6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -185,7 +185,8 @@ struct nvme_queue { struct nvme_dev *dev; spinlock_t sq_lock; struct nvme_command *sq_cmds; - spinlock_t cq_lock ____cacheline_aligned_in_smp; + /* only used for poll queues: */ + spinlock_t cq_poll_lock ____cacheline_aligned_in_smp; volatile struct nvme_completion *cqes; struct blk_mq_tags **tags; dma_addr_t sq_dma_addr; @@ -1050,12 +1051,16 @@ static irqreturn_t nvme_irq(int irq, void *data) irqreturn_t ret = IRQ_NONE; u16 start, end; - spin_lock(&nvmeq->cq_lock); + /* + * The rmb/wmb pair ensures we see all updates from a previous run of + * the irq handler, even if that was on another CPU. + */ + rmb(); if (nvmeq->cq_head != nvmeq->last_cq_head) ret = IRQ_HANDLED; nvme_process_cq(nvmeq, &start, &end, -1); nvmeq->last_cq_head = nvmeq->cq_head; - spin_unlock(&nvmeq->cq_lock); + wmb(); if (start != end) { nvme_complete_cqes(nvmeq, start, end); @@ -1079,13 +1084,24 @@ static irqreturn_t nvme_irq_check(int irq, void *data) */ static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) { - unsigned long flags; + struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); u16 start, end; int found; - spin_lock_irqsave(&nvmeq->cq_lock, flags); + /* + * For a poll queue we need to protect against the polling thread + * using the CQ lock. 
For normal interrupt driven threads we have + * to disable the interrupt to avoid racing with it. */ + if (nvmeq->cq_vector == -1) + spin_lock(&nvmeq->cq_poll_lock); + else + disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); found = nvme_process_cq(nvmeq, &start, &end, tag); + if (nvmeq->cq_vector == -1) + spin_unlock(&nvmeq->cq_poll_lock); + else + enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); nvme_complete_cqes(nvmeq, start, end); return found; @@ -1100,9 +1116,9 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx) if (!nvme_cqe_pending(nvmeq)) return 0; - spin_lock(&nvmeq->cq_lock); + spin_lock(&nvmeq->cq_poll_lock); found = nvme_process_cq(nvmeq, &start, &end, -1); - spin_unlock(&nvmeq->cq_lock); + spin_unlock(&nvmeq->cq_poll_lock); nvme_complete_cqes(nvmeq, start, end); return found; @@ -1482,7 +1498,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) nvmeq->q_dmadev = dev->dev; nvmeq->dev = dev; spin_lock_init(&nvmeq->sq_lock); - spin_lock_init(&nvmeq->cq_lock); + spin_lock_init(&nvmeq->cq_poll_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; @@ -1518,7 +1534,6 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { struct nvme_dev *dev = nvmeq->dev; - spin_lock_irq(&nvmeq->cq_lock); nvmeq->sq_tail = 0; nvmeq->last_sq_tail = 0; nvmeq->cq_head = 0; @@ -1527,7 +1542,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); nvme_dbbuf_init(dev, nvmeq, qid); dev->online_queues++; - spin_unlock_irq(&nvmeq->cq_lock); + wmb(); /* ensure the first interrupt sees the initialization */ } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) -- cgit From f9801a484ad6dcc33b10c61b143efc3352541802 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Dec 2018 17:46:24 +0100 Subject: nvme-rdma: remove I/O polling support The code was always a bit of a hack that digs far too much into RDMA core internals. Let's kick it out and reimplement proper dedicated poll queues as needed.
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3ca096c1a506..75c01d20a133 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1738,29 +1738,6 @@ err: return BLK_STS_IOERR; } -static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) -{ - struct nvme_rdma_queue *queue = hctx->driver_data; - struct ib_cq *cq = queue->ib_cq; - struct ib_wc wc; - int found = 0; - - while (ib_poll_cq(cq, 1, &wc) > 0) { - struct ib_cqe *cqe = wc.wr_cqe; - - if (cqe) { - if (cqe->done == nvme_rdma_recv_done) { - nvme_rdma_recv_done(cq, &wc); - found++; - } else { - cqe->done(cq, &wc); - } - } - } - - return found; -} - static void nvme_rdma_complete_rq(struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); @@ -1782,7 +1759,6 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { .init_request = nvme_rdma_init_request, .exit_request = nvme_rdma_exit_request, .init_hctx = nvme_rdma_init_hctx, - .poll = nvme_rdma_poll, .timeout = nvme_rdma_timeout, .map_queues = nvme_rdma_map_queues, }; -- cgit From 9d6610b76fa374eae3deb93bcbace4a06c2e3b95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Dec 2018 17:46:25 +0100 Subject: nvme-mpath: remove I/O polling support The ->poll_fn has been stale for a while, as a lot of places check for mq ops. But there is no real point in it anyway, as we don't even use the multipath code for subsystems without multiple ports, which is usually what we do high performance I/O to. If it really becomes an issue we should rework the nvme code to also skip the multipath code for any private namespace, even if that could mean some trouble when rescanning. Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/multipath.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index ffebdd0ae34b..ec310b1b9267 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -220,21 +220,6 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, return ret; } -static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc, bool spin) -{ - struct nvme_ns_head *head = q->queuedata; - struct nvme_ns *ns; - int found = 0; - int srcu_idx; - - srcu_idx = srcu_read_lock(&head->srcu); - ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu); - if (likely(ns && nvme_path_is_optimized(ns))) - found = ns->queue->poll_fn(q, qc, spin); - srcu_read_unlock(&head->srcu, srcu_idx); - return found; -} - static void nvme_requeue_work(struct work_struct *work) { struct nvme_ns_head *head = @@ -281,7 +266,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) goto out; q->queuedata = head; blk_queue_make_request(q, nvme_ns_head_make_request); - q->poll_fn = nvme_ns_head_poll; blk_queue_flag_set(QUEUE_FLAG_NONROT, q); /* set to a default value for 512 until disk is validated */ blk_queue_logical_block_size(q, 512); -- cgit From 376f7ef8bfeaee3993c2e85df1bbaa06725b9342 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Dec 2018 17:46:27 +0100 Subject: block: only allow polling if a poll queue_map exists This avoids having to have different mq_ops for different setups with or without poll queues.
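A rough sketch of the resulting shape, with invented names throughout: a single shared ops table always wires up .poll, and the core gates polling on whether poll queues exist, instead of selecting between a with-poll and a without-poll ops table:

#include <stdio.h>

struct dev;

struct ops {
        int (*queue_rq)(struct dev *);
        int (*poll)(struct dev *);      /* always wired up in the one table */
};

struct dev {
        const struct ops *ops;
        unsigned int nr_poll_queues;    /* 0 when no poll queue_map exists */
};

static int do_queue_rq(struct dev *d) { (void)d; return 0; }
static int do_poll(struct dev *d)     { (void)d; return 1; }

/* One shared table instead of a with-poll and a without-poll variant. */
static const struct ops shared_ops = {
        .queue_rq = do_queue_rq,
        .poll     = do_poll,
};

/* The core gates on the capability, not on which ops table is installed. */
static int blk_poll_entry(struct dev *d)
{
        if (!d->nr_poll_queues)
                return 0;               /* polling not allowed on this device */
        return d->ops->poll(d);
}

int main(void)
{
        struct dev d = { .ops = &shared_ops, .nr_poll_queues = 0 };

        printf("%d\n", blk_poll_entry(&d));     /* 0: no poll queues */
        d.nr_poll_queues = 2;
        printf("%d\n", blk_poll_entry(&d));     /* 1: poll hook runs */
        return 0;
}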
Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4ccb4ea22ac6..7732c4979a4e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1602,22 +1602,15 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { .timeout = nvme_timeout, }; -#define NVME_SHARED_MQ_OPS \ - .queue_rq = nvme_queue_rq, \ - .commit_rqs = nvme_commit_rqs, \ - .complete = nvme_pci_complete_rq, \ - .init_hctx = nvme_init_hctx, \ - .init_request = nvme_init_request, \ - .map_queues = nvme_pci_map_queues, \ - .timeout = nvme_timeout \ - static const struct blk_mq_ops nvme_mq_ops = { - NVME_SHARED_MQ_OPS, -}; - -static const struct blk_mq_ops nvme_mq_poll_ops = { - NVME_SHARED_MQ_OPS, - .poll = nvme_poll, + .queue_rq = nvme_queue_rq, + .complete = nvme_pci_complete_rq, + .commit_rqs = nvme_commit_rqs, + .init_hctx = nvme_init_hctx, + .init_request = nvme_init_request, + .map_queues = nvme_pci_map_queues, + .timeout = nvme_timeout, + .poll = nvme_poll, }; static void nvme_dev_remove_admin(struct nvme_dev *dev) @@ -2304,11 +2297,7 @@ static int nvme_dev_add(struct nvme_dev *dev) int ret; if (!dev->ctrl.tagset) { - if (dev->io_queues[HCTX_TYPE_POLL]) - dev->tagset.ops = &nvme_mq_poll_ops; - else - dev->tagset.ops = &nvme_mq_ops; - + dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; dev->tagset.nr_maps = HCTX_MAX_TYPES; dev->tagset.timeout = NVME_IO_TIMEOUT; -- cgit From 892ad71f622bbf39c6de321d5ca9b0fdec237c24 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 5 Dec 2018 12:10:30 -0500 Subject: dm: set the static flush bio device on demand The next patch changes the macro bio_set_dev() to associate a bio with a blkg based on the device set. However, dm creates a static bio to be used as the basis for cloning empty flush bios on creation. The bio_set_dev() call in alloc_dev() will cause problems with the next patch adding association to bio_set_dev() because the call is before the bdev is associated with a gendisk (bd_disk is %NULL). To get around this, set the device on the static bio every time and use that to clone to the other bios. Signed-off-by: Dennis Zhou Acked-by: Mike Snitzer Cc: Alasdair Kergon Signed-off-by: Jens Axboe --- drivers/md/dm.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a733e4c920af..ab72d79775ee 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1417,10 +1417,21 @@ static int __send_empty_flush(struct clone_info *ci) unsigned target_nr = 0; struct dm_target *ti; + /* + * Empty flush uses a statically initialized bio, &md->flush_bio, as + * the base for cloning. However, blkg association requires that a + * bdev is associated with a gendisk, which doesn't happen until the + * bdev is opened. So, blkg association is done at issue time of the + * flush rather than when the device is created in alloc_dev(). 
+ */ + bio_set_dev(ci->bio, ci->io->md->bdev); + BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + bio_disassociate_blkg(ci->bio); + return 0; } @@ -1939,7 +1950,6 @@ static struct mapped_device *alloc_dev(int minor) goto bad; bio_init(&md->flush_bio, NULL, 0); - bio_set_dev(&md->flush_bio, md->bdev); md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; dm_stats_init(&md->stats); -- cgit From db6638d7d177a8bc74c9e539e2e0d7d061c767b1 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 5 Dec 2018 12:10:35 -0500 Subject: blkcg: remove bio->bi_css and instead use bio->bi_blkg Prior patches ensured that any bio that interacts with a request_queue is properly associated with a blkg. This makes bio->bi_css unnecessary as blkg maintains a reference to blkcg already. This removes the bio field bi_css and transfers corresponding uses to access via bi_blkg. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/loop.c | 5 +++-- drivers/md/raid0.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 176ab1f28eca..0770004616de 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,6 +77,7 @@ #include #include #include +#include #include "loop.h" @@ -1820,8 +1821,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, /* always use the first bio's css */ #ifdef CONFIG_BLK_CGROUP - if (cmd->use_aio && rq->bio && rq->bio->bi_css) { - cmd->css = rq->bio->bi_css; + if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) { + cmd->css = &bio_blkcg(rq->bio)->css; css_get(cmd->css); } else #endif diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ac1cffd2a09b..f3fb5bb8c82a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); - bio_clone_blkcg_association(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), -- cgit From 1190203555ec58d768ea802e8fdbb5c42187d9d5 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 29 Oct 2018 16:44:18 -0700 Subject: nvme: consolidate memset calls in the nvme_setup_cmd path In function nvme_setup_cmd() we call a command-specific setup function for flush, rw, and discard. Instead of calling memset in each function, let's call it once in the parent function. This is purely a code cleanup patch and it does not change any existing functionality.
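A toy before/after of this refactor, assuming invented cmd and helper names: the parent performs the single memset() so the per-opcode helpers stay memset-free:

#include <string.h>

struct cmd {
        int opcode;
        int nsid;
};

static void setup_flush(struct cmd *c) { c->opcode = 1; }
static void setup_rw(struct cmd *c)    { c->opcode = 2; }

/*
 * The parent zeroes the command exactly once, so the per-opcode helpers
 * no longer each need their own memset() call.
 */
static void setup_cmd(struct cmd *c, int op)
{
        memset(c, 0, sizeof(*c));
        if (op == 0)
                setup_flush(c);
        else
                setup_rw(c);
}

int main(void)
{
        struct cmd c;

        setup_cmd(&c, 0);
        return c.opcode == 1 ? 0 : 1;
}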
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3ce7fc9df378..5c4f9402f4cd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -536,7 +536,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { - memset(cmnd, 0, sizeof(*cmnd)); cmnd->common.opcode = nvme_cmd_flush; cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); } @@ -569,7 +568,6 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, return BLK_STS_IOERR; } - memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); cmnd->dsm.nr = cpu_to_le32(segments - 1); @@ -598,7 +596,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - memset(cmnd, 0, sizeof(*cmnd)); cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); @@ -663,6 +660,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, nvme_clear_nvme_request(req); + memset(cmd, 0, sizeof(*cmd)); switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: -- cgit From 103e515efa89be33d04e45aae82de136f0c49865 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 16 Nov 2018 09:22:29 +0100 Subject: nvme: add a numa_node field to struct nvme_ctrl Instead of directly poking into the struct device, add a new numa_node field to struct nvme_ctrl. This allows fabrics drivers where ctrl->dev is a virtual device to support NUMA affinity as well. Also expose the field as a sysfs attribute, and populate it for the RDMA and FC transports.
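For illustration only, a user-space sketch of why caching the node id is convenient: consumers such as path selection can compare cached ids directly; fake_node_distance() is a stand-in for the kernel's node_distance(), and the node values are made up:

#include <stdlib.h>
#include <stdio.h>

struct ctrl {
        int numa_node;          /* cached once, instead of dev_to_node() pokes */
};

/* Toy stand-in for node_distance(): farther ids count as "farther" nodes. */
static int fake_node_distance(int a, int b)
{
        return abs(a - b);
}

/* Pick the controller whose cached node is closest to the caller's node. */
static struct ctrl *closest_ctrl(struct ctrl *ctrls, int n, int local_node)
{
        struct ctrl *best = NULL;
        int best_dist = -1, i, d;

        for (i = 0; i < n; i++) {
                d = fake_node_distance(local_node, ctrls[i].numa_node);
                if (!best || d < best_dist) {
                        best = &ctrls[i];
                        best_dist = d;
                }
        }
        return best;
}

int main(void)
{
        struct ctrl ctrls[] = { { .numa_node = 0 }, { .numa_node = 1 } };

        printf("node %d wins\n", closest_ctrl(ctrls, 2, 1)->numa_node);
        return 0;
}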
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 +++- drivers/nvme/host/fc.c | 5 +++-- drivers/nvme/host/multipath.c | 4 ++-- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/rdma.c | 5 +++-- 5 files changed, 12 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5c4f9402f4cd..e57c673ae737 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2766,6 +2766,7 @@ static ssize_t field##_show(struct device *dev, \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); nvme_show_int_function(cntlid); +nvme_show_int_function(numa_node); static ssize_t nvme_sysfs_delete(struct device *dev, struct device_attribute *attr, const char *buf, @@ -2845,6 +2846,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_subsysnqn.attr, &dev_attr_address.attr, &dev_attr_state.attr, + &dev_attr_numa_node.attr, NULL }; @@ -3055,7 +3057,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT; + int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e6994e4ce17e..b79e41938513 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2425,7 +2425,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) ctrl->tag_set.ops = &nvme_fc_mq_ops; ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; ctrl->tag_set.reserved_tags = 1; /* fabric connect */ - ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.numa_node = ctrl->ctrl.numa_node; ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ctrl->tag_set.cmd_size = struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, @@ -3018,6 +3018,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.opts = opts; ctrl->ctrl.nr_reconnects = 0; + ctrl->ctrl.numa_node = dev_to_node(lport->dev); INIT_LIST_HEAD(&ctrl->ctrl_list); ctrl->lport = lport; ctrl->rport = rport; @@ -3058,7 +3059,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ - ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node; ctrl->admin_tag_set.cmd_size = struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, ctrl->lport->ops->fcprqst_priv_sz); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index ec310b1b9267..183ec17ba067 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -141,7 +141,7 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) test_bit(NVME_NS_ANA_PENDING, &ns->flags)) continue; - distance = node_distance(node, dev_to_node(ns->ctrl->dev)); + distance = node_distance(node, ns->ctrl->numa_node); switch (ns->ana_state) { case NVME_ANA_OPTIMIZED: @@ -261,7 +261,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) return 0; - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); + q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node); if (!q) goto out; q->queuedata = head; diff --git 
a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ae77eb16fd1f..f1e4566e6ca8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -153,6 +153,7 @@ struct nvme_ctrl { struct request_queue *connect_q; struct device *dev; int instance; + int numa_node; struct blk_mq_tag_set *tagset; struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 75c01d20a133..f2db848f6985 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -694,7 +694,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->ops = &nvme_rdma_admin_mq_ops; set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; set->reserved_tags = 2; /* connect + keep-alive */ - set->numa_node = NUMA_NO_NODE; + set->numa_node = nctrl->numa_node; set->cmd_size = sizeof(struct nvme_rdma_request) + SG_CHUNK_SIZE * sizeof(struct scatterlist); set->driver_data = ctrl; @@ -707,7 +707,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->ops = &nvme_rdma_mq_ops; set->queue_depth = nctrl->sqsize + 1; set->reserved_tags = 1; /* fabric connect */ - set->numa_node = NUMA_NO_NODE; + set->numa_node = nctrl->numa_node; set->flags = BLK_MQ_F_SHOULD_MERGE; set->cmd_size = sizeof(struct nvme_rdma_request) + SG_CHUNK_SIZE * sizeof(struct scatterlist); @@ -763,6 +763,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, return error; ctrl->device = ctrl->queues[0].device; + ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device); ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); -- cgit From 12b2117161ddbdcdb69777404c5aa2a9fe6ad7d5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 2 Nov 2018 10:28:12 -0700 Subject: nvme: introduce ctrl attributes enumeration We are growing more controller attributes, so use a proper enumeration for it. For now just add the 128-bit hostid which we support. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 1179f6314323..30778ffc46f5 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -304,7 +304,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) /* XXX: figure out what to do about RTD3R/RTD3 */ id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL); - id->ctratt = cpu_to_le32(1 << 0); + id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT); id->oacs = 0; -- cgit From 3e53ba38a9404434a8b57683825279f4305b5a76 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 2 Nov 2018 10:28:14 -0700 Subject: nvme: cache controller attributes We get the controller attributes in identify, cache them as we'll need them for traffic based keep alive support. 
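Since ctratt is a little-endian field in the Identify Controller data, it is byte-swapped once at identify time and can then be tested cheaply. A minimal sketch of the intended consumer, the TBKAS check added by the next patch (the helper name here is invented; the constant follows this series):

static bool example_supports_tbkas(struct nvme_ctrl *ctrl)
{
	/* Traffic-based keep-alive support, cached from Identify. */
	return ctrl->ctratt & NVME_CTRL_ATTR_TBKAS;
}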
Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/nvme.h | 1 + 2 files changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e57c673ae737..9de6244a345c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2409,6 +2409,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); ctrl->max_namespaces = le32_to_cpu(id->mnan); + ctrl->ctratt = le32_to_cpu(id->ctratt); if (id->rtd3e) { /* us -> s */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f1e4566e6ca8..4be7bbcfe66d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -194,6 +194,7 @@ struct nvme_ctrl { u8 apsta; u32 oaes; u32 aen_result; + u32 ctratt; unsigned int shutdown_timeout; unsigned int kato; bool subsystem; -- cgit From 6e3ca03ee934572d5de4fb2224c01e12c4d422c8 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 2 Nov 2018 10:28:15 -0700 Subject: nvme: support traffic based keep-alive If the controller supports traffic based keep alive, we restart the keep alive timer if any admin or io commands was completed during the kato period. This prevents a possible starvation of keep alive commands in the presence of heavy traffic as in such case, we already have a health indication from the host perspective. Only set a comp_seen indicator in case the controller supports keep alive to minimize the overhead for pci controllers. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 13 +++++++++++++ drivers/nvme/host/nvme.h | 1 + 2 files changed, 14 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9de6244a345c..48ffb1d685c2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -251,6 +251,9 @@ void nvme_complete_rq(struct request *req) trace_nvme_complete_rq(req); + if (nvme_req(req)->ctrl->kas) + nvme_req(req)->ctrl->comp_seen = true; + if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { if ((req->cmd_flags & REQ_NVME_MPATH) && blk_path_error(status)) { @@ -839,6 +842,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) return; } + ctrl->comp_seen = false; schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); } @@ -863,6 +867,15 @@ static void nvme_keep_alive_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), struct nvme_ctrl, ka_work); + bool comp_seen = ctrl->comp_seen; + + if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) { + dev_dbg(ctrl->device, + "reschedule traffic based keep-alive timer\n"); + ctrl->comp_seen = false; + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + return; + } if (nvme_keep_alive(ctrl)) { /* allocation failure, reset the controller */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4be7bbcfe66d..f2594d468f29 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -145,6 +145,7 @@ enum nvme_ctrl_state { }; struct nvme_ctrl { + bool comp_seen; enum nvme_ctrl_state state; bool identified; spinlock_t lock; -- cgit From c09305ae49970e15cd18828c0f78b766e8cf224f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 2 Nov 2018 10:28:13 -0700 Subject: nvmet: support for traffic based keep-alive A controller that supports traffic 
based keep-alive can restart the keep-alive timer even when no keep-alive command was received during the kato period, as long as other admin or I/O commands were received. Each command sets ctrl->cmd_seen to true; when the keep-alive timer expires and any commands were seen, ka_work is rescheduled instead of escalating to a fatal error. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 3 ++- drivers/nvme/target/core.c | 12 ++++++++++++ drivers/nvme/target/nvmet.h | 2 ++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 30778ffc46f5..c9c6d25a3ec2 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -304,7 +304,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) /* XXX: figure out what to do about RTD3R/RTD3 */ id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL); - id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT); + id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT | + NVME_CTRL_ATTR_TBKAS); id->oacs = 0; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index a5f9bbce863f..05d8d9bc058f 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -299,6 +299,15 @@ static void nvmet_keep_alive_timer(struct work_struct *work) { struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work), struct nvmet_ctrl, ka_work); + bool cmd_seen = ctrl->cmd_seen; + + ctrl->cmd_seen = false; + if (cmd_seen) { + pr_debug("ctrl %d reschedule traffic based keep-alive timer\n", + ctrl->cntlid); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + return; + } pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n", ctrl->cntlid, ctrl->kato); @@ -801,6 +810,9 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, goto fail; } + if (sq->ctrl) + sq->ctrl->cmd_seen = true; + return true; fail: diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index c2b4d9ee6391..860218edeb6c 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -163,6 +163,8 @@ struct nvmet_ctrl { struct nvmet_cq **cqs; struct nvmet_sq **sqs; + bool cmd_seen; + struct mutex lock; u64 cap; u32 cc; -- cgit From 50a909db36f244bcfec6e02598d31c0b0a468175 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 14 Nov 2018 01:12:19 -0500 Subject: nvmet: use IOCB_NOWAIT for file-ns buffered I/O This patch optimizes read command behavior when a file-backed namespace is configured with buffered I/O. Instead of offloading the buffered read operations to the worker threads, we first issue the read with IOCB_NOWAIT and try to serve the data from the page cache. Only when the IOCB_NOWAIT attempt fails do we offload the request to a worker thread and complete it in that context.
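The userspace analogue of this fast path is preadv2() with RWF_NOWAIT, which likewise succeeds only if the data is already resident. A minimal sketch of the same try-fast-then-fall-back shape (assumes Linux 4.14+ and glibc 2.26+ for RWF_NOWAIT; the function name is invented):

#define _GNU_SOURCE
#include <errno.h>
#include <sys/uio.h>

static ssize_t read_cached_or_blocking(int fd, void *buf, size_t len,
				       off_t off)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	ssize_t ret = preadv2(fd, &iov, 1, off, RWF_NOWAIT);

	if (ret >= 0)
		return ret;	/* served straight from the page cache */
	if (errno != EAGAIN && errno != EOPNOTSUPP)
		return -1;	/* a real I/O error */

	/*
	 * Data not resident (or NOWAIT unsupported): take the blocking
	 * path -- the kernel patch queues this part to a worker thread.
	 */
	return preadv2(fd, &iov, 1, off, 0);
}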
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd-file.c | 130 ++++++++++++++++++++++++-------------- 1 file changed, 84 insertions(+), 46 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 01feebec29ea..12eaa8ddc248 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -83,17 +83,16 @@ static void nvmet_file_init_bvec(struct bio_vec *bv, struct sg_page_iter *iter) } static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, - unsigned long nr_segs, size_t count) + unsigned long nr_segs, size_t count, int ki_flags) { struct kiocb *iocb = &req->f.iocb; ssize_t (*call_iter)(struct kiocb *iocb, struct iov_iter *iter); struct iov_iter iter; - int ki_flags = 0, rw; - ssize_t ret; + int rw; if (req->cmd->rw.opcode == nvme_cmd_write) { if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) - ki_flags = IOCB_DSYNC; + ki_flags |= IOCB_DSYNC; call_iter = req->ns->file->f_op->write_iter; rw = WRITE; } else { @@ -107,12 +106,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, iocb->ki_filp = req->ns->file; iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); - ret = call_iter(iocb, &iter); - - if (ret != -EIOCBQUEUED && iocb->ki_complete) - iocb->ki_complete(iocb, ret, 0); - - return ret; + return call_iter(iocb, &iter); } static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) @@ -130,7 +124,7 @@ static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) NVME_SC_INTERNAL | NVME_SC_DNR : 0); } -static void nvmet_file_execute_rw(struct nvmet_req *req) +static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) { ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE); struct sg_page_iter sg_pg_iter; @@ -140,30 +134,14 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) ssize_t ret = 0; loff_t pos; - if (!req->sg_cnt || !nr_bvec) { - nvmet_req_complete(req, 0); - return; - } + + if (req->f.mpool_alloc && nr_bvec > NVMET_MAX_MPOOL_BVEC) + is_sync = true; pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; if (unlikely(pos + req->data_len > req->ns->size)) { nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); - return; - } - - if (nr_bvec > NVMET_MAX_INLINE_BIOVEC) - req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), - GFP_KERNEL); - else - req->f.bvec = req->inline_bvec; - - req->f.mpool_alloc = false; - if (unlikely(!req->f.bvec)) { - /* fallback under memory pressure */ - req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL); - req->f.mpool_alloc = true; - if (nr_bvec > NVMET_MAX_MPOOL_BVEC) - is_sync = true; + return true; } memset(&req->f.iocb, 0, sizeof(struct kiocb)); @@ -177,9 +155,10 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) if (unlikely(is_sync) && (nr_bvec - 1 == 0 || bv_cnt == NVMET_MAX_MPOOL_BVEC)) { - ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len); + ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len, 0); if (ret < 0) - goto out; + goto complete; + pos += len; bv_cnt = 0; len = 0; @@ -187,30 +166,92 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) nr_bvec--; } - if (WARN_ON_ONCE(total_len != req->data_len)) + if (WARN_ON_ONCE(total_len != req->data_len)) { ret = -EIO; -out: - if (unlikely(is_sync || ret)) { - nvmet_file_io_done(&req->f.iocb, ret < 0 ? 
ret : total_len, 0); - return; + goto complete; + } + + if (unlikely(is_sync)) { + ret = total_len; + goto complete; } - req->f.iocb.ki_complete = nvmet_file_io_done; - nvmet_file_submit_bvec(req, pos, bv_cnt, total_len); + + /* + * A NULL ki_complete ask for synchronous execution, which we want + * for the IOCB_NOWAIT case. + */ + if (!(ki_flags & IOCB_NOWAIT)) + req->f.iocb.ki_complete = nvmet_file_io_done; + + ret = nvmet_file_submit_bvec(req, pos, bv_cnt, total_len, ki_flags); + + switch (ret) { + case -EIOCBQUEUED: + return true; + case -EAGAIN: + if (WARN_ON_ONCE(!(ki_flags & IOCB_NOWAIT))) + goto complete; + return false; + case -EOPNOTSUPP: + /* + * For file systems returning error -EOPNOTSUPP, handle + * IOCB_NOWAIT error case separately and retry without + * IOCB_NOWAIT. + */ + if ((ki_flags & IOCB_NOWAIT)) + return false; + break; + } + +complete: + nvmet_file_io_done(&req->f.iocb, ret, 0); + return true; } static void nvmet_file_buffered_io_work(struct work_struct *w) { struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); - nvmet_file_execute_rw(req); + nvmet_file_execute_io(req, 0); } -static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req) +static void nvmet_file_submit_buffered_io(struct nvmet_req *req) { INIT_WORK(&req->f.work, nvmet_file_buffered_io_work); queue_work(buffered_io_wq, &req->f.work); } +static void nvmet_file_execute_rw(struct nvmet_req *req) +{ + ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE); + + if (!req->sg_cnt || !nr_bvec) { + nvmet_req_complete(req, 0); + return; + } + + if (nr_bvec > NVMET_MAX_INLINE_BIOVEC) + req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), + GFP_KERNEL); + else + req->f.bvec = req->inline_bvec; + + if (unlikely(!req->f.bvec)) { + /* fallback under memory pressure */ + req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL); + req->f.mpool_alloc = true; + } else + req->f.mpool_alloc = false; + + if (req->ns->buffered_io) { + if (likely(!req->f.mpool_alloc) && + nvmet_file_execute_io(req, IOCB_NOWAIT)) + return; + nvmet_file_submit_buffered_io(req); + } else + nvmet_file_execute_io(req, 0); +} + u16 nvmet_file_flush(struct nvmet_req *req) { if (vfs_fsync(req->ns->file, 1) < 0) @@ -320,10 +361,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) switch (cmd->common.opcode) { case nvme_cmd_read: case nvme_cmd_write: - if (req->ns->buffered_io) - req->execute = nvmet_file_execute_rw_buffered_io; - else - req->execute = nvmet_file_execute_rw; + req->execute = nvmet_file_execute_rw; req->data_len = nvmet_rw_len(req); return 0; case nvme_cmd_flush: -- cgit From 6c8312ad509c3aaaa8720fee7abe45b813cd4d87 Mon Sep 17 00:00:00 2001 From: Jay Sternberg Date: Mon, 12 Nov 2018 13:56:33 -0800 Subject: nvmet: provide aen bit functions for multiple controller types Move nvmet_aen_disabled and nvmet_clear_aen in preparation for other types of controllers to use, initially the discovery controller. 
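The moved helpers implement a small arm/mask state machine: test_and_set_bit() guarantees at most one outstanding AEN per event type, and the bit is only re-armed when the host reads the associated log page without the RAE (Retain Asynchronous Event) flag, cdw10 bit 15. A single-threaded userspace model of that lifecycle, using bit numbers as the follow-up fix below makes explicit (plain bit ops stand in for the kernel's atomics; all names are invented):

static unsigned long aen_masked;

static int aen_should_send(unsigned long enabled, unsigned int bit)
{
	if (!(enabled & (1UL << bit)))
		return 0;			/* host disabled this event */
	if (aen_masked & (1UL << bit))
		return 0;			/* one AEN already outstanding */
	aen_masked |= 1UL << bit;		/* arm: send exactly one */
	return 1;
}

static void aen_log_page_read(unsigned int bit, int rae)
{
	if (!rae)
		aen_masked &= ~(1UL << bit);	/* re-arm for the next event */
}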
Signed-off-by: Jay Sternberg Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 13 ------------- drivers/nvme/target/core.c | 7 ------- drivers/nvme/target/nvmet.h | 15 +++++++++++++++ 3 files changed, 15 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index c9c6d25a3ec2..e82262c988f1 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -19,19 +19,6 @@ #include #include "nvmet.h" -/* - * This helper allows us to clear the AEN based on the RAE bit, - * Please use this helper when processing the log pages which are - * associated with the AEN. - */ -static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit) -{ - int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15; - - if (!rae) - clear_bit(aen_bit, &req->sq->ctrl->aen_masked); -} - u32 nvmet_get_log_page_len(struct nvme_command *cmd) { u32 len = le16_to_cpu(cmd->get_log_page.numdu); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 05d8d9bc058f..f33c4a20b572 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -150,13 +150,6 @@ static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, schedule_work(&ctrl->async_event_work); } -static bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen) -{ - if (!(READ_ONCE(ctrl->aen_enabled) & aen)) - return true; - return test_and_set_bit(aen, &ctrl->aen_masked); -} - static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid) { u32 i; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 860218edeb6c..7efee345d467 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -342,6 +342,21 @@ struct nvmet_async_event { u8 log_page; }; +static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit) +{ + int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15; + + if (!rae) + clear_bit(aen_bit, &req->sq->ctrl->aen_masked); +} + +static inline bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen) +{ + if (!(READ_ONCE(ctrl->aen_enabled) & aen)) + return true; + return test_and_set_bit(aen, &ctrl->aen_masked); +} + u16 nvmet_parse_connect_cmd(struct nvmet_req *req); u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); -- cgit From 7114ddeb40c0ccc584d86df598da4054ca4cd79f Mon Sep 17 00:00:00 2001 From: Jay Sternberg Date: Mon, 12 Nov 2018 13:56:34 -0800 Subject: nvmet: change aen mask functions to use bit numbers Functions nvmet_aen_disabled and nvmet_clear_aen were passing mask values rather than bit numbers (i.e., 1 << 9 instead of 9) to the bit operations clear_bit and test_and_set_bit, which expect bit numbers.
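The distinction matters because the kernel bit operations index into an arbitrarily long bitmap rather than testing a mask. A toy standalone demonstration of the bug class being fixed (using the commit's own 1 << 9 example):

#include <assert.h>

int main(void)
{
	unsigned long word = 0;

	word |= 1UL << 9;		/* mask style: sets bit 9 of word */
	assert(word == 0x200);

	/*
	 * set_bit(1 << 9, &word) would instead address bit 512, which
	 * lives eight longs past &word on a 64-bit machine -- an
	 * out-of-bounds write, not bit 9.
	 */
	assert((1 << 9) / 64 == 8);
	return 0;
}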
Signed-off-by: Jay Sternberg Reviewed-by: Phil Cayton Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 4 ++-- drivers/nvme/target/core.c | 4 ++-- drivers/nvme/target/nvmet.h | 10 +++++----- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index e82262c988f1..2e89f4e3364b 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -176,7 +176,7 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) if (!status) status = nvmet_zero_sgl(req, len, req->data_len - len); ctrl->nr_changed_ns = 0; - nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR); + nvmet_clear_aen_bit(req, NVME_AEN_BIT_NS_ATTR); mutex_unlock(&ctrl->lock); out: nvmet_req_complete(req, status); @@ -239,7 +239,7 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt); hdr.ngrps = cpu_to_le16(ngrps); - nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE); + nvmet_clear_aen_bit(req, NVME_AEN_BIT_ANA_CHANGE); up_read(&nvmet_ana_sem); kfree(desc); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index f33c4a20b572..f42a105ef17f 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -180,7 +180,7 @@ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid)); - if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_NS_ATTR)) + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR)) continue; nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, NVME_AER_NOTICE_NS_CHANGED, @@ -197,7 +197,7 @@ void nvmet_send_ana_event(struct nvmet_subsys *subsys, list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { if (port && ctrl->port != port) continue; - if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE)) + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE)) continue; nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, NVME_AER_NOTICE_ANA, NVME_LOG_ANA); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 7efee345d467..8ddc54fa98c7 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -342,19 +342,19 @@ struct nvmet_async_event { u8 log_page; }; -static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit) +static inline void nvmet_clear_aen_bit(struct nvmet_req *req, u32 bn) { int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15; if (!rae) - clear_bit(aen_bit, &req->sq->ctrl->aen_masked); + clear_bit(bn, &req->sq->ctrl->aen_masked); } -static inline bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen) +static inline bool nvmet_aen_bit_disabled(struct nvmet_ctrl *ctrl, u32 bn) { - if (!(READ_ONCE(ctrl->aen_enabled) & aen)) + if (!(READ_ONCE(ctrl->aen_enabled) & (1 << bn))) return true; - return test_and_set_bit(aen, &ctrl->aen_masked); + return test_and_set_bit(bn, &ctrl->aen_masked); } u16 nvmet_parse_connect_cmd(struct nvmet_req *req); -- cgit From f9362ac1738a41cc526fde82e76beb034d8c6053 Mon Sep 17 00:00:00 2001 From: Jay Sternberg Date: Mon, 12 Nov 2018 13:56:35 -0800 Subject: nvmet: allow Keep Alive for Discovery controller Per change to specification allowing Discovery controllers to have explicit persistent connections, remove restriction on Discovery controllers allowing kato on connect. 
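The resulting policy can be sketched as follows (a standalone model, not the patch itself; 120000 ms is NVMET_DISC_KATO_MS from this series): a discovery controller that connects without a keep-alive value still gets a long default timeout so stale discovery sessions are reaped, while an explicit persistent connection may now pass its own kato.

static unsigned int example_effective_kato_secs(int is_discovery,
						unsigned int kato_ms)
{
	if (is_discovery && !kato_ms)
		kato_ms = 120000;	/* NVMET_DISC_KATO_MS */

	/* keep-alive timeout is tracked in seconds, rounded up */
	return (kato_ms + 999) / 1000;
}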
Signed-off-by: Jay Sternberg Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/core.c | 36 ++++++++++-------------------------- drivers/nvme/target/discovery.c | 4 ++++ drivers/nvme/target/nvmet.h | 4 +++- 4 files changed, 18 insertions(+), 28 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 2e89f4e3364b..ad94119f6a51 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -726,7 +726,7 @@ static void nvmet_execute_async_event(struct nvmet_req *req) schedule_work(&ctrl->async_event_work); } -static void nvmet_execute_keep_alive(struct nvmet_req *req) +void nvmet_execute_keep_alive(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index f42a105ef17f..46345c3f9d4a 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1180,31 +1180,17 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, ctrl->cntlid = ret; ctrl->ops = req->ops; - if (ctrl->subsys->type == NVME_NQN_DISC) { - /* Don't accept keep-alive timeout for discovery controllers */ - if (kato) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - goto out_remove_ida; - } - /* - * Discovery controllers use some arbitrary high value in order - * to cleanup stale discovery sessions - * - * From the latest base diff RC: - * "The Keep Alive command is not supported by - * Discovery controllers. A transport may specify a - * fixed Discovery controller activity timeout value - * (e.g., 2 minutes). If no commands are received - * by a Discovery controller within that time - * period, the controller may perform the - * actions for Keep Alive Timer expiration". 
- */ - ctrl->kato = NVMET_DISC_KATO; - } else { - /* keep-alive timeout in seconds */ - ctrl->kato = DIV_ROUND_UP(kato, 1000); - } + /* + * Discovery controllers may use some arbitrary high value + * in order to cleanup stale discovery sessions + */ + if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato) + kato = NVMET_DISC_KATO_MS; + + /* keep-alive timeout in seconds */ + ctrl->kato = DIV_ROUND_UP(kato, 1000); + nvmet_start_keep_alive_timer(ctrl); mutex_lock(&subsys->lock); @@ -1215,8 +1201,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, *ctrlp = ctrl; return 0; -out_remove_ida: - ida_simple_remove(&cntlid_ida, ctrl->cntlid); out_free_sqs: kfree(ctrl->sqs); out_free_cqs: diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index bc0aa0bf1543..6e0f91eb34ce 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -194,6 +194,10 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) } switch (cmd->common.opcode) { + case nvme_admin_keep_alive: + req->execute = nvmet_execute_keep_alive; + req->data_len = 0; + return 0; case nvme_admin_get_log_page: req->data_len = nvmet_get_log_page_len(cmd); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 8ddc54fa98c7..c525fb0510c5 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -372,6 +372,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status); int nvmet_req_alloc_sgl(struct nvmet_req *req); void nvmet_req_free_sgl(struct nvmet_req *req); +void nvmet_execute_keep_alive(struct nvmet_req *req); + void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size); void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, @@ -442,7 +444,7 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd); #define NVMET_DEFAULT_ANA_GRPID 1 #define NVMET_KAS 10 -#define NVMET_DISC_KATO 120 +#define NVMET_DISC_KATO_MS 120000 int __init nvmet_init_configfs(void); void __exit nvmet_exit_configfs(void); -- cgit From 90107455cce753c05a5e1e80cb84b09da1c87eef Mon Sep 17 00:00:00 2001 From: Jay Sternberg Date: Mon, 12 Nov 2018 13:56:36 -0800 Subject: nvmet: make kato and AEN processing for use by other controllers Make common process of get/set features available to other controllers by making simple functions static inline and others not static and prototypes in nvmet.h file Also remove static from nvmet_execute_async_event and add prototype to nvmet.h to allow used by other controllers Signed-off-by: Jay Sternberg Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 54 +++++++++++++++++++++++++++++------------ drivers/nvme/target/nvmet.h | 6 +++++ 2 files changed, 45 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index ad94119f6a51..753515fc8028 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -587,11 +587,34 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) return status; } +u16 nvmet_set_feat_kato(struct nvmet_req *req) +{ + u32 val32 = le32_to_cpu(req->cmd->common.cdw10[1]); + + req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); + + nvmet_set_result(req, req->sq->ctrl->kato); + + return 0; +} + +u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) +{ + u32 val32 = 
le32_to_cpu(req->cmd->common.cdw10[1]); + + if (val32 & ~mask) + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + + WRITE_ONCE(req->sq->ctrl->aen_enabled, val32); + nvmet_set_result(req, val32); + + return 0; +} + static void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); - u32 val32; u16 status = 0; switch (cdw10 & 0xff) { @@ -600,19 +623,10 @@ static void nvmet_execute_set_features(struct nvmet_req *req) (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); break; case NVME_FEAT_KATO: - val32 = le32_to_cpu(req->cmd->common.cdw10[1]); - req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); - nvmet_set_result(req, req->sq->ctrl->kato); + status = nvmet_set_feat_kato(req); break; case NVME_FEAT_ASYNC_EVENT: - val32 = le32_to_cpu(req->cmd->common.cdw10[1]); - if (val32 & ~NVMET_AEN_CFG_ALL) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - break; - } - - WRITE_ONCE(req->sq->ctrl->aen_enabled, val32); - nvmet_set_result(req, val32); + status = nvmet_set_feat_async_event(req, NVMET_AEN_CFG_ALL); break; case NVME_FEAT_HOST_ID: status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; @@ -648,6 +662,16 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) return 0; } +void nvmet_get_feat_kato(struct nvmet_req *req) +{ + nvmet_set_result(req, req->sq->ctrl->kato * 1000); +} + +void nvmet_get_feat_async_event(struct nvmet_req *req) +{ + nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); +} + static void nvmet_execute_get_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; @@ -677,7 +701,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req) break; #endif case NVME_FEAT_ASYNC_EVENT: - nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); + nvmet_get_feat_async_event(req); break; case NVME_FEAT_VOLATILE_WC: nvmet_set_result(req, 1); @@ -687,7 +711,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req) (subsys->max_qid-1) | ((subsys->max_qid-1) << 16)); break; case NVME_FEAT_KATO: - nvmet_set_result(req, req->sq->ctrl->kato * 1000); + nvmet_get_feat_kato(req); break; case NVME_FEAT_HOST_ID: /* need 128-bit host identifier flag */ @@ -710,7 +734,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req) nvmet_req_complete(req, status); } -static void nvmet_execute_async_event(struct nvmet_req *req) +void nvmet_execute_async_event(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index c525fb0510c5..a8ee265a3806 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -357,6 +357,12 @@ static inline bool nvmet_aen_bit_disabled(struct nvmet_ctrl *ctrl, u32 bn) return test_and_set_bit(bn, &ctrl->aen_masked); } +void nvmet_get_feat_kato(struct nvmet_req *req); +void nvmet_get_feat_async_event(struct nvmet_req *req); +u16 nvmet_set_feat_kato(struct nvmet_req *req); +u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask); +void nvmet_execute_async_event(struct nvmet_req *req); + u16 nvmet_parse_connect_cmd(struct nvmet_req *req); u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); -- cgit From f301c2b1368905340133ff8ef4485befdd0b7e2d Mon Sep 17 00:00:00 2001 From: Jay Sternberg Date: Mon, 12 Nov 2018 13:56:37 -0800 Subject: nvmet: add defines for discovery change async events Add AEN/AER values as defined by the specification Signed-off-by: Jay 
Sternberg Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/nvmet.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index a8ee265a3806..bc99c700a583 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -36,6 +36,8 @@ */ #define NVMET_AEN_CFG_OPTIONAL \ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE) +#define NVMET_DISC_AEN_CFG_OPTIONAL \ + (NVME_AEN_CFG_DISC_CHANGE) /* * Plus mandatory SMART AENs (we'll never send them, but allow enabling them): -- cgit From 6a8ec0ac5ede074232137e505d4b90e56c1e4511 Mon Sep 17 00:00:00 2001 From: Jay Sternberg Date: Mon, 12 Nov 2018 13:56:38 -0800 Subject: nvmet: add support to Discovery controllers for commands Add custom get/set features to commands allowed by Discovery controllers. Signed-off-by: Jay Sternberg Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/discovery.c | 53 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 6e0f91eb34ce..050e4d759b65 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -183,6 +183,47 @@ out: nvmet_req_complete(req, status); } +static void nvmet_execute_disc_set_features(struct nvmet_req *req) +{ + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u16 stat; + + switch (cdw10 & 0xff) { + case NVME_FEAT_KATO: + stat = nvmet_set_feat_kato(req); + break; + case NVME_FEAT_ASYNC_EVENT: + stat = nvmet_set_feat_async_event(req, + NVMET_DISC_AEN_CFG_OPTIONAL); + break; + default: + stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, stat); +} + +static void nvmet_execute_disc_get_features(struct nvmet_req *req) +{ + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u16 stat = 0; + + switch (cdw10 & 0xff) { + case NVME_FEAT_KATO: + nvmet_get_feat_kato(req); + break; + case NVME_FEAT_ASYNC_EVENT: + nvmet_get_feat_async_event(req); + break; + default: + stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, stat); +} + u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; @@ -194,6 +235,18 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) } switch (cmd->common.opcode) { + case nvme_admin_set_features: + req->execute = nvmet_execute_disc_set_features; + req->data_len = 0; + return 0; + case nvme_admin_get_features: + req->execute = nvmet_execute_disc_get_features; + req->data_len = 0; + return 0; + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + req->data_len = 0; + return 0; case nvme_admin_keep_alive: req->execute = nvmet_execute_keep_alive; req->data_len = 0; -- cgit From 253928eec61a52935584777f0dfba6cdb63967b6 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 12 Nov 2018 13:56:39 -0800 Subject: nvmet: allow host connect even if no allowed subsystems are exported It is perfectly valid that a host connects to a discovery subsystem and gets an empty discovery log page since no subsystems are provisioned to it. No reason to disallow connecting to the discovery subsystem all together. 
Signed-off-by: Sagi Grimberg Reviewed-by: Jay Sternberg Reviewed-by: Phil Cayton Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 34 +++++++--------------------------- drivers/nvme/target/discovery.c | 2 +- drivers/nvme/target/nvmet.h | 3 +-- 3 files changed, 9 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 46345c3f9d4a..849080b4115c 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1032,14 +1032,18 @@ u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd) return 0; } -static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, - const char *hostnqn) +bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn) { struct nvmet_host_link *p; + lockdep_assert_held(&nvmet_config_sem); + if (subsys->allow_any_host) return true; + if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */ + return true; + list_for_each_entry(p, &subsys->hosts, entry) { if (!strcmp(nvmet_host_name(p->host), hostnqn)) return true; @@ -1048,30 +1052,6 @@ static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, return false; } -static bool nvmet_host_discovery_allowed(struct nvmet_req *req, - const char *hostnqn) -{ - struct nvmet_subsys_link *s; - - list_for_each_entry(s, &req->port->subsystems, entry) { - if (__nvmet_host_allowed(s->subsys, hostnqn)) - return true; - } - - return false; -} - -bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, - const char *hostnqn) -{ - lockdep_assert_held(&nvmet_config_sem); - - if (subsys->type == NVME_NQN_DISC) - return nvmet_host_discovery_allowed(req, hostnqn); - else - return __nvmet_host_allowed(subsys, hostnqn); -} - /* * Note: ctrl->subsys->lock should be held when calling this function */ @@ -1122,7 +1102,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; down_read(&nvmet_config_sem); - if (!nvmet_host_allowed(req, subsys, hostnqn)) { + if (!nvmet_host_allowed(subsys, hostnqn)) { pr_info("connect by host %s for subsystem %s not allowed\n", hostnqn, subsysnqn); req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 050e4d759b65..5fbb1bb2c4fc 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -107,7 +107,7 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) down_read(&nvmet_config_sem); list_for_each_entry(p, &req->port->subsystems, entry) { - if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn)) + if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn)) continue; if (residual_len >= entry_size) { char traddr[NVMF_TRADDR_SIZE]; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index bc99c700a583..05b98f25d65c 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -468,8 +468,7 @@ extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; extern u64 nvmet_ana_chgcnt; extern struct rw_semaphore nvmet_ana_sem; -bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, - const char *hostnqn); +bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn); int nvmet_bdev_ns_enable(struct nvmet_ns *ns); int nvmet_file_ns_enable(struct nvmet_ns *ns); -- cgit From b662a078576e7d6e235b4e1b94863f0474cd8555 Mon Sep 17 00:00:00 2001 From: Jay Sternberg Date: Mon, 12 Nov 2018 
13:56:40 -0800 Subject: nvmet: enable Discovery Controller AENs Add functions to find connections requesting Discovery Change events and send a notification to hosts that maintain an explicit persistent connection and have an active Asynchronous Event Request pending. Only hosts that have access to the subsystem affected by the change will receive notification of the Discovery Change event. Call these functions each time there is a configfs change that affects the Discovery Log Pages. Set the OAES field in the Identify Controller response to advertise support for Asynchronous Event Notifications. Signed-off-by: Jay Sternberg Reviewed-by: Phil Cayton Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/configfs.c | 30 +++++++++++++----- drivers/nvme/target/core.c | 2 +- drivers/nvme/target/discovery.c | 68 +++++++++++++++++++++++++++++++++++++++-- drivers/nvme/target/nvmet.h | 11 ++++++- 4 files changed, 99 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index d895579b6c5d..d37fd7713bbc 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -25,6 +25,9 @@ static const struct config_item_type nvmet_host_type; static const struct config_item_type nvmet_subsys_type; +static LIST_HEAD(nvmet_ports_list); +struct list_head *nvmet_ports = &nvmet_ports_list; + static const struct nvmet_transport_name { u8 type; const char *name; @@ -646,7 +649,8 @@ static int nvmet_port_subsys_allow_link(struct config_item *parent, } list_add_tail(&link->entry, &port->subsystems); - nvmet_genctr++; + nvmet_port_disc_changed(port, subsys); + up_write(&nvmet_config_sem); return 0; @@ -673,7 +677,8 @@ static void nvmet_port_subsys_drop_link(struct config_item *parent, found: list_del(&p->entry); - nvmet_genctr++; + nvmet_port_disc_changed(port, subsys); + if (list_empty(&port->subsystems)) nvmet_disable_port(port); up_write(&nvmet_config_sem); @@ -722,7 +727,8 @@ static int nvmet_allowed_hosts_allow_link(struct config_item *parent, goto out_free_link; } list_add_tail(&link->entry, &subsys->hosts); - nvmet_genctr++; + nvmet_subsys_disc_changed(subsys, host); + up_write(&nvmet_config_sem); return 0; out_free_link: @@ -748,7 +754,8 @@ static void nvmet_allowed_hosts_drop_link(struct config_item *parent, found: list_del(&p->entry); - nvmet_genctr++; + nvmet_subsys_disc_changed(subsys, host); + up_write(&nvmet_config_sem); kfree(p); } @@ -787,7 +794,11 @@ static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item, goto out_unlock; } - subsys->allow_any_host = allow_any_host; + if (subsys->allow_any_host != allow_any_host) { + subsys->allow_any_host = allow_any_host; + nvmet_subsys_disc_changed(subsys, NULL); + } + out_unlock: up_write(&nvmet_config_sem); return ret ?
ret : count; @@ -936,7 +947,7 @@ static ssize_t nvmet_referral_enable_store(struct config_item *item, if (enable) nvmet_referral_enable(parent, port); else - nvmet_referral_disable(port); + nvmet_referral_disable(parent, port); return count; inval: @@ -962,9 +973,10 @@ static struct configfs_attribute *nvmet_referral_attrs[] = { static void nvmet_referral_release(struct config_item *item) { + struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); struct nvmet_port *port = to_nvmet_port(item); - nvmet_referral_disable(port); + nvmet_referral_disable(parent, port); kfree(port); } @@ -1137,6 +1149,8 @@ static void nvmet_port_release(struct config_item *item) { struct nvmet_port *port = to_nvmet_port(item); + list_del(&port->global_entry); + kfree(port->ana_state); kfree(port); } @@ -1189,6 +1203,8 @@ static struct config_group *nvmet_ports_make(struct config_group *group, port->ana_state[i] = NVME_ANA_INACCESSIBLE; } + list_add(&port->global_entry, &nvmet_ports_list); + INIT_LIST_HEAD(&port->entry); INIT_LIST_HEAD(&port->subsystems); INIT_LIST_HEAD(&port->referrals); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 849080b4115c..5aa5a3cc5395 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -130,7 +130,7 @@ static void nvmet_async_event_work(struct work_struct *work) } } -static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, u8 event_info, u8 log_page) { struct nvmet_async_event *aen; diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 5fbb1bb2c4fc..eb45ebc3ac2c 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -20,24 +20,82 @@ struct nvmet_subsys *nvmet_disc_subsys; u64 nvmet_genctr; +static void __nvmet_disc_changed(struct nvmet_port *port, + struct nvmet_ctrl *ctrl) +{ + if (ctrl->port != port) + return; + + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_DISC_CHANGE)) + return; + + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, + NVME_AER_NOTICE_DISC_CHANGED, NVME_LOG_DISC); +} + +void nvmet_port_disc_changed(struct nvmet_port *port, + struct nvmet_subsys *subsys) +{ + struct nvmet_ctrl *ctrl; + + nvmet_genctr++; + + list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) { + if (subsys && !nvmet_host_allowed(subsys, ctrl->hostnqn)) + continue; + + __nvmet_disc_changed(port, ctrl); + } +} + +static void __nvmet_subsys_disc_changed(struct nvmet_port *port, + struct nvmet_subsys *subsys, + struct nvmet_host *host) +{ + struct nvmet_ctrl *ctrl; + + list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) { + if (host && strcmp(nvmet_host_name(host), ctrl->hostnqn)) + continue; + + __nvmet_disc_changed(port, ctrl); + } +} + +void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, + struct nvmet_host *host) +{ + struct nvmet_port *port; + struct nvmet_subsys_link *s; + + nvmet_genctr++; + + list_for_each_entry(port, nvmet_ports, global_entry) + list_for_each_entry(s, &port->subsystems, entry) { + if (s->subsys != subsys) + continue; + __nvmet_subsys_disc_changed(port, subsys, host); + } +} + void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port) { down_write(&nvmet_config_sem); if (list_empty(&port->entry)) { list_add_tail(&port->entry, &parent->referrals); port->enabled = true; - nvmet_genctr++; + nvmet_port_disc_changed(parent, NULL); } up_write(&nvmet_config_sem); } -void nvmet_referral_disable(struct nvmet_port *port) 
+void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port) { down_write(&nvmet_config_sem); if (!list_empty(&port->entry)) { port->enabled = false; list_del_init(&port->entry); - nvmet_genctr++; + nvmet_port_disc_changed(parent, NULL); } up_write(&nvmet_config_sem); } @@ -136,6 +194,8 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) hdr->numrec = cpu_to_le64(numrec); hdr->recfmt = cpu_to_le16(0); + nvmet_clear_aen_bit(req, NVME_AEN_BIT_DISC_CHANGE); + up_read(&nvmet_config_sem); status = nvmet_copy_to_sgl(req, 0, hdr, data_len); @@ -174,6 +234,8 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); + id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL); + strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 05b98f25d65c..31474940e373 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -139,6 +139,7 @@ struct nvmet_port { struct list_head subsystems; struct config_group referrals_group; struct list_head referrals; + struct list_head global_entry; struct config_group ana_groups_group; struct nvmet_ana_group ana_default_group; enum nvme_ana_state *ana_state; @@ -422,7 +423,7 @@ int nvmet_enable_port(struct nvmet_port *port); void nvmet_disable_port(struct nvmet_port *port); void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port); -void nvmet_referral_disable(struct nvmet_port *port); +void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port); u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, size_t len); @@ -432,6 +433,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len); u32 nvmet_get_log_page_len(struct nvme_command *cmd); +extern struct list_head *nvmet_ports; +void nvmet_port_disc_changed(struct nvmet_port *port, + struct nvmet_subsys *subsys); +void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, + struct nvmet_host *host); +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, + u8 event_info, u8 log_page); + #define NVMET_QUEUE_SIZE 1024 #define NVMET_NR_QUEUES 128 #define NVMET_MAX_CMD NVMET_QUEUE_SIZE -- cgit From 03198c4d9fc8e72ba8a5ad74959b61de7f2780a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Nov 2018 16:46:23 +0100 Subject: nvmet: mark nvmet_genctr static Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/target/discovery.c | 2 +- drivers/nvme/target/nvmet.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index eb45ebc3ac2c..4d8757ae8210 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -18,7 +18,7 @@ struct nvmet_subsys *nvmet_disc_subsys; -u64 nvmet_genctr; +static u64 nvmet_genctr; static void __nvmet_disc_changed(struct nvmet_port *port, struct nvmet_ctrl *ctrl) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 31474940e373..03988fe9d915 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -470,7 +470,6 @@ int __init nvmet_init_discovery(void); void nvmet_exit_discovery(void); extern struct nvmet_subsys *nvmet_disc_subsys; -extern u64 nvmet_genctr; extern struct rw_semaphore nvmet_config_sem; extern u32 
nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; -- cgit From 6e2e312ea7ff73acfafaa5c9851e151e9483c761 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 14 Nov 2018 15:57:46 -0800 Subject: nvmet-fc: remove the IN_ISR deferred scheduling options All target lldd's call the cmd receive and op completions in non-isr thread contexts. As such the IN_ISR options are not necessary. Remove the functionality and flags, which also removes cpu assignments to queues. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 66 ++---------------------------------------------- 1 file changed, 2 insertions(+), 64 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 409081a03b24..f98f5c5bea26 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -86,8 +86,6 @@ struct nvmet_fc_fcp_iod { spinlock_t flock; struct nvmet_req req; - struct work_struct work; - struct work_struct done_work; struct work_struct defer_work; struct nvmet_fc_tgtport *tgtport; @@ -134,7 +132,6 @@ struct nvmet_fc_tgt_queue { u16 sqsize; u16 ersp_ratio; __le16 sqhd; - int cpu; atomic_t connected; atomic_t sqtail; atomic_t zrspcnt; @@ -232,8 +229,6 @@ static LIST_HEAD(nvmet_fc_portentry_list); static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work); -static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work); -static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work); static void nvmet_fc_fcp_rqst_op_defer_work(struct work_struct *work); static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc); static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc); @@ -438,8 +433,6 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport, int i; for (i = 0; i < queue->sqsize; fod++, i++) { - INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work); - INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work); INIT_WORK(&fod->defer_work, nvmet_fc_fcp_rqst_op_defer_work); fod->tgtport = tgtport; fod->queue = queue; @@ -517,10 +510,7 @@ nvmet_fc_queue_fcp_req(struct nvmet_fc_tgtport *tgtport, fcpreq->hwqid = queue->qid ? ((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0; - if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR) - queue_work_on(queue->cpu, queue->work_q, &fod->work); - else - nvmet_fc_handle_fcp_rqst(tgtport, fod); + nvmet_fc_handle_fcp_rqst(tgtport, fod); } static void @@ -599,30 +589,6 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue, queue_work(queue->work_q, &fod->defer_work); } -static int -nvmet_fc_queue_to_cpu(struct nvmet_fc_tgtport *tgtport, int qid) -{ - int cpu, idx, cnt; - - if (tgtport->ops->max_hw_queues == 1) - return WORK_CPU_UNBOUND; - - /* Simple cpu selection based on qid modulo active cpu count */ - idx = !qid ? 
0 : (qid - 1) % num_active_cpus(); - - /* find the n'th active cpu */ - for (cpu = 0, cnt = 0; ; ) { - if (cpu_active(cpu)) { - if (cnt == idx) - break; - cnt++; - } - cpu = (cpu + 1) % num_possible_cpus(); - } - - return cpu; -} - static struct nvmet_fc_tgt_queue * nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, u16 qid, u16 sqsize) @@ -653,7 +619,6 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, queue->qid = qid; queue->sqsize = sqsize; queue->assoc = assoc; - queue->cpu = nvmet_fc_queue_to_cpu(assoc->tgtport, qid); INIT_LIST_HEAD(&queue->fod_list); INIT_LIST_HEAD(&queue->avail_defer_list); INIT_LIST_HEAD(&queue->pending_cmd_list); @@ -2145,26 +2110,12 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } } -static void -nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work) -{ - struct nvmet_fc_fcp_iod *fod = - container_of(work, struct nvmet_fc_fcp_iod, done_work); - - nvmet_fc_fod_op_done(fod); -} - static void nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq) { struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; - struct nvmet_fc_tgt_queue *queue = fod->queue; - if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR) - /* context switch so completion is not in ISR context */ - queue_work_on(queue->cpu, queue->work_q, &fod->done_work); - else - nvmet_fc_fod_op_done(fod); + nvmet_fc_fod_op_done(fod); } /* @@ -2332,19 +2283,6 @@ transport_error: nvmet_fc_abort_op(tgtport, fod); } -/* - * Actual processing routine for received FC-NVME LS Requests from the LLD - */ -static void -nvmet_fc_handle_fcp_rqst_work(struct work_struct *work) -{ - struct nvmet_fc_fcp_iod *fod = - container_of(work, struct nvmet_fc_fcp_iod, work); - struct nvmet_fc_tgtport *tgtport = fod->tgtport; - - nvmet_fc_handle_fcp_rqst(tgtport, fod); -} - /** * nvmet_fc_rcv_fcp_req - transport entry point called by an LLDD * upon the reception of a NVME FCP CMD IU. -- cgit From e6a622fd6d66b83779357e3400f487fc159a7d83 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 19 Nov 2018 14:11:12 -0800 Subject: nvmet: support fabrics sq flow control Technical proposal 8005 "fabrics SQ flow control" introduces a mode where a host and controller agree to omit sq_head pointer updates when sending nvme completions. In case the host indicated desire to operate in this mode (connect attribute) the controller will return back a connect completion with sq_head value of 0xffff as indication that it will omit sq_head pointer updates. This mode saves us an atomic update in the I/O path. 
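A compact userspace model of the negotiation described above (illustrative only; the struct and helper are invented): the host requests the mode via a connect-command attribute, and the controller acknowledges by returning an SQ head of 0xffff in the connect completion, after which both sides stop tracking sq_head.

#include <stdbool.h>
#include <stdint.h>

struct connect_completion {
	uint16_t sq_head;	/* 0xffff == flow control disabled */
};

static bool sq_flow_control_disabled(bool host_requested,
				     const struct connect_completion *cqe)
{
	return host_requested && cqe->sq_head == 0xffff;
}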
Reviewed-by: Hannes Reinecke [hch: suggested better implementation] Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 23 +++++++++++++---------- drivers/nvme/target/fabrics-cmd.c | 6 ++++++ drivers/nvme/target/nvmet.h | 1 + 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 5aa5a3cc5395..2df70010e9f2 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -597,26 +597,28 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) return ns; } -static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +static void nvmet_update_sq_head(struct nvmet_req *req) { - u32 old_sqhd, new_sqhd; - u16 sqhd; - - if (status) - nvmet_set_status(req, status); - if (req->sq->size) { + u32 old_sqhd, new_sqhd; + do { old_sqhd = req->sq->sqhd; new_sqhd = (old_sqhd + 1) % req->sq->size; } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) != old_sqhd); } - sqhd = req->sq->sqhd & 0x0000FFFF; - req->rsp->sq_head = cpu_to_le16(sqhd); + req->rsp->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF); +} + +static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + if (!req->sq->sqhd_disabled) + nvmet_update_sq_head(req); req->rsp->sq_id = cpu_to_le16(req->sq->qid); req->rsp->command_id = req->cmd->common.command_id; - + if (status) + nvmet_set_status(req, status); if (req->ns) nvmet_put_namespace(req->ns); req->ops->queue_response(req); @@ -765,6 +767,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->sg_cnt = 0; req->transfer_len = 0; req->rsp->status = 0; + req->rsp->sq_head = 0; req->ns = NULL; /* no support for fused commands yet */ diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index d84ae004cb85..328ae46d8344 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -115,6 +115,12 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) /* note: convert queue size from 0's-based value to 1's-based value */ nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); + + if (c->cattr & NVME_CONNECT_DISABLE_SQFLOW) { + req->sq->sqhd_disabled = true; + req->rsp->sq_head = cpu_to_le16(0xffff); + } + return 0; } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 03988fe9d915..547108c41ce9 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -106,6 +106,7 @@ struct nvmet_sq { u16 qid; u16 size; u32 sqhd; + bool sqhd_disabled; struct completion free_done; struct completion confirm_done; }; -- cgit From 0445e1b5a2fed4612b7f72d9a56889c026b60aa9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 19 Nov 2018 14:11:13 -0800 Subject: nvmet: don't override treq upon modification. Only override the allowed parts of it. 
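The fix is an ordinary read-modify-write under a bit mask: only the secure-channel bits of treq are replaced, so any other flags (such as the SQ flow control bit added just below) survive a configfs store. A toy version, assuming the mask covers just the secure-channel requirement bits:

static unsigned char treq_set_secure_channel(unsigned char treq,
					     unsigned char val,
					     unsigned char mask)
{
	treq &= ~mask;		/* preserve flags outside the mask */
	treq |= val & mask;	/* update only the allowed part */
	return treq;
}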
Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg [hch: slight tweak to the NVME_TREQ_SECURE_CHANNEL_MASK definition] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/configfs.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index d37fd7713bbc..260a401db01c 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -153,7 +153,8 @@ CONFIGFS_ATTR(nvmet_, addr_traddr); static ssize_t nvmet_addr_treq_show(struct config_item *item, char *page) { - switch (to_nvmet_port(item)->disc_addr.treq) { + switch (to_nvmet_port(item)->disc_addr.treq & + NVME_TREQ_SECURE_CHANNEL_MASK) { case NVMF_TREQ_NOT_SPECIFIED: return sprintf(page, "not specified\n"); case NVMF_TREQ_REQUIRED: @@ -169,6 +170,7 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item, const char *page, size_t count) { struct nvmet_port *port = to_nvmet_port(item); + u8 treq = port->disc_addr.treq & ~NVME_TREQ_SECURE_CHANNEL_MASK; if (port->enabled) { pr_err("Cannot modify address while enabled\n"); @@ -177,15 +179,16 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item, } if (sysfs_streq(page, "not specified")) { - port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED; + treq |= NVMF_TREQ_NOT_SPECIFIED; } else if (sysfs_streq(page, "required")) { - port->disc_addr.treq = NVMF_TREQ_REQUIRED; + treq |= NVMF_TREQ_REQUIRED; } else if (sysfs_streq(page, "not required")) { - port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED; + treq |= NVMF_TREQ_NOT_REQUIRED; } else { pr_err("Invalid value '%s' for treq\n", page); return -EINVAL; } + port->disc_addr.treq = treq; return count; } -- cgit From 9b95d2fb857f242aacbf4e205656818b0ef067e1 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 20 Nov 2018 10:34:19 +0100 Subject: nvmet: expose support for fabrics SQ flow control disable in treq Technical Proposal 8005 introduces an indication for SQ flow control disable support. Expose it since we are able to operate in this mode. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/configfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 260a401db01c..db2cb64be7ba 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1214,6 +1214,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, port->inline_data_size = -1; /* < 0 == let the transport choose */ port->disc_addr.portid = cpu_to_le16(portid); + port->disc_addr.treq = NVMF_TREQ_DISABLE_SQFLOW; config_group_init_type_name(&port->group, name, &nvmet_port_type); config_group_init_type_name(&port->subsys_group, -- cgit From 8154ed730bc64f68bc28feb20e641c2e8a0eeba5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 19 Nov 2018 14:11:15 -0800 Subject: nvme: disable fabrics SQ flow control when asked by the user For now, we don't care about sq_head pointer updates anyway, so allow the controller to micro-optimize by omitting this update. Note that we will probably need to support it when a controller that requires this comes along.
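The "disable_sqflow" token matches the opt_tokens entry added in the diff below; a hedged sketch of how a connect string carrying it reaches the kernel through /dev/nvme-fabrics (the transport address and NQN are placeholders, and nvme-cli normally builds this string for you):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *opts =
		"transport=rdma,traddr=192.168.1.10,trsvcid=4420,"
		"nqn=nqn.2018-11.io.example:subsys1,disable_sqflow";
	int fd = open("/dev/nvme-fabrics", O_RDWR);

	if (fd < 0 || write(fd, opts, strlen(opts)) < 0) {
		perror("nvme-fabrics connect");
		return 1;
	}
	close(fd);
	return 0;
}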
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fabrics.c | 13 ++++++++++++- drivers/nvme/host/fabrics.h | 2 ++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index bd0969db6225..10074ac7731b 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -392,6 +392,9 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 : cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000); + if (ctrl->opts->disable_sqflow) + cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -451,6 +454,9 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) cmd.connect.qid = cpu_to_le16(qid); cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + if (ctrl->opts->disable_sqflow) + cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -607,6 +613,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, { NVMF_OPT_HOST_ID, "hostid=%s" }, { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, + { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" }, { NVMF_OPT_ERR, NULL } }; @@ -817,6 +824,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, case NVMF_OPT_DUP_CONNECT: opts->duplicate_connect = true; break; + case NVMF_OPT_DISABLE_SQFLOW: + opts->disable_sqflow = true; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -933,7 +943,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ - NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT) + NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\ + NVMF_OPT_DISABLE_SQFLOW) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 6ea6275f332a..ecd9a006a091 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -58,6 +58,7 @@ enum { NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, NVMF_OPT_HOST_ID = 1 << 12, NVMF_OPT_DUP_CONNECT = 1 << 13, + NVMF_OPT_DISABLE_SQFLOW = 1 << 14, }; /** @@ -101,6 +102,7 @@ struct nvmf_ctrl_options { unsigned int kato; struct nvmf_host *host; int max_reconnects; + bool disable_sqflow; }; /* -- cgit From 5c4072ad1c151c65c3d60f95536786042cd49e29 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Mon, 19 Nov 2018 10:58:52 +0000 Subject: nvme: Remove unused forward declaration Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 48ffb1d685c2..71d2a89bbd1d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -97,7 +97,6 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; -static void nvme_ns_remove(struct nvme_ns *ns); static int nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, -- cgit From 
ad1f824948e4ed886529219cf7cd717d078c630d Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Mon, 19 Nov 2018 10:58:51 +0000 Subject: nvmet-rdma: Add unlikely for response allocated check Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3f7971d3706d..932242e27b03 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -196,7 +196,7 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) { unsigned long flags; - if (rsp->allocated) { + if (unlikely(rsp->allocated)) { kfree(rsp); return; } -- cgit From cb019da3dabf60d792f76a913540815f06abb1d7 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 19 Nov 2018 13:35:30 -0800 Subject: nvmet: use unlikely for req status check This patch adds unlikely in the nvmet request completion path for the status check in the low level function __nvmet_req_complete. This is helpful in the scenario where the host and target connection is working smoothly. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 2df70010e9f2..e468100b9211 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -617,7 +617,7 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) nvmet_update_sq_head(req); req->rsp->sq_id = cpu_to_le16(req->sq->qid); req->rsp->command_id = req->cmd->common.command_id; - if (status) + if (unlikely(status)) nvmet_set_status(req, status); if (req->ns) nvmet_put_namespace(req->ns); -- cgit From 5a3a6d6965865d0da5c743a0f9c58f84373f88e7 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 19 Nov 2018 15:16:39 -0800 Subject: nvmet: fix the structure member indentation This is a cleanup patch which fixes the structure member indentation introduced by the P2P commit c6925093d0b2 ("nvmet: Optionally use PCI P2P memory"). We don't change any functionality in this patch. This is needed so that any future members will also follow the uniform indentation.
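Both annotations above boil down to the same primitive: unlikely() expands to __builtin_expect(), a pure compiler hint that keeps the rarely taken branch out of the straight-line hot path. A minimal stand-alone version of the macros, as they are conventionally written:

#include <stdio.h>

#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

static int complete(int status)
{
        if (unlikely(status)) {          /* error handling moved off the hot path */
                fprintf(stderr, "error %d\n", status);
                return -1;
        }
        return 0;                        /* the common, fall-through case */
}

int main(void)
{
        return complete(0);
}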
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Acked-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/nvmet.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 547108c41ce9..7d8b7a7d572a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -200,8 +200,8 @@ struct nvmet_ctrl { char subsysnqn[NVMF_NQN_FIELD_LEN]; char hostnqn[NVMF_NQN_FIELD_LEN]; - struct device *p2p_client; - struct radix_tree_root p2p_ns_map; + struct device *p2p_client; + struct radix_tree_root p2p_ns_map; }; struct nvmet_subsys { @@ -314,8 +314,8 @@ struct nvmet_req { void (*execute)(struct nvmet_req *req); const struct nvmet_fabrics_ops *ops; - struct pci_dev *p2p_dev; - struct device *p2p_client; + struct pci_dev *p2p_dev; + struct device *p2p_client; }; extern struct workqueue_struct *buffered_io_wq; -- cgit From 49cd84b6f8b677ef45731ed56ddb802cdbb94c9e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 27 Nov 2018 09:40:57 -0700 Subject: nvme: implement Enhanced Command Retry A controller may have an internal state that is not able to successfully process commands for a short duration. In such states, an immediate command requeue is expected to fail. The driver may exceed its max retry count, which permanently ends the command in failure when the same command would succeed after waiting for the controller to be ready. NVMe ratified TP 4033 provides a delay hint in the completion status code for failed commands. Implement the retry delay based on the command completion status and the controller's requested delay. Note that requeued commands are handled per request_queue, not per individual request. If multiple commands fail, the controller should consistently report the desired delay time for retryable commands in all CQEs, otherwise the requeue list may be kicked too soon. 
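As a worked example of the decoding the patch below implements: CRD is a two-bit field in the completion status (bits 12:11), and a non-zero value selects one of three Command Retry Delay times the controller advertised, each in units of 100 milliseconds. A sketch of the arithmetic with invented CRDT values:

#include <stdint.h>
#include <stdio.h>

#define SC_CRD 0x1800                            /* status bits 12:11 */

int main(void)
{
        uint16_t crdt[3] = { 1, 10, 100 };       /* illustrative, in 100 ms units */
        uint16_t status = 0x0800;                /* a retryable status with CRD = 1 */
        uint16_t crd = (status & SC_CRD) >> 11;
        unsigned long delay_ms = crd ? crdt[crd - 1] * 100 : 0;

        printf("CRD %u -> requeue after %lu ms\n", crd, delay_ms); /* 100 ms */
        return 0;
}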
Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- drivers/nvme/host/nvme.h | 1 + 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 71d2a89bbd1d..f90576862736 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -244,6 +244,22 @@ static inline bool nvme_req_needs_retry(struct request *req) return true; } +static void nvme_retry_req(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + unsigned long delay = 0; + u16 crd; + + /* The mask and shift result must be <= 3 */ + crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11; + if (ns && crd) + delay = ns->ctrl->crdt[crd - 1] * 100; + + nvme_req(req)->retries++; + blk_mq_requeue_request(req, false); + blk_mq_delay_kick_requeue_list(req->q, delay); +} + void nvme_complete_rq(struct request *req) { blk_status_t status = nvme_error_status(req); @@ -261,8 +277,7 @@ void nvme_complete_rq(struct request *req) } if (!blk_queue_dying(req->q)) { - nvme_req(req)->retries++; - blk_mq_requeue_request(req, true); + nvme_retry_req(req); return; } } @@ -1883,6 +1898,26 @@ static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) return ret; } +static int nvme_configure_acre(struct nvme_ctrl *ctrl) +{ + struct nvme_feat_host_behavior *host; + int ret; + + /* Don't bother enabling the feature if retry delay is not reported */ + if (!ctrl->crdt[0]) + return 0; + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return 0; + + host->acre = NVME_ENABLE_ACRE; + ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, + host, sizeof(*host), NULL); + kfree(host); + return ret; +} + static int nvme_configure_apst(struct nvme_ctrl *ctrl) { /* @@ -2404,6 +2439,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; } + ctrl->crdt[0] = le16_to_cpu(id->crdt1); + ctrl->crdt[1] = le16_to_cpu(id->crdt2); + ctrl->crdt[2] = le16_to_cpu(id->crdt3); + ctrl->oacs = le16_to_cpu(id->oacs); ctrl->oncs = le16_to_cpup(&id->oncs); ctrl->oaes = le32_to_cpu(id->oaes); @@ -2504,6 +2543,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (ret < 0) return ret; + ret = nvme_configure_acre(ctrl); + if (ret < 0) + return ret; + ctrl->identified = true; return 0; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f2594d468f29..79e621f5b326 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -181,6 +181,7 @@ struct nvme_ctrl { u32 page_size; u32 max_hw_sectors; u32 max_segments; + u16 crdt[3]; u16 oncs; u16 oacs; u16 nssa; -- cgit From 3236b458c475524d0735f6dd0bd250478434c7b1 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Mon, 3 Dec 2018 15:50:05 +0000 Subject: nvme: remove unused function nvme_ctrl_ready Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/nvme.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 79e621f5b326..8e0ec365ce8d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -368,15 +368,6 @@ static inline void nvme_fault_inject_fini(struct nvme_ns *ns) {} static inline void nvme_should_fail(struct request *req) {} #endif -static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) -{ - u32 val = 0; - 
- if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) - return false; - return val & NVME_CSTS_RDY; -} - static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) { if (!ctrl->subsystem) -- cgit From 80a787ba3809deb694ee632919badb73890daf05 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 6 Dec 2018 11:41:16 -0500 Subject: dm: don't rewrite dm_disk(md)->part0.in_flight generic_start_io_acct and generic_end_io_acct already update the variable in_flight using atomic operations, so we don't have to overwrite them again. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ab72d79775ee..33100e536c4e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -663,8 +663,7 @@ static void start_io_acct(struct dm_io *io) generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), &dm_disk(md)->part0); - atomic_set(&dm_disk(md)->part0.in_flight[rw], - atomic_inc_return(&md->pending[rw])); + atomic_inc(&md->pending[rw]); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), @@ -693,7 +692,6 @@ static void end_io_acct(struct dm_io *io) * a flush. */ pending = atomic_dec_return(&md->pending[rw]); - atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); pending += atomic_read(&md->pending[rw^0x1]); /* nudge anyone waiting on suspend queue */ -- cgit From dbd3bbd291a03f1383de6242e7999e8487cb869b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 6 Dec 2018 11:41:17 -0500 Subject: dm rq: leverage blk_mq_queue_busy() to check for outstanding IO Now that request-based dm-multipath only supports blk-mq, make use of the newly introduced blk_mq_queue_busy() to check for outstanding IO -- rather than (ab)using the block core's in_flight counters. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm-rq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 1f1fe9a618ea..d2397d8fcbd1 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -130,11 +130,11 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { - atomic_dec(&md->pending[rw]); - /* nudge anyone waiting on suspend queue */ - if (!md_in_flight(md)) - wake_up(&md->wait); + if (unlikely(waitqueue_active(&md->wait))) { + if (!blk_mq_queue_busy(md->queue)) + wake_up(&md->wait); + } /* * dm_put() must be at the end of this function. See the comment above @@ -436,7 +436,6 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, static void dm_start_request(struct mapped_device *md, struct request *orig) { blk_mq_start_request(orig); - atomic_inc(&md->pending[rq_data_dir(orig)]); if (unlikely(dm_stats_used(&md->stats))) { struct dm_rq_target_io *tio = tio_from_request(orig); -- cgit From 112f158f66cbe25fd561a5dfe9c3826e06abf757 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 6 Dec 2018 11:41:18 -0500 Subject: block: stop passing 'cpu' to all percpu stats methods All of part_stat_* and related methods are used with preempt disabled, so there is no need to pass cpu around to all of them. Just call smp_processor_id() as needed.
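This works because part_stat_lock() disables preemption, so the current CPU cannot change before the matching unlock and each helper can look it up internally. A user-space sketch of per-CPU-style counters with that shape; the CPU lookup is a fixed stand-in here, and the read side shows the usual fold across all CPUs:

#include <stdio.h>

#define NR_CPUS 4

struct disk_stats { unsigned long ios; };

static struct disk_stats per_cpu_stats[NR_CPUS];

/* Stand-in for smp_processor_id(); a real per-CPU id in the kernel, only
 * stable while preemption is disabled. */
static int processor_id(void) { return 0; }

/* After the cleanup: no cpu argument, the helper derives it itself. */
static void stat_inc_ios(void)
{
        per_cpu_stats[processor_id()].ios++;
}

static unsigned long stat_read_ios(void)
{
        unsigned long sum = 0;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)  /* fold every CPU's slot */
                sum += per_cpu_stats[cpu].ios;
        return sum;
}

int main(void)
{
        stat_inc_ios();
        stat_inc_ios();
        printf("ios = %lu\n", stat_read_ios()); /* 2 */
        return 0;
}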
Suggested-by: Jens Axboe Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/md.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index fc488cb30a94..9a0a1e0934d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -334,7 +334,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = q->queuedata; unsigned int sectors; - int cpu; blk_queue_split(q, &bio); @@ -359,9 +358,9 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) md_handle_request(mddev, bio); - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors); + part_stat_lock(); + part_stat_inc(&mddev->gendisk->part0, ios[sgrp]); + part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors); part_stat_unlock(); return BLK_QC_T_NONE; -- cgit From 6f75723190d88e1319bea623bfe0292bf3917965 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 6 Dec 2018 11:41:22 -0500 Subject: dm: remove the pending IO accounting Remove the "pending" atomic counters, that duplicate block-core's in_flight counters, and update md_in_flight() to look at percpu in_flight counters. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm-core.h | 2 -- drivers/md/dm.c | 34 +++++++++++++++------------------- 2 files changed, 15 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 224d44503a06..6fe883fac471 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -65,7 +65,6 @@ struct mapped_device { */ struct work_struct work; wait_queue_head_t wait; - atomic_t pending[2]; spinlock_t deferred_lock; struct bio_list deferred; @@ -119,7 +118,6 @@ struct mapped_device { struct srcu_struct io_barrier; }; -int md_in_flight(struct mapped_device *md); void disable_write_same(struct mapped_device *md); void disable_write_zeroes(struct mapped_device *md); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 33100e536c4e..70568f8b6c53 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -646,25 +646,30 @@ static void free_tio(struct dm_target_io *tio) bio_put(&tio->clone); } -int md_in_flight(struct mapped_device *md) +static bool md_in_flight(struct mapped_device *md) { - return atomic_read(&md->pending[READ]) + - atomic_read(&md->pending[WRITE]); + int cpu; + struct hd_struct *part = &dm_disk(md)->part0; + + for_each_possible_cpu(cpu) { + if (part_stat_local_read_cpu(part, in_flight[0], cpu) || + part_stat_local_read_cpu(part, in_flight[1], cpu)) + return true; + } + + return false; } static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; - int rw = bio_data_dir(bio); io->start_time = jiffies; generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), &dm_disk(md)->part0); - atomic_inc(&md->pending[rw]); - if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), bio->bi_iter.bi_sector, bio_sectors(bio), @@ -676,8 +681,6 @@ static void end_io_acct(struct dm_io *io) struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; unsigned long duration = jiffies - io->start_time; - int pending; - int rw = bio_data_dir(bio); generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, io->start_time); @@ -687,16 +690,11 @@ static void 
end_io_acct(struct dm_io *io) bio->bi_iter.bi_sector, bio_sectors(bio), true, duration, &io->stats_aux); - /* - * After this is decremented the bio must not be touched if it is - * a flush. - */ - pending = atomic_dec_return(&md->pending[rw]); - pending += atomic_read(&md->pending[rw^0x1]); - /* nudge anyone waiting on suspend queue */ - if (!pending) - wake_up(&md->wait); + if (unlikely(waitqueue_active(&md->wait))) { + if (!md_in_flight(md)) + wake_up(&md->wait); + } } /* @@ -1915,8 +1913,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->disk) goto bad; - atomic_set(&md->pending[0], 0); - atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); -- cgit From e4025e46f093d4549d3043c2c54d444cec480d2b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Dec 2018 22:34:39 +0100 Subject: mtip32xx: avoid using semaphores The "cmd_slot_unal" semaphore is never used in a blocking way but only as an atomic counter. Change the code to using atomic_dec_if_positive() as a better API. Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 6 +++--- drivers/block/mtip32xx/mtip32xx.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 2b0ac9d01e51..d7dc307d0cc8 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2680,7 +2680,7 @@ static void mtip_softirq_done_fn(struct request *rq) cmd->direction); if (unlikely(cmd->unaligned)) - up(&dd->port->cmd_slot_unal); + atomic_inc(&dd->port->cmd_slot_unal); blk_mq_end_request(rq, cmd->status); } @@ -2990,7 +2990,7 @@ static int mtip_hw_init(struct driver_data *dd) else dd->unal_qdepth = 0; - sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth); + atomic_set(&dd->port->cmd_slot_unal, dd->unal_qdepth); /* Spinlock to prevent concurrent issue */ for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) @@ -3533,7 +3533,7 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, cmd->unaligned = 1; } - if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal)) + if (cmd->unaligned && atomic_dec_if_positive(&dd->port->cmd_slot_unal) >= 0) return true; return false; diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index c33f8c3d9fb4..abce25f27f57 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -433,8 +433,8 @@ struct mtip_port { */ unsigned long ic_pause_timer; - /* Semaphore to control queue depth of unaligned IOs */ - struct semaphore cmd_slot_unal; + /* Counter to control queue depth of unaligned IOs */ + atomic_t cmd_slot_unal; /* Spinlock for working around command-issue bug. */ spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS]; -- cgit From 4ba09f69e20d0a768d91277847ddbd31f476590e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Dec 2018 14:45:19 -0700 Subject: mtip32xx: use BLK_STS_DEV_RESOURCE for device resources For cases where we can only fail with IO in-flight, we should be using BLK_STS_DEV_RESOURCE instead of BLK_STS_RESOURCE. The latter refers to system wide resource constraints. 
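atomic_dec_if_positive() is what makes the semaphore replaceable in the mtip32xx patch above: it never takes the counter below zero and returns the old value minus one either way, so a negative return means no slot was available. A C11 sketch of those semantics; this mirrors, but is not, the kernel's implementation:

#include <stdatomic.h>
#include <stdio.h>

/* Decrement v unless the result would go negative; return old - 1 in both
 * cases, so the caller can distinguish success (>= 0) from exhaustion (< 0). */
static int dec_if_positive(_Atomic int *v)
{
        int old = atomic_load(v);

        do {
                if (old <= 0)
                        return old - 1;  /* would underflow: leave v untouched */
        } while (!atomic_compare_exchange_weak(v, &old, old - 1));

        return old - 1;
}

int main(void)
{
        _Atomic int slots = 1;

        printf("%d\n", dec_if_positive(&slots)); /*  0: took the last slot */
        printf("%d\n", dec_if_positive(&slots)); /* -1: pool exhausted */
        atomic_fetch_add(&slots, 1);             /* release, like up() */
        return 0;
}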
Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index d7dc307d0cc8..88e8440e75c3 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3550,7 +3550,7 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, struct mtip_cmd_sg *command_sg; if (mtip_commands_active(dd->port)) - return BLK_STS_RESOURCE; + return BLK_STS_DEV_RESOURCE; hdr->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags)) @@ -3587,7 +3587,7 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx, return mtip_issue_reserved_cmd(hctx, rq); if (unlikely(mtip_check_unal_depth(hctx, rq))) - return BLK_STS_RESOURCE; + return BLK_STS_DEV_RESOURCE; if (is_se_active(dd) || is_stopped(dd, rq)) return BLK_STS_IOERR; -- cgit From b7934ba4147a883f7a1b32c6408179274a4d6ed1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Dec 2018 15:45:53 -0700 Subject: dm: fix inflight IO check After switching to percpu inflight counters, the inflight check is totally buggy. It's perfectly valid for some counters to be non-zero while having a total inflight IO count of 0, that's how these kinds of counters work (inc on one CPU, dec on another). Fix the md_in_flight() check to sum all counters rather than potentially returning a false positive. While at it, remove the inflight read for IO completion. We don't need it, just wake anyone that's waiting for the IO count to drop to zero. The caller needs to re-check that value anyway when woken, which it does. Fixes: 6f75723190d8 ("dm: remove the pending IO accounting") Acked-by: Mike Snitzer Reported-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/dm.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 70568f8b6c53..79ad4b3d215c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -650,14 +650,14 @@ static bool md_in_flight(struct mapped_device *md) { int cpu; struct hd_struct *part = &dm_disk(md)->part0; + long sum = 0; for_each_possible_cpu(cpu) { - if (part_stat_local_read_cpu(part, in_flight[0], cpu) || - part_stat_local_read_cpu(part, in_flight[1], cpu)) - return true; + sum += part_stat_local_read_cpu(part, in_flight[0], cpu); + sum += part_stat_local_read_cpu(part, in_flight[1], cpu); } - return false; + return sum != 0; } static void start_io_acct(struct dm_io *io) @@ -691,10 +691,8 @@ static void end_io_acct(struct dm_io *io) true, duration, &io->stats_aux); /* nudge anyone waiting on suspend queue */ - if (unlikely(waitqueue_active(&md->wait))) { - if (!md_in_flight(md)) - wake_up(&md->wait); - } + if (unlikely(waitqueue_active(&md->wait))) + wake_up(&md->wait); } /* -- cgit From 6451fe73fa0f542a49bfacd7205b88a597897f58 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 9 Dec 2018 11:21:45 -0700 Subject: nvme: fix irq vs io_queue calculations Guenter reported a boot hang issue on HPPA after we default to 0 poll queues. We have two issues in the queue count calculations: 1) We don't separate the poll queues from the read/write queues. This is important, since the former doesn't need interrupts. 2) The adjust logic is broken. Adjust the poll queue count before doing nvme_calc_io_queues().
The poll queue count is only limited by the IO queue count we were able to get from the controller, not failures in the IRQ allocation loop. This leaves nvme_calc_io_queues() just adjusting the read/write queue map. Reported-by: Guenter Roeck Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 64 ++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7732c4979a4e..fb9d8270f32c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2030,60 +2030,40 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) return ret; } -static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues) +static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues) { unsigned int this_w_queues = write_queues; - unsigned int this_p_queues = poll_queues; /* * Setup read/write queue split */ - if (nr_io_queues == 1) { + if (irq_queues == 1) { dev->io_queues[HCTX_TYPE_DEFAULT] = 1; dev->io_queues[HCTX_TYPE_READ] = 0; - dev->io_queues[HCTX_TYPE_POLL] = 0; return; } - /* - * Configure number of poll queues, if set - */ - if (this_p_queues) { - /* - * We need at least one queue left. With just one queue, we'll - * have a single shared read/write set. - */ - if (this_p_queues >= nr_io_queues) { - this_w_queues = 0; - this_p_queues = nr_io_queues - 1; - } - - dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; - nr_io_queues -= this_p_queues; - } else - dev->io_queues[HCTX_TYPE_POLL] = 0; - /* * If 'write_queues' is set, ensure it leaves room for at least * one read queue */ - if (this_w_queues >= nr_io_queues) - this_w_queues = nr_io_queues - 1; + if (this_w_queues >= irq_queues) + this_w_queues = irq_queues - 1; /* * If 'write_queues' is set to zero, reads and writes will share * a queue set. */ if (!this_w_queues) { - dev->io_queues[HCTX_TYPE_DEFAULT] = nr_io_queues; + dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues; dev->io_queues[HCTX_TYPE_READ] = 0; } else { dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues; - dev->io_queues[HCTX_TYPE_READ] = nr_io_queues - this_w_queues; + dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues; } } -static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues) +static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) { struct pci_dev *pdev = to_pci_dev(dev->dev); int irq_sets[2]; @@ -2093,6 +2073,20 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues) .sets = irq_sets, }; int result = 0; + unsigned int irq_queues, this_p_queues; + + /* + * Poll queues don't need interrupts, but we need at least one IO + * queue left over for non-polled IO. + */ + this_p_queues = poll_queues; + if (this_p_queues >= nr_io_queues) { + this_p_queues = nr_io_queues - 1; + irq_queues = 1; + } else { + irq_queues = nr_io_queues - this_p_queues; + } + dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; /* * For irq sets, we have to ask for minvec == maxvec. This passes * IRQ vector needs. */ do { - nvme_calc_io_queues(dev, nr_io_queues); + nvme_calc_io_queues(dev, irq_queues); irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT]; irq_sets[1] = dev->io_queues[HCTX_TYPE_READ]; if (!irq_sets[1]) @@ -2111,11 +2105,11 @@ * 1 + 1 queues, just ask for a single vector. 
We'll share * that between the single IO queue and the admin queue. */ - if (!(result < 0 && nr_io_queues == 1)) - nr_io_queues = irq_sets[0] + irq_sets[1] + 1; + if (result >= 0 && irq_queues > 1) + irq_queues = irq_sets[0] + irq_sets[1] + 1; - result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues, - nr_io_queues, + result = pci_alloc_irq_vectors_affinity(pdev, irq_queues, + irq_queues, PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); /* @@ -2125,12 +2119,12 @@ * likely does not. Back down to ask for just one vector. */ if (result == -ENOSPC) { - nr_io_queues--; - if (!nr_io_queues) + irq_queues--; + if (!irq_queues) return result; continue; } else if (result == -EINVAL) { - nr_io_queues = 1; + irq_queues = 1; continue; } else if (result <= 0) return -EIO; -- cgit From c4576aed8d85d808cd6443bda58393d525207d01 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 11 Dec 2018 09:10:26 -0500 Subject: dm: fix request-based dm's use of dm_wait_for_completion The md->wait waitqueue is used by both bio-based and request-based DM. Commit dbd3bbd291 ("dm rq: leverage blk_mq_queue_busy() to check for outstanding IO") lost sight of the requirement that dm_wait_for_completion() must work with all types of DM devices. Fix md_in_flight() to call the blk-mq or bio-based method accordingly. Fixes: dbd3bbd291 ("dm rq: leverage blk_mq_queue_busy() to check for outstanding IO") Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm-rq.c | 6 ++---- drivers/md/dm.c | 10 +++++++++- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index d2397d8fcbd1..4e06be4f0a62 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -131,10 +131,8 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { /* nudge anyone waiting on suspend queue */ - if (unlikely(waitqueue_active(&md->wait))) { - if (!blk_mq_queue_busy(md->queue)) - wake_up(&md->wait); - } + if (unlikely(waitqueue_active(&md->wait))) + wake_up(&md->wait); /* * dm_put() must be at the end of this function. See the comment above diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 79ad4b3d215c..c414d40d645d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -646,7 +646,7 @@ static void free_tio(struct dm_target_io *tio) bio_put(&tio->clone); } -static bool md_in_flight(struct mapped_device *md) +static bool md_in_flight_bios(struct mapped_device *md) { int cpu; struct hd_struct *part = &dm_disk(md)->part0; @@ -660,6 +660,14 @@ static bool md_in_flight(struct mapped_device *md) return sum != 0; } +static bool md_in_flight(struct mapped_device *md) +{ + if (queue_is_mq(md->queue)) + return blk_mq_queue_busy(md->queue); + else + return md_in_flight_bios(md); +} + static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; -- cgit From f40a62d2674b317a263512996f9a7abbfc8178ec Mon Sep 17 00:00:00 2001 From: Zhoujie Wu Date: Tue, 11 Dec 2018 20:16:07 +0100 Subject: lightnvm: pblk: ignore the smeta oob area scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The smeta area l2p mapping is empty, and actually the recovery procedure only needs to restore the data sectors' l2p mapping. So ignore the smeta oob scan. 
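The change that follows is plain bounds arithmetic: start the out-of-band scan at the first data sector after smeta, and drop the same number of sectors from the count left to scan. A toy version with invented geometry values:

#include <stdio.h>

int main(void)
{
        unsigned long smeta_start = 0;    /* illustrative line layout */
        unsigned long smeta_sec = 8;      /* sectors holding start-of-line metadata */
        unsigned long open_secs = 1024;   /* sectors written in the open line */

        unsigned long paddr = smeta_start + smeta_sec; /* first data sector */
        unsigned long left = open_secs - smeta_sec;    /* data sectors only */

        printf("scan [%lu..%lu)\n", paddr, paddr + left); /* [8..1024) */
        return 0;
}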
Signed-off-by: Zhoujie Wu Reviewed-by: Javier González Reviewed-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-recovery.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 5740b7509bd8..0fbd30e0a587 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -334,6 +334,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, struct pblk_recov_alloc p) { struct nvm_tgt_dev *dev = pblk->dev; + struct pblk_line_meta *lm = &pblk->lm; struct nvm_geo *geo = &dev->geo; struct ppa_addr *ppa_list; struct pblk_sec_meta *meta_list; @@ -342,12 +343,12 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, void *data; dma_addr_t dma_ppa_list, dma_meta_list; __le64 *lba_list; - u64 paddr = 0; + u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; bool padded = false; int rq_ppas, rq_len; int i, j; int ret; - u64 left_ppas = pblk_sec_in_open_line(pblk, line); + u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec; if (pblk_line_wp_is_unbalanced(pblk, line)) pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id); -- cgit From 55e58c5e78aad9d3246f57e7718cf5ee7adde9e3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 11 Dec 2018 20:16:08 +0100 Subject: lightnvm: Fix uninitialized return value in nvm_get_chunk_meta() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc 4.1: drivers/lightnvm/core.c: In function ‘nvm_get_bb_meta’: drivers/lightnvm/core.c:977: warning: ‘ret’ may be used uninitialized in this function and drivers/nvme/host/lightnvm.c: In function ‘nvme_nvm_get_chk_meta’: drivers/nvme/host/lightnvm.c:580: warning: ‘ret’ may be used uninitialized in this function Indeed, if (for the former) the number of channels or LUNs is zero, or (for both) the passed number of chunks is zero, ret will be returned uninitialized. Fix this by preinitializing ret to zero. 
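The bug shape is a return value that is only assigned inside a loop that can run zero times, and the fix is to give it a defined starting value. Distilled:

#include <stdio.h>

/* Bug shape: 'ret' is only assigned inside a loop that may run zero times. */
static int get_meta_buggy(int nchunks)
{
        int ret;

        for (int i = 0; i < nchunks; i++)
                ret = 0;                /* stands in for the per-chunk work */

        return ret;                     /* garbage when nchunks == 0 */
}

static int get_meta_fixed(int nchunks)
{
        int ret = 0;                    /* the one-line fix: preinitialize */

        for (int i = 0; i < nchunks; i++)
                ret = 0;

        return ret;
}

int main(void)
{
        printf("%d\n", get_meta_buggy(1));  /* defined only because nchunks > 0 */
        printf("%d\n", get_meta_fixed(0));  /* always defined: 0 */
        return 0;
}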
Fixes: aff3fb18f957de93 ("lightnvm: move bad block and chunk state logic to core") Fixes: a294c199455187d1 ("lightnvm: implement get log report chunk helpers") Signed-off-by: Geert Uytterhoeven Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 2 +- drivers/nvme/host/lightnvm.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 60ab11fcc81c..10e541cb8dc3 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -974,7 +974,7 @@ static int nvm_get_bb_meta(struct nvm_dev *dev, sector_t slba, struct ppa_addr ppa; u8 *blks; int ch, lun, nr_blks; - int ret; + int ret = 0; ppa.ppa = slba; ppa = dev_to_generic_addr(dev, ppa); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index a4f3b263cd6c..d64805dc8efb 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -577,7 +577,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, struct ppa_addr ppa; size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); size_t log_pos, offset, len; - int ret, i, max_len; + int i, max_len; + int ret = 0; /* * limit requests to maximum 256K to avoid issuing arbitrary large -- cgit From 96076f7dde51f332bce5fc5644ddc1e221f64a5a Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Tue, 11 Dec 2018 20:16:09 +0100 Subject: lightnvm: pblk: fix chunk close trace event check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check for chunk closes suffers from an off-by-one issue, leading to chunk close events not being traced. Fixes: 4c44abf43d00 ("lightnvm: pblk: add trace events for chunk states") Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 6944aac43b01..6581c35f51ee 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -531,7 +531,7 @@ void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd) if (caddr == 0) trace_pblk_chunk_state(pblk_disk_name(pblk), ppa, NVM_CHK_ST_OPEN); - else if (caddr == chunk->cnlb) + else if (caddr == (chunk->cnlb - 1)) trace_pblk_chunk_state(pblk_disk_name(pblk), ppa, NVM_CHK_ST_CLOSED); } -- cgit From c12fa401ac8c94a74aff68bb5736b3f1dc695fa8 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Tue, 11 Dec 2018 20:16:10 +0100 Subject: lightnvm: pblk: fix resubmission of overwritten write err lbas MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure we only look up valid lba addresses on the resubmission path. If an lba is invalidated in the write buffer, that sector will be submitted to disk (as it is already mapped to a ppa), and that write might fail, resulting in a crash when trying to look up the lba in the mapping table (as the lba is marked as invalid). 
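The crash comes from using a sentinel as a table index: once an entry holds ADDR_EMPTY it no longer names a valid lba and must be guarded before any L2P lookup, which is exactly the shape the diff below adds. A toy version with an invented table and types:

#include <stdint.h>
#include <stdio.h>

#define ADDR_EMPTY UINT64_MAX           /* sentinel: entry maps nothing anymore */

static uint64_t l2p[16];                /* toy logical-to-physical table */

static void check_overwritten(uint64_t *lba, uint64_t cacheline)
{
        if (*lba == ADDR_EMPTY)         /* guard first: a sentinel is not an index */
                return;

        if (l2p[*lba] != cacheline)     /* lba was overwritten since it was buffered */
                *lba = ADDR_EMPTY;
}

int main(void)
{
        uint64_t lba = 3;

        l2p[3] = 42;
        check_overwritten(&lba, 7);     /* mapping moved on: invalidate the entry */
        printf("lba = 0x%llx\n", (unsigned long long)lba);
        return 0;
}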
Signed-off-by: Hans Holmberg Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-write.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index fa8726493b39..3ddd16f47106 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -148,9 +148,11 @@ static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry, w_ctx = &entry->w_ctx; /* Check if the lba has been overwritten */ - ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); - if (!pblk_ppa_comp(ppa_l2p, entry->cacheline)) - w_ctx->lba = ADDR_EMPTY; + if (w_ctx->lba != ADDR_EMPTY) { + ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); + if (!pblk_ppa_comp(ppa_l2p, entry->cacheline)) + w_ctx->lba = ADDR_EMPTY; + } /* Mark up the entry as submittable again */ flags = READ_ONCE(w_ctx->flags); -- cgit From ab3887be1e1a2594c9818a77b0f9dfabf0e4ab59 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Tue, 11 Dec 2018 20:16:11 +0100 Subject: lightnvm: pblk: account for write error sectors in emeta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lines inflicted with write errors might be recovered if they have not been recycled after write error garbage collection. Ensure that the emeta accounting of valid lbas is correct for such lines to avoid recovery inconsistencies. Signed-off-by: Hans Holmberg Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-write.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 3ddd16f47106..750f04b8a227 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -105,14 +105,20 @@ retry: } /* Map remaining sectors in chunk, starting from ppa */ -static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa) +static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa, + int rqd_ppas) { struct pblk_line *line; struct ppa_addr map_ppa = *ppa; + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + __le64 *lba_list; u64 paddr; int done = 0; + int n = 0; line = pblk_ppa_to_line(pblk, *ppa); + lba_list = emeta_to_lbas(pblk, line->emeta->buf); + spin_lock(&line->lock); while (!done) { @@ -121,10 +127,17 @@ static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa) if (!test_and_set_bit(paddr, line->map_bitmap)) line->left_msecs--; + if (n < rqd_ppas && lba_list[paddr] != addr_empty) + line->nr_valid_lbas--; + + lba_list[paddr] = addr_empty; + if (!test_and_set_bit(paddr, line->invalid_bitmap)) le32_add_cpu(line->vsc, -1); done = nvm_next_ppa_in_chk(pblk->dev, &map_ppa); + + n++; } line->w_err_gc->has_write_err = 1; @@ -202,7 +215,7 @@ static void pblk_submit_rec(struct work_struct *work) pblk_log_write_err(pblk, rqd); - pblk_map_remaining(pblk, ppa_list); + pblk_map_remaining(pblk, ppa_list, rqd->nr_ppas); pblk_queue_resubmit(pblk, c_ctx); pblk_up_rq(pblk, c_ctx->lun_bitmap); -- cgit From 525f7bb2c9f9b2c6673854eade89e98fb3ba7802 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Tue, 11 Dec 2018 20:16:12 +0100 Subject: lightnvm: pblk: stop writes gracefully when running out of lines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If mapping fails (i.e. 
when running out of lines), handle the error and stop writing. Signed-off-by: Hans Holmberg Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-map.c | 47 ++++++++++++++++++++++++++----------------- drivers/lightnvm/pblk-write.c | 30 ++++++++++++++++++--------- drivers/lightnvm/pblk.h | 4 ++-- 3 files changed, 51 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 6dcbd44e3acb..5a3c28cce8ab 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -33,6 +33,9 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, int nr_secs = pblk->min_write_pgs; int i; + if (!line) + return -ENOSPC; + if (pblk_line_is_full(line)) { struct pblk_line *prev_line = line; @@ -42,8 +45,11 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, line = pblk_line_replace_data(pblk); pblk_line_close_meta(pblk, prev_line); - if (!line) - return -EINTR; + if (!line) { + pblk_pipeline_stop(pblk); + return -ENOSPC; + } + } emeta = line->emeta; @@ -84,7 +90,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, return 0; } -void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, +int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, unsigned long *lun_bitmap, unsigned int valid_secs, unsigned int off) { @@ -93,20 +99,22 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, unsigned int map_secs; int min = pblk->min_write_pgs; int i; + int ret; for (i = off; i < rqd->nr_ppas; i += min) { map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; - if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, &meta_list[i], map_secs)) { - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - pblk_pipeline_stop(pblk); - } + + ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], + lun_bitmap, &meta_list[i], map_secs); + if (ret) + return ret; } + + return 0; } /* only if erase_ppa is set, acquire erase semaphore */ -void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, +int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, unsigned long *lun_bitmap, unsigned int valid_secs, struct ppa_addr *erase_ppa) { @@ -119,15 +127,16 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int map_secs; int min = pblk->min_write_pgs; int i, erase_lun; + int ret; + for (i = 0; i < rqd->nr_ppas; i += min) { map_secs = (i + min > valid_secs) ? 
(valid_secs % min) : min; - if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, &meta_list[i], map_secs)) { - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - pblk_pipeline_stop(pblk); - } + + ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], + lun_bitmap, &meta_list[i], map_secs); + if (ret) + return ret; erase_lun = pblk_ppa_to_pos(geo, ppa_list[i]); @@ -163,7 +172,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, */ e_line = pblk_line_get_erase(pblk); if (!e_line) - return; + return -ENOSPC; /* Erase blocks that are bad in this line but might not be in next */ if (unlikely(pblk_ppa_empty(*erase_ppa)) && @@ -174,7 +183,7 @@ retry: bit = find_next_bit(d_line->blk_bitmap, lm->blk_per_line, bit + 1); if (bit >= lm->blk_per_line) - return; + return 0; spin_lock(&e_line->lock); if (test_bit(bit, e_line->erase_bitmap)) { @@ -188,4 +197,6 @@ retry: *erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */ erase_ppa->a.blk = e_line->id; } + + return 0; } diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 750f04b8a227..2bf78f81862d 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -334,12 +334,13 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, } if (likely(!e_line || !atomic_read(&e_line->left_eblks))) - pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0); + ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, + valid, 0); else - pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, + ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, erase_ppa); - return 0; + return ret; } static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, @@ -563,7 +564,7 @@ static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) c_ctx->nr_padded); } -static int pblk_submit_write(struct pblk *pblk) +static int pblk_submit_write(struct pblk *pblk, int *secs_left) { struct bio *bio; struct nvm_rq *rqd; @@ -572,6 +573,8 @@ static int pblk_submit_write(struct pblk *pblk) unsigned long pos; unsigned int resubmit; + *secs_left = 0; + spin_lock(&pblk->resubmit_lock); resubmit = !list_empty(&pblk->resubmit_list); spin_unlock(&pblk->resubmit_lock); @@ -601,17 +604,17 @@ static int pblk_submit_write(struct pblk *pblk) */ secs_avail = pblk_rb_read_count(&pblk->rwb); if (!secs_avail) - return 1; + return 0; secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); if (!secs_to_flush && secs_avail < pblk->min_write_pgs) - return 1; + return 0; secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); if (secs_to_sync > pblk->max_write_pgs) { pblk_err(pblk, "bad buffer sync calculation\n"); - return 1; + return 0; } secs_to_com = (secs_to_sync > secs_avail) ? 
@@ -640,6 +643,7 @@ static int pblk_submit_write(struct pblk *pblk) atomic_long_add(secs_to_sync, &pblk->sub_writes); #endif + *secs_left = 1; return 0; fail_free_bio: @@ -648,16 +652,22 @@ fail_put_bio: bio_put(bio); pblk_free_rqd(pblk, rqd, PBLK_WRITE); - return 1; + return -EINTR; } int pblk_write_ts(void *data) { struct pblk *pblk = data; + int secs_left; + int write_failure = 0; while (!kthread_should_stop()) { - if (!pblk_submit_write(pblk)) - continue; + if (!write_failure) { + write_failure = pblk_submit_write(pblk, &secs_left); + + if (secs_left) + continue; + } set_current_state(TASK_INTERRUPTIBLE); io_schedule(); } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 02bb2e98f8a9..f415aae600c8 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -871,10 +871,10 @@ int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq); /* * pblk map */ -void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, +int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, unsigned long *lun_bitmap, unsigned int valid_secs, struct ppa_addr *erase_ppa); -void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, +int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, unsigned long *lun_bitmap, unsigned int valid_secs, unsigned int off); -- cgit From 3bcebc5bac0935d662f30d317e33ffa660bebf93 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Tue, 11 Dec 2018 20:16:13 +0100 Subject: lightnvm: pblk: set conservative threshold for user writes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In a worst-case scenario (random writes), OP% of sectors in each line will be invalid, and we will then need to move data out of 100/OP lines to free a single line. So, to prevent the possibility of running out of lines, temporarily block user writes when there are fewer than 100/OP free lines. Also ensure that pblk creation does not produce instances with insufficient over-provisioning. Insufficient over-provisioning is not a problem on real hardware, but often an issue when running QEMU simulations (with few lines). 100 lines is enough to create a sane instance with the standard (11%) over-provisioning. 
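The reserve this commit introduces works out to DIV_ROUND_UP(100, OP) lines, converted to chunks by multiplying with the blocks per line; with the default 11% OP that is ceil(100/11) = 10 lines. A quick check of the arithmetic, with an invented geometry:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        int op = 11;                    /* default over-provisioning, in percent */
        int blk_per_line = 64;          /* illustrative geometry */
        int min_chks = DIV_ROUND_UP(100, op) * blk_per_line;

        printf("reserve %d chunks (%d lines)\n",
               min_chks, DIV_ROUND_UP(100, op)); /* 640 chunks, 10 lines */
        return 0;
}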
Signed-off-by: Hans Holmberg Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 40 +++++++++++++++++++++++++++++++--------- drivers/lightnvm/pblk-rl.c | 5 ++--- drivers/lightnvm/pblk.h | 12 +++++++++++- 3 files changed, 44 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 13822594647c..f083130d9920 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -635,7 +635,7 @@ static unsigned int calc_emeta_len(struct pblk *pblk) return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]); } -static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) +static int pblk_set_provision(struct pblk *pblk, int nr_free_chks) { struct nvm_tgt_dev *dev = pblk->dev; struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -643,23 +643,41 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) struct nvm_geo *geo = &dev->geo; sector_t provisioned; int sec_meta, blk_meta; + int minimum; if (geo->op == NVM_TARGET_DEFAULT_OP) pblk->op = PBLK_DEFAULT_OP; else pblk->op = geo->op; - provisioned = nr_free_blks; + minimum = pblk_get_min_chks(pblk); + provisioned = nr_free_chks; provisioned *= (100 - pblk->op); sector_div(provisioned, 100); - pblk->op_blks = nr_free_blks - provisioned; + if ((nr_free_chks - provisioned) < minimum) { + if (geo->op != NVM_TARGET_DEFAULT_OP) { + pblk_err(pblk, "OP too small to create a sane instance\n"); + return -EINTR; + } + + /* If the user did not specify an OP value, and PBLK_DEFAULT_OP + * is not enough, calculate and set sane value + */ + + provisioned = nr_free_chks - minimum; + pblk->op = (100 * minimum) / nr_free_chks; + pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n", + pblk->op); + } + + pblk->op_blks = nr_free_chks - provisioned; /* Internally pblk manages all free blocks, but all calculations based * on user capacity consider only provisioned blocks */ - pblk->rl.total_blocks = nr_free_blks; - pblk->rl.nr_secs = nr_free_blks * geo->clba; + pblk->rl.total_blocks = nr_free_chks; + pblk->rl.nr_secs = nr_free_chks * geo->clba; /* Consider sectors used for metadata */ sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; @@ -667,8 +685,10 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) pblk->capacity = (provisioned - blk_meta) * geo->clba; - atomic_set(&pblk->rl.free_blocks, nr_free_blks); - atomic_set(&pblk->rl.free_user_blocks, nr_free_blks); + atomic_set(&pblk->rl.free_blocks, nr_free_chks); + atomic_set(&pblk->rl.free_user_blocks, nr_free_chks); + + return 0; } static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line, @@ -984,7 +1004,7 @@ static int pblk_lines_init(struct pblk *pblk) struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line; void *chunk_meta; - long nr_free_chks = 0; + int nr_free_chks = 0; int i, ret; ret = pblk_line_meta_init(pblk); @@ -1031,7 +1051,9 @@ static int pblk_lines_init(struct pblk *pblk) goto fail_free_lines; } - pblk_set_provision(pblk, nr_free_chks); + ret = pblk_set_provision(pblk, nr_free_chks); + if (ret) + goto fail_free_lines; vfree(chunk_meta); return 0; diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index db55a1c89997..76116d5f78e4 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -214,11 +214,10 @@ void pblk_rl_init(struct pblk_rl *rl, int budget) struct nvm_geo *geo = &dev->geo; struct pblk_line_mgmt *l_mg = 
&pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; - int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE; int sec_meta, blk_meta; - unsigned int rb_windows; + /* Consider sectors used for metadata */ sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); @@ -226,7 +225,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget) rl->high = pblk->op_blks - blk_meta - lm->blk_per_line; rl->high_pw = get_count_order(rl->high); - rl->rsv_blocks = min_blocks; + rl->rsv_blocks = pblk_get_min_chks(pblk); /* This will always be a power-of-2 */ rb_windows = budget / NVM_MAX_VLBA; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index f415aae600c8..e5b88a25d4d6 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -905,7 +905,6 @@ int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta); #define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ #define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ #define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ -#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ int pblk_gc_init(struct pblk *pblk); void pblk_gc_exit(struct pblk *pblk, bool graceful); @@ -1370,4 +1369,15 @@ static inline char *pblk_disk_name(struct pblk *pblk) return disk->disk_name; } + +static inline unsigned int pblk_get_min_chks(struct pblk *pblk) +{ + struct pblk_line_meta *lm = &pblk->lm; + /* In a worst-case scenario every line will have OP invalid sectors. + * We will then need a minimum of 1/OP lines to free up a single line + */ + + return DIV_ROUND_UP(100, pblk->op) * lm->blk_per_line; + +} #endif /* PBLK_H_ */ -- cgit From c9a1d640d519b40d00dac850d1f17a7df1954689 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Tue, 11 Dec 2018 20:16:14 +0100 Subject: lightnvm: pblk: remove unused macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ADDR_POOL_SIZE is not used anymore, so remove the macro. Signed-off-by: Hans Holmberg Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index f083130d9920..3219f335fce9 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -207,9 +207,6 @@ static int pblk_rwb_init(struct pblk *pblk) return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs); } -/* Minimum pages needed within a lun */ -#define ADDR_POOL_SIZE 64 - static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo, struct nvm_addrf_12 *dst) { -- cgit From 0934ce87b588a3da657b41804bf07518103875a4 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Tue, 11 Dec 2018 20:16:15 +0100 Subject: lightnvm: pblk: fix pblk_lines_init error handling path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chunk metadata is allocated with vmalloc, so we need to use vfree to free it. 
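The rule the fix enforces is allocator pairing: memory goes back through the counterpart of the routine that produced it, vfree() for vmalloc() just as kfree() for kmalloc(). The closest user-space analog of the same discipline pairs munmap() with mmap() and free() with malloc():

#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 20;
        void *big = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        void *small = malloc(64);

        if (big == MAP_FAILED || !small)
                return 1;

        free(small);            /* heap allocation -> free() */
        munmap(big, len);       /* mapped allocation -> munmap(), never free() */
        return 0;
}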
Fixes: 090ee26fd512 ("lightnvm: use internal allocation for chunk log page") Signed-off-by: Hans Holmberg Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 3219f335fce9..0e37104de596 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -1060,7 +1060,7 @@ fail_free_lines: pblk_line_meta_free(l_mg, &pblk->lines[i]); kfree(pblk->lines); fail_free_chunk_meta: - kfree(chunk_meta); + vfree(chunk_meta); fail_free_luns: kfree(pblk->luns); fail_free_meta: -- cgit From e698d9f4e6254b838e4f1a3116ee069bbc378dc0 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Tue, 11 Dec 2018 20:16:16 +0100 Subject: lightnvm: pblk: remove dead code in pblk_recov_l2p MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the call to pblk_line_replace_data as it returns directly because we have not set l_mg->data_next yet. Signed-off-by: Hans Holmberg Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-recovery.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 0fbd30e0a587..416d9840544b 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -805,7 +805,6 @@ next: WARN_ON_ONCE(!test_and_clear_bit(meta_line, &l_mg->meta_bitmap)); spin_unlock(&l_mg->free_lock); - pblk_line_replace_data(pblk); } else { spin_lock(&l_mg->free_lock); /* Allocate next line for preparation */ -- cgit From 6e82f0ba00b0addeec27b3a95eb41e6223fc8c4f Mon Sep 17 00:00:00 2001 From: Hua Su Date: Tue, 11 Dec 2018 20:16:17 +0100 Subject: lightnvm: pblk: fix spelling in comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Hua Su Updated description. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-rb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index b1f4b51783f4..9f7fa0fe9c77 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -147,7 +147,7 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, /* * Initialize rate-limiter, which controls access to the write buffer - * but user and GC I/O + * by user and GC I/O */ pblk_rl_init(&pblk->rl, rb->nr_entries); -- cgit From fde201a466c6ad5efd72cb54fdf2cefa8b6c6ad7 Mon Sep 17 00:00:00 2001 From: Hua Su Date: Tue, 11 Dec 2018 20:16:18 +0100 Subject: lightnvm: pblk: add lock protection to list operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Protect the list_add on the pblk_line_init_bb() error path in case this code is used for some other purpose in the future. 
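The invariant behind the change is that the free list is only ever mutated under free_lock, error paths included. A user-space analog of the locked-list discipline, with a mutex standing in for the spinlock; structure and names are illustrative:

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static struct node *free_list;
static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;

/* Every mutation of the shared list takes the lock, error paths included. */
static void free_list_add(struct node *n)
{
        pthread_mutex_lock(&free_lock);
        n->next = free_list;
        free_list = n;
        pthread_mutex_unlock(&free_lock);
}

int main(void)
{
        struct node a = { .next = NULL, .id = 1 };

        free_list_add(&a);
        printf("head id = %d\n", free_list->id);
        return 0;
}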
Signed-off-by: Hua Su Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 6581c35f51ee..44c5dc046912 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1295,15 +1295,22 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) ret = pblk_line_alloc_bitmaps(pblk, line); if (ret) - return ret; + goto fail; if (!pblk_line_init_bb(pblk, line, 0)) { - list_add(&line->list, &l_mg->free_list); - return -EINTR; + ret = -EINTR; + goto fail; } pblk_rl_free_lines_dec(&pblk->rl, line, true); return 0; + +fail: + spin_lock(&l_mg->free_lock); + list_add(&line->list, &l_mg->free_list); + spin_unlock(&l_mg->free_lock); + + return ret; } void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) -- cgit From 361d889f830ef61e4eae442c4c89fb14b626375f Mon Sep 17 00:00:00 2001 From: Javier González Date: Tue, 11 Dec 2018 20:16:19 +0100 Subject: lightnvm: pblk: add comments wrt locking in recovery path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk's recovery path is single threaded and therefore a number of assumptions regarding concurrency can be made. To avoid confusion, make this explicit with a couple of comments in the code. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 1 + drivers/lightnvm/pblk-recovery.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 44c5dc046912..f1b411e7c7c9 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1276,6 +1276,7 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) return 0; } +/* Line allocations in the recovery path are always single threaded */ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 416d9840544b..4c726506a831 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -13,6 +13,9 @@ * General Public License for more details. * * pblk-recovery.c - pblk's recovery path + * + * The L2P recovery path is single threaded as the L2P table is updated in order + * following the line sequence ID. */ #include "pblk.h" -- cgit From 85136c0102852fe505c0fbd3f1bf9d17038bb94d Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 11 Dec 2018 20:16:20 +0100 Subject: lightnvm: simplify geometry enumeration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the geometry of an OCSSD is enumerated using a two step approach: First, nvm_register is called, the OCSSD identify command is issued, and second the geometry sos and csecs values are read either from the OCSSD identify if it is a 1.2 drive, or from the NVMe namespace data structure if it is a 2.0 device. This patch recombines it into a single step, such that nvm_register can use the csecs and sos fields independent of which version is used. This enables one to dynamically size the lightnvm subsystem dma pool. 
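In outline, the reordering amounts to the following (a simplified sketch of nvm_register(), not the exact code; the full diff follows): enumerate the geometry first, so the DMA pool can later be sized from geo->csecs and geo->sos.

static int nvm_register_sketch(struct nvm_dev *dev)
{
	int ret;

	ret = nvm_init(dev);	/* single step: geometry (csecs, sos) is filled here */
	if (ret)
		return ret;

	dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist");
	if (!dev->dma_pool) {
		nvm_free(dev);	/* undo nvm_init() on failure */
		return -ENOMEM;
	}
	return 0;
}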
Reviewed-by: Igor Konopko Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 12 +++++------- drivers/nvme/host/core.c | 18 +++++++++--------- drivers/nvme/host/lightnvm.c | 18 ++++++------------ drivers/nvme/host/nvme.h | 2 -- 4 files changed, 20 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 10e541cb8dc3..69b841d682c7 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -1145,25 +1145,23 @@ int nvm_register(struct nvm_dev *dev) if (!dev->q || !dev->ops) return -EINVAL; + ret = nvm_init(dev); + if (ret) + return ret; + dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist"); if (!dev->dma_pool) { pr_err("nvm: could not create dma pool\n"); + nvm_free(dev); return -ENOMEM; } - ret = nvm_init(dev); - if (ret) - goto err_init; - /* register device with a supported media manager */ down_write(&nvm_lock); list_add(&dev->devices, &nvm_devices); up_write(&nvm_lock); return 0; -err_init: - dev->ops->destroy_dma_pool(dev->dma_pool); - return ret; } EXPORT_SYMBOL(nvm_register); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1310753a01e5..c71e879821ad 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1549,8 +1549,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->noiob) nvme_set_chunk_size(ns); nvme_update_disk_info(disk, ns, id); - if (ns->ndev) - nvme_nvm_update_nvm_info(ns); #ifdef CONFIG_NVME_MULTIPATH if (ns->head->disk) { nvme_update_disk_info(ns->head->disk, ns, id); @@ -3156,13 +3154,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_setup_streams_ns(ctrl, ns); nvme_set_disk_name(disk_name, ns, ctrl, &flags); - if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - if (nvme_nvm_register(ns, disk_name, node)) { - dev_warn(ctrl->device, "LightNVM init failure\n"); - goto out_unlink_ns; - } - } - disk = alloc_disk_node(0, node); if (!disk) goto out_unlink_ns; @@ -3176,6 +3167,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) __nvme_revalidate_disk(disk, id); + if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { + if (nvme_nvm_register(ns, disk_name, node)) { + dev_warn(ctrl->device, "LightNVM init failure\n"); + goto out_put_disk; + } + } + down_write(&ctrl->namespaces_rwsem); list_add_tail(&ns->list, &ctrl->namespaces); up_write(&ctrl->namespaces_rwsem); @@ -3189,6 +3187,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) kfree(id); return; + out_put_disk: + put_disk(ns->disk); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index d64805dc8efb..51d957ccf328 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -973,22 +973,11 @@ int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) } } -void nvme_nvm_update_nvm_info(struct nvme_ns *ns) -{ - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - - if (geo->version == NVM_OCSSD_SPEC_12) - return; - - geo->csecs = 1 << ns->lba_shift; - geo->sos = ns->ms; -} - int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { struct request_queue *q = ns->queue; struct nvm_dev *dev; + struct nvm_geo *geo; _nvme_nvm_check_size(); @@ -996,6 +985,11 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) if (!dev) return -ENOMEM; + /* Note 
that csecs and sos will be overridden if it is a 1.2 drive. */ + geo = &dev->geo; + geo->csecs = 1 << ns->lba_shift; + geo->sos = ns->ms; + dev->q = q; memcpy(dev->name, disk_name, DISK_NAME_LEN); dev->ops = &nvme_nvm_dev_ops; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8e0ec365ce8d..e20e737ac10c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -540,13 +540,11 @@ static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl) #endif /* CONFIG_NVME_MULTIPATH */ #ifdef CONFIG_NVM -void nvme_nvm_update_nvm_info(struct nvme_ns *ns); int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); extern const struct attribute_group nvme_nvm_attr_group; int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); #else -static inline void nvme_nvm_update_nvm_info(struct nvme_ns *ns) {}; static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { -- cgit From 42bd0384d77ef7552954056928018f5cfa91a013 Mon Sep 17 00:00:00 2001 From: Javier González Date: Tue, 11 Dec 2018 20:16:21 +0100 Subject: lightnvm: pblk: avoid ref warning on cache creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current kref implementation around pblk global caches triggers a false positive on refcount_inc_checked() (when called) as the kref is initialized to 0. Instead of using kref_get() on a 0 reference, which is in principle correct, use kref_init() to avoid the check. This is also more explicit about what actually happens on cache creation. In the process, do a small refactoring to use kref helpers. Fixes: 1864de94ec9d6 "lightnvm: pblk: stop recreating global caches" Signed-off-by: Javier González Reviewed-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 0e37104de596..72ad3e70318c 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -347,23 +347,19 @@ fail_destroy_ws: static int pblk_get_global_caches(void) { - int ret; + int ret = 0; mutex_lock(&pblk_caches.mutex); - if (kref_read(&pblk_caches.kref) > 0) { - kref_get(&pblk_caches.kref); - mutex_unlock(&pblk_caches.mutex); - return 0; - } + if (kref_get_unless_zero(&pblk_caches.kref)) + goto out; ret = pblk_create_global_caches(); - if (!ret) - kref_get(&pblk_caches.kref); + kref_init(&pblk_caches.kref); +out: mutex_unlock(&pblk_caches.mutex); - return ret; } -- cgit From dd439496dfbcfee1eb1e0d14984f98acb2b84c16 Mon Sep 17 00:00:00 2001 From: Igor Konopko Date: Tue, 11 Dec 2018 20:16:22 +0100 Subject: lightnvm: pblk: move lba list to partial read context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently DMA allocated memory is reused on partial read for lba_list_mem and lba_list_media arrays. In preparation for dynamic DMA pool sizes we need to move these arrays into pblk_pr_ctx structures.
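For scale (a rough sketch, assuming NVM_MAX_VLBA = 64 as elsewhere in the code): each lba list is an __le64 array, so embedding both in the partial-read context costs about 1 KB per inflight partial read, in exchange for not borrowing that space from the DMA buffer.

/* sketch of the fields the diff below adds to struct pblk_pr_ctx */
#define NVM_MAX_VLBA 64

struct pr_ctx_lba_sketch {
	__le64 lba_list_mem[NVM_MAX_VLBA];	/* 64 * 8 = 512 bytes */
	__le64 lba_list_media[NVM_MAX_VLBA];	/* 512 bytes more */
};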
Reviewed-by: Javier González Signed-off-by: Igor Konopko Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 20 +++++--------------- drivers/lightnvm/pblk.h | 2 ++ 2 files changed, 7 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 9fba614adeeb..19917d3c19b3 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -224,7 +224,6 @@ static void pblk_end_partial_read(struct nvm_rq *rqd) unsigned long *read_bitmap = pr_ctx->bitmap; int nr_secs = pr_ctx->orig_nr_secs; int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); - __le64 *lba_list_mem, *lba_list_media; void *src_p, *dst_p; int hole, i; @@ -237,13 +236,9 @@ static void pblk_end_partial_read(struct nvm_rq *rqd) rqd->ppa_list[0] = ppa; } - /* Re-use allocated memory for intermediate lbas */ - lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); - lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size); - for (i = 0; i < nr_secs; i++) { - lba_list_media[i] = meta_list[i].lba; - meta_list[i].lba = lba_list_mem[i]; + pr_ctx->lba_list_media[i] = meta_list[i].lba; + meta_list[i].lba = pr_ctx->lba_list_mem[i]; } /* Fill the holes in the original bio */ @@ -255,7 +250,7 @@ static void pblk_end_partial_read(struct nvm_rq *rqd) line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]); kref_put(&line->ref, pblk_line_put); - meta_list[hole].lba = lba_list_media[i]; + meta_list[hole].lba = pr_ctx->lba_list_media[i]; src_bv = new_bio->bi_io_vec[i++]; dst_bv = bio->bi_io_vec[bio_init_idx + hole]; @@ -295,13 +290,9 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct pblk_pr_ctx *pr_ctx; struct bio *new_bio, *bio = r_ctx->private; - __le64 *lba_list_mem; int nr_secs = rqd->nr_ppas; int i; - /* Re-use allocated memory for intermediate lbas */ - lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); - new_bio = bio_alloc(GFP_KERNEL, nr_holes); if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) @@ -312,12 +303,12 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd, goto fail_free_pages; } - pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL); + pr_ctx = kzalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL); if (!pr_ctx) goto fail_free_pages; for (i = 0; i < nr_secs; i++) - lba_list_mem[i] = meta_list[i].lba; + pr_ctx->lba_list_mem[i] = meta_list[i].lba; new_bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(new_bio, REQ_OP_READ, 0); @@ -325,7 +316,6 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd, rqd->bio = new_bio; rqd->nr_ppas = nr_holes; - pr_ctx->ppa_ptr = NULL; pr_ctx->orig_bio = bio; bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA); pr_ctx->bio_init_idx = bio_init_idx; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index e5b88a25d4d6..0e9d3960ac4c 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -132,6 +132,8 @@ struct pblk_pr_ctx { unsigned int bio_init_idx; void *ppa_ptr; dma_addr_t dma_ppa_list; + __le64 lba_list_mem[NVM_MAX_VLBA]; + __le64 lba_list_media[NVM_MAX_VLBA]; }; /* Pad context */ -- cgit From faa79f27f0a46cd6c3ac3de5e7f3e142598217fc Mon Sep 17 00:00:00 2001 From: Igor Konopko Date: Tue, 11 Dec 2018 20:16:23 +0100 Subject: lightnvm: pblk: add helpers for OOB metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk currently 
assumes that the size of the OOB metadata on the drive is always equal to the size of the pblk_sec_meta struct. This commit adds helpers that will allow handling different sizes of OOB metadata on the drive in the future. After this patch, only OOB metadata equal to 16 bytes is supported. Reviewed-by: Javier González Signed-off-by: Igor Konopko Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 5 +++-- drivers/lightnvm/pblk-init.c | 6 +++++ drivers/lightnvm/pblk-map.c | 20 +++++++++++------ drivers/lightnvm/pblk-read.c | 48 ++++++++++++++++++++++++++-------------- drivers/lightnvm/pblk-recovery.c | 16 +++++++++----- drivers/lightnvm/pblk.h | 6 +++++ 6 files changed, 69 insertions(+), 32 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index f1b411e7c7c9..e732b2d12a23 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -796,10 +796,11 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, rqd.is_seq = 1; for (i = 0; i < lm->smeta_sec; i++, paddr++) { - struct pblk_sec_meta *meta_list = rqd.meta_list; + struct pblk_sec_meta *meta = pblk_get_meta(pblk, + rqd.meta_list, i); rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - meta_list[i].lba = lba_list[paddr] = addr_empty; + meta->lba = lba_list[paddr] = addr_empty; } ret = pblk_submit_io_sync_sem(pblk, &rqd); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 72ad3e70318c..33361bfb85c3 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -405,6 +405,12 @@ static int pblk_core_init(struct pblk *pblk) queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT)); pblk_set_sec_per_write(pblk, pblk->min_write_pgs); + pblk->oob_meta_size = geo->sos; + if (pblk->oob_meta_size != sizeof(struct pblk_sec_meta)) { + pblk_err(pblk, "Unsupported metadata size\n"); + return -EINVAL; + } + pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t), GFP_KERNEL); if (!pblk->pad_dist) diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 5a3c28cce8ab..81e503ec384e 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -22,7 +22,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, struct ppa_addr *ppa_list, unsigned long *lun_bitmap, - struct pblk_sec_meta *meta_list, + void *meta_list, unsigned int valid_secs) { struct pblk_line *line = pblk_line_get_data(pblk); @@ -58,6 +58,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, paddr = pblk_alloc_page(pblk, line, nr_secs); for (i = 0; i < nr_secs; i++, paddr++) { + struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); /* ppa to be sent to the device */ @@ -74,14 +75,15 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, kref_get(&line->ref); w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i); w_ctx->ppa = ppa_list[i]; - meta_list[i].lba = cpu_to_le64(w_ctx->lba); + meta->lba = cpu_to_le64(w_ctx->lba); lba_list[paddr] = cpu_to_le64(w_ctx->lba); if (lba_list[paddr] != addr_empty) line->nr_valid_lbas++; else atomic64_inc(&pblk->pad_wa); } else { - lba_list[paddr] = meta_list[i].lba = addr_empty; + lba_list[paddr] = addr_empty; + meta->lba = addr_empty; __pblk_map_invalidate(pblk, line, paddr); } } @@ -94,7 +96,8 @@ int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, unsigned long *lun_bitmap, unsigned int valid_secs, unsigned int off)
{ - struct pblk_sec_meta *meta_list = rqd->meta_list; + void *meta_list = rqd->meta_list; + void *meta_buffer; struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); unsigned int map_secs; int min = pblk->min_write_pgs; @@ -103,9 +106,10 @@ int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, for (i = off; i < rqd->nr_ppas; i += min) { map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; + meta_buffer = pblk_get_meta(pblk, meta_list, i); ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, &meta_list[i], map_secs); + lun_bitmap, meta_buffer, map_secs); if (ret) return ret; } @@ -121,7 +125,8 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_line_meta *lm = &pblk->lm; - struct pblk_sec_meta *meta_list = rqd->meta_list; + void *meta_list = rqd->meta_list; + void *meta_buffer; struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); struct pblk_line *e_line, *d_line; unsigned int map_secs; @@ -132,9 +137,10 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, for (i = 0; i < rqd->nr_ppas; i += min) { map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; + meta_buffer = pblk_get_meta(pblk, meta_list, i); ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, &meta_list[i], map_secs); + lun_bitmap, meta_buffer, map_secs); if (ret) return ret; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 19917d3c19b3..6becd85ca4c6 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -43,7 +43,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, sector_t blba, unsigned long *read_bitmap) { - struct pblk_sec_meta *meta_list = rqd->meta_list; + void *meta_list = rqd->meta_list; struct ppa_addr ppas[NVM_MAX_VLBA]; int nr_secs = rqd->nr_ppas; bool advanced_bio = false; @@ -53,12 +53,15 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, for (i = 0; i < nr_secs; i++) { struct ppa_addr p = ppas[i]; + struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); sector_t lba = blba + i; retry: if (pblk_ppa_empty(p)) { + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + WARN_ON(test_and_set_bit(i, read_bitmap)); - meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); + meta->lba = addr_empty; if (unlikely(!advanced_bio)) { bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE); @@ -78,7 +81,7 @@ retry: goto retry; } WARN_ON(test_and_set_bit(i, read_bitmap)); - meta_list[i].lba = cpu_to_le64(lba); + meta->lba = cpu_to_le64(lba); advanced_bio = true; #ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_inc(&pblk->cache_reads); @@ -105,12 +108,13 @@ next: static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, sector_t blba) { - struct pblk_sec_meta *meta_lba_list = rqd->meta_list; + void *meta_list = rqd->meta_list; int nr_lbas = rqd->nr_ppas; int i; for (i = 0; i < nr_lbas; i++) { - u64 lba = le64_to_cpu(meta_lba_list[i].lba); + struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); + u64 lba = le64_to_cpu(meta->lba); if (lba == ADDR_EMPTY) continue; @@ -134,17 +138,19 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, u64 *lba_list, int nr_lbas) { - struct pblk_sec_meta *meta_lba_list = rqd->meta_list; + void *meta_lba_list = rqd->meta_list; int i, j; for (i = 0, j = 0; i < nr_lbas; i++) { + struct pblk_sec_meta *meta = pblk_get_meta(pblk, 
+ meta_lba_list, j); u64 lba = lba_list[i]; u64 meta_lba; if (lba == ADDR_EMPTY) continue; - meta_lba = le64_to_cpu(meta_lba_list[j].lba); + meta_lba = le64_to_cpu(meta->lba); if (lba != meta_lba) { #ifdef CONFIG_NVM_PBLK_DEBUG @@ -216,10 +222,11 @@ static void pblk_end_partial_read(struct nvm_rq *rqd) struct pblk *pblk = rqd->private; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct pblk_pr_ctx *pr_ctx = r_ctx->private; + struct pblk_sec_meta *meta; struct bio *new_bio = rqd->bio; struct bio *bio = pr_ctx->orig_bio; struct bio_vec src_bv, dst_bv; - struct pblk_sec_meta *meta_list = rqd->meta_list; + void *meta_list = rqd->meta_list; int bio_init_idx = pr_ctx->bio_init_idx; unsigned long *read_bitmap = pr_ctx->bitmap; int nr_secs = pr_ctx->orig_nr_secs; @@ -237,8 +244,9 @@ static void pblk_end_partial_read(struct nvm_rq *rqd) } for (i = 0; i < nr_secs; i++) { - pr_ctx->lba_list_media[i] = meta_list[i].lba; - meta_list[i].lba = pr_ctx->lba_list_mem[i]; + meta = pblk_get_meta(pblk, meta_list, i); + pr_ctx->lba_list_media[i] = le64_to_cpu(meta->lba); + meta->lba = cpu_to_le64(pr_ctx->lba_list_mem[i]); } /* Fill the holes in the original bio */ @@ -250,7 +258,8 @@ static void pblk_end_partial_read(struct nvm_rq *rqd) line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]); kref_put(&line->ref, pblk_line_put); - meta_list[hole].lba = pr_ctx->lba_list_media[i]; + meta = pblk_get_meta(pblk, meta_list, hole); + meta->lba = cpu_to_le64(pr_ctx->lba_list_media[i]); src_bv = new_bio->bi_io_vec[i++]; dst_bv = bio->bi_io_vec[bio_init_idx + hole]; @@ -286,7 +295,7 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd, unsigned long *read_bitmap, int nr_holes) { - struct pblk_sec_meta *meta_list = rqd->meta_list; + void *meta_list = rqd->meta_list; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct pblk_pr_ctx *pr_ctx; struct bio *new_bio, *bio = r_ctx->private; @@ -307,8 +316,11 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd, if (!pr_ctx) goto fail_free_pages; - for (i = 0; i < nr_secs; i++) - pr_ctx->lba_list_mem[i] = meta_list[i].lba; + for (i = 0; i < nr_secs; i++) { + struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); + + pr_ctx->lba_list_mem[i] = le64_to_cpu(meta->lba); + } new_bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(new_bio, REQ_OP_READ, 0); @@ -373,7 +385,7 @@ err: static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, sector_t lba, unsigned long *read_bitmap) { - struct pblk_sec_meta *meta_list = rqd->meta_list; + struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0); struct ppa_addr ppa; pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); @@ -384,8 +396,10 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, retry: if (pblk_ppa_empty(ppa)) { + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + WARN_ON(test_and_set_bit(0, read_bitmap)); - meta_list[0].lba = cpu_to_le64(ADDR_EMPTY); + meta->lba = addr_empty; return; } @@ -399,7 +413,7 @@ retry: } WARN_ON(test_and_set_bit(0, read_bitmap)); - meta_list[0].lba = cpu_to_le64(lba); + meta->lba = cpu_to_le64(lba); #ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_inc(&pblk->cache_reads); diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 4c726506a831..e4dd634ba05f 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -127,7 +127,7 @@ static u64 pblk_sec_in_open_line(struct pblk *pblk, struct pblk_line *line) struct pblk_recov_alloc { struct ppa_addr 
*ppa_list; - struct pblk_sec_meta *meta_list; + void *meta_list; struct nvm_rq *rqd; void *data; dma_addr_t dma_ppa_list; @@ -161,7 +161,7 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line, { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - struct pblk_sec_meta *meta_list; + void *meta_list; struct pblk_pad_rq *pad_rq; struct nvm_rq *rqd; struct bio *bio; @@ -240,12 +240,15 @@ next_pad_rq: for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) { struct ppa_addr dev_ppa; + struct pblk_sec_meta *meta; __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); pblk_map_invalidate(pblk, dev_ppa); - lba_list[w_ptr] = meta_list[i].lba = addr_empty; + lba_list[w_ptr] = addr_empty; + meta = pblk_get_meta(pblk, meta_list, i); + meta->lba = addr_empty; rqd->ppa_list[i] = dev_ppa; } } @@ -340,7 +343,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, struct pblk_line_meta *lm = &pblk->lm; struct nvm_geo *geo = &dev->geo; struct ppa_addr *ppa_list; - struct pblk_sec_meta *meta_list; + void *meta_list; struct nvm_rq *rqd; struct bio *bio; void *data; @@ -438,7 +441,8 @@ retry_rq: } for (i = 0; i < rqd->nr_ppas; i++) { - u64 lba = le64_to_cpu(meta_list[i].lba); + struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); + u64 lba = le64_to_cpu(meta->lba); lba_list[paddr++] = cpu_to_le64(lba); @@ -467,7 +471,7 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) struct nvm_geo *geo = &dev->geo; struct nvm_rq *rqd; struct ppa_addr *ppa_list; - struct pblk_sec_meta *meta_list; + void *meta_list; struct pblk_recov_alloc p; void *data; dma_addr_t dma_ppa_list, dma_meta_list; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 0e9d3960ac4c..80f356688803 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -634,6 +634,7 @@ struct pblk { int min_write_pgs; /* Minimum amount of pages required by controller */ int max_write_pgs; /* Maximum amount of pages supported by controller */ + int oob_meta_size; /* Size of OOB sector metadata */ sector_t capacity; /* Device capacity when bad blocks are subtracted */ @@ -1380,6 +1381,11 @@ static inline unsigned int pblk_get_min_chks(struct pblk *pblk) */ return DIV_ROUND_UP(100, pblk->op) * lm->blk_per_line; +} +static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk, + void *meta, int index) +{ + return meta + pblk->oob_meta_size * index; } #endif /* PBLK_H_ */ -- cgit From 24828d0536bbedc9b265f2b01ffca99de3f6a7c7 Mon Sep 17 00:00:00 2001 From: Igor Konopko Date: Tue, 11 Dec 2018 20:16:24 +0100 Subject: lightnvm: dynamic DMA pool entry size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently lightnvm and pblk use a single DMA pool, for which the entry size is always equal to PAGE_SIZE. The contents of each entry allocated from the DMA pool consist of a PPA list (8 bytes * 64), leaving 56 bytes * 64 of space for metadata. Since the metadata field can be bigger, such as 128 bytes, the static size does not cover this use-case. This patch adds support for I/O metadata above 56 bytes by changing the DMA pool size based on the device metadata size, and allows pblk to use OOB metadata >=16B.
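A worked example of the sizing rule this patch introduces (a sketch mirroring the nvm_register() hunk below, assuming NVM_MAX_VLBA = 64):

static int exp_pool_size_sketch(int sos)	/* sos: OOB bytes per sector */
{
	int size = NVM_MAX_VLBA * (sizeof(u64) + sos);

	/* sos = 16:  64 * (8 + 16)  = 1536 -> one 4 KB page, as before
	 * sos = 128: 64 * (8 + 128) = 8704 -> rounded up to 12288 (3 pages)
	 */
	return round_up(max_t(int, PAGE_SIZE, size), PAGE_SIZE);
}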
Reviewed-by: Javier González Signed-off-by: Igor Konopko Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 9 +++++++-- drivers/lightnvm/pblk-core.c | 8 ++++---- drivers/lightnvm/pblk-init.c | 2 +- drivers/lightnvm/pblk-recovery.c | 4 ++-- drivers/lightnvm/pblk.h | 6 +++++- drivers/nvme/host/lightnvm.c | 5 +++-- 6 files changed, 22 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 69b841d682c7..5f82036fe322 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -1140,7 +1140,7 @@ EXPORT_SYMBOL(nvm_alloc_dev); int nvm_register(struct nvm_dev *dev) { - int ret; + int ret, exp_pool_size; if (!dev->q || !dev->ops) return -EINVAL; @@ -1149,7 +1149,12 @@ int nvm_register(struct nvm_dev *dev) if (ret) return ret; - dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist"); + exp_pool_size = max_t(int, PAGE_SIZE, + (NVM_MAX_VLBA * (sizeof(u64) + dev->geo.sos))); + exp_pool_size = round_up(exp_pool_size, PAGE_SIZE); + + dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist", + exp_pool_size); if (!dev->dma_pool) { pr_err("nvm: could not create dma pool\n"); nvm_free(dev); diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index e732b2d12a23..7e3397f8ead1 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -250,8 +250,8 @@ int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd) if (rqd->nr_ppas == 1) return 0; - rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; - rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; + rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size(pblk); + rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size(pblk); return 0; } @@ -846,8 +846,8 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, if (!meta_list) return -ENOMEM; - ppa_list = meta_list + pblk_dma_meta_size; - dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + ppa_list = meta_list + pblk_dma_meta_size(pblk); + dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); next_rq: memset(&rqd, 0, sizeof(struct nvm_rq)); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 33361bfb85c3..ff6a6df369c3 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -406,7 +406,7 @@ static int pblk_core_init(struct pblk *pblk) pblk_set_sec_per_write(pblk, pblk->min_write_pgs); pblk->oob_meta_size = geo->sos; - if (pblk->oob_meta_size != sizeof(struct pblk_sec_meta)) { + if (pblk->oob_meta_size < sizeof(struct pblk_sec_meta)) { pblk_err(pblk, "Unsupported metadata size\n"); return -EINVAL; } diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index e4dd634ba05f..3a775d10f616 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -481,8 +481,8 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) if (!meta_list) return -ENOMEM; - ppa_list = (void *)(meta_list) + pblk_dma_meta_size; - dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + ppa_list = (void *)(meta_list) + pblk_dma_meta_size(pblk); + dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL); if (!data) { diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 80f356688803..9087d53d5c25 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -104,7 +104,6 @@ enum { PBLK_RL_LOW = 4 }; -#define pblk_dma_meta_size (sizeof(struct 
pblk_sec_meta) * NVM_MAX_VLBA) #define pblk_dma_ppa_size (sizeof(u64) * NVM_MAX_VLBA) /* write buffer completion context */ @@ -1388,4 +1387,9 @@ static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk, void *meta, int index) { return meta + pblk->oob_meta_size * index; } + +static inline int pblk_dma_meta_size(struct pblk *pblk) +{ + return pblk->oob_meta_size * NVM_MAX_VLBA; +} #endif /* PBLK_H_ */ diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 51d957ccf328..ba268d7cf141 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -732,11 +732,12 @@ static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) return ret; } -static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) +static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name, + int size) { struct nvme_ns *ns = nvmdev->q->queuedata; - return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0); + return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0); } static void nvme_nvm_destroy_dma_pool(void *pool) -- cgit From a16816b9e462e8ee86a908606bde54b53cfeca80 Mon Sep 17 00:00:00 2001 From: Igor Konopko Date: Tue, 11 Dec 2018 20:16:25 +0100 Subject: lightnvm: disable interleaved metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently pblk only checks the size of the I/O metadata and does not take into account whether this metadata is in a separate buffer or interleaved in a single metadata buffer. In reality only the first scenario is supported; the second mode will break pblk functionality during any I/O operation. This patch prevents pblk from being instantiated in case the device only supports interleaved metadata. Reviewed-by: Javier González Signed-off-by: Igor Konopko Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 6 ++++++ drivers/nvme/host/lightnvm.c | 1 + 2 files changed, 7 insertions(+) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index ff6a6df369c3..e8055b796381 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -1175,6 +1175,12 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, return ERR_PTR(-EINVAL); } + if (geo->ext) { + pblk_err(pblk, "extended metadata not supported\n"); + kfree(pblk); + return ERR_PTR(-EINVAL); + } + spin_lock_init(&pblk->resubmit_lock); spin_lock_init(&pblk->trans_lock); spin_lock_init(&pblk->lock); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ba268d7cf141..f145fc0220d6 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -990,6 +990,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) geo = &dev->geo; geo->csecs = 1 << ns->lba_shift; geo->sos = ns->ms; + geo->ext = ns->ext; dev->q = q; memcpy(dev->name, disk_name, DISK_NAME_LEN); -- cgit From 55d8ec35398e7ab001989473cf6ed6f40b5ef4a6 Mon Sep 17 00:00:00 2001 From: Igor Konopko Date: Tue, 11 Dec 2018 20:16:26 +0100 Subject: lightnvm: pblk: support packed metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk performs recovery of open lines by storing the LBA in the per-LBA metadata field. Recovery therefore only works for drives that have this field. This patch adds support for packed metadata, which stores the l2p mapping for open lines in the last sector of every write unit and enables drives without per-I/O metadata to recover open lines.
After this patch, drives with an OOB size of less than 16B will use packed metadata, while drives with a metadata size of 16B or larger will continue to use the device's per-I/O metadata. Reviewed-by: Javier González Signed-off-by: Igor Konopko Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 48 ++++++++++++++++++++++++++++++++++++---- drivers/lightnvm/pblk-init.c | 38 ++++++++++++++++++++++++++----- drivers/lightnvm/pblk-map.c | 4 ++-- drivers/lightnvm/pblk-rb.c | 3 +++ drivers/lightnvm/pblk-read.c | 6 +++++ drivers/lightnvm/pblk-recovery.c | 17 ++++++++++---- drivers/lightnvm/pblk-sysfs.c | 7 ++++++ drivers/lightnvm/pblk-write.c | 9 ++++---- drivers/lightnvm/pblk.h | 10 ++++++++- 9 files changed, 122 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 7e3397f8ead1..1ff165351180 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -376,7 +376,7 @@ void pblk_write_should_kick(struct pblk *pblk) { unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb); - if (secs_avail >= pblk->min_write_pgs) + if (secs_avail >= pblk->min_write_pgs_data) pblk_write_kick(pblk); } @@ -407,7 +407,9 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct list_head *move_list = NULL; - int vsc = le32_to_cpu(*line->vsc); + int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data) + * (pblk->min_write_pgs - pblk->min_write_pgs_data); + int vsc = le32_to_cpu(*line->vsc) + packed_meta; lockdep_assert_held(&line->lock); @@ -620,12 +622,15 @@ out: } int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush) + unsigned long secs_to_flush, bool skip_meta) { int max = pblk->sec_per_write; int min = pblk->min_write_pgs; int secs_to_sync = 0; + if (skip_meta && pblk->min_write_pgs_data != pblk->min_write_pgs) + min = max = pblk->min_write_pgs_data; + if (secs_avail >= max) secs_to_sync = max; else if (secs_avail >= min) @@ -852,7 +857,7 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, next_rq: memset(&rqd, 0, sizeof(struct nvm_rq)); - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); rq_len = rq_ppas * geo->csecs; bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len, @@ -2169,3 +2174,38 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, } spin_unlock(&pblk->trans_lock); } + +void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd) +{ + void *buffer; + + if (pblk_is_oob_meta_supported(pblk)) { + /* Just use OOB metadata buffer as always */ + buffer = rqd->meta_list; + } else { + /* We need to reuse last page of request (packed metadata) + * in similar way as traditional oob metadata + */ + buffer = page_to_virt( + rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page); + } + + return buffer; +} + +void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd) +{ + void *meta_list = rqd->meta_list; + void *page; + int i = 0; + + if (pblk_is_oob_meta_supported(pblk)) + return; + + page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page); + /* We need to fill oob meta buffer with data from packed metadata */ + for (; i < rqd->nr_ppas; i++) + memcpy(pblk_get_meta(pblk, meta_list, i), + page + (i * sizeof(struct pblk_sec_meta)), + sizeof(struct pblk_sec_meta)); +} diff --git a/drivers/lightnvm/pblk-init.c
b/drivers/lightnvm/pblk-init.c index e8055b796381..f9a3e47b6a93 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -399,6 +399,7 @@ static int pblk_core_init(struct pblk *pblk) pblk->nr_flush_rst = 0; pblk->min_write_pgs = geo->ws_opt; + pblk->min_write_pgs_data = pblk->min_write_pgs; max_write_ppas = pblk->min_write_pgs * geo->all_luns; pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); pblk->max_write_pgs = min_t(int, pblk->max_write_pgs, @@ -406,9 +407,35 @@ static int pblk_core_init(struct pblk *pblk) pblk_set_sec_per_write(pblk, pblk->min_write_pgs); pblk->oob_meta_size = geo->sos; - if (pblk->oob_meta_size < sizeof(struct pblk_sec_meta)) { - pblk_err(pblk, "Unsupported metadata size\n"); - return -EINVAL; + if (!pblk_is_oob_meta_supported(pblk)) { + /* For drives which does not have OOB metadata feature + * in order to support recovery feature we need to use + * so called packed metadata. Packed metadata will store + * the same information as OOB metadata (l2p table mapping, + * but in the form of the single page at the end of + * every write request. + */ + if (pblk->min_write_pgs + * sizeof(struct pblk_sec_meta) > PAGE_SIZE) { + /* We want to keep all the packed metadata on single + * page per write requests. So we need to ensure that + * it will fit. + * + * This is more like sanity check, since there is + * no device with such a big minimal write size + * (above 1 megabyte). + */ + pblk_err(pblk, "Not supported min write size\n"); + return -EINVAL; + } + /* For packed meta approach we do some simplification. + * On read path we always issue requests which size + * equal to max_write_pgs, with all pages filled with + * user payload except of last one page which will be + * filled with packed metadata.
+ */ + pblk->max_write_pgs = pblk->min_write_pgs; + pblk->min_write_pgs_data = pblk->min_write_pgs - 1; } pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t), @@ -641,7 +668,7 @@ static int pblk_set_provision(struct pblk *pblk, int nr_free_chks) struct pblk_line_meta *lm = &pblk->lm; struct nvm_geo *geo = &dev->geo; sector_t provisioned; - int sec_meta, blk_meta; + int sec_meta, blk_meta, clba; int minimum; if (geo->op == NVM_TARGET_DEFAULT_OP) @@ -682,7 +709,8 @@ static int pblk_set_provision(struct pblk *pblk, int nr_free_chks) sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); - pblk->capacity = (provisioned - blk_meta) * geo->clba; + clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data; + pblk->capacity = (provisioned - blk_meta) * clba; atomic_set(&pblk->rl.free_blocks, nr_free_chks); atomic_set(&pblk->rl.free_user_blocks, nr_free_chks); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 81e503ec384e..79df583ea709 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -96,7 +96,7 @@ int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, unsigned long *lun_bitmap, unsigned int valid_secs, unsigned int off) { - void *meta_list = rqd->meta_list; + void *meta_list = pblk_get_meta_for_writes(pblk, rqd); void *meta_buffer; struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); unsigned int map_secs; @@ -125,7 +125,7 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_line_meta *lm = &pblk->lm; - void *meta_list = rqd->meta_list; + void *meta_list = pblk_get_meta_for_writes(pblk, rqd); void *meta_buffer; struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); struct pblk_line *e_line, *d_line; diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 9f7fa0fe9c77..d4ca8c64ee0f 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -552,6 +552,9 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, to_read = count; } + /* Add space for packed metadata if in use*/ + pad += (pblk->min_write_pgs - pblk->min_write_pgs_data); + c_ctx->sentry = pos; c_ctx->nr_valid = to_read; c_ctx->nr_padded = pad; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 6becd85ca4c6..3789185144da 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -112,6 +112,9 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, int nr_lbas = rqd->nr_ppas; int i; + if (!pblk_is_oob_meta_supported(pblk)) + return; + for (i = 0; i < nr_lbas; i++) { struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); u64 lba = le64_to_cpu(meta->lba); @@ -141,6 +144,9 @@ static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, void *meta_lba_list = rqd->meta_list; int i, j; + if (!pblk_is_oob_meta_supported(pblk)) + return; + for (i = 0, j = 0; i < nr_lbas; i++) { struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_lba_list, j); diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 3a775d10f616..3fcf062d752c 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -191,7 +191,7 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line, kref_init(&pad_rq->ref); next_pad_rq: - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + rq_ppas = pblk_calc_secs(pblk, 
left_ppas, 0, false); if (rq_ppas < pblk->min_write_pgs) { pblk_err(pblk, "corrupted pad line %d\n", line->id); goto fail_free_pad; @@ -371,17 +371,19 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, next_rq: memset(rqd, 0, pblk_g_rq_size); - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); if (!rq_ppas) rq_ppas = pblk->min_write_pgs; rq_len = rq_ppas * geo->csecs; +retry_rq: bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); if (IS_ERR(bio)) return PTR_ERR(bio); bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio_get(bio); rqd->bio = bio; rqd->opcode = NVM_OP_PREAD; @@ -394,7 +396,6 @@ next_rq: if (pblk_io_aligned(pblk, rq_ppas)) rqd->is_seq = 1; -retry_rq: for (i = 0; i < rqd->nr_ppas; ) { struct ppa_addr ppa; int pos; @@ -417,6 +418,7 @@ retry_rq: if (ret) { pblk_err(pblk, "I/O submission failed: %d\n", ret); bio_put(bio); + bio_put(bio); return ret; } @@ -428,18 +430,25 @@ retry_rq: if (padded) { pblk_log_read_err(pblk, rqd); + bio_put(bio); return -EINTR; } pad_distance = pblk_pad_distance(pblk, line); ret = pblk_recov_pad_line(pblk, line, pad_distance); - if (ret) + if (ret) { + bio_put(bio); return ret; + } padded = true; + bio_put(bio); goto retry_rq; } + pblk_get_packed_meta(pblk, rqd); + bio_put(bio); + for (i = 0; i < rqd->nr_ppas; i++) { struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); u64 lba = le64_to_cpu(meta->lba); diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index 2d2818155aa8..7d8958df9472 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -479,6 +479,13 @@ static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk, if (kstrtouint(page, 0, &sec_per_write)) return -EINVAL; + if (!pblk_is_oob_meta_supported(pblk)) { + /* For packed metadata case it is + * not allowed to change sec_per_write. 
+ */ + return -EINVAL; + } + if (sec_per_write < pblk->min_write_pgs || sec_per_write > pblk->max_write_pgs || sec_per_write % pblk->min_write_pgs != 0) diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 2bf78f81862d..06d56deb645d 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -348,7 +348,7 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, { int secs_to_sync; - secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush); + secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush, true); #ifdef CONFIG_NVM_PBLK_DEBUG if ((!secs_to_sync && secs_to_flush) @@ -569,7 +569,7 @@ static int pblk_submit_write(struct pblk *pblk, int *secs_left) struct bio *bio; struct nvm_rq *rqd; unsigned int secs_avail, secs_to_sync, secs_to_com; - unsigned int secs_to_flush; + unsigned int secs_to_flush, packed_meta_pgs; unsigned long pos; unsigned int resubmit; @@ -607,7 +607,7 @@ static int pblk_submit_write(struct pblk *pblk, int *secs_left) return 0; secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); - if (!secs_to_flush && secs_avail < pblk->min_write_pgs) + if (!secs_to_flush && secs_avail < pblk->min_write_pgs_data) return 0; secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, @@ -622,7 +622,8 @@ static int pblk_submit_write(struct pblk *pblk, int *secs_left) pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); } - bio = bio_alloc(GFP_KERNEL, secs_to_sync); + packed_meta_pgs = (pblk->min_write_pgs - pblk->min_write_pgs_data); + bio = bio_alloc(GFP_KERNEL, secs_to_sync + packed_meta_pgs); bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 9087d53d5c25..bc40b1381ff6 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -632,6 +632,7 @@ struct pblk { int state; /* pblk line state */ int min_write_pgs; /* Minimum amount of pages required by controller */ + int min_write_pgs_data; /* Minimum amount of payload pages */ int max_write_pgs; /* Maximum amount of pages supported by controller */ int oob_meta_size; /* Size of OOB sector metadata */ @@ -838,7 +839,7 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush); + unsigned long secs_to_flush, bool skip_meta); void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa, unsigned long *lun_bitmap); void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa); @@ -862,6 +863,8 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, u64 *lba_list, int nr_secs); void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, sector_t blba, int nr_secs); +void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd); +void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd); /* * pblk user I/O write path @@ -1392,4 +1395,9 @@ static inline int pblk_dma_meta_size(struct pblk *pblk) { return pblk->oob_meta_size * NVM_MAX_VLBA; } + +static inline int pblk_is_oob_meta_supported(struct pblk *pblk) +{ + return pblk->oob_meta_size >= sizeof(struct pblk_sec_meta); +} #endif /* PBLK_H_ */ -- cgit From 2c4d5356e64d7d538f24c23045478330fae4a065 Mon Sep 17 00:00:00 2001 From: Igor Konopko Date: Tue, 11 Dec 2018 20:16:27 +0100 Subject: lightnvm: 
pblk: do not overwrite ppa list with meta list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using pblk with 0 sized metadata, both the ppa list and the meta list point to the same memory, since pblk_dma_meta_size() returns 0 in that case. This patch fixes that issue by ensuring that pblk_dma_meta_size() always returns space equal to sizeof(struct pblk_sec_meta), and thus the ppa list and the meta list point to different memory addresses. Even though in that case the drive does not really care about the meta_list pointer, this is the easiest way to fix the issue without introducing changes in many places in the code just for the 0 sized metadata case. The same approach also needs to be taken for pblk_get_meta(), since we likewise cannot point to the same memory address in the meta buffer when we are using it for the pblk recovery process. Reported-by: Hans Holmberg Tested-by: Hans Holmberg Signed-off-by: Igor Konopko Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index bc40b1381ff6..85e38ed62f85 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -1388,12 +1388,15 @@ static inline unsigned int pblk_get_min_chks(struct pblk *pblk) static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk, void *meta, int index) { - return meta + pblk->oob_meta_size * index; + return meta + + max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size) + * index; } static inline int pblk_dma_meta_size(struct pblk *pblk) { - return pblk->oob_meta_size * NVM_MAX_VLBA; + return max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size) + * NVM_MAX_VLBA; } static inline int pblk_is_oob_meta_supported(struct pblk *pblk) -- cgit From 3152a974678a1e80c3c16d4b86522ecc500be529 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 3 Dec 2018 17:52:05 -0800 Subject: ath6kl: add ath6kl_ prefix to crypto_type Prevent a namespace conflict, as in following patches skbuff.h will include the crypto API. Acked-by: David S.
Miller Cc: Kalle Valo Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 2 +- drivers/net/wireless/ath/ath6kl/common.h | 2 +- drivers/net/wireless/ath/ath6kl/wmi.c | 6 +++--- drivers/net/wireless/ath/ath6kl/wmi.h | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index e121187f371f..fa049c4ae315 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -1322,7 +1322,7 @@ static int ath6kl_cfg80211_set_default_key(struct wiphy *wiphy, struct ath6kl_vif *vif = netdev_priv(ndev); struct ath6kl_key *key = NULL; u8 key_usage; - enum crypto_type key_type = NONE_CRYPT; + enum ath6kl_crypto_type key_type = NONE_CRYPT; ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: index %d\n", __func__, key_index); diff --git a/drivers/net/wireless/ath/ath6kl/common.h b/drivers/net/wireless/ath/ath6kl/common.h index 4f82e8632d37..d6e5234f67a1 100644 --- a/drivers/net/wireless/ath/ath6kl/common.h +++ b/drivers/net/wireless/ath/ath6kl/common.h @@ -67,7 +67,7 @@ struct ath6kl_llc_snap_hdr { __be16 eth_type; } __packed; -enum crypto_type { +enum ath6kl_crypto_type { NONE_CRYPT = 0x01, WEP_CRYPT = 0x02, TKIP_CRYPT = 0x04, diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c index 777acc564ac9..9d7ac1ab2d02 100644 --- a/drivers/net/wireless/ath/ath6kl/wmi.c +++ b/drivers/net/wireless/ath/ath6kl/wmi.c @@ -1849,9 +1849,9 @@ int ath6kl_wmi_connect_cmd(struct wmi *wmi, u8 if_idx, enum network_type nw_type, enum dot11_auth_mode dot11_auth_mode, enum auth_mode auth_mode, - enum crypto_type pairwise_crypto, + enum ath6kl_crypto_type pairwise_crypto, u8 pairwise_crypto_len, - enum crypto_type group_crypto, + enum ath6kl_crypto_type group_crypto, u8 group_crypto_len, int ssid_len, u8 *ssid, u8 *bssid, u16 channel, u32 ctrl_flags, u8 nw_subtype) @@ -2301,7 +2301,7 @@ int ath6kl_wmi_disctimeout_cmd(struct wmi *wmi, u8 if_idx, u8 timeout) } int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index, - enum crypto_type key_type, + enum ath6kl_crypto_type key_type, u8 key_usage, u8 key_len, u8 *key_rsc, unsigned int key_rsc_len, u8 *key_material, diff --git a/drivers/net/wireless/ath/ath6kl/wmi.h b/drivers/net/wireless/ath/ath6kl/wmi.h index a60bb49fe920..784940ba4c90 100644 --- a/drivers/net/wireless/ath/ath6kl/wmi.h +++ b/drivers/net/wireless/ath/ath6kl/wmi.h @@ -2556,9 +2556,9 @@ int ath6kl_wmi_connect_cmd(struct wmi *wmi, u8 if_idx, enum network_type nw_type, enum dot11_auth_mode dot11_auth_mode, enum auth_mode auth_mode, - enum crypto_type pairwise_crypto, + enum ath6kl_crypto_type pairwise_crypto, u8 pairwise_crypto_len, - enum crypto_type group_crypto, + enum ath6kl_crypto_type group_crypto, u8 group_crypto_len, int ssid_len, u8 *ssid, u8 *bssid, u16 channel, u32 ctrl_flags, u8 nw_subtype); @@ -2610,7 +2610,7 @@ int ath6kl_wmi_config_debug_module_cmd(struct wmi *wmi, u32 valid, u32 config); int ath6kl_wmi_get_stats_cmd(struct wmi *wmi, u8 if_idx); int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index, - enum crypto_type key_type, + enum ath6kl_crypto_type key_type, u8 key_usage, u8 key_len, u8 *key_rsc, unsigned int key_rsc_len, u8 *key_material, -- cgit From 1672ddb8d691e4433806373ec4104f37a86efab0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 3 Dec 2018 17:52:11 -0800 Subject: nvmet: Add install_queue callout nvmet-tcp will 
implement it to allocate queue commands, whose number (sq size) is only known at nvmf connect time. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd.c | 10 ++++++++++ drivers/nvme/target/nvmet.h | 1 + 2 files changed, 11 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 328ae46d8344..ee7d84621d65 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -121,6 +121,16 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) req->rsp->sq_head = cpu_to_le16(0xffff); } + if (ctrl->ops->install_queue) { + u16 ret = ctrl->ops->install_queue(req->sq); + + if (ret) { + pr_err("failed to install queue %d cntlid %d ret %x\n", + qid, ctrl->cntlid, ret); + return ret; + } + } + return 0; } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 7d8b7a7d572a..89df51ee5bdf 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -279,6 +279,7 @@ struct nvmet_fabrics_ops { void (*delete_ctrl)(struct nvmet_ctrl *ctrl); void (*disc_traddr)(struct nvmet_req *req, struct nvmet_port *port, char *traddr); + u16 (*install_queue)(struct nvmet_sq *nvme_sq); }; #define NVMET_MAX_INLINE_BIOVEC 8 -- cgit From 3b49fa807284ec30669a95fd5f3806e127d29f4d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 3 Dec 2018 17:52:12 -0800 Subject: nvme-fabrics: allow user passing header digest Header digest is an nvme-tcp specific feature, but nothing prevents other transports from reusing the concept, so do not associate it with the tcp transport solely. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 5 +++++ drivers/nvme/host/fabrics.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 10074ac7731b..4272f8a95db3 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -614,6 +614,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_HOST_ID, "hostid=%s" }, { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" }, + { NVMF_OPT_HDR_DIGEST, "hdr_digest" }, { NVMF_OPT_ERR, NULL } }; @@ -633,6 +634,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; opts->kato = NVME_DEFAULT_KATO; opts->duplicate_connect = false; + opts->hdr_digest = false; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -827,6 +829,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, case NVMF_OPT_DISABLE_SQFLOW: opts->disable_sqflow = true; break; + case NVMF_OPT_HDR_DIGEST: + opts->hdr_digest = true; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index ecd9a006a091..a6127f1a9e8e 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -59,6 +59,7 @@ enum { NVMF_OPT_HOST_ID = 1 << 12, NVMF_OPT_DUP_CONNECT = 1 << 13, NVMF_OPT_DISABLE_SQFLOW = 1 << 14, + NVMF_OPT_HDR_DIGEST = 1 << 15, }; /** @@ -103,6 +104,7 @@ struct nvmf_ctrl_options { struct nvmf_host *host; int max_reconnects; bool disable_sqflow; + bool hdr_digest; }; /* -- cgit From 20d44e86321299f61bb782a39aaa30f579823f58 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 3 Dec 2018 17:52:13 -0800 Subject: nvme-fabrics: allow user passing data digest Data digest is an nvme-tcp
specific feature, but nothing prevents other transports reusing the concept, so do not associate it solely with the tcp transport. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 5 +++++ drivers/nvme/host/fabrics.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 4272f8a95db3..9c62c6838b76 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -615,6 +615,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" }, { NVMF_OPT_HDR_DIGEST, "hdr_digest" }, + { NVMF_OPT_DATA_DIGEST, "data_digest" }, { NVMF_OPT_ERR, NULL } }; @@ -635,6 +636,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->kato = NVME_DEFAULT_KATO; opts->duplicate_connect = false; opts->hdr_digest = false; + opts->data_digest = false; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -832,6 +834,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, case NVMF_OPT_HDR_DIGEST: opts->hdr_digest = true; break; + case NVMF_OPT_DATA_DIGEST: + opts->data_digest = true; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a6127f1a9e8e..524a02a67817 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -60,6 +60,7 @@ enum { NVMF_OPT_DUP_CONNECT = 1 << 13, NVMF_OPT_DISABLE_SQFLOW = 1 << 14, NVMF_OPT_HDR_DIGEST = 1 << 15, + NVMF_OPT_DATA_DIGEST = 1 << 16, }; /** @@ -105,6 +106,7 @@ struct nvmf_ctrl_options { int max_reconnects; bool disable_sqflow; bool hdr_digest; + bool data_digest; }; /* -- cgit From 872d26a391da92ed8f0c0f5cb5fef428067b7f30 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 3 Dec 2018 17:52:15 -0800 Subject: nvmet-tcp: add NVMe over TCP target driver This patch implements the TCP transport driver for the NVMe over Fabrics target stack. This allows exporting NVMe over Fabrics functionality over good old TCP/IP. The driver implements TP 8000, which defines how NVMe over Fabrics capsules and data are encapsulated in nvme-tcp pdus and exchanged on top of a TCP byte stream. nvme-tcp header and data digests are supported as well. Signed-off-by: Sagi Grimberg Signed-off-by: Roy Shterman Signed-off-by: Solganik Alexander Signed-off-by: Christoph Hellwig --- drivers/nvme/target/Kconfig | 10 + drivers/nvme/target/Makefile | 2 + drivers/nvme/target/tcp.c | 1737 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1749 insertions(+) create mode 100644 drivers/nvme/target/tcp.c (limited to 'drivers') diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 3c7b61ddb0d1..d94f25cde019 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -60,3 +60,13 @@ config NVME_TARGET_FCLOOP to test NVMe-FC transport interfaces. If unsure, say N. + +config NVME_TARGET_TCP + tristate "NVMe over Fabrics TCP target support" + depends on INET + depends on NVME_TARGET + help + This enables the NVMe TCP target support, which allows exporting NVMe + devices over TCP. + + If unsure, say N. 
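For illustration of how the two digest options above are meant to be consumed: a transport that honors them translates the parsed booleans into the digest-enable bits of the NVMe/TCP initial connect request, which the target code below decodes in the opposite direction when accepting a connection. A minimal sketch, assuming the protocol constants from linux/nvme-tcp.h; the helper name is hypothetical and not part of the series:

/* Sketch: map the parsed fabrics options onto the icreq digest bits. */
static u8 example_icreq_digest_bits(const struct nvmf_ctrl_options *opts)
{
	u8 digest = 0;

	if (opts->hdr_digest)		/* "hdr_digest" connect option */
		digest |= NVME_TCP_HDR_DIGEST_ENABLE;
	if (opts->data_digest)		/* "data_digest" connect option */
		digest |= NVME_TCP_DATA_DIGEST_ENABLE;

	return digest;	/* placed in icreq->digest; echoed back in icresp */
}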
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index 8118c93391c6..8c3ad0fb6860 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o +obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ discovery.o io-cmd-file.o io-cmd-bdev.o @@ -12,3 +13,4 @@ nvme-loop-y += loop.o nvmet-rdma-y += rdma.o nvmet-fc-y += fc.o nvme-fcloop-y += fcloop.o +nvmet-tcp-y += tcp.o diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c new file mode 100644 index 000000000000..d31bec260160 --- /dev/null +++ b/drivers/nvme/target/tcp.c @@ -0,0 +1,1737 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics TCP target. + * Copyright (c) 2018 Lightbits Labs. All rights reserved. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvmet.h" + +#define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) + +#define NVMET_TCP_RECV_BUDGET 8 +#define NVMET_TCP_SEND_BUDGET 8 +#define NVMET_TCP_IO_WORK_BUDGET 64 + +enum nvmet_tcp_send_state { + NVMET_TCP_SEND_DATA_PDU, + NVMET_TCP_SEND_DATA, + NVMET_TCP_SEND_R2T, + NVMET_TCP_SEND_DDGST, + NVMET_TCP_SEND_RESPONSE +}; + +enum nvmet_tcp_recv_state { + NVMET_TCP_RECV_PDU, + NVMET_TCP_RECV_DATA, + NVMET_TCP_RECV_DDGST, + NVMET_TCP_RECV_ERR, +}; + +enum { + NVMET_TCP_F_INIT_FAILED = (1 << 0), +}; + +struct nvmet_tcp_cmd { + struct nvmet_tcp_queue *queue; + struct nvmet_req req; + + struct nvme_tcp_cmd_pdu *cmd_pdu; + struct nvme_tcp_rsp_pdu *rsp_pdu; + struct nvme_tcp_data_pdu *data_pdu; + struct nvme_tcp_r2t_pdu *r2t_pdu; + + u32 rbytes_done; + u32 wbytes_done; + + u32 pdu_len; + u32 pdu_recv; + int sg_idx; + int nr_mapped; + struct msghdr recv_msg; + struct kvec *iov; + u32 flags; + + struct list_head entry; + struct llist_node lentry; + + /* send state */ + u32 offset; + struct scatterlist *cur_sg; + enum nvmet_tcp_send_state state; + + __le32 exp_ddgst; + __le32 recv_ddgst; +}; + +enum nvmet_tcp_queue_state { + NVMET_TCP_Q_CONNECTING, + NVMET_TCP_Q_LIVE, + NVMET_TCP_Q_DISCONNECTING, +}; + +struct nvmet_tcp_queue { + struct socket *sock; + struct nvmet_tcp_port *port; + struct work_struct io_work; + int cpu; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + + /* send state */ + struct nvmet_tcp_cmd *cmds; + unsigned int nr_cmds; + struct list_head free_list; + struct llist_head resp_list; + struct list_head resp_send_list; + int send_list_len; + struct nvmet_tcp_cmd *snd_cmd; + + /* recv state */ + int offset; + int left; + enum nvmet_tcp_recv_state rcv_state; + struct nvmet_tcp_cmd *cmd; + union nvme_tcp_pdu pdu; + + /* digest state */ + bool hdr_digest; + bool data_digest; + struct ahash_request *snd_hash; + struct ahash_request *rcv_hash; + + spinlock_t state_lock; + enum nvmet_tcp_queue_state state; + + struct sockaddr_storage sockaddr; + struct sockaddr_storage sockaddr_peer; + struct work_struct release_work; + + int idx; + struct list_head queue_list; + + struct nvmet_tcp_cmd connect; + + struct page_frag_cache pf_cache; + + void (*data_ready)(struct sock *); + void (*state_change)(struct sock *); + void (*write_space)(struct sock *); +}; + +struct nvmet_tcp_port { + struct socket *sock; + struct work_struct accept_work; + struct nvmet_port *nport; 
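+ /* listen address this port is bound to; last_cpu rotates accepted queues across cpus; data_ready saves the socket's original callback for restore at port removal */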
+ struct sockaddr_storage addr; + int last_cpu; + void (*data_ready)(struct sock *); +}; + +static DEFINE_IDA(nvmet_tcp_queue_ida); +static LIST_HEAD(nvmet_tcp_queue_list); +static DEFINE_MUTEX(nvmet_tcp_queue_mutex); + +static struct workqueue_struct *nvmet_tcp_wq; +static struct nvmet_fabrics_ops nvmet_tcp_ops; +static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); +static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd); + +static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *cmd) +{ + return cmd - queue->cmds; +} + +static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd) +{ + return nvme_is_write(cmd->req.cmd) && + cmd->rbytes_done < cmd->req.transfer_len; +} + +static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd) +{ + return nvmet_tcp_has_data_in(cmd) && !cmd->req.rsp->status; +} + +static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd) +{ + return !nvme_is_write(cmd->req.cmd) && + cmd->req.transfer_len > 0 && + !cmd->req.rsp->status; +} + +static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd) +{ + return nvme_is_write(cmd->req.cmd) && cmd->pdu_len && + !cmd->rbytes_done; +} + +static inline struct nvmet_tcp_cmd * +nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd; + + cmd = list_first_entry_or_null(&queue->free_list, + struct nvmet_tcp_cmd, entry); + if (!cmd) + return NULL; + list_del_init(&cmd->entry); + + cmd->rbytes_done = cmd->wbytes_done = 0; + cmd->pdu_len = 0; + cmd->pdu_recv = 0; + cmd->iov = NULL; + cmd->flags = 0; + return cmd; +} + +static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd) +{ + if (unlikely(cmd == &cmd->queue->connect)) + return; + + list_add_tail(&cmd->entry, &cmd->queue->free_list); +} + +static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue) +{ + return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) +{ + return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline void nvmet_tcp_hdgst(struct ahash_request *hash, + void *pdu, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, pdu, len); + ahash_request_set_crypt(hash, &sg, pdu + len, len); + crypto_ahash_digest(hash); +} + +static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, + void *pdu, size_t len) +{ + struct nvme_tcp_hdr *hdr = pdu; + __le32 recv_digest; + __le32 exp_digest; + + if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { + pr_err("queue %d: header digest enabled but no header digest\n", + queue->idx); + return -EPROTO; + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); + nvmet_tcp_hdgst(queue->rcv_hash, pdu, len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + pr_err("queue %d: header digest error: recv %#x expected %#x\n", + queue->idx, le32_to_cpu(recv_digest), + le32_to_cpu(exp_digest)); + return -EPROTO; + } + + return 0; +} + +static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu) +{ + struct nvme_tcp_hdr *hdr = pdu; + u8 digest_len = nvmet_tcp_hdgst_len(queue); + u32 len; + + len = le32_to_cpu(hdr->plen) - hdr->hlen - + (hdr->flags & NVME_TCP_F_HDGST ? 
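+ /* a header digest, when present, trails the pdu header and is not payload */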
digest_len : 0); + + if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { + pr_err("queue %d: data digest flag is cleared\n", queue->idx); + return -EPROTO; + } + + return 0; +} + +static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd) +{ + struct scatterlist *sg; + int i; + + sg = &cmd->req.sg[cmd->sg_idx]; + + for (i = 0; i < cmd->nr_mapped; i++) + kunmap(sg_page(&sg[i])); +} + +static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) +{ + struct kvec *iov = cmd->iov; + struct scatterlist *sg; + u32 length, offset, sg_offset; + + length = cmd->pdu_len; + cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE); + offset = cmd->rbytes_done; + cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE); + sg_offset = offset % PAGE_SIZE; + sg = &cmd->req.sg[cmd->sg_idx]; + + while (length) { + u32 iov_len = min_t(u32, length, sg->length - sg_offset); + + iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset; + iov->iov_len = iov_len; + + length -= iov_len; + sg = sg_next(sg); + iov++; + } + + iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, + cmd->nr_mapped, cmd->pdu_len); +} + +static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) +{ + queue->rcv_state = NVMET_TCP_RECV_ERR; + if (queue->nvme_sq.ctrl) + nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); + else + kernel_sock_shutdown(queue->sock, SHUT_RDWR); +} + +static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl; + u32 len = le32_to_cpu(sgl->length); + + if (!cmd->req.data_len) + return 0; + + if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) | + NVME_SGL_FMT_OFFSET)) { + if (!nvme_is_write(cmd->req.cmd)) + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + + if (len > cmd->req.port->inline_data_size) + return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + cmd->pdu_len = len; + } + cmd->req.transfer_len += len; + + cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt); + if (!cmd->req.sg) + return NVME_SC_INTERNAL; + cmd->cur_sg = cmd->req.sg; + + if (nvmet_tcp_has_data_in(cmd)) { + cmd->iov = kmalloc_array(cmd->req.sg_cnt, + sizeof(*cmd->iov), GFP_KERNEL); + if (!cmd->iov) + goto err; + } + + return 0; +err: + sgl_free(cmd->req.sg); + return NVME_SC_INTERNAL; +} + +static void nvmet_tcp_ddgst(struct ahash_request *hash, + struct nvmet_tcp_cmd *cmd) +{ + ahash_request_set_crypt(hash, cmd->req.sg, + (void *)&cmd->exp_ddgst, cmd->req.transfer_len); + crypto_ahash_digest(hash); +} + +static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_data_pdu *pdu = cmd->data_pdu; + struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_DATA_PDU; + + pdu->hdr.type = nvme_tcp_c2h_data; + pdu->hdr.flags = NVME_TCP_F_DATA_LAST; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = pdu->hdr.hlen + hdgst; + pdu->hdr.plen = + cpu_to_le32(pdu->hdr.hlen + hdgst + + cmd->req.transfer_len + ddgst); + pdu->command_id = cmd->req.rsp->command_id; + pdu->data_length = cpu_to_le32(cmd->req.transfer_len); + pdu->data_offset = cpu_to_le32(cmd->wbytes_done); + + if (queue->data_digest) { + pdu->hdr.flags |= NVME_TCP_F_DDGST; + nvmet_tcp_ddgst(queue->snd_hash, cmd); + } + + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu; + struct nvmet_tcp_queue *queue 
= cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_R2T; + + pdu->hdr.type = nvme_tcp_r2t; + pdu->hdr.flags = 0; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = 0; + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + + pdu->command_id = cmd->req.cmd->common.command_id; + pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd); + pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done); + pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu; + struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_RESPONSE; + + pdu->hdr.type = nvme_tcp_rsp; + pdu->hdr.flags = 0; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = 0; + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue) +{ + struct llist_node *node; + + node = llist_del_all(&queue->resp_list); + if (!node) + return; + + while (node) { + struct nvmet_tcp_cmd *cmd = llist_entry(node, + struct nvmet_tcp_cmd, lentry); + + list_add(&cmd->entry, &queue->resp_send_list); + node = node->next; + queue->send_list_len++; + } +} + +static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue) +{ + queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list, + struct nvmet_tcp_cmd, entry); + if (!queue->snd_cmd) { + nvmet_tcp_process_resp_list(queue); + queue->snd_cmd = + list_first_entry_or_null(&queue->resp_send_list, + struct nvmet_tcp_cmd, entry); + if (unlikely(!queue->snd_cmd)) + return NULL; + } + + list_del_init(&queue->snd_cmd->entry); + queue->send_list_len--; + + if (nvmet_tcp_need_data_out(queue->snd_cmd)) + nvmet_setup_c2h_data_pdu(queue->snd_cmd); + else if (nvmet_tcp_need_data_in(queue->snd_cmd)) + nvmet_setup_r2t_pdu(queue->snd_cmd); + else + nvmet_setup_response_pdu(queue->snd_cmd); + + return queue->snd_cmd; +} + +static void nvmet_tcp_queue_response(struct nvmet_req *req) +{ + struct nvmet_tcp_cmd *cmd = + container_of(req, struct nvmet_tcp_cmd, req); + struct nvmet_tcp_queue *queue = cmd->queue; + + llist_add(&cmd->lentry, &queue->resp_list); + queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work); +} + +static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst; + int ret; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu), + offset_in_page(cmd->data_pdu) + cmd->offset, + left, MSG_DONTWAIT | MSG_MORE); + if (ret <= 0) + return ret; + + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + cmd->state = NVMET_TCP_SEND_DATA; + cmd->offset = 0; + return 1; +} + +static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd) +{ + struct nvmet_tcp_queue *queue = cmd->queue; + int ret; + + while (cmd->cur_sg) { + struct page *page = sg_page(cmd->cur_sg); + u32 left = cmd->cur_sg->length - cmd->offset; + + ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset, + left, MSG_DONTWAIT | MSG_MORE); + if (ret <= 0) + return ret; + + cmd->offset += ret; + cmd->wbytes_done += 
ret; + + /* Done with sg?*/ + if (cmd->offset == cmd->cur_sg->length) { + cmd->cur_sg = sg_next(cmd->cur_sg); + cmd->offset = 0; + } + } + + if (queue->data_digest) { + cmd->state = NVMET_TCP_SEND_DDGST; + cmd->offset = 0; + } else { + nvmet_setup_response_pdu(cmd); + } + return 1; + +} + +static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd, + bool last_in_batch) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst; + int flags = MSG_DONTWAIT; + int ret; + + if (!last_in_batch && cmd->queue->send_list_len) + flags |= MSG_MORE; + else + flags |= MSG_EOR; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu), + offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags); + if (ret <= 0) + return ret; + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + kfree(cmd->iov); + sgl_free(cmd->req.sg); + cmd->queue->snd_cmd = NULL; + nvmet_tcp_put_cmd(cmd); + return 1; +} + +static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst; + int flags = MSG_DONTWAIT; + int ret; + + if (!last_in_batch && cmd->queue->send_list_len) + flags |= MSG_MORE; + else + flags |= MSG_EOR; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu), + offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags); + if (ret <= 0) + return ret; + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + cmd->queue->snd_cmd = NULL; + return 1; +} + +static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) +{ + struct nvmet_tcp_queue *queue = cmd->queue; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + struct kvec iov = { + .iov_base = &cmd->exp_ddgst + cmd->offset, + .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset + }; + int ret; + + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (unlikely(ret <= 0)) + return ret; + + cmd->offset += ret; + nvmet_setup_response_pdu(cmd); + return 1; +} + +static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue, + bool last_in_batch) +{ + struct nvmet_tcp_cmd *cmd = queue->snd_cmd; + int ret = 0; + + if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) { + cmd = nvmet_tcp_fetch_cmd(queue); + if (unlikely(!cmd)) + return 0; + } + + if (cmd->state == NVMET_TCP_SEND_DATA_PDU) { + ret = nvmet_try_send_data_pdu(cmd); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_DATA) { + ret = nvmet_try_send_data(cmd); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_DDGST) { + ret = nvmet_try_send_ddgst(cmd); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_R2T) { + ret = nvmet_try_send_r2t(cmd, last_in_batch); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_RESPONSE) + ret = nvmet_try_send_response(cmd, last_in_batch); + +done_send: + if (ret < 0) { + if (ret == -EAGAIN) + return 0; + return ret; + } + + return 1; +} + +static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue, + int budget, int *sends) +{ + int i, ret = 0; + + for (i = 0; i < budget; i++) { + ret = nvmet_tcp_try_send_one(queue, i == budget - 1); + if (ret <= 0) + break; + (*sends)++; + } + + return ret; +} + +static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) +{ + queue->offset = 0; + queue->left = sizeof(struct nvme_tcp_hdr); + queue->cmd = NULL; + queue->rcv_state = NVMET_TCP_RECV_PDU; +} + +static void nvmet_tcp_free_crypto(struct 
nvmet_tcp_queue *queue) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); + + ahash_request_free(queue->rcv_hash); + ahash_request_free(queue->snd_hash); + crypto_free_ahash(tfm); +} + +static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue) +{ + struct crypto_ahash *tfm; + + tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->snd_hash) + goto free_tfm; + ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); + + queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->rcv_hash) + goto free_snd_hash; + ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); + + return 0; +free_snd_hash: + ahash_request_free(queue->snd_hash); +free_tfm: + crypto_free_ahash(tfm); + return -ENOMEM; +} + + +static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq; + struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp; + struct msghdr msg = {}; + struct kvec iov; + int ret; + + if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) { + pr_err("bad nvme-tcp pdu length (%d)\n", + le32_to_cpu(icreq->hdr.plen)); + nvmet_tcp_fatal_error(queue); + } + + if (icreq->pfv != NVME_TCP_PFV_1_0) { + pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv); + return -EPROTO; + } + + if (icreq->hpda != 0) { + pr_err("queue %d: unsupported hpda %d\n", queue->idx, + icreq->hpda); + return -EPROTO; + } + + if (icreq->maxr2t != 0) { + pr_err("queue %d: unsupported maxr2t %d\n", queue->idx, + le16_to_cpu(icreq->maxr2t) + 1); + return -EPROTO; + } + + queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); + queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); + if (queue->hdr_digest || queue->data_digest) { + ret = nvmet_tcp_alloc_crypto(queue); + if (ret) + return ret; + } + + memset(icresp, 0, sizeof(*icresp)); + icresp->hdr.type = nvme_tcp_icresp; + icresp->hdr.hlen = sizeof(*icresp); + icresp->hdr.pdo = 0; + icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); + icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); + icresp->maxdata = 0xffff; /* FIXME: support r2t */ + icresp->cpda = 0; + if (queue->hdr_digest) + icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; + if (queue->data_digest) + icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE; + + iov.iov_base = icresp; + iov.iov_len = sizeof(*icresp); + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (ret < 0) + goto free_crypto; + + queue->state = NVMET_TCP_Q_LIVE; + nvmet_prepare_receive_pdu(queue); + return 0; +free_crypto: + if (queue->hdr_digest || queue->data_digest) + nvmet_tcp_free_crypto(queue); + return ret; +} + +static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *cmd, struct nvmet_req *req) +{ + int ret; + + /* recover the expected data transfer length */ + req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length); + + if (!nvme_is_write(cmd->req.cmd) || + req->data_len > cmd->req.port->inline_data_size) { + nvmet_prepare_receive_pdu(queue); + return; + } + + ret = nvmet_tcp_map_data(cmd); + if (unlikely(ret)) { + pr_err("queue %d: failed to map data\n", queue->idx); + nvmet_tcp_fatal_error(queue); + return; + } + + queue->rcv_state = NVMET_TCP_RECV_DATA; + nvmet_tcp_map_pdu_iovec(cmd); + cmd->flags |= NVMET_TCP_F_INIT_FAILED; +} + +static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_data_pdu *data 
= &queue->pdu.data; + struct nvmet_tcp_cmd *cmd; + + cmd = &queue->cmds[data->ttag]; + + if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { + pr_err("ttag %u unexpected data offset %u (expected %u)\n", + data->ttag, le32_to_cpu(data->data_offset), + cmd->rbytes_done); + /* FIXME: use path and transport errors */ + nvmet_req_complete(&cmd->req, + NVME_SC_INVALID_FIELD | NVME_SC_DNR); + return -EPROTO; + } + + cmd->pdu_len = le32_to_cpu(data->data_length); + cmd->pdu_recv = 0; + nvmet_tcp_map_pdu_iovec(cmd); + queue->cmd = cmd; + queue->rcv_state = NVMET_TCP_RECV_DATA; + + return 0; +} + +static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; + struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd; + struct nvmet_req *req; + int ret; + + if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { + if (hdr->type != nvme_tcp_icreq) { + pr_err("unexpected pdu type (%d) before icreq\n", + hdr->type); + nvmet_tcp_fatal_error(queue); + return -EPROTO; + } + return nvmet_tcp_handle_icreq(queue); + } + + if (hdr->type == nvme_tcp_h2c_data) { + ret = nvmet_tcp_handle_h2c_data_pdu(queue); + if (unlikely(ret)) + return ret; + return 0; + } + + queue->cmd = nvmet_tcp_get_cmd(queue); + if (unlikely(!queue->cmd)) { + /* This should never happen */ + pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d", + queue->idx, queue->nr_cmds, queue->send_list_len, + nvme_cmd->common.opcode); + nvmet_tcp_fatal_error(queue); + return -ENOMEM; + } + + req = &queue->cmd->req; + memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd)); + + if (unlikely(!nvmet_req_init(req, &queue->nvme_cq, + &queue->nvme_sq, &nvmet_tcp_ops))) { + pr_err("failed cmd %p id %d opcode %d, data_len: %d\n", + req->cmd, req->cmd->common.command_id, + req->cmd->common.opcode, + le32_to_cpu(req->cmd->common.dptr.sgl.length)); + + nvmet_tcp_handle_req_failure(queue, queue->cmd, req); + return -EAGAIN; + } + + ret = nvmet_tcp_map_data(queue->cmd); + if (unlikely(ret)) { + pr_err("queue %d: failed to map data\n", queue->idx); + if (nvmet_tcp_has_inline_data(queue->cmd)) + nvmet_tcp_fatal_error(queue); + else + nvmet_req_complete(req, ret); + ret = -EAGAIN; + goto out; + } + + if (nvmet_tcp_need_data_in(queue->cmd)) { + if (nvmet_tcp_has_inline_data(queue->cmd)) { + queue->rcv_state = NVMET_TCP_RECV_DATA; + nvmet_tcp_map_pdu_iovec(queue->cmd); + return 0; + } + /* send back R2T */ + nvmet_tcp_queue_response(&queue->cmd->req); + goto out; + } + + nvmet_req_execute(&queue->cmd->req); +out: + nvmet_prepare_receive_pdu(queue); + return ret; +} + +static const u8 nvme_tcp_pdu_sizes[] = { + [nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu), + [nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu), + [nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu), +}; + +static inline u8 nvmet_tcp_pdu_size(u8 type) +{ + size_t idx = type; + + return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) && + nvme_tcp_pdu_sizes[idx]) ? 
+ nvme_tcp_pdu_sizes[idx] : 0; +} + +static inline bool nvmet_tcp_pdu_valid(u8 type) +{ + switch (type) { + case nvme_tcp_icreq: + case nvme_tcp_cmd: + case nvme_tcp_h2c_data: + /* fallthru */ + return true; + } + + return false; +} + +static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; + int len; + struct kvec iov; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + +recv: + iov.iov_base = (void *)&queue->pdu + queue->offset; + iov.iov_len = queue->left; + len = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (unlikely(len < 0)) + return len; + + queue->offset += len; + queue->left -= len; + if (queue->left) + return -EAGAIN; + + if (queue->offset == sizeof(struct nvme_tcp_hdr)) { + u8 hdgst = nvmet_tcp_hdgst_len(queue); + + if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) { + pr_err("unexpected pdu type %d\n", hdr->type); + nvmet_tcp_fatal_error(queue); + return -EIO; + } + + if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) { + pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen); + return -EIO; + } + + queue->left = hdr->hlen - queue->offset + hdgst; + goto recv; + } + + if (queue->hdr_digest && + nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) { + nvmet_tcp_fatal_error(queue); /* fatal */ + return -EPROTO; + } + + if (queue->data_digest && + nvmet_tcp_check_ddgst(queue, &queue->pdu)) { + nvmet_tcp_fatal_error(queue); /* fatal */ + return -EPROTO; + } + + return nvmet_tcp_done_recv_pdu(queue); +} + +static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) +{ + struct nvmet_tcp_queue *queue = cmd->queue; + + nvmet_tcp_ddgst(queue->rcv_hash, cmd); + queue->offset = 0; + queue->left = NVME_TCP_DIGEST_LENGTH; + queue->rcv_state = NVMET_TCP_RECV_DDGST; +} + +static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmd; + int ret; + + while (msg_data_left(&cmd->recv_msg)) { + ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, + cmd->recv_msg.msg_flags); + if (ret <= 0) + return ret; + + cmd->pdu_recv += ret; + cmd->rbytes_done += ret; + } + + nvmet_tcp_unmap_pdu_iovec(cmd); + + if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && + cmd->rbytes_done == cmd->req.transfer_len) { + if (queue->data_digest) { + nvmet_tcp_prep_recv_ddgst(cmd); + return 0; + } + nvmet_req_execute(&cmd->req); + } + + nvmet_prepare_receive_pdu(queue); + return 0; +} + +static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmd; + int ret; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + struct kvec iov = { + .iov_base = (void *)&cmd->recv_ddgst + queue->offset, + .iov_len = queue->left + }; + + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (unlikely(ret < 0)) + return ret; + + queue->offset += ret; + queue->left -= ret; + if (queue->left) + return -EAGAIN; + + if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) { + pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n", + queue->idx, cmd->req.cmd->common.command_id, + queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), + le32_to_cpu(cmd->exp_ddgst)); + nvmet_tcp_finish_cmd(cmd); + nvmet_tcp_fatal_error(queue); + ret = -EPROTO; + goto out; + } + + if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && + cmd->rbytes_done == cmd->req.transfer_len) + nvmet_req_execute(&cmd->req); + ret = 0; +out: + nvmet_prepare_receive_pdu(queue); + return ret; +} + +static int 
nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue) +{ + int result; + + if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR)) + return 0; + + if (queue->rcv_state == NVMET_TCP_RECV_PDU) { + result = nvmet_tcp_try_recv_pdu(queue); + if (result != 0) + goto done_recv; + } + + if (queue->rcv_state == NVMET_TCP_RECV_DATA) { + result = nvmet_tcp_try_recv_data(queue); + if (result != 0) + goto done_recv; + } + + if (queue->rcv_state == NVMET_TCP_RECV_DDGST) { + result = nvmet_tcp_try_recv_ddgst(queue); + if (result != 0) + goto done_recv; + } + +done_recv: + if (result < 0) { + if (result == -EAGAIN) + return 0; + return result; + } + return 1; +} + +static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue, + int budget, int *recvs) +{ + int i, ret = 0; + + for (i = 0; i < budget; i++) { + ret = nvmet_tcp_try_recv_one(queue); + if (ret <= 0) + break; + (*recvs)++; + } + + return ret; +} + +static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue) +{ + spin_lock(&queue->state_lock); + if (queue->state != NVMET_TCP_Q_DISCONNECTING) { + queue->state = NVMET_TCP_Q_DISCONNECTING; + schedule_work(&queue->release_work); + } + spin_unlock(&queue->state_lock); +} + +static void nvmet_tcp_io_work(struct work_struct *w) +{ + struct nvmet_tcp_queue *queue = + container_of(w, struct nvmet_tcp_queue, io_work); + bool pending; + int ret, ops = 0; + + do { + pending = false; + + ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops); + if (ret > 0) { + pending = true; + } else if (ret < 0) { + if (ret == -EPIPE || ret == -ECONNRESET) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + else + nvmet_tcp_fatal_error(queue); + return; + } + + ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops); + if (ret > 0) { + /* transmitted message/data */ + pending = true; + } else if (ret < 0) { + if (ret == -EPIPE || ret == -ECONNRESET) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + else + nvmet_tcp_fatal_error(queue); + return; + } + + } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET); + + /* + * We exhausted our budget, requeue ourselves + */ + if (pending) + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); +} + +static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *c) +{ + u8 hdgst = nvmet_tcp_hdgst_len(queue); + + c->queue = queue; + c->req.port = queue->port->nport; + + c->cmd_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->cmd_pdu) + return -ENOMEM; + c->req.cmd = &c->cmd_pdu->cmd; + + c->rsp_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->rsp_pdu) + goto out_free_cmd; + c->req.rsp = &c->rsp_pdu->cqe; + + c->data_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->data_pdu) + goto out_free_rsp; + + c->r2t_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->r2t_pdu) + goto out_free_data; + + c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + + list_add_tail(&c->entry, &queue->free_list); + + return 0; +out_free_data: + page_frag_free(c->data_pdu); +out_free_rsp: + page_frag_free(c->rsp_pdu); +out_free_cmd: + page_frag_free(c->cmd_pdu); + return -ENOMEM; +} + +static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c) +{ + page_frag_free(c->r2t_pdu); + page_frag_free(c->data_pdu); + page_frag_free(c->rsp_pdu); + page_frag_free(c->cmd_pdu); +} + +static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue) +{ + 
struct nvmet_tcp_cmd *cmds; + int i, ret = -EINVAL, nr_cmds = queue->nr_cmds; + + cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL); + if (!cmds) + goto out; + + for (i = 0; i < nr_cmds; i++) { + ret = nvmet_tcp_alloc_cmd(queue, cmds + i); + if (ret) + goto out_free; + } + + queue->cmds = cmds; + + return 0; +out_free: + while (--i >= 0) + nvmet_tcp_free_cmd(cmds + i); + kfree(cmds); +out: + return ret; +} + +static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmds = queue->cmds; + int i; + + for (i = 0; i < queue->nr_cmds; i++) + nvmet_tcp_free_cmd(cmds + i); + + nvmet_tcp_free_cmd(&queue->connect); + kfree(cmds); +} + +static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_data_ready = queue->data_ready; + sock->sk->sk_state_change = queue->state_change; + sock->sk->sk_write_space = queue->write_space; + sock->sk->sk_user_data = NULL; + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) +{ + nvmet_req_uninit(&cmd->req); + nvmet_tcp_unmap_pdu_iovec(cmd); + sgl_free(cmd->req.sg); +} + +static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmds; + int i; + + for (i = 0; i < queue->nr_cmds; i++, cmd++) { + if (nvmet_tcp_need_data_in(cmd)) + nvmet_tcp_finish_cmd(cmd); + } + + if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) { + /* failed in connect */ + nvmet_tcp_finish_cmd(&queue->connect); + } +} + +static void nvmet_tcp_release_queue_work(struct work_struct *w) +{ + struct nvmet_tcp_queue *queue = + container_of(w, struct nvmet_tcp_queue, release_work); + + mutex_lock(&nvmet_tcp_queue_mutex); + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + + nvmet_tcp_restore_socket_callbacks(queue); + flush_work(&queue->io_work); + + nvmet_tcp_uninit_data_in_cmds(queue); + nvmet_sq_destroy(&queue->nvme_sq); + cancel_work_sync(&queue->io_work); + sock_release(queue->sock); + nvmet_tcp_free_cmds(queue); + if (queue->hdr_digest || queue->data_digest) + nvmet_tcp_free_crypto(queue); + ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); + + kfree(queue); +} + +static void nvmet_tcp_data_ready(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue)) + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvmet_tcp_write_space(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (unlikely(!queue)) + goto out; + + if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { + queue->write_space(sk); + goto out; + } + + if (sk_stream_is_writeable(sk)) { + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + } +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvmet_tcp_state_change(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + write_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (!queue) + goto done; + + switch (sk->sk_state) { + case TCP_FIN_WAIT1: + case TCP_CLOSE_WAIT: + case TCP_CLOSE: + /* FALLTHRU */ + sk->sk_user_data = NULL; + nvmet_tcp_schedule_release_queue(queue); + break; + default: + pr_warn("queue %d unhandled state %d\n", + queue->idx, 
sk->sk_state); + } +done: + write_unlock_bh(&sk->sk_callback_lock); +} + +static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + struct linger sol = { .l_onoff = 1, .l_linger = 0 }; + int ret; + + ret = kernel_getsockname(sock, + (struct sockaddr *)&queue->sockaddr); + if (ret < 0) + return ret; + + ret = kernel_getpeername(sock, + (struct sockaddr *)&queue->sockaddr_peer); + if (ret < 0) + return ret; + + /* + * Cleanup whatever is sitting in the TCP transmit queue on socket + * close. This is done to prevent stale data from being sent should + * the network connection be restored before TCP times out. + */ + ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&sol, sizeof(sol)); + if (ret) + return ret; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = queue; + queue->data_ready = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = nvmet_tcp_data_ready; + queue->state_change = sock->sk->sk_state_change; + sock->sk->sk_state_change = nvmet_tcp_state_change; + queue->write_space = sock->sk->sk_write_space; + sock->sk->sk_write_space = nvmet_tcp_write_space; + write_unlock_bh(&sock->sk->sk_callback_lock); + + return 0; +} + +static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, + struct socket *newsock) +{ + struct nvmet_tcp_queue *queue; + int ret; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return -ENOMEM; + + INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work); + INIT_WORK(&queue->io_work, nvmet_tcp_io_work); + queue->sock = newsock; + queue->port = port; + queue->nr_cmds = 0; + spin_lock_init(&queue->state_lock); + queue->state = NVMET_TCP_Q_CONNECTING; + INIT_LIST_HEAD(&queue->free_list); + init_llist_head(&queue->resp_list); + INIT_LIST_HEAD(&queue->resp_send_list); + + queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL); + if (queue->idx < 0) { + ret = queue->idx; + goto out_free_queue; + } + + ret = nvmet_tcp_alloc_cmd(queue, &queue->connect); + if (ret) + goto out_ida_remove; + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) + goto out_free_connect; + + port->last_cpu = cpumask_next_wrap(port->last_cpu, + cpu_online_mask, -1, false); + queue->cpu = port->last_cpu; + nvmet_prepare_receive_pdu(queue); + + mutex_lock(&nvmet_tcp_queue_mutex); + list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + + ret = nvmet_tcp_set_queue_sock(queue); + if (ret) + goto out_destroy_sq; + + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + + return 0; +out_destroy_sq: + mutex_lock(&nvmet_tcp_queue_mutex); + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + nvmet_sq_destroy(&queue->nvme_sq); +out_free_connect: + nvmet_tcp_free_cmd(&queue->connect); +out_ida_remove: + ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); +out_free_queue: + kfree(queue); + return ret; +} + +static void nvmet_tcp_accept_work(struct work_struct *w) +{ + struct nvmet_tcp_port *port = + container_of(w, struct nvmet_tcp_port, accept_work); + struct socket *newsock; + int ret; + + while (true) { + ret = kernel_accept(port->sock, &newsock, O_NONBLOCK); + if (ret < 0) { + if (ret != -EAGAIN) + pr_warn("failed to accept err=%d\n", ret); + return; + } + ret = nvmet_tcp_alloc_queue(port, newsock); + if (ret) { + pr_err("failed to allocate queue\n"); + sock_release(newsock); + } + } +} + +static void nvmet_tcp_listen_data_ready(struct sock *sk) +{ + struct nvmet_tcp_port *port; + + 
read_lock_bh(&sk->sk_callback_lock); + port = sk->sk_user_data; + if (!port) + goto out; + + if (sk->sk_state == TCP_LISTEN) + schedule_work(&port->accept_work); +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static int nvmet_tcp_add_port(struct nvmet_port *nport) +{ + struct nvmet_tcp_port *port; + __kernel_sa_family_t af; + int opt, ret; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + switch (nport->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; + break; + default: + pr_err("address family %d not supported\n", + nport->disc_addr.adrfam); + ret = -EINVAL; + goto err_port; + } + + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, + nport->disc_addr.trsvcid, &port->addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + nport->disc_addr.traddr, nport->disc_addr.trsvcid); + goto err_port; + } + + port->nport = nport; + port->last_cpu = -1; + INIT_WORK(&port->accept_work, nvmet_tcp_accept_work); + if (port->nport->inline_data_size < 0) + port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE; + + ret = sock_create(port->addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &port->sock); + if (ret) { + pr_err("failed to create a socket\n"); + goto err_port; + } + + port->sock->sk->sk_user_data = port; + port->data_ready = port->sock->sk->sk_data_ready; + port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; + + opt = 1; + ret = kernel_setsockopt(port->sock, IPPROTO_TCP, + TCP_NODELAY, (char *)&opt, sizeof(opt)); + if (ret) { + pr_err("failed to set TCP_NODELAY sock opt %d\n", ret); + goto err_sock; + } + + ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&opt, sizeof(opt)); + if (ret) { + pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret); + goto err_sock; + } + + ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, + sizeof(port->addr)); + if (ret) { + pr_err("failed to bind port socket %d\n", ret); + goto err_sock; + } + + ret = kernel_listen(port->sock, 128); + if (ret) { + pr_err("failed to listen %d on port sock\n", ret); + goto err_sock; + } + + nport->priv = port; + pr_info("enabling port %d (%pISpc)\n", + le16_to_cpu(nport->disc_addr.portid), &port->addr); + + return 0; + +err_sock: + sock_release(port->sock); +err_port: + kfree(port); + return ret; +} + +static void nvmet_tcp_remove_port(struct nvmet_port *nport) +{ + struct nvmet_tcp_port *port = nport->priv; + + write_lock_bh(&port->sock->sk->sk_callback_lock); + port->sock->sk->sk_data_ready = port->data_ready; + port->sock->sk->sk_user_data = NULL; + write_unlock_bh(&port->sock->sk->sk_callback_lock); + cancel_work_sync(&port->accept_work); + + sock_release(port->sock); + kfree(port); +} + +static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_tcp_queue *queue; + + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) + if (queue->nvme_sq.ctrl == ctrl) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + mutex_unlock(&nvmet_tcp_queue_mutex); +} + +static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq) +{ + struct nvmet_tcp_queue *queue = + container_of(sq, struct nvmet_tcp_queue, nvme_sq); + + if (sq->qid == 0) { + /* Let inflight controller teardown complete */ + flush_scheduled_work(); + } + + queue->nr_cmds = sq->size * 2; + if (nvmet_tcp_alloc_cmds(queue)) + return NVME_SC_INTERNAL; + return 0; +} + +static void nvmet_tcp_disc_port_addr(struct nvmet_req *req, + struct nvmet_port 
*nport, char *traddr) +{ + struct nvmet_tcp_port *port = nport->priv; + + if (inet_addr_is_any((struct sockaddr *)&port->addr)) { + struct nvmet_tcp_cmd *cmd = + container_of(req, struct nvmet_tcp_cmd, req); + struct nvmet_tcp_queue *queue = cmd->queue; + + sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr); + } else { + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); + } +} + +static struct nvmet_fabrics_ops nvmet_tcp_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_TCP, + .msdbd = 1, + .has_keyed_sgls = 0, + .add_port = nvmet_tcp_add_port, + .remove_port = nvmet_tcp_remove_port, + .queue_response = nvmet_tcp_queue_response, + .delete_ctrl = nvmet_tcp_delete_ctrl, + .install_queue = nvmet_tcp_install_queue, + .disc_traddr = nvmet_tcp_disc_port_addr, +}; + +static int __init nvmet_tcp_init(void) +{ + int ret; + + nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0); + if (!nvmet_tcp_wq) + return -ENOMEM; + + ret = nvmet_register_transport(&nvmet_tcp_ops); + if (ret) + goto err; + + return 0; +err: + destroy_workqueue(nvmet_tcp_wq); + return ret; +} + +static void __exit nvmet_tcp_exit(void) +{ + struct nvmet_tcp_queue *queue; + + nvmet_unregister_transport(&nvmet_tcp_ops); + + flush_scheduled_work(); + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + mutex_unlock(&nvmet_tcp_queue_mutex); + flush_scheduled_work(); + + destroy_workqueue(nvmet_tcp_wq); +} + +module_init(nvmet_tcp_init); +module_exit(nvmet_tcp_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */ -- cgit From ad4f530e95a7b88a332035b9e0c5384441356576 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 3 Dec 2018 17:52:16 -0800 Subject: nvmet: allow configfs tcp trtype configuration Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index db2cb64be7ba..618bbd006544 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -34,6 +34,7 @@ static const struct nvmet_transport_name { } nvmet_transport_names[] = { { NVMF_TRTYPE_RDMA, "rdma" }, { NVMF_TRTYPE_FC, "fc" }, + { NVMF_TRTYPE_TCP, "tcp" }, { NVMF_TRTYPE_LOOP, "loop" }, }; -- cgit From 3f2304f8c6d6ed97849057bd16fee99e434ca796 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 3 Dec 2018 17:52:17 -0800 Subject: nvme-tcp: add NVMe over TCP host driver This patch implements the NVMe over TCP host driver. It can be used to connect to remote NVMe over Fabrics subsystems over good old TCP/IP. The driver implements TP 8000, which defines how NVMe over Fabrics capsules and data are encapsulated in nvme-tcp pdus and exchanged on top of a TCP byte stream. nvme-tcp header and data digests are supported as well. To connect to all NVMe over Fabrics controllers reachable on a given target port over TCP, use the following command: nvme connect-all -t tcp -a $IPADDR This requires the latest version of nvme-cli with TCP support. 
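As a rough illustration of the PDU framing both of these drivers implement (a sketch only; the function name is hypothetical and not part of the patch): every nvme-tcp PDU carries a header, an optional header digest, an optional payload, and an optional payload digest, so its on-wire length is computed as below, matching the hdr.plen arithmetic used throughout the code.

/* Sketch: on-wire length of an nvme-tcp PDU with "payload" data bytes. */
static u32 example_pdu_plen(u8 hlen, u32 payload, bool hdgst, bool ddgst)
{
	u32 plen = hlen + payload;

	if (hdgst)
		plen += NVME_TCP_DIGEST_LENGTH;	/* CRC32C over the pdu header */
	if (ddgst && payload)
		plen += NVME_TCP_DIGEST_LENGTH;	/* CRC32C over the payload */

	return plen;
}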
Signed-off-by: Sagi Grimberg Signed-off-by: Roy Shterman Signed-off-by: Solganik Alexander Signed-off-by: Christoph Hellwig --- drivers/nvme/host/Kconfig | 15 + drivers/nvme/host/Makefile | 3 + drivers/nvme/host/tcp.c | 2242 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 2260 insertions(+) create mode 100644 drivers/nvme/host/tcp.c (limited to 'drivers') diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 88a8b5916624..0f345e207675 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -57,3 +57,18 @@ config NVME_FC from https://github.com/linux-nvme/nvme-cli. If unsure, say N. + +config NVME_TCP + tristate "NVM Express over Fabrics TCP host driver" + depends on INET + depends on BLK_DEV_NVME + select NVME_FABRICS + help + This provides support for the NVMe over Fabrics protocol using + the TCP transport. This allows you to use remote block devices + exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index aea459c65ae1..8a4b671c5f0c 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BLK_DEV_NVME) += nvme.o obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o +obj-$(CONFIG_NVME_TCP) += nvme-tcp.o nvme-core-y := core.o nvme-core-$(CONFIG_TRACING) += trace.o @@ -21,3 +22,5 @@ nvme-fabrics-y += fabrics.o nvme-rdma-y += rdma.o nvme-fc-y += fc.o + +nvme-tcp-y += tcp.o diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c new file mode 100644 index 000000000000..15543358e245 --- /dev/null +++ b/drivers/nvme/host/tcp.c @@ -0,0 +1,2242 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics TCP host. + * Copyright (c) 2018 Lightbits Labs. All rights reserved. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvme.h" +#include "fabrics.h" + +struct nvme_tcp_queue; + +enum nvme_tcp_send_state { + NVME_TCP_SEND_CMD_PDU = 0, + NVME_TCP_SEND_H2C_PDU, + NVME_TCP_SEND_DATA, + NVME_TCP_SEND_DDGST, +}; + +struct nvme_tcp_request { + struct nvme_request req; + void *pdu; + struct nvme_tcp_queue *queue; + u32 data_len; + u32 pdu_len; + u32 pdu_sent; + u16 ttag; + struct list_head entry; + u32 ddgst; + + struct bio *curr_bio; + struct iov_iter iter; + + /* send state */ + size_t offset; + size_t data_sent; + enum nvme_tcp_send_state state; +}; + +enum nvme_tcp_queue_flags { + NVME_TCP_Q_ALLOCATED = 0, + NVME_TCP_Q_LIVE = 1, +}; + +enum nvme_tcp_recv_state { + NVME_TCP_RECV_PDU = 0, + NVME_TCP_RECV_DATA, + NVME_TCP_RECV_DDGST, +}; + +struct nvme_tcp_ctrl; +struct nvme_tcp_queue { + struct socket *sock; + struct work_struct io_work; + int io_cpu; + + spinlock_t lock; + struct list_head send_list; + + /* recv state */ + void *pdu; + int pdu_remaining; + int pdu_offset; + size_t data_remaining; + size_t ddgst_remaining; + + /* send state */ + struct nvme_tcp_request *request; + + int queue_size; + size_t cmnd_capsule_len; + struct nvme_tcp_ctrl *ctrl; + unsigned long flags; + bool rd_enabled; + + bool hdr_digest; + bool data_digest; + struct ahash_request *rcv_hash; + struct ahash_request *snd_hash; + __le32 exp_ddgst; + __le32 recv_ddgst; + + struct page_frag_cache pf_cache; + + void (*state_change)(struct sock *); + void (*data_ready)(struct sock *); + void (*write_space)(struct sock *); +}; + +struct nvme_tcp_ctrl { + /* read only in the hot path */ + struct nvme_tcp_queue *queues; + struct blk_mq_tag_set tag_set; + + /* other member variables */ + struct list_head list; + struct blk_mq_tag_set admin_tag_set; + struct sockaddr_storage addr; + struct sockaddr_storage src_addr; + struct nvme_ctrl ctrl; + + struct work_struct err_work; + struct delayed_work connect_work; + struct nvme_tcp_request async_req; +}; + +static LIST_HEAD(nvme_tcp_ctrl_list); +static DEFINE_MUTEX(nvme_tcp_ctrl_mutex); +static struct workqueue_struct *nvme_tcp_wq; +static struct blk_mq_ops nvme_tcp_mq_ops; +static struct blk_mq_ops nvme_tcp_admin_mq_ops; + +static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_tcp_ctrl, ctrl); +} + +static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) +{ + u32 queue_idx = nvme_tcp_queue_id(queue); + + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; +} + +static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue) +{ + return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue) +{ + return queue->data_digest ? 
NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue) +{ + return queue->cmnd_capsule_len - sizeof(struct nvme_command); +} + +static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) +{ + return req == &req->queue->ctrl->async_req; +} + +static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) +{ + struct request *rq; + unsigned int bytes; + + if (unlikely(nvme_tcp_async_req(req))) + return false; /* async events don't have a request */ + + rq = blk_mq_rq_from_pdu(req); + bytes = blk_rq_payload_bytes(rq); + + return rq_data_dir(rq) == WRITE && bytes && + bytes <= nvme_tcp_inline_data_size(req->queue); +} + +static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) +{ + return req->iter.bvec->bv_page; +} + +static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req) +{ + return req->iter.bvec->bv_offset + req->iter.iov_offset; +} + +static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req) +{ + return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset, + req->pdu_len - req->pdu_sent); +} + +static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req) +{ + return req->iter.iov_offset; +} + +static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req) +{ + return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ? + req->pdu_len - req->pdu_sent : 0; +} + +static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, + int len) +{ + return nvme_tcp_pdu_data_left(req) <= len; +} + +static void nvme_tcp_init_iter(struct nvme_tcp_request *req, + unsigned int dir) +{ + struct request *rq = blk_mq_rq_from_pdu(req); + struct bio_vec *vec; + unsigned int size; + int nsegs; + size_t offset; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { + vec = &rq->special_vec; + nsegs = 1; + size = blk_rq_payload_bytes(rq); + offset = 0; + } else { + struct bio *bio = req->curr_bio; + + vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + nsegs = bio_segments(bio); + size = bio->bi_iter.bi_size; + offset = bio->bi_iter.bi_bvec_done; + } + + iov_iter_bvec(&req->iter, dir, vec, nsegs, size); + req->iter.iov_offset = offset; +} + +static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, + int len) +{ + req->data_sent += len; + req->pdu_sent += len; + iov_iter_advance(&req->iter, len); + if (!iov_iter_count(&req->iter) && + req->data_sent < req->data_len) { + req->curr_bio = req->curr_bio->bi_next; + nvme_tcp_init_iter(req, WRITE); + } +} + +static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + + spin_lock(&queue->lock); + list_add_tail(&req->entry, &queue->send_list); + spin_unlock(&queue->lock); + + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + +static inline struct nvme_tcp_request * +nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + + spin_lock(&queue->lock); + req = list_first_entry_or_null(&queue->send_list, + struct nvme_tcp_request, entry); + if (req) + list_del(&req->entry); + spin_unlock(&queue->lock); + + return req; +} + +static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, u32 *dgst) +{ + ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); + crypto_ahash_final(hash); +} + +static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, + struct page *page, off_t off, size_t len) +{ + struct scatterlist sg; + + sg_init_marker(&sg, 1); + sg_set_page(&sg, page, len, off); + 
ahash_request_set_crypt(hash, &sg, NULL, len); + crypto_ahash_update(hash); +} + +static inline void nvme_tcp_hdgst(struct ahash_request *hash, + void *pdu, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, pdu, len); + ahash_request_set_crypt(hash, &sg, pdu + len, len); + crypto_ahash_digest(hash); +} + +static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, + void *pdu, size_t pdu_len) +{ + struct nvme_tcp_hdr *hdr = pdu; + __le32 recv_digest; + __le32 exp_digest; + + if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d: header digest flag is cleared\n", + nvme_tcp_queue_id(queue)); + return -EPROTO; + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); + nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + dev_err(queue->ctrl->ctrl.device, + "header digest error: recv %#x expected %#x\n", + le32_to_cpu(recv_digest), le32_to_cpu(exp_digest)); + return -EIO; + } + + return 0; +} + +static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) +{ + struct nvme_tcp_hdr *hdr = pdu; + u8 digest_len = nvme_tcp_hdgst_len(queue); + u32 len; + + len = le32_to_cpu(hdr->plen) - hdr->hlen - + ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0); + + if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d: data digest flag is cleared\n", + nvme_tcp_queue_id(queue)); + return -EPROTO; + } + crypto_ahash_init(queue->rcv_hash); + + return 0; +} + +static void nvme_tcp_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + page_frag_free(req->pdu); +} + +static int nvme_tcp_init_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) +{ + struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; + struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx]; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + req->pdu = page_frag_alloc(&queue->pf_cache, + sizeof(struct nvme_tcp_cmd_pdu) + hdgst, + GFP_KERNEL | __GFP_ZERO); + if (!req->pdu) + return -ENOMEM; + + req->queue = queue; + nvme_req(rq)->ctrl = &ctrl->ctrl; + + return 0; +} + +static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1]; + + hctx->driver_data = queue; + return 0; +} + +static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + + hctx->driver_data = queue; + return 0; +} + +static enum nvme_tcp_recv_state +nvme_tcp_recv_state(struct nvme_tcp_queue *queue) +{ + return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU : + (queue->ddgst_remaining) ? 
NVME_TCP_RECV_DDGST : + NVME_TCP_RECV_DATA; +} + +static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue) +{ + queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) + + nvme_tcp_hdgst_len(queue); + queue->pdu_offset = 0; + queue->data_remaining = -1; + queue->ddgst_remaining = 0; +} + +static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + return; + + queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work); +} + +static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, + struct nvme_completion *cqe) +{ + struct request *rq; + + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag 0x%x not found\n", + nvme_tcp_queue_id(queue), cqe->command_id); + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return -EINVAL; + } + + nvme_end_request(rq, cqe->status, cqe->result); + + return 0; +} + +static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, + struct nvme_tcp_data_pdu *pdu) +{ + struct request *rq; + + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x not found\n", + nvme_tcp_queue_id(queue), pdu->command_id); + return -ENOENT; + } + + if (!blk_rq_payload_bytes(rq)) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x unexpected data\n", + nvme_tcp_queue_id(queue), rq->tag); + return -EIO; + } + + queue->data_remaining = le32_to_cpu(pdu->data_length); + + return 0; + +} + +static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, + struct nvme_tcp_rsp_pdu *pdu) +{ + struct nvme_completion *cqe = &pdu->cqe; + int ret = 0; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. 
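+	 * Command IDs at or above NVME_AQ_BLK_MQ_DEPTH identify these events.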
+ */ + if (unlikely(nvme_tcp_queue_id(queue) == 0 && + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) + nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, + &cqe->result); + else + ret = nvme_tcp_process_nvme_cqe(queue, cqe); + + return ret; +} + +static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, + struct nvme_tcp_r2t_pdu *pdu) +{ + struct nvme_tcp_data_pdu *data = req->pdu; + struct nvme_tcp_queue *queue = req->queue; + struct request *rq = blk_mq_rq_from_pdu(req); + u8 hdgst = nvme_tcp_hdgst_len(queue); + u8 ddgst = nvme_tcp_ddgst_len(queue); + + req->pdu_len = le32_to_cpu(pdu->r2t_length); + req->pdu_sent = 0; + + if (unlikely(req->data_sent + req->pdu_len > req->data_len)) { + dev_err(queue->ctrl->ctrl.device, + "req %d r2t len %u exceeded data len %u (%zu sent)\n", + rq->tag, req->pdu_len, req->data_len, + req->data_sent); + return -EPROTO; + } + + if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) { + dev_err(queue->ctrl->ctrl.device, + "req %d unexpected r2t offset %u (expected %zu)\n", + rq->tag, le32_to_cpu(pdu->r2t_offset), + req->data_sent); + return -EPROTO; + } + + memset(data, 0, sizeof(*data)); + data->hdr.type = nvme_tcp_h2c_data; + data->hdr.flags = NVME_TCP_F_DATA_LAST; + if (queue->hdr_digest) + data->hdr.flags |= NVME_TCP_F_HDGST; + if (queue->data_digest) + data->hdr.flags |= NVME_TCP_F_DDGST; + data->hdr.hlen = sizeof(*data); + data->hdr.pdo = data->hdr.hlen + hdgst; + data->hdr.plen = + cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); + data->ttag = pdu->ttag; + data->command_id = rq->tag; + data->data_offset = cpu_to_le32(req->data_sent); + data->data_length = cpu_to_le32(req->pdu_len); + return 0; +} + +static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, + struct nvme_tcp_r2t_pdu *pdu) +{ + struct nvme_tcp_request *req; + struct request *rq; + int ret; + + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x not found\n", + nvme_tcp_queue_id(queue), pdu->command_id); + return -ENOENT; + } + req = blk_mq_rq_to_pdu(rq); + + ret = nvme_tcp_setup_h2c_data_pdu(req, pdu); + if (unlikely(ret)) + return ret; + + req->state = NVME_TCP_SEND_H2C_PDU; + req->offset = 0; + + nvme_tcp_queue_request(req); + + return 0; +} + +static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, + unsigned int *offset, size_t *len) +{ + struct nvme_tcp_hdr *hdr; + char *pdu = queue->pdu; + size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining); + int ret; + + ret = skb_copy_bits(skb, *offset, + &pdu[queue->pdu_offset], rcv_len); + if (unlikely(ret)) + return ret; + + queue->pdu_remaining -= rcv_len; + queue->pdu_offset += rcv_len; + *offset += rcv_len; + *len -= rcv_len; + if (queue->pdu_remaining) + return 0; + + hdr = queue->pdu; + if (queue->hdr_digest) { + ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); + if (unlikely(ret)) + return ret; + } + + + if (queue->data_digest) { + ret = nvme_tcp_check_ddgst(queue, queue->pdu); + if (unlikely(ret)) + return ret; + } + + switch (hdr->type) { + case nvme_tcp_c2h_data: + ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); + break; + case nvme_tcp_rsp: + nvme_tcp_init_recv_ctx(queue); + ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu); + break; + case nvme_tcp_r2t: + nvme_tcp_init_recv_ctx(queue); + ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu); + break; + default: + dev_err(queue->ctrl->ctrl.device, + "unsupported pdu type (%d)\n", hdr->type); + return -EINVAL; + } 
+
+	return ret;
+}
+
+static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
+		unsigned int *offset, size_t *len)
+{
+	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
+	struct nvme_tcp_request *req;
+	struct request *rq;
+
+	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+	if (!rq) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d tag %#x not found\n",
+			nvme_tcp_queue_id(queue), pdu->command_id);
+		return -ENOENT;
+	}
+	req = blk_mq_rq_to_pdu(rq);
+
+	while (true) {
+		int recv_len, ret;
+
+		recv_len = min_t(size_t, *len, queue->data_remaining);
+		if (!recv_len)
+			break;
+
+		if (!iov_iter_count(&req->iter)) {
+			req->curr_bio = req->curr_bio->bi_next;
+
+			/*
+			 * If we don't have any bios it means that controller
+			 * sent more data than we requested, hence error
+			 */
+			if (!req->curr_bio) {
+				dev_err(queue->ctrl->ctrl.device,
+					"queue %d no space in request %#x\n",
+					nvme_tcp_queue_id(queue), rq->tag);
+				nvme_tcp_init_recv_ctx(queue);
+				return -EIO;
+			}
+			nvme_tcp_init_iter(req, READ);
+		}
+
+		/* we can read only from what is left in this bio */
+		recv_len = min_t(size_t, recv_len,
+				iov_iter_count(&req->iter));
+
+		if (queue->data_digest)
+			ret = skb_copy_and_hash_datagram_iter(skb, *offset,
+				&req->iter, recv_len, queue->rcv_hash);
+		else
+			ret = skb_copy_datagram_iter(skb, *offset,
+				&req->iter, recv_len);
+		if (ret) {
+			dev_err(queue->ctrl->ctrl.device,
+				"queue %d failed to copy request %#x data\n",
+				nvme_tcp_queue_id(queue), rq->tag);
+			return ret;
+		}
+
+		*len -= recv_len;
+		*offset += recv_len;
+		queue->data_remaining -= recv_len;
+	}
+
+	if (!queue->data_remaining) {
+		if (queue->data_digest) {
+			nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
+			queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
+		} else {
+			nvme_tcp_init_recv_ctx(queue);
+		}
+	}
+
+	return 0;
+}
+
+static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
+		struct sk_buff *skb, unsigned int *offset, size_t *len)
+{
+	char *ddgst = (char *)&queue->recv_ddgst;
+	size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
+	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
+	int ret;
+
+	ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
+	if (unlikely(ret))
+		return ret;
+
+	queue->ddgst_remaining -= recv_len;
+	*offset += recv_len;
+	*len -= recv_len;
+	if (queue->ddgst_remaining)
+		return 0;
+
+	if (queue->recv_ddgst != queue->exp_ddgst) {
+		dev_err(queue->ctrl->ctrl.device,
+			"data digest error: recv %#x expected %#x\n",
+			le32_to_cpu(queue->recv_ddgst),
+			le32_to_cpu(queue->exp_ddgst));
+		return -EIO;
+	}
+
+	nvme_tcp_init_recv_ctx(queue);
+	return 0;
+}
+
+static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
+		unsigned int offset, size_t len)
+{
+	struct nvme_tcp_queue *queue = desc->arg.data;
+	size_t consumed = len;
+	int result;
+
+	while (len) {
+		switch (nvme_tcp_recv_state(queue)) {
+		case NVME_TCP_RECV_PDU:
+			result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
+			break;
+		case NVME_TCP_RECV_DATA:
+			result = nvme_tcp_recv_data(queue, skb, &offset, &len);
+			break;
+		case NVME_TCP_RECV_DDGST:
+			result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
+			break;
+		default:
+			result = -EFAULT;
+		}
+		if (result) {
+			dev_err(queue->ctrl->ctrl.device,
+				"receive failed: %d\n", result);
+			queue->rd_enabled = false;
+			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+			return result;
+		}
+	}
+
+	return consumed;
+}
+
+static void nvme_tcp_data_ready(struct sock *sk)
+{
+	struct nvme_tcp_queue *queue;
+
+
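+	/* socket has data available; schedule io_work if receives are enabled */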
read_lock(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue && queue->rd_enabled)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + read_unlock(&sk->sk_callback_lock); +} + +static void nvme_tcp_write_space(struct sock *sk) +{ + struct nvme_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue && sk_stream_is_writeable(sk))) { + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvme_tcp_state_change(struct sock *sk) +{ + struct nvme_tcp_queue *queue; + + read_lock(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (!queue) + goto done; + + switch (sk->sk_state) { + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + case TCP_LAST_ACK: + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + /* fallthrough */ + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + break; + default: + dev_info(queue->ctrl->ctrl.device, + "queue %d socket state %d\n", + nvme_tcp_queue_id(queue), sk->sk_state); + } + + queue->state_change(sk); +done: + read_unlock(&sk->sk_callback_lock); +} + +static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) +{ + queue->request = NULL; +} + +static void nvme_tcp_fail_request(struct nvme_tcp_request *req) +{ + union nvme_result res = {}; + + nvme_end_request(blk_mq_rq_from_pdu(req), + NVME_SC_DATA_XFER_ERROR, res); +} + +static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + + while (true) { + struct page *page = nvme_tcp_req_cur_page(req); + size_t offset = nvme_tcp_req_cur_offset(req); + size_t len = nvme_tcp_req_cur_length(req); + bool last = nvme_tcp_pdu_last_send(req, len); + int ret, flags = MSG_DONTWAIT; + + if (last && !queue->data_digest) + flags |= MSG_EOR; + else + flags |= MSG_MORE; + + ret = kernel_sendpage(queue->sock, page, offset, len, flags); + if (ret <= 0) + return ret; + + nvme_tcp_advance_req(req, ret); + if (queue->data_digest) + nvme_tcp_ddgst_update(queue->snd_hash, page, + offset, ret); + + /* fully successful last write*/ + if (last && ret == len) { + if (queue->data_digest) { + nvme_tcp_ddgst_final(queue->snd_hash, + &req->ddgst); + req->state = NVME_TCP_SEND_DDGST; + req->offset = 0; + } else { + nvme_tcp_done_send_req(queue); + } + return 1; + } + } + return -EAGAIN; +} + +static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + bool inline_data = nvme_tcp_has_inline_data(req); + int flags = MSG_DONTWAIT | (inline_data ? 
MSG_MORE : MSG_EOR); + u8 hdgst = nvme_tcp_hdgst_len(queue); + int len = sizeof(*pdu) + hdgst - req->offset; + int ret; + + if (queue->hdr_digest && !req->offset) + nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, flags); + if (unlikely(ret <= 0)) + return ret; + + len -= ret; + if (!len) { + if (inline_data) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) + crypto_ahash_init(queue->snd_hash); + nvme_tcp_init_iter(req, WRITE); + } else { + nvme_tcp_done_send_req(queue); + } + return 1; + } + req->offset += ret; + + return -EAGAIN; +} + +static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + struct nvme_tcp_data_pdu *pdu = req->pdu; + u8 hdgst = nvme_tcp_hdgst_len(queue); + int len = sizeof(*pdu) - req->offset + hdgst; + int ret; + + if (queue->hdr_digest && !req->offset) + nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, + MSG_DONTWAIT | MSG_MORE); + if (unlikely(ret <= 0)) + return ret; + + len -= ret; + if (!len) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) + crypto_ahash_init(queue->snd_hash); + if (!req->data_sent) + nvme_tcp_init_iter(req, WRITE); + return 1; + } + req->offset += ret; + + return -EAGAIN; +} + +static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + int ret; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct kvec iov = { + .iov_base = &req->ddgst + req->offset, + .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset + }; + + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (unlikely(ret <= 0)) + return ret; + + if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) { + nvme_tcp_done_send_req(queue); + return 1; + } + + req->offset += ret; + return -EAGAIN; +} + +static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + int ret = 1; + + if (!queue->request) { + queue->request = nvme_tcp_fetch_request(queue); + if (!queue->request) + return 0; + } + req = queue->request; + + if (req->state == NVME_TCP_SEND_CMD_PDU) { + ret = nvme_tcp_try_send_cmd_pdu(req); + if (ret <= 0) + goto done; + if (!nvme_tcp_has_inline_data(req)) + return ret; + } + + if (req->state == NVME_TCP_SEND_H2C_PDU) { + ret = nvme_tcp_try_send_data_pdu(req); + if (ret <= 0) + goto done; + } + + if (req->state == NVME_TCP_SEND_DATA) { + ret = nvme_tcp_try_send_data(req); + if (ret <= 0) + goto done; + } + + if (req->state == NVME_TCP_SEND_DDGST) + ret = nvme_tcp_try_send_ddgst(req); +done: + if (ret == -EAGAIN) + ret = 0; + return ret; +} + +static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) +{ + struct sock *sk = queue->sock->sk; + read_descriptor_t rd_desc; + int consumed; + + rd_desc.arg.data = queue; + rd_desc.count = 1; + lock_sock(sk); + consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb); + release_sock(sk); + return consumed; +} + +static void nvme_tcp_io_work(struct work_struct *w) +{ + struct nvme_tcp_queue *queue = + container_of(w, struct nvme_tcp_queue, io_work); + unsigned long start = jiffies + msecs_to_jiffies(1); + + do { + bool pending = false; + int result; + + result = nvme_tcp_try_send(queue); + if (result > 0) { + pending = true; + } else if (unlikely(result < 0)) { + dev_err(queue->ctrl->ctrl.device, + "failed to send request %d\n", result); + if 
(result != -EPIPE) + nvme_tcp_fail_request(queue->request); + nvme_tcp_done_send_req(queue); + return; + } + + result = nvme_tcp_try_recv(queue); + if (result > 0) + pending = true; + + if (!pending) + return; + + } while (time_after(jiffies, start)); /* quota is exhausted */ + + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + +static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); + + ahash_request_free(queue->rcv_hash); + ahash_request_free(queue->snd_hash); + crypto_free_ahash(tfm); +} + +static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) +{ + struct crypto_ahash *tfm; + + tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->snd_hash) + goto free_tfm; + ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); + + queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->rcv_hash) + goto free_snd_hash; + ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); + + return 0; +free_snd_hash: + ahash_request_free(queue->snd_hash); +free_tfm: + crypto_free_ahash(tfm); + return -ENOMEM; +} + +static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) +{ + struct nvme_tcp_request *async = &ctrl->async_req; + + page_frag_free(async->pdu); +} + +static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl) +{ + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + struct nvme_tcp_request *async = &ctrl->async_req; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + async->pdu = page_frag_alloc(&queue->pf_cache, + sizeof(struct nvme_tcp_cmd_pdu) + hdgst, + GFP_KERNEL | __GFP_ZERO); + if (!async->pdu) + return -ENOMEM; + + async->queue = &ctrl->queues[0]; + return 0; +} + +static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + + if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) + return; + + if (queue->hdr_digest || queue->data_digest) + nvme_tcp_free_crypto(queue); + + sock_release(queue->sock); + kfree(queue->pdu); +} + +static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_icreq_pdu *icreq; + struct nvme_tcp_icresp_pdu *icresp; + struct msghdr msg = {}; + struct kvec iov; + bool ctrl_hdgst, ctrl_ddgst; + int ret; + + icreq = kzalloc(sizeof(*icreq), GFP_KERNEL); + if (!icreq) + return -ENOMEM; + + icresp = kzalloc(sizeof(*icresp), GFP_KERNEL); + if (!icresp) { + ret = -ENOMEM; + goto free_icreq; + } + + icreq->hdr.type = nvme_tcp_icreq; + icreq->hdr.hlen = sizeof(*icreq); + icreq->hdr.pdo = 0; + icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen); + icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); + icreq->maxr2t = 0; /* single inflight r2t supported */ + icreq->hpda = 0; /* no alignment constraint */ + if (queue->hdr_digest) + icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE; + if (queue->data_digest) + icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE; + + iov.iov_base = icreq; + iov.iov_len = sizeof(*icreq); + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (ret < 0) + goto free_icresp; + + memset(&msg, 0, sizeof(msg)); + iov.iov_base = icresp; + iov.iov_len = sizeof(*icresp); + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (ret < 0) + goto free_icresp; + + ret = -EINVAL; + if (icresp->hdr.type != nvme_tcp_icresp) { + pr_err("queue %d: bad type returned 
%d\n", + nvme_tcp_queue_id(queue), icresp->hdr.type); + goto free_icresp; + } + + if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) { + pr_err("queue %d: bad pdu length returned %d\n", + nvme_tcp_queue_id(queue), icresp->hdr.plen); + goto free_icresp; + } + + if (icresp->pfv != NVME_TCP_PFV_1_0) { + pr_err("queue %d: bad pfv returned %d\n", + nvme_tcp_queue_id(queue), icresp->pfv); + goto free_icresp; + } + + ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE); + if ((queue->data_digest && !ctrl_ddgst) || + (!queue->data_digest && ctrl_ddgst)) { + pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n", + nvme_tcp_queue_id(queue), + queue->data_digest ? "enabled" : "disabled", + ctrl_ddgst ? "enabled" : "disabled"); + goto free_icresp; + } + + ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE); + if ((queue->hdr_digest && !ctrl_hdgst) || + (!queue->hdr_digest && ctrl_hdgst)) { + pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n", + nvme_tcp_queue_id(queue), + queue->hdr_digest ? "enabled" : "disabled", + ctrl_hdgst ? "enabled" : "disabled"); + goto free_icresp; + } + + if (icresp->cpda != 0) { + pr_err("queue %d: unsupported cpda returned %d\n", + nvme_tcp_queue_id(queue), icresp->cpda); + goto free_icresp; + } + + ret = 0; +free_icresp: + kfree(icresp); +free_icreq: + kfree(icreq); + return ret; +} + +static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, + int qid, size_t queue_size) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + struct linger sol = { .l_onoff = 1, .l_linger = 0 }; + int ret, opt, rcv_pdu_size; + + queue->ctrl = ctrl; + INIT_LIST_HEAD(&queue->send_list); + spin_lock_init(&queue->lock); + INIT_WORK(&queue->io_work, nvme_tcp_io_work); + queue->queue_size = queue_size; + + if (qid > 0) + queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + else + queue->cmnd_capsule_len = sizeof(struct nvme_command) + + NVME_TCP_ADMIN_CCSZ; + + ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &queue->sock); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to create socket: %d\n", ret); + return ret; + } + + /* Single syn retry */ + opt = 1; + ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, + (char *)&opt, sizeof(opt)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set TCP_SYNCNT sock opt %d\n", ret); + goto err_sock; + } + + /* Set TCP no delay */ + opt = 1; + ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, + TCP_NODELAY, (char *)&opt, sizeof(opt)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set TCP_NODELAY sock opt %d\n", ret); + goto err_sock; + } + + /* + * Cleanup whatever is sitting in the TCP transmit queue on socket + * close. This is done to prevent stale data from being sent should + * the network connection be restored before TCP times out. + */ + ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, + (char *)&sol, sizeof(sol)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set SO_LINGER sock opt %d\n", ret); + goto err_sock; + } + + queue->sock->sk->sk_allocation = GFP_ATOMIC; + queue->io_cpu = (qid == 0) ? 
0 : qid - 1; + queue->request = NULL; + queue->data_remaining = 0; + queue->ddgst_remaining = 0; + queue->pdu_remaining = 0; + queue->pdu_offset = 0; + sk_set_memalloc(queue->sock->sk); + + if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { + ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, + sizeof(ctrl->src_addr)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to bind queue %d socket %d\n", + qid, ret); + goto err_sock; + } + } + + queue->hdr_digest = nctrl->opts->hdr_digest; + queue->data_digest = nctrl->opts->data_digest; + if (queue->hdr_digest || queue->data_digest) { + ret = nvme_tcp_alloc_crypto(queue); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to allocate queue %d crypto\n", qid); + goto err_sock; + } + } + + rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + + nvme_tcp_hdgst_len(queue); + queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); + if (!queue->pdu) { + ret = -ENOMEM; + goto err_crypto; + } + + dev_dbg(ctrl->ctrl.device, "connecting queue %d\n", + nvme_tcp_queue_id(queue)); + + ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, + sizeof(ctrl->addr), 0); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to connect socket: %d\n", ret); + goto err_rcv_pdu; + } + + ret = nvme_tcp_init_connection(queue); + if (ret) + goto err_init_connect; + + queue->rd_enabled = true; + set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags); + nvme_tcp_init_recv_ctx(queue); + + write_lock_bh(&queue->sock->sk->sk_callback_lock); + queue->sock->sk->sk_user_data = queue; + queue->state_change = queue->sock->sk->sk_state_change; + queue->data_ready = queue->sock->sk->sk_data_ready; + queue->write_space = queue->sock->sk->sk_write_space; + queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; + queue->sock->sk->sk_state_change = nvme_tcp_state_change; + queue->sock->sk->sk_write_space = nvme_tcp_write_space; + write_unlock_bh(&queue->sock->sk->sk_callback_lock); + + return 0; + +err_init_connect: + kernel_sock_shutdown(queue->sock, SHUT_RDWR); +err_rcv_pdu: + kfree(queue->pdu); +err_crypto: + if (queue->hdr_digest || queue->data_digest) + nvme_tcp_free_crypto(queue); +err_sock: + sock_release(queue->sock); + queue->sock = NULL; + return ret; +} + +static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = NULL; + sock->sk->sk_data_ready = queue->data_ready; + sock->sk->sk_state_change = queue->state_change; + sock->sk->sk_write_space = queue->write_space; + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) +{ + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + nvme_tcp_restore_sock_calls(queue); + cancel_work_sync(&queue->io_work); +} + +static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + + if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) + return; + + __nvme_tcp_stop_queue(queue); +} + +static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + int ret; + + if (idx) + ret = nvmf_connect_io_queue(nctrl, idx); + else + ret = nvmf_connect_admin_queue(nctrl); + + if (!ret) { + set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags); + } else { + __nvme_tcp_stop_queue(&ctrl->queues[idx]); + dev_err(nctrl->device, + "failed to connect queue: %d ret=%d\n", idx, ret); + } + return ret; +} + 
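+/*
+ * Tag set setup: the admin set reserves tags for connect and keep-alive,
+ * while the I/O set sizes its queue depth from the controller sqsize.
+ */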
+static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, + bool admin) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct blk_mq_tag_set *set; + int ret; + + if (admin) { + set = &ctrl->admin_tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_tcp_admin_mq_ops; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; + set->reserved_tags = 2; /* connect + keep-alive */ + set->numa_node = NUMA_NO_NODE; + set->cmd_size = sizeof(struct nvme_tcp_request); + set->driver_data = ctrl; + set->nr_hw_queues = 1; + set->timeout = ADMIN_TIMEOUT; + } else { + set = &ctrl->tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_tcp_mq_ops; + set->queue_depth = nctrl->sqsize + 1; + set->reserved_tags = 1; /* fabric connect */ + set->numa_node = NUMA_NO_NODE; + set->flags = BLK_MQ_F_SHOULD_MERGE; + set->cmd_size = sizeof(struct nvme_tcp_request); + set->driver_data = ctrl; + set->nr_hw_queues = nctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; + } + + ret = blk_mq_alloc_tag_set(set); + if (ret) + return ERR_PTR(ret); + + return set; +} + +static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl) +{ + if (to_tcp_ctrl(ctrl)->async_req.pdu) { + nvme_tcp_free_async_req(to_tcp_ctrl(ctrl)); + to_tcp_ctrl(ctrl)->async_req.pdu = NULL; + } + + nvme_tcp_free_queue(ctrl, 0); +} + +static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_free_queue(ctrl, i); +} + +static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_stop_queue(ctrl, i); +} + +static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) +{ + int i, ret = 0; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvme_tcp_start_queue(ctrl, i); + if (ret) + goto out_stop_queues; + } + + return 0; + +out_stop_queues: + for (i--; i >= 1; i--) + nvme_tcp_stop_queue(ctrl, i); + return ret; +} + +static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); + if (ret) + return ret; + + ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl)); + if (ret) + goto out_free_queue; + + return 0; + +out_free_queue: + nvme_tcp_free_queue(ctrl, 0); + return ret; +} + +static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) +{ + int i, ret; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvme_tcp_alloc_queue(ctrl, i, + ctrl->sqsize + 1); + if (ret) + goto out_free_queues; + } + + return 0; + +out_free_queues: + for (i--; i >= 1; i--) + nvme_tcp_free_queue(ctrl, i); + + return ret; +} + +static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) +{ + return min(ctrl->queue_count - 1, num_online_cpus()); +} + +static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl) +{ + unsigned int nr_io_queues; + int ret; + + nr_io_queues = nvme_tcp_nr_io_queues(ctrl); + ret = nvme_set_queue_count(ctrl, &nr_io_queues); + if (ret) + return ret; + + ctrl->queue_count = nr_io_queues + 1; + if (ctrl->queue_count < 2) + return 0; + + dev_info(ctrl->device, + "creating %d I/O queues.\n", nr_io_queues); + + return nvme_tcp_alloc_io_queues(ctrl); +} + +static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) +{ + nvme_tcp_stop_io_queues(ctrl); + if (remove) { + if (ctrl->ops->flags & NVME_F_FABRICS) + blk_cleanup_queue(ctrl->connect_q); + blk_mq_free_tag_set(ctrl->tagset); + } + nvme_tcp_free_io_queues(ctrl); +} + +static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) +{ + int ret; + + ret = 
nvme_alloc_io_queues(ctrl); + if (ret) + return ret; + + if (new) { + ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false); + if (IS_ERR(ctrl->tagset)) { + ret = PTR_ERR(ctrl->tagset); + goto out_free_io_queues; + } + + if (ctrl->ops->flags & NVME_F_FABRICS) { + ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ctrl->connect_q)) { + ret = PTR_ERR(ctrl->connect_q); + goto out_free_tag_set; + } + } + } else { + blk_mq_update_nr_hw_queues(ctrl->tagset, + ctrl->queue_count - 1); + } + + ret = nvme_tcp_start_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + return 0; + +out_cleanup_connect_q: + if (new && (ctrl->ops->flags & NVME_F_FABRICS)) + blk_cleanup_queue(ctrl->connect_q); +out_free_tag_set: + if (new) + blk_mq_free_tag_set(ctrl->tagset); +out_free_io_queues: + nvme_tcp_free_io_queues(ctrl); + return ret; +} + +static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) +{ + nvme_tcp_stop_queue(ctrl, 0); + if (remove) { + free_opal_dev(ctrl->opal_dev); + blk_cleanup_queue(ctrl->admin_q); + blk_mq_free_tag_set(ctrl->admin_tagset); + } + nvme_tcp_free_admin_queue(ctrl); +} + +static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) +{ + int error; + + error = nvme_tcp_alloc_admin_queue(ctrl); + if (error) + return error; + + if (new) { + ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true); + if (IS_ERR(ctrl->admin_tagset)) { + error = PTR_ERR(ctrl->admin_tagset); + goto out_free_queue; + } + + ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); + if (IS_ERR(ctrl->admin_q)) { + error = PTR_ERR(ctrl->admin_q); + goto out_free_tagset; + } + } + + error = nvme_tcp_start_queue(ctrl, 0); + if (error) + goto out_cleanup_queue; + + error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); + if (error) { + dev_err(ctrl->device, + "prop_get NVME_REG_CAP failed\n"); + goto out_stop_queue; + } + + ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + + error = nvme_enable_ctrl(ctrl, ctrl->cap); + if (error) + goto out_stop_queue; + + error = nvme_init_identify(ctrl); + if (error) + goto out_stop_queue; + + return 0; + +out_stop_queue: + nvme_tcp_stop_queue(ctrl, 0); +out_cleanup_queue: + if (new) + blk_cleanup_queue(ctrl->admin_q); +out_free_tagset: + if (new) + blk_mq_free_tag_set(ctrl->admin_tagset); +out_free_queue: + nvme_tcp_free_admin_queue(ctrl); + return error; +} + +static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, + bool remove) +{ + blk_mq_quiesce_queue(ctrl->admin_q); + nvme_tcp_stop_queue(ctrl, 0); + blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl); + blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_tcp_destroy_admin_queue(ctrl, remove); +} + +static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, + bool remove) +{ + if (ctrl->queue_count <= 1) + return; + nvme_stop_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); + blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl); + if (remove) + nvme_start_queues(ctrl); + nvme_tcp_destroy_io_queues(ctrl, remove); +} + +static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) +{ + /* If we are resetting/deleting then do nothing */ + if (ctrl->state != NVME_CTRL_CONNECTING) { + WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW || + ctrl->state == NVME_CTRL_LIVE); + return; + } + + if (nvmf_should_reconnect(ctrl)) { + dev_info(ctrl->device, "Reconnecting in %d seconds...\n", + ctrl->opts->reconnect_delay); + queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work, + ctrl->opts->reconnect_delay * HZ); + } else { 
+		dev_info(ctrl->device, "Removing controller...\n");
+		nvme_delete_ctrl(ctrl);
+	}
+}
+
+static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
+{
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	int ret = -EINVAL;
+
+	ret = nvme_tcp_configure_admin_queue(ctrl, new);
+	if (ret)
+		return ret;
+
+	if (ctrl->icdoff) {
+		dev_err(ctrl->device, "icdoff is not supported!\n");
+		goto destroy_admin;
+	}
+
+	if (opts->queue_size > ctrl->sqsize + 1)
+		dev_warn(ctrl->device,
+			"queue_size %zu > ctrl sqsize %u, clamping down\n",
+			opts->queue_size, ctrl->sqsize + 1);
+
+	if (ctrl->sqsize + 1 > ctrl->maxcmd) {
+		dev_warn(ctrl->device,
+			"sqsize %u > ctrl maxcmd %u, clamping down\n",
+			ctrl->sqsize + 1, ctrl->maxcmd);
+		ctrl->sqsize = ctrl->maxcmd - 1;
+	}
+
+	if (ctrl->queue_count > 1) {
+		ret = nvme_tcp_configure_io_queues(ctrl, new);
+		if (ret)
+			goto destroy_admin;
+	}
+
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
+		/* state change failure is ok if we're in DELETING state */
+		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+		ret = -EINVAL;
+		goto destroy_io;
+	}
+
+	nvme_start_ctrl(ctrl);
+	return 0;
+
+destroy_io:
+	if (ctrl->queue_count > 1)
+		nvme_tcp_destroy_io_queues(ctrl, new);
+destroy_admin:
+	nvme_tcp_stop_queue(ctrl, 0);
+	nvme_tcp_destroy_admin_queue(ctrl, new);
+	return ret;
+}
+
+static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
+{
+	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
+			struct nvme_tcp_ctrl, connect_work);
+	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+	++ctrl->nr_reconnects;
+
+	if (nvme_tcp_setup_ctrl(ctrl, false))
+		goto requeue;
+
+	dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
+			ctrl->nr_reconnects);
+
+	ctrl->nr_reconnects = 0;
+
+	return;
+
+requeue:
+	dev_info(ctrl->device, "Failed reconnect attempt %d\n",
+			ctrl->nr_reconnects);
+	nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_error_recovery_work(struct work_struct *work)
+{
+	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
+			struct nvme_tcp_ctrl, err_work);
+	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+	nvme_stop_keep_alive(ctrl);
+	nvme_tcp_teardown_io_queues(ctrl, false);
+	/* unquiesce to fail fast pending requests */
+	nvme_start_queues(ctrl);
+	nvme_tcp_teardown_admin_queue(ctrl, false);
+
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
+		/* state change failure is ok if we're in DELETING state */
+		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+		return;
+	}
+
+	nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
+{
+	nvme_tcp_teardown_io_queues(ctrl, shutdown);
+	if (shutdown)
+		nvme_shutdown_ctrl(ctrl);
+	else
+		nvme_disable_ctrl(ctrl, ctrl->cap);
+	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
+}
+
+static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
+{
+	nvme_tcp_teardown_ctrl(ctrl, true);
+}
+
+static void nvme_reset_ctrl_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(work, struct nvme_ctrl, reset_work);
+
+	nvme_stop_ctrl(ctrl);
+	nvme_tcp_teardown_ctrl(ctrl, false);
+
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
+		/* state change failure is ok if we're in DELETING state */
+		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+		return;
+	}
+
+	if (nvme_tcp_setup_ctrl(ctrl, false))
+		goto out_fail;
+
+	return;
+
+out_fail:
+	++ctrl->nr_reconnects;
+	nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
+{
+
cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work); + cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); +} + +static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_tcp_ctrl_mutex); + + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl->queues); + kfree(ctrl); +} + +static void nvme_tcp_set_sg_null(struct nvme_command *c) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = 0; + sg->length = 0; + sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; +} + +static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue, + struct nvme_command *c, u32 data_len) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); + sg->length = cpu_to_le32(data_len); + sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; +} + +static void nvme_tcp_set_sg_host_data(struct nvme_command *c, + u32 data_len) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = 0; + sg->length = cpu_to_le32(data_len); + sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; +} + +static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg); + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu; + struct nvme_command *cmd = &pdu->cmd; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + memset(pdu, 0, sizeof(*pdu)); + pdu->hdr.type = nvme_tcp_cmd; + if (queue->hdr_digest) + pdu->hdr.flags |= NVME_TCP_F_HDGST; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + + cmd->common.opcode = nvme_admin_async_event; + cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; + cmd->common.flags |= NVME_CMD_SGL_METABUF; + nvme_tcp_set_sg_null(cmd); + + ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU; + ctrl->async_req.offset = 0; + ctrl->async_req.curr_bio = NULL; + ctrl->async_req.data_len = 0; + + nvme_tcp_queue_request(&ctrl->async_req); +} + +static enum blk_eh_timer_return +nvme_tcp_timeout(struct request *rq, bool reserved) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + + dev_dbg(ctrl->ctrl.device, + "queue %d: timeout request %#x type %d\n", + nvme_tcp_queue_id(req->queue), rq->tag, + pdu->hdr.type); + + if (ctrl->ctrl.state != NVME_CTRL_LIVE) { + union nvme_result res = {}; + + nvme_req(rq)->flags |= NVME_REQ_CANCELLED; + nvme_end_request(rq, NVME_SC_ABORT_REQ, res); + return BLK_EH_DONE; + } + + /* queue error recovery */ + nvme_tcp_error_recovery(&ctrl->ctrl); + + return BLK_EH_RESET_TIMER; +} + +static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, + struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_command *c = &pdu->cmd; + + c->common.flags |= NVME_CMD_SGL_METABUF; + + if (rq_data_dir(rq) == WRITE && req->data_len && + req->data_len <= nvme_tcp_inline_data_size(queue)) + nvme_tcp_set_sg_inline(queue, c, req->data_len); + else + nvme_tcp_set_sg_host_data(c, req->data_len); + + return 0; +} + +static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, + struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + 
struct nvme_tcp_queue *queue = req->queue; + u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0; + blk_status_t ret; + + ret = nvme_setup_cmd(ns, rq, &pdu->cmd); + if (ret) + return ret; + + req->state = NVME_TCP_SEND_CMD_PDU; + req->offset = 0; + req->data_sent = 0; + req->pdu_len = 0; + req->pdu_sent = 0; + req->data_len = blk_rq_payload_bytes(rq); + req->curr_bio = rq->bio; + + if (rq_data_dir(rq) == WRITE && + req->data_len <= nvme_tcp_inline_data_size(queue)) + req->pdu_len = req->data_len; + else if (req->curr_bio) + nvme_tcp_init_iter(req, READ); + + pdu->hdr.type = nvme_tcp_cmd; + pdu->hdr.flags = 0; + if (queue->hdr_digest) + pdu->hdr.flags |= NVME_TCP_F_HDGST; + if (queue->data_digest && req->pdu_len) { + pdu->hdr.flags |= NVME_TCP_F_DDGST; + ddgst = nvme_tcp_ddgst_len(queue); + } + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0; + pdu->hdr.plen = + cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst); + + ret = nvme_tcp_map_data(queue, rq); + if (unlikely(ret)) { + dev_err(queue->ctrl->ctrl.device, + "Failed to map data (%d)\n", ret); + return ret; + } + + return 0; +} + +static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_tcp_queue *queue = hctx->driver_data; + struct request *rq = bd->rq; + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags); + blk_status_t ret; + + if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); + + ret = nvme_tcp_setup_cmd_pdu(ns, rq); + if (unlikely(ret)) + return ret; + + blk_mq_start_request(rq); + + nvme_tcp_queue_request(req); + + return BLK_STS_OK; +} + +static struct blk_mq_ops nvme_tcp_mq_ops = { + .queue_rq = nvme_tcp_queue_rq, + .complete = nvme_complete_rq, + .init_request = nvme_tcp_init_request, + .exit_request = nvme_tcp_exit_request, + .init_hctx = nvme_tcp_init_hctx, + .timeout = nvme_tcp_timeout, +}; + +static struct blk_mq_ops nvme_tcp_admin_mq_ops = { + .queue_rq = nvme_tcp_queue_rq, + .complete = nvme_complete_rq, + .init_request = nvme_tcp_init_request, + .exit_request = nvme_tcp_exit_request, + .init_hctx = nvme_tcp_init_admin_hctx, + .timeout = nvme_tcp_timeout, +}; + +static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { + .name = "tcp", + .module = THIS_MODULE, + .flags = NVME_F_FABRICS, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .free_ctrl = nvme_tcp_free_ctrl, + .submit_async_event = nvme_tcp_submit_async_event, + .delete_ctrl = nvme_tcp_delete_ctrl, + .get_address = nvmf_get_address, + .stop_ctrl = nvme_tcp_stop_ctrl, +}; + +static bool +nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts) +{ + struct nvme_tcp_ctrl *ctrl; + bool found = false; + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) { + found = nvmf_ip_options_match(&ctrl->ctrl, opts); + if (found) + break; + } + mutex_unlock(&nvme_tcp_ctrl_mutex); + + return found; +} + +static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_tcp_ctrl *ctrl; + int ret; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ctrl->list); + ctrl->ctrl.opts = opts; + ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.sqsize = opts->queue_size - 
1; + ctrl->ctrl.kato = opts->kato; + + INIT_DELAYED_WORK(&ctrl->connect_work, + nvme_tcp_reconnect_ctrl_work); + INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work); + + if (!(opts->mask & NVMF_OPT_TRSVCID)) { + opts->trsvcid = + kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL); + if (!opts->trsvcid) { + ret = -ENOMEM; + goto out_free_ctrl; + } + opts->mask |= NVMF_OPT_TRSVCID; + } + + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->traddr, opts->trsvcid, &ctrl->addr); + if (ret) { + pr_err("malformed address passed: %s:%s\n", + opts->traddr, opts->trsvcid); + goto out_free_ctrl; + } + + if (opts->mask & NVMF_OPT_HOST_TRADDR) { + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->host_traddr, NULL, &ctrl->src_addr); + if (ret) { + pr_err("malformed src address passed: %s\n", + opts->host_traddr); + goto out_free_ctrl; + } + } + + if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) { + ret = -EALREADY; + goto out_free_ctrl; + } + + ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) { + ret = -ENOMEM; + goto out_free_ctrl; + } + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0); + if (ret) + goto out_kfree_queues; + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + WARN_ON_ONCE(1); + ret = -EINTR; + goto out_uninit_ctrl; + } + + ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true); + if (ret) + goto out_uninit_ctrl; + + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", + ctrl->ctrl.opts->subsysnqn, &ctrl->addr); + + nvme_get_ctrl(&ctrl->ctrl); + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); + mutex_unlock(&nvme_tcp_ctrl_mutex); + + return &ctrl->ctrl; + +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +out_kfree_queues: + kfree(ctrl->queues); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvmf_transport_ops nvme_tcp_transport = { + .name = "tcp", + .module = THIS_MODULE, + .required_opts = NVMF_OPT_TRADDR, + .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | + NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | + NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST, + .create_ctrl = nvme_tcp_create_ctrl, +}; + +static int __init nvme_tcp_init_module(void) +{ + nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + if (!nvme_tcp_wq) + return -ENOMEM; + + nvmf_register_transport(&nvme_tcp_transport); + return 0; +} + +static void __exit nvme_tcp_cleanup_module(void) +{ + struct nvme_tcp_ctrl *ctrl; + + nvmf_unregister_transport(&nvme_tcp_transport); + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) + nvme_delete_ctrl(&ctrl->ctrl); + mutex_unlock(&nvme_tcp_ctrl_mutex); + flush_workqueue(nvme_delete_wq); + + destroy_workqueue(nvme_tcp_wq); +} + +module_init(nvme_tcp_init_module); +module_exit(nvme_tcp_cleanup_module); + +MODULE_LICENSE("GPL v2"); -- cgit From 8eb5d89f483141dd076529bf5f6aa235b425886e Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 11 Dec 2018 07:24:34 +0800 Subject: nvme: add __exit annotation Add __exit annotation to cleanup helper which is only called once in the module. 
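For illustration (not part of this diff), the annotation places the helper
in the .exit.text section, which can be discarded when the driver is built
in, since module exit handlers never run for built-in code:

	static void __exit nvme_core_exit(void)
	{
		/* teardown performed only at module unload */
	}
	module_exit(nvme_core_exit);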
Signed-off-by: Chengguang Xu Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/nvme.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c71e879821ad..3043cedb24e1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3802,7 +3802,7 @@ out: return result; } -void nvme_core_exit(void) +void __exit nvme_core_exit(void) { ida_destroy(&nvme_subsystems_ida); class_destroy(nvme_subsys_class); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e20e737ac10c..c78a9dc7fdcf 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -565,6 +565,6 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) } int __init nvme_core_init(void); -void nvme_core_exit(void); +void __exit nvme_core_exit(void); #endif /* _NVME_H */ -- cgit From cb5b7262b011cfb793519bf97e54dff5282da23c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Dec 2018 09:18:11 -0700 Subject: nvme: provide fallback for discard alloc failure When boxes are run near (or to) OOM, we have a problem with the discard page allocation in nvme. If we fail allocating the special page, we return busy, and it'll get retried. But since ordering is honored for dispatch requests, we can keep retrying this same IO and failing. Behind that IO could be requests that want to free memory, but they never get the chance. Allocate a fixed discard page per controller for a safe fallback, and use that if the initial allocation fails. Signed-off-by: Jens Axboe Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 41 +++++++++++++++++++++++++++++++++++------ drivers/nvme/host/nvme.h | 3 +++ 2 files changed, 38 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3043cedb24e1..168f2c1eaf60 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -564,9 +564,19 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, struct nvme_dsm_range *range; struct bio *bio; - range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC); - if (!range) - return BLK_STS_RESOURCE; + range = kmalloc_array(segments, sizeof(*range), + GFP_ATOMIC | __GFP_NOWARN); + if (!range) { + /* + * If we fail allocation our range, fallback to the controller + * discard page. If that's also busy, it's safe to return + * busy, as we know we can make progress once that's freed. 
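+		 * The discard_page_busy bit serializes use of the fallback page.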
+ */ + if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy)) + return BLK_STS_RESOURCE; + + range = page_address(ns->ctrl->discard_page); + } __rq_for_each_bio(bio, req) { u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); @@ -581,7 +591,10 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, } if (WARN_ON_ONCE(n != segments)) { - kfree(range); + if (virt_to_page(range) == ns->ctrl->discard_page) + clear_bit_unlock(0, &ns->ctrl->discard_page_busy); + else + kfree(range); return BLK_STS_IOERR; } @@ -664,8 +677,13 @@ void nvme_cleanup_cmd(struct request *req) blk_rq_bytes(req) >> ns->lba_shift); } if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - kfree(page_address(req->special_vec.bv_page) + - req->special_vec.bv_offset); + struct nvme_ns *ns = req->rq_disk->private_data; + struct page *page = req->special_vec.bv_page; + + if (page == ns->ctrl->discard_page) + clear_bit_unlock(0, &ns->ctrl->discard_page_busy); + else + kfree(page_address(page) + req->special_vec.bv_offset); } } EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); @@ -3578,6 +3596,7 @@ static void nvme_free_ctrl(struct device *dev) ida_simple_remove(&nvme_instance_ida, ctrl->instance); kfree(ctrl->effects); nvme_mpath_uninit(ctrl); + kfree(ctrl->discard_page); if (subsys) { mutex_lock(&subsys->lock); @@ -3618,6 +3637,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; + BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) > + PAGE_SIZE); + ctrl->discard_page = alloc_page(GFP_KERNEL); + if (!ctrl->discard_page) { + ret = -ENOMEM; + goto out; + } + ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); if (ret < 0) goto out; @@ -3655,6 +3682,8 @@ out_free_name: out_release_instance: ida_simple_remove(&nvme_instance_ida, ctrl->instance); out: + if (ctrl->discard_page) + __free_page(ctrl->discard_page); return ret; } EXPORT_SYMBOL_GPL(nvme_init_ctrl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c78a9dc7fdcf..39b52f4d9b24 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -241,6 +241,9 @@ struct nvme_ctrl { u16 maxcmd; int nr_reconnects; struct nvmf_ctrl_options *opts; + + struct page *discard_page; + unsigned long discard_page_busy; }; struct nvme_subsystem { -- cgit From 16d3a280d4d73eae786ebf2576cd597a031bc9df Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 12 Dec 2018 23:01:54 -0800 Subject: nvmet: remove unused variable Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-bdev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index c1cb2ed5531c..e0831230ca27 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -61,7 +61,6 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) struct bio *bio; struct scatterlist *sg; sector_t sector; - blk_qc_t cookie; int op, op_flags = 0, i; if (!req->sg_cnt) { @@ -114,7 +113,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) sg_cnt--; } - cookie = submit_bio(bio); + submit_bio(bio); } static void nvmet_bdev_execute_flush(struct nvmet_req *req) -- cgit From b7c8f3663d0e0773aca3324c26bce3ca8343ec14 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 12 Dec 2018 15:11:37 -0800 Subject: nvme: remove nvme_common command cdw10 array This is a preparation patch which removes the 
nvme common command cdw10 array and replaces it with individual fields. This is needed for the nvmet error log page implementation, as it makes the error log page entry offset assignment easier.

Signed-off-by: Chaitanya Kulkarni
Reviewed-by: Sagi Grimberg
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/core.c        | 18 +++++++++---------
 drivers/nvme/host/lightnvm.c    |  6 +++---
 drivers/nvme/host/trace.h       |  4 ++--
 drivers/nvme/target/admin-cmd.c | 12 ++++++------
 drivers/nvme/target/discovery.c |  4 ++--
 drivers/nvme/target/nvmet.h     |  2 +-
 6 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 168f2c1eaf60..4d8ee7186268 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1283,12 +1283,12 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	c.common.nsid = cpu_to_le32(cmd.nsid);
 	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
 	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
-	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
-	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
-	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
-	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
-	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
-	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
+	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
+	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
+	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
+	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
+	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
 
 	if (cmd.timeout_ms)
 		timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -1649,7 +1649,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
 	memset(&c, 0, sizeof(c));
 	c.common.opcode = op;
 	c.common.nsid = cpu_to_le32(ns->head->ns_id);
-	c.common.cdw10[0] = cpu_to_le32(cdw10);
+	c.common.cdw10 = cpu_to_le32(cdw10);
 
 	ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
 	nvme_put_ns_from_disk(head, srcu_idx);
@@ -1723,8 +1723,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 	else
 		cmd.common.opcode = nvme_admin_security_recv;
 	cmd.common.nsid = 0;
-	cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
-	cmd.common.cdw10[1] = cpu_to_le32(len);
+	cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
+	cmd.common.cdw11 = cpu_to_le32(len);
 
 	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
 			ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index f145fc0220d6..b759c25c89c8 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -937,9 +937,9 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin,
 	/* cdw11-12 */
 	c.ph_rw.length = cpu_to_le16(vcmd.nppas);
 	c.ph_rw.control = cpu_to_le16(vcmd.control);
-	c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13);
-	c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14);
-	c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15);
+	c.common.cdw13 = cpu_to_le32(vcmd.cdw13);
+	c.common.cdw14 = cpu_to_le32(vcmd.cdw14);
+	c.common.cdw15 = cpu_to_le32(vcmd.cdw15);
 
 	if (vcmd.timeout_ms)
 		timeout = msecs_to_jiffies(vcmd.timeout_ms);
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index 196d5bd56718..1978deb6fcc7 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -115,8 +115,8 @@ TRACE_EVENT(nvme_setup_cmd,
 		__entry->nsid = le32_to_cpu(cmd->common.nsid);
 		__entry->metadata = le64_to_cpu(cmd->common.metadata);
 		__assign_disk_name(__entry->disk, req->rq_disk);
-		memcpy(__entry->cdw10,
cmd->common.cdw10, - sizeof(__entry->cdw10)); + memcpy(__entry->cdw10, &cmd->common.cdw10, + 6 * sizeof(__entry->cdw10)); ), TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", __entry->ctrl_id, __print_disk_name(__entry->disk), diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 753515fc8028..721b041a6b3b 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -557,7 +557,7 @@ static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req) static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) { - u32 write_protect = le32_to_cpu(req->cmd->common.cdw10[1]); + u32 write_protect = le32_to_cpu(req->cmd->common.cdw11); struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE; @@ -589,7 +589,7 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) u16 nvmet_set_feat_kato(struct nvmet_req *req) { - u32 val32 = le32_to_cpu(req->cmd->common.cdw10[1]); + u32 val32 = le32_to_cpu(req->cmd->common.cdw11); req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); @@ -600,7 +600,7 @@ u16 nvmet_set_feat_kato(struct nvmet_req *req) u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) { - u32 val32 = le32_to_cpu(req->cmd->common.cdw10[1]); + u32 val32 = le32_to_cpu(req->cmd->common.cdw11); if (val32 & ~mask) return NVME_SC_INVALID_FIELD | NVME_SC_DNR; @@ -614,7 +614,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) static void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; - u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 status = 0; switch (cdw10 & 0xff) { @@ -675,7 +675,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req) static void nvmet_execute_get_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; - u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 status = 0; switch (cdw10 & 0xff) { @@ -715,7 +715,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req) break; case NVME_FEAT_HOST_ID: /* need 128-bit host identifier flag */ - if (!(req->cmd->common.cdw10[1] & cpu_to_le32(1 << 0))) { + if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) { status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 4d8757ae8210..e1bb254671de 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -247,7 +247,7 @@ out: static void nvmet_execute_disc_set_features(struct nvmet_req *req) { - u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 stat; switch (cdw10 & 0xff) { @@ -268,7 +268,7 @@ static void nvmet_execute_disc_set_features(struct nvmet_req *req) static void nvmet_execute_disc_get_features(struct nvmet_req *req) { - u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 stat = 0; switch (cdw10 & 0xff) { diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 89df51ee5bdf..dafee1af4829 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -349,7 +349,7 @@ struct nvmet_async_event { static inline void nvmet_clear_aen_bit(struct nvmet_req *req, u32 bn) { - int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15; + int rae = 
le32_to_cpu(req->cmd->common.cdw10) & 1 << 15; if (!rae) clear_bit(bn, &req->sq->ctrl->aen_masked); -- cgit From e4a976254ec5ebdcdb3e1e1b40e795b3db6b6dab Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 12 Dec 2018 15:11:39 -0800 Subject: nvmet: add error-log definitions This patch adds necessary fields in the target data structures to support error log page. For a target controller, we add a new error log field to maintain the error log, at any given point we maintain error entries equal to NVMET_ERROR_LOG_SLOTS for each controller. In the following patch, we also update the error log page entry in the I/O completion path so we introduce a spinlock for synchronization of the log. For nvmet_req, we add a new field error_loc to hold the location of the error in the command when the actual error occurs for each request and a starting LBA if applicable. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 5 +++++ drivers/nvme/target/nvmet.h | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index e468100b9211..32dca1b50c21 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -769,6 +769,8 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->rsp->status = 0; req->rsp->sq_head = 0; req->ns = NULL; + req->error_loc = -1; + req->error_slba = 0; /* no support for fused commands yet */ if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { @@ -1174,6 +1176,9 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, /* keep-alive timeout in seconds */ ctrl->kato = DIV_ROUND_UP(kato, 1000); + ctrl->err_counter = 0; + spin_lock_init(&ctrl->error_lock); + nvmet_start_keep_alive_timer(ctrl); mutex_lock(&subsys->lock); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index dafee1af4829..61b3cc415905 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -202,6 +202,10 @@ struct nvmet_ctrl { struct device *p2p_client; struct radix_tree_root p2p_ns_map; + + spinlock_t error_lock; + u64 err_counter; + struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; }; struct nvmet_subsys { @@ -317,6 +321,8 @@ struct nvmet_req { struct pci_dev *p2p_dev; struct device *p2p_client; + u16 error_loc; + u64 error_slba; }; extern struct workqueue_struct *buffered_io_wq; -- cgit From 76574f37bf4caa7150ea1559cd98e3017b9752d2 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 12 Dec 2018 15:11:40 -0800 Subject: nvmet: add interface to update error-log page This patch adds nvmet_req based interface to the nvmet-core so that we can update the error log page. We update error log page in the request completion path when status is not set to NVME_SC_SUCCESS. 
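For reference, the 16-bit completion status word that nvmet_set_error() below constructs packs several fields into one le16; a minimal decoding sketch (host-side view, with cqe a hypothetical struct nvme_completion pointer; an aid for reading the patch, not patch code):

	u16 sts = le16_to_cpu(cqe->status);
	bool phase = sts & 0x1;        /* bit 0: phase tag, which is why status is shifted left by 1 */
	u8 sc = (sts >> 1) & 0xff;     /* bits 8:1: status code */
	u8 sct = (sts >> 9) & 0x7;     /* bits 11:9: status code type */
	bool more = sts & (1 << 14);   /* bit 14: More - an error log entry has the details */
	bool dnr = sts & (1 << 15);    /* bit 15: Do Not Retry */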
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 32 +++++++++++++++++++++++++++++++- drivers/nvme/target/nvmet.h | 5 ----- 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 32dca1b50c21..c7753ecdaaa5 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -611,14 +611,44 @@ static void nvmet_update_sq_head(struct nvmet_req *req) req->rsp->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF); } +static void nvmet_set_error(struct nvmet_req *req, u16 status) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_error_slot *new_error_slot; + unsigned long flags; + + req->rsp->status = cpu_to_le16(status << 1); + + if (!ctrl || req->error_loc == -1) + return; + + spin_lock_irqsave(&ctrl->error_lock, flags); + ctrl->err_counter++; + new_error_slot = + &ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS]; + + new_error_slot->error_count = cpu_to_le64(ctrl->err_counter); + new_error_slot->sqid = cpu_to_le16(req->sq->qid); + new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id); + new_error_slot->status_field = cpu_to_le16(status << 1); + new_error_slot->param_error_location = cpu_to_le16(req->error_loc); + new_error_slot->lba = cpu_to_le64(req->error_slba); + new_error_slot->nsid = req->cmd->common.nsid; + spin_unlock_irqrestore(&ctrl->error_lock, flags); + + /* set the more bit for this request */ + req->rsp->status |= cpu_to_le16(1 << 14); +} + static void __nvmet_req_complete(struct nvmet_req *req, u16 status) { if (!req->sq->sqhd_disabled) nvmet_update_sq_head(req); req->rsp->sq_id = cpu_to_le16(req->sq->qid); req->rsp->command_id = req->cmd->common.command_id; + if (unlikely(status)) - nvmet_set_status(req, status); + nvmet_set_error(req, status); if (req->ns) nvmet_put_namespace(req->ns); req->ops->queue_response(req); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 61b3cc415905..a2a7fd69b66a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -327,11 +327,6 @@ struct nvmet_req { extern struct workqueue_struct *buffered_io_wq; -static inline void nvmet_set_status(struct nvmet_req *req, u16 status) -{ - req->rsp->status = cpu_to_le16(status << 1); -} - static inline void nvmet_set_result(struct nvmet_req *req, u32 result) { req->rsp->result.u32 = cpu_to_le32(result); -- cgit From e81446afc7f5ee45fd90b1b5a22ffa247dca5b15 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 12 Dec 2018 15:11:41 -0800 Subject: nvmet: add error log support in the core This patch adds the support to maintain error log page for the nvmet-core. 
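A note on error_loc: it is simply the byte offset of the offending field within the 64-byte submission queue entry, which is what the error log's Parameter Error Location field reports. A hedged one-line illustration (not from the patch):

	/* nsid lives at byte 4 of every SQE, so a bad namespace id is reported as offset 4 */
	req->error_loc = offsetof(struct nvme_common_command, nsid);	/* == 4 */

The -1 assigned in nvmet_req_init() above acts as a "no location recorded" sentinel.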
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index c7753ecdaaa5..0b969977e361 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -51,22 +51,28 @@ static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, size_t len) { - if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } return 0; } u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) { - if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } return 0; } u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len) { - if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) + if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } return 0; } @@ -769,14 +775,20 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) return ret; req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); - if (unlikely(!req->ns)) + if (unlikely(!req->ns)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return NVME_SC_INVALID_NS | NVME_SC_DNR; + } ret = nvmet_check_ana_state(req->port, req->ns); - if (unlikely(ret)) + if (unlikely(ret)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return ret; + } ret = nvmet_io_cmd_check_access(req); - if (unlikely(ret)) + if (unlikely(ret)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return ret; + } if (req->ns->file) return nvmet_file_parse_io_cmd(req); @@ -804,6 +816,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, /* no support for fused commands yet */ if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { + req->error_loc = offsetof(struct nvme_common_command, flags); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto fail; } @@ -814,6 +827,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, * byte aligned. 
*/ if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) { + req->error_loc = offsetof(struct nvme_common_command, flags); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto fail; } @@ -859,9 +873,10 @@ EXPORT_SYMBOL_GPL(nvmet_req_uninit); void nvmet_req_execute(struct nvmet_req *req) { - if (unlikely(req->data_len != req->transfer_len)) + if (unlikely(req->data_len != req->transfer_len)) { + req->error_loc = offsetof(struct nvme_common_command, dptr); nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); - else + } else req->execute(req); } EXPORT_SYMBOL_GPL(nvmet_req_execute); -- cgit From 84faf42b8aff3be95d96249da4152c77d81e1469 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 12 Dec 2018 15:11:44 -0800 Subject: nvmet: add error log support for fabrics-cmd This patch adds the support to maintain the error log page for the fabrics prop get, prop set, and admin connect commands. Here we also update discovery.c, and update the set/get features and parse functions to support the error log page. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/discovery.c | 10 ++++++++ drivers/nvme/target/fabrics-cmd.c | 48 ++++++++++++++++++++++++++++----------- 2 files changed, 45 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index e1bb254671de..d2cb71a0b419 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -259,6 +259,8 @@ static void nvmet_execute_disc_set_features(struct nvmet_req *req) NVMET_DISC_AEN_CFG_OPTIONAL); break; default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } @@ -279,6 +281,8 @@ static void nvmet_execute_disc_get_features(struct nvmet_req *req) nvmet_get_feat_async_event(req); break; default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } @@ -293,6 +297,8 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { pr_err("got cmd %d while not ready\n", cmd->common.opcode); + req->error_loc = + offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } @@ -323,6 +329,8 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) default: pr_err("unsupported get_log_page lid %d\n", cmd->get_log_page.lid); + req->error_loc = + offsetof(struct nvme_get_log_page_command, lid); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } case nvme_admin_identify: @@ -335,10 +343,12 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) default: pr_err("unsupported identify cns %d\n", cmd->identify.cns); + req->error_loc = offsetof(struct nvme_identify, cns); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } default: pr_err("unhandled cmd %d\n", cmd->common.opcode); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index ee7d84621d65..6cf1fd9eb32e 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -17,23 +17,26 @@ static void nvmet_execute_prop_set(struct nvmet_req *req) { + u64 val = le64_to_cpu(req->cmd->prop_set.value); u16 status = 0; - if (!(req->cmd->prop_set.attrib & 1)) { - u64 val = le64_to_cpu(req->cmd->prop_set.value); - - switch (le32_to_cpu(req->cmd->prop_set.offset)) { -
case NVME_REG_CC: - nvmet_update_cc(req->sq->ctrl, val); - break; - default: - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - break; - } - } else { + if (req->cmd->prop_set.attrib & 1) { + req->error_loc = + offsetof(struct nvmf_property_set_command, attrib); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; } + switch (le32_to_cpu(req->cmd->prop_set.offset)) { + case NVME_REG_CC: + nvmet_update_cc(req->sq->ctrl, val); + break; + default: + req->error_loc = + offsetof(struct nvmf_property_set_command, offset); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } +out: nvmet_req_complete(req, status); } @@ -69,6 +72,14 @@ static void nvmet_execute_prop_get(struct nvmet_req *req) } } + if (status && req->cmd->prop_get.attrib & 1) { + req->error_loc = + offsetof(struct nvmf_property_get_command, offset); + } else { + req->error_loc = + offsetof(struct nvmf_property_get_command, attrib); + } + req->rsp->result.u64 = cpu_to_le64(val); nvmet_req_complete(req, status); } @@ -89,6 +100,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req) default: pr_err("received unknown capsule type 0x%x\n", cmd->fabrics.fctype); + req->error_loc = offsetof(struct nvmf_common_command, fctype); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } @@ -105,10 +117,12 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) old = cmpxchg(&req->sq->ctrl, NULL, ctrl); if (old) { pr_warn("queue already connected!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, opcode); return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; } if (!sqsize) { pr_warn("queue size zero!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, sqsize); return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; } @@ -157,6 +171,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) if (c->recfmt != 0) { pr_warn("invalid connect version (%d).\n", le16_to_cpu(c->recfmt)); + req->error_loc = offsetof(struct nvmf_connect_command, recfmt); status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; goto out; } @@ -171,8 +186,13 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, le32_to_cpu(c->kato), &ctrl); - if (status) + if (status) { + if (status == (NVME_SC_INVALID_FIELD | NVME_SC_DNR)) + req->error_loc = + offsetof(struct nvme_common_command, opcode); goto out; + } + uuid_copy(&ctrl->hostid, &d->hostid); status = nvmet_install_queue(ctrl, req); @@ -259,11 +279,13 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req) if (cmd->common.opcode != nvme_fabrics_command) { pr_err("invalid command 0x%x on unconnected queue.\n", cmd->fabrics.opcode); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } if (cmd->fabrics.fctype != nvme_fabrics_type_connect) { pr_err("invalid capsule type 0x%x on unconnected queue.\n", cmd->fabrics.fctype); + req->error_loc = offsetof(struct nvmf_common_command, fctype); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } -- cgit From 762a11dfee10783a2fe4c467a68bac601e5acf1c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 12 Dec 2018 15:11:45 -0800 Subject: nvmet: add error log support for rdma backend This patch adds the support to maintain the error log page for rdma transport, we mainly focus here on the NVME_INVALID_FIELD errors. 
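A note on the nested switches in the diff below: the SGL descriptor's type byte splits into two nibbles, descriptor type and subtype, per the NVMe spec. A small sketch of the decoding (an assumption for illustration, not patch code):

	u8 desc_type = sgl->type >> 4;  /* e.g. data block, keyed data block descriptor */
	u8 subtype = sgl->type & 0xf;   /* e.g. offset-based vs. address-based */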
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index fb84caddd94b..a8d23eb80192 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -630,8 +630,11 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) u64 off = le64_to_cpu(sgl->addr); u32 len = le32_to_cpu(sgl->length); - if (!nvme_is_write(rsp->req.cmd)) + if (!nvme_is_write(rsp->req.cmd)) { + rsp->req.error_loc = + offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } if (off + len > rsp->queue->dev->inline_data_size) { pr_err("invalid inline data offset!\n"); @@ -696,6 +699,8 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) return nvmet_rdma_map_sgl_inline(rsp); default: pr_err("invalid SGL subtype: %#x\n", sgl->type); + rsp->req.error_loc = + offsetof(struct nvme_common_command, dptr); return NVME_SC_INVALID_FIELD | NVME_SC_DNR; } case NVME_KEY_SGL_FMT_DATA_DESC: @@ -706,10 +711,13 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); default: pr_err("invalid SGL subtype: %#x\n", sgl->type); + rsp->req.error_loc = + offsetof(struct nvme_common_command, dptr); return NVME_SC_INVALID_FIELD | NVME_SC_DNR; } default: pr_err("invalid SGL type: %#x\n", sgl->type); + rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; } } -- cgit From 2da6e00580f5bc13ed0ba0acaa9d7ce0df226e7e Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 12 Dec 2018 15:11:46 -0800 Subject: nvmet: add error log support for admin-cmd This patch adds the support to maintain the error log page for admin commands. 
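Most of the admin-cmd hunks below point error_loc at cdw10 or cdw11 because the Feature Identifier travels in the low byte of CDW10. A hedged sketch of the pattern (feature_supported() is a hypothetical helper):

	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
	u8 fid = cdw10 & 0xff;	/* Feature Identifier */
	if (!feature_supported(fid)) {
		/* cdw10 starts at byte 40 of the SQE */
		req->error_loc = offsetof(struct nvme_common_command, cdw10);
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
	}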
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 721b041a6b3b..fa62db7a5e9e 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -47,6 +47,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, if (!ns) { pr_err("Could not find namespace id : %d\n", le32_to_cpu(req->cmd->get_log_page.nsid)); + req->error_loc = offsetof(struct nvme_rw_command, nsid); return NVME_SC_INVALID_NS; } @@ -380,6 +381,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) u16 status = 0; if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { + req->error_loc = offsetof(struct nvme_identify, nsid); status = NVME_SC_INVALID_NS | NVME_SC_DNR; goto out; } @@ -500,6 +502,7 @@ static void nvmet_execute_identify_desclist(struct nvmet_req *req) ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); if (!ns) { + req->error_loc = offsetof(struct nvme_identify, nsid); status = NVME_SC_INVALID_NS | NVME_SC_DNR; goto out; } @@ -562,8 +565,10 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE; req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid); - if (unlikely(!req->ns)) + if (unlikely(!req->ns)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return status; + } mutex_lock(&subsys->lock); switch (write_protect) { @@ -602,8 +607,10 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) { u32 val32 = le32_to_cpu(req->cmd->common.cdw11); - if (val32 & ~mask) + if (val32 & ~mask) { + req->error_loc = offsetof(struct nvme_common_command, cdw11); return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } WRITE_ONCE(req->sq->ctrl->aen_enabled, val32); nvmet_set_result(req, val32); @@ -635,6 +642,7 @@ static void nvmet_execute_set_features(struct nvmet_req *req) status = nvmet_set_feat_write_protect(req); break; default: + req->error_loc = offsetof(struct nvme_common_command, cdw10); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } @@ -648,9 +656,10 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) u32 result; req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid); - if (!req->ns) + if (!req->ns) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return NVME_SC_INVALID_NS | NVME_SC_DNR; - + } mutex_lock(&subsys->lock); if (req->ns->readonly == true) result = NVME_NS_WRITE_PROTECT; @@ -716,6 +725,8 @@ static void nvmet_execute_get_features(struct nvmet_req *req) case NVME_FEAT_HOST_ID: /* need 128-bit host identifier flag */ if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) { + req->error_loc = + offsetof(struct nvme_common_command, cdw11); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } @@ -727,6 +738,8 @@ static void nvmet_execute_get_features(struct nvmet_req *req) status = nvmet_get_feat_write_protect(req); break; default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } @@ -848,5 +861,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, req->sq->qid); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } -- cgit From 3b031d15995f3e4c43e38159072f07efe3fa35d9 Mon Sep 17 
00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 12 Dec 2018 15:11:42 -0800 Subject: nvmet: add error log support for bdev backend This patch adds the support for the block device backend to populate the error log entries. Here we map the blk_status_t to the NVMe status. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-bdev.c | 84 +++++++++++++++++++++++++++++++++------ 1 file changed, 72 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index e0831230ca27..b6d030d3259f 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -44,13 +44,69 @@ void nvmet_bdev_ns_disable(struct nvmet_ns *ns) } } +static u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) +{ + u16 status = NVME_SC_SUCCESS; + + if (likely(blk_sts == BLK_STS_OK)) + return status; + /* + * Right now there exists an M : 1 mapping from block layer errors + * to NVMe status codes (see nvme_error_status()). For consistency, + * when we reverse map we use the most appropriate NVMe status code + * from the group of NVMe status codes used in nvme_error_status(). + */ + switch (blk_sts) { + case BLK_STS_NOSPC: + status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_rw_command, length); + break; + case BLK_STS_TARGET: + status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_rw_command, slba); + break; + case BLK_STS_NOTSUPP: + req->error_loc = offsetof(struct nvme_common_command, opcode); + switch (req->cmd->common.opcode) { + case nvme_cmd_dsm: + case nvme_cmd_write_zeroes: + status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR; + break; + default: + status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + break; + case BLK_STS_MEDIUM: + status = NVME_SC_ACCESS_DENIED; + req->error_loc = offsetof(struct nvme_rw_command, nsid); + break; + case BLK_STS_IOERR: + /* fallthru */ + default: + status = NVME_SC_INTERNAL | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_common_command, opcode); + } + + switch (req->cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->error_slba = le64_to_cpu(req->cmd->rw.slba); + break; + case nvme_cmd_write_zeroes: + req->error_slba = + le64_to_cpu(req->cmd->write_zeroes.slba); + break; + default: + req->error_slba = 0; + } + return status; +} + static void nvmet_bio_done(struct bio *bio) { struct nvmet_req *req = bio->bi_private; - nvmet_req_complete(req, - bio->bi_status ?
NVME_SC_INTERNAL | NVME_SC_DNR : 0); - + nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status)); if (bio != &req->b.inline_bio) bio_put(bio); } @@ -136,18 +192,21 @@ u16 nvmet_bdev_flush(struct nvmet_req *req) return 0; } -static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns, +static u16 nvmet_bdev_discard_range(struct nvmet_req *req, struct nvme_dsm_range *range, struct bio **bio) { + struct nvmet_ns *ns = req->ns; int ret; ret = __blkdev_issue_discard(ns->bdev, le64_to_cpu(range->slba) << (ns->blksize_shift - 9), le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), GFP_KERNEL, 0, bio); - if (ret && ret != -EOPNOTSUPP) - return NVME_SC_INTERNAL | NVME_SC_DNR; - return 0; + + if (ret) + req->error_slba = le64_to_cpu(range->slba); + + return blk_to_nvme_status(req, errno_to_blk_status(ret)); } static void nvmet_bdev_execute_discard(struct nvmet_req *req) @@ -163,7 +222,7 @@ static void nvmet_bdev_execute_discard(struct nvmet_req *req) if (status) break; - status = nvmet_bdev_discard_range(req->ns, &range, &bio); + status = nvmet_bdev_discard_range(req, &range, &bio); if (status) break; } @@ -204,16 +263,16 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req) u16 status = NVME_SC_SUCCESS; sector_t sector; sector_t nr_sector; + int ret; sector = le64_to_cpu(write_zeroes->slba) << (req->ns->blksize_shift - 9); nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << (req->ns->blksize_shift - 9)); - if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, - GFP_KERNEL, &bio, 0)) - status = NVME_SC_INTERNAL | NVME_SC_DNR; - + ret = __blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, + GFP_KERNEL, &bio, 0); + status = blk_to_nvme_status(req, errno_to_blk_status(ret)); if (bio) { bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; @@ -248,6 +307,7 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) default: pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, req->sq->qid); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } } -- cgit From c6aa3542e01026a94d24713ee2c0dce517e9b6de Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 12 Dec 2018 15:11:43 -0800 Subject: nvmet: add error log support for file backend This patch adds support for the file backend to populate the error log entries. Here we map the errno to the NVMe status codes. 
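A hedged usage sketch of the helper this patch introduces (see errno_to_nvme_status() in the diff below): a backend error path funnels the raw errno through it and gets a spec-level status back, with req->error_loc filled in as a side effect:

	ret = vfs_fallocate(req->ns->file, mode, offset, len);	/* returns 0 or -errno */
	if (ret)
		status = errno_to_nvme_status(req, ret);	/* e.g. -EOPNOTSUPP becomes
								   NVME_SC_ONCS_NOT_SUPPORTED for dsm/write_zeroes */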
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 38 ++++++++++++++++++++++++++++++++++++++ drivers/nvme/target/io-cmd-file.c | 35 ++++++++++++++++++++--------------- drivers/nvme/target/nvmet.h | 2 ++ 3 files changed, 60 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 0b969977e361..cc81d0231587 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -45,6 +45,44 @@ u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; u64 nvmet_ana_chgcnt; DECLARE_RWSEM(nvmet_ana_sem); +inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno) +{ + u16 status; + + switch (errno) { + case -ENOSPC: + req->error_loc = offsetof(struct nvme_rw_command, length); + status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR; + break; + case -EREMOTEIO: + req->error_loc = offsetof(struct nvme_rw_command, slba); + status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + break; + case -EOPNOTSUPP: + req->error_loc = offsetof(struct nvme_common_command, opcode); + switch (req->cmd->common.opcode) { + case nvme_cmd_dsm: + case nvme_cmd_write_zeroes: + status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR; + break; + default: + status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + break; + case -ENODATA: + req->error_loc = offsetof(struct nvme_rw_command, nsid); + status = NVME_SC_ACCESS_DENIED; + break; + case -EIO: + /* FALLTHRU */ + default: + req->error_loc = offsetof(struct nvme_common_command, opcode); + status = NVME_SC_INTERNAL | NVME_SC_DNR; + } + + return status; +} + static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, const char *subsysnqn); diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 12eaa8ddc248..517522305e5c 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -112,6 +112,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) { struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb); + u16 status = NVME_SC_SUCCESS; if (req->f.bvec != req->inline_bvec) { if (likely(req->f.mpool_alloc == false)) @@ -120,8 +121,9 @@ static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) mempool_free(req->f.bvec, req->ns->bvec_pool); } - nvmet_req_complete(req, ret != req->data_len ? 
- NVME_SC_INTERNAL | NVME_SC_DNR : 0); + if (unlikely(ret != req->data_len)) + status = errno_to_nvme_status(req, ret); + nvmet_req_complete(req, status); } static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) @@ -140,7 +142,7 @@ static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; if (unlikely(pos + req->data_len > req->ns->size)) { - nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); + nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC)); return true; } @@ -254,9 +256,7 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) u16 nvmet_file_flush(struct nvmet_req *req) { - if (vfs_fsync(req->ns->file, 1) < 0) - return NVME_SC_INTERNAL | NVME_SC_DNR; - return 0; + return errno_to_nvme_status(req, vfs_fsync(req->ns->file, 1)); } static void nvmet_file_flush_work(struct work_struct *w) @@ -277,30 +277,34 @@ static void nvmet_file_execute_discard(struct nvmet_req *req) int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; struct nvme_dsm_range range; loff_t offset, len; - u16 ret; + u16 status = 0; + int ret; int i; for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { - ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range, + status = nvmet_copy_from_sgl(req, i * sizeof(range), &range, sizeof(range)); - if (ret) + if (status) break; offset = le64_to_cpu(range.slba) << req->ns->blksize_shift; len = le32_to_cpu(range.nlb); len <<= req->ns->blksize_shift; if (offset + len > req->ns->size) { - ret = NVME_SC_LBA_RANGE | NVME_SC_DNR; + req->error_slba = le64_to_cpu(range.slba); + status = errno_to_nvme_status(req, -ENOSPC); break; } - if (vfs_fallocate(req->ns->file, mode, offset, len)) { - ret = NVME_SC_INTERNAL | NVME_SC_DNR; + ret = vfs_fallocate(req->ns->file, mode, offset, len); + if (ret) { + req->error_slba = le64_to_cpu(range.slba); + status = errno_to_nvme_status(req, ret); break; } } - nvmet_req_complete(req, ret); + nvmet_req_complete(req, status); } static void nvmet_file_dsm_work(struct work_struct *w) @@ -340,12 +344,12 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w) req->ns->blksize_shift); if (unlikely(offset + len > req->ns->size)) { - nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); + nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC)); return; } ret = vfs_fallocate(req->ns->file, mode, offset, len); - nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + nvmet_req_complete(req, ret < 0 ? 
errno_to_nvme_status(req, ret) : 0); } static void nvmet_file_execute_write_zeroes(struct nvmet_req *req) @@ -380,6 +384,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) default: pr_err("unhandled cmd for file ns %d on qid %d\n", cmd->common.opcode, req->sq->qid); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index a2a7fd69b66a..3b5f0bcaf3e8 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -494,4 +494,6 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req) return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << req->ns->blksize_shift; } + +u16 errno_to_nvme_status(struct nvmet_req *req, int errno); #endif /* _NVMET_H */ -- cgit From 11ad507784ed5113af97903af1b0c4aea6b90690 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 12 Dec 2018 15:11:47 -0800 Subject: nvmet: add error log page cmd handler Now that we have support for all the major parts of the target we add a NVMe error log page handler so that host can read the log page. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index fa62db7a5e9e..00956b4106a8 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -37,6 +37,34 @@ static void nvmet_execute_get_log_page_noop(struct nvmet_req *req) nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len)); } +static void nvmet_execute_get_log_page_error(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = NVME_SC_SUCCESS; + unsigned long flags; + off_t offset = 0; + u64 slot; + u64 i; + + spin_lock_irqsave(&ctrl->error_lock, flags); + slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS; + + for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) { + status = nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot], + sizeof(struct nvme_error_slot)); + if (status) + break; + + if (slot == 0) + slot = NVMET_ERROR_LOG_SLOTS - 1; + else + slot--; + offset += sizeof(struct nvme_error_slot); + } + spin_unlock_irqrestore(&ctrl->error_lock, flags); + nvmet_req_complete(req, status); +} + static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, struct nvme_smart_log *slog) { @@ -789,13 +817,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) switch (cmd->get_log_page.lid) { case NVME_LOG_ERROR: - /* - * We currently never set the More bit in the status - * field, so all error log entries are invalid and can - * be zeroed out. This is called a minum viable - * implementation (TM) of this mandatory log page. - */ - req->execute = nvmet_execute_get_log_page_noop; + req->execute = nvmet_execute_get_log_page_error; return 0; case NVME_LOG_SMART: req->execute = nvmet_execute_get_log_page_smart; -- cgit From 23454d59cc16ddddf4b2290bbe60d2d9581dfd9a Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 12 Dec 2018 15:11:48 -0800 Subject: nvmet: update smart log with num err log entries Now that we have error log page implementation update smart log command handler to provide number of error log entries in the lifetime of the controller field. 
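With both the error log handler and this counter in place, the result can be observed from an initiator; assuming a host with nvme-cli installed, something like the following (the sample value is illustrative only):

	# nvme error-log /dev/nvme0	# dumps the populated error log slots
	# nvme smart-log /dev/nvme0 | grep num_err_log_entries
	num_err_log_entries             : 3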
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 00956b4106a8..11baeb14c388 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -135,6 +135,7 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) { struct nvme_smart_log *log; u16 status = NVME_SC_INTERNAL; + unsigned long flags; if (req->data_len != sizeof(*log)) goto out; @@ -150,6 +151,11 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) if (status) goto out_free_log; + spin_lock_irqsave(&req->sq->ctrl->error_lock, flags); + put_unaligned_le64(req->sq->ctrl->err_counter, + &log->num_err_log_entries); + spin_unlock_irqrestore(&req->sq->ctrl->error_lock, flags); + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); out_free_log: kfree(log); -- cgit From e42b3867de4bd5ee3a1849afb68a1fa8627f7282 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 11 Dec 2018 23:38:54 -0800 Subject: blk-mq-rdma: pass in queue map to blk_mq_rdma_map_queues Will be used by nvme-rdma for queue map separation support. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f2db848f6985..5057d5ab5aaa 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1751,7 +1751,7 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = set->driver_data; - return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0); + return blk_mq_rdma_map_queues(&set->map[0], ctrl->device->dev, 0); } static const struct blk_mq_ops nvme_rdma_mq_ops = { -- cgit From fa9a1811e094658e53b0c82b6ce0431c4c54fc1b Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 11 Dec 2018 23:38:55 -0800 Subject: nvme-fabrics: add missing nvmf_ctrl_options documentation Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 524a02a67817..28dc916ef26b 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -88,6 +88,9 @@ enum { * @max_reconnects: maximum number of allowed reconnect attempts before removing * the controller, (-1) means reconnect forever, zero means remove * immediately; + * @disable_sqflow: disable controller sq flow control + * @hdr_digest: generate/verify header digest (TCP) + * @data_digest: generate/verify data digest (TCP) */ struct nvmf_ctrl_options { unsigned mask; -- cgit From 330f6b8a70771f0b059b7bcbc9a28d8023235b55 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 11 Dec 2018 23:38:56 -0800 Subject: nvme-fabrics: allow user to set nr_write_queues for separate queue maps This argument will specify how many I/O queues will be connected in create_ctrl in addition to nr_io_queues. With this configuration, I/O that carries payload from the host to the target, will use the default hctx queue map, and I/O that involves target to host transfers will use the read hctx queue map. 
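Since the new token is parsed from the option string written to /dev/nvme-fabrics, a controller with, say, 4 dedicated write queues could be created roughly as follows (address and NQN are placeholders):

	# echo "transport=rdma,traddr=192.168.1.10,trsvcid=4420,nqn=testnqn,nr_write_queues=4" > /dev/nvme-fabrics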
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 13 +++++++++++++ drivers/nvme/host/fabrics.h | 3 +++ 2 files changed, 16 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 9c62c6838b76..19ff0eae4582 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -616,6 +616,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" }, { NVMF_OPT_HDR_DIGEST, "hdr_digest" }, { NVMF_OPT_DATA_DIGEST, "data_digest" }, + { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, { NVMF_OPT_ERR, NULL } }; @@ -837,6 +838,18 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, case NVMF_OPT_DATA_DIGEST: opts->data_digest = true; break; + case NVMF_OPT_NR_WRITE_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid nr_write_queues %d\n", token); + ret = -EINVAL; + goto out; + } + opts->nr_write_queues = token; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 28dc916ef26b..81b8fd1c0c5d 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -61,6 +61,7 @@ enum { NVMF_OPT_DISABLE_SQFLOW = 1 << 14, NVMF_OPT_HDR_DIGEST = 1 << 15, NVMF_OPT_DATA_DIGEST = 1 << 16, + NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, }; /** @@ -91,6 +92,7 @@ enum { * @disable_sqflow: disable controller sq flow control * @hdr_digest: generate/verify header digest (TCP) * @data_digest: generate/verify data digest (TCP) + * @nr_write_queues: number of queues for write I/O */ struct nvmf_ctrl_options { unsigned mask; @@ -110,6 +112,7 @@ struct nvmf_ctrl_options { bool disable_sqflow; bool hdr_digest; bool data_digest; + unsigned int nr_write_queues; }; /* -- cgit From 873946f4b957bf6e6a0351dd5282c73e0c870be9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 11 Dec 2018 23:38:57 -0800 Subject: nvme-tcp: support separate queue maps for read and write Allow NVMF_OPT_NR_WRITE_QUEUES to describe additional write queues. In addition, implement .map_queues that will apply 2 queue maps for read and write queue sets. Note that with the separate queue map, HCTX_TYPE_READ will always use nr_io_queues and HCTX_TYPE_DEFAULT will use nr_write_queues. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 15543358e245..51826479a41e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1215,7 +1215,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; struct linger sol = { .l_onoff = 1, .l_linger = 0 }; - int ret, opt, rcv_pdu_size; + int ret, opt, rcv_pdu_size, n; queue->ctrl = ctrl; INIT_LIST_HEAD(&queue->send_list); @@ -1271,7 +1271,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } queue->sock->sk->sk_allocation = GFP_ATOMIC; - queue->io_cpu = (qid == 0) ? 
0 : qid - 1; + if (!qid) + n = 0; + else + n = (qid - 1) % num_online_cpus(); + queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); queue->request = NULL; queue->data_remaining = 0; queue->ddgst_remaining = 0; @@ -1433,6 +1437,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, set->driver_data = ctrl; set->nr_hw_queues = nctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; + set->nr_maps = 2 /* default + read */; } ret = blk_mq_alloc_tag_set(set); @@ -1527,7 +1532,12 @@ out_free_queues: static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) { - return min(ctrl->queue_count - 1, num_online_cpus()); + unsigned int nr_io_queues; + + nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); + nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); + + return nr_io_queues; } static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl) @@ -2052,6 +2062,29 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } +static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_tcp_ctrl *ctrl = set->driver_data; + + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues; + if (ctrl->ctrl.opts->nr_write_queues) { + /* separate read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->ctrl.opts->nr_write_queues; + set->map[HCTX_TYPE_READ].queue_offset = + ctrl->ctrl.opts->nr_write_queues; + } else { + /* mixed read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->ctrl.opts->nr_io_queues; + set->map[HCTX_TYPE_READ].queue_offset = 0; + } + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); + return 0; +} + static struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, .complete = nvme_complete_rq, @@ -2059,6 +2092,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = { .exit_request = nvme_tcp_exit_request, .init_hctx = nvme_tcp_init_hctx, .timeout = nvme_tcp_timeout, + .map_queues = nvme_tcp_map_queues, }; static struct blk_mq_ops nvme_tcp_admin_mq_ops = { @@ -2113,7 +2147,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, INIT_LIST_HEAD(&ctrl->list); ctrl->ctrl.opts = opts; - ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1; ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; @@ -2155,7 +2189,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, goto out_free_ctrl; } - ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues), + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), GFP_KERNEL); if (!ctrl->queues) { ret = -ENOMEM; @@ -2206,7 +2240,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = { .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | - NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST, + NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | + NVMF_OPT_NR_WRITE_QUEUES, .create_ctrl = nvme_tcp_create_ctrl, }; -- cgit From b65bb777ef2237030f2802f2263ae9a0108f7acf Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 11 Dec 2018 23:38:58 -0800 Subject: nvme-rdma: support separate queue maps for read and write Allow NVMF_OPT_NR_WRITE_QUEUES to describe additional write queues.
In addition, implement .map_queues that will apply 2 queue maps for read and write queue sets. Note that with the separate queue map, HCTX_TYPE_READ will always use nr_io_queues and HCTX_TYPE_DEFAULT will use nr_write_queues. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 5057d5ab5aaa..ed726da77b5b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -645,6 +645,8 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) nr_io_queues = min_t(unsigned int, nr_io_queues, ibdev->num_comp_vectors); + nr_io_queues += min(opts->nr_write_queues, num_online_cpus()); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) return ret; @@ -714,6 +716,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->driver_data = ctrl; set->nr_hw_queues = nctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; + set->nr_maps = 2 /* default + read */; } ret = blk_mq_alloc_tag_set(set); @@ -1751,7 +1754,25 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = set->driver_data; - return blk_mq_rdma_map_queues(&set->map[0], ctrl->device->dev, 0); + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues; + if (ctrl->ctrl.opts->nr_write_queues) { + /* separate read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->ctrl.opts->nr_write_queues; + set->map[HCTX_TYPE_READ].queue_offset = + ctrl->ctrl.opts->nr_write_queues; + } else { + /* mixed read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->ctrl.opts->nr_io_queues; + set->map[HCTX_TYPE_READ].queue_offset = 0; + } + blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT], + ctrl->device->dev, 0); + blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ], + ctrl->device->dev, 0); + return 0; } static const struct blk_mq_ops nvme_rdma_mq_ops = { @@ -1906,7 +1927,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); - ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1; ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; @@ -1957,7 +1978,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = { .module = THIS_MODULE, .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | - NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, + NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | + NVMF_OPT_NR_WRITE_QUEUES, .create_ctrl = nvme_rdma_create_ctrl, }; -- cgit From d2f96f487f471b4c718324ecd1540c89d2be52ac Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:46 +0800 Subject: bcache: add comment for cache_set->fill_iter We have the following definition for the btree iterator: struct btree_iter { size_t size, used; #ifdef CONFIG_BCACHE_DEBUG struct btree_keys *b; #endif struct btree_iter_set { struct bkey *k, *end; } data[MAX_BSETS]; }; We can see that the length of the data[] field is the static MAX_BSETS, which is currently defined as 4. But a btree node on disk could have too many bsets for an iterator to fit on the stack - maybe far more than MAX_BSETS.
We have to dynamically allocate space to host more btree_iter_sets. bch_cache_set_alloc() will make sure the pool cache_set->fill_iter can allocate an iterator equipped with enough room to host (sb.bucket_size / sb.block_size) btree_iter_sets, which is more than the static MAX_BSETS. bch_btree_node_read_done() will use that pool to allocate one iterator, to host many bsets in one btree node. Add more comments around cache_set->fill_iter to make the code less confusing. Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 6 +++++- drivers/md/bcache/btree.c | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index b61b83bbcfff..96d2213f279e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -658,7 +658,11 @@ struct cache_set { /* * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - have to dynamically allocate them + * on the stack - have to dynamically allocate them. + * bch_cache_set_alloc() will make sure the pool can allocate iterators + * equipped with enough room that can host + * (sb.bucket_size / sb.block_size) + * btree_iter_sets, which is more than static MAX_BSETS. */ mempool_t fill_iter; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 3f4211b5cd33..23cb1dc7296b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -207,6 +207,11 @@ void bch_btree_node_read_done(struct btree *b) struct bset *i = btree_bset_first(b); struct btree_iter *iter; + /* + * c->fill_iter can allocate an iterator with more memory space + * than static MAX_BSETS. + * See the comment around cache_set->fill_iter. + */ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; -- cgit From ae17102316550b4b230a283febe31b2a9ff30084 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:47 +0800 Subject: bcache: do not check if debug dentry is ERR or NULL explicitly on remove debugfs_remove and debugfs_remove_recursive will check if the dentry pointer is NULL or ERR, and will do nothing in that case. Remove the check in cache_set_free and bch_debug_exit.
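The removed checks are redundant because the debugfs removal helpers carry the same guard internally; paraphrasing fs/debugfs (a sketch for context, not part of this patch):

	void debugfs_remove(struct dentry *dentry)
	{
		if (IS_ERR_OR_NULL(dentry))
			return;
		/* ... proceed with the actual removal ... */
	}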
Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/debug.c | 3 +-- drivers/md/bcache/super.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 8f448b9c96a1..8b123be05254 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -249,8 +249,7 @@ void bch_debug_init_cache_set(struct cache_set *c) void bch_debug_exit(void) { - if (!IS_ERR_OR_NULL(bcache_debug)) - debugfs_remove_recursive(bcache_debug); + debugfs_remove_recursive(bcache_debug); } void __init bch_debug_init(void) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7bbd670a5a84..5b59d44656c0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1510,8 +1510,7 @@ static void cache_set_free(struct closure *cl) struct cache *ca; unsigned int i; - if (!IS_ERR_OR_NULL(c->debug)) - debugfs_remove(c->debug); + debugfs_remove(c->debug); bch_open_buckets_free(c); bch_btree_cache_free(c); -- cgit From 3db4d0783eaf2a2537f6b83679ad57cba0ad0481 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:48 +0800 Subject: bcache: update comment for bch_data_insert commit 220bb38c21b8 ("bcache: Break up struct search") introduced changes to struct search and s->iop. bypass/bio are fields of struct data_insert_op now. Update the comment. Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 3bf35914bb57..15070412a32e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -311,11 +311,11 @@ err: * data is written it calls bch_journal, and after the keys have been added to * the next journal write they're inserted into the btree. * - * It inserts the data in s->cache_bio; bi_sector is used for the key offset, + * It inserts the data in op->bio; bi_sector is used for the key offset, * and op->inode is used for the key inode. * - * If s->bypass is true, instead of inserting the data it invalidates the - * region of the cache represented by s->cache_bio and op->inode. + * If op->bypass is true, instead of inserting the data it invalidates the + * region of the cache represented by op->bio and op->inode. */ void bch_data_insert(struct closure *cl) { -- cgit From 4e361e020e7220fa8fc812acdca4d08814633f49 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:49 +0800 Subject: bcache: update comment in sysfs.c We have struct cached_dev allocated by kzalloc in register_bcache(), which initializes all the fields of cached_dev with 0s. And commit ce4c3e19e520 ("bcache: Replace bch_read_string_list() by __sysfs_match_string()") has removed the string "default". Update the comment.
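For context, since commit ce4c3e19e520 these NULL-terminated arrays are matched with __sysfs_match_string(), so index 0 (the kzalloc'd default) is "writethrough". A rough sketch of the store path (simplified from the sysfs code; treat details as assumptions):

	int v = __sysfs_match_string(bch_cache_modes, -1, buf);
	if (v < 0)
		return v;	/* no matching mode string */
	SET_BDEV_CACHE_MODE(&dc->sb, v);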
Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 26f035a0c5b9..d2e5c9892d4d 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -16,7 +16,7 @@ #include #include -/* Default is -1; we skip past it for struct cached_dev's cache mode */ +/* Default is 0 ("writethrough") */ static const char * const bch_cache_modes[] = { "writethrough", "writeback", @@ -25,7 +25,7 @@ static const char * const bch_cache_modes[] = { NULL }; -/* Default is -1; we skip past it for stop_when_cache_set_failed */ +/* Default is 0 ("auto") */ static const char * const bch_stop_on_failure_modes[] = { "auto", "always", -- cgit From 79b791466e525c98f6aeee9acf5726e7b27f4231 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:50 +0800 Subject: bcache: do not mark writeback_running too early A fresh backing device is not attached to any cache_set, and has no writeback kthread created until first attached to some cache_set. But bch_cached_dev_writeback_init runs " dc->writeback_running = true; WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); " for any newly formatted backing devices. For a fresh standalone backing device, we can get something like the following even if no writeback kthread was created: ------------------------ /sys/block/bcache0/bcache# cat writeback_running 1 /sys/block/bcache0/bcache# cat writeback_rate_debug rate: 512.0k/sec dirty: 0.0k target: 0.0k proportional: 0.0k integral: 0.0k change: 0.0k/sec next io: -15427384ms The non-zero fields are misleading, as no writeback kthread is alive yet. Set dc->writeback_running to false, as no writeback thread is created in bch_cached_dev_writeback_init(). The writeback thread is created and woken up in bch_cached_dev_writeback_start(). Set dc->writeback_running to true before bch_writeback_queue() is called, as the writeback thread checks whether dc->writeback_running is true before writing back dirty data, and hangs if it is false.
After the change, we can get the following output for a fresh standalone backing device: ----------------------- /sys/block/bcache0/bcache$ cat writeback_running 0 /sys/block/bcache0/bcache# cat writeback_rate_debug rate: 0.0k/sec dirty: 0.0k target: 0.0k proportional: 0.0k integral: 0.0k change: 0.0k/sec next io: 0ms v1 -> v2: Set dc->writeback_running before bch_writeback_queue() called, Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 08c3a9f9676c..1696b212ec4e 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -777,7 +777,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; - dc->writeback_running = true; + dc->writeback_running = false; dc->writeback_percent = 10; dc->writeback_delay = 30; atomic_long_set(&dc->writeback_rate.rate, 1024); @@ -805,6 +805,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) cached_dev_put(dc); return PTR_ERR(dc->writeback_thread); } + dc->writeback_running = true; WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); schedule_delayed_work(&dc->writeback_rate_update, -- cgit From f383ae300c4b6466824fd5c8cbd0a6c4ed2b53d3 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:51 +0800 Subject: bcache: cannot set writeback_running via sysfs if no writeback kthread created "echo 1 > writeback_running" marks writeback_running even if no writeback kthread created as "d_strtoul(writeback_running)" will simply set dc-> writeback_running without checking the existence of dc->writeback_thread. Add check for setting writeback_running via sysfs: if no writeback kthread available, reject setting to 1. v2 -> v3: * Make message on wrong assignment more clear. * Print name of bcache device instead of name of backing device. Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d2e5c9892d4d..9d5fe12f0c9c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -384,8 +384,25 @@ STORE(bch_cached_dev) mutex_lock(&bch_register_lock); size = __cached_dev_store(kobj, attr, buf, size); - if (attr == &sysfs_writeback_running) - bch_writeback_queue(dc); + if (attr == &sysfs_writeback_running) { + /* dc->writeback_running changed in __cached_dev_store() */ + if (IS_ERR_OR_NULL(dc->writeback_thread)) { + /* + * reject setting it to 1 via sysfs if writeback + * kthread is not created yet. + */ + if (dc->writeback_running) { + dc->writeback_running = false; + pr_err("%s: failed to run non-existent writeback thread", + dc->disk.disk->disk_name); + } + } else + /* + * writeback kthread will check if dc->writeback_running + * is true or false. 
+ */ + bch_writeback_queue(dc); + } if (attr == &sysfs_writeback_percent) if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -- cgit From cb07ad63682ffcce10e9beaed7828a26780e3df9 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Dec 2018 22:53:52 +0800 Subject: bcache: introduce force_wake_up_gc() The garbage collection thread only starts to work when c->sectors_to_gc is a negative value; otherwise nothing happens even if the gc thread is woken up by wake_up_gc(). force_wake_up_gc() sets c->sectors_to_gc to -1 before calling wake_up_gc(), so the gc thread has a chance to run, provided no one else sets c->sectors_to_gc to a positive value before gc_should_run(). This routine can be called wherever the gc thread needs to be woken up and forced to run. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.h | 18 ++++++++++++++++++ drivers/md/bcache/sysfs.c | 17 ++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index a68d6c55783b..d1c72ef64edf 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -266,6 +266,24 @@ static inline void wake_up_gc(struct cache_set *c) wake_up(&c->gc_wait); } +static inline void force_wake_up_gc(struct cache_set *c) +{ + /* + * Garbage collection thread only works when sectors_to_gc < 0, + * calling wake_up_gc() won't start gc thread if sectors_to_gc is + * not a negative value. + * Therefore sectors_to_gc is set to -1 here, before waking up + * gc thread by calling wake_up_gc(). Then gc_should_run() will + * give a chance to permit gc thread to run. "Give a chance" means + * before going into gc_should_run(), there is still possibility + * that c->sectors_to_gc being set to other positive value. So + * this routine won't 100% make sure gc thread will be woken up + * to run. + */ + atomic_set(&c->sectors_to_gc, -1); + wake_up_gc(c); +} + #define MAP_DONE 0 #define MAP_CONTINUE 1 diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 9d5fe12f0c9c..c09748497cdc 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -742,21 +742,8 @@ STORE(__bch_cache_set) bch_cache_accounting_clear(&c->accounting); } - if (attr == &sysfs_trigger_gc) { - /* - * Garbage collection thread only works when sectors_to_gc < 0, - * when users write to sysfs entry trigger_gc, most of time - * they want to forcibly triger gargage collection. Here -1 is - * set to c->sectors_to_gc, to make gc_should_run() give a - * chance to permit gc thread to run. "give a chance" means - * before going into gc_should_run(), there is still chance - * that c->sectors_to_gc being set to other positive value. So - * writing sysfs entry trigger_gc won't always make sure gc - * thread takes effect. - */ - atomic_set(&c->sectors_to_gc, -1); - wake_up_gc(c); - } + if (attr == &sysfs_trigger_gc) + force_wake_up_gc(c); if (attr == &sysfs_prune_cache) { struct shrink_control sc; -- cgit From 7a671d8ef821bf5743fdff17fae0600648345b03 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Dec 2018 22:53:53 +0800 Subject: bcache: option to automatically run gc thread after writeback The option gc_after_writeback is disabled by default, because garbage collection will discard SSD data, which drops cached data. Echoing 1 into /sys/fs/bcache/<UUID>/internal/gc_after_writeback enables this option, which wakes up the gc thread when writeback completes and all cached data is clean. This option is helpful for people who care more about write performance.
Under a heavy write workload, all cached data becomes clean only when the writeback thread cleans all cached data during I/O idle time. In such a situation, a subsequent gc run may help shrink the bcache B+ tree and discard more clean data, which may be helpful for future write requests. If you are not sure whether this is helpful for your own workload, please leave it disabled (the default). Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 14 ++++++++++++++ drivers/md/bcache/sysfs.c | 9 +++++++++ drivers/md/bcache/writeback.c | 27 +++++++++++++++++++++++++++ drivers/md/bcache/writeback.h | 2 ++ 4 files changed, 52 insertions(+) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 96d2213f279e..fdf75352e16a 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -626,6 +626,20 @@ struct cache_set { /* Where in the btree gc currently is */ struct bkey gc_done; + /* + * For automatic garbage collection after writeback completes, this + * variable is used as bit fields: + * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback + * - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback + * This is an optimization for write requests that follow writeback + * completion; the trade-off is that read hit rate may drop because + * clean data in the cache is discarded. Unless the user explicitly + * sets it via sysfs, it won't be enabled. + */ +#define BCH_ENABLE_AUTO_GC 1 +#define BCH_DO_AUTO_GC 2 + uint8_t gc_after_writeback; + /* * The allocation code needs gc_mark in struct bucket to be correct, but * it's not while a gc is in progress. Protected by bucket_lock. diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c09748497cdc..621186b4240f 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -128,6 +128,7 @@ rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); +rw_attribute(gc_after_writeback); rw_attribute(size); static ssize_t bch_snprint_string_list(char *buf, @@ -693,6 +694,7 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); sysfs_printf(io_disable, "%i", test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -793,6 +795,12 @@ STORE(__bch_cache_set) sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); + /* + * Writing gc_after_writeback here may overwrite an already-set + * BCH_DO_AUTO_GC; it doesn't matter because the flag will be + * set again at the next chance.
+ */ + sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1); return size; } @@ -873,6 +881,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, + &sysfs_gc_after_writeback, &sysfs_io_disable, NULL }; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 1696b212ec4e..73f0efac2b9f 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -17,6 +17,15 @@ #include #include +static void update_gc_after_writeback(struct cache_set *c) +{ + if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) || + c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD) + return; + + c->gc_after_writeback |= BCH_DO_AUTO_GC; +} + /* Rate limiting */ static uint64_t __calc_target_rate(struct cached_dev *dc) { @@ -191,6 +200,7 @@ static void update_writeback_rate(struct work_struct *work) if (!set_at_max_writeback_rate(c, dc)) { down_read(&dc->writeback_lock); __update_writeback_rate(dc); + update_gc_after_writeback(c); up_read(&dc->writeback_lock); } } @@ -689,6 +699,23 @@ static int bch_writeback_thread(void *arg) up_write(&dc->writeback_lock); break; } + + /* + * When dirty data rate is high (e.g. 50%+), there might + * be heavy bucket fragmentation after writeback + * finishes, which hurts following write performance. + * If users really care about write performance they + * may set BCH_ENABLE_AUTO_GC via sysfs; then when + * BCH_DO_AUTO_GC is set, the garbage collection thread + * will be woken up here. After moving gc, the shrunk + * btree and discarded free bucket SSD space may be + * helpful for following write requests. + */ + if (c->gc_after_writeback == + (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { + c->gc_after_writeback &= ~BCH_DO_AUTO_GC; + force_wake_up_gc(c); + } } up_write(&dc->writeback_lock); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index d2b9fdbc8994..ce63be98a39d 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -11,6 +11,8 @@ #define WRITEBACK_RATE_UPDATE_SECS_MAX 60 #define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 +#define BCH_AUTO_GC_DIRTY_THRESHOLD 50 + /* * 14 (16384ths) is chosen here as something that each backing device * should be a reasonable fraction of the share, and not to blow up -- cgit From 009673d02fa92acaa7ed0b1e1389610e4390ba49 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Dec 2018 22:53:54 +0800 Subject: bcache: add MODULE_DESCRIPTION information This patch moves MODULE_AUTHOR and MODULE_LICENSE to the end of super.c, and adds MODULE_DESCRIPTION("Bcache: a Linux block layer cache"). This is preparation for adding module parameters.
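Once applied, the new description shows up in the module metadata; for example (output abbreviated and shown only as an illustration):

  $ modinfo bcache | grep -E '^(description|author|license)'
  description:    Bcache: a Linux block layer cache
  author:         Kent Overstreet
  license:        GPL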
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 5b59d44656c0..61d3b63fa617 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -25,9 +25,6 @@ #include #include -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kent Overstreet "); - static const char bcache_magic[] = { 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81 @@ -2469,3 +2466,7 @@ err: module_exit(bcache_exit); module_init(bcache_init); + +MODULE_DESCRIPTION("Bcache: a Linux block layer cache"); +MODULE_AUTHOR("Kent Overstreet "); +MODULE_LICENSE("GPL"); -- cgit From 9aaf51654672b16566c5fe787da3ca41ebf6d297 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Dec 2018 22:53:55 +0800 Subject: bcache: make cutoff_writeback and cutoff_writeback_sync tunable Currently the cutoff writeback and cutoff writeback sync thresholds are defined by CUTOFF_WRITEBACK (40) and CUTOFF_WRITEBACK_SYNC (70) as static values. Most of the time they work fine, but when people want to do research on bcache writeback mode performance tuning, there is no way to modify the soft and hard cutoff writeback values. This patch introduces two module parameters, bch_cutoff_writeback_sync and bch_cutoff_writeback, which permit people to tune the values when loading bcache.ko. If they are not specified at module load time, the current values CUTOFF_WRITEBACK_SYNC and CUTOFF_WRITEBACK are used as defaults and nothing changes. When people want to tune these two values: - cutoff_writeback can be set in range [1, 70] - cutoff_writeback_sync can be set in range [1, 90] - cutoff_writeback always <= cutoff_writeback_sync The default values are strongly recommended for most users and most workloads. But people who want to take their own risk and do research on new writeback cutoff tuning for their own workload can now do so.
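As an illustration (the parameter names come from the module_param() calls in the diff below; the values are arbitrary examples and <UUID> is a placeholder):

  $ modprobe bcache bch_cutoff_writeback=30 bch_cutoff_writeback_sync=80
  $ cat /sys/fs/bcache/<UUID>/internal/cutoff_writeback
  30

Out-of-range values are not rejected but capped by check_module_parameters() at load time.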
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 40 ++++++++++++++++++++++++++++++++++++++++ drivers/md/bcache/sysfs.c | 7 +++++++ drivers/md/bcache/writeback.h | 10 ++++++++-- 3 files changed, 55 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 61d3b63fa617..4dee119c3664 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -25,6 +25,9 @@ #include #include +unsigned int bch_cutoff_writeback; +unsigned int bch_cutoff_writeback_sync; + static const char bcache_magic[] = { 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81 @@ -2420,6 +2423,32 @@ static void bcache_exit(void) mutex_destroy(&bch_register_lock); } +/* Check and fixup module parameters */ +static void check_module_parameters(void) +{ + if (bch_cutoff_writeback_sync == 0) + bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC; + else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) { + pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u", + bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX); + bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX; + } + + if (bch_cutoff_writeback == 0) + bch_cutoff_writeback = CUTOFF_WRITEBACK; + else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) { + pr_warn("set bch_cutoff_writeback (%u) to max value %u", + bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX); + bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX; + } + + if (bch_cutoff_writeback > bch_cutoff_writeback_sync) { + pr_warn("set bch_cutoff_writeback (%u) to %u", + bch_cutoff_writeback, bch_cutoff_writeback_sync); + bch_cutoff_writeback = bch_cutoff_writeback_sync; + } +} + static int __init bcache_init(void) { static const struct attribute *files[] = { @@ -2428,6 +2457,8 @@ static int __init bcache_init(void) NULL }; + check_module_parameters(); + mutex_init(&bch_register_lock); init_waitqueue_head(&unregister_wait); register_reboot_notifier(&reboot); @@ -2464,9 +2495,18 @@ err: return -ENOMEM; } +/* + * Module hooks + */ module_exit(bcache_exit); module_init(bcache_init); +module_param(bch_cutoff_writeback, uint, 0); +MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback"); + +module_param(bch_cutoff_writeback_sync, uint, 0); +MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback"); + MODULE_DESCRIPTION("Bcache: a Linux block layer cache"); MODULE_AUTHOR("Kent Overstreet "); MODULE_LICENSE("GPL"); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 621186b4240f..482b128b3e9d 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -88,6 +88,8 @@ read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); read_attribute(io_errors); read_attribute(congested); +read_attribute(cutoff_writeback); +read_attribute(cutoff_writeback_sync); rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us); @@ -686,6 +688,9 @@ SHOW(__bch_cache_set) sysfs_print(congested_write_threshold_us, c->congested_write_threshold_us); + sysfs_print(cutoff_writeback, bch_cutoff_writeback); + sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync); + sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); sysfs_printf(verify, "%i", c->verify); sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); @@ -883,6 +888,8 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_copy_gc_enabled, &sysfs_gc_after_writeback, 
&sysfs_io_disable, + &sysfs_cutoff_writeback, + &sysfs_cutoff_writeback_sync, NULL }; KTYPE(bch_cache_set_internal); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index ce63be98a39d..6a743d3bb338 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -5,6 +5,9 @@ #define CUTOFF_WRITEBACK 40 #define CUTOFF_WRITEBACK_SYNC 70 +#define CUTOFF_WRITEBACK_MAX 70 +#define CUTOFF_WRITEBACK_SYNC_MAX 90 + #define MAX_WRITEBACKS_IN_PASS 5 #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ @@ -55,6 +58,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, } } +extern unsigned int bch_cutoff_writeback; +extern unsigned int bch_cutoff_writeback_sync; + static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned int cache_mode, bool would_skip) { @@ -62,7 +68,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (cache_mode != CACHE_MODE_WRITEBACK || test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - in_use > CUTOFF_WRITEBACK_SYNC) + in_use > bch_cutoff_writeback_sync) return false; if (dc->partial_stripes_expensive && @@ -75,7 +81,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, return (op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO) || - in_use <= CUTOFF_WRITEBACK); + in_use <= bch_cutoff_writeback); } static inline void bch_writeback_queue(struct cached_dev *dc) -- cgit From cc38ca7ed54a1df04ae9f23614b348ebfc918029 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Dec 2018 22:53:56 +0800 Subject: bcache: set writeback_percent in a flexible range Because CUTOFF_WRITEBACK is defined as 40, before the change to dynamic cutoff writeback values writeback_percent was limited to [0, CUTOFF_WRITEBACK]. Any value larger than CUTOFF_WRITEBACK was fixed up to 40. Now the cutoff writeback limit is the dynamic value bch_cutoff_writeback, so writeback_percent can cover the more flexible range [0, bch_cutoff_writeback]. The flexibility is that the range can be expanded to something larger or smaller than [0, 40], depending on how bch_cutoff_writeback is specified. The default value is still strongly recommended for most users and most workloads, but people who want to do research on bcache writeback performance tuning now have the chance to specify a more flexible writeback_percent in the range [0, 70]. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 482b128b3e9d..557a8a3270a1 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -267,7 +267,8 @@ STORE(__cached_dev) d_strtoul(writeback_running); d_strtoul(writeback_delay); - sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); + sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, + 0, bch_cutoff_writeback); if (attr == &sysfs_writeback_rate) { ssize_t ret; -- cgit From e78bd0d26f7396c7bd17be60d9686be8ef8e453a Mon Sep 17 00:00:00 2001 From: Guoju Fang Date: Thu, 13 Dec 2018 22:53:57 +0800 Subject: bcache: print number of keys in trace_bcache_journal_write Sometimes journal flushes can be very frequent, so it's useful to dump the number of keys every time the journal is written.
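The diff below is limited to drivers/, but the tracepoint definition itself lives in include/trace/events/bcache.h and has to gain the extra argument as well. A sketch of what the updated event could look like (the field layout is illustrative, not a verbatim copy of the header change):

TRACE_EVENT(bcache_journal_write,
	TP_PROTO(struct bio *bio, u32 keys),
	TP_ARGS(bio, keys),
	TP_STRUCT__entry(
		__field(dev_t,		dev)
		__field(sector_t,	sector)
		__field(unsigned int,	nr_sector)
		__field(u32,		nr_keys)
	),
	TP_fast_assign(
		__entry->dev		= bio_dev(bio);
		__entry->sector		= bio->bi_iter.bi_sector;
		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
		__entry->nr_keys	= keys;	/* the newly exported information */
	),
	TP_printk("%d,%d sector %llu + %u keys %u",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  (unsigned long long)__entry->sector,
		  __entry->nr_sector, __entry->nr_keys)
);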
Signed-off-by: Guoju Fang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 522c7426f3a0..b2fd412715b1 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -663,7 +663,7 @@ static void journal_write_unlocked(struct closure *cl) REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); bch_bio_map(bio, w->data); - trace_bcache_journal_write(bio); + trace_bcache_journal_write(bio, w->data->keys); bio_list_add(&list, bio); SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); -- cgit From 092ff0520070fad8407b196f3bb6156ce77a6f98 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Dec 2018 12:34:07 -0800 Subject: nvme: fix kernel paging oops Free the controller discard_page correctly. Fixes: cb5b7262b011 ("nvme: provide fallback for discard alloc failure") Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4d8ee7186268..136512e8ba58 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3596,7 +3596,7 @@ static void nvme_free_ctrl(struct device *dev) ida_simple_remove(&nvme_instance_ida, ctrl->instance); kfree(ctrl->effects); nvme_mpath_uninit(ctrl); - kfree(ctrl->discard_page); + __free_page(ctrl->discard_page); if (subsys) { mutex_lock(&subsys->lock); -- cgit From e7cc005fef03d2b13246d800e497418d570ad6da Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sun, 16 Dec 2018 14:08:18 +0800 Subject: aoe: add __exit annotation Add the __exit annotation to the cleanup helper, which is only called once in the module. Signed-off-by: Chengguang Xu Signed-off-by: Jens Axboe --- drivers/block/aoe/aoemain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c index 251482066977..1e4e2971171c 100644 --- a/drivers/block/aoe/aoemain.c +++ b/drivers/block/aoe/aoemain.c @@ -24,7 +24,7 @@ static void discover_timer(struct timer_list *t) aoecmd_cfg(0xffff, 0xff); } -static void +static void __exit aoe_exit(void) { del_timer_sync(&timer); -- cgit From 38a3499f6d0cb15bd673e517b0656807e22bfd24 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sun, 16 Dec 2018 17:35:00 +0800 Subject: block: loop: check error using IS_ERR instead of IS_ERR_OR_NULL in loop_add() blk_mq_init_queue() will not return a NULL pointer to its caller, so it's better to replace IS_ERR_OR_NULL with IS_ERR in loop_add(). If things change in the future such that loop_add() must check for a NULL pointer, it should return -ENOMEM as the error code instead of PTR_ERR(NULL).
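A minimal sketch of the pitfall that last sentence describes (illustrative code, not from the patch): PTR_ERR(NULL) evaluates to 0, so an error path entered through IS_ERR_OR_NULL() would silently report success:

	q = blk_mq_init_queue(&set);
	if (IS_ERR_OR_NULL(q)) {
		err = PTR_ERR(q);	/* PTR_ERR(NULL) == 0: caller sees success */
		goto out_cleanup;
	}

which is why a hypothetical NULL check would have to return -ENOMEM explicitly.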
Signed-off-by: Chengguang Xu Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 0770004616de..0939f36548c9 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1915,7 +1915,7 @@ static int loop_add(struct loop_device **l, int i) goto out_free_idr; lo->lo_queue = blk_mq_init_queue(&lo->tag_set); - if (IS_ERR_OR_NULL(lo->lo_queue)) { + if (IS_ERR(lo->lo_queue)) { err = PTR_ERR(lo->lo_queue); goto out_cleanup_tags; } -- cgit From 7e849dd9cf37bc52aff9b5236377c405040c959c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Dec 2018 12:16:27 +0100 Subject: nvme-pci: don't share queue maps Now that the block layer checks if a queue map has any queues inside it, there is no more reason to duplicate the maps for the non-default types. Reviewed-by: Ming Lei Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index fb9d8270f32c..698b350b38cf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -496,11 +496,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) map->nr_queues = dev->io_queues[i]; if (!map->nr_queues) { BUG_ON(i == HCTX_TYPE_DEFAULT); - - /* shared set, resuse read set parameters */ - map->nr_queues = dev->io_queues[HCTX_TYPE_DEFAULT]; - qoff = 0; - offset = queue_irq_offset(dev); + continue; } /* -- cgit From 3c94d83cb352627f221d971b05f163c17527de74 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 Dec 2018 21:11:17 -0700 Subject: blk-mq: change blk_mq_queue_busy() to blk_mq_queue_inflight() There's a single user of this function, dm, and dm just wants to check if IO is inflight, not that it's just allocated. This fixes a hang with srp/002 in blktests with dm, where it tries to suspend but waits for inflight IO to finish first. As it checks for just-allocated requests, this fails. Tested-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c414d40d645d..dddbca63e140 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -663,7 +663,7 @@ static bool md_in_flight_bios(struct mapped_device *md) static bool md_in_flight(struct mapped_device *md) { if (queue_is_mq(md->queue)) - return blk_mq_queue_busy(md->queue); + return blk_mq_queue_inflight(md->queue); else return md_in_flight_bios(md); } -- cgit From 66c6afbd7321983be4c4160aff4d79e52af84a60 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 14 Dec 2018 18:31:21 +0000 Subject: nvmet: fix comparison of a u16 with -1 Currently the u16 req->error_loc is being compared to -1, which will always be false: the u16 is promoted to int for the comparison while -1 stays (int)-1, so the test can never match 0xffff. Fix this by casting -1 to u16.
Detected by clang: warning: result of comparison of constant -1 with expression of type 'u16' (aka 'unsigned short') is always false [-Wtautological-constant-out-of-range-compare] Fixes: 76574f37bf4c ("nvmet: add interface to update error-log page") Signed-off-by: Colin Ian King Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index cc81d0231587..b9c219c931eb 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -663,7 +663,7 @@ static void nvmet_set_error(struct nvmet_req *req, u16 status) req->rsp->status = cpu_to_le16(status << 1); - if (!ctrl || req->error_loc == -1) + if (!ctrl || req->error_loc == (u16)-1) return; spin_lock_irqsave(&ctrl->error_lock, flags); -- cgit From 5698b805fbf09a5dfa60829f5a179c0568a8dc3b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 17 Dec 2018 18:35:29 -0800 Subject: nvmet: use a macro for default error location This patch defines a new macro NVMET_NO_ERROR_LOC to represent the default error location value in the nvme-error-log-page. This is a pure cleanup patch and it does not change any functionality. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 4 ++-- drivers/nvme/target/nvmet.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index b9c219c931eb..88d260f31835 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -663,7 +663,7 @@ static void nvmet_set_error(struct nvmet_req *req, u16 status) req->rsp->status = cpu_to_le16(status << 1); - if (!ctrl || req->error_loc == (u16)-1) + if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC) return; spin_lock_irqsave(&ctrl->error_lock, flags); @@ -849,7 +849,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->rsp->status = 0; req->rsp->sq_head = 0; req->ns = NULL; - req->error_loc = -1; + req->error_loc = NVMET_NO_ERROR_LOC; req->error_slba = 0; /* no support for fused commands yet */ diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 3b5f0bcaf3e8..3e4719fdba85 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -30,6 +30,7 @@ #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 +#define NVMET_NO_ERROR_LOC ((u16)-1) /* * Supported optional AENs: -- cgit From ed92ad37e88555864f1f830db4037b9535b3392c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Dec 2018 14:06:59 +0100 Subject: nvme-pci: only set nr_maps to 2 if poll queues are supported The block layer now enables polling support on a queue if nr_maps includes the poll map, so we should only set that if we actually support poll queues. 
Fixes: 6544d229bf ("block: enable polling by default if a poll map is initalized") Signed-off-by: Christoph Hellwig Reviewed-by: Jens Axboe Reviewed-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 698b350b38cf..a3e0b9378e54 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2289,7 +2289,9 @@ static int nvme_dev_add(struct nvme_dev *dev) if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.nr_maps = 2; /* default + read */ + if (dev->io_queues[HCTX_TYPE_POLL]) + dev->tagset.nr_maps++; - dev->tagset.nr_maps = HCTX_MAX_TYPES; dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.numa_node = dev_to_node(dev->dev); -- cgit From 91a509f8b7a8a518723e1755b876b46c537baaef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Dec 2018 09:48:00 +0100 Subject: nvme-pci: refactor nvme_poll_irqdisable to make sparse happy By duplicating the nvme_process_cq() call in both branches we keep sparse's lock context checking happy, so do it. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a3e0b9378e54..452b28130380 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1089,15 +1089,15 @@ static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) * using the CQ lock. For normal interrupt driven threads we have * to disable the interrupt to avoid racing with it. */ - if (nvmeq->cq_vector == -1) + if (nvmeq->cq_vector == -1) { spin_lock(&nvmeq->cq_poll_lock); - else - disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); - found = nvme_process_cq(nvmeq, &start, &end, tag); - if (nvmeq->cq_vector == -1) + found = nvme_process_cq(nvmeq, &start, &end, tag); spin_unlock(&nvmeq->cq_poll_lock); - else + } else { + disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + found = nvme_process_cq(nvmeq, &start, &end, tag); enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + } nvme_complete_cqes(nvmeq, start, end); return found; -- cgit From f4d10b5c85b5f118c6e55288afd053f5c8c5fc98 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Dec 2018 09:41:15 +0100 Subject: nvmet-tcp: fix endianness annotations Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg --- drivers/nvme/target/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index d31bec260160..44b37b202e39 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -758,7 +758,7 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) if (icreq->maxr2t != 0) { pr_err("queue %d: unsupported maxr2t %d\n", queue->idx, - le16_to_cpu(icreq->maxr2t) + 1); + le32_to_cpu(icreq->maxr2t) + 1); return -EPROTO; } @@ -776,7 +776,7 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) icresp->hdr.pdo = 0; icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); - icresp->maxdata = 0xffff; /* FIXME: support r2t */ + icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */ icresp->cpda = 0; if (queue->hdr_digest) icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; -- cgit From a7273d40232fbc1f89212b973d588aeaa61161b2 Mon Sep 17 00:00:00 2001 From:
Christoph Hellwig Date: Thu, 13 Dec 2018 09:46:59 +0100 Subject: nvme-tcp: fix endianness annotations Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 51826479a41e..83417b4473b3 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -35,7 +35,7 @@ struct nvme_tcp_request { u32 pdu_sent; u16 ttag; struct list_head entry; - u32 ddgst; + __le32 ddgst; struct bio *curr_bio; struct iov_iter iter; @@ -272,7 +272,8 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) return req; } -static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, u32 *dgst) +static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, + __le32 *dgst) { ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); crypto_ahash_final(hash); @@ -817,7 +818,7 @@ static void nvme_tcp_fail_request(struct nvme_tcp_request *req) union nvme_result res = {}; nvme_end_request(blk_mq_rq_from_pdu(req), - NVME_SC_DATA_XFER_ERROR, res); + cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res); } static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) @@ -1960,7 +1961,7 @@ nvme_tcp_timeout(struct request *rq, bool reserved) union nvme_result res = {}; nvme_req(rq)->flags |= NVME_REQ_CANCELLED; - nvme_end_request(rq, NVME_SC_ABORT_REQ, res); + nvme_end_request(rq, cpu_to_le16(NVME_SC_ABORT_REQ), res); return BLK_EH_DONE; } -- cgit From 56a77d26d6316a3936497236c7e3a6a98fad950c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 14 Dec 2018 11:42:43 +0000 Subject: nvme-tcp: fix spelling mistake "attepmpt" -> "attempt" There is a spelling mistake in a dev_info message, fix it. Signed-off-by: Colin Ian King Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 83417b4473b3..f49cbe9aa19f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1790,7 +1790,7 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work) if (nvme_tcp_setup_ctrl(ctrl, false)) goto requeue; - dev_info(ctrl->device, "Successfully reconnected (%d attepmpt)\n", + dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n", ctrl->nr_reconnects); ctrl->nr_reconnects = 0; -- cgit From 6287b51c77e6d8f05f772931cf51d80e81651a9f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 14 Dec 2018 11:06:07 -0800 Subject: nvme-core: optionally poll sync commands Pass a poll bool to indicate that the caller needs the command to be polled for. This prepares us for polling support in nvmf, since connect is an I/O that will be queued and has to be polled in order to complete. If poll is passed, we call nvme_execute_rq_polled(), which sends the request and polls for its completion.
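For reference, a caller requesting polled completion only has to set the new final argument; a sketch modeled on the fabrics connect path that later patches in this series convert (the arguments mirror that path):

	ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res,
				     data, sizeof(*data), 0, qid, 1,
				     BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT,
				     true /* poll */);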
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 38 +++++++++++++++++++++++++++++++++----- drivers/nvme/host/fabrics.c | 10 +++++----- drivers/nvme/host/nvme.h | 2 +- 3 files changed, 39 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 136512e8ba58..08f2c92602f4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -724,6 +724,31 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, } EXPORT_SYMBOL_GPL(nvme_setup_cmd); +static void nvme_end_sync_rq(struct request *rq, blk_status_t error) +{ + struct completion *waiting = rq->end_io_data; + + rq->end_io_data = NULL; + complete(waiting); +} + +static void nvme_execute_rq_polled(struct request_queue *q, + struct gendisk *bd_disk, struct request *rq, int at_head) +{ + DECLARE_COMPLETION_ONSTACK(wait); + + WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)); + + rq->cmd_flags |= REQ_HIPRI; + rq->end_io_data = &wait; + blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq); + + while (!completion_done(&wait)) { + blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true); + cond_resched(); + } +} + /* * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code @@ -731,7 +756,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, unsigned timeout, int qid, int at_head, - blk_mq_req_flags_t flags) + blk_mq_req_flags_t flags, bool poll) { struct request *req; int ret; @@ -748,7 +773,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, goto out; } - blk_execute_rq(req->q, NULL, req, at_head); + if (poll) + nvme_execute_rq_polled(req->q, NULL, req, at_head); + else + blk_execute_rq(req->q, NULL, req, at_head); if (result) *result = nvme_req(req)->result; if (nvme_req(req)->flags & NVME_REQ_CANCELLED) @@ -765,7 +793,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buffer, unsigned bufflen) { return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); @@ -1084,7 +1112,7 @@ static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword c.features.dword11 = cpu_to_le32(dword11); ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, - buffer, buflen, 0, NVME_QID_ANY, 0, 0); + buffer, buflen, 0, NVME_QID_ANY, 0, 0, false); if (ret >= 0 && result) *result = le32_to_cpu(res.u32); return ret; @@ -1727,7 +1755,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, cmd.common.cdw11 = cpu_to_le32(len); return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, - ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0); + ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false); } EXPORT_SYMBOL_GPL(nvme_sec_submit); #endif /* CONFIG_BLK_SED_OPAL */ diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 19ff0eae4582..d93a169f6403 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -159,7 +159,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) cmd.prop_get.offset = cpu_to_le32(off); ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); if (ret >= 0) *val = le64_to_cpu(res.u64); @@ -206,7 +206,7 
+206,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) cmd.prop_get.offset = cpu_to_le32(off); ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); if (ret >= 0) *val = le64_to_cpu(res.u64); @@ -252,7 +252,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) cmd.prop_set.value = cpu_to_le64(val); ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); if (unlikely(ret)) dev_err(ctrl->device, "Property Set error: %d, offset %#x\n", @@ -406,7 +406,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, data, sizeof(*data), 0, NVME_QID_ANY, 1, - BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false); if (ret) { nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), &cmd, data); @@ -468,7 +468,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res, data, sizeof(*data), 0, qid, 1, - BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false); if (ret) { nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), &cmd, data); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 39b52f4d9b24..2b36ac922596 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -447,7 +447,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, unsigned timeout, int qid, int at_head, - blk_mq_req_flags_t flags); + blk_mq_req_flags_t flags, bool poll); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); -- cgit From 26c682274e0a7d055e123499eac8ec39d0e04283 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 14 Dec 2018 11:06:08 -0800 Subject: nvme-fabrics: allow nvmf_connect_io_queue to poll Preparation for polling support for fabrics. Polling support means that our completion queues are not generating any interrupts, which means we need to poll for the nvmf I/O queue connect as well.
Reviewed-by: Steve Wise Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 4 ++-- drivers/nvme/host/fabrics.h | 2 +- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- drivers/nvme/target/loop.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index d93a169f6403..5ff14f46a8d9 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue); * > 0: NVMe error status code * < 0: Linux errno error code */ -int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll) { struct nvme_command cmd; struct nvmf_connect_data *data; @@ -468,7 +468,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res, data, sizeof(*data), 0, qid, 1, - BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false); + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, poll); if (ret) { nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), &cmd, data); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 81b8fd1c0c5d..cad70a9f976a 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -168,7 +168,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); -int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll); int nvmf_register_transport(struct nvmf_transport_ops *ops); void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b79e41938513..89accc76d71c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1975,7 +1975,7 @@ nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) (qsize / 5)); if (ret) break; - ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + ret = nvmf_connect_io_queue(&ctrl->ctrl, i, false); if (ret) break; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ed726da77b5b..b907ed43814f 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -598,7 +598,7 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) int ret; if (idx) - ret = nvmf_connect_io_queue(&ctrl->ctrl, idx); + ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, false); else ret = nvmf_connect_admin_queue(&ctrl->ctrl); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index f49cbe9aa19f..de174912445e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1394,7 +1394,7 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) int ret; if (idx) - ret = nvmf_connect_io_queue(nctrl, idx); + ret = nvmf_connect_io_queue(nctrl, idx, false); else ret = nvmf_connect_admin_queue(nctrl); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 9908082b32c4..4aac1b4a8112 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -345,7 +345,7 @@ static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl) int i, ret; for (i = 1; i < ctrl->ctrl.queue_count; i++) { - ret =
nvmf_connect_io_queue(&ctrl->ctrl, i); + ret = nvmf_connect_io_queue(&ctrl->ctrl, i, false); if (ret) return ret; set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); -- cgit From 89d43802b0e7298bc0b464bdb5fff864074e2218 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 14 Dec 2018 11:06:09 -0800 Subject: nvme-fabrics: allow user to pass in nr_poll_queues This argument will specify how many polling I/O queues to connect when creating the controller. These I/O queues will host I/O that is set with REQ_HIPRI. Reviewed-by: Steve Wise Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 13 +++++++++++++ drivers/nvme/host/fabrics.h | 3 +++ 2 files changed, 16 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 5ff14f46a8d9..b2ab213f43de 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -617,6 +617,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_HDR_DIGEST, "hdr_digest" }, { NVMF_OPT_DATA_DIGEST, "data_digest" }, { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, + { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, { NVMF_OPT_ERR, NULL } }; @@ -850,6 +851,18 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->nr_write_queues = token; break; + case NVMF_OPT_NR_POLL_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid nr_poll_queues %d\n", token); + ret = -EINVAL; + goto out; + } + opts->nr_poll_queues = token; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index cad70a9f976a..478343b73e38 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -62,6 +62,7 @@ enum { NVMF_OPT_HDR_DIGEST = 1 << 15, NVMF_OPT_DATA_DIGEST = 1 << 16, NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, + NVMF_OPT_NR_POLL_QUEUES = 1 << 18, }; /** @@ -93,6 +94,7 @@ enum { * @hdr_digest: generate/verify header digest (TCP) * @data_digest: generate/verify data digest (TCP) * @nr_write_queues: number of queues for write I/O + * @nr_poll_queues: number of queues for polling I/O */ struct nvmf_ctrl_options { unsigned mask; @@ -113,6 +115,7 @@ struct nvmf_ctrl_options { bool hdr_digest; bool data_digest; unsigned int nr_write_queues; + unsigned int nr_poll_queues; }; /* -- cgit From ff8519f9e91170cd24ed503aa77d144e71bd4d92 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 14 Dec 2018 11:06:10 -0800 Subject: nvme-rdma: implement polling queue map When nr_poll_queues is passed, set up additional queues with the cq polling context IB_POLL_DIRECT (no interrupts) and make sure to set QUEUE_FLAG_POLL on the connect_q. In addition, add a third queue mapping for polling queues. The nvmf connect on such a queue is polled for like all other requests, so make nvmf_connect_io_queue poll for polling queues.
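As a usage illustration (the address and NQN are placeholders, and this assumes an nvme-cli recent enough to expose the new fabrics option):

  # nvme connect -t rdma -a 192.168.1.10 -s 4420 \
        -n nqn.2018-12.io.example:testsubsys --nr-poll-queues=4

REQ_HIPRI I/O is then served by the HCTX_TYPE_POLL map set up below, whose CQs use IB_POLL_DIRECT and are reaped by ib_process_cq_direct() from nvme_rdma_poll().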
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 49 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b907ed43814f..0a2fd2949ad7 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -162,6 +162,13 @@ static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) return queue - queue->ctrl->queues; } +static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue) +{ + return nvme_rdma_queue_idx(queue) > + queue->ctrl->ctrl.opts->nr_io_queues + + queue->ctrl->ctrl.opts->nr_write_queues; +} + static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) { return queue->cmnd_capsule_len - sizeof(struct nvme_command); @@ -440,6 +447,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) const int send_wr_factor = 3; /* MR, SEND, INV */ const int cq_factor = send_wr_factor + 1; /* + RECV */ int comp_vector, idx = nvme_rdma_queue_idx(queue); + enum ib_poll_context poll_ctx; int ret; queue->device = nvme_rdma_find_get_device(queue->cm_id); @@ -456,10 +464,16 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) */ comp_vector = idx == 0 ? idx : idx - 1; + /* Polling queues need direct cq polling context */ + if (nvme_rdma_poll_queue(queue)) + poll_ctx = IB_POLL_DIRECT; + else + poll_ctx = IB_POLL_SOFTIRQ; + /* +1 for ib_stop_cq */ queue->ib_cq = ib_alloc_cq(ibdev, queue, cq_factor * queue->queue_size + 1, - comp_vector, IB_POLL_SOFTIRQ); + comp_vector, poll_ctx); if (IS_ERR(queue->ib_cq)) { ret = PTR_ERR(queue->ib_cq); goto out_put_dev; @@ -595,15 +609,17 @@ static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl) static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) { + struct nvme_rdma_queue *queue = &ctrl->queues[idx]; + bool poll = nvme_rdma_poll_queue(queue); int ret; if (idx) - ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, false); + ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll); else ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (!ret) - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags); + set_bit(NVME_RDMA_Q_LIVE, &queue->flags); else dev_info(ctrl->ctrl.device, "failed to connect queue: %d ret=%d\n", idx, ret); @@ -646,6 +662,7 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) ibdev->num_comp_vectors); nr_io_queues += min(opts->nr_write_queues, num_online_cpus()); + nr_io_queues += min(opts->nr_poll_queues, num_online_cpus()); ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) @@ -716,7 +733,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->driver_data = ctrl; set->nr_hw_queues = nctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; - set->nr_maps = 2 /* default + read */; + set->nr_maps = nctrl->opts->nr_poll_queues ? 
HCTX_MAX_TYPES : 2; } ret = blk_mq_alloc_tag_set(set); @@ -1742,6 +1759,13 @@ err: return BLK_STS_IOERR; } +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_rdma_queue *queue = hctx->driver_data; + + return ib_process_cq_direct(queue->ib_cq, -1); +} + static void nvme_rdma_complete_rq(struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); @@ -1772,6 +1796,17 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) ctrl->device->dev, 0); blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ], ctrl->device->dev, 0); + + if (ctrl->ctrl.opts->nr_poll_queues) { + set->map[HCTX_TYPE_POLL].nr_queues = + ctrl->ctrl.opts->nr_poll_queues; + set->map[HCTX_TYPE_POLL].queue_offset = + ctrl->ctrl.opts->nr_io_queues; + if (ctrl->ctrl.opts->nr_write_queues) + set->map[HCTX_TYPE_POLL].queue_offset += + ctrl->ctrl.opts->nr_write_queues; + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); + } return 0; } @@ -1783,6 +1818,7 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { .init_hctx = nvme_rdma_init_hctx, .timeout = nvme_rdma_timeout, .map_queues = nvme_rdma_map_queues, + .poll = nvme_rdma_poll, }; static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { @@ -1927,7 +1963,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); - ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1; + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + + opts->nr_poll_queues + 1; ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; @@ -1979,7 +2016,7 @@ static struct nvmf_transport_ops nvme_rdma_transport = { .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | - NVMF_OPT_NR_WRITE_QUEUES, + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES, .create_ctrl = nvme_rdma_create_ctrl, }; -- cgit From 604c01d567cb9ee7d19dc598272cb90ab6229a8a Mon Sep 17 00:00:00 2001 From: yupeng Date: Tue, 18 Dec 2018 17:59:53 +0100 Subject: nvme-pci: trace SQ status on completions Export the disk name, queue id, sq_head, sq_tail to a trace event in completion handling. 
Usage example: cd /sys/kernel/debug/tracing/events/nvme/nvme_sq echo 'disk=="nvme1n1"' > filter echo 1 > enable cat /sys/kernel/debug/tracing/trace_pipe Signed-off-by: yupeng Reviewed-by: Sagi Grimberg Reviewed-by: Keith Busch [hch: slight formatting tweaks, use standard nvme tracepoint conventions] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ drivers/nvme/host/trace.c | 3 +++ drivers/nvme/host/trace.h | 23 +++++++++++++++++++++++ 3 files changed, 28 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 452b28130380..5a0bf6a24d50 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -32,6 +32,7 @@ #include #include +#include "trace.h" #include "nvme.h" #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) @@ -1003,6 +1004,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) } req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); + trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); nvme_end_request(req, cqe->status, cqe->result); } diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 25b0e310f4a8..5566dda3237a 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -139,3 +139,6 @@ const char *nvme_trace_disk_name(struct trace_seq *p, char *name) return ret; } +EXPORT_SYMBOL_GPL(nvme_trace_disk_name); + +EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq); diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 1978deb6fcc7..3564120aa7b3 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -184,6 +184,29 @@ TRACE_EVENT(nvme_async_event, #undef aer_name +TRACE_EVENT(nvme_sq, + TP_PROTO(struct request *req, __le16 sq_head, int sq_tail), + TP_ARGS(req, sq_head, sq_tail), + TP_STRUCT__entry( + __field(int, ctrl_id) + __array(char, disk, DISK_NAME_LEN) + __field(int, qid) + __field(u16, sq_head) + __field(u16, sq_tail) + ), + TP_fast_assign( + __entry->ctrl_id = nvme_req(req)->ctrl->instance; + __assign_disk_name(__entry->disk, req->rq_disk); + __entry->qid = nvme_req_qid(req); + __entry->sq_head = le16_to_cpu(sq_head); + __entry->sq_tail = sq_tail; + ), + TP_printk("nvme%d: %sqid=%d, head=%u, tail=%u", + __entry->ctrl_id, __print_disk_name(__entry->disk), + __entry->qid, __entry->sq_head, __entry->sq_tail + ) +); + #endif /* _TRACE_NVME_H */ #undef TRACE_INCLUDE_PATH -- cgit From dbe3ece1287dafe4113c64ada3113c39f344c64a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Dec 2018 09:13:34 -0700 Subject: dm: don't reuse bio for flushes DM currently has a statically allocated bio that it uses to issue empty flushes. It doesn't submit this bio, it just uses it for maintaining state while setting up clones. Multiple users can access this bio at the same time. This wasn't previously an issue, even if it was a bit iffy, but with the blkg associations it can become one. We set up the blkg association, then clone bios and submit them, then remove the blkg association again. But since we can have multiple tasks doing this at the same time, against multiple blkgs, we can either lose references to a blkg or put it twice. The latter causes complaints on the percpu ref being <= 0 when released, and can cause use-after-free as well.
Ming reports that xfstest generic/475 triggers this: ------------[ cut here ]------------ percpu ref (blkg_release) <= 0 (0) after switching to atomic WARNING: CPU: 13 PID: 0 at lib/percpu-refcount.c:155 percpu_ref_switch_to_atomic_rcu+0x2c9/0x4a0 Switch to just using an on-stack bio for this, and get rid of the embedded bio. Fixes: 5cdf2e3fea5e ("blkcg: associate blkg when associating a device") Reported-by: Ming Lei Tested-by: Ming Lei Reviewed-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm-core.h | 3 --- drivers/md/dm.c | 35 +++++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 6fe883fac471..95c6d86ab5e8 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -106,9 +106,6 @@ struct mapped_device { struct block_device *bdev; - /* zero-length flush that will be cloned and submitted to targets */ - struct bio flush_bio; - struct dm_stats stats; /* for blk-mq request-based DM support */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dddbca63e140..f588a6a83d80 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1420,11 +1420,11 @@ static int __send_empty_flush(struct clone_info *ci) struct dm_target *ti; /* - * Empty flush uses a statically initialized bio, &md->flush_bio, as - * the base for cloning. However, blkg association requires that a - * bdev is associated with a gendisk, which doesn't happen until the - * bdev is opened. So, blkg association is done at issue time of the - * flush rather than when the device is created in alloc_dev(). + * Empty flush uses a statically initialized bio, as the base for + * cloning. However, blkg association requires that a bdev is + * associated with a gendisk, which doesn't happen until the bdev is + * opened. So, blkg association is done at issue time of the flush + * rather than when the device is created in alloc_dev(). */ bio_set_dev(ci->bio, ci->io->md->bdev); @@ -1609,7 +1609,16 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.io->md->flush_bio; + struct bio flush_bio; + + /* + * Use an on-stack bio for this, it's safe since we don't + * need to reference it after submit. It's just used as + * the basis for the clone(s). + */ + bio_init(&flush_bio, NULL, 0); + flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ @@ -1665,7 +1674,16 @@ static blk_qc_t __process_bio(struct mapped_device *md, init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.io->md->flush_bio; + struct bio flush_bio; + + /* + * Use an on-stack bio for this, it's safe since we don't + * need to reference it after submit. It's just used as + * the basis for the clone(s). + */ + bio_init(&flush_bio, NULL, 0); + flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ @@ -1949,9 +1967,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->bdev) goto bad; - bio_init(&md->flush_bio, NULL, 0); - md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - dm_stats_init(&md->stats); /* Populate the mapping, nobody knows we exist yet */ -- cgit