From b60503ba432b16fc84442a84e29a7aad2c0c363d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 12:50:14 -0500 Subject: NVMe: New driver This driver is for devices that follow the NVM Express standard Signed-off-by: Matthew Wilcox --- drivers/block/Kconfig | 11 + drivers/block/Makefile | 1 + drivers/block/nvme.c | 1043 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1055 insertions(+) create mode 100644 drivers/block/nvme.c (limited to 'drivers/block') diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 6f07ec1c2f58..35e56e1c948f 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -315,6 +315,17 @@ config BLK_DEV_NBD If unsure, say N. +config BLK_DEV_NVME + tristate "NVM Express block device" + depends on PCI + ---help--- + The NVM Express driver is for solid state drives directly + connected to the PCI or PCI Express bus. If you know you + don't have one of these, it is safe to answer N. + + To compile this driver as a module, choose M here: the + module will be called nvme. + config BLK_DEV_OSD tristate "OSD object-as-blkdev support" depends on SCSI_OSD_ULD diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 76646e9a1c91..349539ad3ad9 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_XILINX_SYSACE) += xsysace.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_MG_DISK) += mg_disk.o obj-$(CONFIG_SUNVDC) += sunvdc.o +obj-$(CONFIG_BLK_DEV_NVME) += nvme.o obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o obj-$(CONFIG_BLK_DEV_UMEM) += umem.o diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c new file mode 100644 index 000000000000..ef66eccc2aa2 --- /dev/null +++ b/drivers/block/nvme.c @@ -0,0 +1,1043 @@ +/* + * NVM Express device driver + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NVME_Q_DEPTH 1024 +#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) +#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) +#define NVME_MINORS 64 + +static int nvme_major; +module_param(nvme_major, int, 0); + +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. + */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + u32 __iomem *dbs; + struct pci_dev *pci_dev; + int instance; + int queue_count; + u32 ctrl_config; + struct msix_entry *entry; + struct nvme_bar __iomem *bar; + struct list_head namespaces; +}; + +/* + * An NVM Express namespace is equivalent to a SCSI LUN + */ +struct nvme_ns { + struct list_head list; + + struct nvme_dev *dev; + struct request_queue *queue; + struct gendisk *disk; + + int ns_id; + int lba_shift; +}; + +/* + * An NVM Express queue. 
Each device has at least two (one for admin + * commands and one for I/O commands). + */ +struct nvme_queue { + struct device *q_dmadev; + spinlock_t q_lock; + struct nvme_command *sq_cmds; + volatile struct nvme_completion *cqes; + dma_addr_t sq_dma_addr; + dma_addr_t cq_dma_addr; + wait_queue_head_t sq_full; + struct bio_list sq_cong; + u32 __iomem *q_db; + u16 q_depth; + u16 cq_vector; + u16 sq_head; + u16 sq_tail; + u16 cq_head; + u16 cq_cycle; + unsigned long cmdid_data[]; +}; + +/* + * Check we didin't inadvertently grow the command struct + */ +static inline void _nvme_check_size(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); + BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); +} + +/** + * alloc_cmdid - Allocate a Command ID + * @param nvmeq The queue that will be used for this command + * @param ctx A pointer that will be passed to the handler + * @param handler The ID of the handler to call + * + * Allocate a Command ID for a queue. The data passed in will + * be passed to the completion handler. This is implemented by using + * the bottom two bits of the ctx pointer to store the handler ID. + * Passing in a pointer that's not 4-byte aligned will cause a BUG. + * We can change this if it becomes a problem. + */ +static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler) +{ + int depth = nvmeq->q_depth; + unsigned long data = (unsigned long)ctx | handler; + int cmdid; + + BUG_ON((unsigned long)ctx & 3); + + do { + cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); + if (cmdid >= depth) + return -EBUSY; + } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); + + nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(depth)] = data; + return cmdid; +} + +static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, + int handler) +{ + int cmdid; + wait_event_killable(nvmeq->sq_full, + (cmdid = alloc_cmdid(nvmeq, ctx, handler)) >= 0); + return (cmdid < 0) ? 
-EINTR : cmdid; +} + +/* If you need more than four handlers, you'll need to change how + * alloc_cmdid and nvme_process_cq work + */ +enum { + sync_completion_id = 0, + bio_completion_id, +}; + +static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) +{ + unsigned long data; + + data = nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)]; + clear_bit(cmdid, nvmeq->cmdid_data); + wake_up(&nvmeq->sq_full); + return data; +} + +static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) +{ + return ns->dev->queues[1]; +} + +static void put_nvmeq(struct nvme_queue *nvmeq) +{ +} + +/** + * nvme_submit_cmd: Copy a command into a queue and ring the doorbell + * @nvmeq: The queue to use + * @cmd: The command to send + * + * Safe to use from interrupt context + */ +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + unsigned long flags; + u16 tail; + /* XXX: Need to check tail isn't going to overrun head */ + spin_lock_irqsave(&nvmeq->q_lock, flags); + tail = nvmeq->sq_tail; + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + writel(tail, nvmeq->q_db); + if (++tail == nvmeq->q_depth) + tail = 0; + nvmeq->sq_tail = tail; + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + + return 0; +} + +struct nvme_req_info { + struct bio *bio; + int nents; + struct scatterlist sg[0]; +}; + +/* XXX: use a mempool */ +static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp) +{ + return kmalloc(sizeof(struct nvme_req_info) + + sizeof(struct scatterlist) * nseg, gfp); +} + +static void free_info(struct nvme_req_info *info) +{ + kfree(info); +} + +static void bio_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + struct nvme_req_info *info = ctx; + struct bio *bio = info->bio; + u16 status = le16_to_cpup(&cqe->status) >> 1; + + dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents, + bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + free_info(info); + bio_endio(bio, status ? 
-EIO : 0); +} + +static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, + struct bio *bio, enum dma_data_direction dma_dir, int psegs) +{ + struct bio_vec *bvec; + struct scatterlist *sg = info->sg; + int i, nsegs; + + sg_init_table(sg, psegs); + bio_for_each_segment(bvec, bio, i) { + sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); + /* XXX: handle non-mergable here */ + nsegs++; + } + info->nents = nsegs; + + return dma_map_sg(dev, info->sg, info->nents, dma_dir); +} + +static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct bio *bio) +{ + struct nvme_rw_command *cmnd; + struct nvme_req_info *info; + enum dma_data_direction dma_dir; + int cmdid; + u16 control; + u32 dsmgmt; + unsigned long flags; + int psegs = bio_phys_segments(ns->queue, bio); + + info = alloc_info(psegs, GFP_NOIO); + if (!info) + goto congestion; + info->bio = bio; + + cmdid = alloc_cmdid(nvmeq, info, bio_completion_id); + if (unlikely(cmdid < 0)) + goto free_info; + + control = 0; + if (bio->bi_rw & REQ_FUA) + control |= NVME_RW_FUA; + if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + dsmgmt = 0; + if (bio->bi_rw & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + spin_lock_irqsave(&nvmeq->q_lock, flags); + cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail].rw; + + if (bio_data_dir(bio)) { + cmnd->opcode = nvme_cmd_write; + dma_dir = DMA_TO_DEVICE; + } else { + cmnd->opcode = nvme_cmd_read; + dma_dir = DMA_FROM_DEVICE; + } + + nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs); + + cmnd->flags = 1; + cmnd->command_id = cmdid; + cmnd->nsid = cpu_to_le32(ns->ns_id); + cmnd->prp1 = cpu_to_le64(sg_phys(info->sg)); + /* XXX: Support more than one PRP */ + cmnd->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + cmnd->length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); + cmnd->control = cpu_to_le16(control); + cmnd->dsmgmt = cpu_to_le32(dsmgmt); + + writel(nvmeq->sq_tail, nvmeq->q_db); + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + + return 0; + + free_info: + free_info(info); + congestion: + return -EBUSY; +} + +/* + * NB: return value of non-zero would mean that we were a stacking driver. + * make_request must always succeed. 
+ */ +static int nvme_make_request(struct request_queue *q, struct bio *bio) +{ + struct nvme_ns *ns = q->queuedata; + struct nvme_queue *nvmeq = get_nvmeq(ns); + + if (nvme_submit_bio_queue(nvmeq, ns, bio)) { + blk_set_queue_congested(q, rw_is_sync(bio->bi_rw)); + bio_list_add(&nvmeq->sq_cong, bio); + } + put_nvmeq(nvmeq); + + return 0; +} + +struct sync_cmd_info { + struct task_struct *task; + u32 result; + int status; +}; + +static void sync_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + struct sync_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + wake_up_process(cmdinfo->task); +} + +typedef void (*completion_fn)(struct nvme_queue *, void *, + struct nvme_completion *); + +static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) +{ + u16 head, cycle; + + static const completion_fn completions[4] = { + [sync_completion_id] = sync_completion, + [bio_completion_id] = bio_completion, + }; + + head = nvmeq->cq_head; + cycle = nvmeq->cq_cycle; + + for (;;) { + unsigned long data; + void *ptr; + unsigned char handler; + struct nvme_completion cqe = nvmeq->cqes[head]; + if ((le16_to_cpu(cqe.status) & 1) != cycle) + break; + nvmeq->sq_head = le16_to_cpu(cqe.sq_head); + if (++head == nvmeq->q_depth) { + head = 0; + cycle = !cycle; + } + + data = free_cmdid(nvmeq, cqe.command_id); + handler = data & 3; + ptr = (void *)(data & ~3UL); + completions[handler](nvmeq, ptr, &cqe); + } + + /* If the controller ignores the cq head doorbell and continuously + * writes to the queue, it is theoretically possible to wrap around + * the queue twice and mistakenly return IRQ_NONE. Linux only + * requires that 0.1% of your interrupts are handled, so this isn't + * a big problem. + */ + if (head == nvmeq->cq_head && cycle == nvmeq->cq_cycle) + return IRQ_NONE; + + writel(head, nvmeq->q_db + 1); + nvmeq->cq_head = head; + nvmeq->cq_cycle = cycle; + + return IRQ_HANDLED; +} + +static irqreturn_t nvme_irq(int irq, void *data) +{ + return nvme_process_cq(data); +} + +/* + * Returns 0 on success. 
If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +static int nvme_submit_sync_cmd(struct nvme_queue *q, struct nvme_command *cmd, + u32 *result) +{ + int cmdid; + struct sync_cmd_info cmdinfo; + + cmdinfo.task = current; + cmdinfo.status = -EINTR; + + cmdid = alloc_cmdid_killable(q, &cmdinfo, sync_completion_id); + if (cmdid < 0) + return cmdid; + cmd->common.command_id = cmdid; + + set_current_state(TASK_UNINTERRUPTIBLE); + nvme_submit_cmd(q, cmd); + schedule(); + + if (result) + *result = cmdinfo.result; + + return cmdinfo.status; +} + +static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result) +{ + return nvme_submit_sync_cmd(dev->queues[0], cmd, result); +} + +static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) +{ + int status; + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.delete_queue.opcode = opcode; + c.delete_queue.qid = cpu_to_le16(id); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + int status; + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + + memset(&c, 0, sizeof(c)); + c.create_cq.opcode = nvme_admin_create_cq; + c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); + c.create_cq.cqid = cpu_to_le16(qid); + c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_cq.cq_flags = cpu_to_le16(flags); + c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + int status; + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; + + memset(&c, 0, sizeof(c)); + c.create_sq.opcode = nvme_admin_create_sq; + c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); + c.create_sq.sqid = cpu_to_le16(qid); + c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_sq.sq_flags = cpu_to_le16(flags); + c.create_sq.cqid = cpu_to_le16(qid); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); +} + +static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); +} + +static void nvme_free_queue(struct nvme_dev *dev, int qid) +{ + struct nvme_queue *nvmeq = dev->queues[qid]; + + free_irq(dev->entry[nvmeq->cq_vector].vector, nvmeq); + + /* Don't tell the adapter to delete the admin queue */ + if (qid) { + adapter_delete_sq(dev, qid); + adapter_delete_cq(dev, qid); + } + + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); +} + +static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, + int depth, int vector) +{ + struct device *dmadev = &dev->pci_dev->dev; + unsigned extra = (depth + BITS_TO_LONGS(depth)) * sizeof(long); + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); + if (!nvmeq) + return NULL; + + nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); + if (!nvmeq->cqes) + goto 
free_nvmeq; + memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); + + nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + goto free_cqdma; + + nvmeq->q_dmadev = dmadev; + spin_lock_init(&nvmeq->q_lock); + nvmeq->cq_head = 0; + nvmeq->cq_cycle = 1; + init_waitqueue_head(&nvmeq->sq_full); + bio_list_init(&nvmeq->sq_cong); + nvmeq->q_db = &dev->dbs[qid * 2]; + nvmeq->q_depth = depth; + nvmeq->cq_vector = vector; + + return nvmeq; + + free_cqdma: + dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, + nvmeq->cq_dma_addr); + free_nvmeq: + kfree(nvmeq); + return NULL; +} + +static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, + int qid, int cq_size, int vector) +{ + int result; + struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); + + result = adapter_alloc_cq(dev, qid, nvmeq); + if (result < 0) + goto free_nvmeq; + + result = adapter_alloc_sq(dev, qid, nvmeq); + if (result < 0) + goto release_cq; + + result = request_irq(dev->entry[vector].vector, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, "nvme", nvmeq); + if (result < 0) + goto release_sq; + + return nvmeq; + + release_sq: + adapter_delete_sq(dev, qid); + release_cq: + adapter_delete_cq(dev, qid); + free_nvmeq: + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); + return NULL; +} + +static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) +{ + int result; + u32 aqa; + struct nvme_queue *nvmeq; + + dev->dbs = ((void __iomem *)dev->bar) + 4096; + + nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + + aqa = nvmeq->q_depth - 1; + aqa |= aqa << 16; + + dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; + dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; + dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + + writel(aqa, &dev->bar->aqa); + writeq(nvmeq->sq_dma_addr, &dev->bar->asq); + writeq(nvmeq->cq_dma_addr, &dev->bar->acq); + writel(dev->ctrl_config, &dev->bar->cc); + + while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + } + + result = request_irq(dev->entry[0].vector, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, "nvme admin", nvmeq); + dev->queues[0] = nvmeq; + return result; +} + +static int nvme_identify(struct nvme_ns *ns, void __user *addr, int cns) +{ + struct nvme_dev *dev = ns->dev; + int status; + struct nvme_command c; + void *page; + dma_addr_t dma_addr; + + page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, + GFP_KERNEL); + + memset(&c, 0, sizeof(c)); + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cns ? 
0 : cpu_to_le32(ns->ns_id); + c.identify.prp1 = cpu_to_le64(dma_addr); + c.identify.cns = cpu_to_le32(cns); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + + if (status) + status = -EIO; + else if (copy_to_user(addr, page, 4096)) + status = -EFAULT; + + dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr); + + return status; +} + +static int nvme_get_range_type(struct nvme_ns *ns, void __user *addr) +{ + struct nvme_dev *dev = ns->dev; + int status; + struct nvme_command c; + void *page; + dma_addr_t dma_addr; + + page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, + GFP_KERNEL); + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(ns->ns_id); + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + + /* XXX: Assuming first range for now */ + if (status) + status = -EIO; + else if (copy_to_user(addr, page, 64)) + status = -EFAULT; + + dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr); + + return status; +} + +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case NVME_IOCTL_IDENTIFY_NS: + return nvme_identify(ns, (void __user *)arg, 0); + case NVME_IOCTL_IDENTIFY_CTRL: + return nvme_identify(ns, (void __user *)arg, 1); + case NVME_IOCTL_GET_RANGE_TYPE: + return nvme_get_range_type(ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + +static const struct block_device_operations nvme_fops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, +}; + +static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, + struct nvme_id_ns *id, struct nvme_lba_range_type *rt) +{ + struct nvme_ns *ns; + struct gendisk *disk; + int lbaf; + + if (rt->attributes & NVME_LBART_ATTRIB_HIDE) + return NULL; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return NULL; + ns->queue = blk_alloc_queue(GFP_KERNEL); + if (!ns->queue) + goto out_free_ns; + ns->queue->queue_flags = QUEUE_FLAG_DEFAULT | QUEUE_FLAG_NOMERGES | + QUEUE_FLAG_NONROT | QUEUE_FLAG_DISCARD; + blk_queue_make_request(ns->queue, nvme_make_request); + ns->dev = dev; + ns->queue->queuedata = ns; + + disk = alloc_disk(NVME_MINORS); + if (!disk) + goto out_free_queue; + ns->ns_id = index; + ns->disk = disk; + lbaf = id->flbas & 0xf; + ns->lba_shift = id->lbaf[lbaf].ds; + + disk->major = nvme_major; + disk->minors = NVME_MINORS; + disk->first_minor = NVME_MINORS * index; + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index); + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + return ns; + + out_free_queue: + blk_cleanup_queue(ns->queue); + out_free_ns: + kfree(ns); + return NULL; +} + +static void nvme_ns_free(struct nvme_ns *ns) +{ + put_disk(ns->disk); + blk_cleanup_queue(ns->queue); + kfree(ns); +} + +static int set_queue_count(struct nvme_dev *dev, int sq_count, int cq_count) +{ + int status; + u32 result; + struct nvme_command c; + u32 q_count = (sq_count - 1) | ((cq_count - 1) << 16); + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES); + c.features.dword11 = cpu_to_le32(q_count); + + status = nvme_submit_admin_cmd(dev, &c, &result); + if (status) + return -EIO; + return min(result & 0xffff, result >> 16) + 1; +} + +/* XXX: Create 
per-CPU queues */ +static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) +{ + int this_cpu; + + set_queue_count(dev, 1, 1); + + this_cpu = get_cpu(); + dev->queues[1] = nvme_create_queue(dev, 1, NVME_Q_DEPTH, this_cpu); + put_cpu(); + if (!dev->queues[1]) + return -ENOMEM; + dev->queue_count++; + + return 0; +} + +static void nvme_free_queues(struct nvme_dev *dev) +{ + int i; + + for (i = dev->queue_count - 1; i >= 0; i--) + nvme_free_queue(dev, i); +} + +static int __devinit nvme_dev_add(struct nvme_dev *dev) +{ + int res, nn, i; + struct nvme_ns *ns, *next; + void *id; + dma_addr_t dma_addr; + struct nvme_command cid, crt; + + res = nvme_setup_io_queues(dev); + if (res) + return res; + + /* XXX: Switch to a SG list once prp2 works */ + id = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, + GFP_KERNEL); + + memset(&cid, 0, sizeof(cid)); + cid.identify.opcode = nvme_admin_identify; + cid.identify.nsid = 0; + cid.identify.prp1 = cpu_to_le64(dma_addr); + cid.identify.cns = cpu_to_le32(1); + + res = nvme_submit_admin_cmd(dev, &cid, NULL); + if (res) { + res = -EIO; + goto out_free; + } + + nn = le32_to_cpup(&((struct nvme_id_ctrl *)id)->nn); + + cid.identify.cns = 0; + memset(&crt, 0, sizeof(crt)); + crt.features.opcode = nvme_admin_get_features; + crt.features.prp1 = cpu_to_le64(dma_addr + 4096); + crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); + + for (i = 0; i < nn; i++) { + cid.identify.nsid = cpu_to_le32(i); + res = nvme_submit_admin_cmd(dev, &cid, NULL); + if (res) + continue; + + if (((struct nvme_id_ns *)id)->ncap == 0) + continue; + + crt.features.nsid = cpu_to_le32(i); + res = nvme_submit_admin_cmd(dev, &crt, NULL); + if (res) + continue; + + ns = nvme_alloc_ns(dev, i, id, id + 4096); + if (ns) + list_add_tail(&ns->list, &dev->namespaces); + } + list_for_each_entry(ns, &dev->namespaces, list) + add_disk(ns->disk); + + dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + return 0; + + out_free: + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { + list_del(&ns->list); + nvme_ns_free(ns); + } + + dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + return res; +} + +static int nvme_dev_remove(struct nvme_dev *dev) +{ + struct nvme_ns *ns, *next; + + /* TODO: wait all I/O finished or cancel them */ + + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { + list_del(&ns->list); + del_gendisk(ns->disk); + nvme_ns_free(ns); + } + + nvme_free_queues(dev); + + return 0; +} + +/* XXX: Use an ida or something to let remove / add work correctly */ +static void nvme_set_instance(struct nvme_dev *dev) +{ + static int instance; + dev->instance = instance++; +} + +static void nvme_release_instance(struct nvme_dev *dev) +{ +} + +static int __devinit nvme_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int result = -ENOMEM; + struct nvme_dev *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), + GFP_KERNEL); + if (!dev->entry) + goto free; + dev->queues = kcalloc(2, sizeof(void *), GFP_KERNEL); + if (!dev->queues) + goto free; + + INIT_LIST_HEAD(&dev->namespaces); + dev->pci_dev = pdev; + pci_set_drvdata(pdev, dev); + dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64)); + nvme_set_instance(dev); + + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) { + result = -ENOMEM; + goto disable; + } + + result = nvme_configure_admin_queue(dev); + if (result) + goto unmap; + dev->queue_count++; + + result = 
nvme_dev_add(dev); + if (result) + goto delete; + return 0; + + delete: + nvme_free_queues(dev); + unmap: + iounmap(dev->bar); + disable: + pci_disable_msix(pdev); + nvme_release_instance(dev); + free: + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); + return result; +} + +static void __devexit nvme_remove(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_dev_remove(dev); + pci_disable_msix(pdev); + iounmap(dev->bar); + nvme_release_instance(dev); + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); +} + +/* These functions are yet to be implemented */ +#define nvme_error_detected NULL +#define nvme_dump_registers NULL +#define nvme_link_reset NULL +#define nvme_slot_reset NULL +#define nvme_error_resume NULL +#define nvme_suspend NULL +#define nvme_resume NULL + +static struct pci_error_handlers nvme_err_handler = { + .error_detected = nvme_error_detected, + .mmio_enabled = nvme_dump_registers, + .link_reset = nvme_link_reset, + .slot_reset = nvme_slot_reset, + .resume = nvme_error_resume, +}; + +/* Move to pci_ids.h later */ +#define PCI_CLASS_STORAGE_EXPRESS 0x010802 + +static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = { + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, nvme_id_table); + +static struct pci_driver nvme_driver = { + .name = "nvme", + .id_table = nvme_id_table, + .probe = nvme_probe, + .remove = __devexit_p(nvme_remove), + .suspend = nvme_suspend, + .resume = nvme_resume, + .err_handler = &nvme_err_handler, +}; + +static int __init nvme_init(void) +{ + int result; + + nvme_major = register_blkdev(nvme_major, "nvme"); + if (nvme_major <= 0) + return -EBUSY; + + result = pci_register_driver(&nvme_driver); + if (!result) + return 0; + + unregister_blkdev(nvme_major, "nvme"); + return result; +} + +static void __exit nvme_exit(void) +{ + pci_unregister_driver(&nvme_driver); + unregister_blkdev(nvme_major, "nvme"); +} + +MODULE_AUTHOR("Matthew Wilcox "); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.1"); +module_init(nvme_init); +module_exit(nvme_exit); -- cgit From 3001082cac4bf6ffd09f72b39e6292ad6394ef17 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 09:10:15 -0500 Subject: NVMe: Factor out queue_request_irq() Two callers with an almost identical long string of arguments, and introducing a third soon. Time to factor out the commonalities. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index ef66eccc2aa2..b10e064795ed 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -568,6 +568,13 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, return NULL; } +static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, + const char *name) +{ + return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, name, nvmeq); +} + static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, int cq_size, int vector) { @@ -582,8 +589,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, if (result < 0) goto release_cq; - result = request_irq(dev->entry[vector].vector, nvme_irq, - IRQF_DISABLED | IRQF_SHARED, "nvme", nvmeq); + result = queue_request_irq(dev, nvmeq, "nvme"); if (result < 0) goto release_sq; @@ -630,8 +636,7 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) return -EINTR; } - result = request_irq(dev->entry[0].vector, nvme_irq, - IRQF_DISABLED | IRQF_SHARED, "nvme admin", nvmeq); + result = queue_request_irq(dev, nvmeq, "nvme admin"); dev->queues[0] = nvmeq; return result; } -- cgit From b3b06812e199f248561ce7824a4a8a9cd573c05a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 09:14:34 -0500 Subject: NVMe: Reduce set_queue_count arguments by one sq_count and cq_count are always the same, so just call it 'count'. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b10e064795ed..7efd7e92b637 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -777,12 +777,12 @@ static void nvme_ns_free(struct nvme_ns *ns) kfree(ns); } -static int set_queue_count(struct nvme_dev *dev, int sq_count, int cq_count) +static int set_queue_count(struct nvme_dev *dev, int count) { int status; u32 result; struct nvme_command c; - u32 q_count = (sq_count - 1) | ((cq_count - 1) << 16); + u32 q_count = (count - 1) | ((count - 1) << 16); memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; @@ -800,7 +800,7 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) { int this_cpu; - set_queue_count(dev, 1, 1); + set_queue_count(dev, 1); this_cpu = get_cpu(); dev->queues[1] = nvme_create_queue(dev, 1, NVME_Q_DEPTH, this_cpu); -- cgit From 1b23484bd012c078de2ea939249e2fb2e85a0a6e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 13:01:49 -0500 Subject: NVMe: Implement per-CPU queues Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 61 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 7efd7e92b637..b6a213c98584 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -172,11 +172,17 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) { - return ns->dev->queues[1]; + int qid, cpu = get_cpu(); + if (cpu < ns->dev->queue_count) + qid = cpu + 1; + else + qid = (cpu % rounddown_pow_of_two(ns->dev->queue_count)) + 1; + return ns->dev->queues[qid]; } static void put_nvmeq(struct nvme_queue *nvmeq) { 
+ put_cpu(); } /** @@ -795,19 +801,51 @@ static int set_queue_count(struct nvme_dev *dev, int count) return min(result & 0xffff, result >> 16) + 1; } -/* XXX: Create per-CPU queues */ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) { - int this_cpu; + int result, cpu, i, nr_queues; - set_queue_count(dev, 1); + nr_queues = num_online_cpus(); + result = set_queue_count(dev, nr_queues); + if (result < 0) + return result; + if (result < nr_queues) + nr_queues = result; - this_cpu = get_cpu(); - dev->queues[1] = nvme_create_queue(dev, 1, NVME_Q_DEPTH, this_cpu); - put_cpu(); - if (!dev->queues[1]) - return -ENOMEM; - dev->queue_count++; + /* Deregister the admin queue's interrupt */ + free_irq(dev->entry[0].vector, dev->queues[0]); + + for (i = 0; i < nr_queues; i++) + dev->entry[i].entry = i; + for (;;) { + result = pci_enable_msix(dev->pci_dev, dev->entry, nr_queues); + if (result == 0) { + break; + } else if (result > 0) { + nr_queues = result; + continue; + } else { + nr_queues = 1; + break; + } + } + + result = queue_request_irq(dev, dev->queues[0], "nvme admin"); + /* XXX: handle failure here */ + + cpu = cpumask_first(cpu_online_mask); + for (i = 0; i < nr_queues; i++) { + irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); + cpu = cpumask_next(cpu, cpu_online_mask); + } + + for (i = 0; i < nr_queues; i++) { + dev->queues[i + 1] = nvme_create_queue(dev, i + 1, + NVME_Q_DEPTH, i); + if (!dev->queues[i + 1]) + return -ENOMEM; + dev->queue_count++; + } return 0; } @@ -931,7 +969,8 @@ static int __devinit nvme_probe(struct pci_dev *pdev, GFP_KERNEL); if (!dev->entry) goto free; - dev->queues = kcalloc(2, sizeof(void *), GFP_KERNEL); + dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), + GFP_KERNEL); if (!dev->queues) goto free; -- cgit From 821234603b265f59d7eebce16d9e8beca2a5752d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 13:24:06 -0500 Subject: NVMe: Rename 'cycle' to 'phase' It's called the phase bit in the current draft Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b6a213c98584..3d917a87ea93 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -93,7 +93,7 @@ struct nvme_queue { u16 sq_head; u16 sq_tail; u16 cq_head; - u16 cq_cycle; + u16 cq_phase; unsigned long cmdid_data[]; }; @@ -364,7 +364,7 @@ typedef void (*completion_fn)(struct nvme_queue *, void *, static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) { - u16 head, cycle; + u16 head, phase; static const completion_fn completions[4] = { [sync_completion_id] = sync_completion, @@ -372,19 +372,19 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) }; head = nvmeq->cq_head; - cycle = nvmeq->cq_cycle; + phase = nvmeq->cq_phase; for (;;) { unsigned long data; void *ptr; unsigned char handler; struct nvme_completion cqe = nvmeq->cqes[head]; - if ((le16_to_cpu(cqe.status) & 1) != cycle) + if ((le16_to_cpu(cqe.status) & 1) != phase) break; nvmeq->sq_head = le16_to_cpu(cqe.sq_head); if (++head == nvmeq->q_depth) { head = 0; - cycle = !cycle; + phase = !phase; } data = free_cmdid(nvmeq, cqe.command_id); @@ -399,12 +399,12 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) * requires that 0.1% of your interrupts are handled, so this isn't * a big problem. 
*/ - if (head == nvmeq->cq_head && cycle == nvmeq->cq_cycle) + if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) return IRQ_NONE; writel(head, nvmeq->q_db + 1); nvmeq->cq_head = head; - nvmeq->cq_cycle = cycle; + nvmeq->cq_phase = phase; return IRQ_HANDLED; } @@ -557,7 +557,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_dmadev = dmadev; spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; - nvmeq->cq_cycle = 1; + nvmeq->cq_phase = 1; init_waitqueue_head(&nvmeq->sq_full); bio_list_init(&nvmeq->sq_cong); nvmeq->q_db = &dev->dbs[qid * 2]; -- cgit From 53c9577e9ca68a633c6e9df2b54eaecacfa77f62 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 13:42:34 -0500 Subject: NVMe: Fix admin IRQ claim on real hardware The admin IRQ is supposed to use the pin-based (or single message MSI) interrupt. Accomplish this by filling in entry[0]'s vector with the INTx irq number. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 3d917a87ea93..44a9d5edd8db 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -979,6 +979,7 @@ static int __devinit nvme_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, dev); dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64)); nvme_set_instance(dev); + dev->entry[0].vector = pdev->irq; dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) { -- cgit From 36c14ed9caa957c686d4a48fd598a5ec2aa0331b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 24 Jan 2011 07:52:07 -0500 Subject: NVMe: Use PRP2 for the nvme_identify ioctl DMA the result straight to userspace instead of bounce-buffering in the kernel. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 58 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 17 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 44a9d5edd8db..c0ef1dd1cc90 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -647,33 +647,57 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_identify(struct nvme_ns *ns, void __user *addr, int cns) +static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) { struct nvme_dev *dev = ns->dev; - int status; + int i, err, count, nents, offset; struct nvme_command c; - void *page; - dma_addr_t dma_addr; - - page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, - GFP_KERNEL); + struct scatterlist sg[2]; + struct page *pages[2]; + + if (addr & 3) + return -EINVAL; + offset = offset_in_page(addr); + count = offset ? 2 : 1; + + err = get_user_pages_fast(addr, count, 1, pages); + if (err < count) { + count = err; + err = -EFAULT; + goto put_pages; + } + sg_init_table(sg, count); + for (i = 0; i < count; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + nents = dma_map_sg(&dev->pci_dev->dev, sg, count, DMA_FROM_DEVICE); + if (!nents) + goto put_pages; memset(&c, 0, sizeof(c)); c.identify.opcode = nvme_admin_identify; c.identify.nsid = cns ? 
0 : cpu_to_le32(ns->ns_id); - c.identify.prp1 = cpu_to_le64(dma_addr); + c.identify.prp1 = cpu_to_le64(sg_dma_address(&sg[0]) + offset); + if (count > 1) { + u64 dma_addr; + if (nents > 1) + dma_addr = sg_dma_address(&sg[1]); + else + dma_addr = sg_dma_address(&sg[0]) + PAGE_SIZE; + c.identify.prp2 = cpu_to_le64(dma_addr); + } c.identify.cns = cpu_to_le32(cns); - status = nvme_submit_admin_cmd(dev, &c, NULL); + err = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - status = -EIO; - else if (copy_to_user(addr, page, 4096)) - status = -EFAULT; + if (err) + err = -EIO; - dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr); + dma_unmap_sg(&dev->pci_dev->dev, sg, nents, DMA_FROM_DEVICE); + put_pages: + for (i = 0; i < count; i++) + put_page(pages[i]); - return status; + return err; } static int nvme_get_range_type(struct nvme_ns *ns, void __user *addr) @@ -713,9 +737,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, switch (cmd) { case NVME_IOCTL_IDENTIFY_NS: - return nvme_identify(ns, (void __user *)arg, 0); + return nvme_identify(ns, arg, 0); case NVME_IOCTL_IDENTIFY_CTRL: - return nvme_identify(ns, (void __user *)arg, 1); + return nvme_identify(ns, arg, 1); case NVME_IOCTL_GET_RANGE_TYPE: return nvme_get_range_type(ns, (void __user *)arg); default: -- cgit From ff22b54fda2078fc3cd1bcdcb7a5ce5d08fd6591 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 26 Jan 2011 10:02:29 -0500 Subject: NVMe: Add nvme_setup_prps() Generalise the code from nvme_identify() that sets PRP1 & PRP2 so that it's usable for commands sent by nvme_submit_bio_queue(). Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 70 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 24 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index c0ef1dd1cc90..1e57737b1760 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -240,6 +240,36 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, bio_endio(bio, status ? 
-EIO : 0); } +/* length is in bytes */ +static void nvme_setup_prps(struct nvme_common_command *cmd, + struct scatterlist *sg, int length) +{ + int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg); + int offset = offset_in_page(dma_addr); + + cmd->prp1 = cpu_to_le64(dma_addr); + length -= (PAGE_SIZE - offset); + if (length <= 0) + return; + + dma_len -= (PAGE_SIZE - offset); + if (dma_len) { + dma_addr += (PAGE_SIZE - offset); + } else { + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + if (length <= PAGE_SIZE) { + cmd->prp2 = cpu_to_le64(dma_addr); + return; + } + + /* XXX: support PRP lists */ +} + static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { @@ -261,7 +291,7 @@ static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio) { - struct nvme_rw_command *cmnd; + struct nvme_command *cmnd; struct nvme_req_info *info; enum dma_data_direction dma_dir; int cmdid; @@ -290,27 +320,26 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; spin_lock_irqsave(&nvmeq->q_lock, flags); - cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail].rw; + cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; if (bio_data_dir(bio)) { - cmnd->opcode = nvme_cmd_write; + cmnd->rw.opcode = nvme_cmd_write; dma_dir = DMA_TO_DEVICE; } else { - cmnd->opcode = nvme_cmd_read; + cmnd->rw.opcode = nvme_cmd_read; dma_dir = DMA_FROM_DEVICE; } nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs); - cmnd->flags = 1; - cmnd->command_id = cmdid; - cmnd->nsid = cpu_to_le32(ns->ns_id); - cmnd->prp1 = cpu_to_le64(sg_phys(info->sg)); - /* XXX: Support more than one PRP */ - cmnd->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); - cmnd->length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); - cmnd->control = cpu_to_le16(control); - cmnd->dsmgmt = cpu_to_le32(dsmgmt); + cmnd->rw.flags = 1; + cmnd->rw.command_id = cmdid; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + nvme_setup_prps(&cmnd->common, info->sg, bio->bi_size); + cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); writel(nvmeq->sq_tail, nvmeq->q_db); if (++nvmeq->sq_tail == nvmeq->q_depth) @@ -667,8 +696,9 @@ static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) goto put_pages; } sg_init_table(sg, count); - for (i = 0; i < count; i++) - sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + sg_set_page(&sg[0], pages[0], PAGE_SIZE - offset, offset); + if (count > 1) + sg_set_page(&sg[1], pages[1], offset, 0); nents = dma_map_sg(&dev->pci_dev->dev, sg, count, DMA_FROM_DEVICE); if (!nents) goto put_pages; @@ -676,15 +706,7 @@ static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) memset(&c, 0, sizeof(c)); c.identify.opcode = nvme_admin_identify; c.identify.nsid = cns ? 
0 : cpu_to_le32(ns->ns_id); - c.identify.prp1 = cpu_to_le64(sg_dma_address(&sg[0]) + offset); - if (count > 1) { - u64 dma_addr; - if (nents > 1) - dma_addr = sg_dma_address(&sg[1]); - else - dma_addr = sg_dma_address(&sg[0]) + PAGE_SIZE; - c.identify.prp2 = cpu_to_le64(dma_addr); - } + nvme_setup_prps(&c.common, sg, 4096); c.identify.cns = cpu_to_le32(cns); err = nvme_submit_admin_cmd(dev, &c, NULL); -- cgit From b8deb62cf271fa9381edc8cf52bcae2f0225c55a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 26 Jan 2011 10:08:25 -0500 Subject: NVMe: Zero the command before we send it Make sure there's no left-over bits set from previous commands that used this slot. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 1e57737b1760..25ca7af96469 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -322,6 +322,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, spin_lock_irqsave(&nvmeq->q_lock, flags); cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + memset(cmnd, 0, sizeof(*cmnd)); if (bio_data_dir(bio)) { cmnd->rw.opcode = nvme_cmd_write; dma_dir = DMA_TO_DEVICE; -- cgit From bd38c5557cf482fc195e2264b32ea62eed60730a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 26 Jan 2011 14:34:32 -0500 Subject: NVMe: Change NVME_IOCTL_GET_RANGE_TYPE to return all the ranges Factor out most of nvme_identify() into a new nvme_submit_user_admin_command() function. Change nvme_get_range_type() to call it and change nvme_ioctl to realise that it's getting back all 64 ranges. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 49 +++++++++++++++++++------------------------------ 1 file changed, 19 insertions(+), 30 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 25ca7af96469..b28d188d10f8 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -677,18 +677,17 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) +static int nvme_submit_user_admin_command(struct nvme_dev *dev, unsigned long addr, + unsigned length, struct nvme_command *cmd) { - struct nvme_dev *dev = ns->dev; int i, err, count, nents, offset; - struct nvme_command c; struct scatterlist sg[2]; struct page *pages[2]; if (addr & 3) return -EINVAL; offset = offset_in_page(addr); - count = offset ? 2 : 1; + count = ((offset + length) > PAGE_SIZE) ? 2 : 1; err = get_user_pages_fast(addr, count, 1, pages); if (err < count) { @@ -704,13 +703,9 @@ static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) if (!nents) goto put_pages; - memset(&c, 0, sizeof(c)); - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cns ? 
0 : cpu_to_le32(ns->ns_id); - nvme_setup_prps(&c.common, sg, 4096); - c.identify.cns = cpu_to_le32(cns); + nvme_setup_prps(&cmd->common, sg, length); - err = nvme_submit_admin_cmd(dev, &c, NULL); + err = nvme_submit_admin_cmd(dev, cmd, NULL); if (err) err = -EIO; @@ -723,34 +718,28 @@ static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) return err; } -static int nvme_get_range_type(struct nvme_ns *ns, void __user *addr) +static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) { - struct nvme_dev *dev = ns->dev; - int status; struct nvme_command c; - void *page; - dma_addr_t dma_addr; - page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, - GFP_KERNEL); + memset(&c, 0, sizeof(c)); + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id); + c.identify.cns = cpu_to_le32(cns); + + return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c); +} + +static int nvme_get_range_type(struct nvme_ns *ns, unsigned long addr) +{ + struct nvme_command c; memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; c.features.nsid = cpu_to_le32(ns->ns_id); - c.features.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); - status = nvme_submit_admin_cmd(dev, &c, NULL); - - /* XXX: Assuming first range for now */ - if (status) - status = -EIO; - else if (copy_to_user(addr, page, 64)) - status = -EFAULT; - - dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr); - - return status; + return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c); } static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, @@ -764,7 +753,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, case NVME_IOCTL_IDENTIFY_CTRL: return nvme_identify(ns, arg, 1); case NVME_IOCTL_GET_RANGE_TYPE: - return nvme_get_range_type(ns, (void __user *)arg); + return nvme_get_range_type(ns, arg); default: return -ENOTTY; } -- cgit From 7fc3cdabba75c2516b8b645eb0ca7907aea70415 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 26 Jan 2011 17:05:50 -0500 Subject: NVMe: Create nvme_map_user_pages() and nvme_unmap_user_pages() These are generalisations of the code that was in nvme_submit_user_admin_command(). Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 68 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 15 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b28d188d10f8..f44d6cd87ea2 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -677,17 +677,22 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_submit_user_admin_command(struct nvme_dev *dev, unsigned long addr, - unsigned length, struct nvme_command *cmd) +static int nvme_map_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, unsigned length, + struct scatterlist **sgp) { int i, err, count, nents, offset; - struct scatterlist sg[2]; - struct page *pages[2]; + struct scatterlist *sg; + struct page **pages; if (addr & 3) return -EINVAL; + if (!length) + return -EINVAL; + offset = offset_in_page(addr); - count = ((offset + length) > PAGE_SIZE) ? 
2 : 1; + count = DIV_ROUND_UP(offset + length, PAGE_SIZE); + pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); err = get_user_pages_fast(addr, count, 1, pages); if (err < count) { @@ -695,27 +700,60 @@ static int nvme_submit_user_admin_command(struct nvme_dev *dev, unsigned long ad err = -EFAULT; goto put_pages; } + + sg = kcalloc(count, sizeof(*sg), GFP_KERNEL); sg_init_table(sg, count); sg_set_page(&sg[0], pages[0], PAGE_SIZE - offset, offset); - if (count > 1) - sg_set_page(&sg[1], pages[1], offset, 0); - nents = dma_map_sg(&dev->pci_dev->dev, sg, count, DMA_FROM_DEVICE); + length -= (PAGE_SIZE - offset); + for (i = 1; i < count; i++) { + sg_set_page(&sg[i], pages[i], min_t(int, length, PAGE_SIZE), 0); + length -= PAGE_SIZE; + } + + err = -ENOMEM; + nents = dma_map_sg(&dev->pci_dev->dev, sg, count, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (!nents) goto put_pages; - nvme_setup_prps(&cmd->common, sg, length); + kfree(pages); + *sgp = sg; + return nents; - err = nvme_submit_admin_cmd(dev, cmd, NULL); + put_pages: + for (i = 0; i < count; i++) + put_page(pages[i]); + kfree(pages); + return err; +} - if (err) - err = -EIO; +static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, int length, + struct scatterlist *sg, int nents) +{ + int i, count; + count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE); dma_unmap_sg(&dev->pci_dev->dev, sg, nents, DMA_FROM_DEVICE); - put_pages: + for (i = 0; i < count; i++) - put_page(pages[i]); + put_page(sg_page(&sg[i])); +} - return err; +static int nvme_submit_user_admin_command(struct nvme_dev *dev, + unsigned long addr, unsigned length, + struct nvme_command *cmd) +{ + int err, nents; + struct scatterlist *sg; + + nents = nvme_map_user_pages(dev, 0, addr, length, &sg); + if (nents < 0) + return nents; + nvme_setup_prps(&cmd->common, sg, length); + err = nvme_submit_admin_cmd(dev, cmd, NULL); + nvme_unmap_user_pages(dev, 0, addr, length, sg, nents); + return err ? -EIO : 0; } static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) -- cgit From a53295b6998f62d961c29e54051c1cf1d738c2b3 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 16:13:29 -0500 Subject: NVMe: Add NVME_IOCTL_SUBMIT_IO Allow userspace to submit synchronous I/O like the SCSI sg interface does. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f44d6cd87ea2..40fb2e1bdfe4 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -780,6 +780,47 @@ static int nvme_get_range_type(struct nvme_ns *ns, unsigned long addr) return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c); } +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_dev *dev = ns->dev; + struct nvme_queue *nvmeq; + struct nvme_user_io io; + struct nvme_command c; + unsigned length; + u32 result; + int nents, status; + struct scatterlist *sg; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + length = io.nblocks << io.block_shift; + nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length, &sg); + if (nents < 0) + return nents; + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(io.nsid); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks - 1); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le16(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); /* XXX: endian? */ + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + /* XXX: metadata */ + nvme_setup_prps(&c.common, sg, length); + + nvmeq = get_nvmeq(ns); + status = nvme_submit_sync_cmd(nvmeq, &c, &result); + put_nvmeq(nvmeq); + + nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); + put_user(result, &uio->result); + return status; +} + static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -792,6 +833,8 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, return nvme_identify(ns, arg, 1); case NVME_IOCTL_GET_RANGE_TYPE: return nvme_get_range_type(ns, arg); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, (void __user *)arg); default: return -ENOTTY; } -- cgit From 51814232ecae90f888c902e252306df8d017f0dd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 16:18:08 -0500 Subject: NVMe: Read the model, serial & firmware rev from the controller Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 40fb2e1bdfe4..12e37c1cf057 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -58,6 +58,9 @@ struct nvme_dev { struct msix_entry *entry; struct nvme_bar __iomem *bar; struct list_head namespaces; + char serial[20]; + char model[40]; + char firmware_rev[8]; }; /* @@ -979,6 +982,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) { int res, nn, i; struct nvme_ns *ns, *next; + struct nvme_id_ctrl *ctrl; void *id; dma_addr_t dma_addr; struct nvme_command cid, crt; @@ -1003,7 +1007,11 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) goto out_free; } - nn = le32_to_cpup(&((struct nvme_id_ctrl *)id)->nn); + ctrl = id; + nn = le32_to_cpup(&ctrl->nn); + memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); + memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); + memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); cid.identify.cns = 0; memset(&crt, 0, sizeof(crt)); -- cgit From 8e9f0e71150bf6277d0ea40bc8feb1338ddf13fd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 31 Jan 2011 10:46:14 -0500 Subject: 
NVMe: Remove 'node' from nvme_dev We don't keep a list of nvme_dev any more Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 12e37c1cf057..9377cf32f813 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -48,7 +48,6 @@ module_param(nvme_major, int, 0); * Represents an NVM Express device. Each nvme_dev is a PCI function. */ struct nvme_dev { - struct list_head node; struct nvme_queue **queues; u32 __iomem *dbs; struct pci_dev *pci_dev; -- cgit From 3f85d50b609e8a5ef151656210203a6e94c19538 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 08:39:04 -0500 Subject: NVMe: Check returns from nvme_alloc_queue() It can return NULL, so handle that. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 9377cf32f813..dc821776be94 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -619,6 +619,9 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int result; struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); + if (!nvmeq) + return NULL; + result = adapter_alloc_cq(dev, qid, nvmeq); if (result < 0) goto free_nvmeq; @@ -655,6 +658,8 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) dev->dbs = ((void __iomem *)dev->bar) + 4096; nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + if (!nvmeq) + return -ENOMEM; aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; -- cgit From 0ee5a7d7cb9309bd393a25c395f19fb12a842602 Mon Sep 17 00:00:00 2001 From: Shane Michael Matthews Date: Tue, 1 Feb 2011 08:49:30 -0500 Subject: NVMe: Enable and disable the PCI device Call pci_enable_device_mem() at initialisation and pci_disable_device at exit. 
Signed-off-by: Shane Michael Matthews Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index dc821776be94..1dda4b5c2302 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1103,6 +1103,9 @@ static int __devinit nvme_probe(struct pci_dev *pdev, if (!dev->queues) goto free; + if (pci_enable_device_mem(pdev)) + goto free; + INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; pci_set_drvdata(pdev, dev); @@ -1133,6 +1136,7 @@ static int __devinit nvme_probe(struct pci_dev *pdev, disable: pci_disable_msix(pdev); nvme_release_instance(dev); + pci_disable_device(pdev); free: kfree(dev->queues); kfree(dev->entry); @@ -1147,6 +1151,7 @@ static void __devexit nvme_remove(struct pci_dev *pdev) pci_disable_msix(pdev); iounmap(dev->bar); nvme_release_instance(dev); + pci_disable_device(pdev); kfree(dev->queues); kfree(dev->entry); kfree(dev); -- cgit From f64d3365a3e5cb46e69db7e2c82a7cb9a5bed1b8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 09:01:59 -0500 Subject: NVMe: Enable device DMA Need to call pci_set_master() to enable device DMA Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 1dda4b5c2302..128fd70031a9 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1105,6 +1105,7 @@ static int __devinit nvme_probe(struct pci_dev *pdev, if (pci_enable_device_mem(pdev)) goto free; + pci_set_master(pdev); INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; -- cgit From 2930353f9f2b9e4629e935acd970cb73c1171229 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 16:23:39 -0500 Subject: NVMe: Allow queues to be allocated above 4GB Need to call dma_set_coherent_mask() to allow queues to be allocated above 4GB. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 128fd70031a9..46f872021369 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1110,7 +1110,8 @@ static int __devinit nvme_probe(struct pci_dev *pdev, INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; pci_set_drvdata(pdev, dev); - dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64)); + dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); nvme_set_instance(dev); dev->entry[0].vector = pdev->irq; -- cgit From 574e8b95bc3780e10e9b5e9d51074d503dd3d5d9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 16:24:35 -0500 Subject: NVMe: Request I/O regions Calling pci_request_selected_regions() reserves these regions for our use. 
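As a rough, self-contained sketch of the calling convention (only the "nvme" owner string comes from the patch; the wrapper function and its error handling are assumptions):

#include <linux/pci.h>

static int example_request_regions(struct pci_dev *pdev)
{
	int bars = pci_select_bars(pdev, IORESOURCE_MEM);	/* bitmask of memory BARs */

	if (pci_request_selected_regions(pdev, bars, "nvme"))
		return -EBUSY;		/* another driver already owns these regions */

	/* ... ioremap() the controller registers and bring it up ... */

	pci_release_regions(pdev);	/* balance the request on remove or on an error path */
	return 0;
}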
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 46f872021369..bda91178f475 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1088,7 +1088,7 @@ static void nvme_release_instance(struct nvme_dev *dev) static int __devinit nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - int result = -ENOMEM; + int bars, result = -ENOMEM; struct nvme_dev *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); @@ -1106,6 +1106,9 @@ static int __devinit nvme_probe(struct pci_dev *pdev, if (pci_enable_device_mem(pdev)) goto free; pci_set_master(pdev); + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (pci_request_selected_regions(pdev, bars, "nvme")) + goto disable; INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; @@ -1118,7 +1121,7 @@ static int __devinit nvme_probe(struct pci_dev *pdev, dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) { result = -ENOMEM; - goto disable; + goto disable_msix; } result = nvme_configure_admin_queue(dev); @@ -1135,10 +1138,12 @@ static int __devinit nvme_probe(struct pci_dev *pdev, nvme_free_queues(dev); unmap: iounmap(dev->bar); - disable: + disable_msix: pci_disable_msix(pdev); nvme_release_instance(dev); + disable: pci_disable_device(pdev); + pci_release_regions(pdev); free: kfree(dev->queues); kfree(dev->entry); @@ -1154,6 +1159,7 @@ static void __devexit nvme_remove(struct pci_dev *pdev) iounmap(dev->bar); nvme_release_instance(dev); pci_disable_device(pdev); + pci_release_regions(pdev); kfree(dev->queues); kfree(dev->entry); kfree(dev); -- cgit From 5911f20039ce59d7e7834f0c42151cf759b6f786 Mon Sep 17 00:00:00 2001 From: Shane Michael Matthews Date: Tue, 1 Feb 2011 11:31:55 -0500 Subject: NVMe: Disable the device before we write the admin queues In case the card has been left in a partially-configured state, write 0 to the Enable bit. Signed-off-by: Shane Michael Matthews Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index bda91178f475..e3d921577b94 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -668,6 +668,7 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + writel(0, &dev->bar->cc); writel(aqa, &dev->bar->aqa); writeq(nvmeq->sq_dma_addr, &dev->bar->asq); writeq(nvmeq->cq_dma_addr, &dev->bar->acq); -- cgit From 388f037f4e7f0a24bac6b1a24f144f5d939f58cf Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 12:49:38 -0500 Subject: NVMe: Move sysfs entries to the right place Because I wasn't setting driverfs_dev, the devices were showing up under /sys/devices/virtual/block. Now they appear underneath the PCI device which they belong to. 
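The fix itself is a single assignment; sketched here with a hypothetical surrounding function (disk and dev follow the driver's naming, and add_disk() appears only for context):

static void example_register_disk(struct nvme_dev *dev, struct gendisk *disk)
{
	disk->driverfs_dev = &dev->pci_dev->dev;	/* parent the disk under its PCI function */
	add_disk(disk);		/* the sysfs node now lands under /sys/devices/pci..., not virtual/block */
}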
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index e3d921577b94..744db3877c42 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -889,6 +889,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; + disk->driverfs_dev = &dev->pci_dev->dev; sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index); set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); -- cgit From 6ee44cdced04a53dc4f27eb97067e6cd33784726 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 3 Feb 2011 10:58:26 -0500 Subject: NVMe: Add download / activate firmware ioctls Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 744db3877c42..7cdf7f69cdcd 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -829,6 +829,47 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } +static int nvme_download_firmware(struct nvme_ns *ns, + struct nvme_dlfw __user *udlfw) +{ + struct nvme_dev *dev = ns->dev; + struct nvme_dlfw dlfw; + struct nvme_command c; + int nents, status; + struct scatterlist *sg; + + if (copy_from_user(&dlfw, udlfw, sizeof(dlfw))) + return -EFAULT; + if (dlfw.length >= (1 << 30)) + return -EINVAL; + + nents = nvme_map_user_pages(dev, 1, dlfw.addr, dlfw.length * 4, &sg); + if (nents < 0) + return nents; + + memset(&c, 0, sizeof(c)); + c.dlfw.opcode = nvme_admin_download_fw; + c.dlfw.numd = cpu_to_le32(dlfw.length); + c.dlfw.offset = cpu_to_le32(dlfw.offset); + nvme_setup_prps(&c.common, sg, dlfw.length * 4); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + nvme_unmap_user_pages(dev, 0, dlfw.addr, dlfw.length * 4, sg, nents); + return status; +} + +static int nvme_activate_firmware(struct nvme_ns *ns, unsigned long arg) +{ + struct nvme_dev *dev = ns->dev; + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_activate_fw; + c.common.rsvd10[0] = cpu_to_le32(arg); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -843,6 +884,10 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, return nvme_get_range_type(ns, arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); + case NVME_IOCTL_DOWNLOAD_FW: + return nvme_download_firmware(ns, (void __user *)arg); + case NVME_IOCTL_ACTIVATE_FW: + return nvme_activate_firmware(ns, arg); default: return -ENOTTY; } -- cgit From db5d0c198d673b6a932b449d4db95a2ad50c755e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 3 Feb 2011 14:36:07 -0500 Subject: NVMe: Release 0.2 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 7cdf7f69cdcd..06a6aeaa827a 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1272,6 +1272,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.1"); +MODULE_VERSION("0.2"); module_init(nvme_init); module_exit(nvme_exit); -- cgit From 
3c0cf138d7789feb3f335f6f1d24ad8fc8b3a23f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 4 Feb 2011 16:03:56 -0500 Subject: NVMe: Allow fatal signals to interrupt I/O If the user sends a fatal signal, sleeping in the TASK_KILLABLE state permits the task to be aborted. The only wrinkle is making sure that if/when the command completes later that it doesn't upset anything. Handle this by setting the data pointer to 0, and checking the value isn't NULL in the sync completion path. Eventually, bios can be cancelled through this path too. Note that the cmdid isn't freed to prevent reuse. We should also abort the command in the future, but this is a good start. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 06a6aeaa827a..4bfed59f3629 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -155,7 +155,9 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, } /* If you need more than four handlers, you'll need to change how - * alloc_cmdid and nvme_process_cq work + * alloc_cmdid and nvme_process_cq work. Also, aborted commands take + * the sync_completion path (if they complete), so don't put anything + * else in slot zero. */ enum { sync_completion_id = 0, @@ -172,6 +174,11 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) return data; } +static void clear_cmdid_data(struct nvme_queue *nvmeq, int cmdid) +{ + nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)] = 0; +} + static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) { int qid, cpu = get_cpu(); @@ -386,6 +393,8 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct sync_cmd_info *cmdinfo = ctx; + if (!cmdinfo) + return; /* Command aborted */ cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; wake_up_process(cmdinfo->task); @@ -446,12 +455,19 @@ static irqreturn_t nvme_irq(int irq, void *data) return nvme_process_cq(data); } +static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) +{ + spin_lock_irq(&nvmeq->q_lock); + clear_cmdid_data(nvmeq, cmdid); + spin_unlock_irq(&nvmeq->q_lock); +} + /* * Returns 0 on success. 
If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct nvme_queue *q, struct nvme_command *cmd, - u32 *result) +static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd, u32 *result) { int cmdid; struct sync_cmd_info cmdinfo; @@ -459,15 +475,20 @@ static int nvme_submit_sync_cmd(struct nvme_queue *q, struct nvme_command *cmd, cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid_killable(q, &cmdinfo, sync_completion_id); + cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion_id); if (cmdid < 0) return cmdid; cmd->common.command_id = cmdid; - set_current_state(TASK_UNINTERRUPTIBLE); - nvme_submit_cmd(q, cmd); + set_current_state(TASK_KILLABLE); + nvme_submit_cmd(nvmeq, cmd); schedule(); + if (cmdinfo.status == -EINTR) { + nvme_abort_command(nvmeq, cmdid); + return -EINTR; + } + if (result) *result = cmdinfo.result; -- cgit From b1ad37efcafe396ac3944853589688dd0ec3c64e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 4 Feb 2011 16:14:30 -0500 Subject: NVMe: Call put_nvmeq() before calling nvme_submit_sync_cmd() We can't have preemption disabled when we call schedule(). Accept the possibility that we'll get preempted, and it'll cost us some cacheline bounces. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 4bfed59f3629..1c3cd6cc0ad9 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -842,8 +842,13 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) nvme_setup_prps(&c.common, sg, length); nvmeq = get_nvmeq(ns); - status = nvme_submit_sync_cmd(nvmeq, &c, &result); + /* Since nvme_submit_sync_cmd sleeps, we can't keep preemption + * disabled. We may be preempted at any point, and be rescheduled + * to a different CPU. That will cause cacheline bouncing, but no + * additional races since q_lock already protects against other CPUs. + */ put_nvmeq(nvmeq); + status = nvme_submit_sync_cmd(nvmeq, &c, &result); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); put_user(result, &uio->result); -- cgit From 58ffacb545f76fc2c65d1fbfa5acf5184a2a09e6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 6 Feb 2011 07:28:06 -0500 Subject: NVMe: Add a module parameter to use a threaded interrupt We're currently calling bio_endio from hard interrupt context. This is not a good idea for preemptible kernels as it will cause longer latencies. Using a threaded interrupt will run the entire queue processing mechanism (including bio_endio) in a thread, which can be preempted. Unfortuantely, it also adds about 7us of latency to the single-I/O case, so make it a module parameter for the moment. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 1c3cd6cc0ad9..60c3786bc787 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -44,6 +44,9 @@ static int nvme_major; module_param(nvme_major, int, 0); +static int use_threaded_interrupts; +module_param(use_threaded_interrupts, int, 0); + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. 
*/ @@ -455,6 +458,25 @@ static irqreturn_t nvme_irq(int irq, void *data) return nvme_process_cq(data); } +static irqreturn_t nvme_irq_thread(int irq, void *data) +{ + irqreturn_t result; + struct nvme_queue *nvmeq = data; + spin_lock(&nvmeq->q_lock); + result = nvme_process_cq(nvmeq); + spin_unlock(&nvmeq->q_lock); + return result; +} + +static irqreturn_t nvme_irq_check(int irq, void *data) +{ + struct nvme_queue *nvmeq = data; + struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; + if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) + return IRQ_NONE; + return IRQ_WAKE_THREAD; +} + static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) { spin_lock_irq(&nvmeq->q_lock); @@ -630,6 +652,11 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, const char *name) { + if (use_threaded_interrupts) + return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, + nvme_irq_check, nvme_irq_thread, + IRQF_DISABLED | IRQF_SHARED, + name, nvmeq); return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, IRQF_DISABLED | IRQF_SHARED, name, nvmeq); } -- cgit From be7b62754e097adc0cb16c25c9ee86ee20de62fb Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 6 Feb 2011 07:53:23 -0500 Subject: NVMe: Use a symbolic name to represent cancelled commands instead of 0 I have plans for other special values in sync_completion. Plus, this is more self-documenting, and lets us detect bogus usages. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 60c3786bc787..802d763d9d06 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -158,15 +159,17 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, } /* If you need more than four handlers, you'll need to change how - * alloc_cmdid and nvme_process_cq work. Also, aborted commands take - * the sync_completion path (if they complete), so don't put anything - * else in slot zero. + * alloc_cmdid and nvme_process_cq work. Consider using a special + * CMD_CTX value instead, if that works for your situation. 
*/ enum { sync_completion_id = 0, bio_completion_id, }; +#define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) +#define CMD_CTX_CANCELLED (0x2008 + CMD_CTX_BASE) + static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { unsigned long data; @@ -177,9 +180,10 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) return data; } -static void clear_cmdid_data(struct nvme_queue *nvmeq, int cmdid) +static void cancel_cmdid_data(struct nvme_queue *nvmeq, int cmdid) { - nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)] = 0; + nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)] = + CMD_CTX_CANCELLED; } static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) @@ -396,8 +400,8 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct sync_cmd_info *cmdinfo = ctx; - if (!cmdinfo) - return; /* Command aborted */ + if ((unsigned long)cmdinfo == CMD_CTX_CANCELLED) + return; cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; wake_up_process(cmdinfo->task); @@ -480,7 +484,7 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) { spin_lock_irq(&nvmeq->q_lock); - clear_cmdid_data(nvmeq, cmdid); + cancel_cmdid_data(nvmeq, cmdid); spin_unlock_irq(&nvmeq->q_lock); } -- cgit From b36235df01ec4141b4e589571d6789076c346d88 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 6 Feb 2011 08:49:55 -0500 Subject: NVMe: Detect commands that are completed twice Set the context value to CMD_CTX_COMPLETED, and print a message in the sync_completion handler if we see it. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 802d763d9d06..2dd09e7e142d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -169,12 +169,15 @@ enum { #define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) #define CMD_CTX_CANCELLED (0x2008 + CMD_CTX_BASE) +#define CMD_CTX_COMPLETED (0x2010 + CMD_CTX_BASE) static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { unsigned long data; + unsigned offset = cmdid + BITS_TO_LONGS(nvmeq->q_depth); - data = nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)]; + data = nvmeq->cmdid_data[offset]; + nvmeq->cmdid_data[offset] = CMD_CTX_COMPLETED; clear_bit(cmdid, nvmeq->cmdid_data); wake_up(&nvmeq->sq_full); return data; @@ -182,8 +185,8 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) static void cancel_cmdid_data(struct nvme_queue *nvmeq, int cmdid) { - nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)] = - CMD_CTX_CANCELLED; + unsigned offset = cmdid + BITS_TO_LONGS(nvmeq->q_depth); + nvmeq->cmdid_data[offset] = CMD_CTX_CANCELLED; } static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) @@ -402,6 +405,12 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct sync_cmd_info *cmdinfo = ctx; if ((unsigned long)cmdinfo == CMD_CTX_CANCELLED) return; + if (unlikely((unsigned long)cmdinfo == CMD_CTX_COMPLETED)) { + dev_warn(nvmeq->q_dmadev, + "completed id %d twice on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; wake_up_process(cmdinfo->task); -- cgit From 48e3d39816416b3bf03dee3a796c0c04427c1a31 Mon Sep 17 00:00:00 2001 
From: Matthew Wilcox Date: Sun, 6 Feb 2011 08:51:15 -0500 Subject: NVMe: Detect command IDs completing that are out of range If the adapter completes a command ID that is outside the bounds of the array, return CMD_CTX_INVALID instead of random data, and print a message in the sync_completion handler (which is rapidly becoming the misc completion handler :-) Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 2dd09e7e142d..f4085d4fe0f2 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -170,12 +170,15 @@ enum { #define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) #define CMD_CTX_CANCELLED (0x2008 + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x2010 + CMD_CTX_BASE) +#define CMD_CTX_INVALID (0x2014 + CMD_CTX_BASE) static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { unsigned long data; unsigned offset = cmdid + BITS_TO_LONGS(nvmeq->q_depth); + if (cmdid > nvmeq->q_depth) + return CMD_CTX_INVALID; data = nvmeq->cmdid_data[offset]; nvmeq->cmdid_data[offset] = CMD_CTX_COMPLETED; clear_bit(cmdid, nvmeq->cmdid_data); @@ -411,6 +414,12 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } + if (unlikely((unsigned long)cmdinfo == CMD_CTX_INVALID)) { + dev_warn(nvmeq->q_dmadev, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; wake_up_process(cmdinfo->task); -- cgit From ec6ce618d65b5ce1bef83a5509255107a0feac44 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 6 Feb 2011 09:01:00 -0500 Subject: NVMe: Need to lock queue during interrupt handling If we're sharing a queue between multiple CPUs and we cancel a sync I/O, we must have the queue locked to avoid corrupting the stack of the thread that submitted the I/O. It turns out this is the same locking that's needed for the threaded irq handler, so share that code. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f4085d4fe0f2..139e6fc1e2a8 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -476,11 +476,6 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) } static irqreturn_t nvme_irq(int irq, void *data) -{ - return nvme_process_cq(data); -} - -static irqreturn_t nvme_irq_thread(int irq, void *data) { irqreturn_t result; struct nvme_queue *nvmeq = data; @@ -676,7 +671,7 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, { if (use_threaded_interrupts) return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, - nvme_irq_check, nvme_irq_thread, + nvme_irq_check, nvme_irq, IRQF_DISABLED | IRQF_SHARED, name, nvmeq); return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, -- cgit From e85248e516c550382ba33ca325c272a0ca397e44 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 6 Feb 2011 18:30:16 -0500 Subject: NVMe: Record the timeout for each command In addition to recording the completion data for each command, record the anticipated completion time. Choose a timeout of 5 seconds for normal I/Os and 60 seconds for admin I/Os. 
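A sketch of the bookkeeping this enables; the constants and the nvme_cmd_info layout match the diff below, while the helper functions and the expiry test are only an illustration of how the recorded deadline might later be consumed:

#include <linux/jiffies.h>
#include <linux/types.h>

#define IO_TIMEOUT	(5 * HZ)	/* ordinary reads and writes */
#define ADMIN_TIMEOUT	(60 * HZ)	/* admin queue commands */

struct nvme_cmd_info {
	unsigned long ctx;		/* completion context | handler id */
	unsigned long timeout;		/* absolute deadline, in jiffies */
};

static void example_stamp_cmd(struct nvme_cmd_info *info, int cmdid, unsigned timeout)
{
	info[cmdid].timeout = jiffies + timeout;	/* set when the cmdid is allocated */
}

static bool example_cmd_expired(struct nvme_cmd_info *info, int cmdid)
{
	return time_after(jiffies, info[cmdid].timeout);	/* safe across jiffies wraparound */
}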
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 139e6fc1e2a8..60c1048dc8bc 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -41,6 +41,8 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define NVME_MINORS 64 +#define IO_TIMEOUT (5 * HZ) +#define ADMIN_TIMEOUT (60 * HZ) static int nvme_major; module_param(nvme_major, int, 0); @@ -119,6 +121,16 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); } +struct nvme_cmd_info { + unsigned long ctx; + unsigned long timeout; +}; + +static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) +{ + return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; +} + /** * alloc_cmdid - Allocate a Command ID * @param nvmeq The queue that will be used for this command @@ -131,10 +143,11 @@ static inline void _nvme_check_size(void) * Passing in a pointer that's not 4-byte aligned will cause a BUG. * We can change this if it becomes a problem. */ -static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler) +static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler, + unsigned timeout) { int depth = nvmeq->q_depth; - unsigned long data = (unsigned long)ctx | handler; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); int cmdid; BUG_ON((unsigned long)ctx & 3); @@ -145,16 +158,17 @@ static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler) return -EBUSY; } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); - nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(depth)] = data; + info[cmdid].ctx = (unsigned long)ctx | handler; + info[cmdid].timeout = jiffies + timeout; return cmdid; } static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, - int handler) + int handler, unsigned timeout) { int cmdid; wait_event_killable(nvmeq->sq_full, - (cmdid = alloc_cmdid(nvmeq, ctx, handler)) >= 0); + (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); return (cmdid < 0) ? 
-EINTR : cmdid; } @@ -175,12 +189,12 @@ enum { static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { unsigned long data; - unsigned offset = cmdid + BITS_TO_LONGS(nvmeq->q_depth); + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (cmdid > nvmeq->q_depth) + if (cmdid >= nvmeq->q_depth) return CMD_CTX_INVALID; - data = nvmeq->cmdid_data[offset]; - nvmeq->cmdid_data[offset] = CMD_CTX_COMPLETED; + data = info[cmdid].ctx; + info[cmdid].ctx = CMD_CTX_COMPLETED; clear_bit(cmdid, nvmeq->cmdid_data); wake_up(&nvmeq->sq_full); return data; @@ -188,8 +202,8 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) static void cancel_cmdid_data(struct nvme_queue *nvmeq, int cmdid) { - unsigned offset = cmdid + BITS_TO_LONGS(nvmeq->q_depth); - nvmeq->cmdid_data[offset] = CMD_CTX_CANCELLED; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + info[cmdid].ctx = CMD_CTX_CANCELLED; } static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) @@ -327,7 +341,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, goto congestion; info->bio = bio; - cmdid = alloc_cmdid(nvmeq, info, bio_completion_id); + cmdid = alloc_cmdid(nvmeq, info, bio_completion_id, IO_TIMEOUT); if (unlikely(cmdid < 0)) goto free_info; @@ -506,7 +520,7 @@ static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) * if the result is positive, it's an NVM Express status code */ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, - struct nvme_command *cmd, u32 *result) + struct nvme_command *cmd, u32 *result, unsigned timeout) { int cmdid; struct sync_cmd_info cmdinfo; @@ -514,7 +528,8 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion_id); + cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion_id, + timeout); if (cmdid < 0) return cmdid; cmd->common.command_id = cmdid; @@ -537,7 +552,7 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(dev->queues[0], cmd, result); + return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -630,7 +645,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth, int vector) { struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = (depth + BITS_TO_LONGS(depth)) * sizeof(long); + unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info)); struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); if (!nvmeq) return NULL; @@ -892,7 +907,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) * additional races since q_lock already protects against other CPUs. */ put_nvmeq(nvmeq); - status = nvme_submit_sync_cmd(nvmeq, &c, &result); + status = nvme_submit_sync_cmd(nvmeq, &c, &result, IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); put_user(result, &uio->result); -- cgit From 9294bbed78926a895516ec016ba23033f58d1a88 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 7 Feb 2011 12:45:24 -0500 Subject: NVMe: Handle the congestion list a little better In the bio completion handler, check for bios on the congestion list for this NVM queue. 
Also, lock the congestion list in the make_request function as the queue may end up being shared between multiple CPUs. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 60c1048dc8bc..2a0dd5e60347 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -105,6 +105,8 @@ struct nvme_queue { unsigned long cmdid_data[]; }; +static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio); + /* * Check we didin't inadvertently grow the command struct */ @@ -274,6 +276,9 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); free_info(info); bio_endio(bio, status ? -EIO : 0); + bio = bio_list_pop(&nvmeq->sq_cong); + if (bio) + nvme_resubmit_bio(nvmeq, bio); } /* length is in bytes */ @@ -392,6 +397,16 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return -EBUSY; } +static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio) +{ + struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + if (nvme_submit_bio_queue(nvmeq, ns, bio)) + bio_list_add_head(&nvmeq->sq_cong, bio); + else if (bio_list_empty(&nvmeq->sq_cong)) + blk_clear_queue_congested(ns->queue, rw_is_sync(bio->bi_rw)); + /* XXX: Need to duplicate the logic from __freed_request here */ +} + /* * NB: return value of non-zero would mean that we were a stacking driver. * make_request must always succeed. @@ -403,7 +418,9 @@ static int nvme_make_request(struct request_queue *q, struct bio *bio) if (nvme_submit_bio_queue(nvmeq, ns, bio)) { blk_set_queue_congested(q, rw_is_sync(bio->bi_rw)); + spin_lock_irq(&nvmeq->q_lock); bio_list_add(&nvmeq->sq_cong, bio); + spin_unlock_irq(&nvmeq->q_lock); } put_nvmeq(nvmeq); -- cgit From d2d8703481f60d67f49e3177196cbe474b11377c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 7 Feb 2011 15:55:59 -0500 Subject: NVMe: Renumber the special context values If POISON_POINTER_DELTA isn't defined, ensure they're in page 0 which should never be mapped. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 2a0dd5e60347..71bdf6f2c93b 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -184,9 +184,9 @@ enum { }; #define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) -#define CMD_CTX_CANCELLED (0x2008 + CMD_CTX_BASE) -#define CMD_CTX_COMPLETED (0x2010 + CMD_CTX_BASE) -#define CMD_CTX_INVALID (0x2014 + CMD_CTX_BASE) +#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) +#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) +#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { -- cgit From 51882d00f07da9601cc962a3596e48aafb4f4163 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 08:49:59 -0500 Subject: NVMe: Advance the sg pointer when filling in an sg list For multipage BIOs, we were always using sg[0] instead of advancing through the list. 
Oops :-) Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 71bdf6f2c93b..903e7f15b60d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -321,6 +321,7 @@ static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, sg_init_table(sg, psegs); bio_for_each_segment(bvec, bio, i) { sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); + sg++; /* XXX: handle non-mergable here */ nsegs++; } -- cgit From e025344c56e08b155f43ea09647969286c78377c Mon Sep 17 00:00:00 2001 From: Shane Michael Matthews Date: Thu, 10 Feb 2011 08:51:24 -0500 Subject: NVMe: Initial PRP List support Add a pointer to the nvme_req_info to hold a new data structure (nvme_prps) which contains a list of the pages allocated to this particular request for holding PRP list entries. nvme_setup_prps() now returns this pointer. To allocate and free the memory used for PRP lists, we need a struct device, so we need to pass the nvme_queue pointer to many functions which didn't use to need it. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 104 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 91 insertions(+), 13 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 903e7f15b60d..b1e8445985a2 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -247,21 +247,55 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } +static __le64 *alloc_prp_list(struct nvme_queue *nvmeq, int length, + dma_addr_t *addr) +{ + return dma_alloc_coherent(nvmeq->q_dmadev, PAGE_SIZE, addr, GFP_ATOMIC); +} + +struct nvme_prps { + int npages; + dma_addr_t first_dma; + __le64 *list[0]; +}; + +static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) +{ + const int last_prp = PAGE_SIZE / 8 - 1; + int i; + dma_addr_t prp_dma; + + if (!prps) + return; + + prp_dma = prps->first_dma; + for (i = 0; i < prps->npages; i++) { + __le64 *prp_list = prps->list[i]; + dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); + dma_free_coherent(nvmeq->q_dmadev, PAGE_SIZE, prp_list, + prp_dma); + prp_dma = next_prp_dma; + } + kfree(prps); +} + struct nvme_req_info { struct bio *bio; int nents; + struct nvme_prps *prps; struct scatterlist sg[0]; }; /* XXX: use a mempool */ static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp) { - return kmalloc(sizeof(struct nvme_req_info) + + return kzalloc(sizeof(struct nvme_req_info) + sizeof(struct scatterlist) * nseg, gfp); } -static void free_info(struct nvme_req_info *info) +static void free_info(struct nvme_queue *nvmeq, struct nvme_req_info *info) { + nvme_free_prps(nvmeq, info->prps); kfree(info); } @@ -274,7 +308,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - free_info(info); + free_info(nvmeq, info); bio_endio(bio, status ? 
-EIO : 0); bio = bio_list_pop(&nvmeq->sq_cong); if (bio) @@ -282,17 +316,22 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, } /* length is in bytes */ -static void nvme_setup_prps(struct nvme_common_command *cmd, +static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, + struct nvme_common_command *cmd, struct scatterlist *sg, int length) { int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); int offset = offset_in_page(dma_addr); + __le64 *prp_list; + dma_addr_t prp_dma; + int nprps, npages, i, prp_page; + struct nvme_prps *prps = NULL; cmd->prp1 = cpu_to_le64(dma_addr); length -= (PAGE_SIZE - offset); if (length <= 0) - return; + return prps; dma_len -= (PAGE_SIZE - offset); if (dma_len) { @@ -305,10 +344,42 @@ static void nvme_setup_prps(struct nvme_common_command *cmd, if (length <= PAGE_SIZE) { cmd->prp2 = cpu_to_le64(dma_addr); - return; + return prps; } - /* XXX: support PRP lists */ + nprps = DIV_ROUND_UP(length, PAGE_SIZE); + npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE); + prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, GFP_ATOMIC); + prps->npages = npages; + prp_page = 0; + prp_list = alloc_prp_list(nvmeq, length, &prp_dma); + prps->list[prp_page++] = prp_list; + prps->first_dma = prp_dma; + cmd->prp2 = cpu_to_le64(prp_dma); + i = 0; + for (;;) { + if (i == PAGE_SIZE / 8 - 1) { + __le64 *old_prp_list = prp_list; + prp_list = alloc_prp_list(nvmeq, length, &prp_dma); + prps->list[prp_page++] = prp_list; + old_prp_list[i] = cpu_to_le64(prp_dma); + i = 0; + } + prp_list[i++] = cpu_to_le64(dma_addr); + dma_len -= PAGE_SIZE; + dma_addr += PAGE_SIZE; + length -= PAGE_SIZE; + if (length <= 0) + break; + if (dma_len > 0) + continue; + BUG_ON(dma_len < 0); + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + return prps; } static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, @@ -378,7 +449,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - nvme_setup_prps(&cmnd->common, info->sg, bio->bi_size); + info->prps = nvme_setup_prps(nvmeq, &cmnd->common, info->sg, + bio->bi_size); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); @@ -393,7 +465,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; free_info: - free_info(info); + free_info(nvmeq, info); congestion: return -EBUSY; } @@ -852,13 +924,15 @@ static int nvme_submit_user_admin_command(struct nvme_dev *dev, { int err, nents; struct scatterlist *sg; + struct nvme_prps *prps; nents = nvme_map_user_pages(dev, 0, addr, length, &sg); if (nents < 0) return nents; - nvme_setup_prps(&cmd->common, sg, length); + prps = nvme_setup_prps(dev->queues[0], &cmd->common, sg, length); err = nvme_submit_admin_cmd(dev, cmd, NULL); nvme_unmap_user_pages(dev, 0, addr, length, sg, nents); + nvme_free_prps(dev->queues[0], prps); return err ? -EIO : 0; } @@ -896,6 +970,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) u32 result; int nents, status; struct scatterlist *sg; + struct nvme_prps *prps; if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; @@ -915,10 +990,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.reftag = cpu_to_le32(io.reftag); /* XXX: endian? 
*/ c.rw.apptag = cpu_to_le16(io.apptag); c.rw.appmask = cpu_to_le16(io.appmask); + nvmeq = get_nvmeq(ns); /* XXX: metadata */ - nvme_setup_prps(&c.common, sg, length); + prps = nvme_setup_prps(nvmeq, &c.common, sg, length); - nvmeq = get_nvmeq(ns); /* Since nvme_submit_sync_cmd sleeps, we can't keep preemption * disabled. We may be preempted at any point, and be rescheduled * to a different CPU. That will cause cacheline bouncing, but no @@ -928,6 +1003,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) status = nvme_submit_sync_cmd(nvmeq, &c, &result, IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); + nvme_free_prps(nvmeq, prps); put_user(result, &uio->result); return status; } @@ -940,6 +1016,7 @@ static int nvme_download_firmware(struct nvme_ns *ns, struct nvme_command c; int nents, status; struct scatterlist *sg; + struct nvme_prps *prps; if (copy_from_user(&dlfw, udlfw, sizeof(dlfw))) return -EFAULT; @@ -954,10 +1031,11 @@ static int nvme_download_firmware(struct nvme_ns *ns, c.dlfw.opcode = nvme_admin_download_fw; c.dlfw.numd = cpu_to_le32(dlfw.length); c.dlfw.offset = cpu_to_le32(dlfw.offset); - nvme_setup_prps(&c.common, sg, dlfw.length * 4); + prps = nvme_setup_prps(dev->queues[0], &c.common, sg, dlfw.length * 4); status = nvme_submit_admin_cmd(dev, &c, NULL); nvme_unmap_user_pages(dev, 0, dlfw.addr, dlfw.length * 4, sg, nents); + nvme_free_prps(dev->queues[0], prps); return status; } -- cgit From d534df3c730af9073a9ddc076d9fd65cbdca22b3 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 09:03:06 -0500 Subject: NVMe: Rename nvme_req_info to nvme_bio There are too many things called 'info' in this driver. This data structure is auxiliary information for a struct bio, so call it nvme_bio, or nbio when used as a variable. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b1e8445985a2..11df0e90edad 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -279,7 +279,7 @@ static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) kfree(prps); } -struct nvme_req_info { +struct nvme_bio { struct bio *bio; int nents; struct nvme_prps *prps; @@ -287,28 +287,28 @@ struct nvme_req_info { }; /* XXX: use a mempool */ -static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp) +static struct nvme_bio *alloc_nbio(unsigned nseg, gfp_t gfp) { - return kzalloc(sizeof(struct nvme_req_info) + + return kzalloc(sizeof(struct nvme_bio) + sizeof(struct scatterlist) * nseg, gfp); } -static void free_info(struct nvme_queue *nvmeq, struct nvme_req_info *info) +static void free_nbio(struct nvme_queue *nvmeq, struct nvme_bio *nbio) { - nvme_free_prps(nvmeq, info->prps); - kfree(info); + nvme_free_prps(nvmeq, nbio->prps); + kfree(nbio); } static void bio_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { - struct nvme_req_info *info = ctx; - struct bio *bio = info->bio; + struct nvme_bio *nbio = ctx; + struct bio *bio = nbio->bio; u16 status = le16_to_cpup(&cqe->status) >> 1; - dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents, + dma_unmap_sg(nvmeq->q_dmadev, nbio->sg, nbio->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - free_info(nvmeq, info); + free_nbio(nvmeq, nbio); bio_endio(bio, status ? 
-EIO : 0); bio = bio_list_pop(&nvmeq->sq_cong); if (bio) @@ -382,11 +382,11 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, return prps; } -static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, +static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { struct bio_vec *bvec; - struct scatterlist *sg = info->sg; + struct scatterlist *sg = nbio->sg; int i, nsegs; sg_init_table(sg, psegs); @@ -396,16 +396,16 @@ static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, /* XXX: handle non-mergable here */ nsegs++; } - info->nents = nsegs; + nbio->nents = nsegs; - return dma_map_sg(dev, info->sg, info->nents, dma_dir); + return dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir); } static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio) { struct nvme_command *cmnd; - struct nvme_req_info *info; + struct nvme_bio *nbio; enum dma_data_direction dma_dir; int cmdid; u16 control; @@ -413,14 +413,14 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, unsigned long flags; int psegs = bio_phys_segments(ns->queue, bio); - info = alloc_info(psegs, GFP_NOIO); - if (!info) + nbio = alloc_nbio(psegs, GFP_NOIO); + if (!nbio) goto congestion; - info->bio = bio; + nbio->bio = bio; - cmdid = alloc_cmdid(nvmeq, info, bio_completion_id, IO_TIMEOUT); + cmdid = alloc_cmdid(nvmeq, nbio, bio_completion_id, IO_TIMEOUT); if (unlikely(cmdid < 0)) - goto free_info; + goto free_nbio; control = 0; if (bio->bi_rw & REQ_FUA) @@ -444,12 +444,12 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } - nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs); + nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs); cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - info->prps = nvme_setup_prps(nvmeq, &cmnd->common, info->sg, + nbio->prps = nvme_setup_prps(nvmeq, &cmnd->common, nbio->sg, bio->bi_size); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); @@ -464,8 +464,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; - free_info: - free_info(nvmeq, info); + free_nbio: + free_nbio(nvmeq, nbio); congestion: return -EBUSY; } -- cgit From 091b609258b8e01cc45b01a41ca5e496f674d989 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 09:56:01 -0500 Subject: NVMe: Switch to use DMA Pool API Calling dma_free_coherent from interrupt context causes warnings. Using the DMA pools delays freeing until pool destruction, so avoids the problem. 
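Reduced to a self-contained sketch, the pool lifecycle looks like this; the pool name and geometry match the patch, while the surrounding function is illustrative:

#include <linux/dmapool.h>

static int example_prp_pool(struct device *dmadev)
{
	struct dma_pool *pool;
	__le64 *prp_list;
	dma_addr_t prp_dma;

	pool = dma_pool_create("prp list page", dmadev, PAGE_SIZE, PAGE_SIZE, 0);
	if (!pool)
		return -ENOMEM;

	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);	/* usable from atomic context */
	if (prp_list)
		dma_pool_free(pool, prp_list, prp_dma);	/* unlike dma_free_coherent(), fine in IRQ context */

	dma_pool_destroy(pool);		/* pages go back to the system only here */
	return 0;
}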
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 11df0e90edad..80fe6a7a8163 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -57,6 +57,7 @@ struct nvme_dev { struct nvme_queue **queues; u32 __iomem *dbs; struct pci_dev *pci_dev; + struct dma_pool *prp_page_pool; int instance; int queue_count; u32 ctrl_config; @@ -88,6 +89,7 @@ struct nvme_ns { */ struct nvme_queue { struct device *q_dmadev; + struct nvme_dev *dev; spinlock_t q_lock; struct nvme_command *sq_cmds; volatile struct nvme_completion *cqes; @@ -247,10 +249,9 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } -static __le64 *alloc_prp_list(struct nvme_queue *nvmeq, int length, - dma_addr_t *addr) +static __le64 *alloc_prp_list(struct nvme_dev *dev, dma_addr_t *addr) { - return dma_alloc_coherent(nvmeq->q_dmadev, PAGE_SIZE, addr, GFP_ATOMIC); + return dma_pool_alloc(dev->prp_page_pool, GFP_ATOMIC, addr); } struct nvme_prps { @@ -262,6 +263,7 @@ struct nvme_prps { static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) { const int last_prp = PAGE_SIZE / 8 - 1; + struct nvme_dev *dev = nvmeq->dev; int i; dma_addr_t prp_dma; @@ -272,8 +274,7 @@ static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) for (i = 0; i < prps->npages; i++) { __le64 *prp_list = prps->list[i]; dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); - dma_free_coherent(nvmeq->q_dmadev, PAGE_SIZE, prp_list, - prp_dma); + dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); prp_dma = next_prp_dma; } kfree(prps); @@ -320,6 +321,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, struct nvme_common_command *cmd, struct scatterlist *sg, int length) { + struct nvme_dev *dev = nvmeq->dev; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); int offset = offset_in_page(dma_addr); @@ -352,7 +354,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, GFP_ATOMIC); prps->npages = npages; prp_page = 0; - prp_list = alloc_prp_list(nvmeq, length, &prp_dma); + prp_list = alloc_prp_list(dev, &prp_dma); prps->list[prp_page++] = prp_list; prps->first_dma = prp_dma; cmd->prp2 = cpu_to_le64(prp_dma); @@ -360,7 +362,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, for (;;) { if (i == PAGE_SIZE / 8 - 1) { __le64 *old_prp_list = prp_list; - prp_list = alloc_prp_list(nvmeq, length, &prp_dma); + prp_list = alloc_prp_list(dev, &prp_dma); prps->list[prp_page++] = prp_list; old_prp_list[i] = cpu_to_le64(prp_dma); i = 0; @@ -752,6 +754,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, goto free_cqdma; nvmeq->q_dmadev = dmadev; + nvmeq->dev = dev; spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; @@ -1302,6 +1305,22 @@ static int nvme_dev_remove(struct nvme_dev *dev) return 0; } +static int nvme_setup_prp_pools(struct nvme_dev *dev) +{ + struct device *dmadev = &dev->pci_dev->dev; + dev->prp_page_pool = dma_pool_create("prp list page", dmadev, + PAGE_SIZE, PAGE_SIZE, 0); + if (!dev->prp_page_pool) + return -ENOMEM; + + return 0; +} + +static void nvme_release_prp_pools(struct nvme_dev *dev) +{ + dma_pool_destroy(dev->prp_page_pool); +} + /* XXX: Use an ida or something to let remove / add work correctly */ static void 
nvme_set_instance(struct nvme_dev *dev) { @@ -1346,6 +1365,10 @@ static int __devinit nvme_probe(struct pci_dev *pdev, nvme_set_instance(dev); dev->entry[0].vector = pdev->irq; + result = nvme_setup_prp_pools(dev); + if (result) + goto disable_msix; + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) { result = -ENOMEM; @@ -1369,6 +1392,7 @@ static int __devinit nvme_probe(struct pci_dev *pdev, disable_msix: pci_disable_msix(pdev); nvme_release_instance(dev); + nvme_release_prp_pools(dev); disable: pci_disable_device(pdev); pci_release_regions(pdev); @@ -1386,6 +1410,7 @@ static void __devexit nvme_remove(struct pci_dev *pdev) pci_disable_msix(pdev); iounmap(dev->bar); nvme_release_instance(dev); + nvme_release_prp_pools(dev); pci_disable_device(pdev); pci_release_regions(pdev); kfree(dev->queues); -- cgit From 99802a7aee2b3dd720e382c52b892cc6a8122b11 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 10:30:34 -0500 Subject: NVMe: Optimise memory usage for I/Os between 4k and 128k Add a second memory pool for smaller I/Os. We can pack 16 of these on a single page instead of using an entire page for each one. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 80fe6a7a8163..cd7aeba8310b 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -58,6 +58,7 @@ struct nvme_dev { u32 __iomem *dbs; struct pci_dev *pci_dev; struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; int instance; int queue_count; u32 ctrl_config; @@ -249,11 +250,6 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } -static __le64 *alloc_prp_list(struct nvme_dev *dev, dma_addr_t *addr) -{ - return dma_pool_alloc(dev->prp_page_pool, GFP_ATOMIC, addr); -} - struct nvme_prps { int npages; dma_addr_t first_dma; @@ -271,6 +267,9 @@ static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) return; prp_dma = prps->first_dma; + + if (prps->npages == 0) + dma_pool_free(dev->prp_small_pool, prps->list[0], prp_dma); for (i = 0; i < prps->npages; i++) { __le64 *prp_list = prps->list[i]; dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); @@ -322,6 +321,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, struct scatterlist *sg, int length) { struct nvme_dev *dev = nvmeq->dev; + struct dma_pool *pool; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); int offset = offset_in_page(dma_addr); @@ -352,9 +352,16 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, nprps = DIV_ROUND_UP(length, PAGE_SIZE); npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE); prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, GFP_ATOMIC); - prps->npages = npages; prp_page = 0; - prp_list = alloc_prp_list(dev, &prp_dma); + if (nprps <= (256 / 8)) { + pool = dev->prp_small_pool; + prps->npages = 0; + } else { + pool = dev->prp_page_pool; + prps->npages = npages; + } + + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); prps->list[prp_page++] = prp_list; prps->first_dma = prp_dma; cmd->prp2 = cpu_to_le64(prp_dma); @@ -362,7 +369,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, for (;;) { if (i == PAGE_SIZE / 8 - 1) { __le64 *old_prp_list = prp_list; - prp_list = alloc_prp_list(dev, &prp_dma); + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); prps->list[prp_page++] = prp_list; 
old_prp_list[i] = cpu_to_le64(prp_dma); i = 0; @@ -1313,12 +1320,20 @@ static int nvme_setup_prp_pools(struct nvme_dev *dev) if (!dev->prp_page_pool) return -ENOMEM; + /* Optimisation for I/Os between 4k and 128k */ + dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, + 256, 256, 0); + if (!dev->prp_small_pool) { + dma_pool_destroy(dev->prp_page_pool); + return -ENOMEM; + } return 0; } static void nvme_release_prp_pools(struct nvme_dev *dev) { dma_pool_destroy(dev->prp_page_pool); + dma_pool_destroy(dev->prp_small_pool); } /* XXX: Use an ida or something to let remove / add work correctly */ -- cgit From d567760c409f981d35fc755b51d5bf56a99a467b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 10:47:55 -0500 Subject: NVMe: Pass the nvme_dev to nvme_free_prps and nvme_setup_prps We were passing the nvme_queue to access the q_dmadev for the dma_alloc_coherent calls, but since we moved to the dma pool API, we really only need the nvme_dev. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index cd7aeba8310b..2948043483fe 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -256,10 +256,9 @@ struct nvme_prps { __le64 *list[0]; }; -static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) +static void nvme_free_prps(struct nvme_dev *dev, struct nvme_prps *prps) { const int last_prp = PAGE_SIZE / 8 - 1; - struct nvme_dev *dev = nvmeq->dev; int i; dma_addr_t prp_dma; @@ -295,7 +294,7 @@ static struct nvme_bio *alloc_nbio(unsigned nseg, gfp_t gfp) static void free_nbio(struct nvme_queue *nvmeq, struct nvme_bio *nbio) { - nvme_free_prps(nvmeq, nbio->prps); + nvme_free_prps(nvmeq->dev, nbio->prps); kfree(nbio); } @@ -316,11 +315,10 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, } /* length is in bytes */ -static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, +static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, struct scatterlist *sg, int length) { - struct nvme_dev *dev = nvmeq->dev; struct dma_pool *pool; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); @@ -458,7 +456,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - nbio->prps = nvme_setup_prps(nvmeq, &cmnd->common, nbio->sg, + nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg, bio->bi_size); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); @@ -939,10 +937,10 @@ static int nvme_submit_user_admin_command(struct nvme_dev *dev, nents = nvme_map_user_pages(dev, 0, addr, length, &sg); if (nents < 0) return nents; - prps = nvme_setup_prps(dev->queues[0], &cmd->common, sg, length); + prps = nvme_setup_prps(dev, &cmd->common, sg, length); err = nvme_submit_admin_cmd(dev, cmd, NULL); nvme_unmap_user_pages(dev, 0, addr, length, sg, nents); - nvme_free_prps(dev->queues[0], prps); + nvme_free_prps(dev, prps); return err ? -EIO : 0; } @@ -1000,10 +998,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.reftag = cpu_to_le32(io.reftag); /* XXX: endian? 
*/ c.rw.apptag = cpu_to_le16(io.apptag); c.rw.appmask = cpu_to_le16(io.appmask); - nvmeq = get_nvmeq(ns); /* XXX: metadata */ - prps = nvme_setup_prps(nvmeq, &c.common, sg, length); + prps = nvme_setup_prps(dev, &c.common, sg, length); + nvmeq = get_nvmeq(ns); /* Since nvme_submit_sync_cmd sleeps, we can't keep preemption * disabled. We may be preempted at any point, and be rescheduled * to a different CPU. That will cause cacheline bouncing, but no @@ -1013,7 +1011,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) status = nvme_submit_sync_cmd(nvmeq, &c, &result, IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); - nvme_free_prps(nvmeq, prps); + nvme_free_prps(dev, prps); put_user(result, &uio->result); return status; } @@ -1041,11 +1039,11 @@ static int nvme_download_firmware(struct nvme_ns *ns, c.dlfw.opcode = nvme_admin_download_fw; c.dlfw.numd = cpu_to_le32(dlfw.length); c.dlfw.offset = cpu_to_le32(dlfw.offset); - prps = nvme_setup_prps(dev->queues[0], &c.common, sg, dlfw.length * 4); + prps = nvme_setup_prps(dev, &c.common, sg, dlfw.length * 4); status = nvme_submit_admin_cmd(dev, &c, NULL); nvme_unmap_user_pages(dev, 0, dlfw.addr, dlfw.length * 4, sg, nents); - nvme_free_prps(dev->queues[0], prps); + nvme_free_prps(dev, prps); return status; } -- cgit From 1974b1ae8852324a75fb8cfecbc7b758fd5a2c3c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 12:01:09 -0500 Subject: NVMe: Check for DMA mapping failure If dma_map_sg returns 0 (failure), we need to fail the I/O. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 2948043483fe..bfdca3a3a41a 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -451,7 +451,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } - nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs); + if (nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs) == 0) + goto mapping_failed; cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; @@ -471,6 +472,11 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; + mapping_failed: + free_nbio(nvmeq, nbio); + bio_endio(bio, -ENOMEM); + return 0; + free_nbio: free_nbio(nvmeq, nbio); congestion: -- cgit From 768308400f5b4ce665a072eb976a851978b7706e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 13:55:39 -0500 Subject: NVMe: Handle physical merging of bvec entries In order to not overrun the sg array, we have to merge physically contiguous pages into a single sg entry. 
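In plain (non-diff) form, the merge test at the heart of the change reads roughly as below; this is simplified from the hunk that follows and assumes the surrounding nvme_map_bio() context (bio, nbio, psegs):

	struct bio_vec *bvec, *bvprv = NULL;
	struct scatterlist *sg = NULL;
	int i, nsegs = 0;

	sg_init_table(nbio->sg, psegs);
	bio_for_each_segment(bvec, bio, i) {
		if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
			sg->length += bvec->bv_len;	/* physically contiguous: extend the previous entry */
		} else {
			sg = sg ? sg + 1 : nbio->sg;	/* otherwise start a new scatterlist entry */
			sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
			nsegs++;
		}
		bvprv = bvec;
	}
	sg_mark_end(sg);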
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index bfdca3a3a41a..c0e84b688f50 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -392,19 +392,25 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { - struct bio_vec *bvec; - struct scatterlist *sg = nbio->sg; - int i, nsegs; + struct bio_vec *bvec, *bvprv = NULL; + struct scatterlist *sg = NULL; + int i, nsegs = 0; - sg_init_table(sg, psegs); + sg_init_table(nbio->sg, psegs); bio_for_each_segment(bvec, bio, i) { - sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); - sg++; - /* XXX: handle non-mergable here */ - nsegs++; + if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { + sg->length += bvec->bv_len; + } else { + /* Check bvprv && offset == 0 */ + sg = sg ? sg + 1 : nbio->sg; + sg_set_page(sg, bvec->bv_page, bvec->bv_len, + bvec->bv_offset); + nsegs++; + } + bvprv = bvec; } nbio->nents = nsegs; - + sg_mark_end(sg); return dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir); } -- cgit From eeee322647a67c20d9277c5e02c42b2126ea74bc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 14 Feb 2011 15:55:33 -0500 Subject: NVMe: Handle failures differently in nvme_submit_bio_queue() Return -EBUSY if the queue is full or -ENOMEM if we failed to allocate memory (or map a scatterlist). Also use GFP_ATOMIC to allocate the nvme_bio and move the locking to the callers of nvme_submit_bio_queue(). In nvme_make_request(), don't permit an I/O to jump the queue -- if the congestion list already has an entry, just add to the tail, rather than trying to submit. 
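With the failure handling moved out, the caller now owns the queue lock and decides what to do with a bio that could not be issued. A rough sketch of the new contract (illustration only, error paths trimmed):

	spin_lock_irq(&nvmeq->q_lock);
	result = nvme_submit_bio_queue(nvmeq, ns, bio);
	if (result)					/* -EBUSY: queue full, -ENOMEM: allocation or mapping failed */
		bio_list_add(&nvmeq->sq_cong, bio);	/* retry later from the congestion list */
	spin_unlock_irq(&nvmeq->q_lock);

Since the allocations now happen with the queue lock held and interrupts disabled, GFP_ATOMIC is the only safe choice; GFP_NOIO could sleep.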
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index c0e84b688f50..61a241741ca6 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -420,17 +420,17 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct nvme_command *cmnd; struct nvme_bio *nbio; enum dma_data_direction dma_dir; - int cmdid; + int cmdid, result = -ENOMEM; u16 control; u32 dsmgmt; - unsigned long flags; int psegs = bio_phys_segments(ns->queue, bio); - nbio = alloc_nbio(psegs, GFP_NOIO); + nbio = alloc_nbio(psegs, GFP_ATOMIC); if (!nbio) - goto congestion; + goto nomem; nbio->bio = bio; + result = -EBUSY; cmdid = alloc_cmdid(nvmeq, nbio, bio_completion_id, IO_TIMEOUT); if (unlikely(cmdid < 0)) goto free_nbio; @@ -445,7 +445,6 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (bio->bi_rw & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - spin_lock_irqsave(&nvmeq->q_lock, flags); cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; memset(cmnd, 0, sizeof(*cmnd)); @@ -457,8 +456,9 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } + result = -ENOMEM; if (nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs) == 0) - goto mapping_failed; + goto free_nbio; cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; @@ -474,19 +474,12 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; - spin_unlock_irqrestore(&nvmeq->q_lock, flags); - - return 0; - - mapping_failed: - free_nbio(nvmeq, nbio); - bio_endio(bio, -ENOMEM); return 0; free_nbio: free_nbio(nvmeq, nbio); - congestion: - return -EBUSY; + nomem: + return result; } static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio) @@ -507,13 +500,18 @@ static int nvme_make_request(struct request_queue *q, struct bio *bio) { struct nvme_ns *ns = q->queuedata; struct nvme_queue *nvmeq = get_nvmeq(ns); + int result = -EBUSY; - if (nvme_submit_bio_queue(nvmeq, ns, bio)) { - blk_set_queue_congested(q, rw_is_sync(bio->bi_rw)); - spin_lock_irq(&nvmeq->q_lock); + spin_lock_irq(&nvmeq->q_lock); + if (bio_list_empty(&nvmeq->sq_cong)) + result = nvme_submit_bio_queue(nvmeq, ns, bio); + if (unlikely(result)) { + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); bio_list_add(&nvmeq->sq_cong, bio); - spin_unlock_irq(&nvmeq->q_lock); } + + spin_unlock_irq(&nvmeq->q_lock); put_nvmeq(nvmeq); return 0; -- cgit From 1fa6aeadf18aeebd7a217d7a3a933856448375b6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 2 Mar 2011 18:37:18 -0500 Subject: NVMe: Add a kthread to handle the congestion list Instead of trying to resubmit I/Os in the I/O completion path (in interrupt context), wake up a kthread which will resubmit I/O from user context. This allows mke2fs to run to completion. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 86 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 19 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 61a241741ca6..606371e62905 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -50,10 +51,15 @@ module_param(nvme_major, int, 0); static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); +static DEFINE_SPINLOCK(dev_list_lock); +static LIST_HEAD(dev_list); +static struct task_struct *nvme_thread; + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. */ struct nvme_dev { + struct list_head node; struct nvme_queue **queues; u32 __iomem *dbs; struct pci_dev *pci_dev; @@ -97,6 +103,7 @@ struct nvme_queue { dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; wait_queue_head_t sq_full; + wait_queue_t sq_cong_wait; struct bio_list sq_cong; u32 __iomem *q_db; u16 q_depth; @@ -108,8 +115,6 @@ struct nvme_queue { unsigned long cmdid_data[]; }; -static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio); - /* * Check we didin't inadvertently grow the command struct */ @@ -309,9 +314,6 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); free_nbio(nvmeq, nbio); bio_endio(bio, status ? -EIO : 0); - bio = bio_list_pop(&nvmeq->sq_cong); - if (bio) - nvme_resubmit_bio(nvmeq, bio); } /* length is in bytes */ @@ -482,16 +484,6 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return result; } -static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio) -{ - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; - if (nvme_submit_bio_queue(nvmeq, ns, bio)) - bio_list_add_head(&nvmeq->sq_cong, bio); - else if (bio_list_empty(&nvmeq->sq_cong)) - blk_clear_queue_congested(ns->queue, rw_is_sync(bio->bi_rw)); - /* XXX: Need to duplicate the logic from __freed_request here */ -} - /* * NB: return value of non-zero would mean that we were a stacking driver. * make_request must always succeed. 
@@ -774,6 +766,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->cq_head = 0; nvmeq->cq_phase = 1; init_waitqueue_head(&nvmeq->sq_full); + init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); bio_list_init(&nvmeq->sq_cong); nvmeq->q_db = &dev->dbs[qid * 2]; nvmeq->q_depth = depth; @@ -1097,6 +1090,43 @@ static const struct block_device_operations nvme_fops = { .ioctl = nvme_ioctl, }; +static void nvme_resubmit_bios(struct nvme_queue *nvmeq) +{ + while (bio_list_peek(&nvmeq->sq_cong)) { + struct bio *bio = bio_list_pop(&nvmeq->sq_cong); + struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + if (nvme_submit_bio_queue(nvmeq, ns, bio)) { + bio_list_add_head(&nvmeq->sq_cong, bio); + break; + } + } +} + +static int nvme_kthread(void *data) +{ + struct nvme_dev *dev; + + while (!kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, node) { + int i; + for (i = 0; i < dev->queue_count; i++) { + struct nvme_queue *nvmeq = dev->queues[i]; + spin_lock_irq(&nvmeq->q_lock); + if (nvme_process_cq(nvmeq)) + printk("process_cq did something\n"); + nvme_resubmit_bios(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + } + } + spin_unlock(&dev_list_lock); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } + return 0; +} + static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, struct nvme_id_ns *id, struct nvme_lba_range_type *rt) { @@ -1307,6 +1337,10 @@ static int nvme_dev_remove(struct nvme_dev *dev) { struct nvme_ns *ns, *next; + spin_lock(&dev_list_lock); + list_del(&dev->node); + spin_unlock(&dev_list_lock); + /* TODO: wait all I/O finished or cancel them */ list_for_each_entry_safe(ns, next, &dev->namespaces, list) { @@ -1406,6 +1440,11 @@ static int __devinit nvme_probe(struct pci_dev *pdev, result = nvme_dev_add(dev); if (result) goto delete; + + spin_lock(&dev_list_lock); + list_add(&dev->node, &dev_list); + spin_unlock(&dev_list_lock); + return 0; delete: @@ -1479,17 +1518,25 @@ static struct pci_driver nvme_driver = { static int __init nvme_init(void) { - int result; + int result = -EBUSY; + + nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); + if (IS_ERR(nvme_thread)) + return PTR_ERR(nvme_thread); nvme_major = register_blkdev(nvme_major, "nvme"); if (nvme_major <= 0) - return -EBUSY; + goto kill_kthread; result = pci_register_driver(&nvme_driver); - if (!result) - return 0; + if (result) + goto unregister_blkdev; + return 0; + unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); + kill_kthread: + kthread_stop(nvme_thread); return result; } @@ -1497,6 +1544,7 @@ static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); unregister_blkdev(nvme_major, "nvme"); + kthread_stop(nvme_thread); } MODULE_AUTHOR("Matthew Wilcox "); -- cgit From ad8a5df97cb060aa4d817af25587c99e2d2fda97 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 14 Feb 2011 17:35:00 -0500 Subject: NVMe: Release 0.3 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 606371e62905..7554625fb94d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1549,6 +1549,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.2"); +MODULE_VERSION("0.3"); module_init(nvme_init); module_exit(nvme_exit); -- cgit From ca1615424c9adfdbe7d484771d7a7c5ecc4bb6d2 
Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 15 Feb 2011 13:44:13 -0500 Subject: NVMe: Remove setting of 'flags' in rw command This was the data transfer bit until spec rev 0.92 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 7554625fb94d..37cdf0711954 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -462,7 +462,6 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs) == 0) goto free_nbio; - cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg, -- cgit From b348b7d54368c87811907a8e88f0d96713c43009 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 15 Feb 2011 16:16:02 -0500 Subject: NVMe: Rename nr_queues to nr_io_queues I got confused about whether this included the admin queue or not, and had to resort to reading the spec. It doesn't include the admin queue, so make that clear in the name. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 37cdf0711954..f3aa8097e675 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1202,29 +1202,30 @@ static int set_queue_count(struct nvme_dev *dev, int count) static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) { - int result, cpu, i, nr_queues; + int result, cpu, i, nr_io_queues; - nr_queues = num_online_cpus(); - result = set_queue_count(dev, nr_queues); + nr_io_queues = num_online_cpus(); + result = set_queue_count(dev, nr_io_queues); if (result < 0) return result; - if (result < nr_queues) - nr_queues = result; + if (result < nr_io_queues) + nr_io_queues = result; /* Deregister the admin queue's interrupt */ free_irq(dev->entry[0].vector, dev->queues[0]); - for (i = 0; i < nr_queues; i++) + for (i = 0; i < nr_io_queues; i++) dev->entry[i].entry = i; for (;;) { - result = pci_enable_msix(dev->pci_dev, dev->entry, nr_queues); + result = pci_enable_msix(dev->pci_dev, dev->entry, + nr_io_queues); if (result == 0) { break; } else if (result > 0) { - nr_queues = result; + nr_io_queues = result; continue; } else { - nr_queues = 1; + nr_io_queues = 1; break; } } @@ -1233,12 +1234,12 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) /* XXX: handle failure here */ cpu = cpumask_first(cpu_online_mask); - for (i = 0; i < nr_queues; i++) { + for (i = 0; i < nr_io_queues; i++) { irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); cpu = cpumask_next(cpu, cpu_online_mask); } - for (i = 0; i < nr_queues; i++) { + for (i = 0; i < nr_io_queues; i++) { dev->queues[i + 1] = nvme_create_queue(dev, i + 1, NVME_Q_DEPTH, i); if (!dev->queues[i + 1]) -- cgit From 740216fc59cba54f65187c9ed92f29bce3cf8778 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 15 Feb 2011 16:28:20 -0500 Subject: NVMe: Let the kthread take care of devices earlier If interrupts are misconfigured, the kthread will be needed to process admin queue completions. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f3aa8097e675..df1d8bda8c7c 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1112,6 +1112,8 @@ static int nvme_kthread(void *data) int i; for (i = 0; i < dev->queue_count; i++) { struct nvme_queue *nvmeq = dev->queues[i]; + if (!nvmeq) + continue; spin_lock_irq(&nvmeq->q_lock); if (nvme_process_cq(nvmeq)) printk("process_cq did something\n"); @@ -1437,17 +1439,21 @@ static int __devinit nvme_probe(struct pci_dev *pdev, goto unmap; dev->queue_count++; - result = nvme_dev_add(dev); - if (result) - goto delete; - spin_lock(&dev_list_lock); list_add(&dev->node, &dev_list); spin_unlock(&dev_list_lock); + result = nvme_dev_add(dev); + if (result) + goto delete; + return 0; delete: + spin_lock(&dev_list_lock); + list_del(&dev->node); + spin_unlock(&dev_list_lock); + nvme_free_queues(dev); unmap: iounmap(dev->bar); -- cgit From 7547881d0951384f9833ec3a80fac8f3f16f3b98 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Feb 2011 09:59:59 -0500 Subject: NVMe: Correct SQ doorbell semantics The value written to the doorbell needs to be the first free index in the queue, not the most recently used index in the queue. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index df1d8bda8c7c..af45e286d5dd 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -246,9 +246,9 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) spin_lock_irqsave(&nvmeq->q_lock, flags); tail = nvmeq->sq_tail; memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); - writel(tail, nvmeq->q_db); if (++tail == nvmeq->q_depth) tail = 0; + writel(tail, nvmeq->q_db); nvmeq->sq_tail = tail; spin_unlock_irqrestore(&nvmeq->q_lock, flags); @@ -471,9 +471,9 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); - writel(nvmeq->sq_tail, nvmeq->q_db); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); return 0; -- cgit From c42705592be2a539f3027b6f3907de8e8f9591a8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 22 Feb 2011 14:15:34 -0500 Subject: NVMe: Mark CMD_CTX_CANCELLED as being unlikely Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index af45e286d5dd..ce919b49b30d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -518,7 +518,7 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct sync_cmd_info *cmdinfo = ctx; - if ((unsigned long)cmdinfo == CMD_CTX_CANCELLED) + if (unlikely((unsigned long)cmdinfo == CMD_CTX_CANCELLED)) return; if (unlikely((unsigned long)cmdinfo == CMD_CTX_COMPLETED)) { dev_warn(nvmeq->q_dmadev, -- cgit From 00df5cb4eb927078850086f8becc3286a69ea12e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 22 Feb 2011 14:18:30 -0500 Subject: NVMe: Implement Flush Linux implements Flush as a bit in the bio. That means there may also be data associated with the flush; if so the flush should be sent before the data. 
To avoid completing the bio twice, I add CMD_CTX_FLUSH to indicate the completion routine should do nothing. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index ce919b49b30d..d99b400ccd79 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -191,10 +191,12 @@ enum { bio_completion_id, }; +/* Special values must be a multiple of 4, and less than 0x1000 */ #define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) +#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { @@ -416,6 +418,33 @@ static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, return dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir); } +static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, + int cmdid) +{ + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.command_id = cmdid; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + +static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) +{ + int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, + sync_completion_id, IO_TIMEOUT); + if (unlikely(cmdid < 0)) + return cmdid; + + return nvme_submit_flush(nvmeq, ns, cmdid); +} + static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio) { @@ -427,6 +456,12 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, u32 dsmgmt; int psegs = bio_phys_segments(ns->queue, bio); + if ((bio->bi_rw & REQ_FLUSH) && psegs) { + result = nvme_submit_flush_data(nvmeq, ns); + if (result) + return result; + } + nbio = alloc_nbio(psegs, GFP_ATOMIC); if (!nbio) goto nomem; @@ -437,6 +472,9 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (unlikely(cmdid < 0)) goto free_nbio; + if ((bio->bi_rw & REQ_FLUSH) && !psegs) + return nvme_submit_flush(nvmeq, ns, cmdid); + control = 0; if (bio->bi_rw & REQ_FUA) control |= NVME_RW_FUA; @@ -520,6 +558,8 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct sync_cmd_info *cmdinfo = ctx; if (unlikely((unsigned long)cmdinfo == CMD_CTX_CANCELLED)) return; + if ((unsigned long)cmdinfo == CMD_CTX_FLUSH) + return; if (unlikely((unsigned long)cmdinfo == CMD_CTX_COMPLETED)) { dev_warn(nvmeq->q_dmadev, "completed id %d twice on queue %d\n", -- cgit From 1ad2f8932a72bf375361727949ced2cb4e8cfcef Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 23 Feb 2011 15:20:00 -0500 Subject: NVMe: Handle bios that contain non-virtually contiguous addresses NVMe scatterlists must be virtually contiguous, like almost all I/Os. However, when the filesystem lays out files with a hole, it can be that adjacent LBAs map to non-adjacent virtual addresses. Handle this by submitting one NVMe command at a time for each virtually discontiguous range. 
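In PRP terms a transfer must look like one gapless buffer: every data pointer after the first has to start at offset 0 of a page, and every one before the last has to end on a page boundary. A sketch of the per-segment test this implies for two adjacent bio_vecs (the patch encodes the negation as BIOVEC_NOT_VIRT_MERGEABLE; illustration only):

	/* Can 'next' be carried by the same NVMe command as 'prev'? */
	static bool example_virt_mergeable(const struct bio_vec *prev,
					   const struct bio_vec *next)
	{
		return next->bv_offset == 0 &&
		       ((prev->bv_offset + prev->bv_len) % PAGE_SIZE) == 0;
	}

When the test fails, the command is issued with the segments gathered so far, bi_idx is advanced past them, and the rest of the bio is requeued for the kthread to submit as a separate command.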
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index d99b400ccd79..240922706a93 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -315,7 +315,14 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, dma_unmap_sg(nvmeq->q_dmadev, nbio->sg, nbio->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); free_nbio(nvmeq, nbio); - bio_endio(bio, status ? -EIO : 0); + if (status) + bio_endio(bio, -EIO); + if (bio->bi_vcnt > bio->bi_idx) { + bio_list_add(&nvmeq->sq_cong, bio); + wake_up_process(nvme_thread); + } else { + bio_endio(bio, 0); + } } /* length is in bytes */ @@ -393,29 +400,41 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, return prps; } +/* NVMe scatterlists require no holes in the virtual address */ +#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ + (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) + static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { struct bio_vec *bvec, *bvprv = NULL; struct scatterlist *sg = NULL; - int i, nsegs = 0; + int i, old_idx, length = 0, nsegs = 0; sg_init_table(nbio->sg, psegs); + old_idx = bio->bi_idx; bio_for_each_segment(bvec, bio, i) { if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { sg->length += bvec->bv_len; } else { - /* Check bvprv && offset == 0 */ + if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) + break; sg = sg ? sg + 1 : nbio->sg; sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); nsegs++; } + length += bvec->bv_len; bvprv = bvec; } + bio->bi_idx = i; nbio->nents = nsegs; sg_mark_end(sg); - return dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir); + if (dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir) == 0) { + bio->bi_idx = old_idx; + return -ENOMEM; + } + return length; } static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, @@ -451,7 +470,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct nvme_command *cmnd; struct nvme_bio *nbio; enum dma_data_direction dma_dir; - int cmdid, result = -ENOMEM; + int cmdid, length, result = -ENOMEM; u16 control; u32 dsmgmt; int psegs = bio_phys_segments(ns->queue, bio); @@ -496,16 +515,17 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } - result = -ENOMEM; - if (nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs) == 0) + result = nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs); + if (result < 0) goto free_nbio; + length = result; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg, - bio->bi_size); + length); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); - cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); + cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); -- cgit From d8ee9d69f275769aaad40ef7c944565ff8d2d24f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 24 Feb 2011 08:46:00 -0500 Subject: NVMe: Fix discontiguous accesses When we submit subsequent portions of the I/O, we need to access the updated block, not start reading again from the original position. 
This was showing up as miscompares in the XFS randholes testcase. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 240922706a93..562d75a0fc50 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -529,6 +529,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + bio->bi_sector += length >> 9; + if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); -- cgit From e6d15f79f997a98b3a69abbc462fc9041cc1a7b4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 24 Feb 2011 08:49:41 -0500 Subject: NVMe: Reduce maximum queue depth by 1 The spec says we're not allowed to completely fill the submission queue. Solve this by reducing the number of allocatable cmdids by 1. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 562d75a0fc50..45bfae1ebd50 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -156,7 +156,7 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler, unsigned timeout) { - int depth = nvmeq->q_depth; + int depth = nvmeq->q_depth - 1; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); int cmdid; -- cgit From b57ab0fada358357571f0eb448cdf2f144785321 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 24 Feb 2011 16:20:14 -0500 Subject: NVMe: Version 0.4 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 45bfae1ebd50..a8549dff4691 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1617,6 +1617,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.3"); +MODULE_VERSION("0.4"); module_init(nvme_init); module_exit(nvme_exit); -- cgit From 714a7a22884b74862540bc84955274d86b2f6040 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:28:24 -0400 Subject: NVMe: Convert comments to kernel-doc notation Reported-by: Randy Dunlap Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index a8549dff4691..e392919e0eac 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -142,10 +142,10 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) } /** - * alloc_cmdid - Allocate a Command ID - * @param nvmeq The queue that will be used for this command - * @param ctx A pointer that will be passed to the handler - * @param handler The ID of the handler to call + * alloc_cmdid() - Allocate a Command ID + * @nvmeq: The queue that will be used for this command + * @ctx: A pointer that will be passed to the handler + * @handler: The ID of the handler to call * * Allocate a Command ID for a queue. The data passed in will * be passed to the completion handler. 
This is implemented by using @@ -234,7 +234,7 @@ static void put_nvmeq(struct nvme_queue *nvmeq) } /** - * nvme_submit_cmd: Copy a command into a queue and ring the doorbell + * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use * @cmd: The command to send * -- cgit From fa92282149842645931580225647238428374758 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:29:00 -0400 Subject: NVMe: Fix comment formatting Reported-by: Randy Dunlap Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index e392919e0eac..740a9c1b81aa 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -182,7 +182,8 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, return (cmdid < 0) ? -EINTR : cmdid; } -/* If you need more than four handlers, you'll need to change how +/* + * If you need more than four handlers, you'll need to change how * alloc_cmdid and nvme_process_cq work. Consider using a special * CMD_CTX value instead, if that works for your situation. */ @@ -1066,7 +1067,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) prps = nvme_setup_prps(dev, &c.common, sg, length); nvmeq = get_nvmeq(ns); - /* Since nvme_submit_sync_cmd sleeps, we can't keep preemption + /* + * Since nvme_submit_sync_cmd sleeps, we can't keep preemption * disabled. We may be preempted at any point, and be rescheduled * to a different CPU. That will cause cacheline bouncing, but no * additional races since q_lock already protects against other CPUs. -- cgit From 19e899b2f9f89f4a290dd5c9c24d15987a18ab21 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:29:24 -0400 Subject: NVMe: Remove outdated comments The head can never overrun the tail since we won't allocate enough command IDs to let that happen. The status codes are in sync with the spec. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 740a9c1b81aa..d4f95eb51dc1 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -245,7 +245,6 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { unsigned long flags; u16 tail; - /* XXX: Need to check tail isn't going to overrun head */ spin_lock_irqsave(&nvmeq->q_lock, flags); tail = nvmeq->sq_tail; memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); -- cgit From ac88c36a385b848cb9efcb877fdfc4153a60bcab Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:29:58 -0400 Subject: NVMe: Fix interpretation of 'Number of Namespaces' field The spec says this is a 0s based value. We don't need to handle the maximal value because it's reserved to mean "every namespace". 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index d4f95eb51dc1..0d5c918b7d59 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1364,7 +1364,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) crt.features.prp1 = cpu_to_le64(dma_addr + 4096); crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); - for (i = 0; i < nn; i++) { + for (i = 0; i <= nn; i++) { cid.identify.nsid = cpu_to_le32(i); res = nvme_submit_admin_cmd(dev, &cid, NULL); if (res) -- cgit From 7523d834dd1573610078eb1ac0933f6490232f90 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:43:40 -0400 Subject: NVMe: Fix off-by-one when filling in PRP lists If the last element in the PRP list fits on the end of the page, there's no need to allocate an extra page to put that single element in. It can fit on the end of the page. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 0d5c918b7d59..cf89db8c41ee 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -376,12 +376,13 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, cmd->prp2 = cpu_to_le64(prp_dma); i = 0; for (;;) { - if (i == PAGE_SIZE / 8 - 1) { + if (i == PAGE_SIZE / 8) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); prps->list[prp_page++] = prp_list; - old_prp_list[i] = cpu_to_le64(prp_dma); - i = 0; + prp_list[0] = old_prp_list[i - 1]; + old_prp_list[i - 1] = cpu_to_le64(prp_dma); + i = 1; } prp_list[i++] = cpu_to_le64(dma_addr); dma_len -= PAGE_SIZE; -- cgit From 3cb967c03926edd2c414082f4cc0feb7b372edae Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:45:49 -0400 Subject: NVMe: Remove the kthread from the wait queue Once there are no more bios on the congestion list, we can stop waking up the nvme kthread every time a completion happens. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index cf89db8c41ee..8d3c0b79ac2b 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1162,6 +1162,9 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq) bio_list_add_head(&nvmeq->sq_cong, bio); break; } + if (bio_list_empty(&nvmeq->sq_cong)) + remove_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); } } -- cgit From 9ecdc946212f7cd592986b2c519b470404caa6b8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:52:19 -0400 Subject: NVMe: Simplify queue lookup Fill in all the num_possible_cpus() entries with duplicate pointers. This reduces the complexity of the frequently-called get_nvmeq(), as well as avoiding a bug in it when there are fewer queues than CPUs. 
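A worked example of the duplicate-pointer fill (illustration only; queues[0] is the admin queue, so CPU n uses queues[n + 1]): with 4 I/O queues and 6 possible CPUs, rounddown_pow_of_two(queue_count - 1) is 4, so CPUs 4 and 5 simply reuse I/O queues 1 and 2. A standalone snippet that prints the resulting mapping:

	#include <stdio.h>

	int main(void)
	{
		int io_queues = 4, cpus = 6, pow2 = 4;	/* rounddown_pow_of_two(4) */

		for (int cpu = 0; cpu < cpus; cpu++) {
			int q = (cpu < io_queues) ? cpu : cpu % pow2;
			printf("cpu %d -> nvme I/O queue %d\n", cpu, q + 1);
		}
		return 0;
	}

get_nvmeq() then collapses to a single array lookup for whichever CPU the caller is running on, with no arithmetic in the hot path.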
Reported-by: Shane Michael Matthews Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 8d3c0b79ac2b..f94f1731478f 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -221,12 +221,7 @@ static void cancel_cmdid_data(struct nvme_queue *nvmeq, int cmdid) static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) { - int qid, cpu = get_cpu(); - if (cpu < ns->dev->queue_count) - qid = cpu + 1; - else - qid = (cpu % rounddown_pow_of_two(ns->dev->queue_count)) + 1; - return ns->dev->queues[qid]; + return ns->dev->queues[get_cpu() + 1]; } static void put_nvmeq(struct nvme_queue *nvmeq) @@ -1316,6 +1311,11 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) dev->queue_count++; } + for (; i < num_possible_cpus(); i++) { + int target = i % rounddown_pow_of_two(dev->queue_count - 1); + dev->queues[i + 1] = dev->queues[target + 1]; + } + return 0; } -- cgit From 4948168280b269a514045766ddd872cfac5968e1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 19 Mar 2011 14:55:38 -0400 Subject: NVMe: Add compat_ioctl Make ioctls work for 32-bit applications on 64-bit kernels. The structures are defined to be the same for both 32- and 64-bit applications, so we can use the same handler for both. Reported-by: Arnd Bergmann Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f94f1731478f..d0b52622e261 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1146,6 +1146,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, + .compat_ioctl = nvme_ioctl, }; static void nvme_resubmit_bios(struct nvme_queue *nvmeq) -- cgit From 6c7d49455ceb63064f992347d9185ff5bf43497a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 21 Mar 2011 09:48:57 -0400 Subject: NVMe: Change the definition of nvme_user_io The read and write commands don't define a 'result', so there's no need to copy it back to userspace. Remove the ability of the ioctl to submit commands to a different namespace; it's just asking for trouble, and the use case I have in mind will be addressed througha different ioctl in the future. That removes the need for both the block_shift and nsid arguments. Check that the opcode is one of 'read' or 'write'. Future opcodes may be added in the future, but we will need a different structure definition for them. The nblocks field is redefined to be 0-based. This allows the user to request the full 65536 blocks. Don't byteswap the reftag, apptag and appmask. Martin Petersen tells me these are calculated in big-endian and are transmitted to the device in big-endian. 
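From userspace the reworked structure would be used roughly as follows (a sketch only: it assumes the driver's header exports struct nvme_user_io and an NVME_IOCTL_SUBMIT_IO ioctl, and it omits error handling). Note that nblocks is now 0-based, so 7 asks for eight blocks, and the namespace is implied by the device node being operated on rather than passed in:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/nvme.h>			/* assumed location of the ioctl definitions */

	static int read_eight_blocks(int fd, void *buf)
	{
		struct nvme_user_io io;

		memset(&io, 0, sizeof(io));
		io.opcode  = 0x02;			/* NVMe Read */
		io.addr    = (unsigned long)buf;	/* user buffer; the driver pins it */
		io.slba    = 0;				/* starting LBA */
		io.nblocks = 7;				/* 0-based: transfer 8 blocks */
		/* io.reftag, io.apptag, io.appmask stay big-endian; the driver no longer swaps them */

		return ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io);
	}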
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index d0b52622e261..90a96ec8a596 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1035,29 +1035,37 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) struct nvme_user_io io; struct nvme_command c; unsigned length; - u32 result; int nents, status; struct scatterlist *sg; struct nvme_prps *prps; if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; - length = io.nblocks << io.block_shift; - nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length, &sg); + length = (io.nblocks + 1) << ns->lba_shift; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, + length, &sg); + default: + return -EFAULT; + } + if (nents < 0) return nents; memset(&c, 0, sizeof(c)); c.rw.opcode = io.opcode; c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(io.nsid); + c.rw.nsid = cpu_to_le32(ns->ns_id); c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks - 1); + c.rw.length = cpu_to_le16(io.nblocks); c.rw.control = cpu_to_le16(io.control); c.rw.dsmgmt = cpu_to_le16(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); /* XXX: endian? */ - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); + c.rw.reftag = io.reftag; + c.rw.apptag = io.apptag; + c.rw.appmask = io.appmask; /* XXX: metadata */ prps = nvme_setup_prps(dev, &c.common, sg, length); @@ -1069,11 +1077,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) * additional races since q_lock already protects against other CPUs. */ put_nvmeq(nvmeq); - status = nvme_submit_sync_cmd(nvmeq, &c, &result, IO_TIMEOUT); + status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); nvme_free_prps(dev, prps); - put_user(result, &uio->result); return status; } -- cgit From 8ef700678f65e2eef1c3a94cdedb79d757608392 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 21 Mar 2011 10:28:43 -0400 Subject: NVMe: Version 0.5 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 90a96ec8a596..d3eeca5a3c4c 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1630,6 +1630,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.4"); +MODULE_VERSION("0.5"); module_init(nvme_init); module_exit(nvme_exit); -- cgit From 7f53f9d2424533256ae86f7df5661a17de743de8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 22 Mar 2011 15:55:45 -0400 Subject: NVMe: Correct the Controller Configuration settings The arbitration field was extended by one bit, shifting the shutdown notification bits by one. Also, the SQ/CQ entry size was made configurable for future extensions. 
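The new NVME_CC_IOSQES/NVME_CC_IOCQES bits program the I/O submission and completion queue entry sizes into the Controller Configuration register; the encoded values are log2 of the entry sizes the driver already uses (64-byte SQEs, 16-byte CQEs). Presumably the header defines them along these lines (assumed values, not shown in this patch):

	#define NVME_CC_IOSQES	(6 << 16)	/* 2^6 = 64-byte submission queue entry */
	#define NVME_CC_IOCQES	(4 << 20)	/* 2^4 = 16-byte completion queue entry */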
Reported-by: Paul Luse Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index d3eeca5a3c4c..014a7f6e39bc 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -905,6 +905,7 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; writel(0, &dev->bar->cc); writel(aqa, &dev->bar->aqa); -- cgit From aba2080f3f1639f9202f1a52993669844abcfb80 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 27 Mar 2011 08:52:06 -0400 Subject: NVMe: Fix warning in free_irq We need to clear the affinity mask before calling free_irq() Reported-by: Shane Michael Matthews Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 014a7f6e39bc..bcc780ac4ec0 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -781,8 +781,10 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) static void nvme_free_queue(struct nvme_dev *dev, int qid) { struct nvme_queue *nvmeq = dev->queues[qid]; + int vector = dev->entry[nvmeq->cq_vector].vector; - free_irq(dev->entry[nvmeq->cq_vector].vector, nvmeq); + irq_set_affinity_hint(vector, NULL); + free_irq(vector, nvmeq); /* Don't tell the adapter to delete the admin queue */ if (qid) { -- cgit From 22605f96810d073eb74051d0295b6577d6a6a563 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 19 Apr 2011 15:04:20 -0400 Subject: NVMe: Time out initialisation after a few seconds THe device reports (in its capability register) how long it will take to initialise. If that time elapses before the ready bit becomes set, conclude the device is broken and refuse to initialise it. Log a nice error message so the user knows why we did nothing. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index bcc780ac4ec0..57f2b33a47dd 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -893,6 +893,8 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) { int result; u32 aqa; + u64 cap; + unsigned long timeout; struct nvme_queue *nvmeq; dev->dbs = ((void __iomem *)dev->bar) + 4096; @@ -915,10 +917,18 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) writeq(nvmeq->cq_dma_addr, &dev->bar->acq); writel(dev->ctrl_config, &dev->bar->cc); + cap = readq(&dev->bar->cap); + timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { msleep(100); if (fatal_signal_pending(current)) return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(&dev->pci_dev->dev, + "Device not ready; aborting initialisation\n"); + return -ENODEV; + } } result = queue_request_irq(dev, nvmeq, "nvme admin"); -- cgit From 09a58f536436efed02ead722e835cb4ce7674afc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 28 Apr 2011 23:09:09 -0700 Subject: NVMe: Fix bug in error handling When an I/O completed with an error, we would call bio_endio twice (once with -EIO and once with 0). Found by inspection. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 57f2b33a47dd..205405e7f6b0 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -310,9 +310,9 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, dma_unmap_sg(nvmeq->q_dmadev, nbio->sg, nbio->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); free_nbio(nvmeq, nbio); - if (status) + if (status) { bio_endio(bio, -EIO); - if (bio->bi_vcnt > bio->bi_idx) { + } else if (bio->bi_vcnt > bio->bi_idx) { bio_list_add(&nvmeq->sq_cong, bio); wake_up_process(nvme_thread); } else { -- cgit From 21075bdee0a6f56058920d889df4ae561bfed754 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 28 Apr 2011 23:17:36 -0700 Subject: NVMe: Rename cancel_cmdid_data to cancel_cmdid The trailing '_data' on the end was annoying and inconsistent. Also, make it actually return the data since this is needed for timing out commands. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 205405e7f6b0..9ca9db903ceb 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -213,10 +213,13 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) return data; } -static void cancel_cmdid_data(struct nvme_queue *nvmeq, int cmdid) +static unsigned long cancel_cmdid(struct nvme_queue *nvmeq, int cmdid) { + unsigned long data; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + data = info[cmdid].ctx; info[cmdid].ctx = CMD_CTX_CANCELLED; + return data; } static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) @@ -667,7 +670,7 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) { spin_lock_irq(&nvmeq->q_lock); - cancel_cmdid_data(nvmeq, cmdid); + cancel_cmdid(nvmeq, cmdid); spin_unlock_irq(&nvmeq->q_lock); } -- cgit From 8de055350fbaa96b6563892c195a60be583faa9c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 12 May 2011 13:50:28 -0400 Subject: NVMe: Add support for timing out I/Os In the kthread, walk the list of outstanding I/Os and check they've not hit the timeout. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 9ca9db903ceb..9c0ab2af0fae 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -601,15 +602,15 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, typedef void (*completion_fn)(struct nvme_queue *, void *, struct nvme_completion *); +static const completion_fn nvme_completions[4] = { + [sync_completion_id] = sync_completion, + [bio_completion_id] = bio_completion, +}; + static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) { u16 head, phase; - static const completion_fn completions[4] = { - [sync_completion_id] = sync_completion, - [bio_completion_id] = bio_completion, - }; - head = nvmeq->cq_head; phase = nvmeq->cq_phase; @@ -629,7 +630,7 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) data = free_cmdid(nvmeq, cqe.command_id); handler = data & 3; ptr = (void *)(data & ~3UL); - completions[handler](nvmeq, ptr, &cqe); + nvme_completions[handler](nvmeq, ptr, &cqe); } /* If the controller ignores the cq head doorbell and continuously @@ -1172,6 +1173,29 @@ static const struct block_device_operations nvme_fops = { .compat_ioctl = nvme_ioctl, }; +static void nvme_timeout_ios(struct nvme_queue *nvmeq) +{ + int depth = nvmeq->q_depth - 1; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + unsigned long now = jiffies; + int cmdid; + + for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { + unsigned long data; + void *ptr; + unsigned char handler; + static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, }; + + if (!time_after(now, info[cmdid].timeout)) + continue; + dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid); + data = cancel_cmdid(nvmeq, cmdid); + handler = data & 3; + ptr = (void *)(data & ~3UL); + nvme_completions[handler](nvmeq, ptr, &cqe); + } +} + static void nvme_resubmit_bios(struct nvme_queue *nvmeq) { while (bio_list_peek(&nvmeq->sq_cong)) { @@ -1203,6 +1227,7 @@ static int nvme_kthread(void *data) spin_lock_irq(&nvmeq->q_lock); if (nvme_process_cq(nvmeq)) printk("process_cq did something\n"); + nvme_timeout_ios(nvmeq); nvme_resubmit_bios(nvmeq); spin_unlock_irq(&nvmeq->q_lock); } -- cgit From fd63e9ceeeae58cfe877c2d49d41c1bf7532303c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 6 May 2011 08:37:54 -0400 Subject: NVMe: Add include of delay.h for msleep Previously it was being implicitly included through some other header file Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 9c0ab2af0fae..b285a7e0624d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include -- cgit From 5aff9382ddc8aac6eb0c70ffbb351652d71da69a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 6 May 2011 08:45:47 -0400 Subject: NVMe: Use an IDA to allocate minor numbers The current approach of using the namespace ID as the minor number doesn't work when there are multiple adapters in the machine. Rather than statically partitioning the number of namespaces between adapters, dynamically allocate minor numbers to namespaces as they are detected. 
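As a concrete illustration (assuming NVME_MINORS is 64, as elsewhere in the driver): the third namespace probed anywhere in the system receives index 2 from the IDA and therefore minors 128-191, no matter which controller it belongs to, leaving room for 63 partitions per namespace. The index is recovered on removal from the minor range itself:

	/* Illustration only: how an IDA index maps to a minor range and back. */
	int index = nvme_get_ns_idx();			/* e.g. 2 for the third namespace */
	disk->first_minor = NVME_MINORS * index;	/* minors 128..191 when NVME_MINORS == 64 */
	/* ... at teardown time ... */
	nvme_put_ns_idx(ns->disk->first_minor / NVME_MINORS);	/* hand the index back */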
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b285a7e0624d..79012c53ae9c 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1240,7 +1241,34 @@ static int nvme_kthread(void *data) return 0; } -static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, +static DEFINE_IDA(nvme_index_ida); + +static int nvme_get_ns_idx(void) +{ + int index, error; + + do { + if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL)) + return -1; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_index_ida, &index); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + index = -1; + return index; +} + +static void nvme_put_ns_idx(int index) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_index_ida, index); + spin_unlock(&dev_list_lock); +} + +static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, struct nvme_id_ns *id, struct nvme_lba_range_type *rt) { struct nvme_ns *ns; @@ -1265,19 +1293,19 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, disk = alloc_disk(NVME_MINORS); if (!disk) goto out_free_queue; - ns->ns_id = index; + ns->ns_id = nsid; ns->disk = disk; lbaf = id->flbas & 0xf; ns->lba_shift = id->lbaf[lbaf].ds; disk->major = nvme_major; disk->minors = NVME_MINORS; - disk->first_minor = NVME_MINORS * index; + disk->first_minor = NVME_MINORS * nvme_get_ns_idx(); disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; disk->driverfs_dev = &dev->pci_dev->dev; - sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index); + sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); return ns; @@ -1291,7 +1319,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, static void nvme_ns_free(struct nvme_ns *ns) { + int index = ns->disk->first_minor / NVME_MINORS; put_disk(ns->disk); + nvme_put_ns_idx(index); blk_cleanup_queue(ns->queue); kfree(ns); } -- cgit From b77954cbddff28d55a36fad3c16f4daebb0f01df Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 12 May 2011 13:51:41 -0400 Subject: NVMe: Handle failures from memory allocations in nvme_setup_prps If any of the memory allocations in nvme_setup_prps fail, handle it by modifying the passed-in data length to reflect the number of bytes we are actually able to send. Also allow the caller to specify the GFP flags they need; for user-initiated commands, we can use GFP_KERNEL allocations. The various callers are updated to handle this possibility; the main I/O path is already prepared for this possibility (as it may happen due to nvme_map_bio being unable to map all the segments of the I/O). The other callers return -ENOMEM instead of doing partial I/Os. 
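The length argument therefore becomes in/out: when an allocation fails, nvme_setup_prps() trims *len to the number of bytes it could actually describe with PRP entries. A sketch of how a caller that cannot tolerate a short transfer uses the new contract (mirrors the ioctl paths below; illustration only):

	int len = total_bytes;

	prps = nvme_setup_prps(dev, &c.common, sg, &len, GFP_KERNEL);
	if (len != total_bytes)
		status = -ENOMEM;	/* could not cover the whole transfer */
	else
		status = nvme_submit_admin_cmd(dev, &c, NULL);

The bio path, as noted above, already copes with a shortened transfer and submits the remainder separately, so it keeps going with whatever length it got.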
Reported-by: Andi Kleen Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 56 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 15 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 79012c53ae9c..ddc21ba24a70 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -329,9 +329,11 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, /* length is in bytes */ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, - struct scatterlist *sg, int length) + struct scatterlist *sg, int *len, + gfp_t gfp) { struct dma_pool *pool; + int length = *len; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); int offset = offset_in_page(dma_addr); @@ -361,7 +363,12 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, nprps = DIV_ROUND_UP(length, PAGE_SIZE); npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE); - prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, GFP_ATOMIC); + prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, gfp); + if (!prps) { + cmd->prp2 = cpu_to_le64(dma_addr); + *len = (*len - length) + PAGE_SIZE; + return prps; + } prp_page = 0; if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; @@ -371,7 +378,13 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, prps->npages = npages; } - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) { + cmd->prp2 = cpu_to_le64(dma_addr); + *len = (*len - length) + PAGE_SIZE; + kfree(prps); + return NULL; + } prps->list[prp_page++] = prp_list; prps->first_dma = prp_dma; cmd->prp2 = cpu_to_le64(prp_dma); @@ -379,7 +392,11 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, for (;;) { if (i == PAGE_SIZE / 8) { __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) { + *len = (*len - length); + return prps; + } prps->list[prp_page++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); @@ -525,7 +542,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg, - length); + &length, GFP_ATOMIC); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); @@ -1009,15 +1026,18 @@ static int nvme_submit_user_admin_command(struct nvme_dev *dev, unsigned long addr, unsigned length, struct nvme_command *cmd) { - int err, nents; + int err, nents, tmplen = length; struct scatterlist *sg; struct nvme_prps *prps; nents = nvme_map_user_pages(dev, 0, addr, length, &sg); if (nents < 0) return nents; - prps = nvme_setup_prps(dev, &cmd->common, sg, length); - err = nvme_submit_admin_cmd(dev, cmd, NULL); + prps = nvme_setup_prps(dev, &cmd->common, sg, &tmplen, GFP_KERNEL); + if (tmplen != length) + err = -ENOMEM; + else + err = nvme_submit_admin_cmd(dev, cmd, NULL); nvme_unmap_user_pages(dev, 0, addr, length, sg, nents); nvme_free_prps(dev, prps); return err ? 
-EIO : 0; @@ -1086,7 +1106,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.apptag = io.apptag; c.rw.appmask = io.appmask; /* XXX: metadata */ - prps = nvme_setup_prps(dev, &c.common, sg, length); + prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); nvmeq = get_nvmeq(ns); /* @@ -1096,7 +1116,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) * additional races since q_lock already protects against other CPUs. */ put_nvmeq(nvmeq); - status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); + if (length != (io.nblocks + 1) << ns->lba_shift) + status = -ENOMEM; + else + status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); nvme_free_prps(dev, prps); @@ -1109,7 +1132,7 @@ static int nvme_download_firmware(struct nvme_ns *ns, struct nvme_dev *dev = ns->dev; struct nvme_dlfw dlfw; struct nvme_command c; - int nents, status; + int nents, status, length; struct scatterlist *sg; struct nvme_prps *prps; @@ -1117,8 +1140,9 @@ static int nvme_download_firmware(struct nvme_ns *ns, return -EFAULT; if (dlfw.length >= (1 << 30)) return -EINVAL; + length = dlfw.length * 4; - nents = nvme_map_user_pages(dev, 1, dlfw.addr, dlfw.length * 4, &sg); + nents = nvme_map_user_pages(dev, 1, dlfw.addr, length, &sg); if (nents < 0) return nents; @@ -1126,9 +1150,11 @@ static int nvme_download_firmware(struct nvme_ns *ns, c.dlfw.opcode = nvme_admin_download_fw; c.dlfw.numd = cpu_to_le32(dlfw.length); c.dlfw.offset = cpu_to_le32(dlfw.offset); - prps = nvme_setup_prps(dev, &c.common, sg, dlfw.length * 4); - - status = nvme_submit_admin_cmd(dev, &c, NULL); + prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); + if (length != dlfw.length * 4) + status = -ENOMEM; + else + status = nvme_submit_admin_cmd(dev, &c, NULL); nvme_unmap_user_pages(dev, 0, dlfw.addr, dlfw.length * 4, sg, nents); nvme_free_prps(dev, prps); return status; -- cgit From 184d2944cb3b92a2e8e1733c59d1e531ad6e924a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 11 May 2011 21:36:38 -0400 Subject: NVMe: Add a few calling convention notes For the benefit of reviewers, add comments to a few functions describing their calling context Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index ddc21ba24a70..12062c108bd9 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -155,6 +155,9 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) * the bottom two bits of the ctx pointer to store the handler ID. * Passing in a pointer that's not 4-byte aligned will cause a BUG. * We can change this if it becomes a problem. + * + * May be called with local interrupts disabled and the q_lock held, + * or with interrupts enabled and no locks held. */ static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler, unsigned timeout) @@ -202,6 +205,9 @@ enum { #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) #define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { unsigned long data; @@ -326,7 +332,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, } } -/* length is in bytes */ +/* length is in bytes. gfp flags indicates whether we may sleep. 
*/ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, struct scatterlist *sg, int *len, @@ -483,6 +489,9 @@ static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) return nvme_submit_flush(nvmeq, ns, cmdid); } +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio) { -- cgit From be5e09484078e95af20acb13e215cd8aec705893 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 11 May 2011 21:38:57 -0400 Subject: NVMe: Version 0.6 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 12062c108bd9..d1cd91becdb0 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1737,6 +1737,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.5"); +MODULE_VERSION("0.6"); module_init(nvme_init); module_exit(nvme_exit); -- cgit From 6f0f54499f2edf7e25410cdd99e6f030f3485fd1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 11 May 2011 13:30:59 -0700 Subject: NVMe: Return real error from nvme_create_queue nvme_setup_io_queues() was assuming that a NULL return from nvme_create_queue() was an out-of-memory error. That's not necessarily true; the adapter might return -EIO, for example. Change the calling convention to return an ERR_PTR on failure instead of NULL. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index d1cd91becdb0..843edbd79c56 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -892,7 +892,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); if (!nvmeq) - return NULL; + return ERR_PTR(-ENOMEM); result = adapter_alloc_cq(dev, qid, nvmeq); if (result < 0) @@ -918,7 +918,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); kfree(nvmeq); - return NULL; + return ERR_PTR(result); } static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) @@ -1421,8 +1421,8 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) for (i = 0; i < nr_io_queues; i++) { dev->queues[i + 1] = nvme_create_queue(dev, i + 1, NVME_Q_DEPTH, i); - if (!dev->queues[i + 1]) - return -ENOMEM; + if (IS_ERR(dev->queues[i + 1])) + return PTR_ERR(dev->queues[i + 1]); dev->queue_count++; } -- cgit From eac623ba7a91474a688eb5d0fcd0eaa6a56dc41c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2011 09:34:43 -0400 Subject: NVMe: Add the nvme thread to the wait queue before waking it up If the I/O was not completed by a single NVMe command, we add the bio to the congestion list and wake up the kthread to resubmit it. But the kthread calls remove_wait_queue() unconditionally, which will oops if it's not on the wait queue. So add the kthread to the wait queue before waking it up. 
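For illustration only (this helper does not exist in the driver; the field names sq_full, sq_cong and sq_cong_wait are the ones the driver already uses), the fixed requeue pattern can be sketched as:

	/* Hypothetical helper showing the corrected ordering. */
	static void nvme_requeue_bio(struct nvme_queue *nvmeq, struct bio *bio)
	{
		/*
		 * Join the wait queue before waking the kthread, so its
		 * unconditional remove_wait_queue() finds us registered.
		 */
		if (bio_list_empty(&nvmeq->sq_cong))
			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
		bio_list_add(&nvmeq->sq_cong, bio);
		wake_up_process(nvme_thread);
	}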
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 843edbd79c56..f5e51a6116e3 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -325,6 +325,8 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, if (status) { bio_endio(bio, -EIO); } else if (bio->bi_vcnt > bio->bi_idx) { + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); bio_list_add(&nvmeq->sq_cong, bio); wake_up_process(nvme_thread); } else { -- cgit From 6bbf1acddeed0bfb345a5578f9fcada16f1e514f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2011 13:03:42 -0400 Subject: NVMe: Rework ioctls Remove the special-purpose IDENTIFY, GET_RANGE_TYPE, DOWNLOAD_FIRMWARE and ACTIVATE_FIRMWARE commands. Replace them with a generic ADMIN_CMD ioctl that can submit any admin command. Add a new ID ioctl that returns the namespace ID of the queried device. It corresponds to the SCSI Idlun ioctl. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 128 ++++++++++++++++----------------------------------- 1 file changed, 40 insertions(+), 88 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f5e51a6116e3..9e3c724b95c3 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1033,51 +1033,6 @@ static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, put_page(sg_page(&sg[i])); } -static int nvme_submit_user_admin_command(struct nvme_dev *dev, - unsigned long addr, unsigned length, - struct nvme_command *cmd) -{ - int err, nents, tmplen = length; - struct scatterlist *sg; - struct nvme_prps *prps; - - nents = nvme_map_user_pages(dev, 0, addr, length, &sg); - if (nents < 0) - return nents; - prps = nvme_setup_prps(dev, &cmd->common, sg, &tmplen, GFP_KERNEL); - if (tmplen != length) - err = -ENOMEM; - else - err = nvme_submit_admin_cmd(dev, cmd, NULL); - nvme_unmap_user_pages(dev, 0, addr, length, sg, nents); - nvme_free_prps(dev, prps); - return err ? -EIO : 0; -} - -static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cns ? 
0 : cpu_to_le32(ns->ns_id); - c.identify.cns = cpu_to_le32(cns); - - return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c); -} - -static int nvme_get_range_type(struct nvme_ns *ns, unsigned long addr) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(ns->ns_id); - c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); - - return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c); -} - static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_dev *dev = ns->dev; @@ -1096,10 +1051,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) switch (io.opcode) { case nvme_cmd_write: case nvme_cmd_read: + case nvme_cmd_compare: nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length, &sg); default: - return -EFAULT; + return -EINVAL; } if (nents < 0) @@ -1137,70 +1093,66 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_download_firmware(struct nvme_ns *ns, - struct nvme_dlfw __user *udlfw) +static int nvme_user_admin_cmd(struct nvme_ns *ns, + struct nvme_admin_cmd __user *ucmd) { struct nvme_dev *dev = ns->dev; - struct nvme_dlfw dlfw; + struct nvme_admin_cmd cmd; struct nvme_command c; - int nents, status, length; + int status, length, nents = 0; struct scatterlist *sg; - struct nvme_prps *prps; + struct nvme_prps *prps = NULL; - if (copy_from_user(&dlfw, udlfw, sizeof(dlfw))) + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; - if (dlfw.length >= (1 << 30)) - return -EINVAL; - length = dlfw.length * 4; - - nents = nvme_map_user_pages(dev, 1, dlfw.addr, length, &sg); - if (nents < 0) - return nents; memset(&c, 0, sizeof(c)); - c.dlfw.opcode = nvme_admin_download_fw; - c.dlfw.numd = cpu_to_le32(dlfw.length); - c.dlfw.offset = cpu_to_le32(dlfw.offset); - prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); - if (length != dlfw.length * 4) + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + + length = cmd.data_len; + if (cmd.data_len) { + nents = nvme_map_user_pages(dev, 1, cmd.addr, length, &sg); + if (nents < 0) + return nents; + prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); + } + + if (length != cmd.data_len) status = -ENOMEM; else status = nvme_submit_admin_cmd(dev, &c, NULL); - nvme_unmap_user_pages(dev, 0, dlfw.addr, dlfw.length * 4, sg, nents); - nvme_free_prps(dev, prps); + if (cmd.data_len) { + nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, sg, + nents); + nvme_free_prps(dev, prps); + } return status; } -static int nvme_activate_firmware(struct nvme_ns *ns, unsigned long arg) -{ - struct nvme_dev *dev = ns->dev; - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_activate_fw; - c.common.rsvd10[0] = cpu_to_le32(arg); - - return nvme_submit_admin_cmd(dev, &c, NULL); -} - static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns *ns = bdev->bd_disk->private_data; 
switch (cmd) { - case NVME_IOCTL_IDENTIFY_NS: - return nvme_identify(ns, arg, 0); - case NVME_IOCTL_IDENTIFY_CTRL: - return nvme_identify(ns, arg, 1); - case NVME_IOCTL_GET_RANGE_TYPE: - return nvme_get_range_type(ns, arg); + case NVME_IOCTL_ID: + return ns->ns_id; + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_admin_cmd(ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); - case NVME_IOCTL_DOWNLOAD_FW: - return nvme_download_firmware(ns, (void __user *)arg); - case NVME_IOCTL_ACTIVATE_FW: - return nvme_activate_firmware(ns, arg); default: return -ENOTTY; } -- cgit From 6413214c5d424fd5aae6567848340f962ad2ce0f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 9 Aug 2011 12:56:37 -0400 Subject: Fix bug in NVME_IOCTL_SUBMIT_IO Missing 'break' in the switch statement meant that we'd fall through to the 'return -EINVAL' case. --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 9e3c724b95c3..0956e1241520 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1054,6 +1054,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) case nvme_cmd_compare: nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length, &sg); + break; default: return -EINVAL; } -- cgit From d0ba1e497bca83a3d353eb47c9658afc54d83228 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 13 Sep 2011 17:01:39 -0400 Subject: NVMe: Correct sg list setup in nvme_map_user_pages Our SG list was constructed to always fill the entire first page, even if that was more than the length of the I/O. This is probably harmless, but some IOMMUs might do something bad. Correcting the first call to sg_set_page() made it look a lot closer to the sg_set_page() in the loop, so fold the first call to sg_set_page() into the loop. Reported-by: Nisheeth Bhat Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 0956e1241520..5843409cac6d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -996,11 +996,11 @@ static int nvme_map_user_pages(struct nvme_dev *dev, int write, sg = kcalloc(count, sizeof(*sg), GFP_KERNEL); sg_init_table(sg, count); - sg_set_page(&sg[0], pages[0], PAGE_SIZE - offset, offset); - length -= (PAGE_SIZE - offset); - for (i = 1; i < count; i++) { - sg_set_page(&sg[i], pages[i], min_t(int, length, PAGE_SIZE), 0); - length -= PAGE_SIZE; + for (i = 0; i < count; i++) { + sg_set_page(&sg[i], pages[i], + min_t(int, length, PAGE_SIZE - offset), offset); + length -= (PAGE_SIZE - offset); + offset = 0; } err = -ENOMEM; -- cgit From d1a490e026efb22851ed60588b5fad1281d80ec3 Mon Sep 17 00:00:00 2001 From: Nisheeth Bhat Date: Thu, 15 Sep 2011 16:52:24 -0400 Subject: NVMe: Fix calls to dma_unmap_sg dma_unmap_sg() must be called with the same 'nents' passed to dma_map_sg(), not the number returned from dma_map_sg(). 
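A minimal sketch of the DMA-API rule being applied (illustrative only; 'dev', 'sg' and 'count' are assumed to have been set up by the caller):

	int nents = count;	/* entries placed in the scatterlist */
	int mapped = dma_map_sg(dev, sg, nents, DMA_FROM_DEVICE);

	if (!mapped)
		return -ENOMEM;
	/* ... do the transfer using the 'mapped' (possibly coalesced) entries ... */
	dma_unmap_sg(dev, sg, nents, DMA_FROM_DEVICE);	/* same nents, not 'mapped' */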
Signed-off-by: Nisheeth Bhat Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 5843409cac6d..a7f82fbdaf87 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1021,13 +1021,12 @@ static int nvme_map_user_pages(struct nvme_dev *dev, int write, } static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, int length, - struct scatterlist *sg, int nents) + unsigned long addr, int length, struct scatterlist *sg) { int i, count; count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE); - dma_unmap_sg(&dev->pci_dev->dev, sg, nents, DMA_FROM_DEVICE); + dma_unmap_sg(&dev->pci_dev->dev, sg, count, DMA_FROM_DEVICE); for (i = 0; i < count; i++) put_page(sg_page(&sg[i])); @@ -1089,7 +1088,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) else status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); - nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); + nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg); nvme_free_prps(dev, prps); return status; } @@ -1135,8 +1134,7 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, else status = nvme_submit_admin_cmd(dev, &c, NULL); if (cmd.data_len) { - nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, sg, - nents); + nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, sg); nvme_free_prps(dev, prps); } return status; -- cgit From 684f5c2025b067a23722e620d0b3b858d8dc5d01 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 19 Sep 2011 17:14:53 -0400 Subject: NVMe: Fix memory leak in nvme_dev_add() The driver was allocating 8k of memory, then freeing 4k of it. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index a7f82fbdaf87..705f66ebd15f 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1457,7 +1457,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) list_for_each_entry(ns, &dev->namespaces, list) add_disk(ns->disk); - dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr); return 0; out_free: @@ -1466,7 +1466,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) nvme_ns_free(ns); } - dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); return res; } -- cgit From bc5fc7e4b22ca855902aba02b28c96f09b446407 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 19 Sep 2011 17:08:14 -0400 Subject: NVMe: Create nvme_identify and nvme_get_features functions Instead of open-coding calls to nvme_submit_admin_cmd, these small wrappers are simpler to use (the patch removes 14 lines from nvme_dev_add() for example). 
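For example, a caller that previously built a struct nvme_command by hand can now be written roughly as follows (sketch based on the wrappers introduced below; 'dev', 'nsid' and 'dma_addr' are assumed to exist in the caller):

	res = nvme_identify(dev, nsid, 0, dma_addr);	/* Identify Namespace */
	if (res)
		return -EIO;
	res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, nsid,
				dma_addr + 4096, NULL);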
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 76 +++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 33 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 705f66ebd15f..b77894a75855 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -812,6 +812,34 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } +static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, + dma_addr_t dma_addr) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.prp1 = cpu_to_le64(dma_addr); + c.identify.cns = cpu_to_le32(cns); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +static int nvme_get_features(struct nvme_dev *dev, unsigned fid, + unsigned dword11, dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + return nvme_submit_admin_cmd(dev, &c, result); +} + static void nvme_free_queue(struct nvme_dev *dev, int qid) { struct nvme_queue *nvmeq = dev->queues[qid]; @@ -1318,15 +1346,10 @@ static int set_queue_count(struct nvme_dev *dev, int count) { int status; u32 result; - struct nvme_command c; u32 q_count = (count - 1) | ((count - 1) << 16); - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES); - c.features.dword11 = cpu_to_le32(q_count); - - status = nvme_submit_admin_cmd(dev, &c, &result); + status = nvme_get_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, + &result); if (status) return -EIO; return min(result & 0xffff, result >> 16) + 1; @@ -1400,65 +1423,51 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) int res, nn, i; struct nvme_ns *ns, *next; struct nvme_id_ctrl *ctrl; - void *id; + struct nvme_id_ns *id_ns; + void *mem; dma_addr_t dma_addr; - struct nvme_command cid, crt; res = nvme_setup_io_queues(dev); if (res) return res; - /* XXX: Switch to a SG list once prp2 works */ - id = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, + mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, GFP_KERNEL); - memset(&cid, 0, sizeof(cid)); - cid.identify.opcode = nvme_admin_identify; - cid.identify.nsid = 0; - cid.identify.prp1 = cpu_to_le64(dma_addr); - cid.identify.cns = cpu_to_le32(1); - - res = nvme_submit_admin_cmd(dev, &cid, NULL); + res = nvme_identify(dev, 0, 1, dma_addr); if (res) { res = -EIO; goto out_free; } - ctrl = id; + ctrl = mem; nn = le32_to_cpup(&ctrl->nn); memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); - cid.identify.cns = 0; - memset(&crt, 0, sizeof(crt)); - crt.features.opcode = nvme_admin_get_features; - crt.features.prp1 = cpu_to_le64(dma_addr + 4096); - crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); - + id_ns = mem; for (i = 0; i <= nn; i++) { - cid.identify.nsid = cpu_to_le32(i); - res = nvme_submit_admin_cmd(dev, &cid, NULL); + res = nvme_identify(dev, i, 0, dma_addr); if (res) continue; - if (((struct nvme_id_ns *)id)->ncap == 0) + if (id_ns->ncap == 0) continue; - crt.features.nsid = cpu_to_le32(i); - res = nvme_submit_admin_cmd(dev, &crt, NULL); + res = 
nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, + dma_addr + 4096, NULL); if (res) continue; - ns = nvme_alloc_ns(dev, i, id, id + 4096); + ns = nvme_alloc_ns(dev, i, mem, mem + 4096); if (ns) list_add_tail(&ns->list, &dev->namespaces); } list_for_each_entry(ns, &dev->namespaces, list) add_disk(ns->disk); - dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr); - return 0; + goto out; out_free: list_for_each_entry_safe(ns, next, &dev->namespaces, list) { @@ -1466,6 +1475,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) nvme_ns_free(ns); } + out: dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); return res; } -- cgit From 0d1bc9125890426b52ca2de6abedd32e31722e5c Mon Sep 17 00:00:00 2001 From: Nisheeth Bhat Date: Thu, 29 Sep 2011 10:10:10 -0400 Subject: Fix calculation of number of pages in a PRP List The existing calculation underestimated the number of pages required as it did not take into account the pointer at the end of each page. The replacement calculation may overestimate the number of pages required if the last page in the PRP List is entirely full. By using ->npages as a counter as we fill in the pages, we ensure that we don't try to free a page that was never allocated. Signed-off-by: Nisheeth Bhat Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b77894a75855..3afdc750aaa8 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -265,7 +265,7 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) } struct nvme_prps { - int npages; + int npages; /* 0 means small pool in use */ dma_addr_t first_dma; __le64 *list[0]; }; @@ -347,7 +347,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, int offset = offset_in_page(dma_addr); __le64 *prp_list; dma_addr_t prp_dma; - int nprps, npages, i, prp_page; + int nprps, npages, i; struct nvme_prps *prps = NULL; cmd->prp1 = cpu_to_le64(dma_addr); @@ -370,20 +370,20 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, } nprps = DIV_ROUND_UP(length, PAGE_SIZE); - npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE); + npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, gfp); if (!prps) { cmd->prp2 = cpu_to_le64(dma_addr); *len = (*len - length) + PAGE_SIZE; return prps; } - prp_page = 0; + if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; prps->npages = 0; } else { pool = dev->prp_page_pool; - prps->npages = npages; + prps->npages = 1; } prp_list = dma_pool_alloc(pool, gfp, &prp_dma); @@ -393,7 +393,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, kfree(prps); return NULL; } - prps->list[prp_page++] = prp_list; + prps->list[0] = prp_list; prps->first_dma = prp_dma; cmd->prp2 = cpu_to_le64(prp_dma); i = 0; @@ -405,7 +405,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, *len = (*len - length); return prps; } - prps->list[prp_page++] = prp_list; + prps->list[prps->npages++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); i = 1; -- cgit From 2b2c1896871838cdf549442e8ad0264be5fa74e3 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 7 Oct 2011 13:10:13 -0400 Subject: NVMe: Don't probe namespace 0 ECN 001 documented that namespace 0 is not valid. Sending an Identify with CNS of 0 and Namespace of 0 is an undefined command. 
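In other words, the namespace IDs reported by Identify Controller run from 1 to nn, so the probe loop should look roughly like this (illustrative sketch):

	for (i = 1; i <= nn; i++) {	/* NSID 0 is not a valid namespace */
		if (nvme_identify(dev, i, 0, dma_addr))
			continue;
		/* ... examine the Identify Namespace data and attach the disk ... */
	}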
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 3afdc750aaa8..660aa5dfe569 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1447,7 +1447,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); id_ns = mem; - for (i = 0; i <= nn; i++) { + for (i = 1; i <= nn; i++) { res = nvme_identify(dev, i, 0, dma_addr); if (res) continue; -- cgit From ce38c149576fd0a3360fec3bef4012212d42e736 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 7 Oct 2011 13:20:37 -0400 Subject: NVMe: Version 0.7 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 660aa5dfe569..cfe5932821d8 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1700,6 +1700,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.6"); +MODULE_VERSION("0.7"); module_init(nvme_init); module_exit(nvme_exit); -- cgit From f1938f6e1ee1583c87ec74dc406fdd8694e99ac8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Oct 2011 17:00:41 -0400 Subject: NVMe: Implement doorbell stride capability The doorbell stride allows devices to spread out their doorbells instead of packing them tightly. This feature was added as part of ECN 003. This patch also enables support for more than 512 queues :-) Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index cfe5932821d8..a17f80fa3881 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -70,6 +70,7 @@ struct nvme_dev { struct dma_pool *prp_small_pool; int instance; int queue_count; + int db_stride; u32 ctrl_config; struct msix_entry *entry; struct nvme_bar __iomem *bar; @@ -672,7 +673,7 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) return IRQ_NONE; - writel(head, nvmeq->q_db + 1); + writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); nvmeq->cq_head = head; nvmeq->cq_phase = phase; @@ -889,7 +890,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, init_waitqueue_head(&nvmeq->sq_full); init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); bio_list_init(&nvmeq->sq_cong); - nvmeq->q_db = &dev->dbs[qid * 2]; + nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; @@ -981,6 +982,7 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) cap = readq(&dev->bar->cap); timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + dev->db_stride = NVME_CAP_STRIDE(cap); while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { msleep(100); @@ -1357,7 +1359,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) { - int result, cpu, i, nr_io_queues; + int result, cpu, i, nr_io_queues, db_bar_size; nr_io_queues = num_online_cpus(); result = set_queue_count(dev, nr_io_queues); @@ -1369,6 +1371,15 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) /* Deregister the admin queue's interrupt */ free_irq(dev->entry[0].vector, dev->queues[0]); + 
db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); + if (db_bar_size > 8192) { + iounmap(dev->bar); + dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), + db_bar_size); + dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->queues[0]->q_db = dev->dbs; + } + for (i = 0; i < nr_io_queues; i++) dev->entry[i].entry = i; for (;;) { -- cgit From 88523a61558a040546bf7d8b079ae0755d8e7005 Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Tue, 30 Aug 2011 08:34:26 -0600 Subject: block: Add driver for Micron RealSSD pcie flash cards This adds mtip32xx, a driver supporting Microns line of pci-express flash storage cards. Signed-off-by: Asai Thambi S P Signed-off-by: Sam Bradshaw Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 2 + drivers/block/Makefile | 1 + drivers/block/mtip32xx/Kconfig | 9 + drivers/block/mtip32xx/Makefile | 5 + drivers/block/mtip32xx/mtip32xx.c | 3581 +++++++++++++++++++++++++++++++++++++ drivers/block/mtip32xx/mtip32xx.h | 445 +++++ 6 files changed, 4043 insertions(+) create mode 100644 drivers/block/mtip32xx/Kconfig create mode 100644 drivers/block/mtip32xx/Makefile create mode 100644 drivers/block/mtip32xx/mtip32xx.c create mode 100644 drivers/block/mtip32xx/mtip32xx.h (limited to 'drivers/block') diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 6f07ec1c2f58..a30aa103f95b 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -116,6 +116,8 @@ config PARIDE source "drivers/block/paride/Kconfig" +source "drivers/block/mtip32xx/Kconfig" + config BLK_CPQ_DA tristate "Compaq SMART2 support" depends on PCI && VIRT_TO_BUS diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 76646e9a1c91..ad7b74a44ef3 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -39,5 +39,6 @@ obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o +obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig new file mode 100644 index 000000000000..b5dd14e072f2 --- /dev/null +++ b/drivers/block/mtip32xx/Kconfig @@ -0,0 +1,9 @@ +# +# mtip32xx device driver configuration +# + +config BLK_DEV_PCIESSD_MTIP32XX + tristate "Block Device Driver for Micron PCIe SSDs" + depends on HOTPLUG_PCI_PCIE + help + This enables the block driver for Micron PCIe SSDs. diff --git a/drivers/block/mtip32xx/Makefile b/drivers/block/mtip32xx/Makefile new file mode 100644 index 000000000000..4fbef8c8329b --- /dev/null +++ b/drivers/block/mtip32xx/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for Block device driver for Micron PCIe SSD +# + +obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx.o diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c new file mode 100644 index 000000000000..847b8ff7b8c2 --- /dev/null +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -0,0 +1,3581 @@ +/* + * Driver for the Micron P320 SSD + * Copyright (C) 2011 Micron Technology, Inc. + * + * Portions of this code were derived from works subjected to the + * following copyright: + * Copyright (C) 2009 Integrated Device Technology, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include <../drivers/ata/ahci.h> +#include "mtip32xx.h" + +#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) +#define HW_CMD_TBL_SZ (AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16)) +#define HW_CMD_TBL_AR_SZ (HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS) +#define HW_PORT_PRIV_DMA_SZ \ + (HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ) + +#define HOST_HSORG 0xFC +#define HSORG_DISABLE_SLOTGRP_INTR (1<<24) +#define HSORG_DISABLE_SLOTGRP_PXIS (1<<16) +#define HSORG_HWREV 0xFF00 +#define HSORG_STYLE 0x8 +#define HSORG_SLOTGROUPS 0x7 + +#define PORT_COMMAND_ISSUE 0x38 +#define PORT_SDBV 0x7C + +#define PORT_OFFSET 0x100 +#define PORT_MEM_SIZE 0x80 + +#define PORT_IRQ_ERR \ + (PORT_IRQ_HBUS_ERR | PORT_IRQ_IF_ERR | PORT_IRQ_CONNECT | \ + PORT_IRQ_PHYRDY | PORT_IRQ_UNK_FIS | PORT_IRQ_BAD_PMP | \ + PORT_IRQ_TF_ERR | PORT_IRQ_HBUS_DATA_ERR | PORT_IRQ_IF_NONFATAL | \ + PORT_IRQ_OVERFLOW) +#define PORT_IRQ_LEGACY \ + (PORT_IRQ_PIOS_FIS | PORT_IRQ_D2H_REG_FIS) +#define PORT_IRQ_HANDLED \ + (PORT_IRQ_SDB_FIS | PORT_IRQ_LEGACY | \ + PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR | \ + PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY) +#define DEF_PORT_IRQ \ + (PORT_IRQ_ERR | PORT_IRQ_LEGACY | PORT_IRQ_SDB_FIS) + +/* product numbers */ +#define MTIP_PRODUCT_UNKNOWN 0x00 +#define MTIP_PRODUCT_ASICFPGA 0x11 + +/* Device instance number, incremented each time a device is probed. */ +static int instance; + +/* + * Global variable used to hold the major block device number + * allocated in mtip_init(). + */ +int mtip_major; + +static DEFINE_SPINLOCK(rssd_index_lock); +static DEFINE_IDA(rssd_index_ida); + +struct mtip_compat_ide_task_request_s { + __u8 io_ports[8]; + __u8 hob_ports[8]; + ide_reg_valid_t out_flags; + ide_reg_valid_t in_flags; + int data_phase; + int req_cmd; + compat_ulong_t out_size; + compat_ulong_t in_size; +}; + +static int mtip_exec_internal_command(struct mtip_port *port, + void *fis, + int fisLen, + dma_addr_t buffer, + int bufLen, + u32 opts, + gfp_t atomic, + unsigned long timeout); + +/* + * Obtain an empty command slot. + * + * This function needs to be reentrant since it could be called + * at the same time on multiple CPUs. The allocation of the + * command slot must be atomic. + * + * @port Pointer to the port data structure. + * + * return value + * >= 0 Index of command slot obtained. + * -1 No command slots available. + */ +static int get_slot(struct mtip_port *port) +{ + int slot, i; + unsigned int num_command_slots = port->dd->slot_groups * 32; + + /* + * Try 10 times, because there is a small race here. + * that's ok, because it's still cheaper than a lock. + * + * Race: Since this section is not protected by lock, same bit + * could be chosen by different process contexts running in + * different processor. So instead of costly lock, we are going + * with loop. 
+ */ + for (i = 0; i < 10; i++) { + slot = find_next_zero_bit(port->allocated, + num_command_slots, 1); + if ((slot < num_command_slots) && + (!test_and_set_bit(slot, port->allocated))) + return slot; + } + dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n"); + + if (mtip_check_surprise_removal(port->dd->pdev)) { + /* Device not present, clean outstanding commands */ + mtip_command_cleanup(port->dd); + } + return -1; +} + +/* + * Release a command slot. + * + * @port Pointer to the port data structure. + * @tag Tag of command to release + * + * return value + * None + */ +static inline void release_slot(struct mtip_port *port, int tag) +{ + smp_mb__before_clear_bit(); + clear_bit(tag, port->allocated); + smp_mb__after_clear_bit(); +} + +/* + * Issue a command to the hardware. + * + * Set the appropriate bit in the s_active and Command Issue hardware + * registers, causing hardware command processing to begin. + * + * @port Pointer to the port structure. + * @tag The tag of the command to be issued. + * + * return value + * None + */ +static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) +{ + unsigned long flags = 0; + + atomic_set(&port->commands[tag].active, 1); + + spin_lock_irqsave(&port->cmd_issue_lock, flags); + + writel((1 << MTIP_TAG_BIT(tag)), + port->s_active[MTIP_TAG_INDEX(tag)]); + writel((1 << MTIP_TAG_BIT(tag)), + port->cmd_issue[MTIP_TAG_INDEX(tag)]); + + spin_unlock_irqrestore(&port->cmd_issue_lock, flags); +} + +/* + * Called periodically to see if any read/write commands are + * taking too long to complete. + * + * @data Pointer to the PORT data structure. + * + * return value + * None + */ +void mtip_timeout_function(unsigned long int data) +{ + struct mtip_port *port = (struct mtip_port *) data; + struct host_to_dev_fis *fis; + struct mtip_cmd *command; + int tag, cmdto_cnt = 0; + unsigned int bit, group; + unsigned int num_command_slots = port->dd->slot_groups * 32; + + if (unlikely(!port)) + return; + + if (atomic_read(&port->dd->resumeflag) == true) { + mod_timer(&port->cmd_timer, + jiffies + msecs_to_jiffies(30000)); + return; + } + + for (tag = 0; tag < num_command_slots; tag++) { + /* + * Skip internal command slot as it has + * its own timeout mechanism + */ + if (tag == MTIP_TAG_INTERNAL) + continue; + + if (atomic_read(&port->commands[tag].active) && + (time_after(jiffies, port->commands[tag].comp_time))) { + group = tag >> 5; + bit = tag & 0x1f; + + command = &port->commands[tag]; + fis = (struct host_to_dev_fis *) command->command; + + dev_warn(&port->dd->pdev->dev, + "Timeout for command tag %d\n", tag); + + cmdto_cnt++; + if (cmdto_cnt == 1) + atomic_inc(&port->dd->eh_active); + + /* + * Clear the completed bit. This should prevent + * any interrupt handlers from trying to retire + * the command. + */ + writel(1 << bit, port->completed[group]); + + /* Call the async completion callback. */ + if (likely(command->async_callback)) + command->async_callback(command->async_data, + -EIO); + command->async_callback = NULL; + command->comp_func = NULL; + + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&port->dd->pdev->dev, + command->sg, + command->scatter_ents, + command->direction); + + /* + * Clear the allocated bit and active tag for the + * command. 
+ */ + atomic_set(&port->commands[tag].active, 0); + release_slot(port, tag); + + up(&port->cmd_slot); + } + } + + if (cmdto_cnt) { + dev_warn(&port->dd->pdev->dev, + "%d commands timed out: restarting port", + cmdto_cnt); + mtip_restart_port(port); + atomic_dec(&port->dd->eh_active); + } + + /* Restart the timer */ + mod_timer(&port->cmd_timer, + jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); +} + +/* + * IO completion function. + * + * This completion function is called by the driver ISR when a + * command that was issued by the kernel completes. It first calls the + * asynchronous completion function which normally calls back into the block + * layer passing the asynchronous callback data, then unmaps the + * scatter list associated with the completed command, and finally + * clears the allocated bit associated with the completed command. + * + * @port Pointer to the port data structure. + * @tag Tag of the command. + * @data Pointer to driver_data. + * @status Completion status. + * + * return value + * None + */ +static void mtip_async_complete(struct mtip_port *port, + int tag, + void *data, + int status) +{ + struct mtip_cmd *command; + struct driver_data *dd = data; + int cb_status = status ? -EIO : 0; + + if (unlikely(!dd) || unlikely(!port)) + return; + + command = &port->commands[tag]; + + if (unlikely(status == PORT_IRQ_TF_ERR)) { + dev_warn(&port->dd->pdev->dev, + "Command tag %d failed due to TFE\n", tag); + } + + /* Upper layer callback */ + if (likely(command->async_callback)) + command->async_callback(command->async_data, cb_status); + + command->async_callback = NULL; + command->comp_func = NULL; + + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&dd->pdev->dev, + command->sg, + command->scatter_ents, + command->direction); + + /* Clear the allocated and active bits for the command */ + atomic_set(&port->commands[tag].active, 0); + release_slot(port, tag); + + up(&port->cmd_slot); +} + +/* + * Internal command completion callback function. + * + * This function is normally called by the driver ISR when an internal + * command completed. This function signals the command completion by + * calling complete(). + * + * @port Pointer to the port data structure. + * @tag Tag of the command that has completed. + * @data Pointer to a completion structure. + * @status Completion status. 
+ * + * return value + * None + */ +static void mtip_completion(struct mtip_port *port, + int tag, + void *data, + int status) +{ + struct mtip_cmd *command = &port->commands[tag]; + struct completion *waiting = data; + if (unlikely(status == PORT_IRQ_TF_ERR)) + dev_warn(&port->dd->pdev->dev, + "Internal command %d completed with TFE\n", tag); + + command->async_callback = NULL; + command->comp_func = NULL; + + complete(waiting); +} + +/* + * Enable/disable the reception of FIS + * + * @port Pointer to the port data structure + * @enable 1 to enable, 0 to disable + * + * return value + * Previous state: 1 enabled, 0 disabled + */ +static int mtip_enable_fis(struct mtip_port *port, int enable) +{ + u32 tmp; + + /* enable FIS reception */ + tmp = readl(port->mmio + PORT_CMD); + if (enable) + writel(tmp | PORT_CMD_FIS_RX, port->mmio + PORT_CMD); + else + writel(tmp & ~PORT_CMD_FIS_RX, port->mmio + PORT_CMD); + + /* Flush */ + readl(port->mmio + PORT_CMD); + + return (((tmp & PORT_CMD_FIS_RX) == PORT_CMD_FIS_RX)); +} + +/* + * Enable/disable the DMA engine + * + * @port Pointer to the port data structure + * @enable 1 to enable, 0 to disable + * + * return value + * Previous state: 1 enabled, 0 disabled. + */ +static int mtip_enable_engine(struct mtip_port *port, int enable) +{ + u32 tmp; + + /* enable FIS reception */ + tmp = readl(port->mmio + PORT_CMD); + if (enable) + writel(tmp | PORT_CMD_START, port->mmio + PORT_CMD); + else + writel(tmp & ~PORT_CMD_START, port->mmio + PORT_CMD); + + readl(port->mmio + PORT_CMD); + return (((tmp & PORT_CMD_START) == PORT_CMD_START)); +} + +/* + * Enables the port DMA engine and FIS reception. + * + * return value + * None + */ +static inline void mtip_start_port(struct mtip_port *port) +{ + /* Enable FIS reception */ + mtip_enable_fis(port, 1); + + /* Enable the DMA engine */ + mtip_enable_engine(port, 1); +} + +/* + * Deinitialize a port by disabling port interrupts, the DMA engine, + * and FIS reception. + * + * @port Pointer to the port structure + * + * return value + * None + */ +static inline void mtip_deinit_port(struct mtip_port *port) +{ + /* Disable interrupts on this port */ + writel(0, port->mmio + PORT_IRQ_MASK); + + /* Disable the DMA engine */ + mtip_enable_engine(port, 0); + + /* Disable FIS reception */ + mtip_enable_fis(port, 0); +} + +/* + * Initialize a port. + * + * This function deinitializes the port by calling mtip_deinit_port() and + * then initializes it by setting the command header and RX FIS addresses, + * clearing the SError register and any pending port interrupts before + * re-enabling the default set of port interrupts. + * + * @port Pointer to the port structure. 
+ * + * return value + * None + */ +static void mtip_init_port(struct mtip_port *port) +{ + int i; + mtip_deinit_port(port); + + /* Program the command list base and FIS base addresses */ + if (readl(port->dd->mmio + HOST_CAP) & HOST_CAP_64) { + writel((port->command_list_dma >> 16) >> 16, + port->mmio + PORT_LST_ADDR_HI); + writel((port->rxfis_dma >> 16) >> 16, + port->mmio + PORT_FIS_ADDR_HI); + } + + writel(port->command_list_dma & 0xffffffff, + port->mmio + PORT_LST_ADDR); + writel(port->rxfis_dma & 0xffffffff, port->mmio + PORT_FIS_ADDR); + + /* Clear SError */ + writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR); + + /* reset the completed registers.*/ + for (i = 0; i < port->dd->slot_groups; i++) + writel(0xFFFFFFFF, port->completed[i]); + + /* Clear any pending interrupts for this port */ + writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT); + + /* Enable port interrupts */ + writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK); +} + +/* + * Reset the HBA (without sleeping) + * + * Just like hba_reset, except does not call sleep, so can be + * run from interrupt/tasklet context. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 The reset was successful. + * -1 The HBA Reset bit did not clear. + */ +int hba_reset_nosleep(struct driver_data *dd) +{ + unsigned long timeout; + + /* Chip quirk: quiesce any chip function */ + mdelay(10); + + /* Set the reset bit */ + writel(HOST_RESET, dd->mmio + HOST_CTL); + + /* Flush */ + readl(dd->mmio + HOST_CTL); + + /* + * Wait 10ms then spin for up to 1 second + * waiting for reset acknowledgement + */ + timeout = jiffies + msecs_to_jiffies(1000); + mdelay(10); + while ((readl(dd->mmio + HOST_CTL) & HOST_RESET) + && time_before(jiffies, timeout)) + mdelay(1); + + if (readl(dd->mmio + HOST_CTL) & HOST_RESET) + return -1; + + return 0; +} + +/* + * Restart a port + * + * @port Pointer to the port data structure. 
+ * + * return value + * None + */ +void mtip_restart_port(struct mtip_port *port) +{ + unsigned long timeout; + + /* Disable the DMA engine */ + mtip_enable_engine(port, 0); + + /* Chip quirk: wait up to 500ms for PxCMD.CR == 0 */ + timeout = jiffies + msecs_to_jiffies(500); + while ((readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) + && time_before(jiffies, timeout)) + ; + + /* + * Chip quirk: escalate to hba reset if + * PxCMD.CR not clear after 500 ms + */ + if (readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) { + dev_warn(&port->dd->pdev->dev, + "PxCMD.CR not clear, escalating reset\n"); + + if (hba_reset_nosleep(port->dd)) + dev_err(&port->dd->pdev->dev, + "HBA reset escalation failed.\n"); + + /* 30 ms delay before com reset to quiesce chip */ + mdelay(30); + } + + dev_warn(&port->dd->pdev->dev, "Issuing COM reset\n"); + + /* Set PxSCTL.DET */ + writel(readl(port->mmio + PORT_SCR_CTL) | + 1, port->mmio + PORT_SCR_CTL); + readl(port->mmio + PORT_SCR_CTL); + + /* Wait 1 ms to quiesce chip function */ + timeout = jiffies + msecs_to_jiffies(1); + while (time_before(jiffies, timeout)) + ; + + /* Clear PxSCTL.DET */ + writel(readl(port->mmio + PORT_SCR_CTL) & ~1, + port->mmio + PORT_SCR_CTL); + readl(port->mmio + PORT_SCR_CTL); + + /* Wait 500 ms for bit 0 of PORT_SCR_STS to be set */ + timeout = jiffies + msecs_to_jiffies(500); + while (((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0) + && time_before(jiffies, timeout)) + ; + + if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0) + dev_warn(&port->dd->pdev->dev, + "COM reset failed\n"); + + /* Clear SError, the PxSERR.DIAG.x should be set so clear it */ + writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR); + + /* Enable the DMA engine */ + mtip_enable_engine(port, 1); +} + +/* + * Helper function for tag logging + */ +static void print_tags(struct driver_data *dd, + char *msg, + unsigned long *tagbits) +{ + unsigned int tag, count = 0; + + for (tag = 0; tag < (dd->slot_groups) * 32; tag++) { + if (test_bit(tag, tagbits)) + count++; + } + if (count) + dev_info(&dd->pdev->dev, "%s [%i tags]\n", msg, count); +} + +/* + * Handle an error. + * + * @dd Pointer to the DRIVER_DATA structure. + * + * return value + * None + */ +static void mtip_handle_tfe(struct driver_data *dd) +{ + int group, tag, bit, reissue; + struct mtip_port *port; + struct mtip_cmd *command; + u32 completed; + struct host_to_dev_fis *fis; + unsigned long tagaccum[SLOTBITS_IN_LONGS]; + + dev_warn(&dd->pdev->dev, "Taskfile error\n"); + + port = dd->port; + + /* Stop the timer to prevent command timeouts. 
*/ + del_timer(&port->cmd_timer); + + /* Set eh_active */ + atomic_inc(&dd->eh_active); + + /* Loop through all the groups */ + for (group = 0; group < dd->slot_groups; group++) { + completed = readl(port->completed[group]); + + /* clear completed status register in the hardware.*/ + writel(completed, port->completed[group]); + + /* clear the tag accumulator */ + memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + + /* Process successfully completed commands */ + for (bit = 0; bit < 32 && completed; bit++) { + if (!(completed & (1<commands[tag]; + if (likely(command->comp_func)) { + set_bit(tag, tagaccum); + atomic_set(&port->commands[tag].active, 0); + command->comp_func(port, + tag, + command->comp_data, + 0); + } else { + dev_err(&port->dd->pdev->dev, + "Missing completion func for tag %d", + tag); + if (mtip_check_surprise_removal(dd->pdev)) { + mtip_command_cleanup(dd); + /* don't proceed further */ + return; + } + } + } + } + print_tags(dd, "TFE tags completed:", tagaccum); + + /* Restart the port */ + mdelay(20); + mtip_restart_port(port); + + /* clear the tag accumulator */ + memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + + /* Loop through all the groups */ + for (group = 0; group < dd->slot_groups; group++) { + for (bit = 0; bit < 32; bit++) { + reissue = 1; + tag = (group << 5) + bit; + + /* If the active bit is set re-issue the command */ + if (atomic_read(&port->commands[tag].active) == 0) + continue; + + fis = (struct host_to_dev_fis *) + port->commands[tag].command; + + /* Should re-issue? */ + if (tag == MTIP_TAG_INTERNAL || + fis->command == ATA_CMD_SET_FEATURES) + reissue = 0; + + /* + * First check if this command has + * exceeded its retries. + */ + if (reissue && + (port->commands[tag].retries-- > 0)) { + + set_bit(tag, tagaccum); + + /* Update the timeout value. */ + port->commands[tag].comp_time = + jiffies + msecs_to_jiffies( + MTIP_NCQ_COMMAND_TIMEOUT_MS); + /* Re-issue the command. */ + mtip_issue_ncq_command(port, tag); + + continue; + } + + /* Retire a command that will not be reissued */ + dev_warn(&port->dd->pdev->dev, + "retiring tag %d\n", tag); + atomic_set(&port->commands[tag].active, 0); + + if (port->commands[tag].comp_func) + port->commands[tag].comp_func( + port, + tag, + port->commands[tag].comp_data, + PORT_IRQ_TF_ERR); + else + dev_warn(&port->dd->pdev->dev, + "Bad completion for tag %d\n", + tag); + } + } + print_tags(dd, "TFE tags reissued:", tagaccum); + + /* Decrement eh_active */ + atomic_dec(&dd->eh_active); + + mod_timer(&port->cmd_timer, + jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); +} + +/* + * Handle a set device bits interrupt + */ +static inline void mtip_process_sdbf(struct driver_data *dd) +{ + struct mtip_port *port = dd->port; + int group, tag, bit; + u32 completed; + struct mtip_cmd *command; + + /* walk all bits in all slot groups */ + for (group = 0; group < dd->slot_groups; group++) { + completed = readl(port->completed[group]); + + /* clear completed status register in the hardware.*/ + writel(completed, port->completed[group]); + + /* Process completed commands. */ + for (bit = 0; + (bit < 32) && completed; + bit++, completed >>= 1) { + if (completed & 0x01) { + tag = (group << 5) | bit; + + /* skip internal command slot. 
*/ + if (unlikely(tag == MTIP_TAG_INTERNAL)) + continue; + + command = &port->commands[tag]; + + /* make internal callback */ + if (likely(command->comp_func)) { + command->comp_func( + port, + tag, + command->comp_data, + 0); + } else { + dev_warn(&dd->pdev->dev, + "Null completion " + "for tag %d", + tag); + + if (mtip_check_surprise_removal( + dd->pdev)) { + mtip_command_cleanup(dd); + return; + } + } + } + } + } +} + +/* + * Process legacy pio and d2h interrupts + */ +static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) +{ + struct mtip_port *port = dd->port; + struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL]; + + if (port->internal_cmd_in_progress && + cmd != NULL && + !(readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + & (1 << MTIP_TAG_INTERNAL))) { + if (cmd->comp_func) { + cmd->comp_func(port, + MTIP_TAG_INTERNAL, + cmd->comp_data, + 0); + return; + } + } + + dev_warn(&dd->pdev->dev, "IRQ status 0x%x ignored.\n", port_stat); + + return; +} + +/* + * Demux and handle errors + */ +static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) +{ + if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) + mtip_handle_tfe(dd); + + if (unlikely(port_stat & PORT_IRQ_CONNECT)) { + dev_warn(&dd->pdev->dev, + "Clearing PxSERR.DIAG.x\n"); + writel((1 << 26), dd->port->mmio + PORT_SCR_ERR); + } + + if (unlikely(port_stat & PORT_IRQ_PHYRDY)) { + dev_warn(&dd->pdev->dev, + "Clearing PxSERR.DIAG.n\n"); + writel((1 << 16), dd->port->mmio + PORT_SCR_ERR); + } + + if (unlikely(port_stat & ~PORT_IRQ_HANDLED)) { + dev_warn(&dd->pdev->dev, + "Port stat errors %x unhandled\n", + (port_stat & ~PORT_IRQ_HANDLED)); + } +} + +static inline irqreturn_t mtip_handle_irq(struct driver_data *data) +{ + struct driver_data *dd = (struct driver_data *) data; + struct mtip_port *port = dd->port; + u32 hba_stat, port_stat; + int rv = IRQ_NONE; + + hba_stat = readl(dd->mmio + HOST_IRQ_STAT); + if (hba_stat) { + rv = IRQ_HANDLED; + + /* Acknowledge the interrupt status on the port.*/ + port_stat = readl(port->mmio + PORT_IRQ_STAT); + writel(port_stat, port->mmio + PORT_IRQ_STAT); + + /* Demux port status */ + if (likely(port_stat & PORT_IRQ_SDB_FIS)) + mtip_process_sdbf(dd); + + if (unlikely(port_stat & PORT_IRQ_ERR)) { + if (unlikely(mtip_check_surprise_removal(dd->pdev))) { + mtip_command_cleanup(dd); + /* don't proceed further */ + return IRQ_HANDLED; + } + + mtip_process_errors(dd, port_stat & PORT_IRQ_ERR); + } + + if (unlikely(port_stat & PORT_IRQ_LEGACY)) + mtip_process_legacy(dd, port_stat & PORT_IRQ_LEGACY); + } + + /* acknowledge interrupt */ + writel(hba_stat, dd->mmio + HOST_IRQ_STAT); + + return rv; +} + +/* + * Wrapper for mtip_handle_irq + * (ignores return code) + */ +static void mtip_tasklet(unsigned long data) +{ + mtip_handle_irq((struct driver_data *) data); +} + +/* + * HBA interrupt subroutine. + * + * @irq IRQ number. + * @instance Pointer to the driver data structure. + * + * return value + * IRQ_HANDLED A HBA interrupt was pending and handled. + * IRQ_NONE This interrupt was not for the HBA. 
+ */ +static irqreturn_t mtip_irq_handler(int irq, void *instance) +{ + struct driver_data *dd = instance; + tasklet_schedule(&dd->tasklet); + return IRQ_HANDLED; +} + +static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) +{ + atomic_set(&port->commands[tag].active, 1); + writel(1 << MTIP_TAG_BIT(tag), + port->cmd_issue[MTIP_TAG_INDEX(tag)]); +} + +/* + * Wait for port to quiesce + * + * @port Pointer to port data structure + * @timeout Max duration to wait (ms) + * + * return value + * 0 Success + * -EBUSY Commands still active + */ +static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) +{ + unsigned long to; + unsigned int n, active; + + to = jiffies + msecs_to_jiffies(timeout); + do { + /* + * Ignore s_active bit 0 of array element 0. + * This bit will always be set + */ + active = readl(port->s_active[0]) & 0xfffffffe; + for (n = 1; n < port->dd->slot_groups; n++) + active |= readl(port->s_active[n]); + + if (!active) + break; + + msleep(20); + } while (time_before(jiffies, to)); + + return active ? -EBUSY : 0; +} + +/* + * Execute an internal command and wait for the completion. + * + * @port Pointer to the port data structure. + * @fis Pointer to the FIS that describes the command. + * @fisLen Length in WORDS of the FIS. + * @buffer DMA accessible for command data. + * @bufLen Length, in bytes, of the data buffer. + * @opts Command header options, excluding the FIS length + * and the number of PRD entries. + * @timeout Time in ms to wait for the command to complete. + * + * return value + * 0 Command completed successfully. + * -EFAULT The buffer address is not correctly aligned. + * -EBUSY Internal command or other IO in progress. + * -EAGAIN Time out waiting for command to complete. + */ +static int mtip_exec_internal_command(struct mtip_port *port, + void *fis, + int fisLen, + dma_addr_t buffer, + int bufLen, + u32 opts, + gfp_t atomic, + unsigned long timeout) +{ + struct mtip_cmd_sg *command_sg; + DECLARE_COMPLETION_ONSTACK(wait); + int rv = 0; + struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL]; + + /* Make sure the buffer is 8 byte aligned. This is asic specific. */ + if (buffer & 0x00000007) { + dev_err(&port->dd->pdev->dev, + "SG buffer is not 8 byte aligned\n"); + return -EFAULT; + } + + /* Only one internal command should be running at a time */ + if (test_and_set_bit(MTIP_TAG_INTERNAL, port->allocated)) { + dev_warn(&port->dd->pdev->dev, + "Internal command already active\n"); + return -EBUSY; + } + port->internal_cmd_in_progress = 1; + + if (atomic == GFP_KERNEL) { + /* wait for io to complete if non atomic */ + if (mtip_quiesce_io(port, 5000) < 0) { + dev_warn(&port->dd->pdev->dev, + "Failed to quiesce IO\n"); + release_slot(port, MTIP_TAG_INTERNAL); + port->internal_cmd_in_progress = 0; + return -EBUSY; + } + + /* Set the completion function and data for the command. 
*/ + int_cmd->comp_data = &wait; + int_cmd->comp_func = mtip_completion; + + } else { + /* Clear completion - we're going to poll */ + int_cmd->comp_data = NULL; + int_cmd->comp_func = NULL; + } + + /* Copy the command to the command table */ + memcpy(int_cmd->command, fis, fisLen*4); + + /* Populate the SG list */ + int_cmd->command_header->opts = + cpu_to_le32(opts | fisLen); + if (bufLen) { + command_sg = int_cmd->command + AHCI_CMD_TBL_HDR_SZ; + + command_sg->info = cpu_to_le32((bufLen-1) & 0x3fffff); + command_sg->dba = cpu_to_le32(buffer & 0xffffffff); + command_sg->dba_upper = cpu_to_le32((buffer >> 16) >> 16); + + int_cmd->command_header->opts |= cpu_to_le32((1 << 16)); + } + + /* Populate the command header */ + int_cmd->command_header->byte_count = 0; + + /* Issue the command to the hardware */ + mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL); + + /* Poll if atomic, wait_for_completion otherwise */ + if (atomic == GFP_KERNEL) { + /* Wait for the command to complete or timeout. */ + if (wait_for_completion_timeout( + &wait, + msecs_to_jiffies(timeout)) == 0) { + dev_err(&port->dd->pdev->dev, + "Internal command did not complete [%d]\n", + atomic); + rv = -EAGAIN; + } + + if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + & (1 << MTIP_TAG_INTERNAL)) { + dev_warn(&port->dd->pdev->dev, + "Retiring internal command but CI is 1.\n"); + } + + } else { + /* Spin for checking if command still outstanding */ + timeout = jiffies + msecs_to_jiffies(timeout); + + while ((readl( + port->cmd_issue[MTIP_TAG_INTERNAL]) + & (1 << MTIP_TAG_INTERNAL)) + && time_before(jiffies, timeout)) + ; + + if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + & (1 << MTIP_TAG_INTERNAL)) { + dev_err(&port->dd->pdev->dev, + "Internal command did not complete [%d]\n", + atomic); + rv = -EAGAIN; + } + } + + /* Clear the allocated and active bits for the internal command. */ + atomic_set(&int_cmd->active, 0); + release_slot(port, MTIP_TAG_INTERNAL); + port->internal_cmd_in_progress = 0; + + return rv; +} + +/* + * Byte-swap ATA ID strings. + * + * ATA identify data contains strings in byte-swapped 16-bit words. + * They must be swapped (on all architectures) to be usable as C strings. + * This function swaps bytes in-place. + * + * @buf The buffer location of the string + * @len The number of bytes to swap + * + * return value + * None + */ +static inline void ata_swap_string(u16 *buf, unsigned int len) +{ + int i; + for (i = 0; i < (len/2); i++) + be16_to_cpus(&buf[i]); +} + +/* + * Request the device identity information. + * + * If a user space buffer is not specified, i.e. is NULL, the + * identify information is still read from the drive and placed + * into the identify data buffer (@e port->identify) in the + * port data structure. + * When the identify buffer contains valid identify information @e + * port->identify_valid is non-zero. + * + * @port Pointer to the port structure. + * @user_buffer A user space buffer where the identify data should be + * copied. + * + * return value + * 0 Command completed successfully. + * -EFAULT An error occurred while coping data to the user buffer. + * -1 Command failed. + */ +static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) +{ + int rv = 0; + struct host_to_dev_fis fis; + + down_write(&port->dd->internal_sem); + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_ID_ATA; + + /* Set the identify information as invalid. 
*/ + port->identify_valid = 0; + + /* Clear the identify information. */ + memset(port->identify, 0, sizeof(u16) * ATA_ID_WORDS); + + /* Execute the command. */ + if (mtip_exec_internal_command(port, + &fis, + 5, + port->identify_dma, + sizeof(u16) * ATA_ID_WORDS, + 0, + GFP_KERNEL, + MTIP_INTERNAL_COMMAND_TIMEOUT_MS) + < 0) { + rv = -1; + goto out; + } + + /* + * Perform any necessary byte-swapping. Yes, the kernel does in fact + * perform field-sensitive swapping on the string fields. + * See the kernel use of ata_id_string() for proof of this. + */ +#ifdef __LITTLE_ENDIAN + ata_swap_string(port->identify + 27, 40); /* model string*/ + ata_swap_string(port->identify + 23, 8); /* firmware string*/ + ata_swap_string(port->identify + 10, 20); /* serial# string*/ +#else + { + int i; + for (i = 0; i < ATA_ID_WORDS; i++) + port->identify[i] = le16_to_cpu(port->identify[i]); + } +#endif + + /* Set the identify buffer as valid. */ + port->identify_valid = 1; + + if (user_buffer) { + if (copy_to_user( + user_buffer, + port->identify, + ATA_ID_WORDS * sizeof(u16))) { + rv = -EFAULT; + goto out; + } + } + +out: + up_write(&port->dd->internal_sem); + return rv; +} + +/* + * Issue a standby immediate command to the device. + * + * @port Pointer to the port structure. + * + * return value + * 0 Command was executed successfully. + * -1 An error occurred while executing the command. + */ +static int mtip_standby_immediate(struct mtip_port *port) +{ + int rv; + struct host_to_dev_fis fis; + + down_write(&port->dd->internal_sem); + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_STANDBYNOW1; + + /* Execute the command. Use a 15-second timeout for large drives. */ + rv = mtip_exec_internal_command(port, + &fis, + 5, + 0, + 0, + 0, + GFP_KERNEL, + 15000); + + up_write(&port->dd->internal_sem); + + return rv; +} + +/* + * Get the drive capacity. + * + * @dd Pointer to the device data structure. + * @sectors Pointer to the variable that will receive the sector count. + * + * return value + * 1 Capacity was returned successfully. + * 0 The identify information is invalid. + */ +bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors) +{ + struct mtip_port *port = dd->port; + u64 total, raw0, raw1, raw2, raw3; + raw0 = port->identify[100]; + raw1 = port->identify[101]; + raw2 = port->identify[102]; + raw3 = port->identify[103]; + total = raw0 | raw1<<16 | raw2<<32 | raw3<<48; + *sectors = total; + return (bool) !!port->identify_valid; +} + +/* + * Reset the HBA. + * + * Resets the HBA by setting the HBA Reset bit in the Global + * HBA Control register. After setting the HBA Reset bit the + * function waits for 1 second before reading the HBA Reset + * bit to make sure it has cleared. If HBA Reset is not clear + * an error is returned. Cannot be used in non-blockable + * context. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 The reset was successful. + * -1 The HBA Reset bit did not clear. + */ +static int mtip_hba_reset(struct driver_data *dd) +{ + mtip_deinit_port(dd->port); + + /* Set the reset bit */ + writel(HOST_RESET, dd->mmio + HOST_CTL); + + /* Flush */ + readl(dd->mmio + HOST_CTL); + + /* Wait for reset to clear */ + ssleep(1); + + /* Check the bit has cleared */ + if (readl(dd->mmio + HOST_CTL) & HOST_RESET) { + dev_err(&dd->pdev->dev, + "Reset bit did not clear.\n"); + return -1; + } + + return 0; +} + +/* + * Display the identify command data. 
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_dump_identify(struct mtip_port *port)
+{
+ sector_t sectors;
+ unsigned short revid;
+ char cbuf[42];
+
+ if (!port->identify_valid)
+ return;
+
+ strlcpy(cbuf, (char *)(port->identify+10), 21);
+ dev_info(&port->dd->pdev->dev,
+ "Serial No.: %s\n", cbuf);
+
+ strlcpy(cbuf, (char *)(port->identify+23), 9);
+ dev_info(&port->dd->pdev->dev,
+ "Firmware Ver.: %s\n", cbuf);
+
+ strlcpy(cbuf, (char *)(port->identify+27), 41);
+ dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
+
+ if (mtip_hw_get_capacity(port->dd, &sectors))
+ dev_info(&port->dd->pdev->dev,
+ "Capacity: %llu sectors (%llu MB)\n",
+ (u64)sectors,
+ ((u64)sectors) * ATA_SECT_SIZE >> 20);
+
+ pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid);
+ switch (revid & 0xff) {
+ case 0x1:
+ strlcpy(cbuf, "A0", 3);
+ break;
+ case 0x3:
+ strlcpy(cbuf, "A2", 3);
+ break;
+ default:
+ strlcpy(cbuf, "?", 2);
+ break;
+ }
+ dev_info(&port->dd->pdev->dev,
+ "Card Type: %s\n", cbuf);
+}
+
+/*
+ * Map the command's scatter list into the command table.
+ *
+ * @dd Pointer to the driver data structure.
+ * @command Pointer to the command.
+ * @nents Number of scatter list entries.
+ *
+ * return value
+ * None
+ */
+static inline void fill_command_sg(struct driver_data *dd,
+ struct mtip_cmd *command,
+ int nents)
+{
+ int n;
+ unsigned int dma_len;
+ struct mtip_cmd_sg *command_sg;
+ struct scatterlist *sg = command->sg;
+
+ command_sg = command->command + AHCI_CMD_TBL_HDR_SZ;
+
+ for (n = 0; n < nents; n++) {
+ dma_len = sg_dma_len(sg);
+ if (dma_len > 0x400000)
+ dev_err(&dd->pdev->dev,
+ "DMA segment length truncated\n");
+ command_sg->info = cpu_to_le32((dma_len-1) & 0x3fffff);
+#if (BITS_PER_LONG == 64)
+ *((unsigned long *) &command_sg->dba) =
+ cpu_to_le64(sg_dma_address(sg));
+#else
+ command_sg->dba = cpu_to_le32(sg_dma_address(sg));
+ command_sg->dba_upper =
+ cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
+#endif
+ command_sg++;
+ sg++;
+ }
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * return value 0 The command completed successfully.
+ * return value -1 An error occurred while executing the command.
+ */
+int exec_drive_task(struct mtip_port *port, u8 *command)
+{
+ struct host_to_dev_fis fis;
+ struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+
+ /* Lock the internal command semaphore. */
+ down_write(&port->dd->internal_sem);
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = command[0];
+ fis.features = command[1];
+ fis.sect_count = command[2];
+ fis.sector = command[3];
+ fis.cyl_low = command[4];
+ fis.cyl_hi = command[5];
+ fis.device = command[6] & ~0x10; /* Clear the dev bit*/
+
+
+ dbg_printk(MTIP_DRV_NAME "%s: User Command: cmd %x, feat %x, "
+ "nsect %x, sect %x, lcyl %x, "
+ "hcyl %x, sel %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[2],
+ command[3],
+ command[4],
+ command[5],
+ command[6]);
+
+ /* Execute the command.
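+ * User space reaches this path through the HDIO_DRIVE_TASK ioctl with
+ * a seven byte argument laid out as { command, feature, nsect, sector,
+ * cyl_low, cyl_hi, select }. A hypothetical SMART ENABLE OPERATIONS
+ * request (device node name assumed) would look like:
+ *
+ *	unsigned char args[7] = { 0xb0, 0xd8, 0, 0, 0x4f, 0xc2, 0 };
+ *	int fd = open("/dev/rssda", O_RDWR);
+ *	if (fd >= 0 && ioctl(fd, HDIO_DRIVE_TASK, args) == 0)
+ *		printf("status %02x error %02x\n", args[0], args[1]);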
*/ + if (mtip_exec_internal_command(port, + &fis, + 5, + 0, + 0, + 0, + GFP_KERNEL, + MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) { + up_write(&port->dd->internal_sem); + return -1; + } + + command[0] = reply->command; /* Status*/ + command[1] = reply->features; /* Error*/ + command[4] = reply->cyl_low; + command[5] = reply->cyl_hi; + + dbg_printk(MTIP_DRV_NAME "%s: Completion Status: stat %x, " + "err %x , cyl_lo %x cyl_hi %x\n", + __func__, + command[0], + command[1], + command[4], + command[5]); + + up_write(&port->dd->internal_sem); + return 0; +} + +/* + * @brief Execute a drive command. + * + * @param port Pointer to the port data structure. + * @param command Pointer to the user specified command parameters. + * @param user_buffer Pointer to the user space buffer where read sector + * data should be copied. + * + * return value 0 The command completed successfully. + * return value -EFAULT An error occurred while copying the completion + * data to the user space buffer. + * return value -1 An error occurred while executing the command. + */ +int exec_drive_command(struct mtip_port *port, u8 *command, + void __user *user_buffer) +{ + struct host_to_dev_fis fis; + struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); + + /* Lock the internal command semaphore. */ + down_write(&port->dd->internal_sem); + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = command[0]; + fis.features = command[2]; + fis.sect_count = command[3]; + if (fis.command == ATA_CMD_SMART) { + fis.sector = command[1]; + fis.cyl_low = 0x4f; + fis.cyl_hi = 0xc2; + } + + dbg_printk(MTIP_DRV_NAME + "%s: User Command: cmd %x, sect %x, " + "feat %x, sectcnt %x\n", + __func__, + command[0], + command[1], + command[2], + command[3]); + + memset(port->sector_buffer, 0x00, ATA_SECT_SIZE); + + /* Execute the command. */ + if (mtip_exec_internal_command(port, + &fis, + 5, + port->sector_buffer_dma, + (command[3] != 0) ? ATA_SECT_SIZE : 0, + 0, + GFP_KERNEL, + MTIP_IOCTL_COMMAND_TIMEOUT_MS) + < 0) { + up_write(&port->dd->internal_sem); + return -1; + } + + /* Collect the completion status. */ + command[0] = reply->command; /* Status*/ + command[1] = reply->features; /* Error*/ + command[2] = command[3]; + + dbg_printk(MTIP_DRV_NAME + "%s: Completion Status: stat %x, " + "err %x, cmd %x\n", + __func__, + command[0], + command[1], + command[2]); + + if (user_buffer && command[3]) { + if (copy_to_user(user_buffer, + port->sector_buffer, + ATA_SECT_SIZE * command[3])) { + up_write(&port->dd->internal_sem); + return -EFAULT; + } + } + + up_write(&port->dd->internal_sem); + return 0; +} + +/* + * Indicates whether a command has a single sector payload. + * + * @command passed to the device to perform the certain event. + * @features passed to the device to perform the certain event. + * + * return value + * 1 command is one that always has a single sector payload, + * regardless of the value in the Sector Count field. 
+ * 0 otherwise + * + */ +static unsigned int implicit_sector(unsigned char command, + unsigned char features) +{ + unsigned int rv = 0; + + /* list of commands that have an implicit sector count of 1 */ + switch (command) { + case 0xF1: + case 0xF2: + case 0xF3: + case 0xF4: + case 0xF5: + case 0xF6: + case 0xE4: + case 0xE8: + rv = 1; + break; + case 0xF9: + if (features == 0x03) + rv = 1; + break; + case 0xB0: + if ((features == 0xD0) || (features == 0xD1)) + rv = 1; + break; + case 0xB1: + if ((features == 0xC2) || (features == 0xC3)) + rv = 1; + break; + } + return rv; +} + +/* + * Executes a taskfile + * See ide_taskfile_ioctl() for derivation + */ +static int exec_drive_taskfile(struct driver_data *dd, + unsigned long arg, + unsigned char compat) +{ + struct host_to_dev_fis fis; + struct host_to_dev_fis *reply; + ide_task_request_t *req_task; + u8 *outbuf = NULL; + u8 *inbuf = NULL; + dma_addr_t outbuf_dma = (dma_addr_t)NULL; + dma_addr_t inbuf_dma = (dma_addr_t)NULL; + dma_addr_t dma_buffer = (dma_addr_t)NULL; + int err = 0; + int tasksize = sizeof(struct ide_task_request_s); + unsigned int taskin = 0; + unsigned int taskout = 0; + u8 nsect = 0; + char __user *buf = (char __user *)arg; + unsigned int timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS; + unsigned int force_single_sector; + unsigned int transfer_size; + unsigned long task_file_data; + int intotal, outtotal; + struct mtip_compat_ide_task_request_s *compat_req_task = NULL; + int compat_tasksize = sizeof(struct mtip_compat_ide_task_request_s); + + req_task = kzalloc(tasksize, GFP_KERNEL); + if (req_task == NULL) + return -ENOMEM; + + if (compat == 1) { + compat_req_task = + (struct mtip_compat_ide_task_request_s __user *) arg; + + if (copy_from_user(req_task, buf, + compat_tasksize - + (2 * sizeof(compat_long_t)))) { + err = -EFAULT; + goto abort; + } + + if (get_user(req_task->out_size, &compat_req_task->out_size)) { + err = -EFAULT; + goto abort; + } + + if (get_user(req_task->in_size, &compat_req_task->in_size)) { + err = -EFAULT; + goto abort; + } + + outtotal = compat_tasksize; + intotal = compat_tasksize + req_task->out_size; + } else { + if (copy_from_user(req_task, buf, tasksize)) { + kfree(req_task); + err = -EFAULT; + goto abort; + } + + outtotal = tasksize; + intotal = tasksize + req_task->out_size; + } + + taskout = req_task->out_size; + taskin = req_task->in_size; + /* 130560 = 512 * 0xFF*/ + if (taskin > 130560 || taskout > 130560) { + err = -EINVAL; + goto abort; + } + + if (taskout) { + outbuf = kzalloc(taskout, GFP_KERNEL); + if (outbuf == NULL) { + err = -ENOMEM; + goto abort; + } + if (copy_from_user(outbuf, buf + outtotal, taskout)) { + err = -EFAULT; + goto abort; + } + outbuf_dma = pci_map_single(dd->pdev, + outbuf, + taskout, + DMA_TO_DEVICE); + if (outbuf_dma == (dma_addr_t)NULL) { + err = -ENOMEM; + goto abort; + } + dma_buffer = outbuf_dma; + } + + if (taskin) { + inbuf = kzalloc(taskin, GFP_KERNEL); + if (inbuf == NULL) { + err = -ENOMEM; + goto abort; + } + + if (copy_from_user(inbuf, buf + intotal, taskin)) { + err = -EFAULT; + goto abort; + } + inbuf_dma = pci_map_single(dd->pdev, + inbuf, + taskin, DMA_FROM_DEVICE); + if (inbuf_dma == (dma_addr_t)NULL) { + err = -ENOMEM; + goto abort; + } + dma_buffer = inbuf_dma; + } + + /* only supports PIO and non-data commands from this ioctl. 
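+ * As a rough illustration of the layout this handler expects (types
+ * and constants from <linux/hdreg.h>, device already open as fd), an
+ * IDENTIFY DEVICE request from user space could be built as:
+ *
+ *	ide_task_request_t *req = calloc(1, sizeof(*req) + 512);
+ *	req->io_ports[7] = 0xec;        (ATA_CMD_ID_ATA)
+ *	req->io_ports[2] = 1;           (sector count)
+ *	req->data_phase  = TASKFILE_IN;
+ *	req->in_size     = 512;
+ *	ioctl(fd, HDIO_DRIVE_TASKFILE, req);
+ *
+ * with the 512 bytes of identify data returned immediately after the
+ * request structure in the same user buffer.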
*/ + switch (req_task->data_phase) { + case TASKFILE_OUT: + nsect = taskout / ATA_SECT_SIZE; + reply = (dd->port->rxfis + RX_FIS_PIO_SETUP); + break; + case TASKFILE_IN: + reply = (dd->port->rxfis + RX_FIS_PIO_SETUP); + break; + case TASKFILE_NO_DATA: + reply = (dd->port->rxfis + RX_FIS_D2H_REG); + break; + default: + err = -EINVAL; + goto abort; + } + + /* Lock the internal command semaphore. */ + down_write(&dd->internal_sem); + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = req_task->io_ports[7]; + fis.features = req_task->io_ports[1]; + fis.sect_count = req_task->io_ports[2]; + fis.lba_low = req_task->io_ports[3]; + fis.lba_mid = req_task->io_ports[4]; + fis.lba_hi = req_task->io_ports[5]; + /* Clear the dev bit*/ + fis.device = req_task->io_ports[6] & ~0x10; + + if ((req_task->in_flags.all == 0) && (req_task->out_flags.all & 1)) { + req_task->in_flags.all = + IDE_TASKFILE_STD_IN_FLAGS | + (IDE_HOB_STD_IN_FLAGS << 8); + fis.lba_low_ex = req_task->hob_ports[3]; + fis.lba_mid_ex = req_task->hob_ports[4]; + fis.lba_hi_ex = req_task->hob_ports[5]; + fis.features_ex = req_task->hob_ports[1]; + fis.sect_cnt_ex = req_task->hob_ports[2]; + + } else { + req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS; + } + + force_single_sector = implicit_sector(fis.command, fis.features); + + if ((taskin || taskout) && (!fis.sect_count)) { + if (nsect) + fis.sect_count = nsect; + else { + if (!force_single_sector) { + dev_warn(&dd->pdev->dev, + "data movement but " + "sect_count is 0\n"); + up_write(&dd->internal_sem); + err = -EINVAL; + goto abort; + } + } + } + + dbg_printk(MTIP_DRV_NAME + "taskfile: cmd %x, feat %x, nsect %x," + " sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x," + " head/dev %x\n", + fis.command, + fis.features, + fis.sect_count, + fis.lba_low, + fis.lba_mid, + fis.lba_hi, + fis.device); + + switch (fis.command) { + case 0x92: /* Change timeout for Download Microcode to 60 seconds.*/ + timeout = 60000; + break; + case 0xf4: /* Change timeout for Security Erase Unit to 4 minutes.*/ + timeout = 240000; + break; + case 0xe0: /* Change timeout for standby immediate to 10 seconds.*/ + timeout = 10000; + break; + case 0xf7: /* Change timeout for vendor unique command to 10 secs */ + timeout = 10000; + break; + case 0xfa: /* Change timeout for vendor unique command to 10 secs */ + timeout = 10000; + break; + default: + timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS; + break; + } + + /* Determine the correct transfer size.*/ + if (force_single_sector) + transfer_size = ATA_SECT_SIZE; + else + transfer_size = ATA_SECT_SIZE * fis.sect_count; + + /* Execute the command.*/ + if (mtip_exec_internal_command(dd->port, + &fis, + 5, + dma_buffer, + transfer_size, + 0, + GFP_KERNEL, + timeout) < 0) { + up_write(&dd->internal_sem); + err = -EIO; + goto abort; + } + + task_file_data = readl(dd->port->mmio+PORT_TFDATA); + + if ((req_task->data_phase == TASKFILE_IN) && !(task_file_data & 1)) { + reply = dd->port->rxfis + RX_FIS_PIO_SETUP; + req_task->io_ports[7] = reply->control; + } else { + reply = dd->port->rxfis + RX_FIS_D2H_REG; + req_task->io_ports[7] = reply->command; + } + + /* reclaim the DMA buffers.*/ + if (inbuf_dma) + pci_unmap_single(dd->pdev, inbuf_dma, + taskin, DMA_FROM_DEVICE); + if (outbuf_dma) + pci_unmap_single(dd->pdev, outbuf_dma, + taskout, DMA_TO_DEVICE); + inbuf_dma = (dma_addr_t) NULL; + outbuf_dma = (dma_addr_t) NULL; + + /* return the ATA registers to the caller.*/ + req_task->io_ports[1] = reply->features; + 
req_task->io_ports[2] = reply->sect_count; + req_task->io_ports[3] = reply->lba_low; + req_task->io_ports[4] = reply->lba_mid; + req_task->io_ports[5] = reply->lba_hi; + req_task->io_ports[6] = reply->device; + + if (req_task->out_flags.all & 1) { + + req_task->hob_ports[3] = reply->lba_low_ex; + req_task->hob_ports[4] = reply->lba_mid_ex; + req_task->hob_ports[5] = reply->lba_hi_ex; + req_task->hob_ports[1] = reply->features_ex; + req_task->hob_ports[2] = reply->sect_cnt_ex; + } + + /* Com rest after secure erase or lowlevel format */ + if (((fis.command == 0xF4) || + ((fis.command == 0xFC) && + (fis.features == 0x27 || fis.features == 0x72 || + fis.features == 0x62 || fis.features == 0x26))) && + !(reply->command & 1)) { + mtip_restart_port(dd->port); + } + + dbg_printk(MTIP_DRV_NAME + "%s: Completion: stat %x," + "err %x, sect_cnt %x, lbalo %x," + "lbamid %x, lbahi %x, dev %x\n", + __func__, + req_task->io_ports[7], + req_task->io_ports[1], + req_task->io_ports[2], + req_task->io_ports[3], + req_task->io_ports[4], + req_task->io_ports[5], + req_task->io_ports[6]); + + up_write(&dd->internal_sem); + + if (compat == 1) { + if (copy_to_user(buf, req_task, + compat_tasksize - + (2 * sizeof(compat_long_t)))) { + err = -EFAULT; + goto abort; + } + if (put_user(req_task->out_size, + &compat_req_task->out_size)) { + err = -EFAULT; + goto abort; + } + if (put_user(req_task->in_size, &compat_req_task->in_size)) { + err = -EFAULT; + goto abort; + } + } else { + if (copy_to_user(buf, req_task, tasksize)) { + err = -EFAULT; + goto abort; + } + } + if (taskout) { + if (copy_to_user(buf + outtotal, outbuf, taskout)) { + err = -EFAULT; + goto abort; + } + } + if (taskin) { + if (copy_to_user(buf + intotal, inbuf, taskin)) { + err = -EFAULT; + goto abort; + } + } +abort: + if (inbuf_dma) + pci_unmap_single(dd->pdev, inbuf_dma, + taskin, DMA_FROM_DEVICE); + if (outbuf_dma) + pci_unmap_single(dd->pdev, outbuf_dma, + taskout, DMA_TO_DEVICE); + kfree(req_task); + kfree(outbuf); + kfree(inbuf); + + return err; +} + +/* + * Handle IOCTL calls from the Block Layer. + * + * This function is called by the Block Layer when it receives an IOCTL + * command that it does not understand. If the IOCTL command is not supported + * this function returns -ENOTTY. + * + * @dd Pointer to the driver data structure. + * @cmd IOCTL command passed from the Block Layer. + * @arg IOCTL argument passed from the Block Layer. + * + * return value + * 0 The IOCTL completed successfully. + * -ENOTTY The specified command is not supported. + * -EFAULT An error occurred copying data to a user space buffer. + * -EIO An error occurred while executing the command. + */ +int mtip_hw_ioctl(struct driver_data *dd, + unsigned int cmd, + unsigned long arg, + unsigned char compat) +{ + switch (cmd) { + case HDIO_GET_IDENTITY: + if (mtip_get_identify(dd->port, (void __user *) arg) < 0) { + dev_warn(&dd->pdev->dev, + "Unable to read identity\n"); + return -EIO; + } + + break; + case HDIO_DRIVE_CMD: + { + u8 drive_command[4]; + + /* Copy the user command info to our buffer. */ + if (copy_from_user(drive_command, + (void __user *) arg, + sizeof(drive_command))) + return -EFAULT; + + /* Execute the drive command. */ + if (exec_drive_command(dd->port, + drive_command, + (void __user *) (arg+4))) + return -EIO; + + /* Copy the status back to the users buffer. 
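+ * The four byte argument follows the traditional HDIO_DRIVE_CMD
+ * layout { command, sector, feature, nsect } and any sector data the
+ * command returns is placed in the same buffer starting at byte 4. A
+ * hypothetical SMART READ DATA request from user space would be:
+ *
+ *	unsigned char args[4 + 512] = { 0xb0, 0, 0xd0, 1 };
+ *	if (ioctl(fd, HDIO_DRIVE_CMD, args) == 0)
+ *		printf("status %02x error %02x\n", args[0], args[1]);
+ *
+ * after which args[4] onwards holds the 512 byte SMART data page.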
*/ + if (copy_to_user((void __user *) arg, + drive_command, + sizeof(drive_command))) + return -EFAULT; + + break; + } + case HDIO_DRIVE_TASK: + { + u8 drive_command[7]; + + /* Copy the user command info to our buffer. */ + if (copy_from_user(drive_command, + (void __user *) arg, + sizeof(drive_command))) + return -EFAULT; + + /* Execute the drive command. */ + if (exec_drive_task(dd->port, drive_command)) + return -EIO; + + /* Copy the status back to the users buffer. */ + if (copy_to_user((void __user *) arg, + drive_command, + sizeof(drive_command))) + return -EFAULT; + + break; + } + case HDIO_DRIVE_TASKFILE: + return exec_drive_taskfile(dd, arg, compat); + + default: + return -EINVAL; + } + return 0; +} + +/* + * Submit an IO to the hw + * + * This function is called by the block layer to issue an io + * to the device. Upon completion, the callback function will + * be called with the data parameter passed as the callback data. + * + * @dd Pointer to the driver data structure. + * @start First sector to read. + * @nsect Number of sectors to read. + * @nents Number of entries in scatter list for the read command. + * @tag The tag of this read command. + * @callback Pointer to the function that should be called + * when the read completes. + * @data Callback data passed to the callback function + * when the read completes. + * @barrier If non-zero, this command must be completed before + * issuing any other commands. + * @dir Direction (read or write) + * + * return value + * None + */ +void mtip_hw_submit_io(struct driver_data *dd, + sector_t start, + int nsect, + int nents, + int tag, + void *callback, + void *data, + int barrier, + int dir) +{ + struct host_to_dev_fis *fis; + struct mtip_port *port = dd->port; + struct mtip_cmd *command = &port->commands[tag]; + + /* Map the scatter list for DMA access */ + if (dir == READ) + nents = dma_map_sg(&dd->pdev->dev, command->sg, + nents, DMA_FROM_DEVICE); + else + nents = dma_map_sg(&dd->pdev->dev, command->sg, + nents, DMA_TO_DEVICE); + + command->scatter_ents = nents; + + /* + * The number of retries for this command before it is + * reported as a failure to the upper layers. + */ + command->retries = MTIP_MAX_RETRIES; + + /* Fill out fis */ + fis = command->command; + fis->type = 0x27; + fis->opts = 1 << 7; + fis->command = + (dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE); + *((unsigned int *) &fis->lba_low) = (start & 0xffffff); + *((unsigned int *) &fis->lba_low_ex) = ((start >> 24) & 0xffffff); + fis->device = 1 << 6; + if (barrier) + fis->device |= FUA_BIT; + fis->features = nsect & 0xff; + fis->features_ex = (nsect >> 8) & 0xff; + fis->sect_count = ((tag << 3) | (tag >> 5)); + fis->sect_cnt_ex = 0; + fis->control = 0; + fis->res2 = 0; + fis->res3 = 0; + fill_command_sg(dd, command, nents); + + /* Populate the command header */ + command->command_header->opts = cpu_to_le32( + (nents << 16) | 5 | AHCI_CMD_PREFETCH); + command->command_header->byte_count = 0; + + /* + * Set the completion function and data for the command + * within this layer. + */ + command->comp_data = dd; + command->comp_func = mtip_async_complete; + command->direction = (dir == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + + /* + * Set the completion function and data for the command passed + * from the upper layer. + */ + command->async_data = data; + command->async_callback = callback; + + /* + * Lock used to prevent this command from being issued + * if an internal command is in progress. 
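+ * internal_sem is a read/write semaphore: every normal NCQ submission
+ * takes it for read, so any number of them may be in flight at once,
+ * while the internal command paths (identify, standby, the HDIO
+ * ioctls) take it for write and therefore wait for, and exclude, all
+ * regular IO. In outline:
+ *
+ *	down_read(&port->dd->internal_sem);	per IO, shared
+ *	mtip_issue_ncq_command(port, tag);
+ *	up_read(&port->dd->internal_sem);
+ *
+ *	down_write(&port->dd->internal_sem);	per internal command, exclusive
+ *	mtip_exec_internal_command(...);
+ *	up_write(&port->dd->internal_sem);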
+ */ + down_read(&port->dd->internal_sem); + + /* Issue the command to the hardware */ + mtip_issue_ncq_command(port, tag); + + /* Set the command's timeout value.*/ + port->commands[tag].comp_time = jiffies + msecs_to_jiffies( + MTIP_NCQ_COMMAND_TIMEOUT_MS); + + up_read(&port->dd->internal_sem); +} + +/* + * Release a command slot. + * + * @dd Pointer to the driver data structure. + * @tag Slot tag + * + * return value + * None + */ +void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) +{ + release_slot(dd->port, tag); +} + +/* + * Obtain a command slot and return its associated scatter list. + * + * @dd Pointer to the driver data structure. + * @tag Pointer to an int that will receive the allocated command + * slot tag. + * + * return value + * Pointer to the scatter list for the allocated command slot + * or NULL if no command slots are available. + */ +struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, + int *tag) +{ + /* + * It is possible that, even with this semaphore, a thread + * may think that no command slots are available. Therefore, we + * need to make an attempt to get_slot(). + */ + down(&dd->port->cmd_slot); + *tag = get_slot(dd->port); + + if (unlikely(*tag < 0)) + return NULL; + + return dd->port->commands[*tag].sg; +} + +/* + * Sysfs register/status dump. + * + * @dev Pointer to the device structure, passed by the kernrel. + * @attr Pointer to the device_attribute structure passed by the kernel. + * @buf Pointer to the char buffer that will receive the stats info. + * + * return value + * The size, in bytes, of the data copied into buf. + */ +static ssize_t hw_show_registers(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + u32 group_allocated; + struct driver_data *dd = dev_to_disk(dev)->private_data; + int size = 0; + int n; + + size += sprintf(&buf[size], "%s:\ns_active:\n", __func__); + + for (n = 0; n < dd->slot_groups; n++) + size += sprintf(&buf[size], "0x%08x\n", + readl(dd->port->s_active[n])); + + size += sprintf(&buf[size], "Command Issue:\n"); + + for (n = 0; n < dd->slot_groups; n++) + size += sprintf(&buf[size], "0x%08x\n", + readl(dd->port->cmd_issue[n])); + + size += sprintf(&buf[size], "Allocated:\n"); + + for (n = 0; n < dd->slot_groups; n++) { + if (sizeof(long) > sizeof(u32)) + group_allocated = + dd->port->allocated[n/2] >> (32*(n&1)); + else + group_allocated = dd->port->allocated[n]; + size += sprintf(&buf[size], "0x%08x\n", + group_allocated); + } + + size += sprintf(&buf[size], "completed:\n"); + + for (n = 0; n < dd->slot_groups; n++) + size += sprintf(&buf[size], "0x%08x\n", + readl(dd->port->completed[n])); + + size += sprintf(&buf[size], "PORT_IRQ_STAT 0x%08x\n", + readl(dd->port->mmio + PORT_IRQ_STAT)); + size += sprintf(&buf[size], "HOST_IRQ_STAT 0x%08x\n", + readl(dd->mmio + HOST_IRQ_STAT)); + + return size; +} +static DEVICE_ATTR(registers, S_IRUGO, hw_show_registers, NULL); + +/* + * Create the sysfs related attributes. + * + * @dd Pointer to the driver data structure. + * @kobj Pointer to the kobj for the block device. + * + * return value + * 0 Operation completed successfully. + * -EINVAL Invalid parameter. + */ +int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj) +{ + if (!kobj || !dd) + return -EINVAL; + + if (sysfs_create_file(kobj, &dev_attr_registers.attr)) + dev_warn(&dd->pdev->dev, + "Error creating registers sysfs entry\n"); + return 0; +} + +/* + * Remove the sysfs related attributes. + * + * @dd Pointer to the driver data structure. 
+ * @kobj Pointer to the kobj for the block device. + * + * return value + * 0 Operation completed successfully. + * -EINVAL Invalid parameter. + */ +int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj) +{ + if (!kobj || !dd) + return -EINVAL; + + sysfs_remove_file(kobj, &dev_attr_registers.attr); + + return 0; +} + +/* + * Perform any init/resume time hardware setup + * + * @dd Pointer to the driver data structure. + * + * return value + * None + */ +static inline void hba_setup(struct driver_data *dd) +{ + u32 hwdata; + hwdata = readl(dd->mmio + HOST_HSORG); + + /* interrupt bug workaround: use only 1 IS bit.*/ + writel(hwdata | + HSORG_DISABLE_SLOTGRP_INTR | + HSORG_DISABLE_SLOTGRP_PXIS, + dd->mmio + HOST_HSORG); +} + +/* + * Detect the details of the product, and store anything needed + * into the driver data structure. This includes product type and + * version and number of slot groups. + * + * @dd Pointer to the driver data structure. + * + * return value + * None + */ +static void mtip_detect_product(struct driver_data *dd) +{ + u32 hwdata; + unsigned int rev, slotgroups; + + /* + * HBA base + 0xFC [15:0] - vendor-specific hardware interface + * info register: + * [15:8] hardware/software interface rev# + * [ 3] asic-style interface + * [ 2:0] number of slot groups, minus 1 (only valid for asic-style). + */ + hwdata = readl(dd->mmio + HOST_HSORG); + + dd->product_type = MTIP_PRODUCT_UNKNOWN; + dd->slot_groups = 1; + + if (hwdata & 0x8) { + dd->product_type = MTIP_PRODUCT_ASICFPGA; + rev = (hwdata & HSORG_HWREV) >> 8; + slotgroups = (hwdata & HSORG_SLOTGROUPS) + 1; + dev_info(&dd->pdev->dev, + "ASIC-FPGA design, HS rev 0x%x, " + "%i slot groups [%i slots]\n", + rev, + slotgroups, + slotgroups * 32); + + if (slotgroups > MTIP_MAX_SLOT_GROUPS) { + dev_warn(&dd->pdev->dev, + "Warning: driver only supports " + "%i slot groups.\n", MTIP_MAX_SLOT_GROUPS); + slotgroups = MTIP_MAX_SLOT_GROUPS; + } + dd->slot_groups = slotgroups; + return; + } + + dev_warn(&dd->pdev->dev, "Unrecognized product id\n"); +} + +/* + * Blocking wait for FTL rebuild to complete + * + * @dd Pointer to the DRIVER_DATA structure. + * + * return value + * 0 FTL rebuild completed successfully + * -EFAULT FTL rebuild error/timeout/interruption + */ +static int mtip_ftl_rebuild_poll(struct driver_data *dd) +{ + unsigned long timeout, cnt = 0, start; + + dev_warn(&dd->pdev->dev, + "FTL rebuild in progress. Polling for completion.\n"); + + start = jiffies; + dd->ftlrebuildflag = 1; + timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS); + + do { +#ifdef CONFIG_HOTPLUG + if (mtip_check_surprise_removal(dd->pdev)) + return -EFAULT; +#endif + if (mtip_get_identify(dd->port, NULL) < 0) + return -EFAULT; + + if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == + MTIP_FTL_REBUILD_MAGIC) { + ssleep(1); + /* Print message every 3 minutes */ + if (cnt++ >= 180) { + dev_warn(&dd->pdev->dev, + "FTL rebuild in progress (%d secs).\n", + jiffies_to_msecs(jiffies - start) / 1000); + cnt = 0; + } + } else { + dev_warn(&dd->pdev->dev, + "FTL rebuild complete (%d secs).\n", + jiffies_to_msecs(jiffies - start) / 1000); + dd->ftlrebuildflag = 0; + break; + } + ssleep(10); + } while (time_before(jiffies, timeout)); + + /* Check for timeout */ + if (dd->ftlrebuildflag) { + dev_err(&dd->pdev->dev, + "Timed out waiting for FTL rebuild to complete (%d secs).\n", + jiffies_to_msecs(jiffies - start) / 1000); + return -EFAULT; + } + + return 0; +} + +/* + * Called once for each card. 
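+ * The single coherent DMA region allocated below is carved into the
+ * per-port buffers roughly as follows (offsets are the HW_* and
+ * AHCI_* constants from the header):
+ *
+ *	command_list					command headers, one per slot
+ *	rxfis = command_list + HW_CMD_SLOT_SZ		received FIS area
+ *	command_table = rxfis + AHCI_RX_FIS_SZ		command tables, one per slot
+ *	identify = command_table + HW_CMD_TBL_AR_SZ	512 byte identify buffer
+ *	sector_buffer = identify + ATA_SECT_SIZE	512 byte scratch sector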
+ * + * @dd Pointer to the driver data structure. + * + * return value + * 0 on success, else an error code. + */ +int mtip_hw_init(struct driver_data *dd) +{ + int i; + int rv; + unsigned int num_command_slots; + + dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; + + mtip_detect_product(dd); + if (dd->product_type == MTIP_PRODUCT_UNKNOWN) { + rv = -EIO; + goto out1; + } + num_command_slots = dd->slot_groups * 32; + + hba_setup(dd); + + /* + * Initialize the internal semaphore + * Use a rw semaphore to enable prioritization of + * mgmnt ioctl traffic during heavy IO load + */ + init_rwsem(&dd->internal_sem); + + tasklet_init(&dd->tasklet, mtip_tasklet, (unsigned long)dd); + + dd->port = kzalloc(sizeof(struct mtip_port), GFP_KERNEL); + if (!dd->port) { + dev_err(&dd->pdev->dev, + "Memory allocation: port structure\n"); + return -ENOMEM; + } + + /* Counting semaphore to track command slot usage */ + sema_init(&dd->port->cmd_slot, num_command_slots - 1); + + /* Spinlock to prevent concurrent issue */ + spin_lock_init(&dd->port->cmd_issue_lock); + + /* Set the port mmio base address. */ + dd->port->mmio = dd->mmio + PORT_OFFSET; + dd->port->dd = dd; + + /* Allocate memory for the command list. */ + dd->port->command_list = + dmam_alloc_coherent(&dd->pdev->dev, + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2), + &dd->port->command_list_dma, + GFP_KERNEL); + if (!dd->port->command_list) { + dev_err(&dd->pdev->dev, + "Memory allocation: command list\n"); + rv = -ENOMEM; + goto out1; + } + + /* Clear the memory we have allocated. */ + memset(dd->port->command_list, + 0, + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2)); + + /* Setup the addresse of the RX FIS. */ + dd->port->rxfis = dd->port->command_list + HW_CMD_SLOT_SZ; + dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ; + + /* Setup the address of the command tables. */ + dd->port->command_table = dd->port->rxfis + AHCI_RX_FIS_SZ; + dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ; + + /* Setup the address of the identify data. */ + dd->port->identify = dd->port->command_table + + HW_CMD_TBL_AR_SZ; + dd->port->identify_dma = dd->port->command_tbl_dma + + HW_CMD_TBL_AR_SZ; + + /* Setup the address of the sector buffer. */ + dd->port->sector_buffer = (void *) dd->port->identify + ATA_SECT_SIZE; + dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE; + + /* Point the command headers at the command tables. */ + for (i = 0; i < num_command_slots; i++) { + dd->port->commands[i].command_header = + dd->port->command_list + + (sizeof(struct mtip_cmd_hdr) * i); + dd->port->commands[i].command_header_dma = + dd->port->command_list_dma + + (sizeof(struct mtip_cmd_hdr) * i); + + dd->port->commands[i].command = + dd->port->command_table + (HW_CMD_TBL_SZ * i); + dd->port->commands[i].command_dma = + dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i); + + if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64) + dd->port->commands[i].command_header->ctbau = + cpu_to_le32( + (dd->port->commands[i].command_dma >> 16) >> 16); + dd->port->commands[i].command_header->ctba = cpu_to_le32( + dd->port->commands[i].command_dma & 0xffffffff); + + /* + * If this is not done, a bug is reported by the stock + * FC11 i386. Due to the fact that it has lots of kernel + * debugging enabled. + */ + sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG); + + /* Mark all commands as currently inactive.*/ + atomic_set(&dd->port->commands[i].active, 0); + } + + /* Setup the pointers to the extended s_active and CI registers. 
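+ * Each group of 32 command slots has its own copy of these registers,
+ * spaced 0x80 bytes apart in the port register space, and a tag is
+ * located with the MTIP_TAG_INDEX()/MTIP_TAG_BIT() macros. For
+ * example, tag 40 belongs to slot group MTIP_TAG_INDEX(40) = 40 >> 5 = 1
+ * and is issued by writing 1 << MTIP_TAG_BIT(40) = 1 << 8 to
+ * cmd_issue[1].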
*/ + for (i = 0; i < dd->slot_groups; i++) { + dd->port->s_active[i] = + dd->port->mmio + i*0x80 + PORT_SCR_ACT; + dd->port->cmd_issue[i] = + dd->port->mmio + i*0x80 + PORT_COMMAND_ISSUE; + dd->port->completed[i] = + dd->port->mmio + i*0x80 + PORT_SDBV; + } + + /* Reset the HBA. */ + if (mtip_hba_reset(dd) < 0) { + dev_err(&dd->pdev->dev, + "Card did not reset within timeout\n"); + rv = -EIO; + goto out2; + } + + mtip_init_port(dd->port); + mtip_start_port(dd->port); + + /* Setup the ISR and enable interrupts. */ + rv = devm_request_irq(&dd->pdev->dev, + dd->pdev->irq, + mtip_irq_handler, + IRQF_SHARED, + dev_driver_string(&dd->pdev->dev), + dd); + + if (rv) { + dev_err(&dd->pdev->dev, + "Unable to allocate IRQ %d\n", dd->pdev->irq); + goto out2; + } + + /* Enable interrupts on the HBA. */ + writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, + dd->mmio + HOST_CTL); + + init_timer(&dd->port->cmd_timer); + dd->port->cmd_timer.data = (unsigned long int) dd->port; + dd->port->cmd_timer.function = mtip_timeout_function; + mod_timer(&dd->port->cmd_timer, + jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); + + if (mtip_get_identify(dd->port, NULL) < 0) { + rv = -EFAULT; + goto out3; + } + mtip_dump_identify(dd->port); + + if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == + MTIP_FTL_REBUILD_MAGIC) { + return mtip_ftl_rebuild_poll(dd); + } + return rv; + +out3: + del_timer_sync(&dd->port->cmd_timer); + + /* Disable interrupts on the HBA. */ + writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, + dd->mmio + HOST_CTL); + + /*Release the IRQ. */ + devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); + +out2: + mtip_deinit_port(dd->port); + + /* Free the command/command header memory. */ + dmam_free_coherent(&dd->pdev->dev, + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2), + dd->port->command_list, + dd->port->command_list_dma); +out1: + /* Free the memory allocated for the for structure. */ + kfree(dd->port); + + return rv; +} + +/* + * Called to deinitialize an interface. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +int mtip_hw_exit(struct driver_data *dd) +{ + /* + * Send standby immediate (E0h) to the drive so that it + * saves its state. + */ + if (atomic_read(&dd->drv_cleanup_done) != true) { + + mtip_standby_immediate(dd->port); + + /* de-initialize the port. */ + mtip_deinit_port(dd->port); + + /* Disable interrupts on the HBA. */ + writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, + dd->mmio + HOST_CTL); + } + + del_timer_sync(&dd->port->cmd_timer); + + /* Stop the bottom half tasklet. */ + tasklet_kill(&dd->tasklet); + + /* Release the IRQ. */ + devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); + + /* Free the command/command header memory. */ + dmam_free_coherent(&dd->pdev->dev, + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2), + dd->port->command_list, + dd->port->command_list_dma); + /* Free the memory allocated for the for structure. */ + kfree(dd->port); + + return 0; +} + +/* + * Issue a Standby Immediate command to the device. + * + * This function is called by the Block Layer just before the + * system powers off during a shutdown. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +int mtip_hw_shutdown(struct driver_data *dd) +{ + /* + * Send standby immediate (E0h) to the drive so that it + * saves its state. + */ + mtip_standby_immediate(dd->port); + + return 0; +} + +/* + * Suspend function + * + * This function is called by the Block Layer just before the + * system hibernates. 
+ * + * @dd Pointer to the driver data structure. + * + * return value + * 0 Suspend was successful + * -EFAULT Suspend was not successful + */ +int mtip_hw_suspend(struct driver_data *dd) +{ + /* + * Send standby immediate (E0h) to the drive + * so that it saves its state. + */ + if (mtip_standby_immediate(dd->port) != 0) { + dev_err(&dd->pdev->dev, + "Failed standby-immediate command\n"); + return -EFAULT; + } + + /* Disable interrupts on the HBA.*/ + writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, + dd->mmio + HOST_CTL); + mtip_deinit_port(dd->port); + + return 0; +} + +/* + * Resume function + * + * This function is called by the Block Layer as the + * system resumes. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 Resume was successful + * -EFAULT Resume was not successful + */ +int mtip_hw_resume(struct driver_data *dd) +{ + /* Perform any needed hardware setup steps */ + hba_setup(dd); + + /* Reset the HBA */ + if (mtip_hba_reset(dd) != 0) { + dev_err(&dd->pdev->dev, + "Unable to reset the HBA\n"); + return -EFAULT; + } + + /* + * Enable the port, DMA engine, and FIS reception specific + * h/w in controller. + */ + mtip_init_port(dd->port); + mtip_start_port(dd->port); + + /* Enable interrupts on the HBA.*/ + writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, + dd->mmio + HOST_CTL); + + return 0; +} + +/* + * This function is called for clean the pending command in the + * command slot during the surprise removal of device and return + * error to the upper layer. + * + * @dd Pointer to the DRIVER_DATA structure. + * + * return value + * None + */ +void mtip_command_cleanup(struct driver_data *dd) +{ + int group = 0, commandslot = 0, commandindex = 0; + struct mtip_cmd *command; + struct mtip_port *port = dd->port; + + for (group = 0; group < 4; group++) { + for (commandslot = 0; commandslot < 32; commandslot++) { + if (!(port->allocated[group] & (1 << commandslot))) + continue; + + commandindex = group << 5 | commandslot; + command = &port->commands[commandindex]; + + if (atomic_read(&command->active) + && (command->async_callback)) { + command->async_callback(command->async_data, + -ENODEV); + command->async_callback = NULL; + command->async_data = NULL; + } + + dma_unmap_sg(&port->dd->pdev->dev, + command->sg, + command->scatter_ents, + command->direction); + } + } + + up(&port->cmd_slot); + + atomic_set(&dd->drv_cleanup_done, true); +} + +/* + * Helper function for reusing disk name + * upon hot insertion. + */ +static int rssd_disk_name_format(char *prefix, + int index, + char *buf, + int buflen) +{ + const int base = 'z' - 'a' + 1; + char *begin = buf + strlen(prefix); + char *end = buf + buflen; + char *p; + int unit; + + p = end - 1; + *p = '\0'; + unit = base; + do { + if (p == begin) + return -EINVAL; + *--p = 'a' + (index % unit); + index = (index / unit) - 1; + } while (index >= 0); + + memmove(begin, p, end - p); + memcpy(buf, prefix, strlen(prefix)); + + return 0; +} + +/* + * Block layer IOCTL handler. + * + * @dev Pointer to the block_device structure. + * @mode ignored + * @cmd IOCTL command passed from the user application. + * @arg Argument passed from the user application. + * + * return value + * 0 IOCTL completed successfully. + * -ENOTTY IOCTL not supported or invalid driver data + * structure pointer. 
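+ *
+ * As an illustration (device node name assumed, CAP_SYS_ADMIN
+ * required), an application can fetch the raw identify data with:
+ *
+ *	unsigned short id[256];
+ *	int fd = open("/dev/rssda", O_RDWR);
+ *	if (fd >= 0 && ioctl(fd, HDIO_GET_IDENTITY, id) == 0)
+ *		printf("model: %.40s\n", (char *)&id[27]);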
+ */ +static int mtip_block_ioctl(struct block_device *dev, + fmode_t mode, + unsigned cmd, + unsigned long arg) +{ + struct driver_data *dd = dev->bd_disk->private_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!dd) + return -ENOTTY; + + switch (cmd) { + case BLKFLSBUF: + return 0; + default: + return mtip_hw_ioctl(dd, cmd, arg, 0); + } +} + +/* + * Block layer compat IOCTL handler. + * + * @dev Pointer to the block_device structure. + * @mode ignored + * @cmd IOCTL command passed from the user application. + * @arg Argument passed from the user application. + * + * return value + * 0 IOCTL completed successfully. + * -ENOTTY IOCTL not supported or invalid driver data + * structure pointer. + */ +static int mtip_block_compat_ioctl(struct block_device *dev, + fmode_t mode, + unsigned cmd, + unsigned long arg) +{ + struct driver_data *dd = dev->bd_disk->private_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!dd) + return -ENOTTY; + + switch (cmd) { + case BLKFLSBUF: + return 0; + default: + return mtip_hw_ioctl(dd, cmd, arg, 1); + } +} + +/* + * Obtain the geometry of the device. + * + * You may think that this function is obsolete, but some applications, + * fdisk for example still used CHS values. This function describes the + * device as having 224 heads and 56 sectors per cylinder. These values are + * chosen so that each cylinder is aligned on a 4KB boundary. Since a + * partition is described in terms of a start and end cylinder this means + * that each partition is also 4KB aligned. Non-aligned partitions adversely + * affects performance. + * + * @dev Pointer to the block_device strucutre. + * @geo Pointer to a hd_geometry structure. + * + * return value + * 0 Operation completed successfully. + * -ENOTTY An error occurred while reading the drive capacity. + */ +static int mtip_block_getgeo(struct block_device *dev, + struct hd_geometry *geo) +{ + struct driver_data *dd = dev->bd_disk->private_data; + sector_t capacity; + + if (!dd) + return -ENOTTY; + + if (!(mtip_hw_get_capacity(dd, &capacity))) { + dev_warn(&dd->pdev->dev, + "Could not get drive capacity.\n"); + return -ENOTTY; + } + + geo->heads = 224; + geo->sectors = 56; +#if BITS_PER_LONG == 64 + geo->cylinders = capacity / (geo->heads * geo->sectors); +#else + do_div(capacity, (geo->heads * geo->sectors)); + geo->cylinders = capacity; +#endif + return 0; +} + +/* + * Block device operation function. + * + * This structure contains pointers to the functions required by the block + * layer. + */ +static const struct block_device_operations mtip_block_ops = { + .ioctl = mtip_block_ioctl, + .compat_ioctl = mtip_block_compat_ioctl, + .getgeo = mtip_block_getgeo, + .owner = THIS_MODULE +}; + +/* + * Block layer make request function. + * + * This function is called by the kernel to process a BIO for + * the P320 device. + * + * @queue Pointer to the request queue. Unused other than to obtain + * the driver data structure. + * @bio Pointer to the BIO. 
+ * + * return value + * 0 + */ +static int mtip_make_request(struct request_queue *queue, struct bio *bio) +{ + struct driver_data *dd = queue->queuedata; + struct scatterlist *sg; + struct bio_vec *bvec; + int nents = 0; + int tag = 0; + + if (unlikely(!bio_has_data(bio))) { + blk_queue_flush(queue, 0); + bio_endio(bio, 0); + return 0; + } + + if (unlikely(atomic_read(&dd->eh_active))) { + bio_endio(bio, -EBUSY); + return 0; + } + + sg = mtip_hw_get_scatterlist(dd, &tag); + if (likely(sg != NULL)) { + blk_queue_bounce(queue, &bio); + + if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) { + dev_warn(&dd->pdev->dev, + "Maximum number of SGL entries exceeded"); + bio_io_error(bio); + mtip_hw_release_scatterlist(dd, tag); + return 0; + } + + /* Create the scatter list for this bio. */ + bio_for_each_segment(bvec, bio, nents) { + sg_set_page(&sg[nents], + bvec->bv_page, + bvec->bv_len, + bvec->bv_offset); + } + + /* Issue the read/write. */ + mtip_hw_submit_io(dd, + bio->bi_sector, + bio_sectors(bio), + nents, + tag, + bio_endio, + bio, + bio->bi_rw & REQ_FLUSH, + bio_data_dir(bio)); + } else { + bio_io_error(bio); + } + + return 0; +} + +/* + * Block layer initialization function. + * + * This function is called once by the PCI layer for each P320 + * device that is connected to the system. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 on success else an error code. + */ +int mtip_block_initialize(struct driver_data *dd) +{ + int rv = 0; + sector_t capacity; + unsigned int index = 0; + struct kobject *kobj; + + /* Initialize the protocol layer. */ + rv = mtip_hw_init(dd); + if (rv < 0) { + dev_err(&dd->pdev->dev, + "Protocol layer initialization failed\n"); + rv = -EINVAL; + goto protocol_init_error; + } + + /* Allocate the request queue. */ + dd->queue = blk_alloc_queue(GFP_KERNEL); + if (dd->queue == NULL) { + dev_err(&dd->pdev->dev, + "Unable to allocate request queue\n"); + rv = -ENOMEM; + goto block_queue_alloc_init_error; + } + + /* Attach our request function to the request queue. */ + blk_queue_make_request(dd->queue, mtip_make_request); + + /* Set device limits. */ + set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); + blk_queue_max_segments(dd->queue, MTIP_MAX_SG); + blk_queue_physical_block_size(dd->queue, 4096); + blk_queue_io_min(dd->queue, 4096); + + dd->disk = alloc_disk(MTIP_MAX_MINORS); + if (dd->disk == NULL) { + dev_err(&dd->pdev->dev, + "Unable to allocate gendisk structure\n"); + rv = -EINVAL; + goto alloc_disk_error; + } + + /* Generate the disk name, implemented same as in sd.c */ + do { + if (!ida_pre_get(&rssd_index_ida, GFP_KERNEL)) + goto ida_get_error; + + spin_lock(&rssd_index_lock); + rv = ida_get_new(&rssd_index_ida, &index); + spin_unlock(&rssd_index_lock); + } while (rv == -EAGAIN); + + if (rv) + goto ida_get_error; + + rv = rssd_disk_name_format("rssd", + index, + dd->disk->disk_name, + DISK_NAME_LEN); + if (rv) + goto disk_index_error; + + dd->disk->driverfs_dev = &dd->pdev->dev; + dd->disk->major = dd->major; + dd->disk->first_minor = dd->instance * MTIP_MAX_MINORS; + dd->disk->fops = &mtip_block_ops; + dd->disk->queue = dd->queue; + dd->disk->private_data = dd; + dd->queue->queuedata = dd; + dd->index = index; + + /* Set the capacity of the device in 512 byte sectors. 
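+ * mtip_hw_get_capacity() assembles the 48 bit sector count from
+ * identify words 100-103, least significant word first. For example,
+ * words { 0x0000, 0x0400, 0x0000, 0x0000 } give 0x0400 << 16 =
+ * 0x04000000 = 67108864 sectors, i.e. a 32 GiB device.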
*/ + if (!(mtip_hw_get_capacity(dd, &capacity))) { + dev_warn(&dd->pdev->dev, + "Could not read drive capacity\n"); + rv = -EIO; + goto read_capacity_error; + } + set_capacity(dd->disk, capacity); + + /* Enable the block device and add it to /dev */ + add_disk(dd->disk); + + /* + * Now that the disk is active, initialize any sysfs attributes + * managed by the protocol layer. + */ + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_init(dd, kobj); + kobject_put(kobj); + } + + return rv; + +read_capacity_error: + /* + * Delete our gendisk structure. This also removes the device + * from /dev + */ + del_gendisk(dd->disk); + +disk_index_error: + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, index); + spin_unlock(&rssd_index_lock); + +ida_get_error: + put_disk(dd->disk); + +alloc_disk_error: + blk_cleanup_queue(dd->queue); + +block_queue_alloc_init_error: + /* De-initialize the protocol layer. */ + mtip_hw_exit(dd); + +protocol_init_error: + return rv; +} + +/* + * Block layer deinitialization function. + * + * Called by the PCI layer as each P320 device is removed. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +int mtip_block_remove(struct driver_data *dd) +{ + struct kobject *kobj; + /* Clean up the sysfs attributes managed by the protocol layer. */ + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_exit(dd, kobj); + kobject_put(kobj); + } + + /* + * Delete our gendisk structure. This also removes the device + * from /dev + */ + del_gendisk(dd->disk); + blk_cleanup_queue(dd->queue); + dd->disk = NULL; + dd->queue = NULL; + + /* De-initialize the protocol layer. */ + mtip_hw_exit(dd); + + return 0; +} + +/* + * Function called by the PCI layer when just before the + * machine shuts down. + * + * If a protocol layer shutdown function is present it will be called + * by this function. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +int mtip_block_shutdown(struct driver_data *dd) +{ + dev_info(&dd->pdev->dev, + "Shutting down %s ...\n", dd->disk->disk_name); + + /* Delete our gendisk structure, and cleanup the blk queue. */ + del_gendisk(dd->disk); + blk_cleanup_queue(dd->queue); + dd->disk = NULL; + dd->queue = NULL; + + mtip_hw_shutdown(dd); + return 0; +} + +int mtip_block_suspend(struct driver_data *dd) +{ + dev_info(&dd->pdev->dev, + "Suspending %s ...\n", dd->disk->disk_name); + mtip_hw_suspend(dd); + return 0; +} + +int mtip_block_resume(struct driver_data *dd) +{ + dev_info(&dd->pdev->dev, "Resuming %s ...\n", + dd->disk->disk_name); + mtip_hw_resume(dd); + return 0; +} + +/* + * Called for each supported PCI device detected. + * + * This function allocates the private data structure, enables the + * PCI device and then calls the block layer initialization function. + * + * return value + * 0 on success else an error code. + */ +static int mtip_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int rv = 0; + struct driver_data *dd = NULL; + + /* Allocate memory for this devices private data. */ + dd = kzalloc(sizeof(struct driver_data), GFP_KERNEL); + if (dd == NULL) { + dev_err(&pdev->dev, + "Unable to allocate memory for driver data\n"); + return -ENOMEM; + } + + /* Set the atomic variable as 1 in case of SRSI */ + atomic_set(&dd->drv_cleanup_done, true); + + atomic_set(&dd->resumeflag, false); + atomic_set(&dd->eh_active, 0); + + /* Attach the private data to this PCI device. 
*/ + pci_set_drvdata(pdev, dd); + + rv = pcim_enable_device(pdev); + if (rv < 0) { + dev_err(&pdev->dev, "Unable to enable device\n"); + goto iomap_err; + } + + /* Map BAR5 to memory. */ + rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME); + if (rv < 0) { + dev_err(&pdev->dev, "Unable to map regions\n"); + goto iomap_err; + } + + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + rv = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + + if (rv) { + rv = pci_set_consistent_dma_mask(pdev, + DMA_BIT_MASK(32)); + if (rv) { + dev_warn(&pdev->dev, + "64-bit DMA enable failed\n"); + goto setmask_err; + } + } + } + + pci_set_master(pdev); + + if (pci_enable_msi(pdev)) { + dev_warn(&pdev->dev, + "Unable to enable MSI interrupt.\n"); + goto block_initialize_err; + } + + /* Copy the info we may need later into the private data structure. */ + dd->major = mtip_major; + dd->protocol = ent->driver_data; + dd->instance = instance; + dd->pdev = pdev; + + /* Initialize the block layer. */ + rv = mtip_block_initialize(dd); + if (rv < 0) { + dev_err(&pdev->dev, + "Unable to initialize block layer\n"); + goto block_initialize_err; + } + + /* + * Increment the instance count so that each device has a unique + * instance number. + */ + instance++; + + goto done; + +block_initialize_err: + pci_disable_msi(pdev); + +setmask_err: + pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); + +iomap_err: + kfree(dd); + pci_set_drvdata(pdev, NULL); + return rv; +done: + /* Set the atomic variable as 0 in case of SRSI */ + atomic_set(&dd->drv_cleanup_done, true); + + return rv; +} + +/* + * Called for each probed device when the device is removed or the + * driver is unloaded. + * + * return value + * None + */ +static void mtip_pci_remove(struct pci_dev *pdev) +{ + struct driver_data *dd = pci_get_drvdata(pdev); + int counter = 0; + + if (mtip_check_surprise_removal(pdev)) { + while (atomic_read(&dd->drv_cleanup_done) == false) { + counter++; + msleep(20); + if (counter == 10) { + /* Cleanup the outstanding commands */ + mtip_command_cleanup(dd); + break; + } + } + } + /* Set the atomic variable as 1 in case of SRSI */ + atomic_set(&dd->drv_cleanup_done, true); + + /* Clean up the block layer. */ + mtip_block_remove(dd); + + pci_disable_msi(pdev); + + kfree(dd); + pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); +} + +/* + * Called for each probed device when the device is suspended. + * + * return value + * 0 Success + * <0 Error + */ +static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) +{ + int rv = 0; + struct driver_data *dd = pci_get_drvdata(pdev); + + if (!dd) { + dev_err(&pdev->dev, + "Driver private datastructure is NULL\n"); + return -EFAULT; + } + + atomic_set(&dd->resumeflag, true); + + /* Disable ports & interrupts then send standby immediate */ + rv = mtip_block_suspend(dd); + if (rv < 0) { + dev_err(&pdev->dev, + "Failed to suspend controller\n"); + return rv; + } + + /* + * Save the pci config space to pdev structure & + * disable the device + */ + pci_save_state(pdev); + pci_disable_device(pdev); + + /* Move to Low power state*/ + pci_set_power_state(pdev, PCI_D3hot); + + return rv; +} + +/* + * Called for each probed device when the device is resumed. 
+ * + * return value + * 0 Success + * <0 Error + */ +static int mtip_pci_resume(struct pci_dev *pdev) +{ + int rv = 0; + struct driver_data *dd; + + dd = pci_get_drvdata(pdev); + if (!dd) { + dev_err(&pdev->dev, + "Driver private datastructure is NULL\n"); + return -EFAULT; + } + + /* Move the device to active State */ + pci_set_power_state(pdev, PCI_D0); + + /* Restore PCI configuration space */ + pci_restore_state(pdev); + + /* Enable the PCI device*/ + rv = pcim_enable_device(pdev); + if (rv < 0) { + dev_err(&pdev->dev, + "Failed to enable card during resume\n"); + goto err; + } + pci_set_master(pdev); + + /* + * Calls hbaReset, initPort, & startPort function + * then enables interrupts + */ + rv = mtip_block_resume(dd); + if (rv < 0) + dev_err(&pdev->dev, "Unable to resume\n"); + +err: + atomic_set(&dd->resumeflag, false); + + return rv; +} + +/* + * Shutdown routine + * + * return value + * None + */ +static void mtip_pci_shutdown(struct pci_dev *pdev) +{ + struct driver_data *dd = pci_get_drvdata(pdev); + if (dd) + mtip_block_shutdown(dd); +} + +/* + * This function check_for_surprise_removal is called + * while card is removed from the system and it will + * read the vendor id from the configration space + * + * @pdev Pointer to the pci_dev structure. + * + * return value + * true if device removed, else false + */ +bool mtip_check_surprise_removal(struct pci_dev *pdev) +{ + u16 vendor_id = 0; + + /* Read the vendorID from the configuration space */ + pci_read_config_word(pdev, 0x00, &vendor_id); + if (vendor_id == 0xFFFF) + return true; /* device removed */ + + return false; /* device present */ +} + +/* Table of device ids supported by this driver. */ +static DEFINE_PCI_DEVICE_TABLE(mtip_pci_tbl) = { + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320_DEVICE_ID) }, + { 0 } +}; + +/* Structure that describes the PCI driver functions. */ +struct pci_driver mtip_pci_driver = { + .name = MTIP_DRV_NAME, + .id_table = mtip_pci_tbl, + .probe = mtip_pci_probe, + .remove = mtip_pci_remove, + .suspend = mtip_pci_suspend, + .resume = mtip_pci_resume, + .shutdown = mtip_pci_shutdown, +}; + +MODULE_DEVICE_TABLE(pci, mtip_pci_tbl); + +/* + * Module initialization function. + * + * Called once when the module is loaded. This function allocates a major + * block device number to the Cyclone devices and registers the PCI layer + * of the driver. + * + * Return value + * 0 on success else error code. + */ +static int __init mtip_init(void) +{ + printk(KERN_INFO MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n"); + + /* Allocate a major block device number to use with this driver. */ + mtip_major = register_blkdev(0, MTIP_DRV_NAME); + if (mtip_major < 0) { + printk(KERN_ERR "Unable to register block device (%d)\n", + mtip_major); + return -EBUSY; + } + + /* Register our PCI operations. */ + return pci_register_driver(&mtip_pci_driver); +} + +/* + * Module de-initialization function. + * + * Called once when the module is unloaded. This function deallocates + * the major block device number allocated by mtip_init() and + * unregisters the PCI layer of the driver. + * + * Return value + * none + */ +static void __exit mtip_exit(void) +{ + /* Release the allocated major block device number. */ + unregister_blkdev(mtip_major, MTIP_DRV_NAME); + + /* Unregister the PCI driver. 
*/ + pci_unregister_driver(&mtip_pci_driver); +} + +MODULE_AUTHOR("Micron Technology, Inc"); +MODULE_DESCRIPTION("Micron RealSSD PCIe Block Driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(MTIP_DRV_VERSION); + +module_init(mtip_init); +module_exit(mtip_exit); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h new file mode 100644 index 000000000000..3423d18e7c86 --- /dev/null +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -0,0 +1,445 @@ +/* + * mtip32xx.h - Header file for the P320 SSD Block Driver + * Copyright (C) 2011 Micron Technology, Inc. + * + * Portions of this code were derived from works subjected to the + * following copyright: + * Copyright (C) 2009 Integrated Device Technology, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __MTIP32XX_H__ +#define __MTIP32XX_H__ + +#include +#include +#include +#include +#include +#include + +/* Offset of Subsystem Device ID in pci confoguration space */ +#define PCI_SUBSYSTEM_DEVICEID 0x2E + +/* offset of Device Control register in PCIe extended capabilites space */ +#define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET 0x48 + +/* # of times to retry timed out IOs */ +#define MTIP_MAX_RETRIES 5 + +/* Various timeout values in ms */ +#define MTIP_NCQ_COMMAND_TIMEOUT_MS 5000 +#define MTIP_IOCTL_COMMAND_TIMEOUT_MS 5000 +#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS 5000 + +/* check for timeouts every 500ms */ +#define MTIP_TIMEOUT_CHECK_PERIOD 500 + +/* ftl rebuild */ +#define MTIP_FTL_REBUILD_OFFSET 142 +#define MTIP_FTL_REBUILD_MAGIC 0xed51 +#define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000 + +/* Macro to extract the tag bit number from a tag value. */ +#define MTIP_TAG_BIT(tag) (tag & 0x1f) + +/* + * Macro to extract the tag index from a tag value. The index + * is used to access the correct s_active/Command Issue register based + * on the tag value. + */ +#define MTIP_TAG_INDEX(tag) (tag >> 5) + +/* + * Maximum number of scatter gather entries + * a single command may have. + */ +#define MTIP_MAX_SG 128 + +/* + * Maximum number of slot groups (Command Issue & s_active registers) + * NOTE: This is the driver maximum; check dd->slot_groups for actual value. + */ +#define MTIP_MAX_SLOT_GROUPS 8 + +/* Internal command tag. */ +#define MTIP_TAG_INTERNAL 0 + +/* Micron Vendor ID & P320x SSD Device ID */ +#define PCI_VENDOR_ID_MICRON 0x1344 +#define P320_DEVICE_ID 0x5150 + +/* Driver name and version strings */ +#define MTIP_DRV_NAME "mtip32xx" +#define MTIP_DRV_VERSION "1.2.6os2" + +/* Maximum number of minor device numbers per device. */ +#define MTIP_MAX_MINORS 16 + +/* Maximum number of supported command slots. */ +#define MTIP_MAX_COMMAND_SLOTS (MTIP_MAX_SLOT_GROUPS * 32) + +/* + * Per-tag bitfield size in longs. + * Linux bit manipulation functions + * (i.e. test_and_set_bit, find_next_zero_bit) + * manipulate memory in longs, so we try to make the math work. + * take the slot groups and find the number of longs, rounding up. + * Careful! i386 and x86_64 use different size longs! 
+ */ +#define U32_PER_LONG (sizeof(long) / sizeof(u32)) +#define SLOTBITS_IN_LONGS ((MTIP_MAX_SLOT_GROUPS + \ + (U32_PER_LONG-1))/U32_PER_LONG) + +/* BAR number used to access the HBA registers. */ +#define MTIP_ABAR 5 + +/* Forced Unit Access Bit */ +#define FUA_BIT 0x80 + +#ifdef DEBUG + #define dbg_printk(format, arg...) \ + printk(pr_fmt(format), ##arg); +#else + #define dbg_printk(format, arg...) +#endif + +/* Register Frame Information Structure (FIS), host to device. */ +struct host_to_dev_fis { + /* + * FIS type. + * - 27h Register FIS, host to device. + * - 34h Register FIS, device to host. + * - 39h DMA Activate FIS, device to host. + * - 41h DMA Setup FIS, bi-directional. + * - 46h Data FIS, bi-directional. + * - 58h BIST Activate FIS, bi-directional. + * - 5Fh PIO Setup FIS, device to host. + * - A1h Set Device Bits FIS, device to host. + */ + unsigned char type; + unsigned char opts; + unsigned char command; + unsigned char features; + + union { + unsigned char lba_low; + unsigned char sector; + }; + union { + unsigned char lba_mid; + unsigned char cyl_low; + }; + union { + unsigned char lba_hi; + unsigned char cyl_hi; + }; + union { + unsigned char device; + unsigned char head; + }; + + union { + unsigned char lba_low_ex; + unsigned char sector_ex; + }; + union { + unsigned char lba_mid_ex; + unsigned char cyl_low_ex; + }; + union { + unsigned char lba_hi_ex; + unsigned char cyl_hi_ex; + }; + unsigned char features_ex; + + unsigned char sect_count; + unsigned char sect_cnt_ex; + unsigned char res2; + unsigned char control; + + unsigned int res3; +}; + +/* Command header structure. */ +struct mtip_cmd_hdr { + /* + * Command options. + * - Bits 31:16 Number of PRD entries. + * - Bits 15:8 Unused in this implementation. + * - Bit 7 Prefetch bit, informs the drive to prefetch PRD entries. + * - Bit 6 Write bit, should be set when writing data to the device. + * - Bit 5 Unused in this implementation. + * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes). + */ + unsigned int opts; + /* This field is unsed when using NCQ. */ + union { + unsigned int byte_count; + unsigned int status; + }; + /* + * Lower 32 bits of the command table address associated with this + * header. The command table addresses must be 128 byte aligned. + */ + unsigned int ctba; + /* + * If 64 bit addressing is used this field is the upper 32 bits + * of the command table address associated with this command. + */ + unsigned int ctbau; + /* Reserved and unused. */ + unsigned int res[4]; +}; + +/* Command scatter gather structure (PRD). */ +struct mtip_cmd_sg { + /* + * Low 32 bits of the data buffer address. For P320 this + * address must be 8 byte aligned signified by bits 2:0 being + * set to 0. + */ + unsigned int dba; + /* + * When 64 bit addressing is used this field is the upper + * 32 bits of the data buffer address. + */ + unsigned int dba_upper; + /* Unused. */ + unsigned int reserved; + /* + * Bit 31: interrupt when this data block has been transferred. + * Bits 30..22: reserved + * Bits 21..0: byte count (minus 1). For P320 the byte count must be + * 8 byte aligned signified by bits 2:0 being set to 1. + */ + unsigned int info; +}; +struct mtip_port; + +/* Structure used to describe a command. 
*/ +struct mtip_cmd { + + struct mtip_cmd_hdr *command_header; /* ptr to command header entry */ + + dma_addr_t command_header_dma; /* corresponding physical address */ + + void *command; /* ptr to command table entry */ + + dma_addr_t command_dma; /* corresponding physical address */ + + void *comp_data; /* data passed to completion function comp_func() */ + /* + * Completion function called by the ISR upon completion of + * a command. + */ + void (*comp_func)(struct mtip_port *port, + int tag, + void *data, + int status); + /* Additional callback function that may be called by comp_func() */ + void (*async_callback)(void *data, int status); + + void *async_data; /* Addl. data passed to async_callback() */ + + int scatter_ents; /* Number of scatter list entries used */ + + struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ + + int retries; /* The number of retries left for this command. */ + + int direction; /* Data transfer direction */ + + unsigned long comp_time; /* command completion time, in jiffies */ + + atomic_t active; /* declares if this command sent to the drive. */ +}; + +/* Structure used to describe a port. */ +struct mtip_port { + /* Pointer back to the driver data for this port. */ + struct driver_data *dd; + /* + * Used to determine if the data pointed to by the + * identify field is valid. + */ + unsigned long identify_valid; + /* Base address of the memory mapped IO for the port. */ + void __iomem *mmio; + /* Array of pointers to the memory mapped s_active registers. */ + void __iomem *s_active[MTIP_MAX_SLOT_GROUPS]; + /* Array of pointers to the memory mapped completed registers. */ + void __iomem *completed[MTIP_MAX_SLOT_GROUPS]; + /* Array of pointers to the memory mapped Command Issue registers. */ + void __iomem *cmd_issue[MTIP_MAX_SLOT_GROUPS]; + /* + * Pointer to the beginning of the command header memory as used + * by the driver. + */ + void *command_list; + /* + * Pointer to the beginning of the command header memory as used + * by the DMA. + */ + dma_addr_t command_list_dma; + /* + * Pointer to the beginning of the RX FIS memory as used + * by the driver. + */ + void *rxfis; + /* + * Pointer to the beginning of the RX FIS memory as used + * by the DMA. + */ + dma_addr_t rxfis_dma; + /* + * Pointer to the beginning of the command table memory as used + * by the driver. + */ + void *command_table; + /* + * Pointer to the beginning of the command table memory as used + * by the DMA. + */ + dma_addr_t command_tbl_dma; + /* + * Pointer to the beginning of the identify data memory as used + * by the driver. + */ + u16 *identify; + /* + * Pointer to the beginning of the identify data memory as used + * by the DMA. + */ + dma_addr_t identify_dma; + /* + * Pointer to the beginning of a sector buffer that is used + * by the driver when issuing internal commands. + */ + u16 *sector_buffer; + /* + * Pointer to the beginning of a sector buffer that is used + * by the DMA when the driver issues internal commands. + */ + dma_addr_t sector_buffer_dma; + /* + * Bit significant, used to determine if a command slot has + * been allocated. i.e. the slot is in use. Bits are cleared + * when the command slot and all associated data structures + * are no longer needed. + */ + unsigned long allocated[SLOTBITS_IN_LONGS]; + /* + * Array of command slots. Structure includes pointers to the + * command header and command table, and completion function and data + * pointers. 
+ */ + struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS]; + /* Non-zero if an internal command is in progress. */ + int internal_cmd_in_progress; + /* + * Timer used to complete commands that have been active for too long. + */ + struct timer_list cmd_timer; + /* + * Semaphore used to block threads if there are no + * command slots available. + */ + struct semaphore cmd_slot; + /* Spinlock for working around command-issue bug. */ + spinlock_t cmd_issue_lock; +}; + +/* + * Driver private data structure. + * + * One structure is allocated per probed device. + */ +struct driver_data { + void __iomem *mmio; /* Base address of the HBA registers. */ + + int major; /* Major device number. */ + + int instance; /* Instance number. First device probed is 0, ... */ + + int protocol; /* FIXME: Protocol ops array index. */ + + struct gendisk *disk; /* Pointer to our gendisk structure. */ + + struct pci_dev *pdev; /* Pointer to the PCI device structure. */ + + struct request_queue *queue; /* Our request queue. */ + /* + * Semaphore used to lock out read/write commands during the + * execution of an internal command. + */ + struct rw_semaphore internal_sem; + + struct mtip_port *port; /* Pointer to the port data structure. */ + + /* Tasklet used to process the bottom half of the ISR. */ + struct tasklet_struct tasklet; + + unsigned product_type; /* magic value declaring the product type */ + + unsigned slot_groups; /* number of slot groups the product supports */ + + atomic_t drv_cleanup_done; /* Atomic variable for SRSI */ + + unsigned long index; /* Index to determine the disk name */ + + unsigned int ftlrebuildflag; /* FTL rebuild flag */ + + atomic_t resumeflag; /* Atomic variable to track suspend/resume */ + + atomic_t eh_active; /* Flag for error handling tracking */ +}; + +/* Function declarations */ +extern int mtip_block_initialize(struct driver_data *dd); +extern int mtip_block_remove(struct driver_data *dd); +extern int mtip_block_shutdown(struct driver_data *dd); +extern int mtip_block_suspend(struct driver_data *dd); +extern int mtip_block_resume(struct driver_data *dd); +extern int mtip_hw_init(struct driver_data *dd); +extern int mtip_hw_exit(struct driver_data *dd); +extern int mtip_hw_shutdown(struct driver_data *dd); +extern bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors); +extern void mtip_hw_release_scatterlist( + struct driver_data *dd, + int tag); +extern struct scatterlist *mtip_hw_get_scatterlist( + struct driver_data *dd, + int *tag); +extern void mtip_hw_submit_io(struct driver_data *dd, + sector_t start, + int nsect, + int nents, + int tag, + void *callback, + void *data, + int barrier, + int dir); +extern int mtip_hw_ioctl(struct driver_data *dd, + unsigned int cmd, + unsigned long arg, + unsigned char compat); +extern int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj); +extern int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj); +extern int mtip_hw_resume(struct driver_data *dd); +extern int mtip_hw_suspend(struct driver_data *dd); +void mtip_command_cleanup(struct driver_data *dd); +bool mtip_check_surprise_removal(struct pci_dev *pdev); +void mtip_restart_port(struct mtip_port *port); + +#endif -- cgit From 16d02c040bb6769068f7c4b54ea8542f14237362 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 Sep 2011 15:50:01 -0600 Subject: mtip32xx: fix warnings/errors on 32-bit compiles We need to clean up the compat ioctl handling, but this makes it work for now at least. 
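The compat structure exists because a 32-bit caller lays out ide_task_request_s with 4-byte longs while an LP64 kernel sees 8-byte longs; on a 32-bit build there is no such mismatch, so the compat definitions can be compiled out entirely. A rough standalone sketch of the size difference follows (simplified, illustrative field names, not the real ide_task_request_s definition):

#include <stdint.h>
#include <stdio.h>

/* What an LP64 kernel sees for the trailing 'unsigned long' fields. */
struct task_request_native {
	uint8_t       io_ports[8];
	uint8_t       hob_ports[8];
	unsigned long out_size;	/* 8 bytes on x86_64 */
	unsigned long in_size;
};

/* What a 32-bit process actually wrote: 4-byte longs, corresponding to
 * the compat_ulong_t fields in the kernel's compat variant. */
struct task_request_compat {
	uint8_t  io_ports[8];
	uint8_t  hob_ports[8];
	uint32_t out_size;
	uint32_t in_size;
};

int main(void)
{
	printf("native %zu bytes, compat %zu bytes\n",
	       sizeof(struct task_request_native),
	       sizeof(struct task_request_compat));
	return 0;
}

Copying the full native size from a 32-bit caller would read past the end of the user buffer, which is why the compat path copies everything up to the two trailing size fields and then fetches out_size and in_size separately with get_user().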
Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 847b8ff7b8c2..1cf2b0443571 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -85,6 +85,7 @@ int mtip_major; static DEFINE_SPINLOCK(rssd_index_lock); static DEFINE_IDA(rssd_index_ida); +#ifdef CONFIG_COMPAT struct mtip_compat_ide_task_request_s { __u8 io_ports[8]; __u8 hob_ports[8]; @@ -95,6 +96,7 @@ struct mtip_compat_ide_task_request_s { compat_ulong_t out_size; compat_ulong_t in_size; }; +#endif static int mtip_exec_internal_command(struct mtip_port *port, void *fis, @@ -1628,9 +1630,9 @@ static int exec_drive_taskfile(struct driver_data *dd, ide_task_request_t *req_task; u8 *outbuf = NULL; u8 *inbuf = NULL; - dma_addr_t outbuf_dma = (dma_addr_t)NULL; - dma_addr_t inbuf_dma = (dma_addr_t)NULL; - dma_addr_t dma_buffer = (dma_addr_t)NULL; + dma_addr_t outbuf_dma = 0; + dma_addr_t inbuf_dma = 0; + dma_addr_t dma_buffer = 0; int err = 0; int tasksize = sizeof(struct ide_task_request_s); unsigned int taskin = 0; @@ -1642,14 +1644,18 @@ static int exec_drive_taskfile(struct driver_data *dd, unsigned int transfer_size; unsigned long task_file_data; int intotal, outtotal; +#ifdef CONFIG_COMPAT struct mtip_compat_ide_task_request_s *compat_req_task = NULL; int compat_tasksize = sizeof(struct mtip_compat_ide_task_request_s); +#endif + req_task = kzalloc(tasksize, GFP_KERNEL); if (req_task == NULL) return -ENOMEM; if (compat == 1) { +#ifdef CONFIG_COMPAT compat_req_task = (struct mtip_compat_ide_task_request_s __user *) arg; @@ -1672,6 +1678,10 @@ static int exec_drive_taskfile(struct driver_data *dd, outtotal = compat_tasksize; intotal = compat_tasksize + req_task->out_size; +#else + outtotal = 0; + intotal = 0; +#endif } else { if (copy_from_user(req_task, buf, tasksize)) { kfree(req_task); @@ -1705,7 +1715,7 @@ static int exec_drive_taskfile(struct driver_data *dd, outbuf, taskout, DMA_TO_DEVICE); - if (outbuf_dma == (dma_addr_t)NULL) { + if (outbuf_dma == 0) { err = -ENOMEM; goto abort; } @@ -1726,7 +1736,7 @@ static int exec_drive_taskfile(struct driver_data *dd, inbuf_dma = pci_map_single(dd->pdev, inbuf, taskin, DMA_FROM_DEVICE); - if (inbuf_dma == (dma_addr_t)NULL) { + if (inbuf_dma == 0) { err = -ENOMEM; goto abort; } @@ -1868,8 +1878,8 @@ static int exec_drive_taskfile(struct driver_data *dd, if (outbuf_dma) pci_unmap_single(dd->pdev, outbuf_dma, taskout, DMA_TO_DEVICE); - inbuf_dma = (dma_addr_t) NULL; - outbuf_dma = (dma_addr_t) NULL; + inbuf_dma = 0; + outbuf_dma = 0; /* return the ATA registers to the caller.*/ req_task->io_ports[1] = reply->features; @@ -1913,6 +1923,7 @@ static int exec_drive_taskfile(struct driver_data *dd, up_write(&dd->internal_sem); if (compat == 1) { +#ifdef CONFIG_COMPAT if (copy_to_user(buf, req_task, compat_tasksize - (2 * sizeof(compat_long_t)))) { @@ -1928,6 +1939,7 @@ static int exec_drive_taskfile(struct driver_data *dd, err = -EFAULT; goto abort; } +#endif } else { if (copy_to_user(buf, req_task, tasksize)) { err = -EFAULT; @@ -2873,6 +2885,7 @@ static int mtip_block_ioctl(struct block_device *dev, } } +#ifdef CONFIG_COMPAT /* * Block layer compat IOCTL handler. * @@ -2906,6 +2919,7 @@ static int mtip_block_compat_ioctl(struct block_device *dev, return mtip_hw_ioctl(dd, cmd, arg, 1); } } +#endif /* * Obtain the geometry of the device. 
@@ -2959,7 +2973,9 @@ static int mtip_block_getgeo(struct block_device *dev, */ static const struct block_device_operations mtip_block_ops = { .ioctl = mtip_block_ioctl, +#ifdef CONFIG_COMPAT .compat_ioctl = mtip_block_compat_ioctl, +#endif .getgeo = mtip_block_getgeo, .owner = THIS_MODULE }; -- cgit From ef0f1587343a4c17b9b0d9061546e36c1c1bb2ec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 Sep 2011 21:19:53 -0600 Subject: mtip32xx: cleanup compat ioctl handling Do the conversion/copy up front instead of passing in a compat flag to the ioctl handler and subsequently to the exec_drive_taskfile() function. Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 151 ++++++++++++++++---------------------- drivers/block/mtip32xx/mtip32xx.h | 4 - 2 files changed, 64 insertions(+), 91 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 1cf2b0443571..d58581b83c8d 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1622,76 +1622,26 @@ static unsigned int implicit_sector(unsigned char command, * See ide_taskfile_ioctl() for derivation */ static int exec_drive_taskfile(struct driver_data *dd, - unsigned long arg, - unsigned char compat) + void __user *buf, + ide_task_request_t *req_task, + int outtotal) { struct host_to_dev_fis fis; struct host_to_dev_fis *reply; - ide_task_request_t *req_task; u8 *outbuf = NULL; u8 *inbuf = NULL; dma_addr_t outbuf_dma = 0; dma_addr_t inbuf_dma = 0; dma_addr_t dma_buffer = 0; int err = 0; - int tasksize = sizeof(struct ide_task_request_s); unsigned int taskin = 0; unsigned int taskout = 0; u8 nsect = 0; - char __user *buf = (char __user *)arg; unsigned int timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS; unsigned int force_single_sector; unsigned int transfer_size; unsigned long task_file_data; - int intotal, outtotal; -#ifdef CONFIG_COMPAT - struct mtip_compat_ide_task_request_s *compat_req_task = NULL; - int compat_tasksize = sizeof(struct mtip_compat_ide_task_request_s); -#endif - - - req_task = kzalloc(tasksize, GFP_KERNEL); - if (req_task == NULL) - return -ENOMEM; - - if (compat == 1) { -#ifdef CONFIG_COMPAT - compat_req_task = - (struct mtip_compat_ide_task_request_s __user *) arg; - - if (copy_from_user(req_task, buf, - compat_tasksize - - (2 * sizeof(compat_long_t)))) { - err = -EFAULT; - goto abort; - } - - if (get_user(req_task->out_size, &compat_req_task->out_size)) { - err = -EFAULT; - goto abort; - } - - if (get_user(req_task->in_size, &compat_req_task->in_size)) { - err = -EFAULT; - goto abort; - } - - outtotal = compat_tasksize; - intotal = compat_tasksize + req_task->out_size; -#else - outtotal = 0; - intotal = 0; -#endif - } else { - if (copy_from_user(req_task, buf, tasksize)) { - kfree(req_task); - err = -EFAULT; - goto abort; - } - - outtotal = tasksize; - intotal = tasksize + req_task->out_size; - } + int intotal = outtotal + req_task->out_size; taskout = req_task->out_size; taskin = req_task->in_size; @@ -1922,30 +1872,6 @@ static int exec_drive_taskfile(struct driver_data *dd, up_write(&dd->internal_sem); - if (compat == 1) { -#ifdef CONFIG_COMPAT - if (copy_to_user(buf, req_task, - compat_tasksize - - (2 * sizeof(compat_long_t)))) { - err = -EFAULT; - goto abort; - } - if (put_user(req_task->out_size, - &compat_req_task->out_size)) { - err = -EFAULT; - goto abort; - } - if (put_user(req_task->in_size, &compat_req_task->in_size)) { - err = -EFAULT; - goto abort; - } -#endif - } else { - if (copy_to_user(buf, 
req_task, tasksize)) { - err = -EFAULT; - goto abort; - } - } if (taskout) { if (copy_to_user(buf + outtotal, outbuf, taskout)) { err = -EFAULT; @@ -1965,7 +1891,6 @@ abort: if (outbuf_dma) pci_unmap_single(dd->pdev, outbuf_dma, taskout, DMA_TO_DEVICE); - kfree(req_task); kfree(outbuf); kfree(inbuf); @@ -1989,10 +1914,8 @@ abort: * -EFAULT An error occurred copying data to a user space buffer. * -EIO An error occurred while executing the command. */ -int mtip_hw_ioctl(struct driver_data *dd, - unsigned int cmd, - unsigned long arg, - unsigned char compat) +static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, + unsigned long arg) { switch (cmd) { case HDIO_GET_IDENTITY: @@ -2049,8 +1972,24 @@ int mtip_hw_ioctl(struct driver_data *dd, break; } - case HDIO_DRIVE_TASKFILE: - return exec_drive_taskfile(dd, arg, compat); + case HDIO_DRIVE_TASKFILE: { + ide_task_request_t req_task; + int ret, outtotal; + + if (copy_from_user(&req_task, (void __user *) arg, + sizeof(req_task))) + return -EFAULT; + + outtotal = sizeof(req_task); + + ret = exec_drive_taskfile(dd, (void __user *) arg, + &req_task, outtotal); + + if (copy_to_user((void __user *) arg, &req_task, sizeof(req_task))) + return -EFAULT; + + return ret; + } default: return -EINVAL; @@ -2881,7 +2820,7 @@ static int mtip_block_ioctl(struct block_device *dev, case BLKFLSBUF: return 0; default: - return mtip_hw_ioctl(dd, cmd, arg, 0); + return mtip_hw_ioctl(dd, cmd, arg); } } @@ -2915,8 +2854,46 @@ static int mtip_block_compat_ioctl(struct block_device *dev, switch (cmd) { case BLKFLSBUF: return 0; + case HDIO_DRIVE_TASKFILE: { + struct mtip_compat_ide_task_request_s *compat_req_task; + ide_task_request_t req_task; + int compat_tasksize, outtotal, ret; + + compat_tasksize = sizeof(struct mtip_compat_ide_task_request_s); + + compat_req_task = + (struct mtip_compat_ide_task_request_s __user *) arg; + + if (copy_from_user(&req_task, (void __user *) arg, + compat_tasksize - (2 * sizeof(compat_long_t)))) + return -EFAULT; + + if (get_user(req_task.out_size, &compat_req_task->out_size)) + return -EFAULT; + + if (get_user(req_task.in_size, &compat_req_task->in_size)) + return -EFAULT; + + outtotal = sizeof(struct mtip_compat_ide_task_request_s); + + ret = exec_drive_taskfile(dd, (void __user *) arg, + &req_task, outtotal); + + if (copy_to_user((void __user *) arg, &req_task, + compat_tasksize - + (2 * sizeof(compat_long_t)))) + return -EFAULT; + + if (put_user(req_task.out_size, &compat_req_task->out_size)) + return -EFAULT; + + if (put_user(req_task.in_size, &compat_req_task->in_size)) + return -EFAULT; + + return ret; + } default: - return mtip_hw_ioctl(dd, cmd, arg, 1); + return mtip_hw_ioctl(dd, cmd, arg); } } #endif diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 3423d18e7c86..d6355c6f218f 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -430,10 +430,6 @@ extern void mtip_hw_submit_io(struct driver_data *dd, void *data, int barrier, int dir); -extern int mtip_hw_ioctl(struct driver_data *dd, - unsigned int cmd, - unsigned long arg, - unsigned char compat); extern int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj); extern int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj); extern int mtip_hw_resume(struct driver_data *dd); -- cgit From 6316668fbcf8a0a166830e7e84cdbdf0ab9392c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 Sep 2011 21:27:43 -0600 Subject: mtip32xx: ensure that all local functions are static 
Kill the declarations in the header file and mark them as static. Reshuffle a few functions to ensure that everything is properly declared before being used. Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 646 +++++++++++++++++++------------------- drivers/block/mtip32xx/mtip32xx.h | 33 -- 2 files changed, 320 insertions(+), 359 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index d58581b83c8d..40d840e56256 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -107,6 +107,72 @@ static int mtip_exec_internal_command(struct mtip_port *port, gfp_t atomic, unsigned long timeout); +/* + * This function check_for_surprise_removal is called + * while card is removed from the system and it will + * read the vendor id from the configration space + * + * @pdev Pointer to the pci_dev structure. + * + * return value + * true if device removed, else false + */ +static bool mtip_check_surprise_removal(struct pci_dev *pdev) +{ + u16 vendor_id = 0; + + /* Read the vendorID from the configuration space */ + pci_read_config_word(pdev, 0x00, &vendor_id); + if (vendor_id == 0xFFFF) + return true; /* device removed */ + + return false; /* device present */ +} + +/* + * This function is called for clean the pending command in the + * command slot during the surprise removal of device and return + * error to the upper layer. + * + * @dd Pointer to the DRIVER_DATA structure. + * + * return value + * None + */ +static void mtip_command_cleanup(struct driver_data *dd) +{ + int group = 0, commandslot = 0, commandindex = 0; + struct mtip_cmd *command; + struct mtip_port *port = dd->port; + + for (group = 0; group < 4; group++) { + for (commandslot = 0; commandslot < 32; commandslot++) { + if (!(port->allocated[group] & (1 << commandslot))) + continue; + + commandindex = group << 5 | commandslot; + command = &port->commands[commandindex]; + + if (atomic_read(&command->active) + && (command->async_callback)) { + command->async_callback(command->async_data, + -ENODEV); + command->async_callback = NULL; + command->async_data = NULL; + } + + dma_unmap_sg(&port->dd->pdev->dev, + command->sg, + command->scatter_ents, + command->direction); + } + } + + up(&port->cmd_slot); + + atomic_set(&dd->drv_cleanup_done, true); +} + /* * Obtain an empty command slot. * @@ -167,214 +233,72 @@ static inline void release_slot(struct mtip_port *port, int tag) } /* - * Issue a command to the hardware. - * - * Set the appropriate bit in the s_active and Command Issue hardware - * registers, causing hardware command processing to begin. - * - * @port Pointer to the port structure. - * @tag The tag of the command to be issued. + * Reset the HBA (without sleeping) * - * return value - * None - */ -static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) -{ - unsigned long flags = 0; - - atomic_set(&port->commands[tag].active, 1); - - spin_lock_irqsave(&port->cmd_issue_lock, flags); - - writel((1 << MTIP_TAG_BIT(tag)), - port->s_active[MTIP_TAG_INDEX(tag)]); - writel((1 << MTIP_TAG_BIT(tag)), - port->cmd_issue[MTIP_TAG_INDEX(tag)]); - - spin_unlock_irqrestore(&port->cmd_issue_lock, flags); -} - -/* - * Called periodically to see if any read/write commands are - * taking too long to complete. + * Just like hba_reset, except does not call sleep, so can be + * run from interrupt/tasklet context. * - * @data Pointer to the PORT data structure. + * @dd Pointer to the driver data structure. 
* * return value - * None + * 0 The reset was successful. + * -1 The HBA Reset bit did not clear. */ -void mtip_timeout_function(unsigned long int data) +static int hba_reset_nosleep(struct driver_data *dd) { - struct mtip_port *port = (struct mtip_port *) data; - struct host_to_dev_fis *fis; - struct mtip_cmd *command; - int tag, cmdto_cnt = 0; - unsigned int bit, group; - unsigned int num_command_slots = port->dd->slot_groups * 32; - - if (unlikely(!port)) - return; - - if (atomic_read(&port->dd->resumeflag) == true) { - mod_timer(&port->cmd_timer, - jiffies + msecs_to_jiffies(30000)); - return; - } - - for (tag = 0; tag < num_command_slots; tag++) { - /* - * Skip internal command slot as it has - * its own timeout mechanism - */ - if (tag == MTIP_TAG_INTERNAL) - continue; - - if (atomic_read(&port->commands[tag].active) && - (time_after(jiffies, port->commands[tag].comp_time))) { - group = tag >> 5; - bit = tag & 0x1f; - - command = &port->commands[tag]; - fis = (struct host_to_dev_fis *) command->command; - - dev_warn(&port->dd->pdev->dev, - "Timeout for command tag %d\n", tag); - - cmdto_cnt++; - if (cmdto_cnt == 1) - atomic_inc(&port->dd->eh_active); - - /* - * Clear the completed bit. This should prevent - * any interrupt handlers from trying to retire - * the command. - */ - writel(1 << bit, port->completed[group]); + unsigned long timeout; - /* Call the async completion callback. */ - if (likely(command->async_callback)) - command->async_callback(command->async_data, - -EIO); - command->async_callback = NULL; - command->comp_func = NULL; + /* Chip quirk: quiesce any chip function */ + mdelay(10); - /* Unmap the DMA scatter list entries */ - dma_unmap_sg(&port->dd->pdev->dev, - command->sg, - command->scatter_ents, - command->direction); + /* Set the reset bit */ + writel(HOST_RESET, dd->mmio + HOST_CTL); - /* - * Clear the allocated bit and active tag for the - * command. - */ - atomic_set(&port->commands[tag].active, 0); - release_slot(port, tag); + /* Flush */ + readl(dd->mmio + HOST_CTL); - up(&port->cmd_slot); - } - } + /* + * Wait 10ms then spin for up to 1 second + * waiting for reset acknowledgement + */ + timeout = jiffies + msecs_to_jiffies(1000); + mdelay(10); + while ((readl(dd->mmio + HOST_CTL) & HOST_RESET) + && time_before(jiffies, timeout)) + mdelay(1); - if (cmdto_cnt) { - dev_warn(&port->dd->pdev->dev, - "%d commands timed out: restarting port", - cmdto_cnt); - mtip_restart_port(port); - atomic_dec(&port->dd->eh_active); - } + if (readl(dd->mmio + HOST_CTL) & HOST_RESET) + return -1; - /* Restart the timer */ - mod_timer(&port->cmd_timer, - jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); + return 0; } /* - * IO completion function. + * Issue a command to the hardware. * - * This completion function is called by the driver ISR when a - * command that was issued by the kernel completes. It first calls the - * asynchronous completion function which normally calls back into the block - * layer passing the asynchronous callback data, then unmaps the - * scatter list associated with the completed command, and finally - * clears the allocated bit associated with the completed command. + * Set the appropriate bit in the s_active and Command Issue hardware + * registers, causing hardware command processing to begin. * - * @port Pointer to the port data structure. - * @tag Tag of the command. - * @data Pointer to driver_data. - * @status Completion status. + * @port Pointer to the port structure. + * @tag The tag of the command to be issued. 
* * return value - * None + * None */ -static void mtip_async_complete(struct mtip_port *port, - int tag, - void *data, - int status) +static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) { - struct mtip_cmd *command; - struct driver_data *dd = data; - int cb_status = status ? -EIO : 0; - - if (unlikely(!dd) || unlikely(!port)) - return; - - command = &port->commands[tag]; - - if (unlikely(status == PORT_IRQ_TF_ERR)) { - dev_warn(&port->dd->pdev->dev, - "Command tag %d failed due to TFE\n", tag); - } - - /* Upper layer callback */ - if (likely(command->async_callback)) - command->async_callback(command->async_data, cb_status); - - command->async_callback = NULL; - command->comp_func = NULL; - - /* Unmap the DMA scatter list entries */ - dma_unmap_sg(&dd->pdev->dev, - command->sg, - command->scatter_ents, - command->direction); - - /* Clear the allocated and active bits for the command */ - atomic_set(&port->commands[tag].active, 0); - release_slot(port, tag); + unsigned long flags = 0; - up(&port->cmd_slot); -} + atomic_set(&port->commands[tag].active, 1); -/* - * Internal command completion callback function. - * - * This function is normally called by the driver ISR when an internal - * command completed. This function signals the command completion by - * calling complete(). - * - * @port Pointer to the port data structure. - * @tag Tag of the command that has completed. - * @data Pointer to a completion structure. - * @status Completion status. - * - * return value - * None - */ -static void mtip_completion(struct mtip_port *port, - int tag, - void *data, - int status) -{ - struct mtip_cmd *command = &port->commands[tag]; - struct completion *waiting = data; - if (unlikely(status == PORT_IRQ_TF_ERR)) - dev_warn(&port->dd->pdev->dev, - "Internal command %d completed with TFE\n", tag); + spin_lock_irqsave(&port->cmd_issue_lock, flags); - command->async_callback = NULL; - command->comp_func = NULL; + writel((1 << MTIP_TAG_BIT(tag)), + port->s_active[MTIP_TAG_INDEX(tag)]); + writel((1 << MTIP_TAG_BIT(tag)), + port->cmd_issue[MTIP_TAG_INDEX(tag)]); - complete(waiting); + spin_unlock_irqrestore(&port->cmd_issue_lock, flags); } /* @@ -496,56 +420,15 @@ static void mtip_init_port(struct mtip_port *port) /* Clear SError */ writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR); - /* reset the completed registers.*/ - for (i = 0; i < port->dd->slot_groups; i++) - writel(0xFFFFFFFF, port->completed[i]); - - /* Clear any pending interrupts for this port */ - writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT); - - /* Enable port interrupts */ - writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK); -} - -/* - * Reset the HBA (without sleeping) - * - * Just like hba_reset, except does not call sleep, so can be - * run from interrupt/tasklet context. - * - * @dd Pointer to the driver data structure. - * - * return value - * 0 The reset was successful. - * -1 The HBA Reset bit did not clear. 
- */ -int hba_reset_nosleep(struct driver_data *dd) -{ - unsigned long timeout; - - /* Chip quirk: quiesce any chip function */ - mdelay(10); - - /* Set the reset bit */ - writel(HOST_RESET, dd->mmio + HOST_CTL); - - /* Flush */ - readl(dd->mmio + HOST_CTL); - - /* - * Wait 10ms then spin for up to 1 second - * waiting for reset acknowledgement - */ - timeout = jiffies + msecs_to_jiffies(1000); - mdelay(10); - while ((readl(dd->mmio + HOST_CTL) & HOST_RESET) - && time_before(jiffies, timeout)) - mdelay(1); + /* reset the completed registers.*/ + for (i = 0; i < port->dd->slot_groups; i++) + writel(0xFFFFFFFF, port->completed[i]); - if (readl(dd->mmio + HOST_CTL) & HOST_RESET) - return -1; + /* Clear any pending interrupts for this port */ + writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT); - return 0; + /* Enable port interrupts */ + writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK); } /* @@ -556,7 +439,7 @@ int hba_reset_nosleep(struct driver_data *dd) * return value * None */ -void mtip_restart_port(struct mtip_port *port) +static void mtip_restart_port(struct mtip_port *port) { unsigned long timeout; @@ -619,6 +502,189 @@ void mtip_restart_port(struct mtip_port *port) mtip_enable_engine(port, 1); } +/* + * Called periodically to see if any read/write commands are + * taking too long to complete. + * + * @data Pointer to the PORT data structure. + * + * return value + * None + */ +static void mtip_timeout_function(unsigned long int data) +{ + struct mtip_port *port = (struct mtip_port *) data; + struct host_to_dev_fis *fis; + struct mtip_cmd *command; + int tag, cmdto_cnt = 0; + unsigned int bit, group; + unsigned int num_command_slots = port->dd->slot_groups * 32; + + if (unlikely(!port)) + return; + + if (atomic_read(&port->dd->resumeflag) == true) { + mod_timer(&port->cmd_timer, + jiffies + msecs_to_jiffies(30000)); + return; + } + + for (tag = 0; tag < num_command_slots; tag++) { + /* + * Skip internal command slot as it has + * its own timeout mechanism + */ + if (tag == MTIP_TAG_INTERNAL) + continue; + + if (atomic_read(&port->commands[tag].active) && + (time_after(jiffies, port->commands[tag].comp_time))) { + group = tag >> 5; + bit = tag & 0x1f; + + command = &port->commands[tag]; + fis = (struct host_to_dev_fis *) command->command; + + dev_warn(&port->dd->pdev->dev, + "Timeout for command tag %d\n", tag); + + cmdto_cnt++; + if (cmdto_cnt == 1) + atomic_inc(&port->dd->eh_active); + + /* + * Clear the completed bit. This should prevent + * any interrupt handlers from trying to retire + * the command. + */ + writel(1 << bit, port->completed[group]); + + /* Call the async completion callback. */ + if (likely(command->async_callback)) + command->async_callback(command->async_data, + -EIO); + command->async_callback = NULL; + command->comp_func = NULL; + + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&port->dd->pdev->dev, + command->sg, + command->scatter_ents, + command->direction); + + /* + * Clear the allocated bit and active tag for the + * command. + */ + atomic_set(&port->commands[tag].active, 0); + release_slot(port, tag); + + up(&port->cmd_slot); + } + } + + if (cmdto_cnt) { + dev_warn(&port->dd->pdev->dev, + "%d commands timed out: restarting port", + cmdto_cnt); + mtip_restart_port(port); + atomic_dec(&port->dd->eh_active); + } + + /* Restart the timer */ + mod_timer(&port->cmd_timer, + jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); +} + +/* + * IO completion function. 
+ * + * This completion function is called by the driver ISR when a + * command that was issued by the kernel completes. It first calls the + * asynchronous completion function which normally calls back into the block + * layer passing the asynchronous callback data, then unmaps the + * scatter list associated with the completed command, and finally + * clears the allocated bit associated with the completed command. + * + * @port Pointer to the port data structure. + * @tag Tag of the command. + * @data Pointer to driver_data. + * @status Completion status. + * + * return value + * None + */ +static void mtip_async_complete(struct mtip_port *port, + int tag, + void *data, + int status) +{ + struct mtip_cmd *command; + struct driver_data *dd = data; + int cb_status = status ? -EIO : 0; + + if (unlikely(!dd) || unlikely(!port)) + return; + + command = &port->commands[tag]; + + if (unlikely(status == PORT_IRQ_TF_ERR)) { + dev_warn(&port->dd->pdev->dev, + "Command tag %d failed due to TFE\n", tag); + } + + /* Upper layer callback */ + if (likely(command->async_callback)) + command->async_callback(command->async_data, cb_status); + + command->async_callback = NULL; + command->comp_func = NULL; + + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&dd->pdev->dev, + command->sg, + command->scatter_ents, + command->direction); + + /* Clear the allocated and active bits for the command */ + atomic_set(&port->commands[tag].active, 0); + release_slot(port, tag); + + up(&port->cmd_slot); +} + +/* + * Internal command completion callback function. + * + * This function is normally called by the driver ISR when an internal + * command completed. This function signals the command completion by + * calling complete(). + * + * @port Pointer to the port data structure. + * @tag Tag of the command that has completed. + * @data Pointer to a completion structure. + * @status Completion status. + * + * return value + * None + */ +static void mtip_completion(struct mtip_port *port, + int tag, + void *data, + int status) +{ + struct mtip_cmd *command = &port->commands[tag]; + struct completion *waiting = data; + if (unlikely(status == PORT_IRQ_TF_ERR)) + dev_warn(&port->dd->pdev->dev, + "Internal command %d completed with TFE\n", tag); + + command->async_callback = NULL; + command->comp_func = NULL; + + complete(waiting); +} + /* * Helper function for tag logging */ @@ -1276,7 +1342,7 @@ static int mtip_standby_immediate(struct mtip_port *port) * 1 Capacity was returned successfully. * 0 The identify information is invalid. */ -bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors) +static bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors) { struct mtip_port *port = dd->port; u64 total, raw0, raw1, raw2, raw3; @@ -1423,7 +1489,7 @@ static inline void fill_command_sg(struct driver_data *dd, * return value 0 The command completed successfully. * return value -1 An error occurred while executing the command. */ -int exec_drive_task(struct mtip_port *port, u8 *command) +static int exec_drive_task(struct mtip_port *port, u8 *command) { struct host_to_dev_fis fis; struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); @@ -1499,8 +1565,8 @@ int exec_drive_task(struct mtip_port *port, u8 *command) * data to the user space buffer. * return value -1 An error occurred while executing the command. 
*/ -int exec_drive_command(struct mtip_port *port, u8 *command, - void __user *user_buffer) +static int exec_drive_command(struct mtip_port *port, u8 *command, + void __user *user_buffer) { struct host_to_dev_fis fis; struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); @@ -2020,15 +2086,9 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * return value * None */ -void mtip_hw_submit_io(struct driver_data *dd, - sector_t start, - int nsect, - int nents, - int tag, - void *callback, - void *data, - int barrier, - int dir) +static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, + int nsect, int nents, int tag, void *callback, + void *data, int barrier, int dir) { struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; @@ -2115,7 +2175,7 @@ void mtip_hw_submit_io(struct driver_data *dd, * return value * None */ -void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) +static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) { release_slot(dd->port, tag); } @@ -2131,8 +2191,8 @@ void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) * Pointer to the scatter list for the allocated command slot * or NULL if no command slots are available. */ -struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, - int *tag) +static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, + int *tag) { /* * It is possible that, even with this semaphore, a thread @@ -2216,7 +2276,7 @@ static DEVICE_ATTR(registers, S_IRUGO, hw_show_registers, NULL); * 0 Operation completed successfully. * -EINVAL Invalid parameter. */ -int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj) +static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj) { if (!kobj || !dd) return -EINVAL; @@ -2237,7 +2297,7 @@ int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj) * 0 Operation completed successfully. * -EINVAL Invalid parameter. */ -int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj) +static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj) { if (!kobj || !dd) return -EINVAL; @@ -2385,7 +2445,7 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) * return value * 0 on success, else an error code. 
*/ -int mtip_hw_init(struct driver_data *dd) +static int mtip_hw_init(struct driver_data *dd) { int i; int rv; @@ -2586,7 +2646,7 @@ out1: * return value * 0 */ -int mtip_hw_exit(struct driver_data *dd) +static int mtip_hw_exit(struct driver_data *dd) { /* * Send standby immediate (E0h) to the drive so that it @@ -2634,7 +2694,7 @@ int mtip_hw_exit(struct driver_data *dd) * return value * 0 */ -int mtip_hw_shutdown(struct driver_data *dd) +static int mtip_hw_shutdown(struct driver_data *dd) { /* * Send standby immediate (E0h) to the drive so that it @@ -2657,7 +2717,7 @@ int mtip_hw_shutdown(struct driver_data *dd) * 0 Suspend was successful * -EFAULT Suspend was not successful */ -int mtip_hw_suspend(struct driver_data *dd) +static int mtip_hw_suspend(struct driver_data *dd) { /* * Send standby immediate (E0h) to the drive @@ -2689,7 +2749,7 @@ int mtip_hw_suspend(struct driver_data *dd) * 0 Resume was successful * -EFAULT Resume was not successful */ -int mtip_hw_resume(struct driver_data *dd) +static int mtip_hw_resume(struct driver_data *dd) { /* Perform any needed hardware setup steps */ hba_setup(dd); @@ -2715,50 +2775,6 @@ int mtip_hw_resume(struct driver_data *dd) return 0; } -/* - * This function is called for clean the pending command in the - * command slot during the surprise removal of device and return - * error to the upper layer. - * - * @dd Pointer to the DRIVER_DATA structure. - * - * return value - * None - */ -void mtip_command_cleanup(struct driver_data *dd) -{ - int group = 0, commandslot = 0, commandindex = 0; - struct mtip_cmd *command; - struct mtip_port *port = dd->port; - - for (group = 0; group < 4; group++) { - for (commandslot = 0; commandslot < 32; commandslot++) { - if (!(port->allocated[group] & (1 << commandslot))) - continue; - - commandindex = group << 5 | commandslot; - command = &port->commands[commandindex]; - - if (atomic_read(&command->active) - && (command->async_callback)) { - command->async_callback(command->async_data, - -ENODEV); - command->async_callback = NULL; - command->async_data = NULL; - } - - dma_unmap_sg(&port->dd->pdev->dev, - command->sg, - command->scatter_ents, - command->direction); - } - } - - up(&port->cmd_slot); - - atomic_set(&dd->drv_cleanup_done, true); -} - /* * Helper function for reusing disk name * upon hot insertion. @@ -3037,7 +3053,7 @@ static int mtip_make_request(struct request_queue *queue, struct bio *bio) * return value * 0 on success else an error code. */ -int mtip_block_initialize(struct driver_data *dd) +static int mtip_block_initialize(struct driver_data *dd) { int rv = 0; sector_t capacity; @@ -3168,7 +3184,7 @@ protocol_init_error: * return value * 0 */ -int mtip_block_remove(struct driver_data *dd) +static int mtip_block_remove(struct driver_data *dd) { struct kobject *kobj; /* Clean up the sysfs attributes managed by the protocol layer. 
*/ @@ -3205,7 +3221,7 @@ int mtip_block_remove(struct driver_data *dd) * return value * 0 */ -int mtip_block_shutdown(struct driver_data *dd) +static int mtip_block_shutdown(struct driver_data *dd) { dev_info(&dd->pdev->dev, "Shutting down %s ...\n", dd->disk->disk_name); @@ -3220,7 +3236,7 @@ int mtip_block_shutdown(struct driver_data *dd) return 0; } -int mtip_block_suspend(struct driver_data *dd) +static int mtip_block_suspend(struct driver_data *dd) { dev_info(&dd->pdev->dev, "Suspending %s ...\n", dd->disk->disk_name); @@ -3228,7 +3244,7 @@ int mtip_block_suspend(struct driver_data *dd) return 0; } -int mtip_block_resume(struct driver_data *dd) +static int mtip_block_resume(struct driver_data *dd) { dev_info(&dd->pdev->dev, "Resuming %s ...\n", dd->disk->disk_name); @@ -3479,28 +3495,6 @@ static void mtip_pci_shutdown(struct pci_dev *pdev) mtip_block_shutdown(dd); } -/* - * This function check_for_surprise_removal is called - * while card is removed from the system and it will - * read the vendor id from the configration space - * - * @pdev Pointer to the pci_dev structure. - * - * return value - * true if device removed, else false - */ -bool mtip_check_surprise_removal(struct pci_dev *pdev) -{ - u16 vendor_id = 0; - - /* Read the vendorID from the configuration space */ - pci_read_config_word(pdev, 0x00, &vendor_id); - if (vendor_id == 0xFFFF) - return true; /* device removed */ - - return false; /* device present */ -} - /* Table of device ids supported by this driver. */ static DEFINE_PCI_DEVICE_TABLE(mtip_pci_tbl) = { { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320_DEVICE_ID) }, diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index d6355c6f218f..17be4f444e7d 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -405,37 +405,4 @@ struct driver_data { atomic_t eh_active; /* Flag for error handling tracking */ }; -/* Function declarations */ -extern int mtip_block_initialize(struct driver_data *dd); -extern int mtip_block_remove(struct driver_data *dd); -extern int mtip_block_shutdown(struct driver_data *dd); -extern int mtip_block_suspend(struct driver_data *dd); -extern int mtip_block_resume(struct driver_data *dd); -extern int mtip_hw_init(struct driver_data *dd); -extern int mtip_hw_exit(struct driver_data *dd); -extern int mtip_hw_shutdown(struct driver_data *dd); -extern bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors); -extern void mtip_hw_release_scatterlist( - struct driver_data *dd, - int tag); -extern struct scatterlist *mtip_hw_get_scatterlist( - struct driver_data *dd, - int *tag); -extern void mtip_hw_submit_io(struct driver_data *dd, - sector_t start, - int nsect, - int nents, - int tag, - void *callback, - void *data, - int barrier, - int dir); -extern int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj); -extern int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj); -extern int mtip_hw_resume(struct driver_data *dd); -extern int mtip_hw_suspend(struct driver_data *dd); -void mtip_command_cleanup(struct driver_data *dd); -bool mtip_check_surprise_removal(struct pci_dev *pdev); -void mtip_restart_port(struct mtip_port *port); - #endif -- cgit From 3ff147d3a88e43135b1861d964496101657fa9a2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 Sep 2011 21:33:53 -0600 Subject: mtip32xx: mark a few more items static Missed two items: mtip_major, and mtip_pci_driver. 
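Giving module-local symbols internal linkage keeps them out of the kernel's global symbol namespace, so two drivers can each keep, for example, their own major-number variable without a link-time clash, and static analysis no longer asks for missing prototypes. A minimal standalone illustration of the idea (the names here are made up for the example, not taken from the driver):

#include <stdio.h>

/* Internal linkage: these identifiers are private to this file and
 * cannot collide with identically named symbols in other objects. */
static int example_major;

static int example_get_major(void)
{
	return example_major;
}

int main(void)
{
	example_major = 42;
	printf("major = %d\n", example_get_major());
	return 0;
}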
Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 40d840e56256..b426a939c379 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -80,7 +80,7 @@ static int instance; * Global variable used to hold the major block device number * allocated in mtip_init(). */ -int mtip_major; +static int mtip_major; static DEFINE_SPINLOCK(rssd_index_lock); static DEFINE_IDA(rssd_index_ida); @@ -3502,7 +3502,7 @@ static DEFINE_PCI_DEVICE_TABLE(mtip_pci_tbl) = { }; /* Structure that describes the PCI driver functions. */ -struct pci_driver mtip_pci_driver = { +static struct pci_driver mtip_pci_driver = { .name = MTIP_DRV_NAME, .id_table = mtip_pci_tbl, .probe = mtip_pci_probe, -- cgit From 0e838c624e04490985162a1b00d7aa1956c8834e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Sep 2011 07:35:40 -0600 Subject: mtip32xx: add module.h include to avoid conflict with moduleh tree Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index b426a939c379..f7a33211a81a 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include -- cgit From a71f483d7957c74368a76a3a88ae54d524fa3b49 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 5 Nov 2011 08:36:21 +0100 Subject: mtip32xx: update to new ->make_request() API Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index f7a33211a81a..880facbb0534 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2984,10 +2984,8 @@ static const struct block_device_operations mtip_block_ops = { * the driver data structure. * @bio Pointer to the BIO. * - * return value - * 0 */ -static int mtip_make_request(struct request_queue *queue, struct bio *bio) +static void mtip_make_request(struct request_queue *queue, struct bio *bio) { struct driver_data *dd = queue->queuedata; struct scatterlist *sg; @@ -2998,12 +2996,12 @@ static int mtip_make_request(struct request_queue *queue, struct bio *bio) if (unlikely(!bio_has_data(bio))) { blk_queue_flush(queue, 0); bio_endio(bio, 0); - return 0; + return; } if (unlikely(atomic_read(&dd->eh_active))) { bio_endio(bio, -EBUSY); - return 0; + return; } sg = mtip_hw_get_scatterlist(dd, &tag); @@ -3015,7 +3013,7 @@ static int mtip_make_request(struct request_queue *queue, struct bio *bio) "Maximum number of SGL entries exceeded"); bio_io_error(bio); mtip_hw_release_scatterlist(dd, tag); - return 0; + return; } /* Create the scatter list for this bio. 
*/ @@ -3036,11 +3034,8 @@ static int mtip_make_request(struct request_queue *queue, struct bio *bio) bio, bio->bi_rw & REQ_FLUSH, bio_data_dir(bio)); - } else { + } else bio_io_error(bio); - } - - return 0; } /* -- cgit From 97e36834f5a106459ab1b290e663a4eb6264639e Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 12 Oct 2011 12:12:36 -0400 Subject: xen/blk[front|back]: Squash blkif_request_rw and blkif_request_discard together In a union type structure to deal with the overlapping attributes in a easier manner. Suggested-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 13 ++++---- drivers/block/xen-blkback/common.h | 64 +++++++++++++++++++++---------------- drivers/block/xen-blkfront.c | 44 +++++++++++++------------ 3 files changed, 66 insertions(+), 55 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 15ec4db194d1..d7104abc8b72 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -362,7 +362,7 @@ static int xen_blkbk_map(struct blkif_request *req, { struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; int i; - int nseg = req->nr_segments; + int nseg = req->u.rw.nr_segments; int ret = 0; /* @@ -449,7 +449,7 @@ static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req) } else if (err) status = BLKIF_RSP_ERROR; - make_response(blkif, req->id, req->operation, status); + make_response(blkif, req->u.discard.id, req->operation, status); } static void xen_blk_drain_io(struct xen_blkif *blkif) @@ -644,7 +644,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, } /* Check that the number of segments is sane. */ - nseg = req->nr_segments; + nseg = req->u.rw.nr_segments; + if (unlikely(nseg == 0 && operation != WRITE_FLUSH && operation != REQ_DISCARD) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { @@ -654,12 +655,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, goto fail_response; } - preq.dev = req->handle; + preq.dev = req->u.rw.handle; preq.sector_number = req->u.rw.sector_number; preq.nr_sects = 0; pending_req->blkif = blkif; - pending_req->id = req->id; + pending_req->id = req->u.rw.id; pending_req->operation = req->operation; pending_req->status = BLKIF_RSP_OKAY; pending_req->nr_pages = nseg; @@ -784,7 +785,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, xen_blkbk_unmap(pending_req); fail_response: /* Haven't submitted any bio's yet. 
*/ - make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR); free_req(pending_req); msleep(1); /* back off a bit */ return -EIO; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index dfb1b3a43a5d..dbfe7b3b0737 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -60,58 +60,66 @@ struct blkif_common_response { char dummy; }; -/* i386 protocol version */ -#pragma pack(push, 4) - struct blkif_x86_32_request_rw { + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -}; +} __attribute__((__packed__)); struct blkif_x86_32_request_discard { + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t _pad1; /* was "handle" for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ - uint64_t nr_sectors; -}; + uint64_t nr_sectors; +} __attribute__((__packed__)); struct blkif_x86_32_request { uint8_t operation; /* BLKIF_OP_??? */ - uint8_t nr_segments; /* number of segments */ - blkif_vdev_t handle; /* only for read/write requests */ - uint64_t id; /* private guest value, echoed in resp */ union { struct blkif_x86_32_request_rw rw; struct blkif_x86_32_request_discard discard; } u; -}; +} __attribute__((__packed__)); + +/* i386 protocol version */ +#pragma pack(push, 4) struct blkif_x86_32_response { uint64_t id; /* copied from request */ uint8_t operation; /* copied from request */ int16_t status; /* BLKIF_RSP_??? */ }; #pragma pack(pop) - /* x86_64 protocol version */ struct blkif_x86_64_request_rw { + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint32_t _pad1; /* offsetof(blkif_reqest..,u.rw.id)==8 */ + uint64_t id; blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -}; +} __attribute__((__packed__)); struct blkif_x86_64_request_discard { + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t _pad1; /* was "handle" for read/write requests */ + uint32_t _pad2; /* offsetof(blkif_..,u.discard.id)==8 */ + uint64_t id; blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ - uint64_t nr_sectors; -}; + uint64_t nr_sectors; +} __attribute__((__packed__)); struct blkif_x86_64_request { uint8_t operation; /* BLKIF_OP_??? 
*/ - uint8_t nr_segments; /* number of segments */ - blkif_vdev_t handle; /* only for read/write requests */ - uint64_t __attribute__((__aligned__(8))) id; union { struct blkif_x86_64_request_rw rw; struct blkif_x86_64_request_discard discard; } u; -}; +} __attribute__((__packed__)); + struct blkif_x86_64_response { uint64_t __attribute__((__aligned__(8))) id; uint8_t operation; /* copied from request */ @@ -237,18 +245,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; dst->operation = src->operation; - dst->nr_segments = src->nr_segments; - dst->handle = src->handle; - dst->id = src->id; switch (src->operation) { case BLKIF_OP_READ: case BLKIF_OP_WRITE: case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_FLUSH_DISKCACHE: + dst->u.rw.nr_segments = src->u.rw.nr_segments; + dst->u.rw.handle = src->u.rw.handle; + dst->u.rw.id = src->u.rw.id; dst->u.rw.sector_number = src->u.rw.sector_number; barrier(); - if (n > dst->nr_segments) - n = dst->nr_segments; + if (n > dst->u.rw.nr_segments) + n = dst->u.rw.nr_segments; for (i = 0; i < n; i++) dst->u.rw.seg[i] = src->u.rw.seg[i]; break; @@ -266,18 +274,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; dst->operation = src->operation; - dst->nr_segments = src->nr_segments; - dst->handle = src->handle; - dst->id = src->id; switch (src->operation) { case BLKIF_OP_READ: case BLKIF_OP_WRITE: case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_FLUSH_DISKCACHE: + dst->u.rw.nr_segments = src->u.rw.nr_segments; + dst->u.rw.handle = src->u.rw.handle; + dst->u.rw.id = src->u.rw.id; dst->u.rw.sector_number = src->u.rw.sector_number; barrier(); - if (n > dst->nr_segments) - n = dst->nr_segments; + if (n > dst->u.rw.nr_segments) + n = dst->u.rw.nr_segments; for (i = 0; i < n; i++) dst->u.rw.seg[i] = src->u.rw.seg[i]; break; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 7b2ec5908413..2c2c4be7d9d6 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -135,15 +135,15 @@ static int get_id_from_freelist(struct blkfront_info *info) { unsigned long free = info->shadow_free; BUG_ON(free >= BLK_RING_SIZE); - info->shadow_free = info->shadow[free].req.id; - info->shadow[free].req.id = 0x0fffffee; /* debug */ + info->shadow_free = info->shadow[free].req.u.rw.id; + info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; } static void add_id_to_freelist(struct blkfront_info *info, unsigned long id) { - info->shadow[id].req.id = info->shadow_free; + info->shadow[id].req.u.rw.id = info->shadow_free; info->shadow[id].request = NULL; info->shadow_free = id; } @@ -287,9 +287,9 @@ static int blkif_queue_request(struct request *req) id = get_id_from_freelist(info); info->shadow[id].request = req; - ring_req->id = id; + ring_req->u.rw.id = id; ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); - ring_req->handle = info->handle; + ring_req->u.rw.handle = info->handle; ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; @@ -308,13 +308,15 @@ static int blkif_queue_request(struct request *req) if (unlikely(req->cmd_flags & REQ_DISCARD)) { /* id, sector_number and handle are set above. 
*/ ring_req->operation = BLKIF_OP_DISCARD; - ring_req->nr_segments = 0; + ring_req->u.discard.nr_segments = 0; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); } else { - ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); - BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); + ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, + info->sg); + BUG_ON(ring_req->u.rw.nr_segments > + BLKIF_MAX_SEGMENTS_PER_REQUEST); - for_each_sg(info->sg, sg, ring_req->nr_segments, i) { + for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); fsect = sg->offset >> 9; lsect = fsect + (sg->length >> 9) - 1; @@ -705,7 +707,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) static void blkif_completion(struct blk_shadow *s) { int i; - for (i = 0; i < s->req.nr_segments; i++) + for (i = 0; i < s->req.u.rw.nr_segments; i++) gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL); } @@ -763,7 +765,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && - info->shadow[id].req.nr_segments == 0)) { + info->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n", info->flush_op == BLKIF_OP_WRITE_BARRIER ? "barrier" : "flush disk cache", @@ -984,8 +986,8 @@ static int blkfront_probe(struct xenbus_device *dev, INIT_WORK(&info->work, blkif_restart_queue); for (i = 0; i < BLK_RING_SIZE; i++) - info->shadow[i].req.id = i+1; - info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; + info->shadow[i].req.u.rw.id = i+1; + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); @@ -1019,9 +1021,9 @@ static int blkif_recover(struct blkfront_info *info) /* Stage 2: Set up free list. */ memset(&info->shadow, 0, sizeof(info->shadow)); for (i = 0; i < BLK_RING_SIZE; i++) - info->shadow[i].req.id = i+1; + info->shadow[i].req.u.rw.id = i+1; info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; /* Stage 3: Find pending requests and requeue them. */ for (i = 0; i < BLK_RING_SIZE; i++) { @@ -1034,17 +1036,17 @@ static int blkif_recover(struct blkfront_info *info) *req = copy[i].req; /* We get a new request id, and must reset the shadow state. */ - req->id = get_id_from_freelist(info); - memcpy(&info->shadow[req->id], ©[i], sizeof(copy[i])); + req->u.rw.id = get_id_from_freelist(info); + memcpy(&info->shadow[req->u.rw.id], ©[i], sizeof(copy[i])); /* Rewrite any grant references invalidated by susp/resume. */ - for (j = 0; j < req->nr_segments; j++) + for (j = 0; j < req->u.rw.nr_segments; j++) gnttab_grant_foreign_access_ref( req->u.rw.seg[j].gref, info->xbdev->otherend_id, - pfn_to_mfn(info->shadow[req->id].frame[j]), - rq_data_dir(info->shadow[req->id].request)); - info->shadow[req->id].req = *req; + pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), + rq_data_dir(info->shadow[req->u.rw.id].request)); + info->shadow[req->u.rw.id].req = *req; info->ring.req_prod_pvt++; } -- cgit From 5ea42986694a96542644f9cae8b122d3a00c508f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 12 Oct 2011 16:23:30 -0400 Subject: xen/blk[front|back]: Enhance discard support with secure erasing support. 
Part of the blkdev_issue_discard(xx) operation is that it can also issue a secure discard operation that will permanantly remove the sectors in question. We advertise that we can support that via the 'discard-secure' attribute and on the request, if the 'secure' bit is set, we will attempt to pass in REQ_DISCARD | REQ_SECURE. CC: Li Dongyang [v1: Used 'flag' instead of 'secure:1' bit] [v2: Use 'reserved' uint8_t instead of adding a new value] [v3: Check for nseg when mapping instead of operation] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 18 ++++++++++------ drivers/block/xen-blkback/common.h | 7 +++++-- drivers/block/xen-blkback/xenbus.c | 12 +++++++++++ drivers/block/xen-blkfront.c | 41 ++++++++++++++++++++++++++++--------- 4 files changed, 60 insertions(+), 18 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index d7104abc8b72..9d2261b02f24 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -422,13 +422,16 @@ static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req) int status = BLKIF_RSP_OKAY; struct block_device *bdev = blkif->vbd.bdev; - if (blkif->blk_backend_type == BLKIF_BACKEND_PHY) + if (blkif->blk_backend_type == BLKIF_BACKEND_PHY) { + unsigned long secure = (blkif->vbd.discard_secure && + (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? + BLKDEV_DISCARD_SECURE : 0; /* just forward the discard request */ err = blkdev_issue_discard(bdev, req->u.discard.sector_number, req->u.discard.nr_sectors, - GFP_KERNEL, 0); - else if (blkif->blk_backend_type == BLKIF_BACKEND_FILE) { + GFP_KERNEL, secure); + } else if (blkif->blk_backend_type == BLKIF_BACKEND_FILE) { /* punch a hole in the backing file */ struct loop_device *lo = bdev->bd_disk->private_data; struct file *file = lo->lo_backing_file; @@ -643,8 +646,11 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, break; } - /* Check that the number of segments is sane. */ - nseg = req->u.rw.nr_segments; + if (unlikely(operation == REQ_DISCARD)) + nseg = 0; + else + /* Check that the number of segments is sane. */ + nseg = req->u.rw.nr_segments; if (unlikely(nseg == 0 && operation != WRITE_FLUSH && operation != REQ_DISCARD) || @@ -708,7 +714,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * the hypercall to unmap the grants - that is all done in * xen_blkbk_unmap. 
*/ - if (operation != REQ_DISCARD && xen_blkbk_map(req, pending_req, seg)) + if (nseg && xen_blkbk_map(req, pending_req, seg)) goto fail_flush; /* diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index dbfe7b3b0737..d0ee7edc9be8 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -69,7 +69,7 @@ struct blkif_x86_32_request_rw { } __attribute__((__packed__)); struct blkif_x86_32_request_discard { - uint8_t nr_segments; /* number of segments */ + uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */ blkif_vdev_t _pad1; /* was "handle" for read/write requests */ uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ @@ -104,7 +104,7 @@ struct blkif_x86_64_request_rw { } __attribute__((__packed__)); struct blkif_x86_64_request_discard { - uint8_t nr_segments; /* number of segments */ + uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */ blkif_vdev_t _pad1; /* was "handle" for read/write requests */ uint32_t _pad2; /* offsetof(blkif_..,u.discard.id)==8 */ uint64_t id; @@ -164,6 +164,7 @@ struct xen_vbd { /* Cached size parameter. */ sector_t size; bool flush_support; + bool discard_secure; }; struct backend_info; @@ -261,6 +262,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, dst->u.rw.seg[i] = src->u.rw.seg[i]; break; case BLKIF_OP_DISCARD: + dst->u.discard.flag = src->u.discard.flag; dst->u.discard.sector_number = src->u.discard.sector_number; dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; @@ -290,6 +292,7 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, dst->u.rw.seg[i] = src->u.rw.seg[i]; break; case BLKIF_OP_DISCARD: + dst->u.discard.flag = src->u.discard.flag; dst->u.discard.sector_number = src->u.discard.sector_number; dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f759ad4584c3..187fd2c1a15d 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -338,6 +338,9 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, if (q && q->flush_flags) vbd->flush_support = true; + if (q && blk_queue_secdiscard(q)) + vbd->discard_secure = true; + DPRINTK("Successful creation of handle=%04x (dom=%u)\n", handle, blkif->domid); return 0; @@ -420,6 +423,15 @@ int xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be) state = 1; blkif->blk_backend_type = BLKIF_BACKEND_PHY; } + /* Optional. 
*/ + err = xenbus_printf(xbt, dev->nodename, + "discard-secure", "%d", + blkif->vbd.discard_secure); + if (err) { + xenbus_dev_fatal(dev, err, + "writting discard-secure"); + goto kfree; + } } } else { err = PTR_ERR(type); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2c2c4be7d9d6..351ddeffd430 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -98,7 +98,8 @@ struct blkfront_info unsigned long shadow_free; unsigned int feature_flush; unsigned int flush_op; - unsigned int feature_discard; + unsigned int feature_discard:1; + unsigned int feature_secdiscard:1; unsigned int discard_granularity; unsigned int discard_alignment; int is_ready; @@ -305,11 +306,14 @@ static int blkif_queue_request(struct request *req) ring_req->operation = info->flush_op; } - if (unlikely(req->cmd_flags & REQ_DISCARD)) { + if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { /* id, sector_number and handle are set above. */ ring_req->operation = BLKIF_OP_DISCARD; - ring_req->u.discard.nr_segments = 0; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); + if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) + ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; + else + ring_req->u.discard.flag = 0; } else { ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, info->sg); @@ -426,6 +430,8 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) blk_queue_max_discard_sectors(rq, get_capacity(gd)); rq->limits.discard_granularity = info->discard_granularity; rq->limits.discard_alignment = info->discard_alignment; + if (info->feature_secdiscard) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq); } /* Hard sector size and max sectors impersonate the equiv. hardware. */ @@ -707,6 +713,8 @@ static void blkif_free(struct blkfront_info *info, int suspend) static void blkif_completion(struct blk_shadow *s) { int i; + /* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place + * flag. */ for (i = 0; i < s->req.u.rw.nr_segments; i++) gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL); } @@ -738,7 +746,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) id = bret->id; req = info->shadow[id].request; - blkif_completion(&info->shadow[id]); + if (bret->operation != BLKIF_OP_DISCARD) + blkif_completion(&info->shadow[id]); add_id_to_freelist(info, id); @@ -751,7 +760,9 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) info->gd->disk_name); error = -EOPNOTSUPP; info->feature_discard = 0; + info->feature_secdiscard = 0; queue_flag_clear(QUEUE_FLAG_DISCARD, rq); + queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); } __blk_end_request_all(req, error); break; @@ -1039,13 +1050,15 @@ static int blkif_recover(struct blkfront_info *info) req->u.rw.id = get_id_from_freelist(info); memcpy(&info->shadow[req->u.rw.id], ©[i], sizeof(copy[i])); + if (req->operation != BLKIF_OP_DISCARD) { /* Rewrite any grant references invalidated by susp/resume. 
*/ - for (j = 0; j < req->u.rw.nr_segments; j++) - gnttab_grant_foreign_access_ref( - req->u.rw.seg[j].gref, - info->xbdev->otherend_id, - pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), - rq_data_dir(info->shadow[req->u.rw.id].request)); + for (j = 0; j < req->u.rw.nr_segments; j++) + gnttab_grant_foreign_access_ref( + req->u.rw.seg[j].gref, + info->xbdev->otherend_id, + pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), + rq_data_dir(info->shadow[req->u.rw.id].request)); + } info->shadow[req->u.rw.id].req = *req; info->ring.req_prod_pvt++; @@ -1137,11 +1150,13 @@ static void blkfront_setup_discard(struct blkfront_info *info) char *type; unsigned int discard_granularity; unsigned int discard_alignment; + unsigned int discard_secure; type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL); if (IS_ERR(type)) return; + info->feature_secdiscard = 0; if (strncmp(type, "phy", 3) == 0) { err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "discard-granularity", "%u", &discard_granularity, @@ -1152,6 +1167,12 @@ static void blkfront_setup_discard(struct blkfront_info *info) info->discard_granularity = discard_granularity; info->discard_alignment = discard_alignment; } + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "discard-secure", "%d", &discard_secure, + NULL); + if (!err) + info->feature_secdiscard = discard_secure; + } else if (strncmp(type, "file", 4) == 0) info->feature_discard = 1; -- cgit From 421463526fd1d8b5cb575baca12667c1005a110b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 12 Oct 2011 17:26:47 -0400 Subject: xen/blkback: Move processing of BLKIF_OP_DISCARD from dispatch_rw_block_io .. and move it to its own function that will deal with the discard operation. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 54 +++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 30 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 9d2261b02f24..b058de7825f5 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -416,12 +416,16 @@ static int xen_blkbk_map(struct blkif_request *req, return ret; } -static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req) +static int dispatch_discard_io(struct xen_blkif *blkif, + struct blkif_request *req) { int err = 0; int status = BLKIF_RSP_OKAY; struct block_device *bdev = blkif->vbd.bdev; + blkif->st_ds_req++; + + xen_blkif_get(blkif); if (blkif->blk_backend_type == BLKIF_BACKEND_PHY) { unsigned long secure = (blkif->vbd.discard_secure && (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? @@ -453,6 +457,8 @@ static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req) status = BLKIF_RSP_ERROR; make_response(blkif, req->u.discard.id, req->operation, status); + xen_blkif_put(blkif); + return err; } static void xen_blk_drain_io(struct xen_blkif *blkif) @@ -576,8 +582,11 @@ __do_block_io_op(struct xen_blkif *blkif) /* Apply all sanity checks to /private copy/ of request. */ barrier(); - - if (dispatch_rw_block_io(blkif, &req, pending_req)) + if (unlikely(req.operation == BLKIF_OP_DISCARD)) { + free_req(pending_req); + if (dispatch_discard_io(blkif, &req)) + break; + } else if (dispatch_rw_block_io(blkif, &req, pending_req)) break; /* Yield point for this unbounded loop. 
*/ @@ -636,24 +645,16 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, blkif->st_f_req++; operation = WRITE_FLUSH; break; - case BLKIF_OP_DISCARD: - blkif->st_ds_req++; - operation = REQ_DISCARD; - break; default: operation = 0; /* make gcc happy */ goto fail_response; break; } - if (unlikely(operation == REQ_DISCARD)) - nseg = 0; - else - /* Check that the number of segments is sane. */ - nseg = req->u.rw.nr_segments; + /* Check that the number of segments is sane. */ + nseg = req->u.rw.nr_segments; - if (unlikely(nseg == 0 && operation != WRITE_FLUSH && - operation != REQ_DISCARD) || + if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", nseg); @@ -714,7 +715,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * the hypercall to unmap the grants - that is all done in * xen_blkbk_unmap. */ - if (nseg && xen_blkbk_map(req, pending_req, seg)) + if (xen_blkbk_map(req, pending_req, seg)) goto fail_flush; /* @@ -746,23 +747,16 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, /* This will be hit if the operation was a flush or discard. */ if (!bio) { - BUG_ON(operation != WRITE_FLUSH && operation != REQ_DISCARD); + BUG_ON(operation != WRITE_FLUSH); - if (operation == WRITE_FLUSH) { - bio = bio_alloc(GFP_KERNEL, 0); - if (unlikely(bio == NULL)) - goto fail_put_bio; + bio = bio_alloc(GFP_KERNEL, 0); + if (unlikely(bio == NULL)) + goto fail_put_bio; - biolist[nbio++] = bio; - bio->bi_bdev = preq.bdev; - bio->bi_private = pending_req; - bio->bi_end_io = end_block_io_op; - } else if (operation == REQ_DISCARD) { - xen_blk_discard(blkif, req); - xen_blkif_put(blkif); - free_req(pending_req); - return 0; - } + biolist[nbio++] = bio; + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; } /* -- cgit From ae18be11b5ccc3be9e268592616488c5f9d987f5 Mon Sep 17 00:00:00 2001 From: Li Dongyang Date: Thu, 10 Nov 2011 15:52:06 +0800 Subject: xen-blkback: convert hole punching to discard request on loop devices As of dfaa2ef68e80c378e610e3c8c536f1c239e8d3ef, loop devices support discard request now. We could just issue a discard request, and the loop driver will punch the hole for us, so we don't need to touch the internals of loop device and punch the hole ourselves, Thanks. V0->V1: rebased on devel/for-jens-3.3 Signed-off-by: Li Dongyang Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index b058de7825f5..0088bf60f368 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -39,9 +39,6 @@ #include #include #include -#include -#include -#include #include #include @@ -426,27 +423,15 @@ static int dispatch_discard_io(struct xen_blkif *blkif, blkif->st_ds_req++; xen_blkif_get(blkif); - if (blkif->blk_backend_type == BLKIF_BACKEND_PHY) { + if (blkif->blk_backend_type == BLKIF_BACKEND_PHY || + blkif->blk_backend_type == BLKIF_BACKEND_FILE) { unsigned long secure = (blkif->vbd.discard_secure && (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? 
BLKDEV_DISCARD_SECURE : 0; - /* just forward the discard request */ err = blkdev_issue_discard(bdev, req->u.discard.sector_number, req->u.discard.nr_sectors, GFP_KERNEL, secure); - } else if (blkif->blk_backend_type == BLKIF_BACKEND_FILE) { - /* punch a hole in the backing file */ - struct loop_device *lo = bdev->bd_disk->private_data; - struct file *file = lo->lo_backing_file; - - if (file->f_op->fallocate) - err = file->f_op->fallocate(file, - FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, - req->u.discard.sector_number << 9, - req->u.discard.nr_sectors << 9); - else - err = -EOPNOTSUPP; } else err = -EOPNOTSUPP; -- cgit From 60ec0eecfa8968d0f1188e3730979196ac28b9de Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Wed, 23 Nov 2011 08:29:24 +0100 Subject: mtip32xx: updates based on feedback * queue ncq commands when a non-ncq is in progress or error handling is active * merge variables 'internal_cmd_in_progress' and 'eh_active' into new variable 'flags' * get rid of read/write semaphore 'internal_sem' * new service thread to issue queued commands * use macros from ata.h for command codes * return ENOTTY for BLKFLSBUF ioctl * style changes Signed-off-by: Asai Thambi S P Signed-off-by: Sam Bradshaw Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 342 ++++++++++++++++++++++---------------- drivers/block/mtip32xx/mtip32xx.h | 42 +++-- 2 files changed, 230 insertions(+), 154 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 880facbb0534..b5d843a02bfa 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -34,6 +34,7 @@ #include #include #include +#include #include <../drivers/ata/ahci.h> #include "mtip32xx.h" @@ -99,15 +100,6 @@ struct mtip_compat_ide_task_request_s { }; #endif -static int mtip_exec_internal_command(struct mtip_port *port, - void *fis, - int fisLen, - dma_addr_t buffer, - int bufLen, - u32 opts, - gfp_t atomic, - unsigned long timeout); - /* * This function check_for_surprise_removal is called * while card is removed from the system and it will @@ -414,9 +406,9 @@ static void mtip_init_port(struct mtip_port *port) port->mmio + PORT_FIS_ADDR_HI); } - writel(port->command_list_dma & 0xffffffff, + writel(port->command_list_dma & 0xFFFFFFFF, port->mmio + PORT_LST_ADDR); - writel(port->rxfis_dma & 0xffffffff, port->mmio + PORT_FIS_ADDR); + writel(port->rxfis_dma & 0xFFFFFFFF, port->mmio + PORT_FIS_ADDR); /* Clear SError */ writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR); @@ -541,7 +533,7 @@ static void mtip_timeout_function(unsigned long int data) if (atomic_read(&port->commands[tag].active) && (time_after(jiffies, port->commands[tag].comp_time))) { group = tag >> 5; - bit = tag & 0x1f; + bit = tag & 0x1F; command = &port->commands[tag]; fis = (struct host_to_dev_fis *) command->command; @@ -551,7 +543,7 @@ static void mtip_timeout_function(unsigned long int data) cmdto_cnt++; if (cmdto_cnt == 1) - atomic_inc(&port->dd->eh_active); + set_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags); /* * Clear the completed bit. 
This should prevent @@ -589,7 +581,8 @@ static void mtip_timeout_function(unsigned long int data) "%d commands timed out: restarting port", cmdto_cnt); mtip_restart_port(port); - atomic_dec(&port->dd->eh_active); + clear_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); } /* Restart the timer */ @@ -728,7 +721,7 @@ static void mtip_handle_tfe(struct driver_data *dd) del_timer(&port->cmd_timer); /* Set eh_active */ - atomic_inc(&dd->eh_active); + set_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags); /* Loop through all the groups */ for (group = 0; group < dd->slot_groups; group++) { @@ -835,8 +828,9 @@ static void mtip_handle_tfe(struct driver_data *dd) } print_tags(dd, "TFE tags reissued:", tagaccum); - /* Decrement eh_active */ - atomic_dec(&dd->eh_active); + /* clear eh_active */ + clear_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); mod_timer(&port->cmd_timer, jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); @@ -871,7 +865,6 @@ static inline void mtip_process_sdbf(struct driver_data *dd) continue; command = &port->commands[tag]; - /* make internal callback */ if (likely(command->comp_func)) { command->comp_func( @@ -904,9 +897,8 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) struct mtip_port *port = dd->port; struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL]; - if (port->internal_cmd_in_progress && - cmd != NULL && - !(readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + if (test_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags) && + (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & (1 << MTIP_TAG_INTERNAL))) { if (cmd->comp_func) { cmd->comp_func(port, @@ -1038,11 +1030,15 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) to = jiffies + msecs_to_jiffies(timeout); do { + if (test_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags)) { + msleep(20); + continue; /* svc thd is actively issuing commands */ + } /* * Ignore s_active bit 0 of array element 0. * This bit will always be set */ - active = readl(port->s_active[0]) & 0xfffffffe; + active = readl(port->s_active[0]) & 0xFFFFFFFE; for (n = 1; n < port->dd->slot_groups; n++) active |= readl(port->s_active[n]); @@ -1060,9 +1056,9 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) * * @port Pointer to the port data structure. * @fis Pointer to the FIS that describes the command. - * @fisLen Length in WORDS of the FIS. + * @fis_len Length in WORDS of the FIS. * @buffer DMA accessible for command data. - * @bufLen Length, in bytes, of the data buffer. + * @buf_len Length, in bytes, of the data buffer. * @opts Command header options, excluding the FIS length * and the number of PRD entries. * @timeout Time in ms to wait for the command to complete. 
@@ -1075,9 +1071,9 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) */ static int mtip_exec_internal_command(struct mtip_port *port, void *fis, - int fisLen, + int fis_len, dma_addr_t buffer, - int bufLen, + int buf_len, u32 opts, gfp_t atomic, unsigned long timeout) @@ -1100,7 +1096,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, "Internal command already active\n"); return -EBUSY; } - port->internal_cmd_in_progress = 1; + set_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags); if (atomic == GFP_KERNEL) { /* wait for io to complete if non atomic */ @@ -1108,7 +1104,8 @@ static int mtip_exec_internal_command(struct mtip_port *port, dev_warn(&port->dd->pdev->dev, "Failed to quiesce IO\n"); release_slot(port, MTIP_TAG_INTERNAL); - port->internal_cmd_in_progress = 0; + clear_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); return -EBUSY; } @@ -1123,19 +1120,23 @@ static int mtip_exec_internal_command(struct mtip_port *port, } /* Copy the command to the command table */ - memcpy(int_cmd->command, fis, fisLen*4); + memcpy(int_cmd->command, fis, fis_len*4); /* Populate the SG list */ int_cmd->command_header->opts = - cpu_to_le32(opts | fisLen); - if (bufLen) { + __force_bit2int cpu_to_le32(opts | fis_len); + if (buf_len) { command_sg = int_cmd->command + AHCI_CMD_TBL_HDR_SZ; - command_sg->info = cpu_to_le32((bufLen-1) & 0x3fffff); - command_sg->dba = cpu_to_le32(buffer & 0xffffffff); - command_sg->dba_upper = cpu_to_le32((buffer >> 16) >> 16); + command_sg->info = + __force_bit2int cpu_to_le32((buf_len-1) & 0x3FFFFF); + command_sg->dba = + __force_bit2int cpu_to_le32(buffer & 0xFFFFFFFF); + command_sg->dba_upper = + __force_bit2int cpu_to_le32((buffer >> 16) >> 16); - int_cmd->command_header->opts |= cpu_to_le32((1 << 16)); + int_cmd->command_header->opts |= + __force_bit2int cpu_to_le32((1 << 16)); } /* Populate the command header */ @@ -1151,8 +1152,9 @@ static int mtip_exec_internal_command(struct mtip_port *port, &wait, msecs_to_jiffies(timeout)) == 0) { dev_err(&port->dd->pdev->dev, - "Internal command did not complete [%d]\n", - atomic); + "Internal command did not complete [%d] " + "within timeout of %lu ms\n", + atomic, timeout); rv = -EAGAIN; } @@ -1184,7 +1186,8 @@ static int mtip_exec_internal_command(struct mtip_port *port, /* Clear the allocated and active bits for the internal command. */ atomic_set(&int_cmd->active, 0); release_slot(port, MTIP_TAG_INTERNAL); - port->internal_cmd_in_progress = 0; + clear_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); return rv; } @@ -1233,8 +1236,6 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) int rv = 0; struct host_to_dev_fis fis; - down_write(&port->dd->internal_sem); - /* Build the FIS. */ memset(&fis, 0, sizeof(struct host_to_dev_fis)); fis.type = 0x27; @@ -1292,7 +1293,6 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) } out: - up_write(&port->dd->internal_sem); return rv; } @@ -1310,8 +1310,6 @@ static int mtip_standby_immediate(struct mtip_port *port) int rv; struct host_to_dev_fis fis; - down_write(&port->dd->internal_sem); - /* Build the FIS. 
*/ memset(&fis, 0, sizeof(struct host_to_dev_fis)); fis.type = 0x27; @@ -1328,8 +1326,6 @@ static int mtip_standby_immediate(struct mtip_port *port) GFP_KERNEL, 15000); - up_write(&port->dd->internal_sem); - return rv; } @@ -1430,7 +1426,7 @@ static void mtip_dump_identify(struct mtip_port *port) ((u64)sectors) * ATA_SECT_SIZE >> 20); pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid); - switch (revid & 0xff) { + switch (revid & 0xFF) { case 0x1: strlcpy(cbuf, "A0", 3); break; @@ -1470,15 +1466,12 @@ static inline void fill_command_sg(struct driver_data *dd, if (dma_len > 0x400000) dev_err(&dd->pdev->dev, "DMA segment length truncated\n"); - command_sg->info = cpu_to_le32((dma_len-1) & 0x3fffff); -#if (BITS_PER_LONG == 64) - *((unsigned long *) &command_sg->dba) = - cpu_to_le64(sg_dma_address(sg)); -#else - command_sg->dba = cpu_to_le32(sg_dma_address(sg)); - command_sg->dba_upper = - cpu_to_le32((sg_dma_address(sg) >> 16) >> 16); -#endif + command_sg->info = __force_bit2int + cpu_to_le32((dma_len-1) & 0x3FFFFF); + command_sg->dba = __force_bit2int + cpu_to_le32(sg_dma_address(sg)); + command_sg->dba_upper = __force_bit2int + cpu_to_le32((sg_dma_address(sg) >> 16) >> 16); command_sg++; sg++; } @@ -1495,9 +1488,6 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) struct host_to_dev_fis fis; struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); - /* Lock the internal command semaphore. */ - down_write(&port->dd->internal_sem); - /* Build the FIS. */ memset(&fis, 0, sizeof(struct host_to_dev_fis)); fis.type = 0x27; @@ -1532,7 +1522,6 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) 0, GFP_KERNEL, MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) { - up_write(&port->dd->internal_sem); return -1; } @@ -1549,7 +1538,6 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) command[4], command[5]); - up_write(&port->dd->internal_sem); return 0; } @@ -1572,9 +1560,6 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, struct host_to_dev_fis fis; struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); - /* Lock the internal command semaphore. */ - down_write(&port->dd->internal_sem); - /* Build the FIS. 
*/ memset(&fis, 0, sizeof(struct host_to_dev_fis)); fis.type = 0x27; @@ -1584,8 +1569,8 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, fis.sect_count = command[3]; if (fis.command == ATA_CMD_SMART) { fis.sector = command[1]; - fis.cyl_low = 0x4f; - fis.cyl_hi = 0xc2; + fis.cyl_low = 0x4F; + fis.cyl_hi = 0xC2; } dbg_printk(MTIP_DRV_NAME @@ -1609,7 +1594,6 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, GFP_KERNEL, MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) { - up_write(&port->dd->internal_sem); return -1; } @@ -1630,12 +1614,10 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, if (copy_to_user(user_buffer, port->sector_buffer, ATA_SECT_SIZE * command[3])) { - up_write(&port->dd->internal_sem); return -EFAULT; } } - up_write(&port->dd->internal_sem); return 0; } @@ -1658,26 +1640,28 @@ static unsigned int implicit_sector(unsigned char command, /* list of commands that have an implicit sector count of 1 */ switch (command) { - case 0xF1: - case 0xF2: - case 0xF3: - case 0xF4: - case 0xF5: - case 0xF6: - case 0xE4: - case 0xE8: + case ATA_CMD_SEC_SET_PASS: + case ATA_CMD_SEC_UNLOCK: + case ATA_CMD_SEC_ERASE_PREP: + case ATA_CMD_SEC_ERASE_UNIT: + case ATA_CMD_SEC_FREEZE_LOCK: + case ATA_CMD_SEC_DISABLE_PASS: + case ATA_CMD_PMP_READ: + case ATA_CMD_PMP_WRITE: rv = 1; break; - case 0xF9: - if (features == 0x03) + case ATA_CMD_SET_MAX: + if (features == ATA_SET_MAX_UNLOCK) rv = 1; break; - case 0xB0: - if ((features == 0xD0) || (features == 0xD1)) + case ATA_CMD_SMART: + if ((features == ATA_SMART_READ_VALUES) || + (features == ATA_SMART_READ_THRESHOLDS)) rv = 1; break; - case 0xB1: - if ((features == 0xC2) || (features == 0xC3)) + case ATA_CMD_CONF_OVERLAY: + if ((features == ATA_DCO_IDENTIFY) || + (features == ATA_DCO_SET)) rv = 1; break; } @@ -1777,9 +1761,6 @@ static int exec_drive_taskfile(struct driver_data *dd, goto abort; } - /* Lock the internal command semaphore. */ - down_write(&dd->internal_sem); - /* Build the FIS. 
*/ memset(&fis, 0, sizeof(struct host_to_dev_fis)); @@ -1818,7 +1799,6 @@ static int exec_drive_taskfile(struct driver_data *dd, dev_warn(&dd->pdev->dev, "data movement but " "sect_count is 0\n"); - up_write(&dd->internal_sem); err = -EINVAL; goto abort; } @@ -1838,19 +1818,25 @@ static int exec_drive_taskfile(struct driver_data *dd, fis.device); switch (fis.command) { - case 0x92: /* Change timeout for Download Microcode to 60 seconds.*/ + case ATA_CMD_DOWNLOAD_MICRO: + /* Change timeout for Download Microcode to 60 seconds.*/ timeout = 60000; break; - case 0xf4: /* Change timeout for Security Erase Unit to 4 minutes.*/ + case ATA_CMD_SEC_ERASE_UNIT: + /* Change timeout for Security Erase Unit to 4 minutes.*/ timeout = 240000; break; - case 0xe0: /* Change timeout for standby immediate to 10 seconds.*/ + case ATA_CMD_STANDBYNOW1: + /* Change timeout for standby immediate to 10 seconds.*/ timeout = 10000; break; - case 0xf7: /* Change timeout for vendor unique command to 10 secs */ + case 0xF7: + case 0xFA: + /* Change timeout for vendor unique command to 10 secs */ timeout = 10000; break; - case 0xfa: /* Change timeout for vendor unique command to 10 secs */ + case ATA_CMD_SMART: + /* Change timeout for vendor unique command to 10 secs */ timeout = 10000; break; default: @@ -1873,7 +1859,6 @@ static int exec_drive_taskfile(struct driver_data *dd, 0, GFP_KERNEL, timeout) < 0) { - up_write(&dd->internal_sem); err = -EIO; goto abort; } @@ -1916,7 +1901,7 @@ static int exec_drive_taskfile(struct driver_data *dd, } /* Com rest after secure erase or lowlevel format */ - if (((fis.command == 0xF4) || + if (((fis.command == ATA_CMD_SEC_ERASE_UNIT) || ((fis.command == 0xFC) && (fis.features == 0x27 || fis.features == 0x72 || fis.features == 0x62 || fis.features == 0x26))) && @@ -1937,8 +1922,6 @@ static int exec_drive_taskfile(struct driver_data *dd, req_task->io_ports[5], req_task->io_ports[6]); - up_write(&dd->internal_sem); - if (taskout) { if (copy_to_user(buf + outtotal, outbuf, taskout)) { err = -EFAULT; @@ -2052,7 +2035,8 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, ret = exec_drive_taskfile(dd, (void __user *) arg, &req_task, outtotal); - if (copy_to_user((void __user *) arg, &req_task, sizeof(req_task))) + if (copy_to_user((void __user *) arg, &req_task, + sizeof(req_task))) return -EFAULT; return ret; @@ -2117,13 +2101,13 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, fis->opts = 1 << 7; fis->command = (dir == READ ? 
ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE); - *((unsigned int *) &fis->lba_low) = (start & 0xffffff); - *((unsigned int *) &fis->lba_low_ex) = ((start >> 24) & 0xffffff); + *((unsigned int *) &fis->lba_low) = (start & 0xFFFFFF); + *((unsigned int *) &fis->lba_low_ex) = ((start >> 24) & 0xFFFFFF); fis->device = 1 << 6; if (barrier) fis->device |= FUA_BIT; - fis->features = nsect & 0xff; - fis->features_ex = (nsect >> 8) & 0xff; + fis->features = nsect & 0xFF; + fis->features_ex = (nsect >> 8) & 0xFF; fis->sect_count = ((tag << 3) | (tag >> 5)); fis->sect_cnt_ex = 0; fis->control = 0; @@ -2132,8 +2116,9 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, fill_command_sg(dd, command, nents); /* Populate the command header */ - command->command_header->opts = cpu_to_le32( - (nents << 16) | 5 | AHCI_CMD_PREFETCH); + command->command_header->opts = + __force_bit2int cpu_to_le32( + (nents << 16) | 5 | AHCI_CMD_PREFETCH); command->command_header->byte_count = 0; /* @@ -2152,10 +2137,15 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, command->async_callback = callback; /* - * Lock used to prevent this command from being issued - * if an internal command is in progress. + * To prevent this command from being issued + * if an internal command is in progress or error handling is active. */ - down_read(&port->dd->internal_sem); + if (unlikely(test_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags) || + test_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags))) { + set_bit(tag, port->cmds_to_issue); + set_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags); + return; + } /* Issue the command to the hardware */ mtip_issue_ncq_command(port, tag); @@ -2163,8 +2153,6 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, /* Set the command's timeout value.*/ port->commands[tag].comp_time = jiffies + msecs_to_jiffies( MTIP_NCQ_COMMAND_TIMEOUT_MS); - - up_read(&port->dd->internal_sem); } /* @@ -2400,10 +2388,9 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS); do { -#ifdef CONFIG_HOTPLUG if (mtip_check_surprise_removal(dd->pdev)) return -EFAULT; -#endif + if (mtip_get_identify(dd->port, NULL) < 0) return -EFAULT; @@ -2438,6 +2425,74 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) return 0; } +/* + * service thread to issue queued commands + * + * @data Pointer to the driver data structure. 
+ * + * return value + * 0 + */ + +static int mtip_service_thread(void *data) +{ + struct driver_data *dd = (struct driver_data *)data; + unsigned long slot, slot_start, slot_wrap; + unsigned int num_cmd_slots = dd->slot_groups * 32; + struct mtip_port *port = dd->port; + + while (1) { + /* + * the condition is to check neither an internal command is + * is in progress nor error handling is active + */ + wait_event_interruptible(port->svc_wait, (port->flags) && + !test_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags) && + !test_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags)); + + if (kthread_should_stop()) + break; + + if (test_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags)) { + set_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags); + slot = 1; + /* used to restrict the loop to one iteration */ + slot_start = num_cmd_slots; + slot_wrap = 0; + while (1) { + slot = find_next_bit(port->cmds_to_issue, + num_cmd_slots, slot); + if (slot_wrap == 1) { + if ((slot_start >= slot) || + (slot >= num_cmd_slots)) + break; + } + if (unlikely(slot_start == num_cmd_slots)) + slot_start = slot; + + if (unlikely(slot == num_cmd_slots)) { + slot = 1; + slot_wrap = 1; + continue; + } + + /* Issue the command to the hardware */ + mtip_issue_ncq_command(port, slot); + + /* Set the command's timeout value.*/ + port->commands[slot].comp_time = jiffies + + msecs_to_jiffies(MTIP_NCQ_COMMAND_TIMEOUT_MS); + + clear_bit(slot, port->cmds_to_issue); + } + + clear_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags); + clear_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags); + } + } + return 0; +} + /* * Called once for each card. * @@ -2463,13 +2518,6 @@ static int mtip_hw_init(struct driver_data *dd) hba_setup(dd); - /* - * Initialize the internal semaphore - * Use a rw semaphore to enable prioritization of - * mgmnt ioctl traffic during heavy IO load - */ - init_rwsem(&dd->internal_sem); - tasklet_init(&dd->tasklet, mtip_tasklet, (unsigned long)dd); dd->port = kzalloc(sizeof(struct mtip_port), GFP_KERNEL); @@ -2541,10 +2589,11 @@ static int mtip_hw_init(struct driver_data *dd) if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64) dd->port->commands[i].command_header->ctbau = - cpu_to_le32( + __force_bit2int cpu_to_le32( (dd->port->commands[i].command_dma >> 16) >> 16); - dd->port->commands[i].command_header->ctba = cpu_to_le32( - dd->port->commands[i].command_dma & 0xffffffff); + dd->port->commands[i].command_header->ctba = + __force_bit2int cpu_to_le32( + dd->port->commands[i].command_dma & 0xFFFFFFFF); /* * If this is not done, a bug is reported by the stock @@ -2597,6 +2646,8 @@ static int mtip_hw_init(struct driver_data *dd) dd->mmio + HOST_CTL); init_timer(&dd->port->cmd_timer); + init_waitqueue_head(&dd->port->svc_wait); + dd->port->cmd_timer.data = (unsigned long int) dd->port; dd->port->cmd_timer.function = mtip_timeout_function; mod_timer(&dd->port->cmd_timer, @@ -2667,12 +2718,12 @@ static int mtip_hw_exit(struct driver_data *dd) del_timer_sync(&dd->port->cmd_timer); - /* Stop the bottom half tasklet. */ - tasklet_kill(&dd->tasklet); - /* Release the IRQ. */ devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); + /* Stop the bottom half tasklet. */ + tasklet_kill(&dd->tasklet); + /* Free the command/command header memory. 
*/ dmam_free_coherent(&dd->pdev->dev, HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2), @@ -2835,7 +2886,7 @@ static int mtip_block_ioctl(struct block_device *dev, switch (cmd) { case BLKFLSBUF: - return 0; + return -ENOTTY; default: return mtip_hw_ioctl(dd, cmd, arg); } @@ -2870,19 +2921,20 @@ static int mtip_block_compat_ioctl(struct block_device *dev, switch (cmd) { case BLKFLSBUF: - return 0; + return -ENOTTY; case HDIO_DRIVE_TASKFILE: { - struct mtip_compat_ide_task_request_s *compat_req_task; + struct mtip_compat_ide_task_request_s __user *compat_req_task; ide_task_request_t req_task; int compat_tasksize, outtotal, ret; - compat_tasksize = sizeof(struct mtip_compat_ide_task_request_s); + compat_tasksize = + sizeof(struct mtip_compat_ide_task_request_s); compat_req_task = (struct mtip_compat_ide_task_request_s __user *) arg; if (copy_from_user(&req_task, (void __user *) arg, - compat_tasksize - (2 * sizeof(compat_long_t)))) + compat_tasksize - (2 * sizeof(compat_long_t)))) return -EFAULT; if (get_user(req_task.out_size, &compat_req_task->out_size)) @@ -2950,12 +3002,8 @@ static int mtip_block_getgeo(struct block_device *dev, geo->heads = 224; geo->sectors = 56; -#if BITS_PER_LONG == 64 - geo->cylinders = capacity / (geo->heads * geo->sectors); -#else - do_div(capacity, (geo->heads * geo->sectors)); + sector_div(capacity, (geo->heads * geo->sectors)); geo->cylinders = capacity; -#endif return 0; } @@ -2999,11 +3047,6 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) return; } - if (unlikely(atomic_read(&dd->eh_active))) { - bio_endio(bio, -EBUSY); - return; - } - sg = mtip_hw_get_scatterlist(dd, &tag); if (likely(sg != NULL)) { blk_queue_bounce(queue, &bio); @@ -3032,7 +3075,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) tag, bio_endio, bio, - bio->bi_rw & REQ_FLUSH, + bio->bi_rw & REQ_FUA, bio_data_dir(bio)); } else bio_io_error(bio); @@ -3055,6 +3098,7 @@ static int mtip_block_initialize(struct driver_data *dd) sector_t capacity; unsigned int index = 0; struct kobject *kobj; + unsigned char thd_name[16]; /* Initialize the protocol layer. */ rv = mtip_hw_init(dd); @@ -3082,6 +3126,7 @@ static int mtip_block_initialize(struct driver_data *dd) blk_queue_max_segments(dd->queue, MTIP_MAX_SG); blk_queue_physical_block_size(dd->queue, 4096); blk_queue_io_min(dd->queue, 4096); + blk_queue_flush(dd->queue, 0); dd->disk = alloc_disk(MTIP_MAX_MINORS); if (dd->disk == NULL) { @@ -3142,6 +3187,18 @@ static int mtip_block_initialize(struct driver_data *dd) kobject_put(kobj); } + sprintf(thd_name, "mtip_svc_thd_%02d", index); + + dd->mtip_svc_handler = kthread_run(mtip_service_thread, + dd, thd_name); + + if (IS_ERR(dd->mtip_svc_handler)) { + printk(KERN_ERR "mtip32xx: service thread failed to start\n"); + dd->mtip_svc_handler = NULL; + rv = -EFAULT; + goto read_capacity_error; + } + return rv; read_capacity_error: @@ -3183,6 +3240,13 @@ protocol_init_error: static int mtip_block_remove(struct driver_data *dd) { struct kobject *kobj; + + if (dd->mtip_svc_handler) { + set_bit(MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); + kthread_stop(dd->mtip_svc_handler); + } + /* Clean up the sysfs attributes managed by the protocol layer. 
*/ kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); if (kobj) { @@ -3275,7 +3339,6 @@ static int mtip_pci_probe(struct pci_dev *pdev, atomic_set(&dd->drv_cleanup_done, true); atomic_set(&dd->resumeflag, false); - atomic_set(&dd->eh_active, 0); /* Attach the private data to this PCI device. */ pci_set_drvdata(pdev, dd); @@ -3317,7 +3380,6 @@ static int mtip_pci_probe(struct pci_dev *pdev, /* Copy the info we may need later into the private data structure. */ dd->major = mtip_major; - dd->protocol = ent->driver_data; dd->instance = instance; dd->pdev = pdev; diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 17be4f444e7d..933192abe178 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -47,11 +47,11 @@ /* ftl rebuild */ #define MTIP_FTL_REBUILD_OFFSET 142 -#define MTIP_FTL_REBUILD_MAGIC 0xed51 +#define MTIP_FTL_REBUILD_MAGIC 0xED51 #define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000 /* Macro to extract the tag bit number from a tag value. */ -#define MTIP_TAG_BIT(tag) (tag & 0x1f) +#define MTIP_TAG_BIT(tag) (tag & 0x1F) /* * Macro to extract the tag index from a tag value. The index @@ -81,7 +81,7 @@ /* Driver name and version strings */ #define MTIP_DRV_NAME "mtip32xx" -#define MTIP_DRV_VERSION "1.2.6os2" +#define MTIP_DRV_VERSION "1.2.6os3" /* Maximum number of minor device numbers per device. */ #define MTIP_MAX_MINORS 16 @@ -114,6 +114,15 @@ #define dbg_printk(format, arg...) #endif +#define __force_bit2int (unsigned int __force) + +/* below are bit numbers in 'flags' defined in mtip_port */ +#define MTIP_FLAG_IC_ACTIVE_BIT 0 +#define MTIP_FLAG_EH_ACTIVE_BIT 1 +#define MTIP_FLAG_SVC_THD_ACTIVE_BIT 2 +#define MTIP_FLAG_ISSUE_CMDS_BIT 4 +#define MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT 8 + /* Register Frame Information Structure (FIS), host to device. */ struct host_to_dev_fis { /* @@ -262,7 +271,7 @@ struct mtip_cmd { unsigned long comp_time; /* command completion time, in jiffies */ - atomic_t active; /* declares if this command sent to the drive. */ + atomic_t active; /* declares if this command sent to the drive. */ }; /* Structure used to describe a port. */ @@ -278,7 +287,7 @@ struct mtip_port { void __iomem *mmio; /* Array of pointers to the memory mapped s_active registers. */ void __iomem *s_active[MTIP_MAX_SLOT_GROUPS]; - /* Array of pointers to the memory mapped completed registers. */ + /* Array of pointers to the memory mapped completed registers. */ void __iomem *completed[MTIP_MAX_SLOT_GROUPS]; /* Array of pointers to the memory mapped Command Issue registers. */ void __iomem *cmd_issue[MTIP_MAX_SLOT_GROUPS]; @@ -339,14 +348,24 @@ struct mtip_port { * are no longer needed. */ unsigned long allocated[SLOTBITS_IN_LONGS]; + /* + * used to queue commands when an internal command is in progress + * or error handling is active + */ + unsigned long cmds_to_issue[SLOTBITS_IN_LONGS]; /* * Array of command slots. Structure includes pointers to the * command header and command table, and completion function and data * pointers. */ struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS]; - /* Non-zero if an internal command is in progress. */ - int internal_cmd_in_progress; + /* Used by mtip_service_thread to wait for an event */ + wait_queue_head_t svc_wait; + /* + * indicates the state of the port. Also, helps the service thread + * to determine its action on wake up. + */ + unsigned long flags; /* * Timer used to complete commands that have been active for too long. 
*/ @@ -372,18 +391,11 @@ struct driver_data { int instance; /* Instance number. First device probed is 0, ... */ - int protocol; /* FIXME: Protocol ops array index. */ - struct gendisk *disk; /* Pointer to our gendisk structure. */ struct pci_dev *pdev; /* Pointer to the PCI device structure. */ struct request_queue *queue; /* Our request queue. */ - /* - * Semaphore used to lock out read/write commands during the - * execution of an internal command. - */ - struct rw_semaphore internal_sem; struct mtip_port *port; /* Pointer to the port data structure. */ @@ -403,6 +415,8 @@ struct driver_data { atomic_t resumeflag; /* Atomic variable to track suspend/resume */ atomic_t eh_active; /* Flag for error handling tracking */ + + struct task_struct *mtip_svc_handler; /* task_struct of svc thd */ }; #endif -- cgit From 3e54a3d1b83220d748f1a27c8999634be7a83949 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 24 Nov 2011 12:59:00 +0100 Subject: mtip32xx: uninitialized variable in mtip_quiesce_io() We recently introduce new continue in the loop which make gcc complain. In theory if MTIP_FLAG_SVC_THD_ACTIVE_BIT is set, we could hit continue over and over until eventually we time out of the loop. In that case "active" should be set as true, but right now it's uninitialized. Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index b5d843a02bfa..9bc10e31a143 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1026,7 +1026,8 @@ static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) { unsigned long to; - unsigned int n, active; + unsigned int n; + unsigned int active = 1; to = jiffies + msecs_to_jiffies(timeout); do { -- cgit From 1ba64edef6051d2ec79bb2fbd3a0c8f0df00ab55 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Dec 2011 00:33:37 +0100 Subject: block, sx8: kill blk_insert_request() The only user left for blk_insert_request() is sx8 and it can be trivially switched to use blk_execute_rq_nowait() - special requests aren't included in io stat and sx8 doesn't use block layer tagging. Switch sx8 and kill blk_insert_requeset(). This patch doesn't introduce any functional difference. Only compile tested. 
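For context, the conversion pattern is small enough to show in isolation. The sketch below is illustrative only, written against the 3.2-era block layer API; "my_queue", "my_rq" and "my_ctx" are hypothetical placeholder names, not sx8's own symbols. A NULL gendisk and NULL end_io callback with at_head set mirrors what blk_insert_request() used to do: queue the driver-special request at the head and keep it out of I/O accounting, with the driver context carried in rq->special.

	/*
	 * Illustrative sketch only (assumes the ~3.2-era block layer API);
	 * my_queue, my_rq and my_ctx are hypothetical names, not sx8's.
	 */
	#include <linux/blkdev.h>

	static void submit_oob_request(struct request_queue *my_queue,
				       struct request *my_rq, void *my_ctx)
	{
		/* Before: blk_insert_request(my_queue, my_rq, 1, my_ctx); */

		/*
		 * After: mark the request as a driver-special request and
		 * push it to the queue head without an end_io callback,
		 * which is effectively what blk_insert_request() did.
		 */
		my_rq->cmd_type = REQ_TYPE_SPECIAL;
		my_rq->special = my_ctx;
		blk_execute_rq_nowait(my_queue, NULL, my_rq, true, NULL);
	}

The sx8 hunks below are exactly this pattern applied to carm_array_info() and carm_send_special().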
Signed-off-by: Tejun Heo Acked-by: Jeff Garzik Signed-off-by: Jens Axboe --- drivers/block/sx8.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index b70f0fca9a42..e7472f567c9d 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -619,8 +619,10 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) host->state == HST_DEV_SCAN); spin_unlock_irq(&host->lock); - DPRINTK("blk_insert_request, tag == %u\n", idx); - blk_insert_request(host->oob_q, crq->rq, 1, crq); + DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); + crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->special = crq; + blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); return 0; @@ -658,8 +660,10 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) BUG_ON(rc < 0); crq->msg_bucket = (u32) rc; - DPRINTK("blk_insert_request, tag == %u\n", idx); - blk_insert_request(host->oob_q, crq->rq, 1, crq); + DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); + crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->special = crq; + blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); return 0; } -- cgit From f094148a1751d6ece9374851eb2926bc3cfd16ef Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Tue, 29 Nov 2011 22:08:00 +0100 Subject: xen-blkfront: Use kcalloc instead of kzalloc to allocate array The advantage of kcalloc is, that will prevent integer overflows which could result from the multiplication of number of elements and size and it is also a bit nicer to read. The semantic patch that makes this change is available in https://lkml.org/lkml/2011/11/25/107 Signed-off-by: Thomas Meyer [v1: Seperated the drivers/block/cciss_scsi.c out of this patch] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 351ddeffd430..8cb0c27f2654 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -157,7 +157,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) if (end > nr_minors) { unsigned long *bitmap, *old; - bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap), + bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap), GFP_KERNEL); if (bitmap == NULL) return -ENOMEM; -- cgit From 62ee8c13e26cffe6483630f59932c3e936dfb586 Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Wed, 4 Jan 2012 22:01:32 +0100 Subject: mtip32xx: do rebuild monitoring asynchronously Earlier, rebuild monitoring was done in the context of probe. Now the service thread takes the responsibility of rebuild monitoring, and probe returns good status. 
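The shape of that change is easiest to see stripped of the driver details: probe notices the pending rebuild, records it in a flag, starts the service kthread and returns; the kthread does the slow polling later. The following is a minimal sketch using only standard kthread/waitqueue primitives; my_dev, MY_REBUILD_BIT, my_rebuild_pending() and my_poll_rebuild() are hypothetical names, not the driver's.

	#include <linux/kthread.h>
	#include <linux/wait.h>
	#include <linux/bitops.h>
	#include <linux/err.h>

	#define MY_REBUILD_BIT	0

	struct my_dev {
		unsigned long flags;		/* deferred-work bits live here */
		wait_queue_head_t svc_wait;	/* service thread sleeps on this */
		struct task_struct *svc_task;
	};

	/* Stand-ins for the hardware-specific pieces of the real driver. */
	static bool my_rebuild_pending(struct my_dev *dd) { return false; }
	static void my_poll_rebuild(struct my_dev *dd) { }

	static int my_service_thread(void *data)
	{
		struct my_dev *dd = data;

		while (!kthread_should_stop()) {
			wait_event_interruptible(dd->svc_wait,
					dd->flags || kthread_should_stop());
			if (test_and_clear_bit(MY_REBUILD_BIT, &dd->flags))
				my_poll_rebuild(dd);	/* slow work, off the probe path */
		}
		return 0;
	}

	static int my_probe(struct my_dev *dd)
	{
		init_waitqueue_head(&dd->svc_wait);
		if (my_rebuild_pending(dd))
			set_bit(MY_REBUILD_BIT, &dd->flags);	/* defer, do not block probe */
		dd->svc_task = kthread_run(my_service_thread, dd, "my_svc_thd");
		return IS_ERR(dd->svc_task) ? PTR_ERR(dd->svc_task) : 0;
	}

Teardown is the mirror image: set a stop flag, wake svc_wait and kthread_stop() the task, as the earlier mtip_block_remove() change already does.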
Signed-off-by: Asai Thambi S P Signed-off-by: Sam Bradshaw Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 104 +++++++++++++++++++++++--------------- drivers/block/mtip32xx/mtip32xx.h | 1 + 2 files changed, 65 insertions(+), 40 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 9bc10e31a143..b74eab70c3d0 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -87,6 +87,8 @@ static int mtip_major; static DEFINE_SPINLOCK(rssd_index_lock); static DEFINE_IDA(rssd_index_ida); +static int mtip_block_initialize(struct driver_data *dd); + #ifdef CONFIG_COMPAT struct mtip_compat_ide_task_request_s { __u8 io_ports[8]; @@ -1031,7 +1033,8 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) to = jiffies + msecs_to_jiffies(timeout); do { - if (test_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags)) { + if (test_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags) && + test_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags)) { msleep(20); continue; /* svc thd is actively issuing commands */ } @@ -2410,6 +2413,7 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) "FTL rebuild complete (%d secs).\n", jiffies_to_msecs(jiffies - start) / 1000); dd->ftlrebuildflag = 0; + mtip_block_initialize(dd); break; } ssleep(10); @@ -2454,8 +2458,8 @@ static int mtip_service_thread(void *data) if (kthread_should_stop()) break; + set_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags); if (test_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags)) { - set_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags); slot = 1; /* used to restrict the loop to one iteration */ slot_start = num_cmd_slots; @@ -2488,8 +2492,14 @@ static int mtip_service_thread(void *data) } clear_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags); - clear_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags); + } else if (test_bit(MTIP_FLAG_REBUILD_BIT, &port->flags)) { + mtip_ftl_rebuild_poll(dd); + clear_bit(MTIP_FLAG_REBUILD_BIT, &port->flags); } + clear_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags); + + if (test_bit(MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT, &port->flags)) + break; } return 0; } @@ -2658,12 +2668,13 @@ static int mtip_hw_init(struct driver_data *dd) rv = -EFAULT; goto out3; } - mtip_dump_identify(dd->port); if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == MTIP_FTL_REBUILD_MAGIC) { - return mtip_ftl_rebuild_poll(dd); + set_bit(MTIP_FLAG_REBUILD_BIT, &dd->port->flags); + return MTIP_FTL_REBUILD_MAGIC; } + mtip_dump_identify(dd->port); return rv; out3: @@ -3095,40 +3106,24 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) */ static int mtip_block_initialize(struct driver_data *dd) { - int rv = 0; + int rv = 0, wait_for_rebuild = 0; sector_t capacity; unsigned int index = 0; struct kobject *kobj; unsigned char thd_name[16]; + if (dd->disk) + goto skip_create_disk; /* hw init done, before rebuild */ + /* Initialize the protocol layer. */ - rv = mtip_hw_init(dd); - if (rv < 0) { + wait_for_rebuild = mtip_hw_init(dd); + if (wait_for_rebuild < 0) { dev_err(&dd->pdev->dev, "Protocol layer initialization failed\n"); rv = -EINVAL; goto protocol_init_error; } - /* Allocate the request queue. */ - dd->queue = blk_alloc_queue(GFP_KERNEL); - if (dd->queue == NULL) { - dev_err(&dd->pdev->dev, - "Unable to allocate request queue\n"); - rv = -ENOMEM; - goto block_queue_alloc_init_error; - } - - /* Attach our request function to the request queue. 
*/ - blk_queue_make_request(dd->queue, mtip_make_request); - - /* Set device limits. */ - set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); - blk_queue_max_segments(dd->queue, MTIP_MAX_SG); - blk_queue_physical_block_size(dd->queue, 4096); - blk_queue_io_min(dd->queue, 4096); - blk_queue_flush(dd->queue, 0); - dd->disk = alloc_disk(MTIP_MAX_MINORS); if (dd->disk == NULL) { dev_err(&dd->pdev->dev, @@ -3161,11 +3156,39 @@ static int mtip_block_initialize(struct driver_data *dd) dd->disk->major = dd->major; dd->disk->first_minor = dd->instance * MTIP_MAX_MINORS; dd->disk->fops = &mtip_block_ops; - dd->disk->queue = dd->queue; dd->disk->private_data = dd; - dd->queue->queuedata = dd; dd->index = index; + /* + * if rebuild pending, start the service thread, and delay the block + * queue creation and add_disk() + */ + if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) + goto start_service_thread; + +skip_create_disk: + /* Allocate the request queue. */ + dd->queue = blk_alloc_queue(GFP_KERNEL); + if (dd->queue == NULL) { + dev_err(&dd->pdev->dev, + "Unable to allocate request queue\n"); + rv = -ENOMEM; + goto block_queue_alloc_init_error; + } + + /* Attach our request function to the request queue. */ + blk_queue_make_request(dd->queue, mtip_make_request); + + dd->disk->queue = dd->queue; + dd->queue->queuedata = dd; + + /* Set device limits. */ + set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); + blk_queue_max_segments(dd->queue, MTIP_MAX_SG); + blk_queue_physical_block_size(dd->queue, 4096); + blk_queue_io_min(dd->queue, 4096); + blk_queue_flush(dd->queue, 0); + /* Set the capacity of the device in 512 byte sectors. */ if (!(mtip_hw_get_capacity(dd, &capacity))) { dev_warn(&dd->pdev->dev, @@ -3188,6 +3211,10 @@ static int mtip_block_initialize(struct driver_data *dd) kobject_put(kobj); } + if (dd->mtip_svc_handler) + return rv; /* service thread created for handling rebuild */ + +start_service_thread: sprintf(thd_name, "mtip_svc_thd_%02d", index); dd->mtip_svc_handler = kthread_run(mtip_service_thread, @@ -3197,18 +3224,19 @@ static int mtip_block_initialize(struct driver_data *dd) printk(KERN_ERR "mtip32xx: service thread failed to start\n"); dd->mtip_svc_handler = NULL; rv = -EFAULT; - goto read_capacity_error; + goto kthread_run_error; } return rv; -read_capacity_error: - /* - * Delete our gendisk structure. This also removes the device - * from /dev - */ +kthread_run_error: + /* Delete our gendisk. This also removes the device from /dev */ del_gendisk(dd->disk); +read_capacity_error: + blk_cleanup_queue(dd->queue); + +block_queue_alloc_init_error: disk_index_error: spin_lock(&rssd_index_lock); ida_remove(&rssd_index_ida, index); @@ -3218,11 +3246,7 @@ ida_get_error: put_disk(dd->disk); alloc_disk_error: - blk_cleanup_queue(dd->queue); - -block_queue_alloc_init_error: - /* De-initialize the protocol layer. */ - mtip_hw_exit(dd); + mtip_hw_exit(dd); /* De-initialize the protocol layer. */ protocol_init_error: return rv; diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 933192abe178..723d7c4946dc 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -121,6 +121,7 @@ #define MTIP_FLAG_EH_ACTIVE_BIT 1 #define MTIP_FLAG_SVC_THD_ACTIVE_BIT 2 #define MTIP_FLAG_ISSUE_CMDS_BIT 4 +#define MTIP_FLAG_REBUILD_BIT 5 #define MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT 8 /* Register Frame Information Structure (FIS), host to device. 
*/ -- cgit From c2f5b65020869215814df03c3941dac9436f99fb Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 15 Oct 2011 07:33:46 -0400 Subject: NVMe: Simplify completion handling Instead of encoding the handler type in the bottom two bits of the per-completion context pointer, store the handler function as well as the context pointer. This gives us more flexibility and the code is clearer. It comes at the cost of an extra 8k of memory per queue, but this feels like a reasonable price to pay. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 167 +++++++++++++++++++++++++-------------------------- 1 file changed, 81 insertions(+), 86 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index a17f80fa3881..4724655a6ebf 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -135,8 +135,12 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); } +typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, + struct nvme_completion *); + struct nvme_cmd_info { - unsigned long ctx; + nvme_completion_fn fn; + void *ctx; unsigned long timeout; }; @@ -149,7 +153,7 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) * alloc_cmdid() - Allocate a Command ID * @nvmeq: The queue that will be used for this command * @ctx: A pointer that will be passed to the handler - * @handler: The ID of the handler to call + * @handler: The function to call on completion * * Allocate a Command ID for a queue. The data passed in will * be passed to the completion handler. This is implemented by using @@ -160,28 +164,27 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) * May be called with local interrupts disabled and the q_lock held, * or with interrupts enabled and no locks held. */ -static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler, - unsigned timeout) +static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, + nvme_completion_fn handler, unsigned timeout) { int depth = nvmeq->q_depth - 1; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); int cmdid; - BUG_ON((unsigned long)ctx & 3); - do { cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); if (cmdid >= depth) return -EBUSY; } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); - info[cmdid].ctx = (unsigned long)ctx | handler; + info[cmdid].fn = handler; + info[cmdid].ctx = ctx; info[cmdid].timeout = jiffies + timeout; return cmdid; } static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, - int handler, unsigned timeout) + nvme_completion_fn handler, unsigned timeout) { int cmdid; wait_event_killable(nvmeq->sq_full, @@ -189,47 +192,69 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, return (cmdid < 0) ? -EINTR : cmdid; } -/* - * If you need more than four handlers, you'll need to change how - * alloc_cmdid and nvme_process_cq work. Consider using a special - * CMD_CTX value instead, if that works for your situation. 
- */ -enum { - sync_completion_id = 0, - bio_completion_id, -}; - -/* Special values must be a multiple of 4, and less than 0x1000 */ -#define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) +/* Special values must be less than 0x1000 */ +#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) #define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) +static void special_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + if (ctx == CMD_CTX_CANCELLED) + return; + if (ctx == CMD_CTX_FLUSH) + return; + if (ctx == CMD_CTX_COMPLETED) { + dev_warn(nvmeq->q_dmadev, + "completed id %d twice on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + if (ctx == CMD_CTX_INVALID) { + dev_warn(nvmeq->q_dmadev, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + + dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); +} + /* * Called with local interrupts disabled and the q_lock held. May not sleep. */ -static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) +static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, + nvme_completion_fn *fn) { - unsigned long data; + void *ctx; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (cmdid >= nvmeq->q_depth) + if (cmdid >= nvmeq->q_depth) { + *fn = special_completion; return CMD_CTX_INVALID; - data = info[cmdid].ctx; + } + *fn = info[cmdid].fn; + ctx = info[cmdid].ctx; + info[cmdid].fn = special_completion; info[cmdid].ctx = CMD_CTX_COMPLETED; clear_bit(cmdid, nvmeq->cmdid_data); wake_up(&nvmeq->sq_full); - return data; + return ctx; } -static unsigned long cancel_cmdid(struct nvme_queue *nvmeq, int cmdid) +static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, + nvme_completion_fn *fn) { - unsigned long data; + void *ctx; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - data = info[cmdid].ctx; + if (fn) + *fn = info[cmdid].fn; + ctx = info[cmdid].ctx; + info[cmdid].fn = special_completion; info[cmdid].ctx = CMD_CTX_CANCELLED; - return data; + return ctx; } static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) @@ -485,7 +510,7 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) { int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, - sync_completion_id, IO_TIMEOUT); + special_completion, IO_TIMEOUT); if (unlikely(cmdid < 0)) return cmdid; @@ -518,7 +543,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, nbio->bio = bio; result = -EBUSY; - cmdid = alloc_cmdid(nvmeq, nbio, bio_completion_id, IO_TIMEOUT); + cmdid = alloc_cmdid(nvmeq, nbio, bio_completion, IO_TIMEOUT); if (unlikely(cmdid < 0)) goto free_nbio; @@ -599,45 +624,6 @@ static int nvme_make_request(struct request_queue *q, struct bio *bio) return 0; } -struct sync_cmd_info { - struct task_struct *task; - u32 result; - int status; -}; - -static void sync_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct sync_cmd_info *cmdinfo = ctx; - if (unlikely((unsigned long)cmdinfo == CMD_CTX_CANCELLED)) - return; - if ((unsigned long)cmdinfo == CMD_CTX_FLUSH) - return; - if (unlikely((unsigned long)cmdinfo == CMD_CTX_COMPLETED)) { - dev_warn(nvmeq->q_dmadev, - "completed id %d twice on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - 
return; - } - if (unlikely((unsigned long)cmdinfo == CMD_CTX_INVALID)) { - dev_warn(nvmeq->q_dmadev, - "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - wake_up_process(cmdinfo->task); -} - -typedef void (*completion_fn)(struct nvme_queue *, void *, - struct nvme_completion *); - -static const completion_fn nvme_completions[4] = { - [sync_completion_id] = sync_completion, - [bio_completion_id] = bio_completion, -}; - static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) { u16 head, phase; @@ -646,9 +632,8 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) phase = nvmeq->cq_phase; for (;;) { - unsigned long data; - void *ptr; - unsigned char handler; + void *ctx; + nvme_completion_fn fn; struct nvme_completion cqe = nvmeq->cqes[head]; if ((le16_to_cpu(cqe.status) & 1) != phase) break; @@ -658,10 +643,8 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) phase = !phase; } - data = free_cmdid(nvmeq, cqe.command_id); - handler = data & 3; - ptr = (void *)(data & ~3UL); - nvme_completions[handler](nvmeq, ptr, &cqe); + ctx = free_cmdid(nvmeq, cqe.command_id, &fn); + fn(nvmeq, ctx, &cqe); } /* If the controller ignores the cq head doorbell and continuously @@ -702,10 +685,25 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) { spin_lock_irq(&nvmeq->q_lock); - cancel_cmdid(nvmeq, cmdid); + cancel_cmdid(nvmeq, cmdid, NULL); spin_unlock_irq(&nvmeq->q_lock); } +struct sync_cmd_info { + struct task_struct *task; + u32 result; + int status; +}; + +static void sync_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + struct sync_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + wake_up_process(cmdinfo->task); +} + /* * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code @@ -719,7 +717,7 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion_id, + cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, timeout); if (cmdid < 0) return cmdid; @@ -1201,18 +1199,15 @@ static void nvme_timeout_ios(struct nvme_queue *nvmeq) int cmdid; for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { - unsigned long data; - void *ptr; - unsigned char handler; + void *ctx; + nvme_completion_fn fn; static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, }; if (!time_after(now, info[cmdid].timeout)) continue; dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid); - data = cancel_cmdid(nvmeq, cmdid); - handler = data & 3; - ptr = (void *)(data & ~3UL); - nvme_completions[handler](nvmeq, ptr, &cqe); + ctx = cancel_cmdid(nvmeq, cmdid, &fn); + fn(nvmeq, ctx, &cqe); } } -- cgit From 040a93b52a9eee8177ebaf2ba0ee0f9f518d1bf8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 20 Dec 2011 11:04:12 -0500 Subject: NVMe: Change get_nvmeq to take a dev instead of a namespace Upcoming patches require calling get_nvmeq when we don't have a namespace. Some callers already have the device in a local variable anyway. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 4724655a6ebf..aa2fd66aabd6 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -257,9 +257,9 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, return ctx; } -static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) +static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) { - return ns->dev->queues[get_cpu() + 1]; + return dev->queues[get_cpu() + 1]; } static void put_nvmeq(struct nvme_queue *nvmeq) @@ -606,7 +606,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, static int nvme_make_request(struct request_queue *q, struct bio *bio) { struct nvme_ns *ns = q->queuedata; - struct nvme_queue *nvmeq = get_nvmeq(ns); + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); int result = -EBUSY; spin_lock_irq(&nvmeq->q_lock); @@ -1103,7 +1103,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) /* XXX: metadata */ prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); - nvmeq = get_nvmeq(ns); + nvmeq = get_nvmeq(dev); /* * Since nvme_submit_sync_cmd sleeps, we can't keep preemption * disabled. We may be preempted at any point, and be rescheduled -- cgit From 5c1281a3bf5655ec1b90db495da3a2b77826ba88 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 20 Dec 2011 11:54:53 -0500 Subject: NVMe: Change nvme_completion_fn to take a dev The queue is only needed for some rare occasions, and it's more consistent to pass the device around. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index aa2fd66aabd6..b0e8a6dd33b1 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -135,7 +135,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); } -typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, +typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, struct nvme_completion *); struct nvme_cmd_info { @@ -199,7 +199,7 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) #define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) -static void special_completion(struct nvme_queue *nvmeq, void *ctx, +static void special_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) @@ -207,19 +207,19 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx, if (ctx == CMD_CTX_FLUSH) return; if (ctx == CMD_CTX_COMPLETED) { - dev_warn(nvmeq->q_dmadev, + dev_warn(&dev->pci_dev->dev, "completed id %d twice on queue %d\n", cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } if (ctx == CMD_CTX_INVALID) { - dev_warn(nvmeq->q_dmadev, + dev_warn(&dev->pci_dev->dev, "invalid id %d completed on queue %d\n", cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); + dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); } /* @@ -332,29 +332,36 @@ static struct nvme_bio *alloc_nbio(unsigned nseg, gfp_t gfp) sizeof(struct scatterlist) * nseg, gfp); } -static void free_nbio(struct nvme_queue *nvmeq, struct nvme_bio *nbio) +static void free_nbio(struct nvme_dev *dev, struct 
nvme_bio *nbio) { - nvme_free_prps(nvmeq->dev, nbio->prps); + nvme_free_prps(dev, nbio->prps); kfree(nbio); } -static void bio_completion(struct nvme_queue *nvmeq, void *ctx, +static void requeue_bio(struct nvme_dev *dev, struct bio *bio) +{ + struct nvme_queue *nvmeq = get_nvmeq(dev); + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, bio); + put_nvmeq(nvmeq); + wake_up_process(nvme_thread); +} + +static void bio_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { struct nvme_bio *nbio = ctx; struct bio *bio = nbio->bio; u16 status = le16_to_cpup(&cqe->status) >> 1; - dma_unmap_sg(nvmeq->q_dmadev, nbio->sg, nbio->nents, + dma_unmap_sg(&dev->pci_dev->dev, nbio->sg, nbio->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - free_nbio(nvmeq, nbio); + free_nbio(dev, nbio); if (status) { bio_endio(bio, -EIO); } else if (bio->bi_vcnt > bio->bi_idx) { - if (bio_list_empty(&nvmeq->sq_cong)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up_process(nvme_thread); + requeue_bio(dev, bio); } else { bio_endio(bio, 0); } @@ -594,7 +601,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; free_nbio: - free_nbio(nvmeq, nbio); + free_nbio(nvmeq->dev, nbio); nomem: return result; } @@ -644,7 +651,7 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) } ctx = free_cmdid(nvmeq, cqe.command_id, &fn); - fn(nvmeq, ctx, &cqe); + fn(nvmeq->dev, ctx, &cqe); } /* If the controller ignores the cq head doorbell and continuously @@ -695,7 +702,7 @@ struct sync_cmd_info { int status; }; -static void sync_completion(struct nvme_queue *nvmeq, void *ctx, +static void sync_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { struct sync_cmd_info *cmdinfo = ctx; @@ -1207,7 +1214,7 @@ static void nvme_timeout_ios(struct nvme_queue *nvmeq) continue; dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid); ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq, ctx, &cqe); + fn(nvmeq->dev, ctx, &cqe); } } -- cgit From eca18b2394a9387feeaf14cd884ddddd7a809d19 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 20 Dec 2011 13:34:52 -0500 Subject: NVMe: Merge the nvme_bio and nvme_prp data structures The new merged data structure is called nvme_iod. This improves performance for mid-sized I/Os (in the 16k range) since we save a memory allocation. It is also a slightly simpler interface to use. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 239 ++++++++++++++++++++++++++------------------------- 1 file changed, 124 insertions(+), 115 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b0e8a6dd33b1..4517608c068f 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -290,52 +290,70 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } -struct nvme_prps { - int npages; /* 0 means small pool in use */ +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_alloc_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + void *private; /* For the use of the submitter of the I/O */ + int npages; /* In the PRP list. 
0 means small pool in use */ + int offset; /* Of PRP list */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ dma_addr_t first_dma; - __le64 *list[0]; + struct scatterlist sg[0]; }; -static void nvme_free_prps(struct nvme_dev *dev, struct nvme_prps *prps) +static __le64 **iod_list(struct nvme_iod *iod) { - const int last_prp = PAGE_SIZE / 8 - 1; - int i; - dma_addr_t prp_dma; + return ((void *)iod) + iod->offset; +} - if (!prps) - return; +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_npages(unsigned size) +{ + unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} - prp_dma = prps->first_dma; +static struct nvme_iod * +nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) +{ + struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + + sizeof(__le64 *) * nvme_npages(nbytes) + + sizeof(struct scatterlist) * nseg, gfp); - if (prps->npages == 0) - dma_pool_free(dev->prp_small_pool, prps->list[0], prp_dma); - for (i = 0; i < prps->npages; i++) { - __le64 *prp_list = prps->list[i]; - dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); - prp_dma = next_prp_dma; + if (iod) { + iod->offset = offsetof(struct nvme_iod, sg[nseg]); + iod->npages = -1; + iod->length = nbytes; } - kfree(prps); -} -struct nvme_bio { - struct bio *bio; - int nents; - struct nvme_prps *prps; - struct scatterlist sg[0]; -}; - -/* XXX: use a mempool */ -static struct nvme_bio *alloc_nbio(unsigned nseg, gfp_t gfp) -{ - return kzalloc(sizeof(struct nvme_bio) + - sizeof(struct scatterlist) * nseg, gfp); + return iod; } -static void free_nbio(struct nvme_dev *dev, struct nvme_bio *nbio) +static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { - nvme_free_prps(dev, nbio->prps); - kfree(nbio); + const int last_prp = PAGE_SIZE / 8 - 1; + int i; + __le64 **list = iod_list(iod); + dma_addr_t prp_dma = iod->first_dma; + + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, list[0], prp_dma); + for (i = 0; i < iod->npages; i++) { + __le64 *prp_list = list[i]; + dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); + dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); + prp_dma = next_prp_dma; + } + kfree(iod); } static void requeue_bio(struct nvme_dev *dev, struct bio *bio) @@ -351,13 +369,13 @@ static void requeue_bio(struct nvme_dev *dev, struct bio *bio) static void bio_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { - struct nvme_bio *nbio = ctx; - struct bio *bio = nbio->bio; + struct nvme_iod *iod = ctx; + struct bio *bio = iod->private; u16 status = le16_to_cpup(&cqe->status) >> 1; - dma_unmap_sg(&dev->pci_dev->dev, nbio->sg, nbio->nents, + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - free_nbio(dev, nbio); + nvme_free_iod(dev, iod); if (status) { bio_endio(bio, -EIO); } else if (bio->bi_vcnt > bio->bi_idx) { @@ -368,25 +386,25 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, } /* length is in bytes. gfp flags indicates whether we may sleep. 
*/ -static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, - struct nvme_common_command *cmd, - struct scatterlist *sg, int *len, - gfp_t gfp) +static int nvme_setup_prps(struct nvme_dev *dev, + struct nvme_common_command *cmd, struct nvme_iod *iod, + int total_len, gfp_t gfp) { struct dma_pool *pool; - int length = *len; + int length = total_len; + struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); int offset = offset_in_page(dma_addr); __le64 *prp_list; + __le64 **list = iod_list(iod); dma_addr_t prp_dma; - int nprps, npages, i; - struct nvme_prps *prps = NULL; + int nprps, i; cmd->prp1 = cpu_to_le64(dma_addr); length -= (PAGE_SIZE - offset); if (length <= 0) - return prps; + return total_len; dma_len -= (PAGE_SIZE - offset); if (dma_len) { @@ -399,46 +417,35 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, if (length <= PAGE_SIZE) { cmd->prp2 = cpu_to_le64(dma_addr); - return prps; + return total_len; } nprps = DIV_ROUND_UP(length, PAGE_SIZE); - npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); - prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, gfp); - if (!prps) { - cmd->prp2 = cpu_to_le64(dma_addr); - *len = (*len - length) + PAGE_SIZE; - return prps; - } - if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; - prps->npages = 0; + iod->npages = 0; } else { pool = dev->prp_page_pool; - prps->npages = 1; + iod->npages = 1; } prp_list = dma_pool_alloc(pool, gfp, &prp_dma); if (!prp_list) { cmd->prp2 = cpu_to_le64(dma_addr); - *len = (*len - length) + PAGE_SIZE; - kfree(prps); - return NULL; + iod->npages = -1; + return (total_len - length) + PAGE_SIZE; } - prps->list[0] = prp_list; - prps->first_dma = prp_dma; + list[0] = prp_list; + iod->first_dma = prp_dma; cmd->prp2 = cpu_to_le64(prp_dma); i = 0; for (;;) { if (i == PAGE_SIZE / 8) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, gfp, &prp_dma); - if (!prp_list) { - *len = (*len - length); - return prps; - } - prps->list[prps->npages++] = prp_list; + if (!prp_list) + return total_len - length; + list[iod->npages++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); i = 1; @@ -457,21 +464,21 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, dma_len = sg_dma_len(sg); } - return prps; + return total_len; } /* NVMe scatterlists require no holes in the virtual address */ #define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) -static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, +static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { struct bio_vec *bvec, *bvprv = NULL; struct scatterlist *sg = NULL; int i, old_idx, length = 0, nsegs = 0; - sg_init_table(nbio->sg, psegs); + sg_init_table(iod->sg, psegs); old_idx = bio->bi_idx; bio_for_each_segment(bvec, bio, i) { if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { @@ -479,7 +486,7 @@ static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, } else { if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) break; - sg = sg ? sg + 1 : nbio->sg; + sg = sg ? 
sg + 1 : iod->sg; sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); nsegs++; @@ -488,9 +495,9 @@ static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, bvprv = bvec; } bio->bi_idx = i; - nbio->nents = nsegs; + iod->nents = nsegs; sg_mark_end(sg); - if (dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir) == 0) { + if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) { bio->bi_idx = old_idx; return -ENOMEM; } @@ -531,7 +538,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio) { struct nvme_command *cmnd; - struct nvme_bio *nbio; + struct nvme_iod *iod; enum dma_data_direction dma_dir; int cmdid, length, result = -ENOMEM; u16 control; @@ -544,15 +551,15 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return result; } - nbio = alloc_nbio(psegs, GFP_ATOMIC); - if (!nbio) + iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); + if (!iod) goto nomem; - nbio->bio = bio; + iod->private = bio; result = -EBUSY; - cmdid = alloc_cmdid(nvmeq, nbio, bio_completion, IO_TIMEOUT); + cmdid = alloc_cmdid(nvmeq, iod, bio_completion, IO_TIMEOUT); if (unlikely(cmdid < 0)) - goto free_nbio; + goto free_iod; if ((bio->bi_rw & REQ_FLUSH) && !psegs) return nvme_submit_flush(nvmeq, ns, cmdid); @@ -578,15 +585,15 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } - result = nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs); + result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); if (result < 0) - goto free_nbio; + goto free_iod; length = result; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg, - &length, GFP_ATOMIC); + length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, + GFP_ATOMIC); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); @@ -600,8 +607,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; - free_nbio: - free_nbio(nvmeq->dev, nbio); + free_iod: + nvme_free_iod(nvmeq->dev, iod); nomem: return result; } @@ -1005,18 +1012,18 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_map_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, unsigned length, - struct scatterlist **sgp) +static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, unsigned length) { int i, err, count, nents, offset; struct scatterlist *sg; struct page **pages; + struct nvme_iod *iod; if (addr & 3) - return -EINVAL; + return ERR_PTR(-EINVAL); if (!length) - return -EINVAL; + return ERR_PTR(-EINVAL); offset = offset_in_page(addr); count = DIV_ROUND_UP(offset + length, PAGE_SIZE); @@ -1029,7 +1036,8 @@ static int nvme_map_user_pages(struct nvme_dev *dev, int write, goto put_pages; } - sg = kcalloc(count, sizeof(*sg), GFP_KERNEL); + iod = nvme_alloc_iod(count, length, GFP_KERNEL); + sg = iod->sg; sg_init_table(sg, count); for (i = 0; i < count; i++) { sg_set_page(&sg[i], pages[i], @@ -1042,22 +1050,24 @@ static int nvme_map_user_pages(struct nvme_dev *dev, int write, nents = dma_map_sg(&dev->pci_dev->dev, sg, count, write ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); if (!nents) - goto put_pages; + goto free_iod; kfree(pages); - *sgp = sg; - return nents; + return iod; + free_iod: + kfree(iod); put_pages: for (i = 0; i < count; i++) put_page(pages[i]); kfree(pages); - return err; + return ERR_PTR(err); } static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, int length, struct scatterlist *sg) + unsigned long addr, int length, struct nvme_iod *iod) { + struct scatterlist *sg = iod->sg; int i, count; count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE); @@ -1074,9 +1084,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) struct nvme_user_io io; struct nvme_command c; unsigned length; - int nents, status; - struct scatterlist *sg; - struct nvme_prps *prps; + int status; + struct nvme_iod *iod; if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; @@ -1086,15 +1095,14 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) case nvme_cmd_write: case nvme_cmd_read: case nvme_cmd_compare: - nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, - length, &sg); + iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length); break; default: return -EINVAL; } - if (nents < 0) - return nents; + if (IS_ERR(iod)) + return PTR_ERR(iod); memset(&c, 0, sizeof(c)); c.rw.opcode = io.opcode; @@ -1108,7 +1116,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.apptag = io.apptag; c.rw.appmask = io.appmask; /* XXX: metadata */ - prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); + length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); nvmeq = get_nvmeq(dev); /* @@ -1123,8 +1131,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) else status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); - nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg); - nvme_free_prps(dev, prps); + nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, iod); + nvme_free_iod(dev, iod); return status; } @@ -1134,9 +1142,8 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, struct nvme_dev *dev = ns->dev; struct nvme_admin_cmd cmd; struct nvme_command c; - int status, length, nents = 0; - struct scatterlist *sg; - struct nvme_prps *prps = NULL; + int status, length; + struct nvme_iod *iod; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -1158,19 +1165,21 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, length = cmd.data_len; if (cmd.data_len) { - nents = nvme_map_user_pages(dev, 1, cmd.addr, length, &sg); - if (nents < 0) - return nents; - prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); + iod = nvme_map_user_pages(dev, 1, cmd.addr, length); + if (IS_ERR(iod)) + return PTR_ERR(iod); + length = nvme_setup_prps(dev, &c.common, iod, length, + GFP_KERNEL); } if (length != cmd.data_len) status = -ENOMEM; else status = nvme_submit_admin_cmd(dev, &c, NULL); + if (cmd.data_len) { - nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, sg); - nvme_free_prps(dev, prps); + nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, iod); + nvme_free_iod(dev, iod); } return status; } -- cgit From ff976d724a74e4522e9ca2de1fb37ac4520f454f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 20 Dec 2011 13:53:01 -0500 Subject: NVMe: Rename IO_TIMEOUT to NVME_IO_TIMEOUT IO_TIMEOUT is a little too generic and might be used by other parts of the kernel in the future. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 4517608c068f..1cc01872f6dc 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -45,7 +45,7 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define NVME_MINORS 64 -#define IO_TIMEOUT (5 * HZ) +#define NVME_IO_TIMEOUT (5 * HZ) #define ADMIN_TIMEOUT (60 * HZ) static int nvme_major; @@ -524,7 +524,7 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) { int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, - special_completion, IO_TIMEOUT); + special_completion, NVME_IO_TIMEOUT); if (unlikely(cmdid < 0)) return cmdid; @@ -557,7 +557,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, iod->private = bio; result = -EBUSY; - cmdid = alloc_cmdid(nvmeq, iod, bio_completion, IO_TIMEOUT); + cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); if (unlikely(cmdid < 0)) goto free_iod; @@ -1129,7 +1129,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; else - status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); + status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, iod); nvme_free_iod(dev, iod); -- cgit From 497421880acecd0281d3182d534f3d28c927caec Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 6 Jan 2012 13:42:45 -0700 Subject: NVMe: Fix DMA mapping for admin commands We were always mapping as DMA_FROM_DEVICE then unmapping with DMA_TO_DEVICE which was clearly not correct. Follow the same pattern as nvme_submit_io() and key off the bottom bit of the opcode to determine whether this is a read or a write. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 1cc01872f6dc..3f8cae9dc960 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1165,7 +1165,8 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, length = cmd.data_len; if (cmd.data_len) { - iod = nvme_map_user_pages(dev, 1, cmd.addr, length); + iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, + length); if (IS_ERR(iod)) return PTR_ERR(iod); length = nvme_setup_prps(dev, &c.common, iod, length, @@ -1178,7 +1179,8 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, status = nvme_submit_admin_cmd(dev, &c, NULL); if (cmd.data_len) { - nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, iod); + nvme_unmap_user_pages(dev, cmd.opcode & 1, cmd.addr, + cmd.data_len, iod); nvme_free_iod(dev, iod); } return status; -- cgit From fe304c43c6d63e29ed4fc46a874d7a74313788c5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 6 Jan 2012 13:49:25 -0700 Subject: NVMe: Mark the end of the sg list For user I/O and admin commands, we were forgetting to mark the end of the SG list. 
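The fix itself is a one-liner. As a reminder of the general pattern (an illustrative sketch, not code from this driver; the names are made up), a hand-built scatterlist must have its last used entry terminated before it is handed to dma_map_sg(), especially when fewer entries are filled than the table was sized for:

    #include <linux/scatterlist.h>
    #include <linux/dma-mapping.h>

    /* Sketch only: map 'used' pages out of a table sized for 'max_segs'. */
    static int map_pages_sketch(struct device *dev, struct scatterlist *sg,
                                int max_segs, struct page **pages, int used,
                                unsigned int len, enum dma_data_direction dir)
    {
            int i;

            sg_init_table(sg, max_segs);    /* zeroes table, terminates last slot */
            for (i = 0; i < used; i++)
                    sg_set_page(&sg[i], pages[i], len, 0);
            sg_mark_end(&sg[used - 1]);     /* re-terminate at the last entry used */
            return dma_map_sg(dev, sg, used, dir);
    }
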
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 3f8cae9dc960..71fc9030b4df 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1045,6 +1045,7 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, length -= (PAGE_SIZE - offset); offset = 0; } + sg_mark_end(&sg[i - 1]); err = -ENOMEM; nents = dma_map_sg(&dev->pci_dev->dev, sg, count, -- cgit From 1c2ad9faaf662b4a525348775deca3ac8e6c35a0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 6 Jan 2012 13:52:56 -0700 Subject: NVMe: Simplify nvme_unmap_user_pages By using the iod->nents field (the same way other I/O paths do), we can avoid recalculating the number of sg entries at unmap time, and make nvme_unmap_user_pages() easier to call. Also, use the 'write' parameter instead of assuming DMA_FROM_DEVICE. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 71fc9030b4df..3cf82c27a544 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1046,6 +1046,7 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, offset = 0; } sg_mark_end(&sg[i - 1]); + iod->nents = count; err = -ENOMEM; nents = dma_map_sg(&dev->pci_dev->dev, sg, count, @@ -1066,16 +1067,15 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, } static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, int length, struct nvme_iod *iod) + struct nvme_iod *iod) { - struct scatterlist *sg = iod->sg; - int i, count; + int i; - count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE); - dma_unmap_sg(&dev->pci_dev->dev, sg, count, DMA_FROM_DEVICE); + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - for (i = 0; i < count; i++) - put_page(sg_page(&sg[i])); + for (i = 0; i < iod->nents; i++) + put_page(sg_page(&iod->sg[i])); } static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) @@ -1132,7 +1132,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) else status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); - nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, iod); + nvme_unmap_user_pages(dev, io.opcode & 1, iod); nvme_free_iod(dev, iod); return status; } @@ -1180,8 +1180,7 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, status = nvme_submit_admin_cmd(dev, &c, NULL); if (cmd.data_len) { - nvme_unmap_user_pages(dev, cmd.opcode & 1, cmd.addr, - cmd.data_len, iod); + nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); nvme_free_iod(dev, iod); } return status; -- cgit From 4eeb9215a0d5c9494ca8b20158cc8ee82618840c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Jan 2012 14:35:08 -0700 Subject: NVMe: Set queue flags correctly QUEUE_FLAG_* are flags (other than QUEUE_FLAG_DEFAULT), so they cannot be ORed together. Set the queue flags using queue_flag_set_unlocked(). 
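To spell out the distinction (an illustrative sketch, not the driver code): QUEUE_FLAG_NONROT and friends are bit numbers, so ORing them together just produces another small integer, whereas QUEUE_FLAG_DEFAULT is already a mask of bits:

    #include <linux/blkdev.h>

    /* sketch_* name is illustrative */
    static void sketch_set_queue_flags(struct request_queue *q)
    {
            /*
             * Wrong: these are bit *numbers*, not masks, so ORing them is
             * meaningless:
             *
             *      q->queue_flags = QUEUE_FLAG_NOMERGES | QUEUE_FLAG_NONROT;
             */

            /* QUEUE_FLAG_DEFAULT is the exception: it is already a bit mask. */
            q->queue_flags = QUEUE_FLAG_DEFAULT;

            /* Individual flags are set one bit number at a time. */
            queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, q);
            queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
    }
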
Reported-by: Donald Wood Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 3cf82c27a544..b583603fae5b 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1315,8 +1315,10 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, ns->queue = blk_alloc_queue(GFP_KERNEL); if (!ns->queue) goto out_free_ns; - ns->queue->queue_flags = QUEUE_FLAG_DEFAULT | QUEUE_FLAG_NOMERGES | - QUEUE_FLAG_NONROT | QUEUE_FLAG_DISCARD; + ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); +/* queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */ blk_queue_make_request(ns->queue, nvme_make_request); ns->dev = dev; ns->queue->queuedata = ns; -- cgit From 366e8217e5ec6ce9f73aec19c46d983110fb4a98 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Jan 2012 16:30:15 -0500 Subject: NVMe: Version 0.8 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b583603fae5b..28c84b18712d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1726,6 +1726,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.7"); +MODULE_VERSION("0.8"); module_init(nvme_init); module_exit(nvme_exit); -- cgit From df3481399042200792822b6243e36a95a557b57e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 11 Jan 2012 07:29:56 -0700 Subject: NVMe: Set number of queues correctly The number of submission & completion queues should be set by calling Set Features, not Get Features. 
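As background (drawn from the NVMe specification rather than from this patch), the Number of Queues feature takes zero-based counts packed into dword11: bits 15:0 request submission queues, bits 31:16 request completion queues, and the completion result reports how many the controller actually granted. A sketch of the corrected call, reusing the helper the patch adds (the sketch_* wrapper name is illustrative):

    /* sketch_* name is illustrative; nvme_set_features is from the patch below */
    static int sketch_set_queue_count(struct nvme_dev *dev, int count)
    {
            u32 result;
            u32 q_count = (count - 1) | ((count - 1) << 16);
            int status;

            status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
                                       &result);
            if (status)
                    return -EIO;
            /* 'result' holds the zero-based counts the controller allocated */
            return 0;
    }
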
Reported-by: Kwok Kong Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 28c84b18712d..f4996b0e4b1a 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -840,7 +840,7 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, } static int nvme_get_features(struct nvme_dev *dev, unsigned fid, - unsigned dword11, dma_addr_t dma_addr, u32 *result) + unsigned dword11, dma_addr_t dma_addr) { struct nvme_command c; @@ -850,6 +850,20 @@ static int nvme_get_features(struct nvme_dev *dev, unsigned fid, c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +static int nvme_set_features(struct nvme_dev *dev, unsigned fid, + unsigned dword11, dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + return nvme_submit_admin_cmd(dev, &c, result); } @@ -1365,7 +1379,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) u32 result; u32 q_count = (count - 1) | ((count - 1) << 16); - status = nvme_get_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, + status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, &result); if (status) return -EIO; @@ -1482,7 +1496,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) continue; res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, - dma_addr + 4096, NULL); + dma_addr + 4096); if (res) continue; -- cgit From 0e805a1d857799352e51e8697c0b1a30aec16913 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 11 Jan 2012 19:42:15 -0800 Subject: rbd: initialize snap_rwsem in rbd_add() New rbd device structures get initialized in rbd_add(). Many of the fields rely on being initially zero-filled. However we lockdep was noticing that the rw_semaphore embedded in the header field was not getting properly initialized. Fix that. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 148ab944378d..3fd31dec8c9c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2184,6 +2184,8 @@ static ssize_t rbd_add(struct bus_type *bus, INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->snaps); + init_rwsem(&rbd_dev->header.snap_rwsem); + /* generate unique id: find highest unique id, add one */ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); -- cgit From 90ab5ee94171b3e28de6bb42ee30b527014e0be7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:20 +1030 Subject: module_param: make bool parameters really bool (drivers & misc) module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. It's time to remove the int/unsigned int option. For this version it'll simply give a warning, but it'll break next kernel version. 
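The shape of the conversion is the same in every driver touched below; roughly (a sketch, not taken from any one file):

    #include <linux/module.h>

    /* Before: the variable behind a bool parameter was declared as int:
     *     static int verbose = 0;
     * After: module_param(..., bool, ...) requires a real bool. */
    static bool verbose = 0;
    module_param(verbose, bool, 0644);
    MODULE_PARM_DESC(verbose, "enable verbose messages");
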
Acked-by: Mauro Carvalho Chehab Signed-off-by: Rusty Russell --- drivers/block/drbd/drbd_int.h | 4 ++-- drivers/block/drbd/drbd_main.c | 4 ++-- drivers/block/paride/bpck6.c | 5 ++--- drivers/block/paride/pd.c | 3 ++- drivers/block/paride/pf.c | 4 +++- drivers/block/paride/pg.c | 3 ++- drivers/block/paride/pt.c | 4 +++- drivers/block/xd.c | 2 +- 8 files changed, 17 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 9cf20355ceec..8d680562ba73 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -59,8 +59,8 @@ /* module parameter, defined in drbd_main.c */ extern unsigned int minor_count; -extern int disable_sendpage; -extern int allow_oos; +extern bool disable_sendpage; +extern bool allow_oos; extern unsigned int cn_idx; #ifdef CONFIG_DRBD_FAULT_INJECTION diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 0358e55356c8..211fc44f84be 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -117,8 +117,8 @@ module_param(fault_devs, int, 0644); /* module parameter, defined */ unsigned int minor_count = DRBD_MINOR_COUNT_DEF; -int disable_sendpage; -int allow_oos; +bool disable_sendpage; +bool allow_oos; unsigned int cn_idx = CN_IDX_DRBD; int proc_details; /* Detail level in proc drbd*/ diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c index ad124525ac23..ec64e7f5d1ce 100644 --- a/drivers/block/paride/bpck6.c +++ b/drivers/block/paride/bpck6.c @@ -20,9 +20,6 @@ */ -/* PARAMETERS */ -static int verbose; /* set this to 1 to see debugging messages and whatnot */ - #define BACKPACK_VERSION "2.0.2" #include @@ -36,6 +33,8 @@ static int verbose; /* set this to 1 to see debugging messages and whatnot */ #include "ppc6lnx.c" #include "paride.h" +/* PARAMETERS */ +static bool verbose; /* set this to 1 to see debugging messages and whatnot */ #define PPCSTRUCT(pi) ((Interface *)(pi->private)) diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 869e7676d46f..831e3ac156e6 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -124,8 +124,9 @@ by default. */ +#include -static int verbose = 0; +static bool verbose = 0; static int major = PD_MAJOR; static char *name = PD_NAME; static int cluster = 64; diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index f21b520ef419..ec8f9ed6326e 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -118,13 +118,15 @@ #define PF_NAME "pf" #define PF_UNITS 4 +#include + /* Here are things one can override from the insmod command. Most are autoprobed by paride unless set here. Verbose is off by default. */ -static int verbose = 0; +static bool verbose = 0; static int major = PF_MAJOR; static char *name = PF_NAME; static int cluster = 64; diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index a79fb4f7ff62..4a27b1de5fcb 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -130,13 +130,14 @@ #define PI_PG 4 #endif +#include /* Here are things one can override from the insmod command. Most are autoprobed by paride unless set here. Verbose is 0 by default. 
*/ -static int verbose = 0; +static bool verbose = 0; static int major = PG_MAJOR; static char *name = PG_NAME; static int disable = 0; diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 7179f79d7468..2596042eb987 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -109,13 +109,15 @@ #define PT_NAME "pt" #define PT_UNITS 4 +#include + /* Here are things one can override from the insmod command. Most are autoprobed by paride unless set here. Verbose is on by default. */ -static int verbose = 0; +static bool verbose = 0; static int major = PT_MAJOR; static char *name = PT_NAME; static int disable = 0; diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 4abd2bcd20fb..51a972704db5 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -148,7 +148,7 @@ static volatile int xdc_busy; static struct timer_list xd_watchdog_int; static volatile u_char xd_error; -static int nodma = XD_DONT_USE_DMA; +static bool nodma = XD_DONT_USE_DMA; static struct request_queue *xd_queue; -- cgit From 1b9fbafb3ad3fd02db42e3dd48b4fb7631753ca9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:26 +1030 Subject: paride/pcd: fix bool verbose module parameter. Dan Carpenter points out that it's an int, not a bool: pcd.c:427: if (verbose > 1) pcd.c:433: if (verbose > 1) pcd.c:437: if (verbose < 2) pcd.c:506:#define DBMSG(msg) ((verbose>1)?(msg):NULL) Signed-off-by: Rusty Russell Cc: Dan Carpenter --- drivers/block/paride/pcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 46b8136c31bb..ba2b6b5e5910 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -144,7 +144,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY}; static DEFINE_MUTEX(pcd_mutex); static DEFINE_SPINLOCK(pcd_lock); -module_param(verbose, bool, 0644); +module_param(verbose, int, 0644); module_param(major, int, 0); module_param(name, charp, 0); module_param(nice, int, 0); -- cgit From 577ebb374c78314ac4617242f509e2f5e7156649 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 12 Jan 2012 16:01:27 +0100 Subject: block: add and use scsi_blk_cmd_ioctl Introduce a wrapper around scsi_cmd_ioctl that takes a block device. The function will then be enhanced to detect partition block devices and, in that case, subject the ioctls to whitelisting. Cc: linux-scsi@vger.kernel.org Cc: Jens Axboe Cc: James Bottomley Signed-off-by: Paolo Bonzini Signed-off-by: Linus Torvalds --- drivers/block/cciss.c | 6 +++--- drivers/block/ub.c | 3 +-- drivers/block/virtio_blk.c | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 587cce57adae..b0f553b26d0f 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1735,7 +1735,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_BIG_PASSTHRU: return cciss_bigpassthru(h, argp); - /* scsi_cmd_ioctl handles these, below, though some are not */ + /* scsi_cmd_blk_ioctl handles these, below, though some are not */ /* very meaningful for cciss. SG_IO is the main one people want. 
*/ case SG_GET_VERSION_NUM: @@ -1746,9 +1746,9 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case SG_EMULATED_HOST: case SG_IO: case SCSI_IOCTL_SEND_COMMAND: - return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp); + return scsi_cmd_blk_ioctl(bdev, mode, cmd, argp); - /* scsi_cmd_ioctl would normally handle these, below, but */ + /* scsi_cmd_blk_ioctl would normally handle these, below, but */ /* they aren't a good fit for cciss, as CD-ROMs are */ /* not supported, and we don't have any bus/target/lun */ /* which we present to the kernel. */ diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 0e376d46bdd1..7333b9e44411 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -1744,12 +1744,11 @@ static int ub_bd_release(struct gendisk *disk, fmode_t mode) static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct gendisk *disk = bdev->bd_disk; void __user *usermem = (void __user *) arg; int ret; mutex_lock(&ub_mutex); - ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem); + ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, usermem); mutex_unlock(&ub_mutex); return ret; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index ffd5ca919295..c4a60badf252 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -250,8 +250,8 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI)) return -ENOTTY; - return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, - (void __user *)data); + return scsi_cmd_blk_ioctl(bdev, mode, cmd, + (void __user *)data); } /* We provide getgeo only to please some old bootloader/partitioning tools */ -- cgit From 93c3d65b28bab6da520c2add9cb387a0303f8b2d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 18 Jan 2012 15:41:27 -0800 Subject: nvme: fix merge error due to change of 'make_request_fn' fn type The type of 'make_request_fn' changed in 5a7bbad27a4 ("block: remove support for bio remapping from ->make_request"), but the merge of the nvme driver didn't take that into account, and as a result the driver would compile with a warning: drivers/block/nvme.c: In function 'nvme_alloc_ns': drivers/block/nvme.c:1336:2: warning: passing argument 2 of 'blk_queue_make_request' from incompatible pointer type [enabled by default] include/linux/blkdev.h:830:13: note: expected 'void (*)(struct request_queue *, struct bio *)' but argument is of type 'int (*)(struct request_queue *, struct bio *)' It's benign, but the warning is annoying. Reported-by: Stephen Rothwell Cc: Matthew Wilcox Signed-off-by: Linus Torvalds --- drivers/block/nvme.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f4996b0e4b1a..c1dc4d86c221 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -613,11 +613,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return result; } -/* - * NB: return value of non-zero would mean that we were a stacking driver. - * make_request must always succeed. 
- */ -static int nvme_make_request(struct request_queue *q, struct bio *bio) +static void nvme_make_request(struct request_queue *q, struct bio *bio) { struct nvme_ns *ns = q->queuedata; struct nvme_queue *nvmeq = get_nvmeq(ns->dev); @@ -634,8 +630,6 @@ static int nvme_make_request(struct request_queue *q, struct bio *bio) spin_unlock_irq(&nvmeq->q_lock); put_nvmeq(nvmeq); - - return 0; } static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) -- cgit
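For reference, the prototype the block layer expects after 5a7bbad27a4 looks like the following minimal sketch (the sketch_* names are illustrative, not from the driver): ->make_request_fn returns void and must consume the bio itself, since there is no longer a return value for remapping or stacking.

    #include <linux/blkdev.h>
    #include <linux/bio.h>

    static void sketch_make_request(struct request_queue *q, struct bio *bio)
    {
            /* trivial "null device" behaviour: complete the bio successfully */
            bio_endio(bio, 0);
    }

    static void sketch_setup_queue(struct request_queue *q)
    {
            blk_queue_make_request(q, sketch_make_request);
    }
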