From 0793448187643b50af89d36b08470baf45a3cab4 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@stericsson.com>
Date: Fri, 26 Mar 2010 16:50:49 -0700
Subject: DMAENGINE: generic channel status v2

Convert the device_is_tx_complete() operation on the
DMA engine to a generic device_tx_status()operation which
can return three states, DMA_TX_RUNNING, DMA_TX_COMPLETE,
DMA_TX_PAUSED.

[dan.j.williams@intel.com: update for timberdale]
Signed-off-by: Linus Walleij <linus.walleij@stericsson.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Cc: Maciej Sosnowski <maciej.sosnowski@intel.com>
Cc: Nicolas Ferre <nicolas.ferre@atmel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Li Yang <leoli@freescale.com>
Cc: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
Cc: Magnus Damm <damm@opensource.se>
Cc: Liam Girdwood <lrg@slimlogic.co.uk>
Cc: Joe Perches <joe@perches.com>
Cc: Roland Dreier <rdreier@cisco.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/ioat/dma_v3.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'drivers/dma/ioat/dma_v3.c')

diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c
index 26febc56dab1..d1adbf35268c 100644
--- a/drivers/dma/ioat/dma_v3.c
+++ b/drivers/dma/ioat/dma_v3.c
@@ -438,17 +438,17 @@ static void ioat3_timer_event(unsigned long data)
 }
 
 static enum dma_status
-ioat3_is_complete(struct dma_chan *c, dma_cookie_t cookie,
-		  dma_cookie_t *done, dma_cookie_t *used)
+ioat3_tx_status(struct dma_chan *c, dma_cookie_t cookie,
+		struct dma_tx_state *txstate)
 {
 	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
 
-	if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS)
+	if (ioat_tx_status(c, cookie, txstate) == DMA_SUCCESS)
 		return DMA_SUCCESS;
 
 	ioat3_cleanup_poll(ioat);
 
-	return ioat_is_complete(c, cookie, done, used);
+	return ioat_tx_status(c, cookie, txstate);
 }
 
 static struct dma_async_tx_descriptor *
@@ -976,7 +976,7 @@ static int __devinit ioat_xor_val_self_test(struct ioatdma_device *device)
 
 	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
 
-	if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+	if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_SUCCESS) {
 		dev_err(dev, "Self-test xor timed out\n");
 		err = -ENODEV;
 		goto free_resources;
@@ -1030,7 +1030,7 @@ static int __devinit ioat_xor_val_self_test(struct ioatdma_device *device)
 
 	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
 
-	if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+	if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_SUCCESS) {
 		dev_err(dev, "Self-test validate timed out\n");
 		err = -ENODEV;
 		goto free_resources;
@@ -1071,7 +1071,7 @@ static int __devinit ioat_xor_val_self_test(struct ioatdma_device *device)
 
 	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
 
-	if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+	if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_SUCCESS) {
 		dev_err(dev, "Self-test memset timed out\n");
 		err = -ENODEV;
 		goto free_resources;
@@ -1114,7 +1114,7 @@ static int __devinit ioat_xor_val_self_test(struct ioatdma_device *device)
 
 	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
 
-	if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+	if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_SUCCESS) {
 		dev_err(dev, "Self-test 2nd validate timed out\n");
 		err = -ENODEV;
 		goto free_resources;
@@ -1258,11 +1258,11 @@ int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca)
 
 
 	if (is_raid_device) {
-		dma->device_is_tx_complete = ioat3_is_complete;
+		dma->device_tx_status = ioat3_tx_status;
 		device->cleanup_fn = ioat3_cleanup_event;
 		device->timer_fn = ioat3_timer_event;
 	} else {
-		dma->device_is_tx_complete = ioat_is_dma_complete;
+		dma->device_tx_status = ioat_dma_tx_status;
 		device->cleanup_fn = ioat2_cleanup_event;
 		device->timer_fn = ioat2_timer_event;
 	}
-- 
cgit 


From 074cc47679f8b0931d7d5384e95822d82768f149 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 1 May 2010 15:22:55 -0700
Subject: ioat2,3: convert to producer/consumer locking

Use separate locks for the descriptor prep (producer) and descriptor
cleanup (consumer) paths.  Allows the producer path to run concurrently
with the cleanup path.  Inspired by Documentation/circular-buffer.txt.

Cc: David Howells <dhowells@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/ioat/dma_v3.c | 117 ++++++++++++++--------------------------------
 1 file changed, 35 insertions(+), 82 deletions(-)

(limited to 'drivers/dma/ioat/dma_v3.c')

diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c
index 6740e319c9cf..8b573fac2a25 100644
--- a/drivers/dma/ioat/dma_v3.c
+++ b/drivers/dma/ioat/dma_v3.c
@@ -260,8 +260,8 @@ static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete)
 	struct ioat_chan_common *chan = &ioat->base;
 	struct ioat_ring_ent *desc;
 	bool seen_current = false;
+	int idx = ioat->tail, i;
 	u16 active;
-	int i;
 
 	dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n",
 		__func__, ioat->head, ioat->tail, ioat->issued);
@@ -270,13 +270,14 @@ static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete)
 	for (i = 0; i < active && !seen_current; i++) {
 		struct dma_async_tx_descriptor *tx;
 
-		prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1));
-		desc = ioat2_get_ring_ent(ioat, ioat->tail + i);
+		smp_read_barrier_depends();
+		prefetch(ioat2_get_ring_ent(ioat, idx + i + 1));
+		desc = ioat2_get_ring_ent(ioat, idx + i);
 		dump_desc_dbg(ioat, desc);
 		tx = &desc->txd;
 		if (tx->cookie) {
 			chan->completed_cookie = tx->cookie;
-			ioat3_dma_unmap(ioat, desc, ioat->tail + i);
+			ioat3_dma_unmap(ioat, desc, idx + i);
 			tx->cookie = 0;
 			if (tx->callback) {
 				tx->callback(tx->callback_param);
@@ -293,69 +294,30 @@ static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete)
 			i++;
 		}
 	}
-	ioat->tail += i;
+	smp_mb(); /* finish all descriptor reads before incrementing tail */
+	ioat->tail = idx + i;
 	BUG_ON(active && !seen_current); /* no active descs have written a completion? */
 	chan->last_completion = phys_complete;
 
-	active = ioat2_ring_active(ioat);
-	if (active == 0) {
+	if (active - i == 0) {
 		dev_dbg(to_dev(chan), "%s: cancel completion timeout\n",
 			__func__);
 		clear_bit(IOAT_COMPLETION_PENDING, &chan->state);
 		mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
 	}
 	/* 5 microsecond delay per pending descriptor */
-	writew(min((5 * active), IOAT_INTRDELAY_MASK),
+	writew(min((5 * (active - i)), IOAT_INTRDELAY_MASK),
 	       chan->device->reg_base + IOAT_INTRDELAY_OFFSET);
 }
 
-/* try to cleanup, but yield (via spin_trylock) to incoming submissions
- * with the expectation that we will immediately poll again shortly
- */
-static void ioat3_cleanup_poll(struct ioat2_dma_chan *ioat)
+static void ioat3_cleanup(struct ioat2_dma_chan *ioat)
 {
 	struct ioat_chan_common *chan = &ioat->base;
 	unsigned long phys_complete;
 
-	prefetch(chan->completion);
-
-	if (!spin_trylock_bh(&chan->cleanup_lock))
-		return;
-
-	if (!ioat_cleanup_preamble(chan, &phys_complete)) {
-		spin_unlock_bh(&chan->cleanup_lock);
-		return;
-	}
-
-	if (!spin_trylock_bh(&ioat->ring_lock)) {
-		spin_unlock_bh(&chan->cleanup_lock);
-		return;
-	}
-
-	__cleanup(ioat, phys_complete);
-
-	spin_unlock_bh(&ioat->ring_lock);
-	spin_unlock_bh(&chan->cleanup_lock);
-}
-
-/* run cleanup now because we already delayed the interrupt via INTRDELAY */
-static void ioat3_cleanup_sync(struct ioat2_dma_chan *ioat)
-{
-	struct ioat_chan_common *chan = &ioat->base;
-	unsigned long phys_complete;
-
-	prefetch(chan->completion);
-
 	spin_lock_bh(&chan->cleanup_lock);
-	if (!ioat_cleanup_preamble(chan, &phys_complete)) {
-		spin_unlock_bh(&chan->cleanup_lock);
-		return;
-	}
-	spin_lock_bh(&ioat->ring_lock);
-
-	__cleanup(ioat, phys_complete);
-
-	spin_unlock_bh(&ioat->ring_lock);
+	if (ioat_cleanup_preamble(chan, &phys_complete))
+		__cleanup(ioat, phys_complete);
 	spin_unlock_bh(&chan->cleanup_lock);
 }
 
@@ -363,7 +325,7 @@ static void ioat3_cleanup_event(unsigned long data)
 {
 	struct ioat2_dma_chan *ioat = to_ioat2_chan((void *) data);
 
-	ioat3_cleanup_sync(ioat);
+	ioat3_cleanup(ioat);
 	writew(IOAT_CHANCTRL_RUN, ioat->base.reg_base + IOAT_CHANCTRL_OFFSET);
 }
 
@@ -384,12 +346,10 @@ static void ioat3_timer_event(unsigned long data)
 	struct ioat2_dma_chan *ioat = to_ioat2_chan((void *) data);
 	struct ioat_chan_common *chan = &ioat->base;
 
-	spin_lock_bh(&chan->cleanup_lock);
 	if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) {
 		unsigned long phys_complete;
 		u64 status;
 
-		spin_lock_bh(&ioat->ring_lock);
 		status = ioat_chansts(chan);
 
 		/* when halted due to errors check for channel
@@ -408,26 +368,31 @@ static void ioat3_timer_event(unsigned long data)
 		 * acknowledged a pending completion once, then be more
 		 * forceful with a restart
 		 */
+		spin_lock_bh(&chan->cleanup_lock);
 		if (ioat_cleanup_preamble(chan, &phys_complete))
 			__cleanup(ioat, phys_complete);
-		else if (test_bit(IOAT_COMPLETION_ACK, &chan->state))
+		else if (test_bit(IOAT_COMPLETION_ACK, &chan->state)) {
+			spin_lock_bh(&ioat->prep_lock);
 			ioat3_restart_channel(ioat);
-		else {
+			spin_unlock_bh(&ioat->prep_lock);
+		} else {
 			set_bit(IOAT_COMPLETION_ACK, &chan->state);
 			mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
 		}
-		spin_unlock_bh(&ioat->ring_lock);
+		spin_unlock_bh(&chan->cleanup_lock);
 	} else {
 		u16 active;
 
 		/* if the ring is idle, empty, and oversized try to step
 		 * down the size
 		 */
-		spin_lock_bh(&ioat->ring_lock);
+		spin_lock_bh(&chan->cleanup_lock);
+		spin_lock_bh(&ioat->prep_lock);
 		active = ioat2_ring_active(ioat);
 		if (active == 0 && ioat->alloc_order > ioat_get_alloc_order())
 			reshape_ring(ioat, ioat->alloc_order-1);
-		spin_unlock_bh(&ioat->ring_lock);
+		spin_unlock_bh(&ioat->prep_lock);
+		spin_unlock_bh(&chan->cleanup_lock);
 
 		/* keep shrinking until we get back to our minimum
 		 * default size
@@ -435,7 +400,6 @@ static void ioat3_timer_event(unsigned long data)
 		if (ioat->alloc_order > ioat_get_alloc_order())
 			mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
 	}
-	spin_unlock_bh(&chan->cleanup_lock);
 }
 
 static enum dma_status
@@ -447,7 +411,7 @@ ioat3_is_complete(struct dma_chan *c, dma_cookie_t cookie,
 	if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS)
 		return DMA_SUCCESS;
 
-	ioat3_cleanup_poll(ioat);
+	ioat3_cleanup(ioat);
 
 	return ioat_is_complete(c, cookie, done, used);
 }
@@ -460,15 +424,12 @@ ioat3_prep_memset_lock(struct dma_chan *c, dma_addr_t dest, int value,
 	struct ioat_ring_ent *desc;
 	size_t total_len = len;
 	struct ioat_fill_descriptor *fill;
-	int num_descs;
 	u64 src_data = (0x0101010101010101ULL) * (value & 0xff);
-	u16 idx;
-	int i;
+	int num_descs, idx, i;
 
 	num_descs = ioat2_xferlen_to_descs(ioat, len);
-	if (likely(num_descs) &&
-	    ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0)
-		/* pass */;
+	if (likely(num_descs) && ioat2_check_space_lock(ioat, num_descs) == 0)
+		idx = ioat->head;
 	else
 		return NULL;
 	i = 0;
@@ -513,11 +474,8 @@ __ioat3_prep_xor_lock(struct dma_chan *c, enum sum_check_flags *result,
 	struct ioat_xor_descriptor *xor;
 	struct ioat_xor_ext_descriptor *xor_ex = NULL;
 	struct ioat_dma_descriptor *hw;
+	int num_descs, with_ext, idx, i;
 	u32 offset = 0;
-	int num_descs;
-	int with_ext;
-	int i;
-	u16 idx;
 	u8 op = result ? IOAT_OP_XOR_VAL : IOAT_OP_XOR;
 
 	BUG_ON(src_cnt < 2);
@@ -537,9 +495,8 @@ __ioat3_prep_xor_lock(struct dma_chan *c, enum sum_check_flags *result,
 	 * (legacy) descriptor to ensure all completion writes arrive in
 	 * order.
 	 */
-	if (likely(num_descs) &&
-	    ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0)
-		/* pass */;
+	if (likely(num_descs) && ioat2_check_space_lock(ioat, num_descs+1) == 0)
+		idx = ioat->head;
 	else
 		return NULL;
 	i = 0;
@@ -657,11 +614,8 @@ __ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result,
 	struct ioat_pq_ext_descriptor *pq_ex = NULL;
 	struct ioat_dma_descriptor *hw;
 	u32 offset = 0;
-	int num_descs;
-	int with_ext;
-	int i, s;
-	u16 idx;
 	u8 op = result ? IOAT_OP_PQ_VAL : IOAT_OP_PQ;
+	int i, s, idx, with_ext, num_descs;
 
 	dev_dbg(to_dev(chan), "%s\n", __func__);
 	/* the engine requires at least two sources (we provide
@@ -687,8 +641,8 @@ __ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result,
 	 * order.
 	 */
 	if (likely(num_descs) &&
-	    ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0)
-		/* pass */;
+	    ioat2_check_space_lock(ioat, num_descs+1) == 0)
+		idx = ioat->head;
 	else
 		return NULL;
 	i = 0;
@@ -851,10 +805,9 @@ ioat3_prep_interrupt_lock(struct dma_chan *c, unsigned long flags)
 	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
 	struct ioat_ring_ent *desc;
 	struct ioat_dma_descriptor *hw;
-	u16 idx;
 
-	if (ioat2_alloc_and_lock(&idx, ioat, 1) == 0)
-		desc = ioat2_get_ring_ent(ioat, idx);
+	if (ioat2_check_space_lock(ioat, 1) == 0)
+		desc = ioat2_get_ring_ent(ioat, ioat->head);
 	else
 		return NULL;
 
-- 
cgit 


From 2adfc550b6d9646301c810643bc309fa49375987 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 1 May 2010 15:22:56 -0700
Subject: ioat3: disable cacheline-unaligned transfers for raid operations

There are cases where cacheline-unaligned raid operations can hang the
dma channel.  Simply disable these operations by increasing the
alignment constraints published to async_tx.  The raid456 driver always
issues page aligned requests, so the only in-kernel user of the ioatdma
driver that is affected by this change is dmatest.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/ioat/dma_v3.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/dma/ioat/dma_v3.c')

diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c
index 8b573fac2a25..e2a23952172d 100644
--- a/drivers/dma/ioat/dma_v3.c
+++ b/drivers/dma/ioat/dma_v3.c
@@ -1175,7 +1175,7 @@ int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca)
 	if (cap & IOAT_CAP_XOR) {
 		is_raid_device = true;
 		dma->max_xor = 8;
-		dma->xor_align = 2;
+		dma->xor_align = 6;
 
 		dma_cap_set(DMA_XOR, dma->cap_mask);
 		dma->device_prep_dma_xor = ioat3_prep_xor;
@@ -1186,7 +1186,7 @@ int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca)
 	if (cap & IOAT_CAP_PQ) {
 		is_raid_device = true;
 		dma_set_maxpq(dma, 8, 0);
-		dma->pq_align = 2;
+		dma->pq_align = 6;
 
 		dma_cap_set(DMA_PQ, dma->cap_mask);
 		dma->device_prep_dma_pq = ioat3_prep_pq;
@@ -1196,7 +1196,7 @@ int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca)
 
 		if (!(cap & IOAT_CAP_XOR)) {
 			dma->max_xor = 8;
-			dma->xor_align = 2;
+			dma->xor_align = 6;
 
 			dma_cap_set(DMA_XOR, dma->cap_mask);
 			dma->device_prep_dma_xor = ioat3_prep_pqxor;
-- 
cgit