From b56f5cbc7e08ec7d31c42fc41e5247677f20b143 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 14 Feb 2017 21:51:01 +0000
Subject: crypto: arm/aes-neonbs - resolve fallback cipher at runtime

Currently, the bit sliced NEON AES code for ARM has a link time
dependency on the scalar ARM asm implementation, which it uses as a
fallback to perform CBC encryption and the encryption of the initial
XTS tweak.

The bit sliced NEON code is both fast and time invariant, which makes
it a reasonable default on hardware that supports it. However, the
ARM asm code it pulls in is not time invariant, and due to the way it
is linked in, cannot be overridden by the new generic time invariant
driver. In fact, it will not be used at all, given that the ARM asm
code registers itself as a cipher with a priority that exceeds the
priority of the fixed time cipher.

So remove the link time dependency, and allocate the fallback cipher
via the crypto API. Note that this requires this driver's module_init
call to be replaced with late_initcall, so that the (possibly generic)
fallback cipher is guaranteed to be available when the builtin test
is performed at registration time.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/crypto/Kconfig           |  2 +-
 arch/arm/crypto/aes-neonbs-glue.c | 60 +++++++++++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index a8fce93137fb..b9adedcc5b2e 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -73,7 +73,7 @@ config CRYPTO_AES_ARM_BS
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_SIMD
-	select CRYPTO_AES_ARM
+	select CRYPTO_AES
 	help
 	  Use a faster and more secure NEON based implementation of AES in CBC,
 	  CTR and XTS modes
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index 2920b96dbd36..c76377961444 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -42,9 +42,6 @@ asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
 asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
 				  int rounds, int blocks, u8 iv[]);
 
-asmlinkage void __aes_arm_encrypt(const u32 rk[], int rounds, const u8 in[],
-				  u8 out[]);
-
 struct aesbs_ctx {
 	int	rounds;
 	u8	rk[13 * (8 * AES_BLOCK_SIZE) + 32] __aligned(AES_BLOCK_SIZE);
@@ -52,12 +49,12 @@ struct aesbs_ctx {
 
 struct aesbs_cbc_ctx {
 	struct aesbs_ctx	key;
-	u32			enc[AES_MAX_KEYLENGTH_U32];
+	struct crypto_cipher	*enc_tfm;
 };
 
 struct aesbs_xts_ctx {
 	struct aesbs_ctx	key;
-	u32			twkey[AES_MAX_KEYLENGTH_U32];
+	struct crypto_cipher	*tweak_tfm;
 };
 
 static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
@@ -132,20 +129,18 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 
 	ctx->key.rounds = 6 + key_len / 4;
 
-	memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
-
 	kernel_neon_begin();
 	aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
 	kernel_neon_end();
 
-	return 0;
+	return crypto_cipher_setkey(ctx->enc_tfm, in_key, key_len);
 }
 
 static void cbc_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
 {
 	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-	__aes_arm_encrypt(ctx->enc, ctx->key.rounds, src, dst);
+	crypto_cipher_encrypt_one(ctx->enc_tfm, dst, src);
 }
 
 static int cbc_encrypt(struct skcipher_request *req)
@@ -181,6 +176,23 @@ static int cbc_decrypt(struct skcipher_request *req)
 	return err;
 }
 
+static int cbc_init(struct crypto_tfm *tfm)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->enc_tfm = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(ctx->enc_tfm))
+		return PTR_ERR(ctx->enc_tfm);
+	return 0;
+}
+
+static void cbc_exit(struct crypto_tfm *tfm)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	crypto_free_cipher(ctx->enc_tfm);
+}
+
 static int ctr_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -228,7 +240,6 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 			    unsigned int key_len)
 {
 	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct crypto_aes_ctx rk;
 	int err;
 
 	err = xts_verify_key(tfm, in_key, key_len);
@@ -236,15 +247,30 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 		return err;
 
 	key_len /= 2;
-	err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
+	err = crypto_cipher_setkey(ctx->tweak_tfm, in_key + key_len, key_len);
 	if (err)
 		return err;
 
-	memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
-
 	return aesbs_setkey(tfm, in_key, key_len);
 }
 
+static int xts_init(struct crypto_tfm *tfm)
+{
+	struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->tweak_tfm = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(ctx->tweak_tfm))
+		return PTR_ERR(ctx->tweak_tfm);
+	return 0;
+}
+
+static void xts_exit(struct crypto_tfm *tfm)
+{
+	struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	crypto_free_cipher(ctx->tweak_tfm);
+}
+
 static int __xts_crypt(struct skcipher_request *req,
 		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
 				  int rounds, int blocks, u8 iv[]))
@@ -256,7 +282,7 @@ static int __xts_crypt(struct skcipher_request *req,
 
 	err = skcipher_walk_virt(&walk, req, true);
 
-	__aes_arm_encrypt(ctx->twkey, ctx->key.rounds, walk.iv, walk.iv);
+	crypto_cipher_encrypt_one(ctx->tweak_tfm, walk.iv, walk.iv);
 
 	kernel_neon_begin();
 	while (walk.nbytes >= AES_BLOCK_SIZE) {
@@ -309,6 +335,8 @@ static struct skcipher_alg aes_algs[] = { {
 	.base.cra_ctxsize	= sizeof(struct aesbs_cbc_ctx),
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+	.base.cra_init		= cbc_init,
+	.base.cra_exit		= cbc_exit,
 
 	.min_keysize		= AES_MIN_KEY_SIZE,
 	.max_keysize		= AES_MAX_KEY_SIZE,
@@ -342,6 +370,8 @@ static struct skcipher_alg aes_algs[] = { {
 	.base.cra_ctxsize	= sizeof(struct aesbs_xts_ctx),
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+	.base.cra_init		= xts_init,
+	.base.cra_exit		= xts_exit,
 
 	.min_keysize		= 2 * AES_MIN_KEY_SIZE,
 	.max_keysize		= 2 * AES_MAX_KEY_SIZE,
@@ -402,5 +432,5 @@ unregister_simds:
 	return err;
 }
 
-module_init(aes_init);
+late_initcall(aes_init);
 module_exit(aes_exit);
-- 
cgit 


From 1b3f6d148692ef3c3c96af1647083ff38d429e46 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 22 Feb 2017 08:00:55 +0100
Subject: ARM64: dts: meson-gx: add clock CLKID_RNG0 to hwrng node

Add clock CLKID_RNG0 to HW random number generator node.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm64/boot/dts/amlogic/meson-gx.dtsi   | 2 +-
 arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
index 5d995f7724af..620495a43363 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
@@ -380,7 +380,7 @@
 			#size-cells = <2>;
 			ranges = <0x0 0x0 0x0 0xc8834000 0x0 0x2000>;
 
-			rng {
+			hwrng: rng {
 				compatible = "amlogic,meson-rng";
 				reg = <0x0 0x0 0x0 0x4>;
 			};
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
index 04b3324bc132..a375cb21cc8b 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
@@ -524,3 +524,8 @@
 &vpu {
 	compatible = "amlogic,meson-gxbb-vpu", "amlogic,meson-gx-vpu";
 };
+
+&hwrng {
+	clocks = <&clkc CLKID_RNG0>;
+	clock-names = "core";
+};
-- 
cgit 


From de696a26435ae50f06fd913bc84e0bf602caee1f Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Wed, 15 Mar 2017 23:37:34 +1100
Subject: crypto: powerpc - Factor out the core CRC vpmsum algorithm

The core nuts and bolts of the crc32c vpmsum algorithm will
also work for a number of other CRC algorithms with different
polynomials. Factor out the function into a new asm file.

To handle multiple users of the function, a user simply
provides constants, defines the name of their CRC function,
and then #includes the core algorithm file.

Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/powerpc/crypto/crc32-vpmsum_core.S | 726 ++++++++++++++++++++++++++++++++
 arch/powerpc/crypto/crc32c-vpmsum_asm.S | 714 +------------------------------
 2 files changed, 729 insertions(+), 711 deletions(-)
 create mode 100644 arch/powerpc/crypto/crc32-vpmsum_core.S

(limited to 'arch')

diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S
new file mode 100644
index 000000000000..7c6be6a5c977
--- /dev/null
+++ b/arch/powerpc/crypto/crc32-vpmsum_core.S
@@ -0,0 +1,726 @@
+/*
+ * Core of the accelerated CRC algorithm.
+ * In your file, define the constants and CRC_FUNCTION_NAME
+ * Then include this file.
+ *
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+*/
+
+#include <asm/ppc_asm.h>
+#include <asm/ppc-opcode.h>
+
+#define MAX_SIZE	32768
+
+	.text
+
+#if defined(__BIG_ENDIAN__)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16		r25
+#define off32		r26
+#define off48		r27
+#define off64		r28
+#define off80		r29
+#define off96		r30
+#define off112		r31
+
+#define const1		v24
+#define const2		v25
+
+#define byteswap	v26
+#define	mask_32bit	v27
+#define	mask_64bit	v28
+#define zeroes		v29
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm	A, B, C, D
+#else
+#define VPERM(A, B, C, D)
+#endif
+
+/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
+FUNC_START(CRC_FUNCTION_NAME)
+	std	r31,-8(r1)
+	std	r30,-16(r1)
+	std	r29,-24(r1)
+	std	r28,-32(r1)
+	std	r27,-40(r1)
+	std	r26,-48(r1)
+	std	r25,-56(r1)
+
+	li	off16,16
+	li	off32,32
+	li	off48,48
+	li	off64,64
+	li	off80,80
+	li	off96,96
+	li	off112,112
+	li	r0,0
+
+	/* Enough room for saving 10 non volatile VMX registers */
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16
+
+	stvx	v20,0,r6
+	stvx	v21,off16,r6
+	stvx	v22,off32,r6
+	stvx	v23,off48,r6
+	stvx	v24,off64,r6
+	stvx	v25,off80,r6
+	stvx	v26,off96,r6
+	stvx	v27,off112,r6
+	stvx	v28,0,r7
+	stvx	v29,off16,r7
+
+	mr	r10,r3
+
+	vxor	zeroes,zeroes,zeroes
+	vspltisw v0,-1
+
+	vsldoi	mask_32bit,zeroes,v0,4
+	vsldoi	mask_64bit,zeroes,v0,8
+
+	/* Get the initial value into v8 */
+	vxor	v8,v8,v8
+	MTVRD(v8, R3)
+	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
+
+#ifdef BYTESWAP_DATA
+	addis	r3,r2,.byteswap_constant@toc@ha
+	addi	r3,r3,.byteswap_constant@toc@l
+
+	lvx	byteswap,0,r3
+	addi	r3,r3,16
+#endif
+
+	cmpdi	r5,256
+	blt	.Lshort
+
+	rldicr	r6,r5,0,56
+
+	/* Checksum in blocks of MAX_SIZE */
+1:	lis	r7,MAX_SIZE@h
+	ori	r7,r7,MAX_SIZE@l
+	mr	r9,r7
+	cmpd	r6,r7
+	bgt	2f
+	mr	r7,r6
+2:	subf	r6,r7,r6
+
+	/* our main loop does 128 bytes at a time */
+	srdi	r7,r7,7
+
+	/*
+	 * Work out the offset into the constants table to start at. Each
+	 * constant is 16 bytes, and it is used against 128 bytes of input
+	 * data - 128 / 16 = 8
+	 */
+	sldi	r8,r7,4
+	srdi	r9,r9,3
+	subf	r8,r8,r9
+
+	/* We reduce our final 128 bytes in a separate step */
+	addi	r7,r7,-1
+	mtctr	r7
+
+	addis	r3,r2,.constants@toc@ha
+	addi	r3,r3,.constants@toc@l
+
+	/* Find the start of our constants */
+	add	r3,r3,r8
+
+	/* zero v0-v7 which will contain our checksums */
+	vxor	v0,v0,v0
+	vxor	v1,v1,v1
+	vxor	v2,v2,v2
+	vxor	v3,v3,v3
+	vxor	v4,v4,v4
+	vxor	v5,v5,v5
+	vxor	v6,v6,v6
+	vxor	v7,v7,v7
+
+	lvx	const1,0,r3
+
+	/*
+	 * If we are looping back to consume more data we use the values
+	 * already in v16-v23.
+	 */
+	cmpdi	r0,1
+	beq	2f
+
+	/* First warm up pass */
+	lvx	v16,0,r4
+	lvx	v17,off16,r4
+	VPERM(v16,v16,v16,byteswap)
+	VPERM(v17,v17,v17,byteswap)
+	lvx	v18,off32,r4
+	lvx	v19,off48,r4
+	VPERM(v18,v18,v18,byteswap)
+	VPERM(v19,v19,v19,byteswap)
+	lvx	v20,off64,r4
+	lvx	v21,off80,r4
+	VPERM(v20,v20,v20,byteswap)
+	VPERM(v21,v21,v21,byteswap)
+	lvx	v22,off96,r4
+	lvx	v23,off112,r4
+	VPERM(v22,v22,v22,byteswap)
+	VPERM(v23,v23,v23,byteswap)
+	addi	r4,r4,8*16
+
+	/* xor in initial value */
+	vxor	v16,v16,v8
+
+2:	bdz	.Lfirst_warm_up_done
+
+	addi	r3,r3,16
+	lvx	const2,0,r3
+
+	/* Second warm up pass */
+	VPMSUMD(v8,v16,const1)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v9,v17,const1)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v10,v18,const1)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v11,v19,const1)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdz	.Lfirst_cool_down
+
+	/*
+	 * main loop. We modulo schedule it such that it takes three iterations
+	 * to complete - first iteration load, second iteration vpmsum, third
+	 * iteration xor.
+	 */
+	.balign	16
+4:	lvx	const1,0,r3
+	addi	r3,r3,16
+	ori	r2,r2,0
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const2)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const2)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const2)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const2)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	lvx	const2,0,r3
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdnz	4b
+
+.Lfirst_cool_down:
+	/* First cool down pass */
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const1)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const1)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const1)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const1)
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	ori	r2,r2,0
+
+.Lsecond_cool_down:
+	/* Second cool down pass */
+	vxor	v0,v0,v8
+	vxor	v1,v1,v9
+	vxor	v2,v2,v10
+	vxor	v3,v3,v11
+	vxor	v4,v4,v12
+	vxor	v5,v5,v13
+	vxor	v6,v6,v14
+	vxor	v7,v7,v15
+
+	/*
+	 * vpmsumd produces a 96 bit result in the least significant bits
+	 * of the register. Since we are bit reflected we have to shift it
+	 * left 32 bits so it occupies the least significant bits in the
+	 * bit reflected domain.
+	 */
+	vsldoi	v0,v0,zeroes,4
+	vsldoi	v1,v1,zeroes,4
+	vsldoi	v2,v2,zeroes,4
+	vsldoi	v3,v3,zeroes,4
+	vsldoi	v4,v4,zeroes,4
+	vsldoi	v5,v5,zeroes,4
+	vsldoi	v6,v6,zeroes,4
+	vsldoi	v7,v7,zeroes,4
+
+	/* xor with last 1024 bits */
+	lvx	v8,0,r4
+	lvx	v9,off16,r4
+	VPERM(v8,v8,v8,byteswap)
+	VPERM(v9,v9,v9,byteswap)
+	lvx	v10,off32,r4
+	lvx	v11,off48,r4
+	VPERM(v10,v10,v10,byteswap)
+	VPERM(v11,v11,v11,byteswap)
+	lvx	v12,off64,r4
+	lvx	v13,off80,r4
+	VPERM(v12,v12,v12,byteswap)
+	VPERM(v13,v13,v13,byteswap)
+	lvx	v14,off96,r4
+	lvx	v15,off112,r4
+	VPERM(v14,v14,v14,byteswap)
+	VPERM(v15,v15,v15,byteswap)
+
+	addi	r4,r4,8*16
+
+	vxor	v16,v0,v8
+	vxor	v17,v1,v9
+	vxor	v18,v2,v10
+	vxor	v19,v3,v11
+	vxor	v20,v4,v12
+	vxor	v21,v5,v13
+	vxor	v22,v6,v14
+	vxor	v23,v7,v15
+
+	li	r0,1
+	cmpdi	r6,0
+	addi	r6,r6,128
+	bne	1b
+
+	/* Work out how many bytes we have left */
+	andi.	r5,r5,127
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,128
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks are in the tail */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	/*
+	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
+	 * 32 bits to include the trailing 32 bits of zeros
+	 */
+	lvx	v0,0,r3
+	lvx	v1,off16,r3
+	lvx	v2,off32,r3
+	lvx	v3,off48,r3
+	lvx	v4,off64,r3
+	lvx	v5,off80,r3
+	lvx	v6,off96,r3
+	lvx	v7,off112,r3
+	addi	r3,r3,8*16
+
+	VPMSUMW(v0,v16,v0)
+	VPMSUMW(v1,v17,v1)
+	VPMSUMW(v2,v18,v2)
+	VPMSUMW(v3,v19,v3)
+	VPMSUMW(v4,v20,v4)
+	VPMSUMW(v5,v21,v5)
+	VPMSUMW(v6,v22,v6)
+	VPMSUMW(v7,v23,v7)
+
+	/* Now reduce the tail (0 - 112 bytes) */
+	cmpdi	r7,0
+	beq	1f
+
+	lvx	v16,0,r4
+	lvx	v17,0,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off32,r4
+	lvx	v17,off32,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off64,r4
+	lvx	v17,off64,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off96,r4
+	lvx	v17,off96,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+
+	/* Now xor all the parallel chunks together */
+1:	vxor	v0,v0,v1
+	vxor	v2,v2,v3
+	vxor	v4,v4,v5
+	vxor	v6,v6,v7
+
+	vxor	v0,v0,v2
+	vxor	v4,v4,v6
+
+	vxor	v0,v0,v4
+
+.Lbarrett_reduction:
+	/* Barrett constants */
+	addis	r3,r2,.barrett_constants@toc@ha
+	addi	r3,r3,.barrett_constants@toc@l
+
+	lvx	const1,0,r3
+	lvx	const2,off16,r3
+
+	vsldoi	v1,v0,v0,8
+	vxor	v0,v0,v1		/* xor two 64 bit results together */
+
+	/* shift left one bit */
+	vspltisb v1,1
+	vsl	v0,v0,v1
+
+	vand	v0,v0,mask_64bit
+
+	/*
+	 * The reflected version of Barrett reduction. Instead of bit
+	 * reflecting our data (which is expensive to do), we bit reflect our
+	 * constants and our algorithm, which means the intermediate data in
+	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
+	 * the algorithm because we don't carry in mod 2 arithmetic.
+	 */
+	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
+	VPMSUMD(v1,v1,const1)		/* ma */
+	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
+	VPMSUMD(v1,v1,const2)		/* qn */
+	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Since we are bit reflected, the result (ie the low 32 bits) is in
+	 * the high 32 bits. We just need to shift it left 4 bytes
+	 * V0 [ 0 1 X 3 ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */
+
+	/* Get it into r3 */
+	MFVRD(R3, v0)
+
+.Lout:
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16
+
+	lvx	v20,0,r6
+	lvx	v21,off16,r6
+	lvx	v22,off32,r6
+	lvx	v23,off48,r6
+	lvx	v24,off64,r6
+	lvx	v25,off80,r6
+	lvx	v26,off96,r6
+	lvx	v27,off112,r6
+	lvx	v28,0,r7
+	lvx	v29,off16,r7
+
+	ld	r31,-8(r1)
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+	ld	r26,-48(r1)
+	ld	r25,-56(r1)
+
+	blr
+
+.Lfirst_warm_up_done:
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	VPMSUMD(v8,v16,const1)
+	VPMSUMD(v9,v17,const1)
+	VPMSUMD(v10,v18,const1)
+	VPMSUMD(v11,v19,const1)
+	VPMSUMD(v12,v20,const1)
+	VPMSUMD(v13,v21,const1)
+	VPMSUMD(v14,v22,const1)
+	VPMSUMD(v15,v23,const1)
+
+	b	.Lsecond_cool_down
+
+.Lshort:
+	cmpdi	r5,0
+	beq	.Lzero
+
+	addis	r3,r2,.short_constants@toc@ha
+	addi	r3,r3,.short_constants@toc@l
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,256
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks? */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	vxor	v19,v19,v19
+	vxor	v20,v20,v20
+
+	lvx	v0,0,r4
+	lvx	v16,0,r3
+	VPERM(v0,v0,v16,byteswap)
+	vxor	v0,v0,v8	/* xor in initial value */
+	VPMSUMW(v0,v0,v16)
+	bdz	.Lv0
+
+	lvx	v1,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v1,v1,v17,byteswap)
+	VPMSUMW(v1,v1,v17)
+	bdz	.Lv1
+
+	lvx	v2,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v2,v2,v16,byteswap)
+	VPMSUMW(v2,v2,v16)
+	bdz	.Lv2
+
+	lvx	v3,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v3,v3,v17,byteswap)
+	VPMSUMW(v3,v3,v17)
+	bdz	.Lv3
+
+	lvx	v4,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v4,v4,v16,byteswap)
+	VPMSUMW(v4,v4,v16)
+	bdz	.Lv4
+
+	lvx	v5,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v5,v5,v17,byteswap)
+	VPMSUMW(v5,v5,v17)
+	bdz	.Lv5
+
+	lvx	v6,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v6,v6,v16,byteswap)
+	VPMSUMW(v6,v6,v16)
+	bdz	.Lv6
+
+	lvx	v7,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v7,v7,v17,byteswap)
+	VPMSUMW(v7,v7,v17)
+	bdz	.Lv7
+
+	addi	r3,r3,128
+	addi	r4,r4,128
+
+	lvx	v8,0,r4
+	lvx	v16,0,r3
+	VPERM(v8,v8,v16,byteswap)
+	VPMSUMW(v8,v8,v16)
+	bdz	.Lv8
+
+	lvx	v9,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v9,v9,v17,byteswap)
+	VPMSUMW(v9,v9,v17)
+	bdz	.Lv9
+
+	lvx	v10,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v10,v10,v16,byteswap)
+	VPMSUMW(v10,v10,v16)
+	bdz	.Lv10
+
+	lvx	v11,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v11,v11,v17,byteswap)
+	VPMSUMW(v11,v11,v17)
+	bdz	.Lv11
+
+	lvx	v12,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v12,v12,v16,byteswap)
+	VPMSUMW(v12,v12,v16)
+	bdz	.Lv12
+
+	lvx	v13,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v13,v13,v17,byteswap)
+	VPMSUMW(v13,v13,v17)
+	bdz	.Lv13
+
+	lvx	v14,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v14,v14,v16,byteswap)
+	VPMSUMW(v14,v14,v16)
+	bdz	.Lv14
+
+	lvx	v15,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v15,v15,v17,byteswap)
+	VPMSUMW(v15,v15,v17)
+
+.Lv15:	vxor	v19,v19,v15
+.Lv14:	vxor	v20,v20,v14
+.Lv13:	vxor	v19,v19,v13
+.Lv12:	vxor	v20,v20,v12
+.Lv11:	vxor	v19,v19,v11
+.Lv10:	vxor	v20,v20,v10
+.Lv9:	vxor	v19,v19,v9
+.Lv8:	vxor	v20,v20,v8
+.Lv7:	vxor	v19,v19,v7
+.Lv6:	vxor	v20,v20,v6
+.Lv5:	vxor	v19,v19,v5
+.Lv4:	vxor	v20,v20,v4
+.Lv3:	vxor	v19,v19,v3
+.Lv2:	vxor	v20,v20,v2
+.Lv1:	vxor	v19,v19,v1
+.Lv0:	vxor	v20,v20,v0
+
+	vxor	v0,v19,v20
+
+	b	.Lbarrett_reduction
+
+.Lzero:
+	mr	r3,r10
+	b	.Lout
+
+FUNC_END(CRC_FUNCTION_NAME)
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_asm.S b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
index dc640b212299..c0d080caefc1 100644
--- a/arch/powerpc/crypto/crc32c-vpmsum_asm.S
+++ b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
@@ -1,20 +1,5 @@
 /*
- * Calculate the checksum of data that is 16 byte aligned and a multiple of
- * 16 bytes.
- *
- * The first step is to reduce it to 1024 bits. We do this in 8 parallel
- * chunks in order to mask the latency of the vpmsum instructions. If we
- * have more than 32 kB of data to checksum we repeat this step multiple
- * times, passing in the previous 1024 bits.
- *
- * The next step is to reduce the 1024 bits to 64 bits. This step adds
- * 32 bits of 0s to the end - this matches what a CRC does. We just
- * calculate constants that land the data in this 32 bits.
- *
- * We then use fixed point Barrett reduction to compute a mod n over GF(2)
- * for n = CRC using POWER8 instructions. We use x = 32.
- *
- * http://en.wikipedia.org/wiki/Barrett_reduction
+ * Calculate a crc32c with vpmsum acceleration
  *
  * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
  *
@@ -23,9 +8,6 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
-#include <asm/ppc_asm.h>
-#include <asm/ppc-opcode.h>
-
 	.section	.rodata
 .balign 16
 
@@ -33,7 +15,6 @@
 	/* byte reverse permute constant */
 	.octa 0x0F0E0D0C0B0A09080706050403020100
 
-#define MAX_SIZE	32768
 .constants:
 
 	/* Reduce 262144 kbits to 1024 bits */
@@ -860,694 +841,5 @@
 	/* 33 bit reflected Barrett constant n */
 	.octa 0x00000000000000000000000105ec76f1
 
-	.text
-
-#if defined(__BIG_ENDIAN__)
-#define BYTESWAP_DATA
-#else
-#undef BYTESWAP_DATA
-#endif
-
-#define off16		r25
-#define off32		r26
-#define off48		r27
-#define off64		r28
-#define off80		r29
-#define off96		r30
-#define off112		r31
-
-#define const1		v24
-#define const2		v25
-
-#define byteswap	v26
-#define	mask_32bit	v27
-#define	mask_64bit	v28
-#define zeroes		v29
-
-#ifdef BYTESWAP_DATA
-#define VPERM(A, B, C, D) vperm	A, B, C, D
-#else
-#define VPERM(A, B, C, D)
-#endif
-
-/* unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len) */
-FUNC_START(__crc32c_vpmsum)
-	std	r31,-8(r1)
-	std	r30,-16(r1)
-	std	r29,-24(r1)
-	std	r28,-32(r1)
-	std	r27,-40(r1)
-	std	r26,-48(r1)
-	std	r25,-56(r1)
-
-	li	off16,16
-	li	off32,32
-	li	off48,48
-	li	off64,64
-	li	off80,80
-	li	off96,96
-	li	off112,112
-	li	r0,0
-
-	/* Enough room for saving 10 non volatile VMX registers */
-	subi	r6,r1,56+10*16
-	subi	r7,r1,56+2*16
-
-	stvx	v20,0,r6
-	stvx	v21,off16,r6
-	stvx	v22,off32,r6
-	stvx	v23,off48,r6
-	stvx	v24,off64,r6
-	stvx	v25,off80,r6
-	stvx	v26,off96,r6
-	stvx	v27,off112,r6
-	stvx	v28,0,r7
-	stvx	v29,off16,r7
-
-	mr	r10,r3
-
-	vxor	zeroes,zeroes,zeroes
-	vspltisw v0,-1
-
-	vsldoi	mask_32bit,zeroes,v0,4
-	vsldoi	mask_64bit,zeroes,v0,8
-
-	/* Get the initial value into v8 */
-	vxor	v8,v8,v8
-	MTVRD(v8, R3)
-	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
-
-#ifdef BYTESWAP_DATA
-	addis	r3,r2,.byteswap_constant@toc@ha
-	addi	r3,r3,.byteswap_constant@toc@l
-
-	lvx	byteswap,0,r3
-	addi	r3,r3,16
-#endif
-
-	cmpdi	r5,256
-	blt	.Lshort
-
-	rldicr	r6,r5,0,56
-
-	/* Checksum in blocks of MAX_SIZE */
-1:	lis	r7,MAX_SIZE@h
-	ori	r7,r7,MAX_SIZE@l
-	mr	r9,r7
-	cmpd	r6,r7
-	bgt	2f
-	mr	r7,r6
-2:	subf	r6,r7,r6
-
-	/* our main loop does 128 bytes at a time */
-	srdi	r7,r7,7
-
-	/*
-	 * Work out the offset into the constants table to start at. Each
-	 * constant is 16 bytes, and it is used against 128 bytes of input
-	 * data - 128 / 16 = 8
-	 */
-	sldi	r8,r7,4
-	srdi	r9,r9,3
-	subf	r8,r8,r9
-
-	/* We reduce our final 128 bytes in a separate step */
-	addi	r7,r7,-1
-	mtctr	r7
-
-	addis	r3,r2,.constants@toc@ha
-	addi	r3,r3,.constants@toc@l
-
-	/* Find the start of our constants */
-	add	r3,r3,r8
-
-	/* zero v0-v7 which will contain our checksums */
-	vxor	v0,v0,v0
-	vxor	v1,v1,v1
-	vxor	v2,v2,v2
-	vxor	v3,v3,v3
-	vxor	v4,v4,v4
-	vxor	v5,v5,v5
-	vxor	v6,v6,v6
-	vxor	v7,v7,v7
-
-	lvx	const1,0,r3
-
-	/*
-	 * If we are looping back to consume more data we use the values
-	 * already in v16-v23.
-	 */
-	cmpdi	r0,1
-	beq	2f
-
-	/* First warm up pass */
-	lvx	v16,0,r4
-	lvx	v17,off16,r4
-	VPERM(v16,v16,v16,byteswap)
-	VPERM(v17,v17,v17,byteswap)
-	lvx	v18,off32,r4
-	lvx	v19,off48,r4
-	VPERM(v18,v18,v18,byteswap)
-	VPERM(v19,v19,v19,byteswap)
-	lvx	v20,off64,r4
-	lvx	v21,off80,r4
-	VPERM(v20,v20,v20,byteswap)
-	VPERM(v21,v21,v21,byteswap)
-	lvx	v22,off96,r4
-	lvx	v23,off112,r4
-	VPERM(v22,v22,v22,byteswap)
-	VPERM(v23,v23,v23,byteswap)
-	addi	r4,r4,8*16
-
-	/* xor in initial value */
-	vxor	v16,v16,v8
-
-2:	bdz	.Lfirst_warm_up_done
-
-	addi	r3,r3,16
-	lvx	const2,0,r3
-
-	/* Second warm up pass */
-	VPMSUMD(v8,v16,const1)
-	lvx	v16,0,r4
-	VPERM(v16,v16,v16,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v9,v17,const1)
-	lvx	v17,off16,r4
-	VPERM(v17,v17,v17,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v10,v18,const1)
-	lvx	v18,off32,r4
-	VPERM(v18,v18,v18,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v11,v19,const1)
-	lvx	v19,off48,r4
-	VPERM(v19,v19,v19,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v12,v20,const1)
-	lvx	v20,off64,r4
-	VPERM(v20,v20,v20,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v13,v21,const1)
-	lvx	v21,off80,r4
-	VPERM(v21,v21,v21,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v14,v22,const1)
-	lvx	v22,off96,r4
-	VPERM(v22,v22,v22,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v15,v23,const1)
-	lvx	v23,off112,r4
-	VPERM(v23,v23,v23,byteswap)
-
-	addi	r4,r4,8*16
-
-	bdz	.Lfirst_cool_down
-
-	/*
-	 * main loop. We modulo schedule it such that it takes three iterations
-	 * to complete - first iteration load, second iteration vpmsum, third
-	 * iteration xor.
-	 */
-	.balign	16
-4:	lvx	const1,0,r3
-	addi	r3,r3,16
-	ori	r2,r2,0
-
-	vxor	v0,v0,v8
-	VPMSUMD(v8,v16,const2)
-	lvx	v16,0,r4
-	VPERM(v16,v16,v16,byteswap)
-	ori	r2,r2,0
-
-	vxor	v1,v1,v9
-	VPMSUMD(v9,v17,const2)
-	lvx	v17,off16,r4
-	VPERM(v17,v17,v17,byteswap)
-	ori	r2,r2,0
-
-	vxor	v2,v2,v10
-	VPMSUMD(v10,v18,const2)
-	lvx	v18,off32,r4
-	VPERM(v18,v18,v18,byteswap)
-	ori	r2,r2,0
-
-	vxor	v3,v3,v11
-	VPMSUMD(v11,v19,const2)
-	lvx	v19,off48,r4
-	VPERM(v19,v19,v19,byteswap)
-	lvx	const2,0,r3
-	ori	r2,r2,0
-
-	vxor	v4,v4,v12
-	VPMSUMD(v12,v20,const1)
-	lvx	v20,off64,r4
-	VPERM(v20,v20,v20,byteswap)
-	ori	r2,r2,0
-
-	vxor	v5,v5,v13
-	VPMSUMD(v13,v21,const1)
-	lvx	v21,off80,r4
-	VPERM(v21,v21,v21,byteswap)
-	ori	r2,r2,0
-
-	vxor	v6,v6,v14
-	VPMSUMD(v14,v22,const1)
-	lvx	v22,off96,r4
-	VPERM(v22,v22,v22,byteswap)
-	ori	r2,r2,0
-
-	vxor	v7,v7,v15
-	VPMSUMD(v15,v23,const1)
-	lvx	v23,off112,r4
-	VPERM(v23,v23,v23,byteswap)
-
-	addi	r4,r4,8*16
-
-	bdnz	4b
-
-.Lfirst_cool_down:
-	/* First cool down pass */
-	lvx	const1,0,r3
-	addi	r3,r3,16
-
-	vxor	v0,v0,v8
-	VPMSUMD(v8,v16,const1)
-	ori	r2,r2,0
-
-	vxor	v1,v1,v9
-	VPMSUMD(v9,v17,const1)
-	ori	r2,r2,0
-
-	vxor	v2,v2,v10
-	VPMSUMD(v10,v18,const1)
-	ori	r2,r2,0
-
-	vxor	v3,v3,v11
-	VPMSUMD(v11,v19,const1)
-	ori	r2,r2,0
-
-	vxor	v4,v4,v12
-	VPMSUMD(v12,v20,const1)
-	ori	r2,r2,0
-
-	vxor	v5,v5,v13
-	VPMSUMD(v13,v21,const1)
-	ori	r2,r2,0
-
-	vxor	v6,v6,v14
-	VPMSUMD(v14,v22,const1)
-	ori	r2,r2,0
-
-	vxor	v7,v7,v15
-	VPMSUMD(v15,v23,const1)
-	ori	r2,r2,0
-
-.Lsecond_cool_down:
-	/* Second cool down pass */
-	vxor	v0,v0,v8
-	vxor	v1,v1,v9
-	vxor	v2,v2,v10
-	vxor	v3,v3,v11
-	vxor	v4,v4,v12
-	vxor	v5,v5,v13
-	vxor	v6,v6,v14
-	vxor	v7,v7,v15
-
-	/*
-	 * vpmsumd produces a 96 bit result in the least significant bits
-	 * of the register. Since we are bit reflected we have to shift it
-	 * left 32 bits so it occupies the least significant bits in the
-	 * bit reflected domain.
-	 */
-	vsldoi	v0,v0,zeroes,4
-	vsldoi	v1,v1,zeroes,4
-	vsldoi	v2,v2,zeroes,4
-	vsldoi	v3,v3,zeroes,4
-	vsldoi	v4,v4,zeroes,4
-	vsldoi	v5,v5,zeroes,4
-	vsldoi	v6,v6,zeroes,4
-	vsldoi	v7,v7,zeroes,4
-
-	/* xor with last 1024 bits */
-	lvx	v8,0,r4
-	lvx	v9,off16,r4
-	VPERM(v8,v8,v8,byteswap)
-	VPERM(v9,v9,v9,byteswap)
-	lvx	v10,off32,r4
-	lvx	v11,off48,r4
-	VPERM(v10,v10,v10,byteswap)
-	VPERM(v11,v11,v11,byteswap)
-	lvx	v12,off64,r4
-	lvx	v13,off80,r4
-	VPERM(v12,v12,v12,byteswap)
-	VPERM(v13,v13,v13,byteswap)
-	lvx	v14,off96,r4
-	lvx	v15,off112,r4
-	VPERM(v14,v14,v14,byteswap)
-	VPERM(v15,v15,v15,byteswap)
-
-	addi	r4,r4,8*16
-
-	vxor	v16,v0,v8
-	vxor	v17,v1,v9
-	vxor	v18,v2,v10
-	vxor	v19,v3,v11
-	vxor	v20,v4,v12
-	vxor	v21,v5,v13
-	vxor	v22,v6,v14
-	vxor	v23,v7,v15
-
-	li	r0,1
-	cmpdi	r6,0
-	addi	r6,r6,128
-	bne	1b
-
-	/* Work out how many bytes we have left */
-	andi.	r5,r5,127
-
-	/* Calculate where in the constant table we need to start */
-	subfic	r6,r5,128
-	add	r3,r3,r6
-
-	/* How many 16 byte chunks are in the tail */
-	srdi	r7,r5,4
-	mtctr	r7
-
-	/*
-	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
-	 * 32 bits to include the trailing 32 bits of zeros
-	 */
-	lvx	v0,0,r3
-	lvx	v1,off16,r3
-	lvx	v2,off32,r3
-	lvx	v3,off48,r3
-	lvx	v4,off64,r3
-	lvx	v5,off80,r3
-	lvx	v6,off96,r3
-	lvx	v7,off112,r3
-	addi	r3,r3,8*16
-
-	VPMSUMW(v0,v16,v0)
-	VPMSUMW(v1,v17,v1)
-	VPMSUMW(v2,v18,v2)
-	VPMSUMW(v3,v19,v3)
-	VPMSUMW(v4,v20,v4)
-	VPMSUMW(v5,v21,v5)
-	VPMSUMW(v6,v22,v6)
-	VPMSUMW(v7,v23,v7)
-
-	/* Now reduce the tail (0 - 112 bytes) */
-	cmpdi	r7,0
-	beq	1f
-
-	lvx	v16,0,r4
-	lvx	v17,0,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off16,r4
-	lvx	v17,off16,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off32,r4
-	lvx	v17,off32,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off48,r4
-	lvx	v17,off48,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off64,r4
-	lvx	v17,off64,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off80,r4
-	lvx	v17,off80,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off96,r4
-	lvx	v17,off96,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-
-	/* Now xor all the parallel chunks together */
-1:	vxor	v0,v0,v1
-	vxor	v2,v2,v3
-	vxor	v4,v4,v5
-	vxor	v6,v6,v7
-
-	vxor	v0,v0,v2
-	vxor	v4,v4,v6
-
-	vxor	v0,v0,v4
-
-.Lbarrett_reduction:
-	/* Barrett constants */
-	addis	r3,r2,.barrett_constants@toc@ha
-	addi	r3,r3,.barrett_constants@toc@l
-
-	lvx	const1,0,r3
-	lvx	const2,off16,r3
-
-	vsldoi	v1,v0,v0,8
-	vxor	v0,v0,v1		/* xor two 64 bit results together */
-
-	/* shift left one bit */
-	vspltisb v1,1
-	vsl	v0,v0,v1
-
-	vand	v0,v0,mask_64bit
-
-	/*
-	 * The reflected version of Barrett reduction. Instead of bit
-	 * reflecting our data (which is expensive to do), we bit reflect our
-	 * constants and our algorithm, which means the intermediate data in
-	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
-	 * the algorithm because we don't carry in mod 2 arithmetic.
-	 */
-	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
-	VPMSUMD(v1,v1,const1)		/* ma */
-	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
-	VPMSUMD(v1,v1,const2)		/* qn */
-	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
-
-	/*
-	 * Since we are bit reflected, the result (ie the low 32 bits) is in
-	 * the high 32 bits. We just need to shift it left 4 bytes
-	 * V0 [ 0 1 X 3 ]
-	 * V0 [ 0 X 2 3 ]
-	 */
-	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of */
-
-	/* Get it into r3 */
-	MFVRD(R3, v0)
-
-.Lout:
-	subi	r6,r1,56+10*16
-	subi	r7,r1,56+2*16
-
-	lvx	v20,0,r6
-	lvx	v21,off16,r6
-	lvx	v22,off32,r6
-	lvx	v23,off48,r6
-	lvx	v24,off64,r6
-	lvx	v25,off80,r6
-	lvx	v26,off96,r6
-	lvx	v27,off112,r6
-	lvx	v28,0,r7
-	lvx	v29,off16,r7
-
-	ld	r31,-8(r1)
-	ld	r30,-16(r1)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	ld	r27,-40(r1)
-	ld	r26,-48(r1)
-	ld	r25,-56(r1)
-
-	blr
-
-.Lfirst_warm_up_done:
-	lvx	const1,0,r3
-	addi	r3,r3,16
-
-	VPMSUMD(v8,v16,const1)
-	VPMSUMD(v9,v17,const1)
-	VPMSUMD(v10,v18,const1)
-	VPMSUMD(v11,v19,const1)
-	VPMSUMD(v12,v20,const1)
-	VPMSUMD(v13,v21,const1)
-	VPMSUMD(v14,v22,const1)
-	VPMSUMD(v15,v23,const1)
-
-	b	.Lsecond_cool_down
-
-.Lshort:
-	cmpdi	r5,0
-	beq	.Lzero
-
-	addis	r3,r2,.short_constants@toc@ha
-	addi	r3,r3,.short_constants@toc@l
-
-	/* Calculate where in the constant table we need to start */
-	subfic	r6,r5,256
-	add	r3,r3,r6
-
-	/* How many 16 byte chunks? */
-	srdi	r7,r5,4
-	mtctr	r7
-
-	vxor	v19,v19,v19
-	vxor	v20,v20,v20
-
-	lvx	v0,0,r4
-	lvx	v16,0,r3
-	VPERM(v0,v0,v16,byteswap)
-	vxor	v0,v0,v8	/* xor in initial value */
-	VPMSUMW(v0,v0,v16)
-	bdz	.Lv0
-
-	lvx	v1,off16,r4
-	lvx	v17,off16,r3
-	VPERM(v1,v1,v17,byteswap)
-	VPMSUMW(v1,v1,v17)
-	bdz	.Lv1
-
-	lvx	v2,off32,r4
-	lvx	v16,off32,r3
-	VPERM(v2,v2,v16,byteswap)
-	VPMSUMW(v2,v2,v16)
-	bdz	.Lv2
-
-	lvx	v3,off48,r4
-	lvx	v17,off48,r3
-	VPERM(v3,v3,v17,byteswap)
-	VPMSUMW(v3,v3,v17)
-	bdz	.Lv3
-
-	lvx	v4,off64,r4
-	lvx	v16,off64,r3
-	VPERM(v4,v4,v16,byteswap)
-	VPMSUMW(v4,v4,v16)
-	bdz	.Lv4
-
-	lvx	v5,off80,r4
-	lvx	v17,off80,r3
-	VPERM(v5,v5,v17,byteswap)
-	VPMSUMW(v5,v5,v17)
-	bdz	.Lv5
-
-	lvx	v6,off96,r4
-	lvx	v16,off96,r3
-	VPERM(v6,v6,v16,byteswap)
-	VPMSUMW(v6,v6,v16)
-	bdz	.Lv6
-
-	lvx	v7,off112,r4
-	lvx	v17,off112,r3
-	VPERM(v7,v7,v17,byteswap)
-	VPMSUMW(v7,v7,v17)
-	bdz	.Lv7
-
-	addi	r3,r3,128
-	addi	r4,r4,128
-
-	lvx	v8,0,r4
-	lvx	v16,0,r3
-	VPERM(v8,v8,v16,byteswap)
-	VPMSUMW(v8,v8,v16)
-	bdz	.Lv8
-
-	lvx	v9,off16,r4
-	lvx	v17,off16,r3
-	VPERM(v9,v9,v17,byteswap)
-	VPMSUMW(v9,v9,v17)
-	bdz	.Lv9
-
-	lvx	v10,off32,r4
-	lvx	v16,off32,r3
-	VPERM(v10,v10,v16,byteswap)
-	VPMSUMW(v10,v10,v16)
-	bdz	.Lv10
-
-	lvx	v11,off48,r4
-	lvx	v17,off48,r3
-	VPERM(v11,v11,v17,byteswap)
-	VPMSUMW(v11,v11,v17)
-	bdz	.Lv11
-
-	lvx	v12,off64,r4
-	lvx	v16,off64,r3
-	VPERM(v12,v12,v16,byteswap)
-	VPMSUMW(v12,v12,v16)
-	bdz	.Lv12
-
-	lvx	v13,off80,r4
-	lvx	v17,off80,r3
-	VPERM(v13,v13,v17,byteswap)
-	VPMSUMW(v13,v13,v17)
-	bdz	.Lv13
-
-	lvx	v14,off96,r4
-	lvx	v16,off96,r3
-	VPERM(v14,v14,v16,byteswap)
-	VPMSUMW(v14,v14,v16)
-	bdz	.Lv14
-
-	lvx	v15,off112,r4
-	lvx	v17,off112,r3
-	VPERM(v15,v15,v17,byteswap)
-	VPMSUMW(v15,v15,v17)
-
-.Lv15:	vxor	v19,v19,v15
-.Lv14:	vxor	v20,v20,v14
-.Lv13:	vxor	v19,v19,v13
-.Lv12:	vxor	v20,v20,v12
-.Lv11:	vxor	v19,v19,v11
-.Lv10:	vxor	v20,v20,v10
-.Lv9:	vxor	v19,v19,v9
-.Lv8:	vxor	v20,v20,v8
-.Lv7:	vxor	v19,v19,v7
-.Lv6:	vxor	v20,v20,v6
-.Lv5:	vxor	v19,v19,v5
-.Lv4:	vxor	v20,v20,v4
-.Lv3:	vxor	v19,v19,v3
-.Lv2:	vxor	v20,v20,v2
-.Lv1:	vxor	v19,v19,v1
-.Lv0:	vxor	v20,v20,v0
-
-	vxor	v0,v19,v20
-
-	b	.Lbarrett_reduction
-
-.Lzero:
-	mr	r3,r10
-	b	.Lout
-
-FUNC_END(__crc32_vpmsum)
+#define CRC_FUNCTION_NAME __crc32c_vpmsum
+#include "crc32-vpmsum_core.S"
-- 
cgit 


From 08c7dd1bd4b3a59c0d57b3ce4fe5ba1a90123685 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Wed, 15 Mar 2017 23:37:35 +1100
Subject: crypto: powerpc - Re-enable non-REFLECTed CRCs

When CRC32c was included in the kernel, Anton ripped out
the #ifdefs around reflected polynomials, because CRC32c
is always reflected. However, not all CRCs use reflection
so we'd like to make it optional.

Restore the REFLECT parts from Anton's original CRC32
implementation (https://github.com/antonblanchard/crc32-vpmsum)

That implementation is available under GPLv2+, so we're OK
from a licensing point of view:
https://github.com/antonblanchard/crc32-vpmsum/blob/master/LICENSE.TXT

As CRC32c requires REFLECT, add that #define.

Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/powerpc/crypto/crc32-vpmsum_core.S | 31 ++++++++++++++++++++++++++++++-
 arch/powerpc/crypto/crc32c-vpmsum_asm.S |  1 +
 2 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S
index 7c6be6a5c977..aadb59c96a27 100644
--- a/arch/powerpc/crypto/crc32-vpmsum_core.S
+++ b/arch/powerpc/crypto/crc32-vpmsum_core.S
@@ -35,7 +35,9 @@
 
 	.text
 
-#if defined(__BIG_ENDIAN__)
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
 #define BYTESWAP_DATA
 #else
 #undef BYTESWAP_DATA
@@ -108,7 +110,11 @@ FUNC_START(CRC_FUNCTION_NAME)
 	/* Get the initial value into v8 */
 	vxor	v8,v8,v8
 	MTVRD(v8, R3)
+#ifdef REFLECT
 	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
+#else
+	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
+#endif
 
 #ifdef BYTESWAP_DATA
 	addis	r3,r2,.byteswap_constant@toc@ha
@@ -354,6 +360,7 @@ FUNC_START(CRC_FUNCTION_NAME)
 	vxor	v6,v6,v14
 	vxor	v7,v7,v15
 
+#ifdef REFLECT
 	/*
 	 * vpmsumd produces a 96 bit result in the least significant bits
 	 * of the register. Since we are bit reflected we have to shift it
@@ -368,6 +375,7 @@ FUNC_START(CRC_FUNCTION_NAME)
 	vsldoi	v5,v5,zeroes,4
 	vsldoi	v6,v6,zeroes,4
 	vsldoi	v7,v7,zeroes,4
+#endif
 
 	/* xor with last 1024 bits */
 	lvx	v8,0,r4
@@ -511,12 +519,32 @@ FUNC_START(CRC_FUNCTION_NAME)
 	vsldoi	v1,v0,v0,8
 	vxor	v0,v0,v1		/* xor two 64 bit results together */
 
+#ifdef REFLECT
 	/* shift left one bit */
 	vspltisb v1,1
 	vsl	v0,v0,v1
+#endif
 
 	vand	v0,v0,mask_64bit
+#ifndef REFLECT
+	/*
+	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
+	 * the multiple of our polynomial that we need to subtract. By
+	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
+	 * result back down 2x bits, we round down to the nearest multiple.
+	 */
+	VPMSUMD(v1,v0,const1)	/* ma */
+	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
+	VPMSUMD(v1,v1,const2)	/* qn */
+	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
 
+	/*
+	 * Get the result into r3. We need to shift it left 8 bytes:
+	 * V0 [ 0 1 2 X ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
+#else
 	/*
 	 * The reflected version of Barrett reduction. Instead of bit
 	 * reflecting our data (which is expensive to do), we bit reflect our
@@ -537,6 +565,7 @@ FUNC_START(CRC_FUNCTION_NAME)
 	 * V0 [ 0 X 2 3 ]
 	 */
 	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of */
+#endif
 
 	/* Get it into r3 */
 	MFVRD(R3, v0)
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_asm.S b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
index c0d080caefc1..d2bea48051a0 100644
--- a/arch/powerpc/crypto/crc32c-vpmsum_asm.S
+++ b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
@@ -842,4 +842,5 @@
 	.octa 0x00000000000000000000000105ec76f1
 
 #define CRC_FUNCTION_NAME __crc32c_vpmsum
+#define REFLECT
 #include "crc32-vpmsum_core.S"
-- 
cgit 


From b01df1c16c9a6f7a14f843d3ac6b9eef5a7bb17e Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Wed, 15 Mar 2017 23:37:36 +1100
Subject: crypto: powerpc - Add CRC-T10DIF acceleration

T10DIF is a CRC16 used heavily in NVMe.

It turns out we can accelerate it with a CRC32 library and a few
little tricks.

Provide the accelerator based on the refactored CRC32 code.

Cc: Anton Blanchard <anton@samba.org>
Thanks-to: Hong Bo Peng <penghb@cn.ibm.com>
Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/powerpc/crypto/Makefile                |   2 +
 arch/powerpc/crypto/crct10dif-vpmsum_asm.S  | 850 ++++++++++++++++++++++++++++
 arch/powerpc/crypto/crct10dif-vpmsum_glue.c | 125 ++++
 3 files changed, 977 insertions(+)
 create mode 100644 arch/powerpc/crypto/crct10dif-vpmsum_asm.S
 create mode 100644 arch/powerpc/crypto/crct10dif-vpmsum_glue.c

(limited to 'arch')

diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index 87f40454bad3..e66aaf19764d 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
 obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
 obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
 obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -17,3 +18,4 @@ sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
 sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
 sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
 crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
+crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_asm.S b/arch/powerpc/crypto/crct10dif-vpmsum_asm.S
new file mode 100644
index 000000000000..5e3d81a0af1b
--- /dev/null
+++ b/arch/powerpc/crypto/crct10dif-vpmsum_asm.S
@@ -0,0 +1,850 @@
+/*
+ * Calculate a CRC T10DIF with vpmsum acceleration
+ *
+ * Constants generated by crc32-vpmsum, available at
+ * https://github.com/antonblanchard/crc32-vpmsum
+ *
+ * crc32-vpmsum is
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ * and is available under the GPL v2 or later.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+	.section	.rodata
+.balign 16
+
+.byteswap_constant:
+	/* byte reverse permute constant */
+	.octa 0x0F0E0D0C0B0A09080706050403020100
+
+.constants:
+
+	/* Reduce 262144 bits to 1024 bits */
+	/* x^261184 mod p(x), x^261120 mod p(x) */
+	.octa 0x0000000056d300000000000052550000
+
+	/* x^260160 mod p(x), x^260096 mod p(x) */
+	.octa 0x00000000ee67000000000000a1e40000
+
+	/* x^259136 mod p(x), x^259072 mod p(x) */
+	.octa 0x0000000060830000000000004ad10000
+
+	/* x^258112 mod p(x), x^258048 mod p(x) */
+	.octa 0x000000008cfe0000000000009ab40000
+
+	/* x^257088 mod p(x), x^257024 mod p(x) */
+	.octa 0x000000003e93000000000000fdb50000
+
+	/* x^256064 mod p(x), x^256000 mod p(x) */
+	.octa 0x000000003c2000000000000045480000
+
+	/* x^255040 mod p(x), x^254976 mod p(x) */
+	.octa 0x00000000b1fc0000000000008d690000
+
+	/* x^254016 mod p(x), x^253952 mod p(x) */
+	.octa 0x00000000f82b00000000000024ad0000
+
+	/* x^252992 mod p(x), x^252928 mod p(x) */
+	.octa 0x0000000044420000000000009f1a0000
+
+	/* x^251968 mod p(x), x^251904 mod p(x) */
+	.octa 0x00000000e88c00000000000066ec0000
+
+	/* x^250944 mod p(x), x^250880 mod p(x) */
+	.octa 0x00000000385c000000000000c87d0000
+
+	/* x^249920 mod p(x), x^249856 mod p(x) */
+	.octa 0x000000003227000000000000c8ff0000
+
+	/* x^248896 mod p(x), x^248832 mod p(x) */
+	.octa 0x00000000a9a900000000000033440000
+
+	/* x^247872 mod p(x), x^247808 mod p(x) */
+	.octa 0x00000000abaa00000000000066eb0000
+
+	/* x^246848 mod p(x), x^246784 mod p(x) */
+	.octa 0x000000001ac3000000000000c4ef0000
+
+	/* x^245824 mod p(x), x^245760 mod p(x) */
+	.octa 0x0000000063f000000000000056f30000
+
+	/* x^244800 mod p(x), x^244736 mod p(x) */
+	.octa 0x0000000032cc00000000000002050000
+
+	/* x^243776 mod p(x), x^243712 mod p(x) */
+	.octa 0x00000000f8b5000000000000568e0000
+
+	/* x^242752 mod p(x), x^242688 mod p(x) */
+	.octa 0x000000008db100000000000064290000
+
+	/* x^241728 mod p(x), x^241664 mod p(x) */
+	.octa 0x0000000059ca0000000000006b660000
+
+	/* x^240704 mod p(x), x^240640 mod p(x) */
+	.octa 0x000000005f5c00000000000018f80000
+
+	/* x^239680 mod p(x), x^239616 mod p(x) */
+	.octa 0x0000000061af000000000000b6090000
+
+	/* x^238656 mod p(x), x^238592 mod p(x) */
+	.octa 0x00000000e29e000000000000099a0000
+
+	/* x^237632 mod p(x), x^237568 mod p(x) */
+	.octa 0x000000000975000000000000a8360000
+
+	/* x^236608 mod p(x), x^236544 mod p(x) */
+	.octa 0x0000000043900000000000004f570000
+
+	/* x^235584 mod p(x), x^235520 mod p(x) */
+	.octa 0x00000000f9cd000000000000134c0000
+
+	/* x^234560 mod p(x), x^234496 mod p(x) */
+	.octa 0x000000007c29000000000000ec380000
+
+	/* x^233536 mod p(x), x^233472 mod p(x) */
+	.octa 0x000000004c6a000000000000b0d10000
+
+	/* x^232512 mod p(x), x^232448 mod p(x) */
+	.octa 0x00000000e7290000000000007d3e0000
+
+	/* x^231488 mod p(x), x^231424 mod p(x) */
+	.octa 0x00000000f1ab000000000000f0b20000
+
+	/* x^230464 mod p(x), x^230400 mod p(x) */
+	.octa 0x0000000039db0000000000009c270000
+
+	/* x^229440 mod p(x), x^229376 mod p(x) */
+	.octa 0x000000005e2800000000000092890000
+
+	/* x^228416 mod p(x), x^228352 mod p(x) */
+	.octa 0x00000000d44e000000000000d5ee0000
+
+	/* x^227392 mod p(x), x^227328 mod p(x) */
+	.octa 0x00000000cd0a00000000000041f50000
+
+	/* x^226368 mod p(x), x^226304 mod p(x) */
+	.octa 0x00000000c5b400000000000010520000
+
+	/* x^225344 mod p(x), x^225280 mod p(x) */
+	.octa 0x00000000fd2100000000000042170000
+
+	/* x^224320 mod p(x), x^224256 mod p(x) */
+	.octa 0x000000002f2500000000000095c20000
+
+	/* x^223296 mod p(x), x^223232 mod p(x) */
+	.octa 0x000000001b0100000000000001ce0000
+
+	/* x^222272 mod p(x), x^222208 mod p(x) */
+	.octa 0x000000000d430000000000002aca0000
+
+	/* x^221248 mod p(x), x^221184 mod p(x) */
+	.octa 0x0000000030a6000000000000385e0000
+
+	/* x^220224 mod p(x), x^220160 mod p(x) */
+	.octa 0x00000000e37b0000000000006f7a0000
+
+	/* x^219200 mod p(x), x^219136 mod p(x) */
+	.octa 0x00000000873600000000000024320000
+
+	/* x^218176 mod p(x), x^218112 mod p(x) */
+	.octa 0x00000000e9fb000000000000bd9c0000
+
+	/* x^217152 mod p(x), x^217088 mod p(x) */
+	.octa 0x000000003b9500000000000054bc0000
+
+	/* x^216128 mod p(x), x^216064 mod p(x) */
+	.octa 0x00000000133e000000000000a4660000
+
+	/* x^215104 mod p(x), x^215040 mod p(x) */
+	.octa 0x00000000784500000000000079930000
+
+	/* x^214080 mod p(x), x^214016 mod p(x) */
+	.octa 0x00000000b9800000000000001bb80000
+
+	/* x^213056 mod p(x), x^212992 mod p(x) */
+	.octa 0x00000000687600000000000024400000
+
+	/* x^212032 mod p(x), x^211968 mod p(x) */
+	.octa 0x00000000aff300000000000029e10000
+
+	/* x^211008 mod p(x), x^210944 mod p(x) */
+	.octa 0x0000000024b50000000000005ded0000
+
+	/* x^209984 mod p(x), x^209920 mod p(x) */
+	.octa 0x0000000017e8000000000000b12e0000
+
+	/* x^208960 mod p(x), x^208896 mod p(x) */
+	.octa 0x00000000128400000000000026d20000
+
+	/* x^207936 mod p(x), x^207872 mod p(x) */
+	.octa 0x000000002115000000000000a32a0000
+
+	/* x^206912 mod p(x), x^206848 mod p(x) */
+	.octa 0x000000009595000000000000a1210000
+
+	/* x^205888 mod p(x), x^205824 mod p(x) */
+	.octa 0x00000000281e000000000000ee8b0000
+
+	/* x^204864 mod p(x), x^204800 mod p(x) */
+	.octa 0x0000000006010000000000003d0d0000
+
+	/* x^203840 mod p(x), x^203776 mod p(x) */
+	.octa 0x00000000e2b600000000000034e90000
+
+	/* x^202816 mod p(x), x^202752 mod p(x) */
+	.octa 0x000000001bd40000000000004cdb0000
+
+	/* x^201792 mod p(x), x^201728 mod p(x) */
+	.octa 0x00000000df2800000000000030e90000
+
+	/* x^200768 mod p(x), x^200704 mod p(x) */
+	.octa 0x0000000049c200000000000042590000
+
+	/* x^199744 mod p(x), x^199680 mod p(x) */
+	.octa 0x000000009b97000000000000df950000
+
+	/* x^198720 mod p(x), x^198656 mod p(x) */
+	.octa 0x000000006184000000000000da7b0000
+
+	/* x^197696 mod p(x), x^197632 mod p(x) */
+	.octa 0x00000000461700000000000012510000
+
+	/* x^196672 mod p(x), x^196608 mod p(x) */
+	.octa 0x000000009b40000000000000f37e0000
+
+	/* x^195648 mod p(x), x^195584 mod p(x) */
+	.octa 0x00000000eeb2000000000000ecf10000
+
+	/* x^194624 mod p(x), x^194560 mod p(x) */
+	.octa 0x00000000b2e800000000000050f20000
+
+	/* x^193600 mod p(x), x^193536 mod p(x) */
+	.octa 0x00000000f59a000000000000e0b30000
+
+	/* x^192576 mod p(x), x^192512 mod p(x) */
+	.octa 0x00000000467f0000000000004d5a0000
+
+	/* x^191552 mod p(x), x^191488 mod p(x) */
+	.octa 0x00000000da92000000000000bb010000
+
+	/* x^190528 mod p(x), x^190464 mod p(x) */
+	.octa 0x000000001e1000000000000022a40000
+
+	/* x^189504 mod p(x), x^189440 mod p(x) */
+	.octa 0x0000000058fe000000000000836f0000
+
+	/* x^188480 mod p(x), x^188416 mod p(x) */
+	.octa 0x00000000b9ce000000000000d78d0000
+
+	/* x^187456 mod p(x), x^187392 mod p(x) */
+	.octa 0x0000000022210000000000004f8d0000
+
+	/* x^186432 mod p(x), x^186368 mod p(x) */
+	.octa 0x00000000744600000000000033760000
+
+	/* x^185408 mod p(x), x^185344 mod p(x) */
+	.octa 0x000000001c2e000000000000a1e50000
+
+	/* x^184384 mod p(x), x^184320 mod p(x) */
+	.octa 0x00000000dcc8000000000000a1a40000
+
+	/* x^183360 mod p(x), x^183296 mod p(x) */
+	.octa 0x00000000910f00000000000019a20000
+
+	/* x^182336 mod p(x), x^182272 mod p(x) */
+	.octa 0x0000000055d5000000000000f6ae0000
+
+	/* x^181312 mod p(x), x^181248 mod p(x) */
+	.octa 0x00000000c8ba000000000000a7ac0000
+
+	/* x^180288 mod p(x), x^180224 mod p(x) */
+	.octa 0x0000000031f8000000000000eea20000
+
+	/* x^179264 mod p(x), x^179200 mod p(x) */
+	.octa 0x000000001966000000000000c4d90000
+
+	/* x^178240 mod p(x), x^178176 mod p(x) */
+	.octa 0x00000000b9810000000000002b470000
+
+	/* x^177216 mod p(x), x^177152 mod p(x) */
+	.octa 0x000000008303000000000000f7cf0000
+
+	/* x^176192 mod p(x), x^176128 mod p(x) */
+	.octa 0x000000002ce500000000000035b30000
+
+	/* x^175168 mod p(x), x^175104 mod p(x) */
+	.octa 0x000000002fae0000000000000c7c0000
+
+	/* x^174144 mod p(x), x^174080 mod p(x) */
+	.octa 0x00000000f50c0000000000009edf0000
+
+	/* x^173120 mod p(x), x^173056 mod p(x) */
+	.octa 0x00000000714f00000000000004cd0000
+
+	/* x^172096 mod p(x), x^172032 mod p(x) */
+	.octa 0x00000000c161000000000000541b0000
+
+	/* x^171072 mod p(x), x^171008 mod p(x) */
+	.octa 0x0000000021c8000000000000e2700000
+
+	/* x^170048 mod p(x), x^169984 mod p(x) */
+	.octa 0x00000000b93d00000000000009a60000
+
+	/* x^169024 mod p(x), x^168960 mod p(x) */
+	.octa 0x00000000fbcf000000000000761c0000
+
+	/* x^168000 mod p(x), x^167936 mod p(x) */
+	.octa 0x0000000026350000000000009db30000
+
+	/* x^166976 mod p(x), x^166912 mod p(x) */
+	.octa 0x00000000b64f0000000000003e9f0000
+
+	/* x^165952 mod p(x), x^165888 mod p(x) */
+	.octa 0x00000000bd0e00000000000078590000
+
+	/* x^164928 mod p(x), x^164864 mod p(x) */
+	.octa 0x00000000d9360000000000008bc80000
+
+	/* x^163904 mod p(x), x^163840 mod p(x) */
+	.octa 0x000000002f140000000000008c9f0000
+
+	/* x^162880 mod p(x), x^162816 mod p(x) */
+	.octa 0x000000006a270000000000006af70000
+
+	/* x^161856 mod p(x), x^161792 mod p(x) */
+	.octa 0x000000006685000000000000e5210000
+
+	/* x^160832 mod p(x), x^160768 mod p(x) */
+	.octa 0x0000000062da00000000000008290000
+
+	/* x^159808 mod p(x), x^159744 mod p(x) */
+	.octa 0x00000000bb4b000000000000e4d00000
+
+	/* x^158784 mod p(x), x^158720 mod p(x) */
+	.octa 0x00000000d2490000000000004ae10000
+
+	/* x^157760 mod p(x), x^157696 mod p(x) */
+	.octa 0x00000000c85b00000000000000e70000
+
+	/* x^156736 mod p(x), x^156672 mod p(x) */
+	.octa 0x00000000c37a00000000000015650000
+
+	/* x^155712 mod p(x), x^155648 mod p(x) */
+	.octa 0x0000000018530000000000001c2f0000
+
+	/* x^154688 mod p(x), x^154624 mod p(x) */
+	.octa 0x00000000b46600000000000037bd0000
+
+	/* x^153664 mod p(x), x^153600 mod p(x) */
+	.octa 0x00000000439b00000000000012190000
+
+	/* x^152640 mod p(x), x^152576 mod p(x) */
+	.octa 0x00000000b1260000000000005ece0000
+
+	/* x^151616 mod p(x), x^151552 mod p(x) */
+	.octa 0x00000000d8110000000000002a5e0000
+
+	/* x^150592 mod p(x), x^150528 mod p(x) */
+	.octa 0x00000000099f00000000000052330000
+
+	/* x^149568 mod p(x), x^149504 mod p(x) */
+	.octa 0x00000000f9f9000000000000f9120000
+
+	/* x^148544 mod p(x), x^148480 mod p(x) */
+	.octa 0x000000005cc00000000000000ddc0000
+
+	/* x^147520 mod p(x), x^147456 mod p(x) */
+	.octa 0x00000000343b00000000000012200000
+
+	/* x^146496 mod p(x), x^146432 mod p(x) */
+	.octa 0x000000009222000000000000d12b0000
+
+	/* x^145472 mod p(x), x^145408 mod p(x) */
+	.octa 0x00000000d781000000000000eb2d0000
+
+	/* x^144448 mod p(x), x^144384 mod p(x) */
+	.octa 0x000000000bf400000000000058970000
+
+	/* x^143424 mod p(x), x^143360 mod p(x) */
+	.octa 0x00000000094200000000000013690000
+
+	/* x^142400 mod p(x), x^142336 mod p(x) */
+	.octa 0x00000000d55100000000000051950000
+
+	/* x^141376 mod p(x), x^141312 mod p(x) */
+	.octa 0x000000008f11000000000000954b0000
+
+	/* x^140352 mod p(x), x^140288 mod p(x) */
+	.octa 0x00000000140f000000000000b29e0000
+
+	/* x^139328 mod p(x), x^139264 mod p(x) */
+	.octa 0x00000000c6db000000000000db5d0000
+
+	/* x^138304 mod p(x), x^138240 mod p(x) */
+	.octa 0x00000000715b000000000000dfaf0000
+
+	/* x^137280 mod p(x), x^137216 mod p(x) */
+	.octa 0x000000000dea000000000000e3b60000
+
+	/* x^136256 mod p(x), x^136192 mod p(x) */
+	.octa 0x000000006f94000000000000ddaf0000
+
+	/* x^135232 mod p(x), x^135168 mod p(x) */
+	.octa 0x0000000024e1000000000000e4f70000
+
+	/* x^134208 mod p(x), x^134144 mod p(x) */
+	.octa 0x000000008810000000000000aa110000
+
+	/* x^133184 mod p(x), x^133120 mod p(x) */
+	.octa 0x0000000030c2000000000000a8e60000
+
+	/* x^132160 mod p(x), x^132096 mod p(x) */
+	.octa 0x00000000e6d0000000000000ccf30000
+
+	/* x^131136 mod p(x), x^131072 mod p(x) */
+	.octa 0x000000004da000000000000079bf0000
+
+	/* x^130112 mod p(x), x^130048 mod p(x) */
+	.octa 0x000000007759000000000000b3a30000
+
+	/* x^129088 mod p(x), x^129024 mod p(x) */
+	.octa 0x00000000597400000000000028790000
+
+	/* x^128064 mod p(x), x^128000 mod p(x) */
+	.octa 0x000000007acd000000000000b5820000
+
+	/* x^127040 mod p(x), x^126976 mod p(x) */
+	.octa 0x00000000e6e400000000000026ad0000
+
+	/* x^126016 mod p(x), x^125952 mod p(x) */
+	.octa 0x000000006d49000000000000985b0000
+
+	/* x^124992 mod p(x), x^124928 mod p(x) */
+	.octa 0x000000000f0800000000000011520000
+
+	/* x^123968 mod p(x), x^123904 mod p(x) */
+	.octa 0x000000002c7f000000000000846c0000
+
+	/* x^122944 mod p(x), x^122880 mod p(x) */
+	.octa 0x000000005ce7000000000000ae1d0000
+
+	/* x^121920 mod p(x), x^121856 mod p(x) */
+	.octa 0x00000000d4cb000000000000e21d0000
+
+	/* x^120896 mod p(x), x^120832 mod p(x) */
+	.octa 0x000000003a2300000000000019bb0000
+
+	/* x^119872 mod p(x), x^119808 mod p(x) */
+	.octa 0x000000000e1700000000000095290000
+
+	/* x^118848 mod p(x), x^118784 mod p(x) */
+	.octa 0x000000006e6400000000000050d20000
+
+	/* x^117824 mod p(x), x^117760 mod p(x) */
+	.octa 0x000000008d5c0000000000000cd10000
+
+	/* x^116800 mod p(x), x^116736 mod p(x) */
+	.octa 0x00000000ef310000000000007b570000
+
+	/* x^115776 mod p(x), x^115712 mod p(x) */
+	.octa 0x00000000645d00000000000053d60000
+
+	/* x^114752 mod p(x), x^114688 mod p(x) */
+	.octa 0x0000000018fc00000000000077510000
+
+	/* x^113728 mod p(x), x^113664 mod p(x) */
+	.octa 0x000000000cb3000000000000a7b70000
+
+	/* x^112704 mod p(x), x^112640 mod p(x) */
+	.octa 0x00000000991b000000000000d0780000
+
+	/* x^111680 mod p(x), x^111616 mod p(x) */
+	.octa 0x00000000845a000000000000be3c0000
+
+	/* x^110656 mod p(x), x^110592 mod p(x) */
+	.octa 0x00000000d3a9000000000000df020000
+
+	/* x^109632 mod p(x), x^109568 mod p(x) */
+	.octa 0x0000000017d7000000000000063e0000
+
+	/* x^108608 mod p(x), x^108544 mod p(x) */
+	.octa 0x000000007a860000000000008ab40000
+
+	/* x^107584 mod p(x), x^107520 mod p(x) */
+	.octa 0x00000000fd7c000000000000c7bd0000
+
+	/* x^106560 mod p(x), x^106496 mod p(x) */
+	.octa 0x00000000a56b000000000000efd60000
+
+	/* x^105536 mod p(x), x^105472 mod p(x) */
+	.octa 0x0000000010e400000000000071380000
+
+	/* x^104512 mod p(x), x^104448 mod p(x) */
+	.octa 0x00000000994500000000000004d30000
+
+	/* x^103488 mod p(x), x^103424 mod p(x) */
+	.octa 0x00000000b83c0000000000003b0e0000
+
+	/* x^102464 mod p(x), x^102400 mod p(x) */
+	.octa 0x00000000d6c10000000000008b020000
+
+	/* x^101440 mod p(x), x^101376 mod p(x) */
+	.octa 0x000000009efc000000000000da940000
+
+	/* x^100416 mod p(x), x^100352 mod p(x) */
+	.octa 0x000000005e87000000000000f9f70000
+
+	/* x^99392 mod p(x), x^99328 mod p(x) */
+	.octa 0x000000006c9b00000000000045e40000
+
+	/* x^98368 mod p(x), x^98304 mod p(x) */
+	.octa 0x00000000178a00000000000083940000
+
+	/* x^97344 mod p(x), x^97280 mod p(x) */
+	.octa 0x00000000f0c8000000000000f0a00000
+
+	/* x^96320 mod p(x), x^96256 mod p(x) */
+	.octa 0x00000000f699000000000000b74b0000
+
+	/* x^95296 mod p(x), x^95232 mod p(x) */
+	.octa 0x00000000316d000000000000c1cf0000
+
+	/* x^94272 mod p(x), x^94208 mod p(x) */
+	.octa 0x00000000987e00000000000072680000
+
+	/* x^93248 mod p(x), x^93184 mod p(x) */
+	.octa 0x00000000acff000000000000e0ab0000
+
+	/* x^92224 mod p(x), x^92160 mod p(x) */
+	.octa 0x00000000a1f6000000000000c5a80000
+
+	/* x^91200 mod p(x), x^91136 mod p(x) */
+	.octa 0x0000000061bd000000000000cf690000
+
+	/* x^90176 mod p(x), x^90112 mod p(x) */
+	.octa 0x00000000c9f2000000000000cbcc0000
+
+	/* x^89152 mod p(x), x^89088 mod p(x) */
+	.octa 0x000000005a33000000000000de050000
+
+	/* x^88128 mod p(x), x^88064 mod p(x) */
+	.octa 0x00000000e416000000000000ccd70000
+
+	/* x^87104 mod p(x), x^87040 mod p(x) */
+	.octa 0x0000000058930000000000002f670000
+
+	/* x^86080 mod p(x), x^86016 mod p(x) */
+	.octa 0x00000000a9d3000000000000152f0000
+
+	/* x^85056 mod p(x), x^84992 mod p(x) */
+	.octa 0x00000000c114000000000000ecc20000
+
+	/* x^84032 mod p(x), x^83968 mod p(x) */
+	.octa 0x00000000b9270000000000007c890000
+
+	/* x^83008 mod p(x), x^82944 mod p(x) */
+	.octa 0x000000002e6000000000000006ee0000
+
+	/* x^81984 mod p(x), x^81920 mod p(x) */
+	.octa 0x00000000dfc600000000000009100000
+
+	/* x^80960 mod p(x), x^80896 mod p(x) */
+	.octa 0x000000004911000000000000ad4e0000
+
+	/* x^79936 mod p(x), x^79872 mod p(x) */
+	.octa 0x00000000ae1b000000000000b04d0000
+
+	/* x^78912 mod p(x), x^78848 mod p(x) */
+	.octa 0x0000000005fa000000000000e9900000
+
+	/* x^77888 mod p(x), x^77824 mod p(x) */
+	.octa 0x0000000004a1000000000000cc6f0000
+
+	/* x^76864 mod p(x), x^76800 mod p(x) */
+	.octa 0x00000000af73000000000000ed110000
+
+	/* x^75840 mod p(x), x^75776 mod p(x) */
+	.octa 0x0000000082530000000000008f7e0000
+
+	/* x^74816 mod p(x), x^74752 mod p(x) */
+	.octa 0x00000000cfdc000000000000594f0000
+
+	/* x^73792 mod p(x), x^73728 mod p(x) */
+	.octa 0x00000000a6b6000000000000a8750000
+
+	/* x^72768 mod p(x), x^72704 mod p(x) */
+	.octa 0x00000000fd76000000000000aa0c0000
+
+	/* x^71744 mod p(x), x^71680 mod p(x) */
+	.octa 0x0000000006f500000000000071db0000
+
+	/* x^70720 mod p(x), x^70656 mod p(x) */
+	.octa 0x0000000037ca000000000000ab0c0000
+
+	/* x^69696 mod p(x), x^69632 mod p(x) */
+	.octa 0x00000000d7ab000000000000b7a00000
+
+	/* x^68672 mod p(x), x^68608 mod p(x) */
+	.octa 0x00000000440800000000000090d30000
+
+	/* x^67648 mod p(x), x^67584 mod p(x) */
+	.octa 0x00000000186100000000000054730000
+
+	/* x^66624 mod p(x), x^66560 mod p(x) */
+	.octa 0x000000007368000000000000a3a20000
+
+	/* x^65600 mod p(x), x^65536 mod p(x) */
+	.octa 0x0000000026d0000000000000f9040000
+
+	/* x^64576 mod p(x), x^64512 mod p(x) */
+	.octa 0x00000000fe770000000000009c0a0000
+
+	/* x^63552 mod p(x), x^63488 mod p(x) */
+	.octa 0x000000002cba000000000000d1e70000
+
+	/* x^62528 mod p(x), x^62464 mod p(x) */
+	.octa 0x00000000f8bd0000000000005ac10000
+
+	/* x^61504 mod p(x), x^61440 mod p(x) */
+	.octa 0x000000007372000000000000d68d0000
+
+	/* x^60480 mod p(x), x^60416 mod p(x) */
+	.octa 0x00000000f37f00000000000089f60000
+
+	/* x^59456 mod p(x), x^59392 mod p(x) */
+	.octa 0x00000000078400000000000008a90000
+
+	/* x^58432 mod p(x), x^58368 mod p(x) */
+	.octa 0x00000000d3e400000000000042360000
+
+	/* x^57408 mod p(x), x^57344 mod p(x) */
+	.octa 0x00000000eba800000000000092d50000
+
+	/* x^56384 mod p(x), x^56320 mod p(x) */
+	.octa 0x00000000afbe000000000000b4d50000
+
+	/* x^55360 mod p(x), x^55296 mod p(x) */
+	.octa 0x00000000d8ca000000000000c9060000
+
+	/* x^54336 mod p(x), x^54272 mod p(x) */
+	.octa 0x00000000c2d00000000000008f4f0000
+
+	/* x^53312 mod p(x), x^53248 mod p(x) */
+	.octa 0x00000000373200000000000028690000
+
+	/* x^52288 mod p(x), x^52224 mod p(x) */
+	.octa 0x0000000046ae000000000000c3b30000
+
+	/* x^51264 mod p(x), x^51200 mod p(x) */
+	.octa 0x00000000b243000000000000f8700000
+
+	/* x^50240 mod p(x), x^50176 mod p(x) */
+	.octa 0x00000000f7f500000000000029eb0000
+
+	/* x^49216 mod p(x), x^49152 mod p(x) */
+	.octa 0x000000000c7e000000000000fe730000
+
+	/* x^48192 mod p(x), x^48128 mod p(x) */
+	.octa 0x00000000c38200000000000096000000
+
+	/* x^47168 mod p(x), x^47104 mod p(x) */
+	.octa 0x000000008956000000000000683c0000
+
+	/* x^46144 mod p(x), x^46080 mod p(x) */
+	.octa 0x00000000422d0000000000005f1e0000
+
+	/* x^45120 mod p(x), x^45056 mod p(x) */
+	.octa 0x00000000ac0f0000000000006f810000
+
+	/* x^44096 mod p(x), x^44032 mod p(x) */
+	.octa 0x00000000ce30000000000000031f0000
+
+	/* x^43072 mod p(x), x^43008 mod p(x) */
+	.octa 0x000000003d43000000000000455a0000
+
+	/* x^42048 mod p(x), x^41984 mod p(x) */
+	.octa 0x000000007ebe000000000000a6050000
+
+	/* x^41024 mod p(x), x^40960 mod p(x) */
+	.octa 0x00000000976e00000000000077eb0000
+
+	/* x^40000 mod p(x), x^39936 mod p(x) */
+	.octa 0x000000000872000000000000389c0000
+
+	/* x^38976 mod p(x), x^38912 mod p(x) */
+	.octa 0x000000008979000000000000c7b20000
+
+	/* x^37952 mod p(x), x^37888 mod p(x) */
+	.octa 0x000000005c1e0000000000001d870000
+
+	/* x^36928 mod p(x), x^36864 mod p(x) */
+	.octa 0x00000000aebb00000000000045810000
+
+	/* x^35904 mod p(x), x^35840 mod p(x) */
+	.octa 0x000000004f7e0000000000006d4a0000
+
+	/* x^34880 mod p(x), x^34816 mod p(x) */
+	.octa 0x00000000ea98000000000000b9200000
+
+	/* x^33856 mod p(x), x^33792 mod p(x) */
+	.octa 0x00000000f39600000000000022f20000
+
+	/* x^32832 mod p(x), x^32768 mod p(x) */
+	.octa 0x000000000bc500000000000041ca0000
+
+	/* x^31808 mod p(x), x^31744 mod p(x) */
+	.octa 0x00000000786400000000000078500000
+
+	/* x^30784 mod p(x), x^30720 mod p(x) */
+	.octa 0x00000000be970000000000009e7e0000
+
+	/* x^29760 mod p(x), x^29696 mod p(x) */
+	.octa 0x00000000dd6d000000000000a53c0000
+
+	/* x^28736 mod p(x), x^28672 mod p(x) */
+	.octa 0x000000004c3f00000000000039340000
+
+	/* x^27712 mod p(x), x^27648 mod p(x) */
+	.octa 0x0000000093a4000000000000b58e0000
+
+	/* x^26688 mod p(x), x^26624 mod p(x) */
+	.octa 0x0000000050fb00000000000062d40000
+
+	/* x^25664 mod p(x), x^25600 mod p(x) */
+	.octa 0x00000000f505000000000000a26f0000
+
+	/* x^24640 mod p(x), x^24576 mod p(x) */
+	.octa 0x0000000064f900000000000065e60000
+
+	/* x^23616 mod p(x), x^23552 mod p(x) */
+	.octa 0x00000000e8c2000000000000aad90000
+
+	/* x^22592 mod p(x), x^22528 mod p(x) */
+	.octa 0x00000000720b000000000000a3b00000
+
+	/* x^21568 mod p(x), x^21504 mod p(x) */
+	.octa 0x00000000e992000000000000d2680000
+
+	/* x^20544 mod p(x), x^20480 mod p(x) */
+	.octa 0x000000009132000000000000cf4c0000
+
+	/* x^19520 mod p(x), x^19456 mod p(x) */
+	.octa 0x00000000608a00000000000076610000
+
+	/* x^18496 mod p(x), x^18432 mod p(x) */
+	.octa 0x000000009948000000000000fb9f0000
+
+	/* x^17472 mod p(x), x^17408 mod p(x) */
+	.octa 0x00000000173000000000000003770000
+
+	/* x^16448 mod p(x), x^16384 mod p(x) */
+	.octa 0x000000006fe300000000000004880000
+
+	/* x^15424 mod p(x), x^15360 mod p(x) */
+	.octa 0x00000000e15300000000000056a70000
+
+	/* x^14400 mod p(x), x^14336 mod p(x) */
+	.octa 0x0000000092d60000000000009dfd0000
+
+	/* x^13376 mod p(x), x^13312 mod p(x) */
+	.octa 0x0000000002fd00000000000074c80000
+
+	/* x^12352 mod p(x), x^12288 mod p(x) */
+	.octa 0x00000000c78b000000000000a3ec0000
+
+	/* x^11328 mod p(x), x^11264 mod p(x) */
+	.octa 0x000000009262000000000000b3530000
+
+	/* x^10304 mod p(x), x^10240 mod p(x) */
+	.octa 0x0000000084f200000000000047bf0000
+
+	/* x^9280 mod p(x), x^9216 mod p(x) */
+	.octa 0x0000000067ee000000000000e97c0000
+
+	/* x^8256 mod p(x), x^8192 mod p(x) */
+	.octa 0x00000000535b00000000000091e10000
+
+	/* x^7232 mod p(x), x^7168 mod p(x) */
+	.octa 0x000000007ebb00000000000055060000
+
+	/* x^6208 mod p(x), x^6144 mod p(x) */
+	.octa 0x00000000c6a1000000000000fd360000
+
+	/* x^5184 mod p(x), x^5120 mod p(x) */
+	.octa 0x000000001be500000000000055860000
+
+	/* x^4160 mod p(x), x^4096 mod p(x) */
+	.octa 0x00000000ae0e0000000000005bd00000
+
+	/* x^3136 mod p(x), x^3072 mod p(x) */
+	.octa 0x0000000022040000000000008db20000
+
+	/* x^2112 mod p(x), x^2048 mod p(x) */
+	.octa 0x00000000c9eb000000000000efe20000
+
+	/* x^1088 mod p(x), x^1024 mod p(x) */
+	.octa 0x0000000039b400000000000051d10000
+
+.short_constants:
+
+	/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+	/* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */
+	.octa 0xefe20000dccf00009440000033590000
+
+	/* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */
+	.octa 0xee6300002f3f000062180000e0ed0000
+
+	/* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */
+	.octa 0xcf5f000017ef0000ccbe000023d30000
+
+	/* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */
+	.octa 0x6d0c0000a30e00000920000042630000
+
+	/* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */
+	.octa 0x21d30000932b0000a7a00000efcc0000
+
+	/* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */
+	.octa 0x10be00000b310000666f00000d1c0000
+
+	/* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */
+	.octa 0x1f240000ce9e0000caad0000589e0000
+
+	/* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */
+	.octa 0x29610000d02b000039b400007cf50000
+
+	/* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */
+	.octa 0x51d100009d9d00003c0e0000bfd60000
+
+	/* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */
+	.octa 0xda390000ceae000013830000713c0000
+
+	/* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */
+	.octa 0xb67800001e16000085c0000080a60000
+
+	/* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */
+	.octa 0x0db40000f7f90000371d0000e6580000
+
+	/* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */
+	.octa 0x87e70000044c0000aadb0000a4970000
+
+	/* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */
+	.octa 0x1f990000ad180000d8b30000e7b50000
+
+	/* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */
+	.octa 0xbe6c00006ee300004c1a000006df0000
+
+	/* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */
+	.octa 0xfb0b00002d560000136800008bb70000
+
+
+.barrett_constants:
+	/* Barrett constant m - (4^32)/n */
+	.octa 0x000000000000000000000001f65a57f8	/* x^64 div p(x) */
+	/* Barrett constant n */
+	.octa 0x0000000000000000000000018bb70000
+
+#define CRC_FUNCTION_NAME __crct10dif_vpmsum
+#include "crc32-vpmsum_core.S"
diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
new file mode 100644
index 000000000000..bebfc329f746
--- /dev/null
+++ b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
@@ -0,0 +1,125 @@
+/*
+ * Calculate a CRC T10-DIF with vpmsum acceleration
+ *
+ * Copyright 2017, Daniel Axtens, IBM Corporation.
+ * [based on crc32c-vpmsum_glue.c]
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/switch_to.h>
+
+#define VMX_ALIGN		16
+#define VMX_ALIGN_MASK		(VMX_ALIGN-1)
+
+#define VECTOR_BREAKPOINT	64
+
+u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
+
+static u16 crct10dif_vpmsum(u16 crci, unsigned char const *p, size_t len)
+{
+	unsigned int prealign;
+	unsigned int tail;
+	u32 crc = crci;
+
+	if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt())
+		return crc_t10dif_generic(crc, p, len);
+
+	if ((unsigned long)p & VMX_ALIGN_MASK) {
+		prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+		crc = crc_t10dif_generic(crc, p, prealign);
+		len -= prealign;
+		p += prealign;
+	}
+
+	if (len & ~VMX_ALIGN_MASK) {
+		crc <<= 16;
+		pagefault_disable();
+		enable_kernel_altivec();
+		crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+		pagefault_enable();
+		crc >>= 16;
+	}
+
+	tail = len & VMX_ALIGN_MASK;
+	if (tail) {
+		p += len & ~VMX_ALIGN_MASK;
+		crc = crc_t10dif_generic(crc, p, tail);
+	}
+
+	return crc & 0xffff;
+}
+
+static int crct10dif_vpmsum_init(struct shash_desc *desc)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*crc = 0;
+	return 0;
+}
+
+static int crct10dif_vpmsum_update(struct shash_desc *desc, const u8 *data,
+			    unsigned int length)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*crc = crct10dif_vpmsum(*crc, data, length);
+
+	return 0;
+}
+
+/* Store the CRC held in the descriptor context into 'out' (host byte order). */
+static int crct10dif_vpmsum_final(struct shash_desc *desc, u8 *out)
+{
+	u16 *crcp = shash_desc_ctx(desc);
+
+	*(u16 *)out = *crcp;
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.init		= crct10dif_vpmsum_init,
+	.update		= crct10dif_vpmsum_update,
+	.final		= crct10dif_vpmsum_final,
+	.descsize	= CRC_T10DIF_DIGEST_SIZE,
+	.digestsize	= CRC_T10DIF_DIGEST_SIZE,
+	.base		= {
+		.cra_name		= "crct10dif",
+		.cra_driver_name	= "crct10dif-vpmsum",
+		.cra_priority		= 200,
+		.cra_blocksize		= CRC_T10DIF_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+};
+
+static int __init crct10dif_vpmsum_mod_init(void)
+{
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return -ENODEV;
+
+	return crypto_register_shash(&alg);
+}
+
+static void __exit crct10dif_vpmsum_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crct10dif_vpmsum_mod_init);
+module_exit(crct10dif_vpmsum_mod_fini);
+
+MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
+MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-vpmsum");
-- 
cgit 


From 146c8688d99c574d9ff0af17eca51bbd6402a57f Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Wed, 15 Mar 2017 23:37:37 +1100
Subject: crypto: powerpc - Stress test for vpmsum implementations

vpmsum implementations often don't kick in for short test vectors.
This is a simple test module that does a configurable number of
random tests, each up to 64kB and each with random offsets.

Both CRC-T10DIF and CRC32C are tested.

Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/powerpc/crypto/Makefile          |   1 +
 arch/powerpc/crypto/crc-vpmsum_test.c | 137 ++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)
 create mode 100644 arch/powerpc/crypto/crc-vpmsum_test.c

(limited to 'arch')

diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index e66aaf19764d..67eca3af9fc7 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
 obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
 obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
+obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
diff --git a/arch/powerpc/crypto/crc-vpmsum_test.c b/arch/powerpc/crypto/crc-vpmsum_test.c
new file mode 100644
index 000000000000..0153a9c6f4af
--- /dev/null
+++ b/arch/powerpc/crypto/crc-vpmsum_test.c
@@ -0,0 +1,137 @@
+/*
+ * CRC vpmsum tester
+ * Copyright 2017 Daniel Axtens, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/switch_to.h>
+
+static unsigned long iterations = 10000;
+
+#define MAX_CRC_LENGTH 65535
+
+
+/*
+ * Module init: push 'iterations' random buffers (random length, random 0-15
+ * byte start offset) through the "crct10dif" and "crc32c" shashes, comparing
+ * each digest with crc_t10dif_generic() / __crc32c_le(); stop at the first
+ * mismatch.  Returns -ENODEV without the CPU feature, -ENOMEM without a
+ * buffer, and 0 otherwise; test results are only reported in the log.
+ */
+static int __init crc_test_init(void)
+{
+	u16 crc16 = 0, verify16 = 0;
+	u32 crc32 = 0, verify32 = 0;
+	__le32 verify32le = 0;
+	unsigned char *data;
+	unsigned long i;
+	int ret;
+	struct crypto_shash *crct10dif_tfm;
+	struct crypto_shash *crc32c_tfm;
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return -ENODEV;
+
+	data = kmalloc(MAX_CRC_LENGTH, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0);
+	if (IS_ERR(crct10dif_tfm)) {
+		pr_err("Error allocating crc-t10dif\n");
+		goto free_buf;
+	}
+
+	crc32c_tfm = crypto_alloc_shash("crc32c", 0, 0);
+	if (IS_ERR(crc32c_tfm)) {
+		pr_err("Error allocating crc32c\n");
+		goto free_16;
+	}
+
+	do {
+		SHASH_DESC_ON_STACK(crct10dif_shash, crct10dif_tfm);
+		SHASH_DESC_ON_STACK(crc32c_shash, crc32c_tfm);
+
+		crct10dif_shash->tfm = crct10dif_tfm;
+		ret = crypto_shash_init(crct10dif_shash);
+		if (ret) {
+			pr_err("Error initing crc-t10dif\n");
+			goto free_32;
+		}
+
+		crc32c_shash->tfm = crc32c_tfm;
+		ret = crypto_shash_init(crc32c_shash);
+		if (ret) {
+			pr_err("Error initing crc32c\n");
+			goto free_32;
+		}
+
+		pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations);
+		for (i=0; i<iterations; i++) {
+			size_t len, offset;
+
+			get_random_bytes(data, MAX_CRC_LENGTH);
+			get_random_bytes(&len, sizeof(len));
+			get_random_bytes(&offset, sizeof(offset));
+
+			len %= MAX_CRC_LENGTH;
+			offset &= 15;
+			if (len <= offset)
+				continue;
+			len -= offset;
+
+			crypto_shash_update(crct10dif_shash, data+offset, len);
+			crypto_shash_final(crct10dif_shash, (u8 *)(&crc16));
+			verify16 = crc_t10dif_generic(verify16, data+offset, len);
+
+			if (crc16 != verify16) {
+				pr_err("FAILURE in CRC16: got 0x%04x expected 0x%04x (len %zu)\n",
+				       crc16, verify16, len);
+				break;
+			}
+
+			crypto_shash_update(crc32c_shash, data+offset, len);
+			crypto_shash_final(crc32c_shash, (u8 *)(&crc32));
+			verify32 = le32_to_cpu(verify32le);
+			verify32le = ~cpu_to_le32(__crc32c_le(~verify32, data+offset, len));
+			if (crc32 != (u32)verify32le) {
+				/* print the value compared above, not the old seed */
+				pr_err("FAILURE in CRC32: got 0x%08x expected 0x%08x (len %zu)\n",
+				       crc32, (u32)verify32le, len);
+				break;
+			}
+		}
+		pr_info("crc-vpmsum_test done, completed %lu iterations\n", i);
+	} while (0);
+
+free_32:
+	crypto_free_shash(crc32c_tfm);
+
+free_16:
+	crypto_free_shash(crct10dif_tfm);
+
+free_buf:
+	kfree(data);
+
+	return 0;
+}
+
+static void __exit crc_test_exit(void) {}
+
+module_init(crc_test_init);
+module_exit(crc_test_exit);
+module_param(iterations, ulong, 0400);	/* variable is unsigned long */
+
+MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
+MODULE_DESCRIPTION("Vector polynomial multiply-sum CRC tester");
+MODULE_LICENSE("GPL");
-- 
cgit 


From fdb2726f4e61c5e3abc052f547d5a5f6c0dc5504 Mon Sep 17 00:00:00 2001
From: Michael Davidson <md@google.com>
Date: Wed, 15 Mar 2017 15:36:00 -0700
Subject: crypto, x86: aesni - fix token pasting for clang

aes_ctrby8_avx-x86_64.S uses the C preprocessor for token pasting
of character sequences that are not valid preprocessor tokens.
While this is allowed when preprocessing assembler files it exposes
an incompatibility between the clang and gcc preprocessors where
clang does not strip leading white space from macro parameters,
leading to the CONCAT(%xmm, i) macro expansion on line 96 resulting
in a token with a space character embedded in it.

While this could be resolved by deleting the offending space character,
the assembler is perfectly capable of doing the token pasting correctly
for itself so we can just get rid of the preprocessor macros.

Signed-off-by: Michael Davidson <md@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aes_ctrby8_avx-x86_64.S | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index a916c4a61165..5f6a5af9c489 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -65,7 +65,6 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 
-#define CONCAT(a,b)	a##b
 #define VMOVDQ		vmovdqu
 
 #define xdata0		%xmm0
@@ -92,8 +91,6 @@
 #define num_bytes	%r8
 
 #define tmp		%r10
-#define	DDQ(i)		CONCAT(ddq_add_,i)
-#define	XMM(i)		CONCAT(%xmm, i)
 #define	DDQ_DATA	0
 #define	XDATA		1
 #define KEY_128		1
@@ -131,12 +128,12 @@ ddq_add_8:
 /* generate a unique variable for ddq_add_x */
 
 .macro setddq n
-	var_ddq_add = DDQ(\n)
+	var_ddq_add = ddq_add_\n
 .endm
 
 /* generate a unique variable for xmm register */
 .macro setxdata n
-	var_xdata = XMM(\n)
+	var_xdata = %xmm\n
 .endm
 
 /* club the numeric 'id' to the symbol 'name' */
-- 
cgit 


From 115d691fc3a40262003802ee841279597ceb3df2 Mon Sep 17 00:00:00 2001
From: Fabien DESSENNE <fabien.dessenne@st.com>
Date: Tue, 21 Mar 2017 16:13:29 +0100
Subject: ARM: dts: stm32: Add CRC support to stm32f746

Add CRC (CRC32 crypto) support to stm32f746.

Signed-off-by: Fabien Dessenne <fabien.dessenne@st.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/boot/dts/stm32f746.dtsi | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch')

diff --git a/arch/arm/boot/dts/stm32f746.dtsi b/arch/arm/boot/dts/stm32f746.dtsi
index f321ffe87144..755fb923c07b 100644
--- a/arch/arm/boot/dts/stm32f746.dtsi
+++ b/arch/arm/boot/dts/stm32f746.dtsi
@@ -289,6 +289,13 @@
 			};
 		};
 
+		crc: crc@40023000 {
+			compatible = "st,stm32f7-crc";
+			reg = <0x40023000 0x400>;
+			clocks = <&rcc 0 12>;
+			status = "disabled";
+		};
+
 		rcc: rcc@40023800 {
 			#clock-cells = <2>;
 			compatible = "st,stm32f42xx-rcc", "st,stm32-rcc";
-- 
cgit 


From 2e3db2931875a843db8b56158cf93edab4286cde Mon Sep 17 00:00:00 2001
From: Fabien DESSENNE <fabien.dessenne@st.com>
Date: Tue, 21 Mar 2017 16:13:30 +0100
Subject: ARM: dts: stm32: enable CRC on stm32746g-eval board

Enable the CRC (CRC32 crypto) on stm32746g-eval board

Signed-off-by: Fabien Dessenne <fabien.dessenne@st.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/boot/dts/stm32746g-eval.dts | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch')

diff --git a/arch/arm/boot/dts/stm32746g-eval.dts b/arch/arm/boot/dts/stm32746g-eval.dts
index aa03fac1ec55..0dc18a0f0940 100644
--- a/arch/arm/boot/dts/stm32746g-eval.dts
+++ b/arch/arm/boot/dts/stm32746g-eval.dts
@@ -89,6 +89,10 @@
 	clock-frequency = <25000000>;
 };
 
+&crc {
+	status = "okay";
+};
+
 &usart1 {
 	pinctrl-0 = <&usart1_pins_a>;
 	pinctrl-names = "default";
-- 
cgit 


From 8fbbcbdd1d259da9bc5a7772e8064aaf5e04ec20 Mon Sep 17 00:00:00 2001
From: Fabien DESSENNE <fabien.dessenne@st.com>
Date: Tue, 21 Mar 2017 16:13:31 +0100
Subject: ARM: configs: stm32: Add crypto support

Add STM32 crypto support in stm32_defconfig file.

Signed-off-by: Fabien Dessenne <fabien.dessenne@st.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/configs/stm32_defconfig | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig
index a9d8e3c9b487..03437f8f9ad1 100644
--- a/arch/arm/configs/stm32_defconfig
+++ b/arch/arm/configs/stm32_defconfig
@@ -75,5 +75,7 @@ CONFIG_MAGIC_SYSRQ=y
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_BUGVERBOSE is not set
 # CONFIG_FTRACE is not set
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_DEV_STM32=y
 CONFIG_CRC_ITU_T=y
 CONFIG_CRC7=y
-- 
cgit 


From e55318c84f199d6056a0bcd98bc4612d01ccfe80 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnáček <omosnacek@gmail.com>
Date: Sun, 2 Apr 2017 21:19:14 +0200
Subject: crypto: gf128mul - switch gf128mul_x_ble to le128

Currently, gf128mul_x_ble works with pointers to be128, even though it
actually interprets the words as little-endian. Consequently, it uses
cpu_to_le64/le64_to_cpu on fields of type __be64, which is incorrect.

This patch fixes that by changing the function to accept pointers to
le128 and updating all users accordingly.

Signed-off-by: Ondrej Mosnacek <omosnacek@gmail.com>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/camellia_glue.c     | 4 ++--
 arch/x86/crypto/serpent_sse2_glue.c | 4 ++--
 arch/x86/crypto/twofish_glue_3way.c | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index aa76cad9d262..af4840ab2a3d 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1522,7 +1522,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[2 * 4];
+	le128 buf[2 * 4];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
@@ -1540,7 +1540,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[2 * 4];
+	le128 buf[2 * 4];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 644f97ab8cac..ac0e831943f5 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -328,7 +328,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	le128 buf[SERPENT_PARALLEL_BLOCKS];
 	struct crypt_priv crypt_ctx = {
 		.ctx = &ctx->crypt_ctx,
 		.fpu_enabled = false,
@@ -355,7 +355,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	le128 buf[SERPENT_PARALLEL_BLOCKS];
 	struct crypt_priv crypt_ctx = {
 		.ctx = &ctx->crypt_ctx,
 		.fpu_enabled = false,
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 2ebb5e9789f3..243e90a4b5d9 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -296,7 +296,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[3];
+	le128 buf[3];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
@@ -314,7 +314,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[3];
+	le128 buf[3];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
-- 
cgit 


From 692016bdf7cf33abaaa7f2b080b47f504c98810c Mon Sep 17 00:00:00 2001
From: Ondrej Mosnáček <omosnacek@gmail.com>
Date: Sun, 2 Apr 2017 21:19:15 +0200
Subject: crypto: glue_helper - remove the le128_gf128mul_x_ble function

The le128_gf128mul_x_ble function in glue_helper.h is now obsolete and
can be replaced with the gf128mul_x_ble function from gf128mul.h.

Signed-off-by: Ondrej Mosnacek <omosnacek@gmail.com>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/glue_helper.c             |  3 ++-
 arch/x86/include/asm/crypto/glue_helper.h | 10 ----------
 2 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 260a060d7275..24ac9fad832d 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -27,6 +27,7 @@
 
 #include <linux/module.h>
 #include <crypto/b128ops.h>
+#include <crypto/gf128mul.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
@@ -457,7 +458,7 @@ void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv,
 	le128 ivblk = *iv;
 
 	/* generate next IV */
-	le128_gf128mul_x_ble(iv, &ivblk);
+	gf128mul_x_ble(iv, &ivblk);
 
 	/* CC <- T xor C */
 	u128_xor(dst, src, (u128 *)&ivblk);
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
index 29e53ea7d764..ed8b66de541f 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -125,16 +125,6 @@ static inline void le128_inc(le128 *i)
 	i->b = cpu_to_le64(b);
 }
 
-static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src)
-{
-	u64 a = le64_to_cpu(src->a);
-	u64 b = le64_to_cpu(src->b);
-	u64 _tt = ((s64)a >> 63) & 0x87;
-
-	dst->a = cpu_to_le64((a << 1) ^ (b >> 63));
-	dst->b = cpu_to_le64((b << 1) ^ _tt);
-}
-
 extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
 				 struct blkcipher_desc *desc,
 				 struct scatterlist *dst,
-- 
cgit 


From 42ae2922a68ac8d68927ccb052b486f34e5fba71 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Wed, 5 Apr 2017 11:34:58 -0700
Subject: crypto: arm64/sha - Add constant operand modifier to ASM_EXPORT

The operand is an integer constant, make the constness explicit by
adding the modifier. This is needed for clang to generate valid code
and also works with gcc.

Also change the constraint of the operand from 'I' ("Integer constant
that is valid as an immediate operand in an ADD instruction", AArch64)
to 'i' ("An immediate integer operand").

Based-on-patch-from: Greg Hackmann <ghackmann@google.com>
Signed-off-by: Greg Hackmann <ghackmann@google.com>
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm64/crypto/sha1-ce-glue.c | 2 +-
 arch/arm64/crypto/sha2-ce-glue.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
index aefda9868627..6b520e3f3ab1 100644
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -18,7 +18,7 @@
 #include <linux/module.h>
 
 #define ASM_EXPORT(sym, val) \
-	asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val));
+	asm(".globl " #sym "; .set " #sym ", %c0" :: "i"(val));
 
 MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index 7cd587564a41..e3abe11de48c 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -18,7 +18,7 @@
 #include <linux/module.h>
 
 #define ASM_EXPORT(sym, val) \
-	asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val));
+	asm(".globl " #sym "; .set " #sym ", %c0" :: "i"(val));
 
 MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-- 
cgit 


From ed067d4a859ff696373324c5061392e013a7561a Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Tue, 11 Apr 2017 20:08:34 +0200
Subject: linux/kernel.h: Add ALIGN_DOWN macro

A few parts of the kernel define their own macro for aligning down, so
provide a common definition for this, with the same usage and assumptions
as the existing ALIGN.

Also convert three existing implementations to this one.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/metag/kernel/stacktrace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch')

diff --git a/arch/metag/kernel/stacktrace.c b/arch/metag/kernel/stacktrace.c
index 91ffc4b75c33..09d67b7f51ca 100644
--- a/arch/metag/kernel/stacktrace.c
+++ b/arch/metag/kernel/stacktrace.c
@@ -31,8 +31,6 @@ static void tbi_boing_init(void)
 }
 #endif
 
-#define ALIGN_DOWN(addr, size)  ((addr)&(~((size)-1)))
-
 /*
  * Unwind the current stack frame and store the new register values in the
  * structure passed as argument. Unwinding is equivalent to a function return,
-- 
cgit 


From 0f89f6e188fa929b6de49c00c73f5731a6bd6bac Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 20 Apr 2017 15:35:09 +1000
Subject: crypto: crct10dif-vpmsum - Fix missing preempt_disable()

In crct10dif_vpmsum() we call enable_kernel_altivec() without first
disabling preemption, which is not allowed.

It used to be sufficient just to call pagefault_disable(), because that
also disabled preemption. But the two were decoupled in commit 8222dbe21e79
("sched/preempt, mm/fault: Decouple preemption from the page fault
logic") in mid 2015.

The crct10dif-vpmsum code inherited this bug from the crc32c-vpmsum code
on which it was modelled.

So add the missing preempt_disable/enable(). We should also call
disable_kernel_altivec(): although it does nothing by default, there is a
debug switch to make it active, and all enables should be paired with
disables.

Fixes: b01df1c16c9a ("crypto: powerpc - Add CRC-T10DIF acceleration")
Acked-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/powerpc/crypto/crct10dif-vpmsum_glue.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
index bebfc329f746..02ea277863d1 100644
--- a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
+++ b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
@@ -44,10 +44,13 @@ static u16 crct10dif_vpmsum(u16 crci, unsigned char const *p, size_t len)
 
 	if (len & ~VMX_ALIGN_MASK) {
 		crc <<= 16;
+		preempt_disable();
 		pagefault_disable();
 		enable_kernel_altivec();
 		crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+		disable_kernel_altivec();
 		pagefault_enable();
+		preempt_enable();
 		crc >>= 16;
 	}
 
-- 
cgit 


From 899f35fabeea0d51fbfa146aaeaa9fc8f3b00292 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 24 Apr 2017 16:09:50 +0800
Subject: Revert "crypto: arm64/sha - Add constant operand modifier to
 ASM_EXPORT"

This reverts commit 42ae2922a68ac8d68927ccb052b486f34e5fba71.  It
causes a regression with older versions of gcc.  The consensus is
that this should instead be fixed in clang.

Reported-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm64/crypto/sha1-ce-glue.c | 2 +-
 arch/arm64/crypto/sha2-ce-glue.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
index 6b520e3f3ab1..aefda9868627 100644
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -18,7 +18,7 @@
 #include <linux/module.h>
 
 #define ASM_EXPORT(sym, val) \
-	asm(".globl " #sym "; .set " #sym ", %c0" :: "i"(val));
+	asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val));
 
 MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index e3abe11de48c..7cd587564a41 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -18,7 +18,7 @@
 #include <linux/module.h>
 
 #define ASM_EXPORT(sym, val) \
-	asm(".globl " #sym "; .set " #sym ", %c0" :: "i"(val));
+	asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val));
 
 MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-- 
cgit