24 files changed, 7221 insertions, 4288 deletions
diff --git a/arch/powerpc/crypto/.gitignore b/arch/powerpc/crypto/.gitignore
index e1094f08f713..e9fe73aac8b6 100644
--- a/arch/powerpc/crypto/.gitignore
+++ b/arch/powerpc/crypto/.gitignore
@@ -1,3 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 aesp10-ppc.S
+aesp8-ppc.S
 ghashp10-ppc.S
+ghashp8-ppc.S
diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 6fc2248ca561..5b315e9756b3 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -2,38 +2,16 @@
 
 menu "Accelerated Cryptographic Algorithms for CPU (powerpc)"
 
-config CRYPTO_CRC32C_VPMSUM
-	tristate "CRC32c"
-	depends on PPC64 && ALTIVEC
-	select CRYPTO_HASH
-	select CRC32
+config CRYPTO_CURVE25519_PPC64
+	tristate "Public key crypto: Curve25519 (PowerPC64)"
+	depends on PPC64 && CPU_LITTLE_ENDIAN
+	select CRYPTO_LIB_CURVE25519_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CURVE25519
 	help
-	  CRC32c CRC algorithm with the iSCSI polynomial (RFC 3385 and RFC 3720)
-
-	  Architecture: powerpc64 using
-	  - AltiVec extensions
-
-	  Enable on POWER8 and newer processors for improved performance.
-
-config CRYPTO_CRCT10DIF_VPMSUM
-	tristate "CRC32T10DIF"
-	depends on PPC64 && ALTIVEC && CRC_T10DIF
-	select CRYPTO_HASH
-	help
-	  CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF)
-
-	  Architecture: powerpc64 using
-	  - AltiVec extensions
+	  Curve25519 algorithm
 
-	  Enable on POWER8 and newer processors for improved performance.
-
-config CRYPTO_VPMSUM_TESTER
-	tristate "CRC32c and CRC32T10DIF hardware acceleration tester"
-	depends on CRYPTO_CRCT10DIF_VPMSUM && CRYPTO_CRC32C_VPMSUM
-	help
-	  Stress test for CRC32c and CRCT10DIF algorithms implemented with
-	  powerpc64 AltiVec extensions (POWER8 vpmsum instructions).
-	  Unless you are testing these algorithms, you don't need this.
+	  Architecture: PowerPC64
+	  - Little-endian
 
 config CRYPTO_MD5_PPC
 	tristate "Digests: MD5"
@@ -101,6 +79,7 @@ config CRYPTO_AES_GCM_P10
 	select CRYPTO_ALGAPI
 	select CRYPTO_AEAD
 	select CRYPTO_SKCIPHER
+	select CRYPTO_SIMD
 	help
 	  AEAD cipher: AES cipher algorithms (FIPS-197)
 	  GCM (Galois/Counter Mode) authenticated encryption mode (NIST SP800-38D)
@@ -137,4 +116,24 @@ config CRYPTO_POLY1305_P10
 	  - Power10 or later
 	  - Little-endian
 
+config CRYPTO_DEV_VMX
+        bool "Support for VMX cryptographic acceleration instructions"
+        depends on PPC64 && VSX
+        help
+          Support for VMX cryptographic acceleration instructions.
+
+config CRYPTO_DEV_VMX_ENCRYPT
+	tristate "Encryption acceleration support on P8 CPU"
+	depends on CRYPTO_DEV_VMX
+	select CRYPTO_AES
+	select CRYPTO_CBC
+	select CRYPTO_CTR
+	select CRYPTO_GHASH
+	select CRYPTO_XTS
+	default m
+	help
+	  Support for VMX cryptographic acceleration instructions on Power8 CPU.
+	  This module supports acceleration for AES and GHASH in hardware. If you
+	  choose 'M' here, this module will be called vmx-crypto.
+
 endmenu
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index ebdac1b9eb9a..9b38f4a7bc15 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -10,31 +10,44 @@ obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o
 obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
 obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
 obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
-obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
-obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
-obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
 obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
 obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
 obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
+obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
+obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
 sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
 sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
 sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
-crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
-crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
 aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
 chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
 poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
+vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
+curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o
+
+ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
+override flavour := linux-ppc64le
+else
+ifdef CONFIG_PPC64_ELF_ABI_V2
+override flavour := linux-ppc64-elfv2
+else
+override flavour := linux-ppc64
+endif
+endif
 
 quiet_cmd_perl = PERL    $@
-      cmd_perl = $(PERL) $< $(if $(CONFIG_CPU_LITTLE_ENDIAN), linux-ppc64le, linux-ppc64) > $@
+      cmd_perl = $(PERL) $< $(flavour) > $@
 
-targets += aesp10-ppc.S ghashp10-ppc.S
+targets += aesp10-ppc.S ghashp10-ppc.S aesp8-ppc.S ghashp8-ppc.S
 
 $(obj)/aesp10-ppc.S $(obj)/ghashp10-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE
 	$(call if_changed,perl)
 
+$(obj)/aesp8-ppc.S $(obj)/ghashp8-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE
+	$(call if_changed,perl)
+
 OBJECT_FILES_NON_STANDARD_aesp10-ppc.o := y
 OBJECT_FILES_NON_STANDARD_ghashp10-ppc.o := y
+OBJECT_FILES_NON_STANDARD_aesp8-ppc.o := y
diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c
index f62ee54076c0..679f52794baf 100644
--- a/arch/powerpc/crypto/aes-gcm-p10-glue.c
+++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c
@@ -5,9 +5,10 @@
  * Copyright 2022- IBM Inc. All rights reserved
  */
 
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
 #include <asm/simd.h>
 #include <asm/switch_to.h>
+#include <crypto/gcm.h>
 #include <crypto/aes.h>
 #include <crypto/algapi.h>
 #include <crypto/b128ops.h>
@@ -24,6 +25,7 @@
 
 #define	PPC_ALIGN		16
 #define GCM_IV_SIZE		12
+#define RFC4106_NONCE_SIZE	4
 
 MODULE_DESCRIPTION("PPC64le AES-GCM with Stitched implementation");
 MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com");
@@ -31,7 +33,7 @@ MODULE_LICENSE("GPL v2");
 MODULE_ALIAS_CRYPTO("aes");
 
 asmlinkage int aes_p10_set_encrypt_key(const u8 *userKey, const int bits,
-				      void *key);
+				       void *key);
 asmlinkage void aes_p10_encrypt(const u8 *in, u8 *out, const void *key);
 asmlinkage void aes_p10_gcm_encrypt(u8 *in, u8 *out, size_t len,
 				    void *rkey, u8 *iv, void *Xi);
@@ -39,7 +41,8 @@ asmlinkage void aes_p10_gcm_decrypt(u8 *in, u8 *out, size_t len,
 				    void *rkey, u8 *iv, void *Xi);
 asmlinkage void gcm_init_htable(unsigned char htable[], unsigned char Xi[]);
 asmlinkage void gcm_ghash_p10(unsigned char *Xi, unsigned char *Htable,
-		unsigned char *aad, unsigned int alen);
+			      unsigned char *aad, unsigned int alen);
+asmlinkage void gcm_update(u8 *iv, void *Xi);
 
 struct aes_key {
 	u8 key[AES_MAX_KEYLENGTH];
@@ -52,6 +55,7 @@ struct gcm_ctx {
 	u8 aad_hash[16];
 	u64 aadLen;
 	u64 Plen;	/* offset 56 - used in aes_p10_gcm_{en/de}crypt */
+	u8 pblock[16];
 };
 struct Hash_ctx {
 	u8 H[16];	/* subkey */
@@ -60,17 +64,20 @@ struct Hash_ctx {
 
 struct p10_aes_gcm_ctx {
 	struct aes_key enc_key;
+	u8 nonce[RFC4106_NONCE_SIZE];
 };
 
 static void vsx_begin(void)
 {
 	preempt_disable();
+	pagefault_disable();
 	enable_kernel_vsx();
 }
 
 static void vsx_end(void)
 {
 	disable_kernel_vsx();
+	pagefault_enable();
 	preempt_enable();
 }
 
@@ -185,7 +192,7 @@ static int set_authsize(struct crypto_aead *tfm, unsigned int authsize)
 }
 
 static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
-			     unsigned int keylen)
+			      unsigned int keylen)
 {
 	struct crypto_tfm *tfm = crypto_aead_tfm(aead);
 	struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -198,7 +205,8 @@ static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
 	return ret ? -EINVAL : 0;
 }
 
-static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
+static int p10_aes_gcm_crypt(struct aead_request *req, u8 *riv,
+			     int assoclen, int enc)
 {
 	struct crypto_tfm *tfm = req->base.tfm;
 	struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -206,11 +214,9 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
 	struct gcm_ctx *gctx = PTR_ALIGN((void *)databuf, PPC_ALIGN);
 	u8 hashbuf[sizeof(struct Hash_ctx) + PPC_ALIGN];
 	struct Hash_ctx *hash = PTR_ALIGN((void *)hashbuf, PPC_ALIGN);
-	struct scatter_walk assoc_sg_walk;
 	struct skcipher_walk walk;
 	u8 *assocmem = NULL;
 	u8 *assoc;
-	unsigned int assoclen = req->assoclen;
 	unsigned int cryptlen = req->cryptlen;
 	unsigned char ivbuf[AES_BLOCK_SIZE+PPC_ALIGN];
 	unsigned char *iv = PTR_ALIGN((void *)ivbuf, PPC_ALIGN);
@@ -218,16 +224,16 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
 	unsigned long auth_tag_len = crypto_aead_authsize(__crypto_aead_cast(tfm));
 	u8 otag[16];
 	int total_processed = 0;
+	int nbytes;
 
 	memset(databuf, 0, sizeof(databuf));
 	memset(hashbuf, 0, sizeof(hashbuf));
 	memset(ivbuf, 0, sizeof(ivbuf));
-	memcpy(iv, req->iv, GCM_IV_SIZE);
+	memcpy(iv, riv, GCM_IV_SIZE);
 
 	/* Linearize assoc, if not already linear */
 	if (req->src->length >= assoclen && req->src->length) {
-		scatterwalk_start(&assoc_sg_walk, req->src);
-		assoc = scatterwalk_map(&assoc_sg_walk);
+		assoc = sg_virt(req->src); /* ppc64 is !HIGHMEM */
 	} else {
 		gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
 			      GFP_KERNEL : GFP_ATOMIC;
@@ -245,10 +251,7 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
 	gcmp10_init(gctx, iv, (unsigned char *) &ctx->enc_key, hash, assoc, assoclen);
 	vsx_end();
 
-	if (!assocmem)
-		scatterwalk_unmap(assoc);
-	else
-		kfree(assocmem);
+	kfree(assocmem);
 
 	if (enc)
 		ret = skcipher_walk_aead_encrypt(&walk, req, false);
@@ -257,19 +260,25 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
 	if (ret)
 		return ret;
 
-	while (walk.nbytes > 0 && ret == 0) {
+	while ((nbytes = walk.nbytes) > 0 && ret == 0) {
+		u8 *src = walk.src.virt.addr;
+		u8 *dst = walk.dst.virt.addr;
+		u8 buf[AES_BLOCK_SIZE];
+
+		if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+			src = dst = memcpy(buf, src, nbytes);
 
 		vsx_begin();
 		if (enc)
-			aes_p10_gcm_encrypt(walk.src.virt.addr,
-					    walk.dst.virt.addr,
-					    walk.nbytes,
+			aes_p10_gcm_encrypt(src, dst, nbytes,
 					    &ctx->enc_key, gctx->iv, hash->Htable);
 		else
-			aes_p10_gcm_decrypt(walk.src.virt.addr,
-					    walk.dst.virt.addr,
-					    walk.nbytes,
+			aes_p10_gcm_decrypt(src, dst, nbytes,
 					    &ctx->enc_key, gctx->iv, hash->Htable);
+
+		if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+			memcpy(walk.dst.virt.addr, buf, nbytes);
+
 		vsx_end();
 
 		total_processed += walk.nbytes;
@@ -281,6 +290,7 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
 
 	/* Finalize hash */
 	vsx_begin();
+	gcm_update(gctx->iv, hash->Htable);
 	finish_tag(gctx, hash, total_processed);
 	vsx_end();
 
@@ -302,17 +312,63 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
 	return 0;
 }
 
+static int rfc4106_setkey(struct crypto_aead *tfm, const u8 *inkey,
+			  unsigned int keylen)
+{
+	struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(tfm);
+	int err;
+
+	keylen -= RFC4106_NONCE_SIZE;
+	err = p10_aes_gcm_setkey(tfm, inkey, keylen);
+	if (err)
+		return err;
+
+	memcpy(ctx->nonce, inkey + keylen, RFC4106_NONCE_SIZE);
+	return 0;
+}
+
+static int rfc4106_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+	return crypto_rfc4106_check_authsize(authsize);
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(aead);
+	u8 iv[AES_BLOCK_SIZE];
+
+	memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE);
+	memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE);
+
+	return crypto_ipsec_check_assoclen(req->assoclen) ?:
+	       p10_aes_gcm_crypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE, 1);
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(aead);
+	u8 iv[AES_BLOCK_SIZE];
+
+	memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE);
+	memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE);
+
+	return crypto_ipsec_check_assoclen(req->assoclen) ?:
+	       p10_aes_gcm_crypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE, 0);
+}
+
 static int p10_aes_gcm_encrypt(struct aead_request *req)
 {
-	return p10_aes_gcm_crypt(req, 1);
+	return p10_aes_gcm_crypt(req, req->iv, req->assoclen, 1);
 }
 
 static int p10_aes_gcm_decrypt(struct aead_request *req)
 {
-	return p10_aes_gcm_crypt(req, 0);
+	return p10_aes_gcm_crypt(req, req->iv, req->assoclen, 0);
 }
 
-static struct aead_alg gcm_aes_alg = {
+static struct aead_alg gcm_aes_algs[] = {{
 	.ivsize			= GCM_IV_SIZE,
 	.maxauthsize		= 16,
 
@@ -321,23 +377,57 @@ static struct aead_alg gcm_aes_alg = {
 	.encrypt		= p10_aes_gcm_encrypt,
 	.decrypt		= p10_aes_gcm_decrypt,
 
-	.base.cra_name		= "gcm(aes)",
-	.base.cra_driver_name	= "aes_gcm_p10",
+	.base.cra_name		= "__gcm(aes)",
+	.base.cra_driver_name	= "__aes_gcm_p10",
 	.base.cra_priority	= 2100,
 	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct p10_aes_gcm_ctx),
+	.base.cra_ctxsize	= sizeof(struct p10_aes_gcm_ctx)+
+				  4 * sizeof(u64[2]),
 	.base.cra_module	= THIS_MODULE,
-};
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+}, {
+	.ivsize			= GCM_RFC4106_IV_SIZE,
+	.maxauthsize		= 16,
+	.setkey			= rfc4106_setkey,
+	.setauthsize		= rfc4106_setauthsize,
+	.encrypt		= rfc4106_encrypt,
+	.decrypt		= rfc4106_decrypt,
+
+	.base.cra_name		= "__rfc4106(gcm(aes))",
+	.base.cra_driver_name	= "__rfc4106_aes_gcm_p10",
+	.base.cra_priority	= 2100,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct p10_aes_gcm_ctx) +
+				  4 * sizeof(u64[2]),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+}};
+
+static struct simd_aead_alg *p10_simd_aeads[ARRAY_SIZE(gcm_aes_algs)];
 
 static int __init p10_init(void)
 {
-	return crypto_register_aead(&gcm_aes_alg);
+	int ret;
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_31))
+		return 0;
+
+	ret = simd_register_aeads_compat(gcm_aes_algs,
+					 ARRAY_SIZE(gcm_aes_algs),
+					 p10_simd_aeads);
+	if (ret) {
+		simd_unregister_aeads(gcm_aes_algs, ARRAY_SIZE(gcm_aes_algs),
+				      p10_simd_aeads);
+		return ret;
+	}
+	return 0;
 }
 
 static void __exit p10_exit(void)
 {
-	crypto_unregister_aead(&gcm_aes_alg);
+	simd_unregister_aeads(gcm_aes_algs, ARRAY_SIZE(gcm_aes_algs),
+			      p10_simd_aeads);
 }
 
-module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init);
+module_init(p10_init);
 module_exit(p10_exit);
diff --git a/arch/powerpc/crypto/aes-gcm-p10.S b/arch/powerpc/crypto/aes-gcm-p10.S
index a51f4b265308..89f50eef3512 100644
--- a/arch/powerpc/crypto/aes-gcm-p10.S
+++ b/arch/powerpc/crypto/aes-gcm-p10.S
@@ -1,42 +1,42 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
- #
- # Accelerated AES-GCM stitched implementation for ppc64le.
- #
- # Copyright 2022- IBM Inc. All rights reserved
- #
- #===================================================================================
- # Written by Danny Tsen <dtsen@linux.ibm.com>
- #
- # GHASH is based on the Karatsuba multiplication method.
- #
- #    Xi xor X1
- #
- #    X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
- #      (X1.h * H4.h + xX.l * H4.l + X1 * H4) +
- #      (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
- #      (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
- #      (X4.h * H.h + X4.l * H.l + X4 * H)
- #
- # Xi = v0
- # H Poly = v2
- # Hash keys = v3 - v14
- #     ( H.l, H, H.h)
- #     ( H^2.l, H^2, H^2.h)
- #     ( H^3.l, H^3, H^3.h)
- #     ( H^4.l, H^4, H^4.h)
- #
- # v30 is IV
- # v31 - counter 1
- #
- # AES used,
- #     vs0 - vs14 for round keys
- #     v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
- #
- # This implementation uses stitched AES-GCM approach to improve overall performance.
- # AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
- #
- # ===================================================================================
- #
+#
+# Accelerated AES-GCM stitched implementation for ppc64le.
+#
+# Copyright 2024- IBM Inc.
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# GHASH is based on the Karatsuba multiplication method.
+#
+#    Xi xor X1
+#
+#    X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
+#      (X1.h * H4.h + xX.l * H4.l + X1 * H4) +
+#      (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
+#      (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
+#      (X4.h * H.h + X4.l * H.l + X4 * H)
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+#     ( H.l, H, H.h)
+#     ( H^2.l, H^2, H^2.h)
+#     ( H^3.l, H^3, H^3.h)
+#     ( H^4.l, H^4, H^4.h)
+#
+# v30 is IV
+# v31 - counter 1
+#
+# AES used,
+#     vs0 - round key 0
+#     v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
+#
+# This implementation uses stitched AES-GCM approach to improve overall performance.
+# AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
+#
+# ===================================================================================
+#
 
 #include <asm/ppc_asm.h>
 #include <linux/linkage.h>
@@ -44,483 +44,224 @@
 .machine        "any"
 .text
 
- # 4x loops
- # v15 - v18 - input states
- # vs1 - vs9 - round keys
- #
-.macro Loop_aes_middle4x
-	xxlor	19+32, 1, 1
-	xxlor	20+32, 2, 2
-	xxlor	21+32, 3, 3
-	xxlor	22+32, 4, 4
-
-	vcipher	15, 15, 19
-	vcipher	16, 16, 19
-	vcipher	17, 17, 19
-	vcipher	18, 18, 19
-
-	vcipher	15, 15, 20
-	vcipher	16, 16, 20
-	vcipher	17, 17, 20
-	vcipher	18, 18, 20
-
-	vcipher	15, 15, 21
-	vcipher	16, 16, 21
-	vcipher	17, 17, 21
-	vcipher	18, 18, 21
-
-	vcipher	15, 15, 22
-	vcipher	16, 16, 22
-	vcipher	17, 17, 22
-	vcipher	18, 18, 22
-
-	xxlor	19+32, 5, 5
-	xxlor	20+32, 6, 6
-	xxlor	21+32, 7, 7
-	xxlor	22+32, 8, 8
-
-	vcipher	15, 15, 19
-	vcipher	16, 16, 19
-	vcipher	17, 17, 19
-	vcipher	18, 18, 19
-
-	vcipher	15, 15, 20
-	vcipher	16, 16, 20
-	vcipher	17, 17, 20
-	vcipher	18, 18, 20
-
-	vcipher	15, 15, 21
-	vcipher	16, 16, 21
-	vcipher	17, 17, 21
-	vcipher	18, 18, 21
-
-	vcipher	15, 15, 22
-	vcipher	16, 16, 22
-	vcipher	17, 17, 22
-	vcipher	18, 18, 22
-
-	xxlor	23+32, 9, 9
-	vcipher	15, 15, 23
-	vcipher	16, 16, 23
-	vcipher	17, 17, 23
-	vcipher	18, 18, 23
+.macro	SAVE_GPR GPR OFFSET FRAME
+	std	\GPR,\OFFSET(\FRAME)
 .endm
 
- # 8x loops
- # v15 - v22 - input states
- # vs1 - vs9 - round keys
- #
-.macro Loop_aes_middle8x
-	xxlor	23+32, 1, 1
-	xxlor	24+32, 2, 2
-	xxlor	25+32, 3, 3
-	xxlor	26+32, 4, 4
-
-	vcipher	15, 15, 23
-	vcipher	16, 16, 23
-	vcipher	17, 17, 23
-	vcipher	18, 18, 23
-	vcipher	19, 19, 23
-	vcipher	20, 20, 23
-	vcipher	21, 21, 23
-	vcipher	22, 22, 23
-
-	vcipher	15, 15, 24
-	vcipher	16, 16, 24
-	vcipher	17, 17, 24
-	vcipher	18, 18, 24
-	vcipher	19, 19, 24
-	vcipher	20, 20, 24
-	vcipher	21, 21, 24
-	vcipher	22, 22, 24
-
-	vcipher	15, 15, 25
-	vcipher	16, 16, 25
-	vcipher	17, 17, 25
-	vcipher	18, 18, 25
-	vcipher	19, 19, 25
-	vcipher	20, 20, 25
-	vcipher	21, 21, 25
-	vcipher	22, 22, 25
-
-	vcipher	15, 15, 26
-	vcipher	16, 16, 26
-	vcipher	17, 17, 26
-	vcipher	18, 18, 26
-	vcipher	19, 19, 26
-	vcipher	20, 20, 26
-	vcipher	21, 21, 26
-	vcipher	22, 22, 26
-
-	xxlor	23+32, 5, 5
-	xxlor	24+32, 6, 6
-	xxlor	25+32, 7, 7
-	xxlor	26+32, 8, 8
-
-	vcipher	15, 15, 23
-	vcipher	16, 16, 23
-	vcipher	17, 17, 23
-	vcipher	18, 18, 23
-	vcipher	19, 19, 23
-	vcipher	20, 20, 23
-	vcipher	21, 21, 23
-	vcipher	22, 22, 23
-
-	vcipher	15, 15, 24
-	vcipher	16, 16, 24
-	vcipher	17, 17, 24
-	vcipher	18, 18, 24
-	vcipher	19, 19, 24
-	vcipher	20, 20, 24
-	vcipher	21, 21, 24
-	vcipher	22, 22, 24
-
-	vcipher	15, 15, 25
-	vcipher	16, 16, 25
-	vcipher	17, 17, 25
-	vcipher	18, 18, 25
-	vcipher	19, 19, 25
-	vcipher	20, 20, 25
-	vcipher	21, 21, 25
-	vcipher	22, 22, 25
-
-	vcipher	15, 15, 26
-	vcipher	16, 16, 26
-	vcipher	17, 17, 26
-	vcipher	18, 18, 26
-	vcipher	19, 19, 26
-	vcipher	20, 20, 26
-	vcipher	21, 21, 26
-	vcipher	22, 22, 26
-
-	xxlor	23+32, 9, 9
-	vcipher	15, 15, 23
-	vcipher	16, 16, 23
-	vcipher	17, 17, 23
-	vcipher	18, 18, 23
-	vcipher	19, 19, 23
-	vcipher	20, 20, 23
-	vcipher	21, 21, 23
-	vcipher	22, 22, 23
+.macro	SAVE_VRS VRS OFFSET FRAME
+	stxv	\VRS+32, \OFFSET(\FRAME)
 .endm
 
-.macro Loop_aes_middle_1x
-	xxlor	19+32, 1, 1
-	xxlor	20+32, 2, 2
-	xxlor	21+32, 3, 3
-	xxlor	22+32, 4, 4
-
-	vcipher 15, 15, 19
-	vcipher 15, 15, 20
-	vcipher 15, 15, 21
-	vcipher 15, 15, 22
-
-	xxlor	19+32, 5, 5
-	xxlor	20+32, 6, 6
-	xxlor	21+32, 7, 7
-	xxlor	22+32, 8, 8
-
-	vcipher 15, 15, 19
-	vcipher 15, 15, 20
-	vcipher 15, 15, 21
-	vcipher 15, 15, 22
-
-	xxlor	19+32, 9, 9
-	vcipher 15, 15, 19
+.macro	RESTORE_GPR GPR OFFSET FRAME
+	ld	\GPR,\OFFSET(\FRAME)
 .endm
 
- #
- # Compute 4x hash values based on Karatsuba method.
- #
-.macro ppc_aes_gcm_ghash
-	vxor		15, 15, 0
-
-	vpmsumd		23, 12, 15		# H4.L * X.L
-	vpmsumd		24, 9, 16
-	vpmsumd		25, 6, 17
-	vpmsumd		26, 3, 18
-
-	vxor		23, 23, 24
-	vxor		23, 23, 25
-	vxor		23, 23, 26		# L
-
-	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
-	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
-	vpmsumd		26, 7, 17
-	vpmsumd		27, 4, 18
-
-	vxor		24, 24, 25
-	vxor		24, 24, 26
-	vxor		24, 24, 27		# M
-
-	# sum hash and reduction with H Poly
-	vpmsumd		28, 23, 2		# reduction
-
-	vxor		29, 29, 29
-	vsldoi		26, 24, 29, 8		# mL
-	vsldoi		29, 29, 24, 8		# mH
-	vxor		23, 23, 26		# mL + L
-
-	vsldoi		23, 23, 23, 8		# swap
-	vxor		23, 23, 28
-
-	vpmsumd		24, 14, 15		# H4.H * X.H
-	vpmsumd		25, 11, 16
-	vpmsumd		26, 8, 17
-	vpmsumd		27, 5, 18
-
-	vxor		24, 24, 25
-	vxor		24, 24, 26
-	vxor		24, 24, 27
-
-	vxor		24, 24, 29
-
-	# sum hash and reduction with H Poly
-	vsldoi		27, 23, 23, 8		# swap
-	vpmsumd		23, 23, 2
-	vxor		27, 27, 24
-	vxor		23, 23, 27
-
-	xxlor		32, 23+32, 23+32		# update hash
-
+.macro	RESTORE_VRS VRS OFFSET FRAME
+	lxv	\VRS+32, \OFFSET(\FRAME)
 .endm
 
- #
- # Combine two 4x ghash
- # v15 - v22 - input blocks
- #
-.macro ppc_aes_gcm_ghash2_4x
-	# first 4x hash
-	vxor		15, 15, 0		# Xi + X
-
-	vpmsumd		23, 12, 15		# H4.L * X.L
-	vpmsumd		24, 9, 16
-	vpmsumd		25, 6, 17
-	vpmsumd		26, 3, 18
-
-	vxor		23, 23, 24
-	vxor		23, 23, 25
-	vxor		23, 23, 26		# L
-
-	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
-	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
-	vpmsumd		26, 7, 17
-	vpmsumd		27, 4, 18
-
-	vxor		24, 24, 25
-	vxor		24, 24, 26
-
-	# sum hash and reduction with H Poly
-	vpmsumd		28, 23, 2		# reduction
-
-	vxor		29, 29, 29
-
-	vxor		24, 24, 27		# M
-	vsldoi		26, 24, 29, 8		# mL
-	vsldoi		29, 29, 24, 8		# mH
-	vxor		23, 23, 26		# mL + L
-
-	vsldoi		23, 23, 23, 8		# swap
-	vxor		23, 23, 28
+.macro SAVE_REGS
+	mflr 0
+	std 0, 16(1)
+	stdu 1,-512(1)
+
+	SAVE_GPR 14, 112, 1
+	SAVE_GPR 15, 120, 1
+	SAVE_GPR 16, 128, 1
+	SAVE_GPR 17, 136, 1
+	SAVE_GPR 18, 144, 1
+	SAVE_GPR 19, 152, 1
+	SAVE_GPR 20, 160, 1
+	SAVE_GPR 21, 168, 1
+	SAVE_GPR 22, 176, 1
+	SAVE_GPR 23, 184, 1
+	SAVE_GPR 24, 192, 1
+
+	addi	9, 1, 256
+	SAVE_VRS 20, 0, 9
+	SAVE_VRS 21, 16, 9
+	SAVE_VRS 22, 32, 9
+	SAVE_VRS 23, 48, 9
+	SAVE_VRS 24, 64, 9
+	SAVE_VRS 25, 80, 9
+	SAVE_VRS 26, 96, 9
+	SAVE_VRS 27, 112, 9
+	SAVE_VRS 28, 128, 9
+	SAVE_VRS 29, 144, 9
+	SAVE_VRS 30, 160, 9
+	SAVE_VRS 31, 176, 9
+.endm # SAVE_REGS
 
-	vpmsumd		24, 14, 15		# H4.H * X.H
-	vpmsumd		25, 11, 16
-	vpmsumd		26, 8, 17
-	vpmsumd		27, 5, 18
+.macro RESTORE_REGS
+	addi	9, 1, 256
+	RESTORE_VRS 20, 0, 9
+	RESTORE_VRS 21, 16, 9
+	RESTORE_VRS 22, 32, 9
+	RESTORE_VRS 23, 48, 9
+	RESTORE_VRS 24, 64, 9
+	RESTORE_VRS 25, 80, 9
+	RESTORE_VRS 26, 96, 9
+	RESTORE_VRS 27, 112, 9
+	RESTORE_VRS 28, 128, 9
+	RESTORE_VRS 29, 144, 9
+	RESTORE_VRS 30, 160, 9
+	RESTORE_VRS 31, 176, 9
+
+	RESTORE_GPR 14, 112, 1
+	RESTORE_GPR 15, 120, 1
+	RESTORE_GPR 16, 128, 1
+	RESTORE_GPR 17, 136, 1
+	RESTORE_GPR 18, 144, 1
+	RESTORE_GPR 19, 152, 1
+	RESTORE_GPR 20, 160, 1
+	RESTORE_GPR 21, 168, 1
+	RESTORE_GPR 22, 176, 1
+	RESTORE_GPR 23, 184, 1
+	RESTORE_GPR 24, 192, 1
+
+	addi    1, 1, 512
+	ld 0, 16(1)
+	mtlr 0
+.endm # RESTORE_REGS
+
+# 4x loops
+.macro AES_CIPHER_4x _VCIPHER ST r
+	\_VCIPHER	\ST, \ST, \r
+	\_VCIPHER	\ST+1, \ST+1, \r
+	\_VCIPHER	\ST+2, \ST+2, \r
+	\_VCIPHER	\ST+3, \ST+3, \r
+.endm
 
-	vxor		24, 24, 25
-	vxor		24, 24, 26
-	vxor		24, 24, 27		# H
+# 8x loops
+.macro AES_CIPHER_8x _VCIPHER ST r
+	\_VCIPHER	\ST, \ST, \r
+	\_VCIPHER	\ST+1, \ST+1, \r
+	\_VCIPHER	\ST+2, \ST+2, \r
+	\_VCIPHER	\ST+3, \ST+3, \r
+	\_VCIPHER	\ST+4, \ST+4, \r
+	\_VCIPHER	\ST+5, \ST+5, \r
+	\_VCIPHER	\ST+6, \ST+6, \r
+	\_VCIPHER	\ST+7, \ST+7, \r
+.endm
 
-	vxor		24, 24, 29		# H + mH
+.macro LOOP_8AES_STATE
+	xxlor	32+23, 1, 1
+	xxlor	32+24, 2, 2
+	xxlor	32+25, 3, 3
+	xxlor	32+26, 4, 4
+	AES_CIPHER_8x vcipher, 15, 23
+	AES_CIPHER_8x vcipher, 15, 24
+	AES_CIPHER_8x vcipher, 15, 25
+	AES_CIPHER_8x vcipher, 15, 26
+	xxlor	32+23, 5, 5
+	xxlor	32+24, 6, 6
+	xxlor	32+25, 7, 7
+	xxlor	32+26, 8, 8
+	AES_CIPHER_8x vcipher, 15, 23
+	AES_CIPHER_8x vcipher, 15, 24
+	AES_CIPHER_8x vcipher, 15, 25
+	AES_CIPHER_8x vcipher, 15, 26
+.endm
 
-	# sum hash and reduction with H Poly
-	vsldoi		27, 23, 23, 8		# swap
-	vpmsumd		23, 23, 2
-	vxor		27, 27, 24
-	vxor		27, 23, 27		# 1st Xi
-
-	# 2nd 4x hash
-	vpmsumd		24, 9, 20
-	vpmsumd		25, 6, 21
-	vpmsumd		26, 3, 22
-	vxor		19, 19, 27		# Xi + X
-	vpmsumd		23, 12, 19		# H4.L * X.L
-
-	vxor		23, 23, 24
-	vxor		23, 23, 25
-	vxor		23, 23, 26		# L
-
-	vpmsumd		24, 13, 19		# H4.L * X.H + H4.H * X.L
-	vpmsumd		25, 10, 20		# H3.L * X1.H + H3.H * X1.L
-	vpmsumd		26, 7, 21
-	vpmsumd		27, 4, 22
-
-	vxor		24, 24, 25
-	vxor		24, 24, 26
+#
+# PPC_GHASH4x(H, S1, S2, S3, S4): Compute 4x hash values based on Karatsuba method.
+# H: returning digest
+# S#: states
+#
+# S1 should xor with the previous digest
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+# Scratch: v23 - v29
+#
+.macro PPC_GHASH4x H S1 S2 S3 S4
+
+	vpmsumd	23, 12, \S1		# H4.L * X.L
+	vpmsumd	24, 9, \S2
+	vpmsumd	25, 6, \S3
+	vpmsumd	26, 3, \S4
+
+	vpmsumd	27, 13, \S1		# H4.L * X.H + H4.H * X.L
+	vpmsumd	28, 10, \S2		# H3.L * X1.H + H3.H * X1.L
+
+	vxor	23, 23, 24
+	vxor	23, 23, 25
+	vxor	23, 23, 26		# L
+
+	vxor	24, 27, 28
+	vpmsumd	25, 7, \S3
+	vpmsumd	26, 4, \S4
+
+	vxor	24, 24, 25
+	vxor	24, 24, 26		# M
 
 	# sum hash and reduction with H Poly
-	vpmsumd		28, 23, 2		# reduction
-
-	vxor		29, 29, 29
+	vpmsumd	28, 23, 2		# reduction
 
-	vxor		24, 24, 27		# M
-	vsldoi		26, 24, 29, 8		# mL
-	vsldoi		29, 29, 24, 8		# mH
-	vxor		23, 23, 26		# mL + L
+	vxor	1, 1, 1
+	vsldoi	25, 24, 1, 8		# mL
+	vsldoi	1, 1, 24, 8		# mH
+	vxor	23, 23, 25		# mL + L
 
-	vsldoi		23, 23, 23, 8		# swap
-	vxor		23, 23, 28
+	# This performs swap and xor like,
+	#   vsldoi	23, 23, 23, 8		# swap
+	#   vxor	23, 23, 28
+	xxlor	32+25, 10, 10
+	vpermxor 23, 23, 28, 25
 
-	vpmsumd		24, 14, 19		# H4.H * X.H
-	vpmsumd		25, 11, 20
-	vpmsumd		26, 8, 21
-	vpmsumd		27, 5, 22
+	vpmsumd	26, 14, \S1		# H4.H * X.H
+	vpmsumd	27, 11, \S2
+	vpmsumd	28, 8, \S3
+	vpmsumd	29, 5, \S4
 
-	vxor		24, 24, 25
-	vxor		24, 24, 26
-	vxor		24, 24, 27		# H
+	vxor	24, 26, 27
+	vxor	24, 24, 28
+	vxor	24, 24, 29
 
-	vxor		24, 24, 29		# H + mH
+	vxor	24, 24, 1
 
 	# sum hash and reduction with H Poly
-	vsldoi		27, 23, 23, 8		# swap
-	vpmsumd		23, 23, 2
-	vxor		27, 27, 24
-	vxor		23, 23, 27
-
-	xxlor		32, 23+32, 23+32		# update hash
-
+	vsldoi	25, 23, 23, 8		# swap
+	vpmsumd	23, 23, 2
+	vxor	27, 25, 24
+	vxor	\H, 23, 27
 .endm
 
- #
- # Compute update single hash
- #
-.macro ppc_update_hash_1x
-	vxor		28, 28, 0
-
-	vxor		19, 19, 19
+#
+# Compute update single ghash
+# scratch: v1, v22..v27
+#
+.macro PPC_GHASH1x H S1
 
-	vpmsumd		22, 3, 28		# L
-	vpmsumd		23, 4, 28		# M
-	vpmsumd		24, 5, 28		# H
+	vxor	1, 1, 1
 
-	vpmsumd		27, 22, 2		# reduction
+	vpmsumd	22, 3, \S1		# L
+	vpmsumd	23, 4, \S1		# M
+	vpmsumd	24, 5, \S1		# H
 
-	vsldoi		25, 23, 19, 8		# mL
-	vsldoi		26, 19, 23, 8		# mH
-	vxor		22, 22, 25		# LL + LL
-	vxor		24, 24, 26		# HH + HH
+	vpmsumd	27, 22, 2		# reduction
 
-	vsldoi		22, 22, 22, 8		# swap
-	vxor		22, 22, 27
+	vsldoi	25, 23, 1, 8		# mL
+	vsldoi	26, 1, 23, 8		# mH
+	vxor	22, 22, 25		# LL + LL
+	vxor	24, 24, 26		# HH + HH
 
-	vsldoi		20, 22, 22, 8		# swap
-	vpmsumd		22, 22, 2		# reduction
-	vxor		20, 20, 24
-	vxor		22, 22, 20
+	xxlor	32+25, 10, 10
+	vpermxor 22, 22, 27, 25
 
-	vmr		0, 22			# update hash
-
-.endm
-
-.macro SAVE_REGS
-	stdu 1,-640(1)
-	mflr 0
-
-	std	14,112(1)
-	std	15,120(1)
-	std	16,128(1)
-	std	17,136(1)
-	std	18,144(1)
-	std	19,152(1)
-	std	20,160(1)
-	std	21,168(1)
-	li	9, 256
-	stvx	20, 9, 1
-	addi	9, 9, 16
-	stvx	21, 9, 1
-	addi	9, 9, 16
-	stvx	22, 9, 1
-	addi	9, 9, 16
-	stvx	23, 9, 1
-	addi	9, 9, 16
-	stvx	24, 9, 1
-	addi	9, 9, 16
-	stvx	25, 9, 1
-	addi	9, 9, 16
-	stvx	26, 9, 1
-	addi	9, 9, 16
-	stvx	27, 9, 1
-	addi	9, 9, 16
-	stvx	28, 9, 1
-	addi	9, 9, 16
-	stvx	29, 9, 1
-	addi	9, 9, 16
-	stvx	30, 9, 1
-	addi	9, 9, 16
-	stvx	31, 9, 1
-	stxv	14, 464(1)
-	stxv	15, 480(1)
-	stxv	16, 496(1)
-	stxv	17, 512(1)
-	stxv	18, 528(1)
-	stxv	19, 544(1)
-	stxv	20, 560(1)
-	stxv	21, 576(1)
-	stxv	22, 592(1)
-	std	0, 656(1)
-.endm
-
-.macro RESTORE_REGS
-	lxv	14, 464(1)
-	lxv	15, 480(1)
-	lxv	16, 496(1)
-	lxv	17, 512(1)
-	lxv	18, 528(1)
-	lxv	19, 544(1)
-	lxv	20, 560(1)
-	lxv	21, 576(1)
-	lxv	22, 592(1)
-	li	9, 256
-	lvx	20, 9, 1
-	addi	9, 9, 16
-	lvx	21, 9, 1
-	addi	9, 9, 16
-	lvx	22, 9, 1
-	addi	9, 9, 16
-	lvx	23, 9, 1
-	addi	9, 9, 16
-	lvx	24, 9, 1
-	addi	9, 9, 16
-	lvx	25, 9, 1
-	addi	9, 9, 16
-	lvx	26, 9, 1
-	addi	9, 9, 16
-	lvx	27, 9, 1
-	addi	9, 9, 16
-	lvx	28, 9, 1
-	addi	9, 9, 16
-	lvx	29, 9, 1
-	addi	9, 9, 16
-	lvx	30, 9, 1
-	addi	9, 9, 16
-	lvx	31, 9, 1
-
-	ld	0, 656(1)
-	ld      14,112(1)
-	ld      15,120(1)
-	ld      16,128(1)
-	ld      17,136(1)
-	ld      18,144(1)
-	ld      19,152(1)
-	ld      20,160(1)
-	ld	21,168(1)
-
-	mtlr	0
-	addi	1, 1, 640
+	vsldoi	23, 22, 22, 8		# swap
+	vpmsumd	22, 22, 2		# reduction
+	vxor	23, 23, 24
+	vxor	\H, 22, 23
 .endm
 
+#
+# LOAD_HASH_TABLE
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+#
 .macro LOAD_HASH_TABLE
 	# Load Xi
 	lxvb16x	32, 0, 8	# load Xi
@@ -557,657 +298,434 @@
 	lxvd2x	14+32, 10, 8	# H^4h
 .endm
 
- #
- # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
- #               const char *rk, unsigned char iv[16], void *Xip);
- #
- #    r3 - inp
- #    r4 - out
- #    r5 - len
- #    r6 - AES round keys
- #    r7 - iv and other data
- #    r8 - Xi, HPoli, hash keys
- #
- #    rounds is at offset 240 in rk
- #    Xi is at 0 in gcm_table (Xip).
- #
-_GLOBAL(aes_p10_gcm_encrypt)
-.align 5
-
-	SAVE_REGS
-
-	LOAD_HASH_TABLE
-
-	# initialize ICB: GHASH( IV ), IV - r7
-	lxvb16x	30+32, 0, 7	# load IV  - v30
-
-	mr	12, 5		# length
-	li	11, 0		# block index
-
-	# counter 1
-	vxor	31, 31, 31
-	vspltisb 22, 1
-	vsldoi	31, 31, 22,1	# counter 1
-
-	# load round key to VSR
-	lxv	0, 0(6)
-	lxv	1, 0x10(6)
-	lxv	2, 0x20(6)
-	lxv	3, 0x30(6)
-	lxv	4, 0x40(6)
-	lxv	5, 0x50(6)
-	lxv	6, 0x60(6)
-	lxv	7, 0x70(6)
-	lxv	8, 0x80(6)
-	lxv	9, 0x90(6)
-	lxv	10, 0xa0(6)
-
-	# load rounds - 10 (128), 12 (192), 14 (256)
-	lwz	9,240(6)
-
-	#
-	# vxor	state, state, w # addroundkey
-	xxlor	32+29, 0, 0
-	vxor	15, 30, 29	# IV + round key - add round key 0
-
-	cmpdi	9, 10
-	beq	Loop_aes_gcm_8x
-
-	# load 2 more round keys (v11, v12)
-	lxv	11, 0xb0(6)
-	lxv	12, 0xc0(6)
-
-	cmpdi	9, 12
-	beq	Loop_aes_gcm_8x
-
-	# load 2 more round keys (v11, v12, v13, v14)
-	lxv	13, 0xd0(6)
-	lxv	14, 0xe0(6)
-	cmpdi	9, 14
-	beq	Loop_aes_gcm_8x
-
-	b	aes_gcm_out
-
-.align 5
-Loop_aes_gcm_8x:
-	mr	14, 3
-	mr	9, 4
-
-	#
-	# check partial block
-	#
-Continue_partial_check:
-	ld	15, 56(7)
-	cmpdi	15, 0
-	beq	Continue
-	bgt	Final_block
-	cmpdi	15, 16
-	blt	Final_block
-
-Continue:
-	# n blcoks
-	li	10, 128
-	divdu	10, 12, 10	# n 128 bytes-blocks
-	cmpdi	10, 0
-	beq	Loop_last_block
-
-	vaddudm	30, 30, 31	# IV + counter
-	vxor	16, 30, 29
-	vaddudm	30, 30, 31
-	vxor	17, 30, 29
-	vaddudm	30, 30, 31
-	vxor	18, 30, 29
-	vaddudm	30, 30, 31
-	vxor	19, 30, 29
-	vaddudm	30, 30, 31
-	vxor	20, 30, 29
-	vaddudm	30, 30, 31
-	vxor	21, 30, 29
-	vaddudm	30, 30, 31
-	vxor	22, 30, 29
-
-	mtctr	10
-
-	li	15, 16
-	li	16, 32
-	li	17, 48
-	li	18, 64
-	li	19, 80
-	li	20, 96
-	li	21, 112
-
-	lwz	10, 240(6)
-
-Loop_8x_block:
-
-	lxvb16x		15, 0, 14	# load block
-	lxvb16x		16, 15, 14	# load block
-	lxvb16x		17, 16, 14	# load block
-	lxvb16x		18, 17, 14	# load block
-	lxvb16x		19, 18, 14	# load block
-	lxvb16x		20, 19, 14	# load block
-	lxvb16x		21, 20, 14	# load block
-	lxvb16x		22, 21, 14	# load block
-	addi		14, 14, 128
-
-	Loop_aes_middle8x
-
-	xxlor	23+32, 10, 10
-
-	cmpdi	10, 10
-	beq	Do_next_ghash
-
-	# 192 bits
-	xxlor	24+32, 11, 11
-
-	vcipher	15, 15, 23
-	vcipher	16, 16, 23
-	vcipher	17, 17, 23
-	vcipher	18, 18, 23
-	vcipher	19, 19, 23
-	vcipher	20, 20, 23
-	vcipher	21, 21, 23
-	vcipher	22, 22, 23
-
-	vcipher	15, 15, 24
-	vcipher	16, 16, 24
-	vcipher	17, 17, 24
-	vcipher	18, 18, 24
-	vcipher	19, 19, 24
-	vcipher	20, 20, 24
-	vcipher	21, 21, 24
-	vcipher	22, 22, 24
-
-	xxlor	23+32, 12, 12
-
-	cmpdi	10, 12
-	beq	Do_next_ghash
-
-	# 256 bits
-	xxlor	24+32, 13, 13
-
-	vcipher	15, 15, 23
-	vcipher	16, 16, 23
-	vcipher	17, 17, 23
-	vcipher	18, 18, 23
-	vcipher	19, 19, 23
-	vcipher	20, 20, 23
-	vcipher	21, 21, 23
-	vcipher	22, 22, 23
-
-	vcipher	15, 15, 24
-	vcipher	16, 16, 24
-	vcipher	17, 17, 24
-	vcipher	18, 18, 24
-	vcipher	19, 19, 24
-	vcipher	20, 20, 24
-	vcipher	21, 21, 24
-	vcipher	22, 22, 24
-
-	xxlor	23+32, 14, 14
-
-	cmpdi	10, 14
-	beq	Do_next_ghash
-	b	aes_gcm_out
-
-Do_next_ghash:
-
-	#
-	# last round
-	vcipherlast     15, 15, 23
-	vcipherlast     16, 16, 23
-
-	xxlxor		47, 47, 15
-	stxvb16x        47, 0, 9	# store output
-	xxlxor		48, 48, 16
-	stxvb16x        48, 15, 9	# store output
-
-	vcipherlast     17, 17, 23
-	vcipherlast     18, 18, 23
-
-	xxlxor		49, 49, 17
-	stxvb16x        49, 16, 9	# store output
-	xxlxor		50, 50, 18
-	stxvb16x        50, 17, 9	# store output
-
-	vcipherlast     19, 19, 23
-	vcipherlast     20, 20, 23
-
-	xxlxor		51, 51, 19
-	stxvb16x        51, 18, 9	# store output
-	xxlxor		52, 52, 20
-	stxvb16x        52, 19, 9	# store output
-
-	vcipherlast     21, 21, 23
-	vcipherlast     22, 22, 23
-
-	xxlxor		53, 53, 21
-	stxvb16x        53, 20, 9	# store output
-	xxlxor		54, 54, 22
-	stxvb16x        54, 21, 9	# store output
-
-	addi		9, 9, 128
-
-	# ghash here
-	ppc_aes_gcm_ghash2_4x
-
-	xxlor	27+32, 0, 0
-	vaddudm 30, 30, 31		# IV + counter
-	vmr	29, 30
-	vxor    15, 30, 27		# add round key
-	vaddudm 30, 30, 31
-	vxor    16, 30, 27
-	vaddudm 30, 30, 31
-	vxor    17, 30, 27
-	vaddudm 30, 30, 31
-	vxor    18, 30, 27
-	vaddudm 30, 30, 31
-	vxor    19, 30, 27
-	vaddudm 30, 30, 31
-	vxor    20, 30, 27
-	vaddudm 30, 30, 31
-	vxor    21, 30, 27
-	vaddudm 30, 30, 31
-	vxor    22, 30, 27
-
-	addi    12, 12, -128
-	addi    11, 11, 128
-
-	bdnz	Loop_8x_block
-
-	vmr	30, 29
-	stxvb16x 30+32, 0, 7		# update IV
-
-Loop_last_block:
-	cmpdi   12, 0
-	beq     aes_gcm_out
-
-	# loop last few blocks
+################################################################################
+# Compute AES and ghash one block at a time.
+# r23: AES rounds
+# v30: current IV
+# vs0: roundkey 0
+#
+################################################################################
+SYM_FUNC_START_LOCAL(aes_gcm_crypt_1x)
+
+	cmpdi	5, 16
+	bge	__More_1x
+	blr
+__More_1x:
 	li      10, 16
-	divdu   10, 12, 10
-
-	mtctr   10
-
-	lwz	10, 240(6)
-
-	cmpdi   12, 16
-	blt     Final_block
-
-Next_rem_block:
-	lxvb16x 15, 0, 14		# load block
-
-	Loop_aes_middle_1x
-
-	xxlor	23+32, 10, 10
-
-	cmpdi	10, 10
-	beq	Do_next_1x
-
-	# 192 bits
-	xxlor	24+32, 11, 11
-
-	vcipher	15, 15, 23
-	vcipher	15, 15, 24
-
-	xxlor	23+32, 12, 12
+	divdu   12, 5, 10
+
+	xxlxor	32+15, 32+30, 0
+
+	# Pre-load 8 AES rounds to scratch vectors.
+	xxlor	32+16, 1, 1
+	xxlor	32+17, 2, 2
+	xxlor	32+18, 3, 3
+	xxlor	32+19, 4, 4
+	xxlor	32+20, 5, 5
+	xxlor	32+21, 6, 6
+	xxlor	32+28, 7, 7
+	xxlor	32+29, 8, 8
+	lwz	23, 240(6)	# n rounds
+	addi	22, 23, -9	# remaing AES rounds
 
-	cmpdi	10, 12
-	beq	Do_next_1x
-
-	# 256 bits
-	xxlor	24+32, 13, 13
-
-	vcipher	15, 15, 23
-	vcipher	15, 15, 24
-
-	xxlor	23+32, 14, 14
-
-	cmpdi	10, 14
-	beq	Do_next_1x
-
-Do_next_1x:
-	vcipherlast     15, 15, 23
-
-	xxlxor		47, 47, 15
-	stxvb16x	47, 0, 9	# store output
-	addi		14, 14, 16
-	addi		9, 9, 16
-
-	vmr		28, 15
-	ppc_update_hash_1x
-
-	addi		12, 12, -16
-	addi		11, 11, 16
-	xxlor		19+32, 0, 0
-	vaddudm		30, 30, 31		# IV + counter
-	vxor		15, 30, 19		# add round key
-
-	bdnz	Next_rem_block
-
-	li	15, 0
-	std	15, 56(7)		# clear partial?
-	stxvb16x 30+32, 0, 7		# update IV
 	cmpdi	12, 0
-	beq	aes_gcm_out
-
-Final_block:
-	lwz	10, 240(6)
-	Loop_aes_middle_1x
-
-	xxlor	23+32, 10, 10
-
-	cmpdi	10, 10
-	beq	Do_final_1x
-
-	# 192 bits
-	xxlor	24+32, 11, 11
-
-	vcipher	15, 15, 23
-	vcipher	15, 15, 24
-
-	xxlor	23+32, 12, 12
-
-	cmpdi	10, 12
-	beq	Do_final_1x
-
-	# 256 bits
-	xxlor	24+32, 13, 13
-
-	vcipher	15, 15, 23
-	vcipher	15, 15, 24
+	bgt	__Loop_1x
+	blr
 
-	xxlor	23+32, 14, 14
+__Loop_1x:
+	mtctr	22
+	addi	10, 6, 144
+	vcipher	15, 15, 16
+	vcipher	15, 15, 17
+	vcipher	15, 15, 18
+	vcipher	15, 15, 19
+	vcipher	15, 15, 20
+	vcipher	15, 15, 21
+	vcipher	15, 15, 28
+	vcipher	15, 15, 29
 
-	cmpdi	10, 14
-	beq	Do_final_1x
+__Loop_aes_1state:
+	lxv	32+1, 0(10)
+	vcipher	15, 15, 1
+	addi	10, 10, 16
+	bdnz	__Loop_aes_1state
+	lxv	32+1, 0(10)		# last round key
+	lxvb16x 11, 0, 14		# load input block
+	vcipherlast 15, 15, 1
+
+	xxlxor	32+15, 32+15, 11
+	stxvb16x 32+15, 0, 9	# store output
+	addi	14, 14, 16
+	addi	9, 9, 16
 
-Do_final_1x:
-	vcipherlast     15, 15, 23
+	cmpdi	24, 0	# decrypt?
+	bne	__Encrypt_1x
+	xxlor	15+32, 11, 11
+__Encrypt_1x:
+	vxor	15, 15, 0
+	PPC_GHASH1x 0, 15
 
-	# check partial block
-	li	21, 0			# encrypt
-	ld	15, 56(7)		# partial?
-	cmpdi	15, 0
-	beq	Normal_block
-	bl	Do_partial_block
+	addi	5, 5, -16
+	addi	11, 11, 16
 
+	vadduwm 30, 30, 31		# IV + counter
+	xxlxor	32+15, 32+30, 0
+	addi	12, 12, -1
 	cmpdi	12, 0
-	ble aes_gcm_out
+	bgt	__Loop_1x
 
-	b Continue_partial_check
-
-Normal_block:
-	lxvb16x	15, 0, 14		# load last block
-	xxlxor	47, 47, 15
-
-	# create partial block mask
-	li	15, 16
-	sub	15, 15, 12		# index to the mask
-
-	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
-	vspltisb	17, 0		# second 16 bytes - 0x0000...00
-	li	10, 192
-	stvx	16, 10, 1
+	stxvb16x 32+30, 0, 7		# update IV
+	stxvb16x 32+0, 0, 8		# update Xi
+	blr
+SYM_FUNC_END(aes_gcm_crypt_1x)
+
+################################################################################
+# Process a normal partial block when we come here.
+#  Compute partial mask, Load and store partial block to stack.
+#  Update partial_len and pblock.
+#  pblock is (encrypted ^ AES state) for encrypt
+#        and (input ^ AES state) for decrypt.
+#
+################################################################################
+SYM_FUNC_START_LOCAL(__Process_partial)
+
+	# create partial mask
+	vspltisb 16, -1
+	li	12, 16
+	sub	12, 12, 5
+	sldi	12, 12, 3
+	mtvsrdd	32+17, 0, 12
+	vslo	16, 16, 17		# partial block mask
+
+	lxvb16x 11, 0, 14		# load partial block
+	xxland	11, 11, 32+16
+
+	# AES crypt partial
+	xxlxor	32+15, 32+30, 0
+	lwz	23, 240(6)		# n rounds
+	addi	22, 23, -1		# loop - 1
+	mtctr	22
+	addi	10, 6, 16
+
+__Loop_aes_pstate:
+	lxv	32+1, 0(10)
+	vcipher	15, 15, 1
 	addi	10, 10, 16
-	stvx	17, 10, 1
-
-	addi	10, 1, 192
-	lxvb16x	16, 15, 10		# load partial block mask
-	xxland	47, 47, 16
-
-	vmr	28, 15
-	ppc_update_hash_1x
+	bdnz	__Loop_aes_pstate
+	lxv	32+1, 0(10)		# last round key
+	vcipherlast 15, 15, 1
 
-	# * should store only the remaining bytes.
-	bl	Write_partial_block
-
-	stxvb16x 30+32, 0, 7		# update IV
-	std	12, 56(7)		# update partial?
-	li	16, 16
+	xxlxor	32+15, 32+15, 11
+	vand	15, 15, 16
 
-	stxvb16x	32, 0, 8		# write out Xi
-	stxvb16x	32, 16, 8		# write out Xi
-	b aes_gcm_out
-
- #
- # Compute data mask
- #
-.macro GEN_MASK _mask _start _end
-	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
-	vspltisb	17, 0		# second 16 bytes - 0x0000...00
-	li	10, 192
-	stxvb16x	17+32, 10, 1
-	add	10, 10, \_start
-	stxvb16x	16+32, 10, 1
-	add	10, 10, \_end
-	stxvb16x	17+32, 10, 1
-
-	addi	10, 1, 192
-	lxvb16x	\_mask, 0, 10		# load partial block mask
-.endm
+	# AES crypt output v15
+	# Write partial
+	li	10, 224
+	stxvb16x 15+32, 10, 1		# write v15 to stack
+	addi	10, 1, 223
+	addi	12, 9, -1
+        mtctr	5			# partial block len
+__Write_partial:
+        lbzu	22, 1(10)
+	stbu	22, 1(12)
+        bdnz	__Write_partial
+
+	cmpdi	24, 0			# decrypt?
+	bne	__Encrypt_partial
+	xxlor	32+15, 11, 11		# decrypt using the input block
+__Encrypt_partial:
+	#vxor	15, 15, 0		# ^ previous hash
+	#PPC_GHASH1x 0, 15
+
+	add	14, 14, 5
+	add	9, 9, 5
+	std	5, 56(7)		# update partial
+	sub	11, 11, 5
+	li	5, 0			# done last byte
 
- #
- # Handle multiple partial blocks for encrypt and decrypt
- #   operations.
- #
-SYM_FUNC_START_LOCAL(Do_partial_block)
-	add	17, 15, 5
-	cmpdi	17, 16
-	bgt	Big_block
-	GEN_MASK 18, 15, 5
-	b	_Partial
-SYM_FUNC_END(Do_partial_block)
-Big_block:
+	#
+	# Don't increase IV since this is the last partial.
+	# It should get updated in gcm_update if no more data blocks.
+	#vadduwm	30, 30, 31		# increase IV
+	stxvb16x 32+30, 0, 7		# update IV
+	li	10, 64
+	stxvb16x 32+0, 0, 8		# Update X1
+	stxvb16x 32+15, 10, 7		# Update pblock
+	blr
+SYM_FUNC_END(__Process_partial)
+
+################################################################################
+# Combine partial blocks and ghash when we come here.
+#
+# The partial block has to be shifted to the right location to encrypt/decrypt
+# and compute ghash if combing the previous partial block is needed.
+# - Compute ghash for a full block. Clear Partial_len and pblock. Update IV.
+#   Write Xi.
+# - Don't compute ghash if not full block.  gcm_update will take care of it
+#   is the last block. Update Partial_len and pblock.
+#
+################################################################################
+SYM_FUNC_START_LOCAL(__Combine_partial)
+
+	ld	12, 56(7)
+	mr	21, 5			# these bytes to be processed
+
+	li	17, 0
 	li	16, 16
-	GEN_MASK 18, 15, 16
-
-_Partial:
-	lxvb16x	17+32, 0, 14		# load last block
-	sldi	16, 15, 3
-	mtvsrdd	32+16, 0, 16
-	vsro	17, 17, 16
-	xxlxor	47, 47, 17+32
-	xxland	47, 47, 18
-
-	vxor	0, 0, 0			# clear Xi
-	vmr	28, 15
-
-	cmpdi	21, 0			# encrypt/decrypt ops?
-	beq	Skip_decrypt
-	xxland	32+28, 32+17, 18
-
-Skip_decrypt:
-
-	ppc_update_hash_1x
+	sub	22, 16, 12		# bytes to complete a block
+	sub	17, 22, 5		# remaining bytes in a block
+	cmpdi	5, 16
+	ble	__Inp_msg_less16
+	li	17, 0
+	mr	21, 22
+	b	__Combine_continue
+__Inp_msg_less16:
+	cmpd	22, 5
+	bgt	__Combine_continue
+	li	17, 0
+	mr	21, 22			# these bytes to be processed
+
+__Combine_continue:
+	# load msg and shift to the proper location and mask
+	vspltisb 16, -1
+	sldi	15, 12, 3
+	mtvsrdd	32+17, 0, 15
+	vslo	16, 16, 17
+	vsro	16, 16, 17
+	sldi	15, 17, 3
+	mtvsrdd	32+17, 0, 15
+	vsro	16, 16, 17
+	vslo	16, 16, 17		# mask
+
+	lxvb16x 32+19, 0, 14		# load partial block
+	sldi	15, 12, 3
+	mtvsrdd	32+17, 0, 15
+	vsro	19, 19, 17		# 0x00..xxxx??..??
+	sldi	15, 17, 3
+	mtvsrdd	32+17, 0, 15
+	vsro	19, 19, 17		# 0x00..xxxx
+	vslo	19, 19, 17		# shift back to form 0x00..xxxx00..00
+
+	# AES crypt partial
+	xxlxor	32+15, 32+30, 0
+	lwz	23, 240(6)	# n rounds
+	addi	22, 23, -1	# loop - 1
+	mtctr	22
+	addi	10, 6, 16
+
+__Loop_aes_cpstate:
+	lxv	32+1, 0(10)
+	vcipher	15, 15, 1
+	addi	10, 10, 16
+	bdnz	__Loop_aes_cpstate
+	lxv	32+1, 0(10)		# last round key
+	vcipherlast 15, 15, 1
 
-	li	16, 16
-	lxvb16x 32+29, 16, 8
-	vxor	0, 0, 29
-	stxvb16x 32, 0, 8		# save Xi
-	stxvb16x 32, 16, 8		# save Xi
-
-	# store partial block
-	# loop the rest of the stream if any
-	sldi	16, 15, 3
-	mtvsrdd	32+16, 0, 16
-	vslo	15, 15, 16
-	#stxvb16x 15+32, 0, 9		# last block
+	vxor	15, 15, 19
+	vand	15, 15, 16
 
-	li	16, 16
-	sub	17, 16, 15		# 16 - partial
-
-	add	16, 15, 5
-	cmpdi	16, 16
-	bgt	Larger_16
-	mr	17, 5
-Larger_16:
-
-	# write partial
-	li		10, 192
-	stxvb16x	15+32, 10, 1	# save current block
-
-	addi		10, 9, -1
-	addi		16, 1, 191
-	mtctr		17		# move partial byte count
-
-Write_last_partial:
-        lbzu		18, 1(16)
-	stbu		18, 1(10)
-        bdnz		Write_last_partial
-	# Complete loop partial
-
-	add	14, 14, 17
-	add	9, 9, 17
-	sub	12, 12, 17
-	add	11, 11, 17
-
-	add	15, 15, 5
-	cmpdi	15, 16
-	blt	Save_partial
-
-	vaddudm	30, 30, 31
-	stxvb16x 30+32, 0, 7		# update IV
-	xxlor	32+29, 0, 0
-	vxor	15, 30, 29		# IV + round key - add round key 0
-	li	15, 0
-	std	15, 56(7)		# partial done - clear
-	b	Partial_done
-Save_partial:
-	std	15, 56(7)		# partial
-
-Partial_done:
+	# AES crypt output v15
+	# Write partial
+	li	10, 224
+	stxvb16x 15+32, 10, 1		# write v15 to stack
+	addi	10, 1, 223
+	add	10, 10, 12		# add offset
+	addi	15, 9, -1
+        mtctr	21			# partial block len
+__Write_combine_partial:
+        lbzu	22, 1(10)
+	stbu	22, 1(15)
+        bdnz	__Write_combine_partial
+
+	add	14, 14, 21
+	add	11, 11, 21
+	add	9, 9, 21
+	sub	5, 5, 21
+
+	# Encrypt/Decrypt?
+	cmpdi	24, 0			# decrypt?
+	bne	__Encrypt_combine_partial
+	vmr	15, 19		# decrypt using the input block
+
+__Encrypt_combine_partial:
+	#
+	# Update partial flag and combine ghash.
+__Update_partial_ghash:
+	li	10, 64
+	lxvb16x 32+17, 10, 7		# load previous pblock
+	add	12, 12, 21		# combined pprocessed
+	vxor	15, 15, 17		# combined pblock
+
+	cmpdi	12, 16
+	beq	__Clear_partial_flag
+	std	12, 56(7)		# update partial len
+	stxvb16x 32+15, 10, 7		# Update current pblock
 	blr
 
- #
- # Write partial block
- # r9 - output
- # r12 - remaining bytes
- # v15 - partial input data
- #
-SYM_FUNC_START_LOCAL(Write_partial_block)
-	li		10, 192
-	stxvb16x	15+32, 10, 1		# last block
-
-	addi		10, 9, -1
-	addi		16, 1, 191
-
-        mtctr		12			# remaining bytes
-	li		15, 0
-
-Write_last_byte:
-        lbzu		14, 1(16)
-	stbu		14, 1(10)
-        bdnz		Write_last_byte
+__Clear_partial_flag:
+	li	12, 0
+	std	12, 56(7)
+	# Update IV and ghash here
+	vadduwm	30, 30, 31		# increase IV
+	stxvb16x 32+30, 0, 7		# update IV
+
+	# v15 either is either (input blockor encrypted)^(AES state)
+	vxor	15, 15, 0
+	PPC_GHASH1x 0, 15
+	stxvb16x 32+0, 10, 7		# update pblock for debug?
+	stxvb16x 32+0, 0, 8		# update Xi
 	blr
-SYM_FUNC_END(Write_partial_block)
+SYM_FUNC_END(__Combine_partial)
 
-aes_gcm_out:
-	# out = state
-	stxvb16x	32, 0, 8		# write out Xi
-	add	3, 11, 12		# return count
+################################################################################
+# gcm_update(iv, Xi) - compute last hash
+#
+################################################################################
+SYM_FUNC_START(gcm_update)
 
-	RESTORE_REGS
-	blr
+	ld	10, 56(3)
+	cmpdi	10, 0
+	beq	__no_update
 
- #
- # 8x Decrypt
- #
-_GLOBAL(aes_p10_gcm_decrypt)
-.align 5
+	lxvb16x	32, 0, 4	# load Xi
+	# load Hash - h^4, h^3, h^2, h
+	li	10, 32
+	lxvd2x	2+32, 10, 4	# H Poli
+	li	10, 48
+	lxvd2x	3+32, 10, 4	# Hl
+	li	10, 64
+	lxvd2x	4+32, 10, 4	# H
+	li	10, 80
+	lxvd2x	5+32, 10, 4	# Hh
+
+	addis	11, 2, permx@toc@ha
+	addi	11, 11, permx@toc@l
+	lxv	10, 0(11)	# vs10: vpermxor vector
+
+	li	9, 64
+	lxvb16x 32+6, 9, 3		# load pblock
+	vxor	6, 6, 0
+
+	vxor	1, 1, 1
+	vpmsumd	12, 3, 6		# L
+	vpmsumd	13, 4, 6		# M
+	vpmsumd	14, 5, 6		# H
+	vpmsumd	17, 12, 2		# reduction
+	vsldoi	15, 13, 1, 8		# mL
+	vsldoi	16, 1, 13, 8		# mH
+	vxor	12, 12, 15		# LL + LL
+	vxor	14, 14, 16		# HH + HH
+	xxlor	32+15, 10, 10
+	vpermxor 12, 12, 17, 15
+	vsldoi	13, 12, 12, 8		# swap
+	vpmsumd	12, 12, 2		# reduction
+	vxor	13, 13, 14
+	vxor	7, 12, 13
+
+	#vxor	0, 0, 0
+	#stxvb16x 32+0, 9, 3
+	li	10, 0
+	std	10, 56(3)
+	stxvb16x 32+7, 0, 4
+
+__no_update:
+	blr
+SYM_FUNC_END(gcm_update)
+
+################################################################################
+# aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
+#               const char *rk, unsigned char iv[16], void *Xip);
+#
+#    r3 - inp
+#    r4 - out
+#    r5 - len
+#    r6 - AES round keys
+#    r7 - iv and other data
+#    r8 - Xi, HPoli, hash keys
+#
+#    rounds is at offset 240 in rk
+#    Xi is at 0 in gcm_table (Xip).
+#
+################################################################################
+SYM_FUNC_START(aes_p10_gcm_encrypt)
+
+	cmpdi	5, 0
+	ble	__Invalid_msg_len
 
 	SAVE_REGS
-
 	LOAD_HASH_TABLE
 
 	# initialize ICB: GHASH( IV ), IV - r7
 	lxvb16x	30+32, 0, 7	# load IV  - v30
 
-	mr	12, 5		# length
-	li	11, 0		# block index
+	mr	14, 3
+	mr	9, 4
 
 	# counter 1
 	vxor	31, 31, 31
 	vspltisb 22, 1
 	vsldoi	31, 31, 22,1	# counter 1
 
-	# load round key to VSR
-	lxv	0, 0(6)
-	lxv	1, 0x10(6)
-	lxv	2, 0x20(6)
-	lxv	3, 0x30(6)
-	lxv	4, 0x40(6)
-	lxv	5, 0x50(6)
-	lxv	6, 0x60(6)
-	lxv	7, 0x70(6)
-	lxv	8, 0x80(6)
-	lxv	9, 0x90(6)
-	lxv	10, 0xa0(6)
+	addis	11, 2, permx@toc@ha
+	addi	11, 11, permx@toc@l
+	lxv	10, 0(11)	# vs10: vpermxor vector
+	li	11, 0
+
+	# load 9 round keys to VSR
+	lxv	0, 0(6)			# round key 0
+	lxv	1, 16(6)		# round key 1
+	lxv	2, 32(6)		# round key 2
+	lxv	3, 48(6)		# round key 3
+	lxv	4, 64(6)		# round key 4
+	lxv	5, 80(6)		# round key 5
+	lxv	6, 96(6)		# round key 6
+	lxv	7, 112(6)		# round key 7
+	lxv	8, 128(6)		# round key 8
 
 	# load rounds - 10 (128), 12 (192), 14 (256)
-	lwz	9,240(6)
+	lwz	23, 240(6)		# n rounds
+	li	24, 1			# encrypt
 
+__Process_encrypt:
 	#
-	# vxor	state, state, w # addroundkey
-	xxlor	32+29, 0, 0
-	vxor	15, 30, 29	# IV + round key - add round key 0
-
-	cmpdi	9, 10
-	beq	Loop_aes_gcm_8x_dec
-
-	# load 2 more round keys (v11, v12)
-	lxv	11, 0xb0(6)
-	lxv	12, 0xc0(6)
-
-	cmpdi	9, 12
-	beq	Loop_aes_gcm_8x_dec
-
-	# load 2 more round keys (v11, v12, v13, v14)
-	lxv	13, 0xd0(6)
-	lxv	14, 0xe0(6)
-	cmpdi	9, 14
-	beq	Loop_aes_gcm_8x_dec
+	# Process different blocks
+	#
+	ld	12, 56(7)
+	cmpdi	12, 0
+	bgt	__Do_combine_enc
+	cmpdi	5, 128
+	blt	__Process_more_enc
+
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_enc:
+	# 8x blcoks
+	li	10, 128
+	divdu	12, 5, 10	# n 128 bytes-blocks
 
-	b	aes_gcm_out
+	addi	12, 12, -1	# loop - 1
 
-.align 5
-Loop_aes_gcm_8x_dec:
-	mr	14, 3
-	mr	9, 4
+	vmr	15, 30		# first state: IV
+	vadduwm	16, 15, 31	# state + counter
+	vadduwm	17, 16, 31
+	vadduwm	18, 17, 31
+	vadduwm	19, 18, 31
+	vadduwm	20, 19, 31
+	vadduwm	21, 20, 31
+	vadduwm	22, 21, 31
+	xxlor	9, 32+22, 32+22	# save last state
 
-	#
-	# check partial block
-	#
-Continue_partial_check_dec:
-	ld	15, 56(7)
-	cmpdi	15, 0
-	beq	Continue_dec
-	bgt	Final_block_dec
-	cmpdi	15, 16
-	blt	Final_block_dec
-
-Continue_dec:
-	# n blcoks
-	li	10, 128
-	divdu	10, 12, 10	# n 128 bytes-blocks
-	cmpdi	10, 0
-	beq	Loop_last_block_dec
-
-	vaddudm	30, 30, 31	# IV + counter
-	vxor	16, 30, 29
-	vaddudm	30, 30, 31
-	vxor	17, 30, 29
-	vaddudm	30, 30, 31
-	vxor	18, 30, 29
-	vaddudm	30, 30, 31
-	vxor	19, 30, 29
-	vaddudm	30, 30, 31
-	vxor	20, 30, 29
-	vaddudm	30, 30, 31
-	vxor	21, 30, 29
-	vaddudm	30, 30, 31
-	vxor	22, 30, 29
-
-	mtctr	10
+	# vxor  state, state, w # addroundkey
+	xxlor	32+29, 0, 0
+        vxor    15, 15, 29      # IV + round key - add round key 0
+	vxor	16, 16, 29
+	vxor	17, 17, 29
+	vxor	18, 18, 29
+	vxor	19, 19, 29
+	vxor	20, 20, 29
+	vxor	21, 21, 29
+	vxor	22, 22, 29
 
 	li	15, 16
 	li	16, 32
@@ -1217,305 +735,502 @@ Continue_dec:
 	li	20, 96
 	li	21, 112
 
-	lwz	10, 240(6)
-
-Loop_8x_block_dec:
-
-	lxvb16x		15, 0, 14	# load block
-	lxvb16x		16, 15, 14	# load block
-	lxvb16x		17, 16, 14	# load block
-	lxvb16x		18, 17, 14	# load block
-	lxvb16x		19, 18, 14	# load block
-	lxvb16x		20, 19, 14	# load block
-	lxvb16x		21, 20, 14	# load block
-	lxvb16x		22, 21, 14	# load block
-	addi		14, 14, 128
-
-	Loop_aes_middle8x
-
-	xxlor	23+32, 10, 10
-
-	cmpdi	10, 10
-	beq	Do_next_ghash_dec
-
-	# 192 bits
-	xxlor	24+32, 11, 11
-
-	vcipher	15, 15, 23
-	vcipher	16, 16, 23
-	vcipher	17, 17, 23
-	vcipher	18, 18, 23
-	vcipher	19, 19, 23
-	vcipher	20, 20, 23
-	vcipher	21, 21, 23
-	vcipher	22, 22, 23
-
-	vcipher	15, 15, 24
-	vcipher	16, 16, 24
-	vcipher	17, 17, 24
-	vcipher	18, 18, 24
-	vcipher	19, 19, 24
-	vcipher	20, 20, 24
-	vcipher	21, 21, 24
-	vcipher	22, 22, 24
-
-	xxlor	23+32, 12, 12
-
-	cmpdi	10, 12
-	beq	Do_next_ghash_dec
-
-	# 256 bits
-	xxlor	24+32, 13, 13
-
-	vcipher	15, 15, 23
-	vcipher	16, 16, 23
-	vcipher	17, 17, 23
-	vcipher	18, 18, 23
-	vcipher	19, 19, 23
-	vcipher	20, 20, 23
-	vcipher	21, 21, 23
-	vcipher	22, 22, 23
-
-	vcipher	15, 15, 24
-	vcipher	16, 16, 24
-	vcipher	17, 17, 24
-	vcipher	18, 18, 24
-	vcipher	19, 19, 24
-	vcipher	20, 20, 24
-	vcipher	21, 21, 24
-	vcipher	22, 22, 24
-
-	xxlor	23+32, 14, 14
-
-	cmpdi	10, 14
-	beq	Do_next_ghash_dec
-	b	aes_gcm_out
+	#
+	# Pre-compute first 8 AES state and leave 1/3/5 more rounds
+	# for the loop.
+	#
+	addi	22, 23, -9		# process 8 keys
+	mtctr	22			# AES key loop
+	addi	10, 6, 144
 
-Do_next_ghash_dec:
+	LOOP_8AES_STATE			# process 8 AES keys
 
-	#
-	# last round
-	vcipherlast     15, 15, 23
-	vcipherlast     16, 16, 23
-
-	xxlxor		47, 47, 15
-	stxvb16x        47, 0, 9	# store output
-	xxlxor		48, 48, 16
-	stxvb16x        48, 15, 9	# store output
-
-	vcipherlast     17, 17, 23
-	vcipherlast     18, 18, 23
-
-	xxlxor		49, 49, 17
-	stxvb16x        49, 16, 9	# store output
-	xxlxor		50, 50, 18
-	stxvb16x        50, 17, 9	# store output
-
-	vcipherlast     19, 19, 23
-	vcipherlast     20, 20, 23
-
-	xxlxor		51, 51, 19
-	stxvb16x        51, 18, 9	# store output
-	xxlxor		52, 52, 20
-	stxvb16x        52, 19, 9	# store output
-
-	vcipherlast     21, 21, 23
-	vcipherlast     22, 22, 23
-
-	xxlxor		53, 53, 21
-	stxvb16x        53, 20, 9	# store output
-	xxlxor		54, 54, 22
-	stxvb16x        54, 21, 9	# store output
-
-	addi		9, 9, 128
-
-	xxlor           15+32, 15, 15
-	xxlor           16+32, 16, 16
-	xxlor           17+32, 17, 17
-	xxlor           18+32, 18, 18
-	xxlor           19+32, 19, 19
-	xxlor           20+32, 20, 20
-	xxlor           21+32, 21, 21
-	xxlor           22+32, 22, 22
+__PreLoop_aes_state:
+	lxv	32+1, 0(10)		# round key
+	AES_CIPHER_8x vcipher 15 1
+	addi	10, 10, 16
+	bdnz	__PreLoop_aes_state
+	lxv	32+1, 0(10)		# last round key (v1)
+
+	cmpdi	12, 0			# Only one loop (8 block)
+	beq	__Finish_ghash
+
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_enc:
+	vcipherlast     15, 15, 1
+	vcipherlast     16, 16, 1
+	vcipherlast     17, 17, 1
+	vcipherlast     18, 18, 1
+	vcipherlast     19, 19, 1
+	vcipherlast     20, 20, 1
+	vcipherlast     21, 21, 1
+	vcipherlast     22, 22, 1
+
+	lxvb16x	32+23, 0, 14	# load block
+	lxvb16x	32+24, 15, 14	# load block
+	lxvb16x	32+25, 16, 14	# load block
+	lxvb16x	32+26, 17, 14	# load block
+	lxvb16x	32+27, 18, 14	# load block
+	lxvb16x	32+28, 19, 14	# load block
+	lxvb16x	32+29, 20, 14	# load block
+	lxvb16x	32+30, 21, 14	# load block
+	addi	14, 14, 128
+
+	vxor	15, 15, 23
+	vxor	16, 16, 24
+	vxor	17, 17, 25
+	vxor	18, 18, 26
+	vxor	19, 19, 27
+	vxor	20, 20, 28
+	vxor	21, 21, 29
+	vxor	22, 22, 30
+
+	stxvb16x 47, 0, 9	# store output
+	stxvb16x 48, 15, 9	# store output
+	stxvb16x 49, 16, 9	# store output
+	stxvb16x 50, 17, 9	# store output
+	stxvb16x 51, 18, 9	# store output
+	stxvb16x 52, 19, 9	# store output
+	stxvb16x 53, 20, 9	# store output
+	stxvb16x 54, 21, 9	# store output
+	addi	9, 9, 128
 
 	# ghash here
-	ppc_aes_gcm_ghash2_4x
-
-	xxlor	27+32, 0, 0
-	vaddudm 30, 30, 31		# IV + counter
-	vmr	29, 30
-	vxor    15, 30, 27		# add round key
-	vaddudm 30, 30, 31
-	vxor    16, 30, 27
-	vaddudm 30, 30, 31
-	vxor    17, 30, 27
-	vaddudm 30, 30, 31
-	vxor    18, 30, 27
-	vaddudm 30, 30, 31
-	vxor    19, 30, 27
-	vaddudm 30, 30, 31
-	vxor    20, 30, 27
-	vaddudm 30, 30, 31
-	vxor    21, 30, 27
-	vaddudm 30, 30, 31
-	vxor    22, 30, 27
-
-	addi    12, 12, -128
+	vxor	15, 15, 0
+	PPC_GHASH4x 0, 15, 16, 17, 18
+
+	vxor	19, 19, 0
+	PPC_GHASH4x 0, 19, 20, 21, 22
+
+	xxlor	32+15, 9, 9		# last state
+	vadduwm 15, 15, 31		# state + counter
+	vadduwm 16, 15, 31
+	vadduwm 17, 16, 31
+	vadduwm 18, 17, 31
+	vadduwm 19, 18, 31
+	vadduwm 20, 19, 31
+	vadduwm 21, 20, 31
+	vadduwm 22, 21, 31
+	xxlor	9, 32+22, 32+22		# save last state
+
+	xxlor	32+27, 0, 0		# restore roundkey 0
+        vxor    15, 15, 27		# IV + round key - add round key 0
+	vxor	16, 16, 27
+	vxor	17, 17, 27
+	vxor	18, 18, 27
+	vxor	19, 19, 27
+	vxor	20, 20, 27
+	vxor	21, 21, 27
+	vxor	22, 22, 27
+
+	addi    5, 5, -128
 	addi    11, 11, 128
 
-	bdnz	Loop_8x_block_dec
-
-	vmr	30, 29
-	stxvb16x 30+32, 0, 7		# update IV
-
-Loop_last_block_dec:
-	cmpdi   12, 0
-	beq     aes_gcm_out
-
-	# loop last few blocks
-	li      10, 16
-	divdu   10, 12, 10
-
-	mtctr   10
-
-	lwz	10, 240(6)
-
-	cmpdi   12, 16
-	blt     Final_block_dec
-
-Next_rem_block_dec:
-	lxvb16x 15, 0, 14		# load block
-
-	Loop_aes_middle_1x
-
-	xxlor	23+32, 10, 10
+	LOOP_8AES_STATE			# process 8 AES keys
+	mtctr	22			# AES key loop
+	addi	10, 6, 144
+__LastLoop_aes_state:
+	lxv	32+1, 0(10)		# round key
+	AES_CIPHER_8x vcipher 15 1
+	addi	10, 10, 16
+	bdnz	__LastLoop_aes_state
+	lxv	32+1, 0(10)		# last round key (v1)
 
-	cmpdi	10, 10
-	beq	Do_next_1x_dec
+	addi	12, 12, -1
+	cmpdi	12, 0
+	bne	__Loop_8x_block_enc
+
+__Finish_ghash:
+	vcipherlast     15, 15, 1
+	vcipherlast     16, 16, 1
+	vcipherlast     17, 17, 1
+	vcipherlast     18, 18, 1
+	vcipherlast     19, 19, 1
+	vcipherlast     20, 20, 1
+	vcipherlast     21, 21, 1
+	vcipherlast     22, 22, 1
+
+	lxvb16x	32+23, 0, 14	# load block
+	lxvb16x	32+24, 15, 14	# load block
+	lxvb16x	32+25, 16, 14	# load block
+	lxvb16x	32+26, 17, 14	# load block
+	lxvb16x	32+27, 18, 14	# load block
+	lxvb16x	32+28, 19, 14	# load block
+	lxvb16x	32+29, 20, 14	# load block
+	lxvb16x	32+30, 21, 14	# load block
+	addi	14, 14, 128
+
+	vxor	15, 15, 23
+	vxor	16, 16, 24
+	vxor	17, 17, 25
+	vxor	18, 18, 26
+	vxor	19, 19, 27
+	vxor	20, 20, 28
+	vxor	21, 21, 29
+	vxor	22, 22, 30
+
+	stxvb16x 47, 0, 9	# store output
+	stxvb16x 48, 15, 9	# store output
+	stxvb16x 49, 16, 9	# store output
+	stxvb16x 50, 17, 9	# store output
+	stxvb16x 51, 18, 9	# store output
+	stxvb16x 52, 19, 9	# store output
+	stxvb16x 53, 20, 9	# store output
+	stxvb16x 54, 21, 9	# store output
+	addi	9, 9, 128
+
+	vxor	15, 15, 0
+	PPC_GHASH4x 0, 15, 16, 17, 18
+
+	vxor	19, 19, 0
+	PPC_GHASH4x 0, 19, 20, 21, 22
+
+	xxlor	30+32, 9, 9		# last ctr
+	vadduwm	30, 30, 31		# increase ctr
+	stxvb16x 32+30, 0, 7		# update IV
+	stxvb16x 32+0, 0, 8		# update Xi
+
+	addi    5, 5, -128
+	addi    11, 11, 128
 
-	# 192 bits
-	xxlor	24+32, 11, 11
+	#
+	# Done 8x blocks
+	#
 
-	vcipher	15, 15, 23
-	vcipher	15, 15, 24
+	cmpdi   5, 0
+	beq     aes_gcm_out
 
-	xxlor	23+32, 12, 12
+__Process_more_enc:
+	li	24, 1			# encrypt
+	bl	aes_gcm_crypt_1x
+	cmpdi   5, 0
+	beq     aes_gcm_out
 
-	cmpdi	10, 12
-	beq	Do_next_1x_dec
+	bl	__Process_partial
+	cmpdi   5, 0
+	beq     aes_gcm_out
+__Do_combine_enc:
+	bl	__Combine_partial
+	cmpdi	5, 0
+	bgt	__Process_encrypt
+	b	aes_gcm_out
 
-	# 256 bits
-	xxlor	24+32, 13, 13
+SYM_FUNC_END(aes_p10_gcm_encrypt)
 
-	vcipher	15, 15, 23
-	vcipher	15, 15, 24
+################################################################################
+# aes_p10_gcm_decrypt (const void *inp, void *out, size_t len,
+#               const char *rk, unsigned char iv[16], void *Xip);
+# 8x Decrypt
+#
+################################################################################
+SYM_FUNC_START(aes_p10_gcm_decrypt)
 
-	xxlor	23+32, 14, 14
+	cmpdi	5, 0
+	ble	__Invalid_msg_len
 
-	cmpdi	10, 14
-	beq	Do_next_1x_dec
+	SAVE_REGS
+	LOAD_HASH_TABLE
 
-Do_next_1x_dec:
-	vcipherlast     15, 15, 23
+	# initialize ICB: GHASH( IV ), IV - r7
+	lxvb16x	30+32, 0, 7	# load IV  - v30
 
-	xxlxor		47, 47, 15
-	stxvb16x	47, 0, 9	# store output
-	addi		14, 14, 16
-	addi		9, 9, 16
+	mr	14, 3
+	mr	9, 4
 
-	xxlor           28+32, 15, 15
-	#vmr		28, 15
-	ppc_update_hash_1x
+	# counter 1
+	vxor	31, 31, 31
+	vspltisb 22, 1
+	vsldoi	31, 31, 22,1	# counter 1
 
-	addi		12, 12, -16
-	addi		11, 11, 16
-	xxlor		19+32, 0, 0
-	vaddudm		30, 30, 31		# IV + counter
-	vxor		15, 30, 19		# add round key
+	addis	11, 2, permx@toc@ha
+	addi	11, 11, permx@toc@l
+	lxv	10, 0(11)	# vs10: vpermxor vector
+	li	11, 0
+
+	# load 9 round keys to VSR
+	lxv	0, 0(6)			# round key 0
+	lxv	1, 16(6)		# round key 1
+	lxv	2, 32(6)		# round key 2
+	lxv	3, 48(6)		# round key 3
+	lxv	4, 64(6)		# round key 4
+	lxv	5, 80(6)		# round key 5
+	lxv	6, 96(6)		# round key 6
+	lxv	7, 112(6)		# round key 7
+	lxv	8, 128(6)		# round key 8
 
-	bdnz	Next_rem_block_dec
+	# load rounds - 10 (128), 12 (192), 14 (256)
+	lwz	23, 240(6)		# n rounds
+	li	24, 0			# decrypt
 
-	li	15, 0
-	std	15, 56(7)		# clear partial?
-	stxvb16x 30+32, 0, 7		# update IV
+__Process_decrypt:
+	#
+	# Process different blocks
+	#
+	ld	12, 56(7)
 	cmpdi	12, 0
-	beq	aes_gcm_out
-
-Final_block_dec:
-	lwz	10, 240(6)
-	Loop_aes_middle_1x
-
-	xxlor	23+32, 10, 10
-
-	cmpdi	10, 10
-	beq	Do_final_1x_dec
+	bgt	__Do_combine_dec
+	cmpdi	5, 128
+	blt	__Process_more_dec
+
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_dec:
+	# 8x blcoks
+	li	10, 128
+	divdu	12, 5, 10	# n 128 bytes-blocks
 
-	# 192 bits
-	xxlor	24+32, 11, 11
+	addi	12, 12, -1	# loop - 1
 
-	vcipher	15, 15, 23
-	vcipher	15, 15, 24
+	vmr	15, 30		# first state: IV
+	vadduwm	16, 15, 31	# state + counter
+	vadduwm	17, 16, 31
+	vadduwm	18, 17, 31
+	vadduwm	19, 18, 31
+	vadduwm	20, 19, 31
+	vadduwm	21, 20, 31
+	vadduwm	22, 21, 31
+	xxlor	9, 32+22, 32+22	# save last state
 
-	xxlor	23+32, 12, 12
+	# vxor  state, state, w # addroundkey
+	xxlor	32+29, 0, 0
+        vxor    15, 15, 29      # IV + round key - add round key 0
+	vxor	16, 16, 29
+	vxor	17, 17, 29
+	vxor	18, 18, 29
+	vxor	19, 19, 29
+	vxor	20, 20, 29
+	vxor	21, 21, 29
+	vxor	22, 22, 29
 
-	cmpdi	10, 12
-	beq	Do_final_1x_dec
+	li	15, 16
+	li	16, 32
+	li	17, 48
+	li	18, 64
+	li	19, 80
+	li	20, 96
+	li	21, 112
 
-	# 256 bits
-	xxlor	24+32, 13, 13
+	#
+	# Pre-compute first 8 AES state and leave 1/3/5 more rounds
+	# for the loop.
+	#
+	addi	22, 23, -9		# process 8 keys
+	mtctr	22			# AES key loop
+	addi	10, 6, 144
 
-	vcipher	15, 15, 23
-	vcipher	15, 15, 24
+	LOOP_8AES_STATE			# process 8 AES keys
 
-	xxlor	23+32, 14, 14
+__PreLoop_aes_state_dec:
+	lxv	32+1, 0(10)		# round key
+	AES_CIPHER_8x vcipher 15 1
+	addi	10, 10, 16
+	bdnz	__PreLoop_aes_state_dec
+	lxv	32+1, 0(10)		# last round key (v1)
+
+	cmpdi	12, 0			# Only one loop (8 block)
+	beq	__Finish_ghash_dec
+
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_dec:
+	vcipherlast     15, 15, 1
+	vcipherlast     16, 16, 1
+	vcipherlast     17, 17, 1
+	vcipherlast     18, 18, 1
+	vcipherlast     19, 19, 1
+	vcipherlast     20, 20, 1
+	vcipherlast     21, 21, 1
+	vcipherlast     22, 22, 1
+
+	lxvb16x	32+23, 0, 14	# load block
+	lxvb16x	32+24, 15, 14	# load block
+	lxvb16x	32+25, 16, 14	# load block
+	lxvb16x	32+26, 17, 14	# load block
+	lxvb16x	32+27, 18, 14	# load block
+	lxvb16x	32+28, 19, 14	# load block
+	lxvb16x	32+29, 20, 14	# load block
+	lxvb16x	32+30, 21, 14	# load block
+	addi	14, 14, 128
+
+	vxor	15, 15, 23
+	vxor	16, 16, 24
+	vxor	17, 17, 25
+	vxor	18, 18, 26
+	vxor	19, 19, 27
+	vxor	20, 20, 28
+	vxor	21, 21, 29
+	vxor	22, 22, 30
+
+	stxvb16x 47, 0, 9	# store output
+	stxvb16x 48, 15, 9	# store output
+	stxvb16x 49, 16, 9	# store output
+	stxvb16x 50, 17, 9	# store output
+	stxvb16x 51, 18, 9	# store output
+	stxvb16x 52, 19, 9	# store output
+	stxvb16x 53, 20, 9	# store output
+	stxvb16x 54, 21, 9	# store output
+
+	addi	9, 9, 128
+
+	vmr	15, 23
+	vmr	16, 24
+	vmr	17, 25
+	vmr	18, 26
+	vmr	19, 27
+	vmr	20, 28
+	vmr	21, 29
+	vmr	22, 30
 
-	cmpdi	10, 14
-	beq	Do_final_1x_dec
+	# ghash here
+	vxor	15, 15, 0
+	PPC_GHASH4x 0, 15, 16, 17, 18
+
+	vxor	19, 19, 0
+	PPC_GHASH4x 0, 19, 20, 21, 22
+
+	xxlor	32+15, 9, 9		# last state
+	vadduwm 15, 15, 31		# state + counter
+	vadduwm 16, 15, 31
+	vadduwm 17, 16, 31
+	vadduwm 18, 17, 31
+	vadduwm 19, 18, 31
+	vadduwm 20, 19, 31
+	vadduwm 21, 20, 31
+	vadduwm 22, 21, 31
+	xxlor	9, 32+22, 32+22		# save last state
+
+	xxlor	32+27, 0, 0		# restore roundkey 0
+        vxor    15, 15, 27		# IV + round key - add round key 0
+	vxor	16, 16, 27
+	vxor	17, 17, 27
+	vxor	18, 18, 27
+	vxor	19, 19, 27
+	vxor	20, 20, 27
+	vxor	21, 21, 27
+	vxor	22, 22, 27
+
+	addi    5, 5, -128
+	addi    11, 11, 128
 
-Do_final_1x_dec:
-	vcipherlast     15, 15, 23
+	LOOP_8AES_STATE			# process 8 AES keys
+	mtctr	22			# AES key loop
+	addi	10, 6, 144
+__LastLoop_aes_state_dec:
+	lxv	32+1, 0(10)		# round key
+	AES_CIPHER_8x vcipher 15 1
+	addi	10, 10, 16
+	bdnz	__LastLoop_aes_state_dec
+	lxv	32+1, 0(10)		# last round key (v1)
 
-	# check partial block
-	li	21, 1			# decrypt
-	ld	15, 56(7)		# partial?
-	cmpdi	15, 0
-	beq	Normal_block_dec
-	bl	Do_partial_block
+	addi	12, 12, -1
 	cmpdi	12, 0
-	ble aes_gcm_out
-
-	b Continue_partial_check_dec
+	bne	__Loop_8x_block_dec
+
+__Finish_ghash_dec:
+	vcipherlast     15, 15, 1
+	vcipherlast     16, 16, 1
+	vcipherlast     17, 17, 1
+	vcipherlast     18, 18, 1
+	vcipherlast     19, 19, 1
+	vcipherlast     20, 20, 1
+	vcipherlast     21, 21, 1
+	vcipherlast     22, 22, 1
+
+	lxvb16x	32+23, 0, 14	# load block
+	lxvb16x	32+24, 15, 14	# load block
+	lxvb16x	32+25, 16, 14	# load block
+	lxvb16x	32+26, 17, 14	# load block
+	lxvb16x	32+27, 18, 14	# load block
+	lxvb16x	32+28, 19, 14	# load block
+	lxvb16x	32+29, 20, 14	# load block
+	lxvb16x	32+30, 21, 14	# load block
+	addi	14, 14, 128
+
+	vxor	15, 15, 23
+	vxor	16, 16, 24
+	vxor	17, 17, 25
+	vxor	18, 18, 26
+	vxor	19, 19, 27
+	vxor	20, 20, 28
+	vxor	21, 21, 29
+	vxor	22, 22, 30
+
+	stxvb16x 47, 0, 9	# store output
+	stxvb16x 48, 15, 9	# store output
+	stxvb16x 49, 16, 9	# store output
+	stxvb16x 50, 17, 9	# store output
+	stxvb16x 51, 18, 9	# store output
+	stxvb16x 52, 19, 9	# store output
+	stxvb16x 53, 20, 9	# store output
+	stxvb16x 54, 21, 9	# store output
+	addi	9, 9, 128
+
+	#vmr	15, 23
+	vxor	15, 23, 0
+	vmr	16, 24
+	vmr	17, 25
+	vmr	18, 26
+	vmr	19, 27
+	vmr	20, 28
+	vmr	21, 29
+	vmr	22, 30
+
+	#vxor	15, 15, 0
+	PPC_GHASH4x 0, 15, 16, 17, 18
+
+	vxor	19, 19, 0
+	PPC_GHASH4x 0, 19, 20, 21, 22
+
+	xxlor	30+32, 9, 9		# last ctr
+	vadduwm	30, 30, 31		# increase ctr
+	stxvb16x 32+30, 0, 7		# update IV
+	stxvb16x 32+0, 0, 8		# update Xi
+
+	addi    5, 5, -128
+	addi    11, 11, 128
 
-Normal_block_dec:
-	lxvb16x	15, 0, 14		# load last block
-	xxlxor	47, 47, 15
+	#
+	# Done 8x blocks
+	#
 
-	# create partial block mask
-	li	15, 16
-	sub	15, 15, 12		# index to the mask
+	cmpdi   5, 0
+	beq     aes_gcm_out
 
-	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
-	vspltisb	17, 0		# second 16 bytes - 0x0000...00
-	li	10, 192
-	stvx	16, 10, 1
-	addi	10, 10, 16
-	stvx	17, 10, 1
+__Process_more_dec:
+	li	24, 0			# decrypt
+	bl	aes_gcm_crypt_1x
+	cmpdi   5, 0
+	beq     aes_gcm_out
 
-	addi	10, 1, 192
-	lxvb16x	16, 15, 10		# load partial block mask
-	xxland	47, 47, 16
+	bl	__Process_partial
+	cmpdi   5, 0
+	beq     aes_gcm_out
+__Do_combine_dec:
+	bl	__Combine_partial
+	cmpdi	5, 0
+	bgt	__Process_decrypt
+	b	aes_gcm_out
+SYM_FUNC_END(aes_p10_gcm_decrypt)
 
-	xxland	32+28, 15, 16
-	#vmr	28, 15
-	ppc_update_hash_1x
+SYM_FUNC_START_LOCAL(aes_gcm_out)
 
-	# * should store only the remaining bytes.
-	bl	Write_partial_block
+	mr	3, 11			# return count
 
-	stxvb16x 30+32, 0, 7		# update IV
-	std	12, 56(7)		# update partial?
-	li	16, 16
+	RESTORE_REGS
+	blr
 
-	stxvb16x	32, 0, 8		# write out Xi
-	stxvb16x	32, 16, 8		# write out Xi
-	b aes_gcm_out
+__Invalid_msg_len:
+	li	3, 0
+	blr
+SYM_FUNC_END(aes_gcm_out)
+
+SYM_DATA_START_LOCAL(PERMX)
+.align 4
+# for vector permute and xor
+permx:
+.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3
+SYM_DATA_END(permx)
diff --git a/arch/powerpc/crypto/aes.c b/arch/powerpc/crypto/aes.c
new file mode 100644
index 000000000000..ec06189fbf99
--- /dev/null
+++ b/arch/powerpc/crypto/aes.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AES routines supporting VMX instructions on the Power 8
+ *
+ * Copyright (C) 2015 International Business Machines Inc.
+ *
+ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/crypto.h>
+#include <linux/delay.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <crypto/aes.h>
+#include <crypto/internal/cipher.h>
+#include <crypto/internal/simd.h>
+
+#include "aesp8-ppc.h"
+
+struct p8_aes_ctx {
+	struct crypto_cipher *fallback;
+	struct aes_key enc_key;
+	struct aes_key dec_key;
+};
+
+static int p8_aes_init(struct crypto_tfm *tfm)
+{
+	const char *alg = crypto_tfm_alg_name(tfm);
+	struct crypto_cipher *fallback;
+	struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	fallback = crypto_alloc_cipher(alg, 0, CRYPTO_ALG_NEED_FALLBACK);
+	if (IS_ERR(fallback)) {
+		printk(KERN_ERR
+		       "Failed to allocate transformation for '%s': %ld\n",
+		       alg, PTR_ERR(fallback));
+		return PTR_ERR(fallback);
+	}
+
+	crypto_cipher_set_flags(fallback,
+				crypto_cipher_get_flags((struct
+							 crypto_cipher *)
+							tfm));
+	ctx->fallback = fallback;
+
+	return 0;
+}
+
+static void p8_aes_exit(struct crypto_tfm *tfm)
+{
+	struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (ctx->fallback) {
+		crypto_free_cipher(ctx->fallback);
+		ctx->fallback = NULL;
+	}
+}
+
+static int p8_aes_setkey(struct crypto_tfm *tfm, const u8 *key,
+			 unsigned int keylen)
+{
+	int ret;
+	struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	preempt_disable();
+	pagefault_disable();
+	enable_kernel_vsx();
+	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
+	ret |= aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
+	disable_kernel_vsx();
+	pagefault_enable();
+	preempt_enable();
+
+	ret |= crypto_cipher_setkey(ctx->fallback, key, keylen);
+
+	return ret ? -EINVAL : 0;
+}
+
+static void p8_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (!crypto_simd_usable()) {
+		crypto_cipher_encrypt_one(ctx->fallback, dst, src);
+	} else {
+		preempt_disable();
+		pagefault_disable();
+		enable_kernel_vsx();
+		aes_p8_encrypt(src, dst, &ctx->enc_key);
+		disable_kernel_vsx();
+		pagefault_enable();
+		preempt_enable();
+	}
+}
+
+static void p8_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (!crypto_simd_usable()) {
+		crypto_cipher_decrypt_one(ctx->fallback, dst, src);
+	} else {
+		preempt_disable();
+		pagefault_disable();
+		enable_kernel_vsx();
+		aes_p8_decrypt(src, dst, &ctx->dec_key);
+		disable_kernel_vsx();
+		pagefault_enable();
+		preempt_enable();
+	}
+}
+
+struct crypto_alg p8_aes_alg = {
+	.cra_name = "aes",
+	.cra_driver_name = "p8_aes",
+	.cra_module = THIS_MODULE,
+	.cra_priority = 1000,
+	.cra_type = NULL,
+	.cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_NEED_FALLBACK,
+	.cra_alignmask = 0,
+	.cra_blocksize = AES_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct p8_aes_ctx),
+	.cra_init = p8_aes_init,
+	.cra_exit = p8_aes_exit,
+	.cra_cipher = {
+		       .cia_min_keysize = AES_MIN_KEY_SIZE,
+		       .cia_max_keysize = AES_MAX_KEY_SIZE,
+		       .cia_setkey = p8_aes_setkey,
+		       .cia_encrypt = p8_aes_encrypt,
+		       .cia_decrypt = p8_aes_decrypt,
+	},
+};
diff --git a/arch/powerpc/crypto/aes_cbc.c b/arch/powerpc/crypto/aes_cbc.c
new file mode 100644
index 000000000000..ed0debc7acb5
--- /dev/null
+++ b/arch/powerpc/crypto/aes_cbc.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AES CBC routines supporting VMX instructions on the Power 8
+ *
+ * Copyright (C) 2015 International Business Machines Inc.
+ *
+ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+
+#include "aesp8-ppc.h"
+
+struct p8_aes_cbc_ctx {
+	struct crypto_skcipher *fallback;
+	struct aes_key enc_key;
+	struct aes_key dec_key;
+};
+
+static int p8_aes_cbc_init(struct crypto_skcipher *tfm)
+{
+	struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_skcipher *fallback;
+
+	fallback = crypto_alloc_skcipher("cbc(aes)", 0,
+					 CRYPTO_ALG_NEED_FALLBACK |
+					 CRYPTO_ALG_ASYNC);
+	if (IS_ERR(fallback)) {
+		pr_err("Failed to allocate cbc(aes) fallback: %ld\n",
+		       PTR_ERR(fallback));
+		return PTR_ERR(fallback);
+	}
+
+	crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
+				    crypto_skcipher_reqsize(fallback));
+	ctx->fallback = fallback;
+	return 0;
+}
+
+static void p8_aes_cbc_exit(struct crypto_skcipher *tfm)
+{
+	struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	crypto_free_skcipher(ctx->fallback);
+}
+
+static int p8_aes_cbc_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			     unsigned int keylen)
+{
+	struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	int ret;
+
+	preempt_disable();
+	pagefault_disable();
+	enable_kernel_vsx();
+	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
+	ret |= aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
+	disable_kernel_vsx();
+	pagefault_enable();
+	preempt_enable();
+
+	ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen);
+
+	return ret ? -EINVAL : 0;
+}
+
+static int p8_aes_cbc_crypt(struct skcipher_request *req, int enc)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	unsigned int nbytes;
+	int ret;
+
+	if (!crypto_simd_usable()) {
+		struct skcipher_request *subreq = skcipher_request_ctx(req);
+
+		*subreq = *req;
+		skcipher_request_set_tfm(subreq, ctx->fallback);
+		return enc ? crypto_skcipher_encrypt(subreq) :
+			     crypto_skcipher_decrypt(subreq);
+	}
+
+	ret = skcipher_walk_virt(&walk, req, false);
+	while ((nbytes = walk.nbytes) != 0) {
+		preempt_disable();
+		pagefault_disable();
+		enable_kernel_vsx();
+		aes_p8_cbc_encrypt(walk.src.virt.addr,
+				   walk.dst.virt.addr,
+				   round_down(nbytes, AES_BLOCK_SIZE),
+				   enc ? &ctx->enc_key : &ctx->dec_key,
+				   walk.iv, enc);
+		disable_kernel_vsx();
+		pagefault_enable();
+		preempt_enable();
+
+		ret = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE);
+	}
+	return ret;
+}
+
+static int p8_aes_cbc_encrypt(struct skcipher_request *req)
+{
+	return p8_aes_cbc_crypt(req, 1);
+}
+
+static int p8_aes_cbc_decrypt(struct skcipher_request *req)
+{
+	return p8_aes_cbc_crypt(req, 0);
+}
+
+struct skcipher_alg p8_aes_cbc_alg = {
+	.base.cra_name = "cbc(aes)",
+	.base.cra_driver_name = "p8_aes_cbc",
+	.base.cra_module = THIS_MODULE,
+	.base.cra_priority = 2000,
+	.base.cra_flags = CRYPTO_ALG_NEED_FALLBACK,
+	.base.cra_blocksize = AES_BLOCK_SIZE,
+	.base.cra_ctxsize = sizeof(struct p8_aes_cbc_ctx),
+	.setkey = p8_aes_cbc_setkey,
+	.encrypt = p8_aes_cbc_encrypt,
+	.decrypt = p8_aes_cbc_decrypt,
+	.init = p8_aes_cbc_init,
+	.exit = p8_aes_cbc_exit,
+	.min_keysize = AES_MIN_KEY_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+};
diff --git a/arch/powerpc/crypto/aes_ctr.c b/arch/powerpc/crypto/aes_ctr.c
new file mode 100644
index 000000000000..9a3da8cd62f3
--- /dev/null
+++ b/arch/powerpc/crypto/aes_ctr.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AES CTR routines supporting VMX instructions on the Power 8
+ *
+ * Copyright (C) 2015 International Business Machines Inc.
+ *
+ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+
+#include "aesp8-ppc.h"
+
+struct p8_aes_ctr_ctx {
+	struct crypto_skcipher *fallback;
+	struct aes_key enc_key;
+};
+
+static int p8_aes_ctr_init(struct crypto_skcipher *tfm)
+{
+	struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_skcipher *fallback;
+
+	fallback = crypto_alloc_skcipher("ctr(aes)", 0,
+					 CRYPTO_ALG_NEED_FALLBACK |
+					 CRYPTO_ALG_ASYNC);
+	if (IS_ERR(fallback)) {
+		pr_err("Failed to allocate ctr(aes) fallback: %ld\n",
+		       PTR_ERR(fallback));
+		return PTR_ERR(fallback);
+	}
+
+	crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
+				    crypto_skcipher_reqsize(fallback));
+	ctx->fallback = fallback;
+	return 0;
+}
+
+static void p8_aes_ctr_exit(struct crypto_skcipher *tfm)
+{
+	struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	crypto_free_skcipher(ctx->fallback);
+}
+
+static int p8_aes_ctr_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			     unsigned int keylen)
+{
+	struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+	int ret;
+
+	preempt_disable();
+	pagefault_disable();
+	enable_kernel_vsx();
+	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
+	disable_kernel_vsx();
+	pagefault_enable();
+	preempt_enable();
+
+	ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen);
+
+	return ret ? -EINVAL : 0;
+}
+
+static void p8_aes_ctr_final(const struct p8_aes_ctr_ctx *ctx,
+			     struct skcipher_walk *walk)
+{
+	u8 *ctrblk = walk->iv;
+	u8 keystream[AES_BLOCK_SIZE];
+	u8 *src = walk->src.virt.addr;
+	u8 *dst = walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes;
+
+	preempt_disable();
+	pagefault_disable();
+	enable_kernel_vsx();
+	aes_p8_encrypt(ctrblk, keystream, &ctx->enc_key);
+	disable_kernel_vsx();
+	pagefault_enable();
+	preempt_enable();
+
+	crypto_xor_cpy(dst, keystream, src, nbytes);
+	crypto_inc(ctrblk, AES_BLOCK_SIZE);
+}
+
+static int p8_aes_ctr_crypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	unsigned int nbytes;
+	int ret;
+
+	if (!crypto_simd_usable()) {
+		struct skcipher_request *subreq = skcipher_request_ctx(req);
+
+		*subreq = *req;
+		skcipher_request_set_tfm(subreq, ctx->fallback);
+		return crypto_skcipher_encrypt(subreq);
+	}
+
+	ret = skcipher_walk_virt(&walk, req, false);
+	while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
+		preempt_disable();
+		pagefault_disable();
+		enable_kernel_vsx();
+		aes_p8_ctr32_encrypt_blocks(walk.src.virt.addr,
+					    walk.dst.virt.addr,
+					    nbytes / AES_BLOCK_SIZE,
+					    &ctx->enc_key, walk.iv);
+		disable_kernel_vsx();
+		pagefault_enable();
+		preempt_enable();
+
+		do {
+			crypto_inc(walk.iv, AES_BLOCK_SIZE);
+		} while ((nbytes -= AES_BLOCK_SIZE) >= AES_BLOCK_SIZE);
+
+		ret = skcipher_walk_done(&walk, nbytes);
+	}
+	if (nbytes) {
+		p8_aes_ctr_final(ctx, &walk);
+		ret = skcipher_walk_done(&walk, 0);
+	}
+	return ret;
+}
+
+struct skcipher_alg p8_aes_ctr_alg = {
+	.base.cra_name = "ctr(aes)",
+	.base.cra_driver_name = "p8_aes_ctr",
+	.base.cra_module = THIS_MODULE,
+	.base.cra_priority = 2000,
+	.base.cra_flags = CRYPTO_ALG_NEED_FALLBACK,
+	.base.cra_blocksize = 1,
+	.base.cra_ctxsize = sizeof(struct p8_aes_ctr_ctx),
+	.setkey = p8_aes_ctr_setkey,
+	.encrypt = p8_aes_ctr_crypt,
+	.decrypt = p8_aes_ctr_crypt,
+	.init = p8_aes_ctr_init,
+	.exit = p8_aes_ctr_exit,
+	.min_keysize = AES_MIN_KEY_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+	.chunksize = AES_BLOCK_SIZE,
+};
diff --git a/arch/powerpc/crypto/aes_xts.c b/arch/powerpc/crypto/aes_xts.c
new file mode 100644
index 000000000000..dabbccb41550
--- /dev/null
+++ b/arch/powerpc/crypto/aes_xts.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AES XTS routines supporting VMX In-core instructions on Power 8
+ *
+ * Copyright (C) 2015 International Business Machines Inc.
+ *
+ * Author: Leonidas S. Barbosa <leosilva@linux.vnet.ibm.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/xts.h>
+
+#include "aesp8-ppc.h"
+
+struct p8_aes_xts_ctx {
+	struct crypto_skcipher *fallback;
+	struct aes_key enc_key;
+	struct aes_key dec_key;
+	struct aes_key tweak_key;
+};
+
+static int p8_aes_xts_init(struct crypto_skcipher *tfm)
+{
+	struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_skcipher *fallback;
+
+	fallback = crypto_alloc_skcipher("xts(aes)", 0,
+					 CRYPTO_ALG_NEED_FALLBACK |
+					 CRYPTO_ALG_ASYNC);
+	if (IS_ERR(fallback)) {
+		pr_err("Failed to allocate xts(aes) fallback: %ld\n",
+		       PTR_ERR(fallback));
+		return PTR_ERR(fallback);
+	}
+
+	crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
+				    crypto_skcipher_reqsize(fallback));
+	ctx->fallback = fallback;
+	return 0;
+}
+
+static void p8_aes_xts_exit(struct crypto_skcipher *tfm)
+{
+	struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	crypto_free_skcipher(ctx->fallback);
+}
+
+static int p8_aes_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			     unsigned int keylen)
+{
+	struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	int ret;
+
+	ret = xts_verify_key(tfm, key, keylen);
+	if (ret)
+		return ret;
+
+	preempt_disable();
+	pagefault_disable();
+	enable_kernel_vsx();
+	ret = aes_p8_set_encrypt_key(key + keylen/2, (keylen/2) * 8, &ctx->tweak_key);
+	ret |= aes_p8_set_encrypt_key(key, (keylen/2) * 8, &ctx->enc_key);
+	ret |= aes_p8_set_decrypt_key(key, (keylen/2) * 8, &ctx->dec_key);
+	disable_kernel_vsx();
+	pagefault_enable();
+	preempt_enable();
+
+	ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen);
+
+	return ret ? -EINVAL : 0;
+}
+
+static int p8_aes_xts_crypt(struct skcipher_request *req, int enc)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	unsigned int nbytes;
+	u8 tweak[AES_BLOCK_SIZE];
+	int ret;
+
+	if (req->cryptlen < AES_BLOCK_SIZE)
+		return -EINVAL;
+
+	if (!crypto_simd_usable() || (req->cryptlen % XTS_BLOCK_SIZE) != 0) {
+		struct skcipher_request *subreq = skcipher_request_ctx(req);
+
+		*subreq = *req;
+		skcipher_request_set_tfm(subreq, ctx->fallback);
+		return enc ? crypto_skcipher_encrypt(subreq) :
+			     crypto_skcipher_decrypt(subreq);
+	}
+
+	ret = skcipher_walk_virt(&walk, req, false);
+	if (ret)
+		return ret;
+
+	preempt_disable();
+	pagefault_disable();
+	enable_kernel_vsx();
+
+	aes_p8_encrypt(walk.iv, tweak, &ctx->tweak_key);
+
+	disable_kernel_vsx();
+	pagefault_enable();
+	preempt_enable();
+
+	while ((nbytes = walk.nbytes) != 0) {
+		preempt_disable();
+		pagefault_disable();
+		enable_kernel_vsx();
+		if (enc)
+			aes_p8_xts_encrypt(walk.src.virt.addr,
+					   walk.dst.virt.addr,
+					   round_down(nbytes, AES_BLOCK_SIZE),
+					   &ctx->enc_key, NULL, tweak);
+		else
+			aes_p8_xts_decrypt(walk.src.virt.addr,
+					   walk.dst.virt.addr,
+					   round_down(nbytes, AES_BLOCK_SIZE),
+					   &ctx->dec_key, NULL, tweak);
+		disable_kernel_vsx();
+		pagefault_enable();
+		preempt_enable();
+
+		ret = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE);
+	}
+	return ret;
+}
+
+static int p8_aes_xts_encrypt(struct skcipher_request *req)
+{
+	return p8_aes_xts_crypt(req, 1);
+}
+
+static int p8_aes_xts_decrypt(struct skcipher_request *req)
+{
+	return p8_aes_xts_crypt(req, 0);
+}
+
+struct skcipher_alg p8_aes_xts_alg = {
+	.base.cra_name = "xts(aes)",
+	.base.cra_driver_name = "p8_aes_xts",
+	.base.cra_module = THIS_MODULE,
+	.base.cra_priority = 2000,
+	.base.cra_flags = CRYPTO_ALG_NEED_FALLBACK,
+	.base.cra_blocksize = AES_BLOCK_SIZE,
+	.base.cra_ctxsize = sizeof(struct p8_aes_xts_ctx),
+	.setkey = p8_aes_xts_setkey,
+	.encrypt = p8_aes_xts_encrypt,
+	.decrypt = p8_aes_xts_decrypt,
+	.init = p8_aes_xts_init,
+	.exit = p8_aes_xts_exit,
+	.min_keysize = 2 * AES_MIN_KEY_SIZE,
+	.max_keysize = 2 * AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+};
diff --git a/arch/powerpc/crypto/aesp8-ppc.h b/arch/powerpc/crypto/aesp8-ppc.h
new file mode 100644
index 000000000000..5764d4438388
--- /dev/null
+++ b/arch/powerpc/crypto/aesp8-ppc.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <crypto/aes.h>
+
+struct aes_key {
+	u8 key[AES_MAX_KEYLENGTH];
+	int rounds;
+};
+
+extern struct shash_alg p8_ghash_alg;
+extern struct crypto_alg p8_aes_alg;
+extern struct skcipher_alg p8_aes_cbc_alg;
+extern struct skcipher_alg p8_aes_ctr_alg;
+extern struct skcipher_alg p8_aes_xts_alg;
+
+int aes_p8_set_encrypt_key(const u8 *userKey, const int bits,
+			   struct aes_key *key);
+int aes_p8_set_decrypt_key(const u8 *userKey, const int bits,
+			   struct aes_key *key);
+void aes_p8_encrypt(const u8 *in, u8 *out, const struct aes_key *key);
+void aes_p8_decrypt(const u8 *in, u8 *out, const struct aes_key *key);
+void aes_p8_cbc_encrypt(const u8 *in, u8 *out, size_t len,
+			const struct aes_key *key, u8 *iv, const int enc);
+void aes_p8_ctr32_encrypt_blocks(const u8 *in, u8 *out,
+				 size_t len, const struct aes_key *key,
+				 const u8 *iv);
+void aes_p8_xts_encrypt(const u8 *in, u8 *out, size_t len,
+			const struct aes_key *key1, const struct aes_key *key2, u8 *iv);
+void aes_p8_xts_decrypt(const u8 *in, u8 *out, size_t len,
+			const struct aes_key *key1, const struct aes_key *key2, u8 *iv);
diff --git a/arch/powerpc/crypto/aesp8-ppc.pl b/arch/powerpc/crypto/aesp8-ppc.pl
new file mode 100644
index 000000000000..f729589d792e
--- /dev/null
+++ b/arch/powerpc/crypto/aesp8-ppc.pl
@@ -0,0 +1,3889 @@
+#! /usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://www.openssl.org/~appro/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#       * Redistributions of source code must retain copyright notices,
+#         this list of conditions and the following disclaimer.
+#
+#       * Redistributions in binary form must reproduce the above
+#         copyright notice, this list of conditions and the following
+#         disclaimer in the documentation and/or other materials
+#         provided with the distribution.
+#
+#       * Neither the name of the CRYPTOGAMS nor the names of its
+#         copyright holder and contributors may be used to endorse or
+#         promote products derived from this software without specific
+#         prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for AES instructions as per PowerISA
+# specification version 2.07, first implemented by POWER8 processor.
+# The module is endian-agnostic in sense that it supports both big-
+# and little-endian cases. Data alignment in parallelizable modes is
+# handled with VSX loads and stores, which implies MSR.VSX flag being
+# set. It should also be noted that ISA specification doesn't prohibit
+# alignment exceptions for these instructions on page boundaries.
+# Initially alignment was handled in pure AltiVec/VMX way [when data
+# is aligned programmatically, which in turn guarantees exception-
+# free execution], but it turned to hamper performance when vcipher
+# instructions are interleaved. It's reckoned that eventual
+# misalignment penalties at page boundaries are in average lower
+# than additional overhead in pure AltiVec approach.
+#
+# May 2016
+#
+# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
+# systems were measured.
+#
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+#		CBC en-/decrypt	CTR	XTS
+# POWER8[le]	3.96/0.72	0.74	1.1
+# POWER8[be]	3.75/0.65	0.66	1.0
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+	$SIZE_T	=8;
+	$LRSAVE	=2*$SIZE_T;
+	$STU	="stdu";
+	$POP	="ld";
+	$PUSH	="std";
+	$UCMP	="cmpld";
+	$SHL	="sldi";
+} elsif ($flavour =~ /32/) {
+	$SIZE_T	=4;
+	$LRSAVE	=$SIZE_T;
+	$STU	="stwu";
+	$POP	="lwz";
+	$PUSH	="stw";
+	$UCMP	="cmplw";
+	$SHL	="slwi";
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=8*$SIZE_T;
+$prefix="aes_p8";
+
+$sp="r1";
+$vrsave="r12";
+
+#########################################################################
+{{{	# Key setup procedures						#
+my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
+my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
+my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
+
+$code.=<<___;
+.machine	"any"
+
+.text
+
+.align	7
+rcon:
+.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
+.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
+.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
+.long	0,0,0,0						?asis
+.long	0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
+Lconsts:
+	mflr	r0
+	bcl	20,31,\$+4
+	mflr	$ptr	 #vvvvv "distance between . and rcon
+	addi	$ptr,$ptr,-0x58
+	mtlr	r0
+	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+
+.globl	.${prefix}_set_encrypt_key
+Lset_encrypt_key:
+	mflr		r11
+	$PUSH		r11,$LRSAVE($sp)
+
+	li		$ptr,-1
+	${UCMP}i	$inp,0
+	beq-		Lenc_key_abort		# if ($inp==0) return -1;
+	${UCMP}i	$out,0
+	beq-		Lenc_key_abort		# if ($out==0) return -1;
+	li		$ptr,-2
+	cmpwi		$bits,128
+	blt-		Lenc_key_abort
+	cmpwi		$bits,256
+	bgt-		Lenc_key_abort
+	andi.		r0,$bits,0x3f
+	bne-		Lenc_key_abort
+
+	lis		r0,0xfff0
+	mfspr		$vrsave,256
+	mtspr		256,r0
+
+	bl		Lconsts
+	mtlr		r11
+
+	neg		r9,$inp
+	lvx		$in0,0,$inp
+	addi		$inp,$inp,15		# 15 is not typo
+	lvsr		$key,0,r9		# borrow $key
+	li		r8,0x20
+	cmpwi		$bits,192
+	lvx		$in1,0,$inp
+	le?vspltisb	$mask,0x0f		# borrow $mask
+	lvx		$rcon,0,$ptr
+	le?vxor		$key,$key,$mask		# adjust for byte swap
+	lvx		$mask,r8,$ptr
+	addi		$ptr,$ptr,0x10
+	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
+	li		$cnt,8
+	vxor		$zero,$zero,$zero
+	mtctr		$cnt
+
+	?lvsr		$outperm,0,$out
+	vspltisb	$outmask,-1
+	lvx		$outhead,0,$out
+	?vperm		$outmask,$zero,$outmask,$outperm
+
+	blt		Loop128
+	addi		$inp,$inp,8
+	beq		L192
+	addi		$inp,$inp,8
+	b		L256
+
+.align	4
+Loop128:
+	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vcipherlast	$key,$key,$rcon
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	 vadduwm	$rcon,$rcon,$rcon
+	vxor		$in0,$in0,$key
+	bdnz		Loop128
+
+	lvx		$rcon,0,$ptr		# last two round keys
+
+	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vcipherlast	$key,$key,$rcon
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	 vadduwm	$rcon,$rcon,$rcon
+	vxor		$in0,$in0,$key
+
+	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vcipherlast	$key,$key,$rcon
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vxor		$in0,$in0,$key
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	 stvx		$stage,0,$out
+
+	addi		$inp,$out,15		# 15 is not typo
+	addi		$out,$out,0x50
+
+	li		$rounds,10
+	b		Ldone
+
+.align	4
+L192:
+	lvx		$tmp,0,$inp
+	li		$cnt,4
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
+	vspltisb	$key,8			# borrow $key
+	mtctr		$cnt
+	vsububm		$mask,$mask,$key	# adjust the mask
+
+Loop192:
+	vperm		$key,$in1,$in1,$mask	# roate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	vcipherlast	$key,$key,$rcon
+
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+
+	 vsldoi		$stage,$zero,$in1,8
+	vspltw		$tmp,$in0,3
+	vxor		$tmp,$tmp,$in1
+	vsldoi		$in1,$zero,$in1,12	# >>32
+	 vadduwm	$rcon,$rcon,$rcon
+	vxor		$in1,$in1,$tmp
+	vxor		$in0,$in0,$key
+	vxor		$in1,$in1,$key
+	 vsldoi		$stage,$stage,$in0,8
+
+	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	 vperm		$outtail,$stage,$stage,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vcipherlast	$key,$key,$rcon
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	 vsldoi		$stage,$in0,$in1,8
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	 vperm		$outtail,$stage,$stage,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	vspltw		$tmp,$in0,3
+	vxor		$tmp,$tmp,$in1
+	vsldoi		$in1,$zero,$in1,12	# >>32
+	 vadduwm	$rcon,$rcon,$rcon
+	vxor		$in1,$in1,$tmp
+	vxor		$in0,$in0,$key
+	vxor		$in1,$in1,$key
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	 stvx		$stage,0,$out
+	 addi		$inp,$out,15		# 15 is not typo
+	 addi		$out,$out,16
+	bdnz		Loop192
+
+	li		$rounds,12
+	addi		$out,$out,0x20
+	b		Ldone
+
+.align	4
+L256:
+	lvx		$tmp,0,$inp
+	li		$cnt,7
+	li		$rounds,14
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
+	mtctr		$cnt
+
+Loop256:
+	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	 vperm		$outtail,$in1,$in1,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vcipherlast	$key,$key,$rcon
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	 vadduwm	$rcon,$rcon,$rcon
+	vxor		$in0,$in0,$key
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	 stvx		$stage,0,$out
+	 addi		$inp,$out,15		# 15 is not typo
+	 addi		$out,$out,16
+	bdz		Ldone
+
+	vspltw		$key,$in0,3		# just splat
+	vsldoi		$tmp,$zero,$in1,12	# >>32
+	vsbox		$key,$key
+
+	vxor		$in1,$in1,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in1,$in1,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in1,$in1,$tmp
+
+	vxor		$in1,$in1,$key
+	b		Loop256
+
+.align	4
+Ldone:
+	lvx		$in1,0,$inp		# redundant in aligned case
+	vsel		$in1,$outhead,$in1,$outmask
+	stvx		$in1,0,$inp
+	li		$ptr,0
+	mtspr		256,$vrsave
+	stw		$rounds,0($out)
+
+Lenc_key_abort:
+	mr		r3,$ptr
+	blr
+	.long		0
+	.byte		0,12,0x14,1,0,0,3,0
+	.long		0
+.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
+
+.globl	.${prefix}_set_decrypt_key
+	$STU		$sp,-$FRAME($sp)
+	mflr		r10
+	$PUSH		r10,$FRAME+$LRSAVE($sp)
+	bl		Lset_encrypt_key
+	mtlr		r10
+
+	cmpwi		r3,0
+	bne-		Ldec_key_abort
+
+	slwi		$cnt,$rounds,4
+	subi		$inp,$out,240		# first round key
+	srwi		$rounds,$rounds,1
+	add		$out,$inp,$cnt		# last round key
+	mtctr		$rounds
+
+Ldeckey:
+	lwz		r0, 0($inp)
+	lwz		r6, 4($inp)
+	lwz		r7, 8($inp)
+	lwz		r8, 12($inp)
+	addi		$inp,$inp,16
+	lwz		r9, 0($out)
+	lwz		r10,4($out)
+	lwz		r11,8($out)
+	lwz		r12,12($out)
+	stw		r0, 0($out)
+	stw		r6, 4($out)
+	stw		r7, 8($out)
+	stw		r8, 12($out)
+	subi		$out,$out,16
+	stw		r9, -16($inp)
+	stw		r10,-12($inp)
+	stw		r11,-8($inp)
+	stw		r12,-4($inp)
+	bdnz		Ldeckey
+
+	xor		r3,r3,r3		# return value
+Ldec_key_abort:
+	addi		$sp,$sp,$FRAME
+	blr
+	.long		0
+	.byte		0,12,4,1,0x80,0,3,0
+	.long		0
+.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
+___
+}}}
+#########################################################################
+{{{	# Single block en- and decrypt procedures			#
+sub gen_block () {
+my $dir = shift;
+my $n   = $dir eq "de" ? "n" : "";
+my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
+
+$code.=<<___;
+.globl	.${prefix}_${dir}crypt
+	lwz		$rounds,240($key)
+	lis		r0,0xfc00
+	mfspr		$vrsave,256
+	li		$idx,15			# 15 is not typo
+	mtspr		256,r0
+
+	lvx		v0,0,$inp
+	neg		r11,$out
+	lvx		v1,$idx,$inp
+	lvsl		v2,0,$inp		# inpperm
+	le?vspltisb	v4,0x0f
+	?lvsl		v3,0,r11		# outperm
+	le?vxor		v2,v2,v4
+	li		$idx,16
+	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
+	lvx		v1,0,$key
+	?lvsl		v5,0,$key		# keyperm
+	srwi		$rounds,$rounds,1
+	lvx		v2,$idx,$key
+	addi		$idx,$idx,16
+	subi		$rounds,$rounds,1
+	?vperm		v1,v1,v2,v5		# align round key
+
+	vxor		v0,v0,v1
+	lvx		v1,$idx,$key
+	addi		$idx,$idx,16
+	mtctr		$rounds
+
+Loop_${dir}c:
+	?vperm		v2,v2,v1,v5
+	v${n}cipher	v0,v0,v2
+	lvx		v2,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		v1,v1,v2,v5
+	v${n}cipher	v0,v0,v1
+	lvx		v1,$idx,$key
+	addi		$idx,$idx,16
+	bdnz		Loop_${dir}c
+
+	?vperm		v2,v2,v1,v5
+	v${n}cipher	v0,v0,v2
+	lvx		v2,$idx,$key
+	?vperm		v1,v1,v2,v5
+	v${n}cipherlast	v0,v0,v1
+
+	vspltisb	v2,-1
+	vxor		v1,v1,v1
+	li		$idx,15			# 15 is not typo
+	?vperm		v2,v1,v2,v3		# outmask
+	le?vxor		v3,v3,v4
+	lvx		v1,0,$out		# outhead
+	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
+	vsel		v1,v1,v0,v2
+	lvx		v4,$idx,$out
+	stvx		v1,0,$out
+	vsel		v0,v0,v4,v2
+	stvx		v0,$idx,$out
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,3,0
+	.long		0
+.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+#########################################################################
+{{{	# CBC en- and decrypt procedures				#
+my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
+						map("v$_",(4..10));
+$code.=<<___;
+.globl	.${prefix}_cbc_encrypt
+	${UCMP}i	$len,16
+	bltlr-
+
+	cmpwi		$enc,0			# test direction
+	lis		r0,0xffe0
+	mfspr		$vrsave,256
+	mtspr		256,r0
+
+	li		$idx,15
+	vxor		$rndkey0,$rndkey0,$rndkey0
+	le?vspltisb	$tmp,0x0f
+
+	lvx		$ivec,0,$ivp		# load [unaligned] iv
+	lvsl		$inpperm,0,$ivp
+	lvx		$inptail,$idx,$ivp
+	le?vxor		$inpperm,$inpperm,$tmp
+	vperm		$ivec,$ivec,$inptail,$inpperm
+
+	neg		r11,$inp
+	?lvsl		$keyperm,0,$key		# prepare for unaligned key
+	lwz		$rounds,240($key)
+
+	lvsr		$inpperm,0,r11		# prepare for unaligned load
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,15		# 15 is not typo
+	le?vxor		$inpperm,$inpperm,$tmp
+
+	?lvsr		$outperm,0,$out		# prepare for unaligned store
+	vspltisb	$outmask,-1
+	lvx		$outhead,0,$out
+	?vperm		$outmask,$rndkey0,$outmask,$outperm
+	le?vxor		$outperm,$outperm,$tmp
+
+	srwi		$rounds,$rounds,1
+	li		$idx,16
+	subi		$rounds,$rounds,1
+	beq		Lcbc_dec
+
+Lcbc_enc:
+	vmr		$inout,$inptail
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+	mtctr		$rounds
+	subi		$len,$len,16		# len-=16
+
+	lvx		$rndkey0,0,$key
+	 vperm		$inout,$inout,$inptail,$inpperm
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	vxor		$inout,$inout,$ivec
+
+Loop_cbc_enc:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	bdnz		Loop_cbc_enc
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipherlast	$ivec,$inout,$rndkey0
+	${UCMP}i	$len,16
+
+	vperm		$tmp,$ivec,$ivec,$outperm
+	vsel		$inout,$outhead,$tmp,$outmask
+	vmr		$outhead,$tmp
+	stvx		$inout,0,$out
+	addi		$out,$out,16
+	bge		Lcbc_enc
+
+	b		Lcbc_done
+
+.align	4
+Lcbc_dec:
+	${UCMP}i	$len,128
+	bge		_aesp8_cbc_decrypt8x
+	vmr		$tmp,$inptail
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+	mtctr		$rounds
+	subi		$len,$len,16		# len-=16
+
+	lvx		$rndkey0,0,$key
+	 vperm		$tmp,$tmp,$inptail,$inpperm
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$tmp,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+
+Loop_cbc_dec:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vncipher	$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	bdnz		Loop_cbc_dec
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vncipherlast	$inout,$inout,$rndkey0
+	${UCMP}i	$len,16
+
+	vxor		$inout,$inout,$ivec
+	vmr		$ivec,$tmp
+	vperm		$tmp,$inout,$inout,$outperm
+	vsel		$inout,$outhead,$tmp,$outmask
+	vmr		$outhead,$tmp
+	stvx		$inout,0,$out
+	addi		$out,$out,16
+	bge		Lcbc_dec
+
+Lcbc_done:
+	addi		$out,$out,-1
+	lvx		$inout,0,$out		# redundant in aligned case
+	vsel		$inout,$outhead,$inout,$outmask
+	stvx		$inout,0,$out
+
+	neg		$enc,$ivp		# write [unaligned] iv
+	li		$idx,15			# 15 is not typo
+	vxor		$rndkey0,$rndkey0,$rndkey0
+	vspltisb	$outmask,-1
+	le?vspltisb	$tmp,0x0f
+	?lvsl		$outperm,0,$enc
+	?vperm		$outmask,$rndkey0,$outmask,$outperm
+	le?vxor		$outperm,$outperm,$tmp
+	lvx		$outhead,0,$ivp
+	vperm		$ivec,$ivec,$ivec,$outperm
+	vsel		$inout,$outhead,$ivec,$outmask
+	lvx		$inptail,$idx,$ivp
+	stvx		$inout,0,$ivp
+	vsel		$inout,$ivec,$inptail,$outmask
+	stvx		$inout,$idx,$ivp
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,6,0
+	.long		0
+___
+#########################################################################
+{{	# Optimized CBC decrypt procedure				#
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
+my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
+			# v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
+
+$code.=<<___;
+.align	5
+_aesp8_cbc_decrypt8x:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	li		r10,`$FRAME+8*16+15`
+	li		r11,`$FRAME+8*16+31`
+	stvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	stvx		v21,r11,$sp
+	addi		r11,r11,32
+	stvx		v22,r10,$sp
+	addi		r10,r10,32
+	stvx		v23,r11,$sp
+	addi		r11,r11,32
+	stvx		v24,r10,$sp
+	addi		r10,r10,32
+	stvx		v25,r11,$sp
+	addi		r11,r11,32
+	stvx		v26,r10,$sp
+	addi		r10,r10,32
+	stvx		v27,r11,$sp
+	addi		r11,r11,32
+	stvx		v28,r10,$sp
+	addi		r10,r10,32
+	stvx		v29,r11,$sp
+	addi		r11,r11,32
+	stvx		v30,r10,$sp
+	stvx		v31,r11,$sp
+	li		r0,-1
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		$x10,0x10
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	li		$x20,0x20
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	li		$x30,0x30
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	li		$x40,0x40
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	li		$x50,0x50
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	li		$x60,0x60
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	li		$x70,0x70
+	mtspr		256,r0
+
+	subi		$rounds,$rounds,3	# -4 in total
+	subi		$len,$len,128		# bias
+
+	lvx		$rndkey0,$x00,$key	# load key schedule
+	lvx		v30,$x10,$key
+	addi		$key,$key,0x20
+	lvx		v31,$x00,$key
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
+	addi		$key_,$sp,$FRAME+15
+	mtctr		$rounds
+
+Load_cbc_dec_key:
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v30,$x10,$key
+	addi		$key,$key,0x20
+	stvx		v24,$x00,$key_		# off-load round[1]
+	?vperm		v25,v31,v30,$keyperm
+	lvx		v31,$x00,$key
+	stvx		v25,$x10,$key_		# off-load round[2]
+	addi		$key_,$key_,0x20
+	bdnz		Load_cbc_dec_key
+
+	lvx		v26,$x10,$key
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v27,$x20,$key
+	stvx		v24,$x00,$key_		# off-load round[3]
+	?vperm		v25,v31,v26,$keyperm
+	lvx		v28,$x30,$key
+	stvx		v25,$x10,$key_		# off-load round[4]
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	?vperm		v26,v26,v27,$keyperm
+	lvx		v29,$x40,$key
+	?vperm		v27,v27,v28,$keyperm
+	lvx		v30,$x50,$key
+	?vperm		v28,v28,v29,$keyperm
+	lvx		v31,$x60,$key
+	?vperm		v29,v29,v30,$keyperm
+	lvx		$out0,$x70,$key		# borrow $out0
+	?vperm		v30,v30,v31,$keyperm
+	lvx		v24,$x00,$key_		# pre-load round[1]
+	?vperm		v31,v31,$out0,$keyperm
+	lvx		v25,$x10,$key_		# pre-load round[2]
+
+	#lvx		$inptail,0,$inp		# "caller" already did this
+	#addi		$inp,$inp,15		# 15 is not typo
+	subi		$inp,$inp,15		# undo "caller"
+
+	 le?li		$idx,8
+	lvx_u		$in0,$x00,$inp		# load first 8 "words"
+	 le?lvsl	$inpperm,0,$idx
+	 le?vspltisb	$tmp,0x0f
+	lvx_u		$in1,$x10,$inp
+	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
+	lvx_u		$in2,$x20,$inp
+	 le?vperm	$in0,$in0,$in0,$inpperm
+	lvx_u		$in3,$x30,$inp
+	 le?vperm	$in1,$in1,$in1,$inpperm
+	lvx_u		$in4,$x40,$inp
+	 le?vperm	$in2,$in2,$in2,$inpperm
+	vxor		$out0,$in0,$rndkey0
+	lvx_u		$in5,$x50,$inp
+	 le?vperm	$in3,$in3,$in3,$inpperm
+	vxor		$out1,$in1,$rndkey0
+	lvx_u		$in6,$x60,$inp
+	 le?vperm	$in4,$in4,$in4,$inpperm
+	vxor		$out2,$in2,$rndkey0
+	lvx_u		$in7,$x70,$inp
+	addi		$inp,$inp,0x80
+	 le?vperm	$in5,$in5,$in5,$inpperm
+	vxor		$out3,$in3,$rndkey0
+	 le?vperm	$in6,$in6,$in6,$inpperm
+	vxor		$out4,$in4,$rndkey0
+	 le?vperm	$in7,$in7,$in7,$inpperm
+	vxor		$out5,$in5,$rndkey0
+	vxor		$out6,$in6,$rndkey0
+	vxor		$out7,$in7,$rndkey0
+
+	mtctr		$rounds
+	b		Loop_cbc_dec8x
+.align	5
+Loop_cbc_dec8x:
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+	vncipher	$out6,$out6,v24
+	vncipher	$out7,$out7,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+	vncipher	$out6,$out6,v25
+	vncipher	$out7,$out7,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_cbc_dec8x
+
+	subic		$len,$len,128		# $len-=128
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+	vncipher	$out6,$out6,v24
+	vncipher	$out7,$out7,v24
+
+	subfe.		r0,r0,r0		# borrow?-1:0
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+	vncipher	$out6,$out6,v25
+	vncipher	$out7,$out7,v25
+
+	and		r0,r0,$len
+	vncipher	$out0,$out0,v26
+	vncipher	$out1,$out1,v26
+	vncipher	$out2,$out2,v26
+	vncipher	$out3,$out3,v26
+	vncipher	$out4,$out4,v26
+	vncipher	$out5,$out5,v26
+	vncipher	$out6,$out6,v26
+	vncipher	$out7,$out7,v26
+
+	add		$inp,$inp,r0		# $inp is adjusted in such
+						# way that at exit from the
+						# loop inX-in7 are loaded
+						# with last "words"
+	vncipher	$out0,$out0,v27
+	vncipher	$out1,$out1,v27
+	vncipher	$out2,$out2,v27
+	vncipher	$out3,$out3,v27
+	vncipher	$out4,$out4,v27
+	vncipher	$out5,$out5,v27
+	vncipher	$out6,$out6,v27
+	vncipher	$out7,$out7,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vncipher	$out0,$out0,v28
+	vncipher	$out1,$out1,v28
+	vncipher	$out2,$out2,v28
+	vncipher	$out3,$out3,v28
+	vncipher	$out4,$out4,v28
+	vncipher	$out5,$out5,v28
+	vncipher	$out6,$out6,v28
+	vncipher	$out7,$out7,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+
+	vncipher	$out0,$out0,v29
+	vncipher	$out1,$out1,v29
+	vncipher	$out2,$out2,v29
+	vncipher	$out3,$out3,v29
+	vncipher	$out4,$out4,v29
+	vncipher	$out5,$out5,v29
+	vncipher	$out6,$out6,v29
+	vncipher	$out7,$out7,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+
+	vncipher	$out0,$out0,v30
+	 vxor		$ivec,$ivec,v31		# xor with last round key
+	vncipher	$out1,$out1,v30
+	 vxor		$in0,$in0,v31
+	vncipher	$out2,$out2,v30
+	 vxor		$in1,$in1,v31
+	vncipher	$out3,$out3,v30
+	 vxor		$in2,$in2,v31
+	vncipher	$out4,$out4,v30
+	 vxor		$in3,$in3,v31
+	vncipher	$out5,$out5,v30
+	 vxor		$in4,$in4,v31
+	vncipher	$out6,$out6,v30
+	 vxor		$in5,$in5,v31
+	vncipher	$out7,$out7,v30
+	 vxor		$in6,$in6,v31
+
+	vncipherlast	$out0,$out0,$ivec
+	vncipherlast	$out1,$out1,$in0
+	 lvx_u		$in0,$x00,$inp		# load next input block
+	vncipherlast	$out2,$out2,$in1
+	 lvx_u		$in1,$x10,$inp
+	vncipherlast	$out3,$out3,$in2
+	 le?vperm	$in0,$in0,$in0,$inpperm
+	 lvx_u		$in2,$x20,$inp
+	vncipherlast	$out4,$out4,$in3
+	 le?vperm	$in1,$in1,$in1,$inpperm
+	 lvx_u		$in3,$x30,$inp
+	vncipherlast	$out5,$out5,$in4
+	 le?vperm	$in2,$in2,$in2,$inpperm
+	 lvx_u		$in4,$x40,$inp
+	vncipherlast	$out6,$out6,$in5
+	 le?vperm	$in3,$in3,$in3,$inpperm
+	 lvx_u		$in5,$x50,$inp
+	vncipherlast	$out7,$out7,$in6
+	 le?vperm	$in4,$in4,$in4,$inpperm
+	 lvx_u		$in6,$x60,$inp
+	vmr		$ivec,$in7
+	 le?vperm	$in5,$in5,$in5,$inpperm
+	 lvx_u		$in7,$x70,$inp
+	 addi		$inp,$inp,0x80
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	 le?vperm	$in6,$in6,$in6,$inpperm
+	 vxor		$out0,$in0,$rndkey0
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	 le?vperm	$in7,$in7,$in7,$inpperm
+	 vxor		$out1,$in1,$rndkey0
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	 vxor		$out2,$in2,$rndkey0
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	 vxor		$out3,$in3,$rndkey0
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x40,$out
+	 vxor		$out4,$in4,$rndkey0
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x50,$out
+	 vxor		$out5,$in5,$rndkey0
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x60,$out
+	 vxor		$out6,$in6,$rndkey0
+	stvx_u		$out7,$x70,$out
+	addi		$out,$out,0x80
+	 vxor		$out7,$in7,$rndkey0
+
+	mtctr		$rounds
+	beq		Loop_cbc_dec8x		# did $len-=128 borrow?
+
+	addic.		$len,$len,128
+	beq		Lcbc_dec8x_done
+	nop
+	nop
+
+Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+	vncipher	$out6,$out6,v24
+	vncipher	$out7,$out7,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+	vncipher	$out6,$out6,v25
+	vncipher	$out7,$out7,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_cbc_dec8x_tail
+
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+	vncipher	$out6,$out6,v24
+	vncipher	$out7,$out7,v24
+
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+	vncipher	$out6,$out6,v25
+	vncipher	$out7,$out7,v25
+
+	vncipher	$out1,$out1,v26
+	vncipher	$out2,$out2,v26
+	vncipher	$out3,$out3,v26
+	vncipher	$out4,$out4,v26
+	vncipher	$out5,$out5,v26
+	vncipher	$out6,$out6,v26
+	vncipher	$out7,$out7,v26
+
+	vncipher	$out1,$out1,v27
+	vncipher	$out2,$out2,v27
+	vncipher	$out3,$out3,v27
+	vncipher	$out4,$out4,v27
+	vncipher	$out5,$out5,v27
+	vncipher	$out6,$out6,v27
+	vncipher	$out7,$out7,v27
+
+	vncipher	$out1,$out1,v28
+	vncipher	$out2,$out2,v28
+	vncipher	$out3,$out3,v28
+	vncipher	$out4,$out4,v28
+	vncipher	$out5,$out5,v28
+	vncipher	$out6,$out6,v28
+	vncipher	$out7,$out7,v28
+
+	vncipher	$out1,$out1,v29
+	vncipher	$out2,$out2,v29
+	vncipher	$out3,$out3,v29
+	vncipher	$out4,$out4,v29
+	vncipher	$out5,$out5,v29
+	vncipher	$out6,$out6,v29
+	vncipher	$out7,$out7,v29
+
+	vncipher	$out1,$out1,v30
+	 vxor		$ivec,$ivec,v31		# last round key
+	vncipher	$out2,$out2,v30
+	 vxor		$in1,$in1,v31
+	vncipher	$out3,$out3,v30
+	 vxor		$in2,$in2,v31
+	vncipher	$out4,$out4,v30
+	 vxor		$in3,$in3,v31
+	vncipher	$out5,$out5,v30
+	 vxor		$in4,$in4,v31
+	vncipher	$out6,$out6,v30
+	 vxor		$in5,$in5,v31
+	vncipher	$out7,$out7,v30
+	 vxor		$in6,$in6,v31
+
+	cmplwi		$len,32			# switch($len)
+	blt		Lcbc_dec8x_one
+	nop
+	beq		Lcbc_dec8x_two
+	cmplwi		$len,64
+	blt		Lcbc_dec8x_three
+	nop
+	beq		Lcbc_dec8x_four
+	cmplwi		$len,96
+	blt		Lcbc_dec8x_five
+	nop
+	beq		Lcbc_dec8x_six
+
+Lcbc_dec8x_seven:
+	vncipherlast	$out1,$out1,$ivec
+	vncipherlast	$out2,$out2,$in1
+	vncipherlast	$out3,$out3,$in2
+	vncipherlast	$out4,$out4,$in3
+	vncipherlast	$out5,$out5,$in4
+	vncipherlast	$out6,$out6,$in5
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out1,$out1,$out1,$inpperm
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x00,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x10,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x20,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x30,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x40,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x50,$out
+	stvx_u		$out7,$x60,$out
+	addi		$out,$out,0x70
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_six:
+	vncipherlast	$out2,$out2,$ivec
+	vncipherlast	$out3,$out3,$in2
+	vncipherlast	$out4,$out4,$in3
+	vncipherlast	$out5,$out5,$in4
+	vncipherlast	$out6,$out6,$in5
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out2,$out2,$out2,$inpperm
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x00,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x10,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x20,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x30,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x40,$out
+	stvx_u		$out7,$x50,$out
+	addi		$out,$out,0x60
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_five:
+	vncipherlast	$out3,$out3,$ivec
+	vncipherlast	$out4,$out4,$in3
+	vncipherlast	$out5,$out5,$in4
+	vncipherlast	$out6,$out6,$in5
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out3,$out3,$out3,$inpperm
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x00,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x10,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x20,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x30,$out
+	stvx_u		$out7,$x40,$out
+	addi		$out,$out,0x50
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_four:
+	vncipherlast	$out4,$out4,$ivec
+	vncipherlast	$out5,$out5,$in4
+	vncipherlast	$out6,$out6,$in5
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out4,$out4,$out4,$inpperm
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x00,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x10,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x20,$out
+	stvx_u		$out7,$x30,$out
+	addi		$out,$out,0x40
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_three:
+	vncipherlast	$out5,$out5,$ivec
+	vncipherlast	$out6,$out6,$in5
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out5,$out5,$out5,$inpperm
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x00,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x10,$out
+	stvx_u		$out7,$x20,$out
+	addi		$out,$out,0x30
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_two:
+	vncipherlast	$out6,$out6,$ivec
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out6,$out6,$out6,$inpperm
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x00,$out
+	stvx_u		$out7,$x10,$out
+	addi		$out,$out,0x20
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_one:
+	vncipherlast	$out7,$out7,$ivec
+	vmr		$ivec,$in7
+
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out7,0,$out
+	addi		$out,$out,0x10
+
+Lcbc_dec8x_done:
+	le?vperm	$ivec,$ivec,$ivec,$inpperm
+	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
+
+	li		r10,`$FRAME+15`
+	li		r11,`$FRAME+31`
+	stvx		$inpperm,r10,$sp	# wipe copies of round keys
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
+___
+}}	}}}
+
+#########################################################################
+{{{	# CTR procedure[s]						#
+
+####################### WARNING: Here be dragons! #######################
+#
+# This code is written as 'ctr32', based on a 32-bit counter used
+# upstream. The kernel does *not* use a 32-bit counter. The kernel uses
+# a 128-bit counter.
+#
+# This leads to subtle changes from the upstream code: the counter
+# is incremented with vaddu_q_m rather than vaddu_w_m. This occurs in
+# both the bulk (8 blocks at a time) path, and in the individual block
+# path. Be aware of this when doing updates.
+#
+# See:
+# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug")
+# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword")
+# https://github.com/openssl/openssl/pull/8942
+#
+#########################################################################
+my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
+						map("v$_",(4..11));
+my $dat=$tmp;
+
+$code.=<<___;
+.globl	.${prefix}_ctr32_encrypt_blocks
+	${UCMP}i	$len,1
+	bltlr-
+
+	lis		r0,0xfff0
+	mfspr		$vrsave,256
+	mtspr		256,r0
+
+	li		$idx,15
+	vxor		$rndkey0,$rndkey0,$rndkey0
+	le?vspltisb	$tmp,0x0f
+
+	lvx		$ivec,0,$ivp		# load [unaligned] iv
+	lvsl		$inpperm,0,$ivp
+	lvx		$inptail,$idx,$ivp
+	 vspltisb	$one,1
+	le?vxor		$inpperm,$inpperm,$tmp
+	vperm		$ivec,$ivec,$inptail,$inpperm
+	 vsldoi		$one,$rndkey0,$one,1
+
+	neg		r11,$inp
+	?lvsl		$keyperm,0,$key		# prepare for unaligned key
+	lwz		$rounds,240($key)
+
+	lvsr		$inpperm,0,r11		# prepare for unaligned load
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,15		# 15 is not typo
+	le?vxor		$inpperm,$inpperm,$tmp
+
+	srwi		$rounds,$rounds,1
+	li		$idx,16
+	subi		$rounds,$rounds,1
+
+	${UCMP}i	$len,8
+	bge		_aesp8_ctr32_encrypt8x
+
+	?lvsr		$outperm,0,$out		# prepare for unaligned store
+	vspltisb	$outmask,-1
+	lvx		$outhead,0,$out
+	?vperm		$outmask,$rndkey0,$outmask,$outperm
+	le?vxor		$outperm,$outperm,$tmp
+
+	lvx		$rndkey0,0,$key
+	mtctr		$rounds
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$ivec,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	b		Loop_ctr32_enc
+
+.align	5
+Loop_ctr32_enc:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	bdnz		Loop_ctr32_enc
+
+	vadduqm		$ivec,$ivec,$one	# Kernel change for 128-bit
+	 vmr		$dat,$inptail
+	 lvx		$inptail,0,$inp
+	 addi		$inp,$inp,16
+	 subic.		$len,$len,1		# blocks--
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	 vperm		$dat,$dat,$inptail,$inpperm
+	 li		$idx,16
+	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
+	 lvx		$rndkey0,0,$key
+	vxor		$dat,$dat,$rndkey1	# last round key
+	vcipherlast	$inout,$inout,$dat
+
+	 lvx		$rndkey1,$idx,$key
+	 addi		$idx,$idx,16
+	vperm		$inout,$inout,$inout,$outperm
+	vsel		$dat,$outhead,$inout,$outmask
+	 mtctr		$rounds
+	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vmr		$outhead,$inout
+	 vxor		$inout,$ivec,$rndkey0
+	 lvx		$rndkey0,$idx,$key
+	 addi		$idx,$idx,16
+	stvx		$dat,0,$out
+	addi		$out,$out,16
+	bne		Loop_ctr32_enc
+
+	addi		$out,$out,-1
+	lvx		$inout,0,$out		# redundant in aligned case
+	vsel		$inout,$outhead,$inout,$outmask
+	stvx		$inout,0,$out
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,6,0
+	.long		0
+___
+#########################################################################
+{{	# Optimized CTR procedure					#
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
+my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
+			# v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
+my ($two,$three,$four)=($outhead,$outperm,$outmask);
+
+$code.=<<___;
+.align	5
+_aesp8_ctr32_encrypt8x:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	li		r10,`$FRAME+8*16+15`
+	li		r11,`$FRAME+8*16+31`
+	stvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	stvx		v21,r11,$sp
+	addi		r11,r11,32
+	stvx		v22,r10,$sp
+	addi		r10,r10,32
+	stvx		v23,r11,$sp
+	addi		r11,r11,32
+	stvx		v24,r10,$sp
+	addi		r10,r10,32
+	stvx		v25,r11,$sp
+	addi		r11,r11,32
+	stvx		v26,r10,$sp
+	addi		r10,r10,32
+	stvx		v27,r11,$sp
+	addi		r11,r11,32
+	stvx		v28,r10,$sp
+	addi		r10,r10,32
+	stvx		v29,r11,$sp
+	addi		r11,r11,32
+	stvx		v30,r10,$sp
+	stvx		v31,r11,$sp
+	li		r0,-1
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		$x10,0x10
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	li		$x20,0x20
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	li		$x30,0x30
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	li		$x40,0x40
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	li		$x50,0x50
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	li		$x60,0x60
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	li		$x70,0x70
+	mtspr		256,r0
+
+	subi		$rounds,$rounds,3	# -4 in total
+
+	lvx		$rndkey0,$x00,$key	# load key schedule
+	lvx		v30,$x10,$key
+	addi		$key,$key,0x20
+	lvx		v31,$x00,$key
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
+	addi		$key_,$sp,$FRAME+15
+	mtctr		$rounds
+
+Load_ctr32_enc_key:
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v30,$x10,$key
+	addi		$key,$key,0x20
+	stvx		v24,$x00,$key_		# off-load round[1]
+	?vperm		v25,v31,v30,$keyperm
+	lvx		v31,$x00,$key
+	stvx		v25,$x10,$key_		# off-load round[2]
+	addi		$key_,$key_,0x20
+	bdnz		Load_ctr32_enc_key
+
+	lvx		v26,$x10,$key
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v27,$x20,$key
+	stvx		v24,$x00,$key_		# off-load round[3]
+	?vperm		v25,v31,v26,$keyperm
+	lvx		v28,$x30,$key
+	stvx		v25,$x10,$key_		# off-load round[4]
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	?vperm		v26,v26,v27,$keyperm
+	lvx		v29,$x40,$key
+	?vperm		v27,v27,v28,$keyperm
+	lvx		v30,$x50,$key
+	?vperm		v28,v28,v29,$keyperm
+	lvx		v31,$x60,$key
+	?vperm		v29,v29,v30,$keyperm
+	lvx		$out0,$x70,$key		# borrow $out0
+	?vperm		v30,v30,v31,$keyperm
+	lvx		v24,$x00,$key_		# pre-load round[1]
+	?vperm		v31,v31,$out0,$keyperm
+	lvx		v25,$x10,$key_		# pre-load round[2]
+
+	vadduqm		$two,$one,$one
+	subi		$inp,$inp,15		# undo "caller"
+	$SHL		$len,$len,4
+
+	vadduqm		$out1,$ivec,$one	# counter values ...
+	vadduqm		$out2,$ivec,$two	# (do all ctr adds as 128-bit)
+	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
+	 le?li		$idx,8
+	vadduqm		$out3,$out1,$two
+	vxor		$out1,$out1,$rndkey0
+	 le?lvsl	$inpperm,0,$idx
+	vadduqm		$out4,$out2,$two
+	vxor		$out2,$out2,$rndkey0
+	 le?vspltisb	$tmp,0x0f
+	vadduqm		$out5,$out3,$two
+	vxor		$out3,$out3,$rndkey0
+	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
+	vadduqm		$out6,$out4,$two
+	vxor		$out4,$out4,$rndkey0
+	vadduqm		$out7,$out5,$two
+	vxor		$out5,$out5,$rndkey0
+	vadduqm		$ivec,$out6,$two	# next counter value
+	vxor		$out6,$out6,$rndkey0
+	vxor		$out7,$out7,$rndkey0
+
+	mtctr		$rounds
+	b		Loop_ctr32_enc8x
+.align	5
+Loop_ctr32_enc8x:
+	vcipher 	$out0,$out0,v24
+	vcipher 	$out1,$out1,v24
+	vcipher 	$out2,$out2,v24
+	vcipher 	$out3,$out3,v24
+	vcipher 	$out4,$out4,v24
+	vcipher 	$out5,$out5,v24
+	vcipher 	$out6,$out6,v24
+	vcipher 	$out7,$out7,v24
+Loop_ctr32_enc8x_middle:
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vcipher 	$out0,$out0,v25
+	vcipher 	$out1,$out1,v25
+	vcipher 	$out2,$out2,v25
+	vcipher 	$out3,$out3,v25
+	vcipher 	$out4,$out4,v25
+	vcipher 	$out5,$out5,v25
+	vcipher 	$out6,$out6,v25
+	vcipher 	$out7,$out7,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_ctr32_enc8x
+
+	subic		r11,$len,256		# $len-256, borrow $key_
+	vcipher 	$out0,$out0,v24
+	vcipher 	$out1,$out1,v24
+	vcipher 	$out2,$out2,v24
+	vcipher 	$out3,$out3,v24
+	vcipher 	$out4,$out4,v24
+	vcipher 	$out5,$out5,v24
+	vcipher 	$out6,$out6,v24
+	vcipher 	$out7,$out7,v24
+
+	subfe		r0,r0,r0		# borrow?-1:0
+	vcipher 	$out0,$out0,v25
+	vcipher 	$out1,$out1,v25
+	vcipher 	$out2,$out2,v25
+	vcipher 	$out3,$out3,v25
+	vcipher 	$out4,$out4,v25
+	vcipher		$out5,$out5,v25
+	vcipher		$out6,$out6,v25
+	vcipher		$out7,$out7,v25
+
+	and		r0,r0,r11
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vcipher		$out0,$out0,v26
+	vcipher		$out1,$out1,v26
+	vcipher		$out2,$out2,v26
+	vcipher		$out3,$out3,v26
+	vcipher		$out4,$out4,v26
+	vcipher		$out5,$out5,v26
+	vcipher		$out6,$out6,v26
+	vcipher		$out7,$out7,v26
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+
+	subic		$len,$len,129		# $len-=129
+	vcipher		$out0,$out0,v27
+	addi		$len,$len,1		# $len-=128 really
+	vcipher		$out1,$out1,v27
+	vcipher		$out2,$out2,v27
+	vcipher		$out3,$out3,v27
+	vcipher		$out4,$out4,v27
+	vcipher		$out5,$out5,v27
+	vcipher		$out6,$out6,v27
+	vcipher		$out7,$out7,v27
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+
+	vcipher		$out0,$out0,v28
+	 lvx_u		$in0,$x00,$inp		# load input
+	vcipher		$out1,$out1,v28
+	 lvx_u		$in1,$x10,$inp
+	vcipher		$out2,$out2,v28
+	 lvx_u		$in2,$x20,$inp
+	vcipher		$out3,$out3,v28
+	 lvx_u		$in3,$x30,$inp
+	vcipher		$out4,$out4,v28
+	 lvx_u		$in4,$x40,$inp
+	vcipher		$out5,$out5,v28
+	 lvx_u		$in5,$x50,$inp
+	vcipher		$out6,$out6,v28
+	 lvx_u		$in6,$x60,$inp
+	vcipher		$out7,$out7,v28
+	 lvx_u		$in7,$x70,$inp
+	 addi		$inp,$inp,0x80
+
+	vcipher		$out0,$out0,v29
+	 le?vperm	$in0,$in0,$in0,$inpperm
+	vcipher		$out1,$out1,v29
+	 le?vperm	$in1,$in1,$in1,$inpperm
+	vcipher		$out2,$out2,v29
+	 le?vperm	$in2,$in2,$in2,$inpperm
+	vcipher		$out3,$out3,v29
+	 le?vperm	$in3,$in3,$in3,$inpperm
+	vcipher		$out4,$out4,v29
+	 le?vperm	$in4,$in4,$in4,$inpperm
+	vcipher		$out5,$out5,v29
+	 le?vperm	$in5,$in5,$in5,$inpperm
+	vcipher		$out6,$out6,v29
+	 le?vperm	$in6,$in6,$in6,$inpperm
+	vcipher		$out7,$out7,v29
+	 le?vperm	$in7,$in7,$in7,$inpperm
+
+	add		$inp,$inp,r0		# $inp is adjusted in such
+						# way that at exit from the
+						# loop inX-in7 are loaded
+						# with last "words"
+	subfe.		r0,r0,r0		# borrow?-1:0
+	vcipher		$out0,$out0,v30
+	 vxor		$in0,$in0,v31		# xor with last round key
+	vcipher		$out1,$out1,v30
+	 vxor		$in1,$in1,v31
+	vcipher		$out2,$out2,v30
+	 vxor		$in2,$in2,v31
+	vcipher		$out3,$out3,v30
+	 vxor		$in3,$in3,v31
+	vcipher		$out4,$out4,v30
+	 vxor		$in4,$in4,v31
+	vcipher		$out5,$out5,v30
+	 vxor		$in5,$in5,v31
+	vcipher		$out6,$out6,v30
+	 vxor		$in6,$in6,v31
+	vcipher		$out7,$out7,v30
+	 vxor		$in7,$in7,v31
+
+	bne		Lctr32_enc8x_break	# did $len-129 borrow?
+
+	vcipherlast	$in0,$out0,$in0
+	vcipherlast	$in1,$out1,$in1
+	 vadduqm	$out1,$ivec,$one	# counter values ...
+	vcipherlast	$in2,$out2,$in2
+	 vadduqm	$out2,$ivec,$two
+	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
+	vcipherlast	$in3,$out3,$in3
+	 vadduqm	$out3,$out1,$two
+	 vxor		$out1,$out1,$rndkey0
+	vcipherlast	$in4,$out4,$in4
+	 vadduqm	$out4,$out2,$two
+	 vxor		$out2,$out2,$rndkey0
+	vcipherlast	$in5,$out5,$in5
+	 vadduqm	$out5,$out3,$two
+	 vxor		$out3,$out3,$rndkey0
+	vcipherlast	$in6,$out6,$in6
+	 vadduqm	$out6,$out4,$two
+	 vxor		$out4,$out4,$rndkey0
+	vcipherlast	$in7,$out7,$in7
+	 vadduqm	$out7,$out5,$two
+	 vxor		$out5,$out5,$rndkey0
+	le?vperm	$in0,$in0,$in0,$inpperm
+	 vadduqm	$ivec,$out6,$two	# next counter value
+	 vxor		$out6,$out6,$rndkey0
+	le?vperm	$in1,$in1,$in1,$inpperm
+	 vxor		$out7,$out7,$rndkey0
+	mtctr		$rounds
+
+	 vcipher	$out0,$out0,v24
+	stvx_u		$in0,$x00,$out
+	le?vperm	$in2,$in2,$in2,$inpperm
+	 vcipher	$out1,$out1,v24
+	stvx_u		$in1,$x10,$out
+	le?vperm	$in3,$in3,$in3,$inpperm
+	 vcipher	$out2,$out2,v24
+	stvx_u		$in2,$x20,$out
+	le?vperm	$in4,$in4,$in4,$inpperm
+	 vcipher	$out3,$out3,v24
+	stvx_u		$in3,$x30,$out
+	le?vperm	$in5,$in5,$in5,$inpperm
+	 vcipher	$out4,$out4,v24
+	stvx_u		$in4,$x40,$out
+	le?vperm	$in6,$in6,$in6,$inpperm
+	 vcipher	$out5,$out5,v24
+	stvx_u		$in5,$x50,$out
+	le?vperm	$in7,$in7,$in7,$inpperm
+	 vcipher	$out6,$out6,v24
+	stvx_u		$in6,$x60,$out
+	 vcipher	$out7,$out7,v24
+	stvx_u		$in7,$x70,$out
+	addi		$out,$out,0x80
+
+	b		Loop_ctr32_enc8x_middle
+
+.align	5
+Lctr32_enc8x_break:
+	cmpwi		$len,-0x60
+	blt		Lctr32_enc8x_one
+	nop
+	beq		Lctr32_enc8x_two
+	cmpwi		$len,-0x40
+	blt		Lctr32_enc8x_three
+	nop
+	beq		Lctr32_enc8x_four
+	cmpwi		$len,-0x20
+	blt		Lctr32_enc8x_five
+	nop
+	beq		Lctr32_enc8x_six
+	cmpwi		$len,0x00
+	blt		Lctr32_enc8x_seven
+
+Lctr32_enc8x_eight:
+	vcipherlast	$out0,$out0,$in0
+	vcipherlast	$out1,$out1,$in1
+	vcipherlast	$out2,$out2,$in2
+	vcipherlast	$out3,$out3,$in3
+	vcipherlast	$out4,$out4,$in4
+	vcipherlast	$out5,$out5,$in5
+	vcipherlast	$out6,$out6,$in6
+	vcipherlast	$out7,$out7,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x40,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x50,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x60,$out
+	stvx_u		$out7,$x70,$out
+	addi		$out,$out,0x80
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_seven:
+	vcipherlast	$out0,$out0,$in1
+	vcipherlast	$out1,$out1,$in2
+	vcipherlast	$out2,$out2,$in3
+	vcipherlast	$out3,$out3,$in4
+	vcipherlast	$out4,$out4,$in5
+	vcipherlast	$out5,$out5,$in6
+	vcipherlast	$out6,$out6,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x40,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x50,$out
+	stvx_u		$out6,$x60,$out
+	addi		$out,$out,0x70
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_six:
+	vcipherlast	$out0,$out0,$in2
+	vcipherlast	$out1,$out1,$in3
+	vcipherlast	$out2,$out2,$in4
+	vcipherlast	$out3,$out3,$in5
+	vcipherlast	$out4,$out4,$in6
+	vcipherlast	$out5,$out5,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x40,$out
+	stvx_u		$out5,$x50,$out
+	addi		$out,$out,0x60
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_five:
+	vcipherlast	$out0,$out0,$in3
+	vcipherlast	$out1,$out1,$in4
+	vcipherlast	$out2,$out2,$in5
+	vcipherlast	$out3,$out3,$in6
+	vcipherlast	$out4,$out4,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	stvx_u		$out4,$x40,$out
+	addi		$out,$out,0x50
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_four:
+	vcipherlast	$out0,$out0,$in4
+	vcipherlast	$out1,$out1,$in5
+	vcipherlast	$out2,$out2,$in6
+	vcipherlast	$out3,$out3,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	stvx_u		$out3,$x30,$out
+	addi		$out,$out,0x40
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_three:
+	vcipherlast	$out0,$out0,$in5
+	vcipherlast	$out1,$out1,$in6
+	vcipherlast	$out2,$out2,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	stvx_u		$out2,$x20,$out
+	addi		$out,$out,0x30
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_two:
+	vcipherlast	$out0,$out0,$in6
+	vcipherlast	$out1,$out1,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	stvx_u		$out1,$x10,$out
+	addi		$out,$out,0x20
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_one:
+	vcipherlast	$out0,$out0,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	stvx_u		$out0,0,$out
+	addi		$out,$out,0x10
+
+Lctr32_enc8x_done:
+	li		r10,`$FRAME+15`
+	li		r11,`$FRAME+31`
+	stvx		$inpperm,r10,$sp	# wipe copies of round keys
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
+___
+}}	}}}
+
+#########################################################################
+{{{	# XTS procedures						#
+# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
+#                             const AES_KEY *key1, const AES_KEY *key2,	#
+#                             [const] unsigned char iv[16]);		#
+# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which	#
+# input tweak value is assumed to be encrypted already, and last tweak	#
+# value, one suitable for consecutive call on same chunk of data, is	#
+# written back to original buffer. In addition, in "tweak chaining"	#
+# mode only complete input blocks are processed.			#
+
+my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
+my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
+my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
+my $taillen = $key2;
+
+   ($inp,$idx) = ($idx,$inp);				# reassign
+
+$code.=<<___;
+.globl	.${prefix}_xts_encrypt
+	mr		$inp,r3				# reassign
+	li		r3,-1
+	${UCMP}i	$len,16
+	bltlr-
+
+	lis		r0,0xfff0
+	mfspr		r12,256				# save vrsave
+	li		r11,0
+	mtspr		256,r0
+
+	vspltisb	$seven,0x07			# 0x070707..07
+	le?lvsl		$leperm,r11,r11
+	le?vspltisb	$tmp,0x0f
+	le?vxor		$leperm,$leperm,$seven
+
+	li		$idx,15
+	lvx		$tweak,0,$ivp			# load [unaligned] iv
+	lvsl		$inpperm,0,$ivp
+	lvx		$inptail,$idx,$ivp
+	le?vxor		$inpperm,$inpperm,$tmp
+	vperm		$tweak,$tweak,$inptail,$inpperm
+
+	neg		r11,$inp
+	lvsr		$inpperm,0,r11			# prepare for unaligned load
+	lvx		$inout,0,$inp
+	addi		$inp,$inp,15			# 15 is not typo
+	le?vxor		$inpperm,$inpperm,$tmp
+
+	${UCMP}i	$key2,0				# key2==NULL?
+	beq		Lxts_enc_no_key2
+
+	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
+	lwz		$rounds,240($key2)
+	srwi		$rounds,$rounds,1
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	lvx		$rndkey0,0,$key2
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	mtctr		$rounds
+
+Ltweak_xts_enc:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	bdnz		Ltweak_xts_enc
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipherlast	$tweak,$tweak,$rndkey0
+
+	li		$ivp,0				# don't chain the tweak
+	b		Lxts_enc
+
+Lxts_enc_no_key2:
+	li		$idx,-16
+	and		$len,$len,$idx			# in "tweak chaining"
+							# mode only complete
+							# blocks are processed
+Lxts_enc:
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+
+	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
+	lwz		$rounds,240($key1)
+	srwi		$rounds,$rounds,1
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	vslb		$eighty7,$seven,$seven		# 0x808080..80
+	vor		$eighty7,$eighty7,$seven	# 0x878787..87
+	vspltisb	$tmp,1				# 0x010101..01
+	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
+
+	${UCMP}i	$len,96
+	bge		_aesp8_xts_encrypt6x
+
+	andi.		$taillen,$len,15
+	subic		r0,$len,32
+	subi		$taillen,$taillen,16
+	subfe		r0,r0,r0
+	and		r0,r0,$taillen
+	add		$inp,$inp,r0
+
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	mtctr		$rounds
+	b		Loop_xts_enc
+
+.align	5
+Loop_xts_enc:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	bdnz		Loop_xts_enc
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$rndkey0,$rndkey0,$tweak
+	vcipherlast	$output,$inout,$rndkey0
+
+	le?vperm	$tmp,$output,$output,$leperm
+	be?nop
+	le?stvx_u	$tmp,0,$out
+	be?stvx_u	$output,0,$out
+	addi		$out,$out,16
+
+	subic.		$len,$len,16
+	beq		Lxts_enc_done
+
+	vmr		$inout,$inptail
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+
+	subic		r0,$len,32
+	subfe		r0,r0,r0
+	and		r0,r0,$taillen
+	add		$inp,$inp,r0
+
+	vsrab		$tmp,$tweak,$seven		# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	vxor		$tweak,$tweak,$tmp
+
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$output,$output,$rndkey0	# just in case $len<16
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+
+	mtctr		$rounds
+	${UCMP}i	$len,16
+	bge		Loop_xts_enc
+
+	vxor		$output,$output,$tweak
+	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
+	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
+	vspltisb	$tmp,-1
+	vperm		$inptail,$inptail,$tmp,$inpperm
+	vsel		$inout,$inout,$output,$inptail
+
+	subi		r11,$out,17
+	subi		$out,$out,16
+	mtctr		$len
+	li		$len,16
+Loop_xts_enc_steal:
+	lbzu		r0,1(r11)
+	stb		r0,16(r11)
+	bdnz		Loop_xts_enc_steal
+
+	mtctr		$rounds
+	b		Loop_xts_enc			# one more time...
+
+Lxts_enc_done:
+	${UCMP}i	$ivp,0
+	beq		Lxts_enc_ret
+
+	vsrab		$tmp,$tweak,$seven		# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	vxor		$tweak,$tweak,$tmp
+
+	le?vperm	$tweak,$tweak,$tweak,$leperm
+	stvx_u		$tweak,0,$ivp
+
+Lxts_enc_ret:
+	mtspr		256,r12				# restore vrsave
+	li		r3,0
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
+
+.globl	.${prefix}_xts_decrypt
+	mr		$inp,r3				# reassign
+	li		r3,-1
+	${UCMP}i	$len,16
+	bltlr-
+
+	lis		r0,0xfff8
+	mfspr		r12,256				# save vrsave
+	li		r11,0
+	mtspr		256,r0
+
+	andi.		r0,$len,15
+	neg		r0,r0
+	andi.		r0,r0,16
+	sub		$len,$len,r0
+
+	vspltisb	$seven,0x07			# 0x070707..07
+	le?lvsl		$leperm,r11,r11
+	le?vspltisb	$tmp,0x0f
+	le?vxor		$leperm,$leperm,$seven
+
+	li		$idx,15
+	lvx		$tweak,0,$ivp			# load [unaligned] iv
+	lvsl		$inpperm,0,$ivp
+	lvx		$inptail,$idx,$ivp
+	le?vxor		$inpperm,$inpperm,$tmp
+	vperm		$tweak,$tweak,$inptail,$inpperm
+
+	neg		r11,$inp
+	lvsr		$inpperm,0,r11			# prepare for unaligned load
+	lvx		$inout,0,$inp
+	addi		$inp,$inp,15			# 15 is not typo
+	le?vxor		$inpperm,$inpperm,$tmp
+
+	${UCMP}i	$key2,0				# key2==NULL?
+	beq		Lxts_dec_no_key2
+
+	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
+	lwz		$rounds,240($key2)
+	srwi		$rounds,$rounds,1
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	lvx		$rndkey0,0,$key2
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	mtctr		$rounds
+
+Ltweak_xts_dec:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	bdnz		Ltweak_xts_dec
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipherlast	$tweak,$tweak,$rndkey0
+
+	li		$ivp,0				# don't chain the tweak
+	b		Lxts_dec
+
+Lxts_dec_no_key2:
+	neg		$idx,$len
+	andi.		$idx,$idx,15
+	add		$len,$len,$idx			# in "tweak chaining"
+							# mode only complete
+							# blocks are processed
+Lxts_dec:
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+
+	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
+	lwz		$rounds,240($key1)
+	srwi		$rounds,$rounds,1
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	vslb		$eighty7,$seven,$seven		# 0x808080..80
+	vor		$eighty7,$eighty7,$seven	# 0x878787..87
+	vspltisb	$tmp,1				# 0x010101..01
+	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
+
+	${UCMP}i	$len,96
+	bge		_aesp8_xts_decrypt6x
+
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	mtctr		$rounds
+
+	${UCMP}i	$len,16
+	blt		Ltail_xts_dec
+	be?b		Loop_xts_dec
+
+.align	5
+Loop_xts_dec:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vncipher	$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	bdnz		Loop_xts_dec
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$rndkey0,$rndkey0,$tweak
+	vncipherlast	$output,$inout,$rndkey0
+
+	le?vperm	$tmp,$output,$output,$leperm
+	be?nop
+	le?stvx_u	$tmp,0,$out
+	be?stvx_u	$output,0,$out
+	addi		$out,$out,16
+
+	subic.		$len,$len,16
+	beq		Lxts_dec_done
+
+	vmr		$inout,$inptail
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+
+	vsrab		$tmp,$tweak,$seven		# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	vxor		$tweak,$tweak,$tmp
+
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+
+	mtctr		$rounds
+	${UCMP}i	$len,16
+	bge		Loop_xts_dec
+
+Ltail_xts_dec:
+	vsrab		$tmp,$tweak,$seven		# next tweak value
+	vaddubm		$tweak1,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	vxor		$tweak1,$tweak1,$tmp
+
+	subi		$inp,$inp,16
+	add		$inp,$inp,$len
+
+	vxor		$inout,$inout,$tweak		# :-(
+	vxor		$inout,$inout,$tweak1		# :-)
+
+Loop_xts_dec_short:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vncipher	$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	bdnz		Loop_xts_dec_short
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$rndkey0,$rndkey0,$tweak1
+	vncipherlast	$output,$inout,$rndkey0
+
+	le?vperm	$tmp,$output,$output,$leperm
+	be?nop
+	le?stvx_u	$tmp,0,$out
+	be?stvx_u	$output,0,$out
+
+	vmr		$inout,$inptail
+	lvx		$inptail,0,$inp
+	#addi		$inp,$inp,16
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+
+	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
+	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
+	vspltisb	$tmp,-1
+	vperm		$inptail,$inptail,$tmp,$inpperm
+	vsel		$inout,$inout,$output,$inptail
+
+	vxor		$rndkey0,$rndkey0,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+
+	subi		r11,$out,1
+	mtctr		$len
+	li		$len,16
+Loop_xts_dec_steal:
+	lbzu		r0,1(r11)
+	stb		r0,16(r11)
+	bdnz		Loop_xts_dec_steal
+
+	mtctr		$rounds
+	b		Loop_xts_dec			# one more time...
+
+Lxts_dec_done:
+	${UCMP}i	$ivp,0
+	beq		Lxts_dec_ret
+
+	vsrab		$tmp,$tweak,$seven		# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	vxor		$tweak,$tweak,$tmp
+
+	le?vperm	$tweak,$tweak,$tweak,$leperm
+	stvx_u		$tweak,0,$ivp
+
+Lxts_dec_ret:
+	mtspr		256,r12				# restore vrsave
+	li		r3,0
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
+___
+#########################################################################
+{{	# Optimized XTS procedures					#
+my $key_=$key2;
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
+    $x00=0 if ($flavour =~ /osx/);
+my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
+my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
+my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
+my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
+			# v26-v31 last 6 round keys
+my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
+my $taillen=$x70;
+
+$code.=<<___;
+.align	5
+_aesp8_xts_encrypt6x:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	mflr		r11
+	li		r7,`$FRAME+8*16+15`
+	li		r3,`$FRAME+8*16+31`
+	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+	stvx		v20,r7,$sp		# ABI says so
+	addi		r7,r7,32
+	stvx		v21,r3,$sp
+	addi		r3,r3,32
+	stvx		v22,r7,$sp
+	addi		r7,r7,32
+	stvx		v23,r3,$sp
+	addi		r3,r3,32
+	stvx		v24,r7,$sp
+	addi		r7,r7,32
+	stvx		v25,r3,$sp
+	addi		r3,r3,32
+	stvx		v26,r7,$sp
+	addi		r7,r7,32
+	stvx		v27,r3,$sp
+	addi		r3,r3,32
+	stvx		v28,r7,$sp
+	addi		r7,r7,32
+	stvx		v29,r3,$sp
+	addi		r3,r3,32
+	stvx		v30,r7,$sp
+	stvx		v31,r3,$sp
+	li		r0,-1
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		$x10,0x10
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	li		$x20,0x20
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	li		$x30,0x30
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	li		$x40,0x40
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	li		$x50,0x50
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	li		$x60,0x60
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	li		$x70,0x70
+	mtspr		256,r0
+
+	xxlor		2, 32+$eighty7, 32+$eighty7
+	vsldoi		$eighty7,$tmp,$eighty7,1        # 0x010101..87
+	xxlor		1, 32+$eighty7, 32+$eighty7
+
+	# Load XOR Lconsts.
+	mr		$x70, r6
+	bl		Lconsts
+	lxvw4x		0, $x40, r6		# load XOR contents
+	mr		r6, $x70
+	li		$x70,0x70
+
+	subi		$rounds,$rounds,3	# -4 in total
+
+	lvx		$rndkey0,$x00,$key1	# load key schedule
+	lvx		v30,$x10,$key1
+	addi		$key1,$key1,0x20
+	lvx		v31,$x00,$key1
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
+	addi		$key_,$sp,$FRAME+15
+	mtctr		$rounds
+
+Load_xts_enc_key:
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v30,$x10,$key1
+	addi		$key1,$key1,0x20
+	stvx		v24,$x00,$key_		# off-load round[1]
+	?vperm		v25,v31,v30,$keyperm
+	lvx		v31,$x00,$key1
+	stvx		v25,$x10,$key_		# off-load round[2]
+	addi		$key_,$key_,0x20
+	bdnz		Load_xts_enc_key
+
+	lvx		v26,$x10,$key1
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v27,$x20,$key1
+	stvx		v24,$x00,$key_		# off-load round[3]
+	?vperm		v25,v31,v26,$keyperm
+	lvx		v28,$x30,$key1
+	stvx		v25,$x10,$key_		# off-load round[4]
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	?vperm		v26,v26,v27,$keyperm
+	lvx		v29,$x40,$key1
+	?vperm		v27,v27,v28,$keyperm
+	lvx		v30,$x50,$key1
+	?vperm		v28,v28,v29,$keyperm
+	lvx		v31,$x60,$key1
+	?vperm		v29,v29,v30,$keyperm
+	lvx		$twk5,$x70,$key1	# borrow $twk5
+	?vperm		v30,v30,v31,$keyperm
+	lvx		v24,$x00,$key_		# pre-load round[1]
+	?vperm		v31,v31,$twk5,$keyperm
+	lvx		v25,$x10,$key_		# pre-load round[2]
+
+	# Switch to use the following codes with 0x010101..87 to generate tweak.
+	#     eighty7 = 0x010101..87
+	# vsrab         tmp, tweak, seven       # next tweak value, right shift 7 bits
+	# vand          tmp, tmp, eighty7       # last byte with carry
+	# vaddubm       tweak, tweak, tweak     # left shift 1 bit (x2)
+	# xxlor         vsx, 0, 0
+	# vpermxor      tweak, tweak, tmp, vsx
+
+	 vperm		$in0,$inout,$inptail,$inpperm
+	 subi		$inp,$inp,31		# undo "caller"
+	vxor		$twk0,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out0,$in0,$twk0
+	xxlor		32+$in1, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in1
+
+	 lvx_u		$in1,$x10,$inp
+	vxor		$twk1,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	 le?vperm	$in1,$in1,$in1,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out1,$in1,$twk1
+	xxlor		32+$in2, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in2
+
+	 lvx_u		$in2,$x20,$inp
+	 andi.		$taillen,$len,15
+	vxor		$twk2,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	 le?vperm	$in2,$in2,$in2,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out2,$in2,$twk2
+	xxlor		32+$in3, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in3
+
+	 lvx_u		$in3,$x30,$inp
+	 sub		$len,$len,$taillen
+	vxor		$twk3,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	 le?vperm	$in3,$in3,$in3,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out3,$in3,$twk3
+	xxlor		32+$in4, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in4
+
+	 lvx_u		$in4,$x40,$inp
+	 subi		$len,$len,0x60
+	vxor		$twk4,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	 le?vperm	$in4,$in4,$in4,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out4,$in4,$twk4
+	xxlor		32+$in5, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in5
+
+	 lvx_u		$in5,$x50,$inp
+	 addi		$inp,$inp,0x60
+	vxor		$twk5,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	 le?vperm	$in5,$in5,$in5,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out5,$in5,$twk5
+	xxlor		32+$in0, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in0
+
+	vxor		v31,v31,$rndkey0
+	mtctr		$rounds
+	b		Loop_xts_enc6x
+
+.align	5
+Loop_xts_enc6x:
+	vcipher		$out0,$out0,v24
+	vcipher		$out1,$out1,v24
+	vcipher		$out2,$out2,v24
+	vcipher		$out3,$out3,v24
+	vcipher		$out4,$out4,v24
+	vcipher		$out5,$out5,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vcipher		$out0,$out0,v25
+	vcipher		$out1,$out1,v25
+	vcipher		$out2,$out2,v25
+	vcipher		$out3,$out3,v25
+	vcipher		$out4,$out4,v25
+	vcipher		$out5,$out5,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_xts_enc6x
+
+	xxlor		32+$eighty7, 1, 1	# 0x010101..87
+
+	subic		$len,$len,96		# $len-=96
+	 vxor		$in0,$twk0,v31		# xor with last round key
+	vcipher		$out0,$out0,v24
+	vcipher		$out1,$out1,v24
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk0,$tweak,$rndkey0
+	 vaddubm	$tweak,$tweak,$tweak
+	vcipher		$out2,$out2,v24
+	vcipher		$out3,$out3,v24
+	vcipher		$out4,$out4,v24
+	vcipher		$out5,$out5,v24
+
+	subfe.		r0,r0,r0		# borrow?-1:0
+	 vand		$tmp,$tmp,$eighty7
+	vcipher		$out0,$out0,v25
+	vcipher		$out1,$out1,v25
+	 xxlor		32+$in1, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in1
+	vcipher		$out2,$out2,v25
+	vcipher		$out3,$out3,v25
+	 vxor		$in1,$twk1,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk1,$tweak,$rndkey0
+	vcipher		$out4,$out4,v25
+	vcipher		$out5,$out5,v25
+
+	and		r0,r0,$len
+	 vaddubm	$tweak,$tweak,$tweak
+	vcipher		$out0,$out0,v26
+	vcipher		$out1,$out1,v26
+	 vand		$tmp,$tmp,$eighty7
+	vcipher		$out2,$out2,v26
+	vcipher		$out3,$out3,v26
+	 xxlor		32+$in2, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in2
+	vcipher		$out4,$out4,v26
+	vcipher		$out5,$out5,v26
+
+	add		$inp,$inp,r0		# $inp is adjusted in such
+						# way that at exit from the
+						# loop inX-in5 are loaded
+						# with last "words"
+	 vxor		$in2,$twk2,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk2,$tweak,$rndkey0
+	 vaddubm	$tweak,$tweak,$tweak
+	vcipher		$out0,$out0,v27
+	vcipher		$out1,$out1,v27
+	vcipher		$out2,$out2,v27
+	vcipher		$out3,$out3,v27
+	 vand		$tmp,$tmp,$eighty7
+	vcipher		$out4,$out4,v27
+	vcipher		$out5,$out5,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	 xxlor		32+$in3, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in3
+	vcipher		$out0,$out0,v28
+	vcipher		$out1,$out1,v28
+	 vxor		$in3,$twk3,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk3,$tweak,$rndkey0
+	vcipher		$out2,$out2,v28
+	vcipher		$out3,$out3,v28
+	 vaddubm	$tweak,$tweak,$tweak
+	vcipher		$out4,$out4,v28
+	vcipher		$out5,$out5,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+	 vand		$tmp,$tmp,$eighty7
+
+	vcipher		$out0,$out0,v29
+	vcipher		$out1,$out1,v29
+	 xxlor		32+$in4, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in4
+	vcipher		$out2,$out2,v29
+	vcipher		$out3,$out3,v29
+	 vxor		$in4,$twk4,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk4,$tweak,$rndkey0
+	vcipher		$out4,$out4,v29
+	vcipher		$out5,$out5,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vaddubm	$tweak,$tweak,$tweak
+
+	vcipher		$out0,$out0,v30
+	vcipher		$out1,$out1,v30
+	 vand		$tmp,$tmp,$eighty7
+	vcipher		$out2,$out2,v30
+	vcipher		$out3,$out3,v30
+	 xxlor		32+$in5, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in5
+	vcipher		$out4,$out4,v30
+	vcipher		$out5,$out5,v30
+	 vxor		$in5,$twk5,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk5,$tweak,$rndkey0
+
+	vcipherlast	$out0,$out0,$in0
+	 lvx_u		$in0,$x00,$inp		# load next input block
+	 vaddubm	$tweak,$tweak,$tweak
+	vcipherlast	$out1,$out1,$in1
+	 lvx_u		$in1,$x10,$inp
+	vcipherlast	$out2,$out2,$in2
+	 le?vperm	$in0,$in0,$in0,$leperm
+	 lvx_u		$in2,$x20,$inp
+	 vand		$tmp,$tmp,$eighty7
+	vcipherlast	$out3,$out3,$in3
+	 le?vperm	$in1,$in1,$in1,$leperm
+	 lvx_u		$in3,$x30,$inp
+	vcipherlast	$out4,$out4,$in4
+	 le?vperm	$in2,$in2,$in2,$leperm
+	 lvx_u		$in4,$x40,$inp
+	 xxlor		10, 32+$in0, 32+$in0
+	 xxlor		32+$in0, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in0
+	 xxlor		32+$in0, 10, 10
+	vcipherlast	$tmp,$out5,$in5		# last block might be needed
+						# in stealing mode
+	 le?vperm	$in3,$in3,$in3,$leperm
+	 lvx_u		$in5,$x50,$inp
+	 addi		$inp,$inp,0x60
+	 le?vperm	$in4,$in4,$in4,$leperm
+	 le?vperm	$in5,$in5,$in5,$leperm
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	 vxor		$out0,$in0,$twk0
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	 vxor		$out1,$in1,$twk1
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	 vxor		$out2,$in2,$twk2
+	le?vperm	$out4,$out4,$out4,$leperm
+	stvx_u		$out3,$x30,$out
+	 vxor		$out3,$in3,$twk3
+	le?vperm	$out5,$tmp,$tmp,$leperm
+	stvx_u		$out4,$x40,$out
+	 vxor		$out4,$in4,$twk4
+	le?stvx_u	$out5,$x50,$out
+	be?stvx_u	$tmp, $x50,$out
+	 vxor		$out5,$in5,$twk5
+	addi		$out,$out,0x60
+
+	mtctr		$rounds
+	beq		Loop_xts_enc6x		# did $len-=96 borrow?
+
+	xxlor		32+$eighty7, 2, 2	# 0x010101..87
+
+	addic.		$len,$len,0x60
+	beq		Lxts_enc6x_zero
+	cmpwi		$len,0x20
+	blt		Lxts_enc6x_one
+	nop
+	beq		Lxts_enc6x_two
+	cmpwi		$len,0x40
+	blt		Lxts_enc6x_three
+	nop
+	beq		Lxts_enc6x_four
+
+Lxts_enc6x_five:
+	vxor		$out0,$in1,$twk0
+	vxor		$out1,$in2,$twk1
+	vxor		$out2,$in3,$twk2
+	vxor		$out3,$in4,$twk3
+	vxor		$out4,$in5,$twk4
+
+	bl		_aesp8_xts_enc5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk5		# unused tweak
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	vxor		$tmp,$out4,$twk5	# last block prep for stealing
+	le?vperm	$out4,$out4,$out4,$leperm
+	stvx_u		$out3,$x30,$out
+	stvx_u		$out4,$x40,$out
+	addi		$out,$out,0x50
+	bne		Lxts_enc6x_steal
+	b		Lxts_enc6x_done
+
+.align	4
+Lxts_enc6x_four:
+	vxor		$out0,$in2,$twk0
+	vxor		$out1,$in3,$twk1
+	vxor		$out2,$in4,$twk2
+	vxor		$out3,$in5,$twk3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_enc5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk4		# unused tweak
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	vxor		$tmp,$out3,$twk4	# last block prep for stealing
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	stvx_u		$out3,$x30,$out
+	addi		$out,$out,0x40
+	bne		Lxts_enc6x_steal
+	b		Lxts_enc6x_done
+
+.align	4
+Lxts_enc6x_three:
+	vxor		$out0,$in3,$twk0
+	vxor		$out1,$in4,$twk1
+	vxor		$out2,$in5,$twk2
+	vxor		$out3,$out3,$out3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_enc5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk3		# unused tweak
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	vxor		$tmp,$out2,$twk3	# last block prep for stealing
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	stvx_u		$out2,$x20,$out
+	addi		$out,$out,0x30
+	bne		Lxts_enc6x_steal
+	b		Lxts_enc6x_done
+
+.align	4
+Lxts_enc6x_two:
+	vxor		$out0,$in4,$twk0
+	vxor		$out1,$in5,$twk1
+	vxor		$out2,$out2,$out2
+	vxor		$out3,$out3,$out3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_enc5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk2		# unused tweak
+	vxor		$tmp,$out1,$twk2	# last block prep for stealing
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	stvx_u		$out1,$x10,$out
+	addi		$out,$out,0x20
+	bne		Lxts_enc6x_steal
+	b		Lxts_enc6x_done
+
+.align	4
+Lxts_enc6x_one:
+	vxor		$out0,$in5,$twk0
+	nop
+Loop_xts_enc1x:
+	vcipher		$out0,$out0,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vcipher		$out0,$out0,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_xts_enc1x
+
+	add		$inp,$inp,$taillen
+	cmpwi		$taillen,0
+	vcipher		$out0,$out0,v24
+
+	subi		$inp,$inp,16
+	vcipher		$out0,$out0,v25
+
+	lvsr		$inpperm,0,$taillen
+	vcipher		$out0,$out0,v26
+
+	lvx_u		$in0,0,$inp
+	vcipher		$out0,$out0,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vcipher		$out0,$out0,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+
+	vcipher		$out0,$out0,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vxor		$twk0,$twk0,v31
+
+	le?vperm	$in0,$in0,$in0,$leperm
+	vcipher		$out0,$out0,v30
+
+	vperm		$in0,$in0,$in0,$inpperm
+	vcipherlast	$out0,$out0,$twk0
+
+	vmr		$twk0,$twk1		# unused tweak
+	vxor		$tmp,$out0,$twk1	# last block prep for stealing
+	le?vperm	$out0,$out0,$out0,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	addi		$out,$out,0x10
+	bne		Lxts_enc6x_steal
+	b		Lxts_enc6x_done
+
+.align	4
+Lxts_enc6x_zero:
+	cmpwi		$taillen,0
+	beq		Lxts_enc6x_done
+
+	add		$inp,$inp,$taillen
+	subi		$inp,$inp,16
+	lvx_u		$in0,0,$inp
+	lvsr		$inpperm,0,$taillen	# $in5 is no more
+	le?vperm	$in0,$in0,$in0,$leperm
+	vperm		$in0,$in0,$in0,$inpperm
+	vxor		$tmp,$tmp,$twk0
+Lxts_enc6x_steal:
+	vxor		$in0,$in0,$twk0
+	vxor		$out0,$out0,$out0
+	vspltisb	$out1,-1
+	vperm		$out0,$out0,$out1,$inpperm
+	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
+
+	subi		r30,$out,17
+	subi		$out,$out,16
+	mtctr		$taillen
+Loop_xts_enc6x_steal:
+	lbzu		r0,1(r30)
+	stb		r0,16(r30)
+	bdnz		Loop_xts_enc6x_steal
+
+	li		$taillen,0
+	mtctr		$rounds
+	b		Loop_xts_enc1x		# one more time...
+
+.align	4
+Lxts_enc6x_done:
+	${UCMP}i	$ivp,0
+	beq		Lxts_enc6x_ret
+
+	vxor		$tweak,$twk0,$rndkey0
+	le?vperm	$tweak,$tweak,$tweak,$leperm
+	stvx_u		$tweak,0,$ivp
+
+Lxts_enc6x_ret:
+	mtlr		r11
+	li		r10,`$FRAME+15`
+	li		r11,`$FRAME+31`
+	stvx		$seven,r10,$sp		# wipe copies of round keys
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,0x04,1,0x80,6,6,0
+	.long		0
+
+.align	5
+_aesp8_xts_enc5x:
+	vcipher		$out0,$out0,v24
+	vcipher		$out1,$out1,v24
+	vcipher		$out2,$out2,v24
+	vcipher		$out3,$out3,v24
+	vcipher		$out4,$out4,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vcipher		$out0,$out0,v25
+	vcipher		$out1,$out1,v25
+	vcipher		$out2,$out2,v25
+	vcipher		$out3,$out3,v25
+	vcipher		$out4,$out4,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		_aesp8_xts_enc5x
+
+	add		$inp,$inp,$taillen
+	cmpwi		$taillen,0
+	vcipher		$out0,$out0,v24
+	vcipher		$out1,$out1,v24
+	vcipher		$out2,$out2,v24
+	vcipher		$out3,$out3,v24
+	vcipher		$out4,$out4,v24
+
+	subi		$inp,$inp,16
+	vcipher		$out0,$out0,v25
+	vcipher		$out1,$out1,v25
+	vcipher		$out2,$out2,v25
+	vcipher		$out3,$out3,v25
+	vcipher		$out4,$out4,v25
+	 vxor		$twk0,$twk0,v31
+
+	vcipher		$out0,$out0,v26
+	lvsr		$inpperm,r0,$taillen	# $in5 is no more
+	vcipher		$out1,$out1,v26
+	vcipher		$out2,$out2,v26
+	vcipher		$out3,$out3,v26
+	vcipher		$out4,$out4,v26
+	 vxor		$in1,$twk1,v31
+
+	vcipher		$out0,$out0,v27
+	lvx_u		$in0,0,$inp
+	vcipher		$out1,$out1,v27
+	vcipher		$out2,$out2,v27
+	vcipher		$out3,$out3,v27
+	vcipher		$out4,$out4,v27
+	 vxor		$in2,$twk2,v31
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vcipher		$out0,$out0,v28
+	vcipher		$out1,$out1,v28
+	vcipher		$out2,$out2,v28
+	vcipher		$out3,$out3,v28
+	vcipher		$out4,$out4,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+	 vxor		$in3,$twk3,v31
+
+	vcipher		$out0,$out0,v29
+	le?vperm	$in0,$in0,$in0,$leperm
+	vcipher		$out1,$out1,v29
+	vcipher		$out2,$out2,v29
+	vcipher		$out3,$out3,v29
+	vcipher		$out4,$out4,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vxor		$in4,$twk4,v31
+
+	vcipher		$out0,$out0,v30
+	vperm		$in0,$in0,$in0,$inpperm
+	vcipher		$out1,$out1,v30
+	vcipher		$out2,$out2,v30
+	vcipher		$out3,$out3,v30
+	vcipher		$out4,$out4,v30
+
+	vcipherlast	$out0,$out0,$twk0
+	vcipherlast	$out1,$out1,$in1
+	vcipherlast	$out2,$out2,$in2
+	vcipherlast	$out3,$out3,$in3
+	vcipherlast	$out4,$out4,$in4
+	blr
+        .long   	0
+        .byte   	0,12,0x14,0,0,0,0,0
+
+.align	5
+_aesp8_xts_decrypt6x:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	mflr		r11
+	li		r7,`$FRAME+8*16+15`
+	li		r3,`$FRAME+8*16+31`
+	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+	stvx		v20,r7,$sp		# ABI says so
+	addi		r7,r7,32
+	stvx		v21,r3,$sp
+	addi		r3,r3,32
+	stvx		v22,r7,$sp
+	addi		r7,r7,32
+	stvx		v23,r3,$sp
+	addi		r3,r3,32
+	stvx		v24,r7,$sp
+	addi		r7,r7,32
+	stvx		v25,r3,$sp
+	addi		r3,r3,32
+	stvx		v26,r7,$sp
+	addi		r7,r7,32
+	stvx		v27,r3,$sp
+	addi		r3,r3,32
+	stvx		v28,r7,$sp
+	addi		r7,r7,32
+	stvx		v29,r3,$sp
+	addi		r3,r3,32
+	stvx		v30,r7,$sp
+	stvx		v31,r3,$sp
+	li		r0,-1
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		$x10,0x10
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	li		$x20,0x20
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	li		$x30,0x30
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	li		$x40,0x40
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	li		$x50,0x50
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	li		$x60,0x60
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	li		$x70,0x70
+	mtspr		256,r0
+
+	xxlor		2, 32+$eighty7, 32+$eighty7
+	vsldoi		$eighty7,$tmp,$eighty7,1        # 0x010101..87
+	xxlor		1, 32+$eighty7, 32+$eighty7
+
+	# Load XOR Lconsts.
+	mr		$x70, r6
+	bl		Lconsts
+	lxvw4x		0, $x40, r6		# load XOR contents
+	mr		r6, $x70
+	li		$x70,0x70
+
+	subi		$rounds,$rounds,3	# -4 in total
+
+	lvx		$rndkey0,$x00,$key1	# load key schedule
+	lvx		v30,$x10,$key1
+	addi		$key1,$key1,0x20
+	lvx		v31,$x00,$key1
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
+	addi		$key_,$sp,$FRAME+15
+	mtctr		$rounds
+
+Load_xts_dec_key:
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v30,$x10,$key1
+	addi		$key1,$key1,0x20
+	stvx		v24,$x00,$key_		# off-load round[1]
+	?vperm		v25,v31,v30,$keyperm
+	lvx		v31,$x00,$key1
+	stvx		v25,$x10,$key_		# off-load round[2]
+	addi		$key_,$key_,0x20
+	bdnz		Load_xts_dec_key
+
+	lvx		v26,$x10,$key1
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v27,$x20,$key1
+	stvx		v24,$x00,$key_		# off-load round[3]
+	?vperm		v25,v31,v26,$keyperm
+	lvx		v28,$x30,$key1
+	stvx		v25,$x10,$key_		# off-load round[4]
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	?vperm		v26,v26,v27,$keyperm
+	lvx		v29,$x40,$key1
+	?vperm		v27,v27,v28,$keyperm
+	lvx		v30,$x50,$key1
+	?vperm		v28,v28,v29,$keyperm
+	lvx		v31,$x60,$key1
+	?vperm		v29,v29,v30,$keyperm
+	lvx		$twk5,$x70,$key1	# borrow $twk5
+	?vperm		v30,v30,v31,$keyperm
+	lvx		v24,$x00,$key_		# pre-load round[1]
+	?vperm		v31,v31,$twk5,$keyperm
+	lvx		v25,$x10,$key_		# pre-load round[2]
+
+	 vperm		$in0,$inout,$inptail,$inpperm
+	 subi		$inp,$inp,31		# undo "caller"
+	vxor		$twk0,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out0,$in0,$twk0
+	xxlor		32+$in1, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in1
+
+	 lvx_u		$in1,$x10,$inp
+	vxor		$twk1,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	 le?vperm	$in1,$in1,$in1,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out1,$in1,$twk1
+	xxlor		32+$in2, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in2
+
+	 lvx_u		$in2,$x20,$inp
+	 andi.		$taillen,$len,15
+	vxor		$twk2,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	 le?vperm	$in2,$in2,$in2,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out2,$in2,$twk2
+	xxlor		32+$in3, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in3
+
+	 lvx_u		$in3,$x30,$inp
+	 sub		$len,$len,$taillen
+	vxor		$twk3,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	 le?vperm	$in3,$in3,$in3,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out3,$in3,$twk3
+	xxlor		32+$in4, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in4
+
+	 lvx_u		$in4,$x40,$inp
+	 subi		$len,$len,0x60
+	vxor		$twk4,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	 le?vperm	$in4,$in4,$in4,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out4,$in4,$twk4
+	xxlor		32+$in5, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in5
+
+	 lvx_u		$in5,$x50,$inp
+	 addi		$inp,$inp,0x60
+	vxor		$twk5,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	 le?vperm	$in5,$in5,$in5,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out5,$in5,$twk5
+	xxlor		32+$in0, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in0
+
+	vxor		v31,v31,$rndkey0
+	mtctr		$rounds
+	b		Loop_xts_dec6x
+
+.align	5
+Loop_xts_dec6x:
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_xts_dec6x
+
+	xxlor		32+$eighty7, 1, 1	# 0x010101..87
+
+	subic		$len,$len,96		# $len-=96
+	 vxor		$in0,$twk0,v31		# xor with last round key
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk0,$tweak,$rndkey0
+	 vaddubm	$tweak,$tweak,$tweak
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+
+	subfe.		r0,r0,r0		# borrow?-1:0
+	 vand		$tmp,$tmp,$eighty7
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	 xxlor		32+$in1, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in1
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	 vxor		$in1,$twk1,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk1,$tweak,$rndkey0
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+
+	and		r0,r0,$len
+	 vaddubm	$tweak,$tweak,$tweak
+	vncipher	$out0,$out0,v26
+	vncipher	$out1,$out1,v26
+	 vand		$tmp,$tmp,$eighty7
+	vncipher	$out2,$out2,v26
+	vncipher	$out3,$out3,v26
+	 xxlor		32+$in2, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in2
+	vncipher	$out4,$out4,v26
+	vncipher	$out5,$out5,v26
+
+	add		$inp,$inp,r0		# $inp is adjusted in such
+						# way that at exit from the
+						# loop inX-in5 are loaded
+						# with last "words"
+	 vxor		$in2,$twk2,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk2,$tweak,$rndkey0
+	 vaddubm	$tweak,$tweak,$tweak
+	vncipher	$out0,$out0,v27
+	vncipher	$out1,$out1,v27
+	vncipher	$out2,$out2,v27
+	vncipher	$out3,$out3,v27
+	 vand		$tmp,$tmp,$eighty7
+	vncipher	$out4,$out4,v27
+	vncipher	$out5,$out5,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	 xxlor		32+$in3, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in3
+	vncipher	$out0,$out0,v28
+	vncipher	$out1,$out1,v28
+	 vxor		$in3,$twk3,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk3,$tweak,$rndkey0
+	vncipher	$out2,$out2,v28
+	vncipher	$out3,$out3,v28
+	 vaddubm	$tweak,$tweak,$tweak
+	vncipher	$out4,$out4,v28
+	vncipher	$out5,$out5,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+	 vand		$tmp,$tmp,$eighty7
+
+	vncipher	$out0,$out0,v29
+	vncipher	$out1,$out1,v29
+	 xxlor		32+$in4, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in4
+	vncipher	$out2,$out2,v29
+	vncipher	$out3,$out3,v29
+	 vxor		$in4,$twk4,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk4,$tweak,$rndkey0
+	vncipher	$out4,$out4,v29
+	vncipher	$out5,$out5,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vaddubm	$tweak,$tweak,$tweak
+
+	vncipher	$out0,$out0,v30
+	vncipher	$out1,$out1,v30
+	 vand		$tmp,$tmp,$eighty7
+	vncipher	$out2,$out2,v30
+	vncipher	$out3,$out3,v30
+	 xxlor		32+$in5, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in5
+	vncipher	$out4,$out4,v30
+	vncipher	$out5,$out5,v30
+	 vxor		$in5,$twk5,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk5,$tweak,$rndkey0
+
+	vncipherlast	$out0,$out0,$in0
+	 lvx_u		$in0,$x00,$inp		# load next input block
+	 vaddubm	$tweak,$tweak,$tweak
+	vncipherlast	$out1,$out1,$in1
+	 lvx_u		$in1,$x10,$inp
+	vncipherlast	$out2,$out2,$in2
+	 le?vperm	$in0,$in0,$in0,$leperm
+	 lvx_u		$in2,$x20,$inp
+	 vand		$tmp,$tmp,$eighty7
+	vncipherlast	$out3,$out3,$in3
+	 le?vperm	$in1,$in1,$in1,$leperm
+	 lvx_u		$in3,$x30,$inp
+	vncipherlast	$out4,$out4,$in4
+	 le?vperm	$in2,$in2,$in2,$leperm
+	 lvx_u		$in4,$x40,$inp
+	 xxlor		10, 32+$in0, 32+$in0
+	 xxlor		32+$in0, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in0
+	 xxlor		32+$in0, 10, 10
+	vncipherlast	$out5,$out5,$in5
+	 le?vperm	$in3,$in3,$in3,$leperm
+	 lvx_u		$in5,$x50,$inp
+	 addi		$inp,$inp,0x60
+	 le?vperm	$in4,$in4,$in4,$leperm
+	 le?vperm	$in5,$in5,$in5,$leperm
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	 vxor		$out0,$in0,$twk0
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	 vxor		$out1,$in1,$twk1
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	 vxor		$out2,$in2,$twk2
+	le?vperm	$out4,$out4,$out4,$leperm
+	stvx_u		$out3,$x30,$out
+	 vxor		$out3,$in3,$twk3
+	le?vperm	$out5,$out5,$out5,$leperm
+	stvx_u		$out4,$x40,$out
+	 vxor		$out4,$in4,$twk4
+	stvx_u		$out5,$x50,$out
+	 vxor		$out5,$in5,$twk5
+	addi		$out,$out,0x60
+
+	mtctr		$rounds
+	beq		Loop_xts_dec6x		# did $len-=96 borrow?
+
+	xxlor		32+$eighty7, 2, 2	# 0x010101..87
+
+	addic.		$len,$len,0x60
+	beq		Lxts_dec6x_zero
+	cmpwi		$len,0x20
+	blt		Lxts_dec6x_one
+	nop
+	beq		Lxts_dec6x_two
+	cmpwi		$len,0x40
+	blt		Lxts_dec6x_three
+	nop
+	beq		Lxts_dec6x_four
+
+Lxts_dec6x_five:
+	vxor		$out0,$in1,$twk0
+	vxor		$out1,$in2,$twk1
+	vxor		$out2,$in3,$twk2
+	vxor		$out3,$in4,$twk3
+	vxor		$out4,$in5,$twk4
+
+	bl		_aesp8_xts_dec5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk5		# unused tweak
+	vxor		$twk1,$tweak,$rndkey0
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	vxor		$out0,$in0,$twk1
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$leperm
+	stvx_u		$out3,$x30,$out
+	stvx_u		$out4,$x40,$out
+	addi		$out,$out,0x50
+	bne		Lxts_dec6x_steal
+	b		Lxts_dec6x_done
+
+.align	4
+Lxts_dec6x_four:
+	vxor		$out0,$in2,$twk0
+	vxor		$out1,$in3,$twk1
+	vxor		$out2,$in4,$twk2
+	vxor		$out3,$in5,$twk3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_dec5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk4		# unused tweak
+	vmr		$twk1,$twk5
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	vxor		$out0,$in0,$twk5
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	stvx_u		$out3,$x30,$out
+	addi		$out,$out,0x40
+	bne		Lxts_dec6x_steal
+	b		Lxts_dec6x_done
+
+.align	4
+Lxts_dec6x_three:
+	vxor		$out0,$in3,$twk0
+	vxor		$out1,$in4,$twk1
+	vxor		$out2,$in5,$twk2
+	vxor		$out3,$out3,$out3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_dec5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk3		# unused tweak
+	vmr		$twk1,$twk4
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	vxor		$out0,$in0,$twk4
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	stvx_u		$out2,$x20,$out
+	addi		$out,$out,0x30
+	bne		Lxts_dec6x_steal
+	b		Lxts_dec6x_done
+
+.align	4
+Lxts_dec6x_two:
+	vxor		$out0,$in4,$twk0
+	vxor		$out1,$in5,$twk1
+	vxor		$out2,$out2,$out2
+	vxor		$out3,$out3,$out3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_dec5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk2		# unused tweak
+	vmr		$twk1,$twk3
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	vxor		$out0,$in0,$twk3
+	stvx_u		$out1,$x10,$out
+	addi		$out,$out,0x20
+	bne		Lxts_dec6x_steal
+	b		Lxts_dec6x_done
+
+.align	4
+Lxts_dec6x_one:
+	vxor		$out0,$in5,$twk0
+	nop
+Loop_xts_dec1x:
+	vncipher	$out0,$out0,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out0,$out0,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_xts_dec1x
+
+	subi		r0,$taillen,1
+	vncipher	$out0,$out0,v24
+
+	andi.		r0,r0,16
+	cmpwi		$taillen,0
+	vncipher	$out0,$out0,v25
+
+	sub		$inp,$inp,r0
+	vncipher	$out0,$out0,v26
+
+	lvx_u		$in0,0,$inp
+	vncipher	$out0,$out0,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vncipher	$out0,$out0,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+
+	vncipher	$out0,$out0,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vxor		$twk0,$twk0,v31
+
+	le?vperm	$in0,$in0,$in0,$leperm
+	vncipher	$out0,$out0,v30
+
+	mtctr		$rounds
+	vncipherlast	$out0,$out0,$twk0
+
+	vmr		$twk0,$twk1		# unused tweak
+	vmr		$twk1,$twk2
+	le?vperm	$out0,$out0,$out0,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	addi		$out,$out,0x10
+	vxor		$out0,$in0,$twk2
+	bne		Lxts_dec6x_steal
+	b		Lxts_dec6x_done
+
+.align	4
+Lxts_dec6x_zero:
+	cmpwi		$taillen,0
+	beq		Lxts_dec6x_done
+
+	lvx_u		$in0,0,$inp
+	le?vperm	$in0,$in0,$in0,$leperm
+	vxor		$out0,$in0,$twk1
+Lxts_dec6x_steal:
+	vncipher	$out0,$out0,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out0,$out0,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Lxts_dec6x_steal
+
+	add		$inp,$inp,$taillen
+	vncipher	$out0,$out0,v24
+
+	cmpwi		$taillen,0
+	vncipher	$out0,$out0,v25
+
+	lvx_u		$in0,0,$inp
+	vncipher	$out0,$out0,v26
+
+	lvsr		$inpperm,0,$taillen	# $in5 is no more
+	vncipher	$out0,$out0,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vncipher	$out0,$out0,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+
+	vncipher	$out0,$out0,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vxor		$twk1,$twk1,v31
+
+	le?vperm	$in0,$in0,$in0,$leperm
+	vncipher	$out0,$out0,v30
+
+	vperm		$in0,$in0,$in0,$inpperm
+	vncipherlast	$tmp,$out0,$twk1
+
+	le?vperm	$out0,$tmp,$tmp,$leperm
+	le?stvx_u	$out0,0,$out
+	be?stvx_u	$tmp,0,$out
+
+	vxor		$out0,$out0,$out0
+	vspltisb	$out1,-1
+	vperm		$out0,$out0,$out1,$inpperm
+	vsel		$out0,$in0,$tmp,$out0
+	vxor		$out0,$out0,$twk0
+
+	subi		r30,$out,1
+	mtctr		$taillen
+Loop_xts_dec6x_steal:
+	lbzu		r0,1(r30)
+	stb		r0,16(r30)
+	bdnz		Loop_xts_dec6x_steal
+
+	li		$taillen,0
+	mtctr		$rounds
+	b		Loop_xts_dec1x		# one more time...
+
+.align	4
+Lxts_dec6x_done:
+	${UCMP}i	$ivp,0
+	beq		Lxts_dec6x_ret
+
+	vxor		$tweak,$twk0,$rndkey0
+	le?vperm	$tweak,$tweak,$tweak,$leperm
+	stvx_u		$tweak,0,$ivp
+
+Lxts_dec6x_ret:
+	mtlr		r11
+	li		r10,`$FRAME+15`
+	li		r11,`$FRAME+31`
+	stvx		$seven,r10,$sp		# wipe copies of round keys
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,0x04,1,0x80,6,6,0
+	.long		0
+
+.align	5
+_aesp8_xts_dec5x:
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		_aesp8_xts_dec5x
+
+	subi		r0,$taillen,1
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+
+	andi.		r0,r0,16
+	cmpwi		$taillen,0
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	 vxor		$twk0,$twk0,v31
+
+	sub		$inp,$inp,r0
+	vncipher	$out0,$out0,v26
+	vncipher	$out1,$out1,v26
+	vncipher	$out2,$out2,v26
+	vncipher	$out3,$out3,v26
+	vncipher	$out4,$out4,v26
+	 vxor		$in1,$twk1,v31
+
+	vncipher	$out0,$out0,v27
+	lvx_u		$in0,0,$inp
+	vncipher	$out1,$out1,v27
+	vncipher	$out2,$out2,v27
+	vncipher	$out3,$out3,v27
+	vncipher	$out4,$out4,v27
+	 vxor		$in2,$twk2,v31
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vncipher	$out0,$out0,v28
+	vncipher	$out1,$out1,v28
+	vncipher	$out2,$out2,v28
+	vncipher	$out3,$out3,v28
+	vncipher	$out4,$out4,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+	 vxor		$in3,$twk3,v31
+
+	vncipher	$out0,$out0,v29
+	le?vperm	$in0,$in0,$in0,$leperm
+	vncipher	$out1,$out1,v29
+	vncipher	$out2,$out2,v29
+	vncipher	$out3,$out3,v29
+	vncipher	$out4,$out4,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vxor		$in4,$twk4,v31
+
+	vncipher	$out0,$out0,v30
+	vncipher	$out1,$out1,v30
+	vncipher	$out2,$out2,v30
+	vncipher	$out3,$out3,v30
+	vncipher	$out4,$out4,v30
+
+	vncipherlast	$out0,$out0,$twk0
+	vncipherlast	$out1,$out1,$in1
+	vncipherlast	$out2,$out2,$in2
+	vncipherlast	$out3,$out3,$in3
+	vncipherlast	$out4,$out4,$in4
+	mtctr		$rounds
+	blr
+        .long   	0
+        .byte   	0,12,0x14,0,0,0,0,0
+___
+}}	}}}
+
+my $consts=1;
+foreach(split("\n",$code)) {
+        s/\`([^\`]*)\`/eval($1)/geo;
+
+	# constants table endian-specific conversion
+	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
+	    my $conv=$3;
+	    my @bytes=();
+
+	    # convert to endian-agnostic format
+	    if ($1 eq "long") {
+	      foreach (split(/,\s*/,$2)) {
+		my $l = /^0/?oct:int;
+		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+	      }
+	    } else {
+		@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
+	    }
+
+	    # little-endian conversion
+	    if ($flavour =~ /le$/o) {
+		SWITCH: for($conv)  {
+		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
+		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
+		}
+	    }
+
+	    #emit
+	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+	    next;
+	}
+	$consts=0 if (m/Lconsts:/o);	# end of table
+
+	# instructions prefixed with '?' are endian-specific and need
+	# to be adjusted accordingly...
+	if ($flavour =~ /le$/o) {	# little-endian
+	    s/le\?//o		or
+	    s/be\?/#be#/o	or
+	    s/\?lvsr/lvsl/o	or
+	    s/\?lvsl/lvsr/o	or
+	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+	} else {			# big-endian
+	    s/le\?/#le#/o	or
+	    s/be\?//o		or
+	    s/\?([a-z]+)/$1/o;
+	}
+
+        print $_,"\n";
+}
+
+close STDOUT;
diff --git a/arch/powerpc/crypto/chacha-p10-glue.c b/arch/powerpc/crypto/chacha-p10-glue.c
index 74fb86b0d209..7c728755852e 100644
--- a/arch/powerpc/crypto/chacha-p10-glue.c
+++ b/arch/powerpc/crypto/chacha-p10-glue.c
@@ -197,6 +197,9 @@ static struct skcipher_alg algs[] = {
 
 static int __init chacha_p10_init(void)
 {
+	if (!cpu_has_feature(CPU_FTR_ARCH_31))
+		return 0;
+
 	static_branch_enable(&have_p10);
 
 	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
@@ -204,10 +207,13 @@ static int __init chacha_p10_init(void)
 
 static void __exit chacha_p10_exit(void)
 {
+	if (!static_branch_likely(&have_p10))
+		return;
+
 	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
 }
 
-module_cpu_feature_match(PPC_MODULE_FEATURE_P10, chacha_p10_init);
+module_init(chacha_p10_init);
 module_exit(chacha_p10_exit);
 
 MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (P10 accelerated)");
diff --git a/arch/powerpc/crypto/crc-vpmsum_test.c b/arch/powerpc/crypto/crc-vpmsum_test.c
deleted file mode 100644
index c61a874a3a5c..000000000000
--- a/arch/powerpc/crypto/crc-vpmsum_test.c
+++ /dev/null
@@ -1,133 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CRC vpmsum tester
- * Copyright 2017 Daniel Axtens, IBM Corporation.
- */
-
-#include <linux/crc-t10dif.h>
-#include <linux/crc32.h>
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/cpufeature.h>
-#include <asm/switch_to.h>
-
-static unsigned long iterations = 10000;
-
-#define MAX_CRC_LENGTH 65535
-
-
-static int __init crc_test_init(void)
-{
-	u16 crc16 = 0, verify16 = 0;
-	__le32 verify32le = 0;
-	unsigned char *data;
-	u32 verify32 = 0;
-	unsigned long i;
-	__le32 crc32;
-	int ret;
-
-	struct crypto_shash *crct10dif_tfm;
-	struct crypto_shash *crc32c_tfm;
-
-	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
-		return -ENODEV;
-
-	data = kmalloc(MAX_CRC_LENGTH, GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0);
-
-	if (IS_ERR(crct10dif_tfm)) {
-		pr_err("Error allocating crc-t10dif\n");
-		goto free_buf;
-	}
-
-	crc32c_tfm = crypto_alloc_shash("crc32c", 0, 0);
-
-	if (IS_ERR(crc32c_tfm)) {
-		pr_err("Error allocating crc32c\n");
-		goto free_16;
-	}
-
-	do {
-		SHASH_DESC_ON_STACK(crct10dif_shash, crct10dif_tfm);
-		SHASH_DESC_ON_STACK(crc32c_shash, crc32c_tfm);
-
-		crct10dif_shash->tfm = crct10dif_tfm;
-		ret = crypto_shash_init(crct10dif_shash);
-
-		if (ret) {
-			pr_err("Error initing crc-t10dif\n");
-			goto free_32;
-		}
-
-
-		crc32c_shash->tfm = crc32c_tfm;
-		ret = crypto_shash_init(crc32c_shash);
-
-		if (ret) {
-			pr_err("Error initing crc32c\n");
-			goto free_32;
-		}
-
-		pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations);
-		for (i=0; i<iterations; i++) {
-			size_t offset = get_random_u32_below(16);
-			size_t len = get_random_u32_below(MAX_CRC_LENGTH);
-
-			if (len <= offset)
-				continue;
-			get_random_bytes(data, len);
-			len -= offset;
-
-			crypto_shash_update(crct10dif_shash, data+offset, len);
-			crypto_shash_final(crct10dif_shash, (u8 *)(&crc16));
-			verify16 = crc_t10dif_generic(verify16, data+offset, len);
-
-
-			if (crc16 != verify16) {
-				pr_err("FAILURE in CRC16: got 0x%04x expected 0x%04x (len %lu)\n",
-				       crc16, verify16, len);
-				break;
-			}
-
-			crypto_shash_update(crc32c_shash, data+offset, len);
-			crypto_shash_final(crc32c_shash, (u8 *)(&crc32));
-			verify32 = le32_to_cpu(verify32le);
-		        verify32le = ~cpu_to_le32(__crc32c_le(~verify32, data+offset, len));
-			if (crc32 != verify32le) {
-				pr_err("FAILURE in CRC32: got 0x%08x expected 0x%08x (len %lu)\n",
-				       crc32, verify32, len);
-				break;
-			}
-		cond_resched();
-		}
-		pr_info("crc-vpmsum_test done, completed %lu iterations\n", i);
-	} while (0);
-
-free_32:
-	crypto_free_shash(crc32c_tfm);
-
-free_16:
-	crypto_free_shash(crct10dif_tfm);
-
-free_buf:
-	kfree(data);
-
-	return 0;
-}
-
-static void __exit crc_test_exit(void) {}
-
-module_init(crc_test_init);
-module_exit(crc_test_exit);
-module_param(iterations, long, 0400);
-
-MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
-MODULE_DESCRIPTION("Vector polynomial multiply-sum CRC tester");
-MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S
deleted file mode 100644
index b0f87f595b26..000000000000
--- a/arch/powerpc/crypto/crc32-vpmsum_core.S
+++ /dev/null
@@ -1,746 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Core of the accelerated CRC algorithm.
- * In your file, define the constants and CRC_FUNCTION_NAME
- * Then include this file.
- *
- * Calculate the checksum of data that is 16 byte aligned and a multiple of
- * 16 bytes.
- *
- * The first step is to reduce it to 1024 bits. We do this in 8 parallel
- * chunks in order to mask the latency of the vpmsum instructions. If we
- * have more than 32 kB of data to checksum we repeat this step multiple
- * times, passing in the previous 1024 bits.
- *
- * The next step is to reduce the 1024 bits to 64 bits. This step adds
- * 32 bits of 0s to the end - this matches what a CRC does. We just
- * calculate constants that land the data in this 32 bits.
- *
- * We then use fixed point Barrett reduction to compute a mod n over GF(2)
- * for n = CRC using POWER8 instructions. We use x = 32.
- *
- * https://en.wikipedia.org/wiki/Barrett_reduction
- *
- * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
-*/
-
-#include <asm/ppc_asm.h>
-#include <asm/ppc-opcode.h>
-
-#define MAX_SIZE	32768
-
-	.text
-
-#if defined(__BIG_ENDIAN__) && defined(REFLECT)
-#define BYTESWAP_DATA
-#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
-#define BYTESWAP_DATA
-#else
-#undef BYTESWAP_DATA
-#endif
-
-#define off16		r25
-#define off32		r26
-#define off48		r27
-#define off64		r28
-#define off80		r29
-#define off96		r30
-#define off112		r31
-
-#define const1		v24
-#define const2		v25
-
-#define byteswap	v26
-#define	mask_32bit	v27
-#define	mask_64bit	v28
-#define zeroes		v29
-
-#ifdef BYTESWAP_DATA
-#define VPERM(A, B, C, D) vperm	A, B, C, D
-#else
-#define VPERM(A, B, C, D)
-#endif
-
-/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
-FUNC_START(CRC_FUNCTION_NAME)
-	std	r31,-8(r1)
-	std	r30,-16(r1)
-	std	r29,-24(r1)
-	std	r28,-32(r1)
-	std	r27,-40(r1)
-	std	r26,-48(r1)
-	std	r25,-56(r1)
-
-	li	off16,16
-	li	off32,32
-	li	off48,48
-	li	off64,64
-	li	off80,80
-	li	off96,96
-	li	off112,112
-	li	r0,0
-
-	/* Enough room for saving 10 non volatile VMX registers */
-	subi	r6,r1,56+10*16
-	subi	r7,r1,56+2*16
-
-	stvx	v20,0,r6
-	stvx	v21,off16,r6
-	stvx	v22,off32,r6
-	stvx	v23,off48,r6
-	stvx	v24,off64,r6
-	stvx	v25,off80,r6
-	stvx	v26,off96,r6
-	stvx	v27,off112,r6
-	stvx	v28,0,r7
-	stvx	v29,off16,r7
-
-	mr	r10,r3
-
-	vxor	zeroes,zeroes,zeroes
-	vspltisw v0,-1
-
-	vsldoi	mask_32bit,zeroes,v0,4
-	vsldoi	mask_64bit,zeroes,v0,8
-
-	/* Get the initial value into v8 */
-	vxor	v8,v8,v8
-	MTVRD(v8, R3)
-#ifdef REFLECT
-	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
-#else
-	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
-#endif
-
-#ifdef BYTESWAP_DATA
-	LOAD_REG_ADDR(r3, .byteswap_constant)
-	lvx	byteswap,0,r3
-	addi	r3,r3,16
-#endif
-
-	cmpdi	r5,256
-	blt	.Lshort
-
-	rldicr	r6,r5,0,56
-
-	/* Checksum in blocks of MAX_SIZE */
-1:	lis	r7,MAX_SIZE@h
-	ori	r7,r7,MAX_SIZE@l
-	mr	r9,r7
-	cmpd	r6,r7
-	bgt	2f
-	mr	r7,r6
-2:	subf	r6,r7,r6
-
-	/* our main loop does 128 bytes at a time */
-	srdi	r7,r7,7
-
-	/*
-	 * Work out the offset into the constants table to start at. Each
-	 * constant is 16 bytes, and it is used against 128 bytes of input
-	 * data - 128 / 16 = 8
-	 */
-	sldi	r8,r7,4
-	srdi	r9,r9,3
-	subf	r8,r8,r9
-
-	/* We reduce our final 128 bytes in a separate step */
-	addi	r7,r7,-1
-	mtctr	r7
-
-	LOAD_REG_ADDR(r3, .constants)
-
-	/* Find the start of our constants */
-	add	r3,r3,r8
-
-	/* zero v0-v7 which will contain our checksums */
-	vxor	v0,v0,v0
-	vxor	v1,v1,v1
-	vxor	v2,v2,v2
-	vxor	v3,v3,v3
-	vxor	v4,v4,v4
-	vxor	v5,v5,v5
-	vxor	v6,v6,v6
-	vxor	v7,v7,v7
-
-	lvx	const1,0,r3
-
-	/*
-	 * If we are looping back to consume more data we use the values
-	 * already in v16-v23.
-	 */
-	cmpdi	r0,1
-	beq	2f
-
-	/* First warm up pass */
-	lvx	v16,0,r4
-	lvx	v17,off16,r4
-	VPERM(v16,v16,v16,byteswap)
-	VPERM(v17,v17,v17,byteswap)
-	lvx	v18,off32,r4
-	lvx	v19,off48,r4
-	VPERM(v18,v18,v18,byteswap)
-	VPERM(v19,v19,v19,byteswap)
-	lvx	v20,off64,r4
-	lvx	v21,off80,r4
-	VPERM(v20,v20,v20,byteswap)
-	VPERM(v21,v21,v21,byteswap)
-	lvx	v22,off96,r4
-	lvx	v23,off112,r4
-	VPERM(v22,v22,v22,byteswap)
-	VPERM(v23,v23,v23,byteswap)
-	addi	r4,r4,8*16
-
-	/* xor in initial value */
-	vxor	v16,v16,v8
-
-2:	bdz	.Lfirst_warm_up_done
-
-	addi	r3,r3,16
-	lvx	const2,0,r3
-
-	/* Second warm up pass */
-	VPMSUMD(v8,v16,const1)
-	lvx	v16,0,r4
-	VPERM(v16,v16,v16,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v9,v17,const1)
-	lvx	v17,off16,r4
-	VPERM(v17,v17,v17,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v10,v18,const1)
-	lvx	v18,off32,r4
-	VPERM(v18,v18,v18,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v11,v19,const1)
-	lvx	v19,off48,r4
-	VPERM(v19,v19,v19,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v12,v20,const1)
-	lvx	v20,off64,r4
-	VPERM(v20,v20,v20,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v13,v21,const1)
-	lvx	v21,off80,r4
-	VPERM(v21,v21,v21,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v14,v22,const1)
-	lvx	v22,off96,r4
-	VPERM(v22,v22,v22,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v15,v23,const1)
-	lvx	v23,off112,r4
-	VPERM(v23,v23,v23,byteswap)
-
-	addi	r4,r4,8*16
-
-	bdz	.Lfirst_cool_down
-
-	/*
-	 * main loop. We modulo schedule it such that it takes three iterations
-	 * to complete - first iteration load, second iteration vpmsum, third
-	 * iteration xor.
-	 */
-	.balign	16
-4:	lvx	const1,0,r3
-	addi	r3,r3,16
-	ori	r2,r2,0
-
-	vxor	v0,v0,v8
-	VPMSUMD(v8,v16,const2)
-	lvx	v16,0,r4
-	VPERM(v16,v16,v16,byteswap)
-	ori	r2,r2,0
-
-	vxor	v1,v1,v9
-	VPMSUMD(v9,v17,const2)
-	lvx	v17,off16,r4
-	VPERM(v17,v17,v17,byteswap)
-	ori	r2,r2,0
-
-	vxor	v2,v2,v10
-	VPMSUMD(v10,v18,const2)
-	lvx	v18,off32,r4
-	VPERM(v18,v18,v18,byteswap)
-	ori	r2,r2,0
-
-	vxor	v3,v3,v11
-	VPMSUMD(v11,v19,const2)
-	lvx	v19,off48,r4
-	VPERM(v19,v19,v19,byteswap)
-	lvx	const2,0,r3
-	ori	r2,r2,0
-
-	vxor	v4,v4,v12
-	VPMSUMD(v12,v20,const1)
-	lvx	v20,off64,r4
-	VPERM(v20,v20,v20,byteswap)
-	ori	r2,r2,0
-
-	vxor	v5,v5,v13
-	VPMSUMD(v13,v21,const1)
-	lvx	v21,off80,r4
-	VPERM(v21,v21,v21,byteswap)
-	ori	r2,r2,0
-
-	vxor	v6,v6,v14
-	VPMSUMD(v14,v22,const1)
-	lvx	v22,off96,r4
-	VPERM(v22,v22,v22,byteswap)
-	ori	r2,r2,0
-
-	vxor	v7,v7,v15
-	VPMSUMD(v15,v23,const1)
-	lvx	v23,off112,r4
-	VPERM(v23,v23,v23,byteswap)
-
-	addi	r4,r4,8*16
-
-	bdnz	4b
-
-.Lfirst_cool_down:
-	/* First cool down pass */
-	lvx	const1,0,r3
-	addi	r3,r3,16
-
-	vxor	v0,v0,v8
-	VPMSUMD(v8,v16,const1)
-	ori	r2,r2,0
-
-	vxor	v1,v1,v9
-	VPMSUMD(v9,v17,const1)
-	ori	r2,r2,0
-
-	vxor	v2,v2,v10
-	VPMSUMD(v10,v18,const1)
-	ori	r2,r2,0
-
-	vxor	v3,v3,v11
-	VPMSUMD(v11,v19,const1)
-	ori	r2,r2,0
-
-	vxor	v4,v4,v12
-	VPMSUMD(v12,v20,const1)
-	ori	r2,r2,0
-
-	vxor	v5,v5,v13
-	VPMSUMD(v13,v21,const1)
-	ori	r2,r2,0
-
-	vxor	v6,v6,v14
-	VPMSUMD(v14,v22,const1)
-	ori	r2,r2,0
-
-	vxor	v7,v7,v15
-	VPMSUMD(v15,v23,const1)
-	ori	r2,r2,0
-
-.Lsecond_cool_down:
-	/* Second cool down pass */
-	vxor	v0,v0,v8
-	vxor	v1,v1,v9
-	vxor	v2,v2,v10
-	vxor	v3,v3,v11
-	vxor	v4,v4,v12
-	vxor	v5,v5,v13
-	vxor	v6,v6,v14
-	vxor	v7,v7,v15
-
-#ifdef REFLECT
-	/*
-	 * vpmsumd produces a 96 bit result in the least significant bits
-	 * of the register. Since we are bit reflected we have to shift it
-	 * left 32 bits so it occupies the least significant bits in the
-	 * bit reflected domain.
-	 */
-	vsldoi	v0,v0,zeroes,4
-	vsldoi	v1,v1,zeroes,4
-	vsldoi	v2,v2,zeroes,4
-	vsldoi	v3,v3,zeroes,4
-	vsldoi	v4,v4,zeroes,4
-	vsldoi	v5,v5,zeroes,4
-	vsldoi	v6,v6,zeroes,4
-	vsldoi	v7,v7,zeroes,4
-#endif
-
-	/* xor with last 1024 bits */
-	lvx	v8,0,r4
-	lvx	v9,off16,r4
-	VPERM(v8,v8,v8,byteswap)
-	VPERM(v9,v9,v9,byteswap)
-	lvx	v10,off32,r4
-	lvx	v11,off48,r4
-	VPERM(v10,v10,v10,byteswap)
-	VPERM(v11,v11,v11,byteswap)
-	lvx	v12,off64,r4
-	lvx	v13,off80,r4
-	VPERM(v12,v12,v12,byteswap)
-	VPERM(v13,v13,v13,byteswap)
-	lvx	v14,off96,r4
-	lvx	v15,off112,r4
-	VPERM(v14,v14,v14,byteswap)
-	VPERM(v15,v15,v15,byteswap)
-
-	addi	r4,r4,8*16
-
-	vxor	v16,v0,v8
-	vxor	v17,v1,v9
-	vxor	v18,v2,v10
-	vxor	v19,v3,v11
-	vxor	v20,v4,v12
-	vxor	v21,v5,v13
-	vxor	v22,v6,v14
-	vxor	v23,v7,v15
-
-	li	r0,1
-	cmpdi	r6,0
-	addi	r6,r6,128
-	bne	1b
-
-	/* Work out how many bytes we have left */
-	andi.	r5,r5,127
-
-	/* Calculate where in the constant table we need to start */
-	subfic	r6,r5,128
-	add	r3,r3,r6
-
-	/* How many 16 byte chunks are in the tail */
-	srdi	r7,r5,4
-	mtctr	r7
-
-	/*
-	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
-	 * 32 bits to include the trailing 32 bits of zeros
-	 */
-	lvx	v0,0,r3
-	lvx	v1,off16,r3
-	lvx	v2,off32,r3
-	lvx	v3,off48,r3
-	lvx	v4,off64,r3
-	lvx	v5,off80,r3
-	lvx	v6,off96,r3
-	lvx	v7,off112,r3
-	addi	r3,r3,8*16
-
-	VPMSUMW(v0,v16,v0)
-	VPMSUMW(v1,v17,v1)
-	VPMSUMW(v2,v18,v2)
-	VPMSUMW(v3,v19,v3)
-	VPMSUMW(v4,v20,v4)
-	VPMSUMW(v5,v21,v5)
-	VPMSUMW(v6,v22,v6)
-	VPMSUMW(v7,v23,v7)
-
-	/* Now reduce the tail (0 - 112 bytes) */
-	cmpdi	r7,0
-	beq	1f
-
-	lvx	v16,0,r4
-	lvx	v17,0,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off16,r4
-	lvx	v17,off16,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off32,r4
-	lvx	v17,off32,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off48,r4
-	lvx	v17,off48,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off64,r4
-	lvx	v17,off64,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off80,r4
-	lvx	v17,off80,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off96,r4
-	lvx	v17,off96,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-
-	/* Now xor all the parallel chunks together */
-1:	vxor	v0,v0,v1
-	vxor	v2,v2,v3
-	vxor	v4,v4,v5
-	vxor	v6,v6,v7
-
-	vxor	v0,v0,v2
-	vxor	v4,v4,v6
-
-	vxor	v0,v0,v4
-
-.Lbarrett_reduction:
-	/* Barrett constants */
-	LOAD_REG_ADDR(r3, .barrett_constants)
-
-	lvx	const1,0,r3
-	lvx	const2,off16,r3
-
-	vsldoi	v1,v0,v0,8
-	vxor	v0,v0,v1		/* xor two 64 bit results together */
-
-#ifdef REFLECT
-	/* shift left one bit */
-	vspltisb v1,1
-	vsl	v0,v0,v1
-#endif
-
-	vand	v0,v0,mask_64bit
-#ifndef REFLECT
-	/*
-	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
-	 * the multiple of our polynomial that we need to subtract. By
-	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
-	 * result back down 2x bits, we round down to the nearest multiple.
-	 */
-	VPMSUMD(v1,v0,const1)	/* ma */
-	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
-	VPMSUMD(v1,v1,const2)	/* qn */
-	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
-
-	/*
-	 * Get the result into r3. We need to shift it left 8 bytes:
-	 * V0 [ 0 1 2 X ]
-	 * V0 [ 0 X 2 3 ]
-	 */
-	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
-#else
-	/*
-	 * The reflected version of Barrett reduction. Instead of bit
-	 * reflecting our data (which is expensive to do), we bit reflect our
-	 * constants and our algorithm, which means the intermediate data in
-	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
-	 * the algorithm because we don't carry in mod 2 arithmetic.
-	 */
-	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
-	VPMSUMD(v1,v1,const1)		/* ma */
-	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
-	VPMSUMD(v1,v1,const2)		/* qn */
-	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
-
-	/*
-	 * Since we are bit reflected, the result (ie the low 32 bits) is in
-	 * the high 32 bits. We just need to shift it left 4 bytes
-	 * V0 [ 0 1 X 3 ]
-	 * V0 [ 0 X 2 3 ]
-	 */
-	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of */
-#endif
-
-	/* Get it into r3 */
-	MFVRD(R3, v0)
-
-.Lout:
-	subi	r6,r1,56+10*16
-	subi	r7,r1,56+2*16
-
-	lvx	v20,0,r6
-	lvx	v21,off16,r6
-	lvx	v22,off32,r6
-	lvx	v23,off48,r6
-	lvx	v24,off64,r6
-	lvx	v25,off80,r6
-	lvx	v26,off96,r6
-	lvx	v27,off112,r6
-	lvx	v28,0,r7
-	lvx	v29,off16,r7
-
-	ld	r31,-8(r1)
-	ld	r30,-16(r1)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	ld	r27,-40(r1)
-	ld	r26,-48(r1)
-	ld	r25,-56(r1)
-
-	blr
-
-.Lfirst_warm_up_done:
-	lvx	const1,0,r3
-	addi	r3,r3,16
-
-	VPMSUMD(v8,v16,const1)
-	VPMSUMD(v9,v17,const1)
-	VPMSUMD(v10,v18,const1)
-	VPMSUMD(v11,v19,const1)
-	VPMSUMD(v12,v20,const1)
-	VPMSUMD(v13,v21,const1)
-	VPMSUMD(v14,v22,const1)
-	VPMSUMD(v15,v23,const1)
-
-	b	.Lsecond_cool_down
-
-.Lshort:
-	cmpdi	r5,0
-	beq	.Lzero
-
-	LOAD_REG_ADDR(r3, .short_constants)
-
-	/* Calculate where in the constant table we need to start */
-	subfic	r6,r5,256
-	add	r3,r3,r6
-
-	/* How many 16 byte chunks? */
-	srdi	r7,r5,4
-	mtctr	r7
-
-	vxor	v19,v19,v19
-	vxor	v20,v20,v20
-
-	lvx	v0,0,r4
-	lvx	v16,0,r3
-	VPERM(v0,v0,v16,byteswap)
-	vxor	v0,v0,v8	/* xor in initial value */
-	VPMSUMW(v0,v0,v16)
-	bdz	.Lv0
-
-	lvx	v1,off16,r4
-	lvx	v17,off16,r3
-	VPERM(v1,v1,v17,byteswap)
-	VPMSUMW(v1,v1,v17)
-	bdz	.Lv1
-
-	lvx	v2,off32,r4
-	lvx	v16,off32,r3
-	VPERM(v2,v2,v16,byteswap)
-	VPMSUMW(v2,v2,v16)
-	bdz	.Lv2
-
-	lvx	v3,off48,r4
-	lvx	v17,off48,r3
-	VPERM(v3,v3,v17,byteswap)
-	VPMSUMW(v3,v3,v17)
-	bdz	.Lv3
-
-	lvx	v4,off64,r4
-	lvx	v16,off64,r3
-	VPERM(v4,v4,v16,byteswap)
-	VPMSUMW(v4,v4,v16)
-	bdz	.Lv4
-
-	lvx	v5,off80,r4
-	lvx	v17,off80,r3
-	VPERM(v5,v5,v17,byteswap)
-	VPMSUMW(v5,v5,v17)
-	bdz	.Lv5
-
-	lvx	v6,off96,r4
-	lvx	v16,off96,r3
-	VPERM(v6,v6,v16,byteswap)
-	VPMSUMW(v6,v6,v16)
-	bdz	.Lv6
-
-	lvx	v7,off112,r4
-	lvx	v17,off112,r3
-	VPERM(v7,v7,v17,byteswap)
-	VPMSUMW(v7,v7,v17)
-	bdz	.Lv7
-
-	addi	r3,r3,128
-	addi	r4,r4,128
-
-	lvx	v8,0,r4
-	lvx	v16,0,r3
-	VPERM(v8,v8,v16,byteswap)
-	VPMSUMW(v8,v8,v16)
-	bdz	.Lv8
-
-	lvx	v9,off16,r4
-	lvx	v17,off16,r3
-	VPERM(v9,v9,v17,byteswap)
-	VPMSUMW(v9,v9,v17)
-	bdz	.Lv9
-
-	lvx	v10,off32,r4
-	lvx	v16,off32,r3
-	VPERM(v10,v10,v16,byteswap)
-	VPMSUMW(v10,v10,v16)
-	bdz	.Lv10
-
-	lvx	v11,off48,r4
-	lvx	v17,off48,r3
-	VPERM(v11,v11,v17,byteswap)
-	VPMSUMW(v11,v11,v17)
-	bdz	.Lv11
-
-	lvx	v12,off64,r4
-	lvx	v16,off64,r3
-	VPERM(v12,v12,v16,byteswap)
-	VPMSUMW(v12,v12,v16)
-	bdz	.Lv12
-
-	lvx	v13,off80,r4
-	lvx	v17,off80,r3
-	VPERM(v13,v13,v17,byteswap)
-	VPMSUMW(v13,v13,v17)
-	bdz	.Lv13
-
-	lvx	v14,off96,r4
-	lvx	v16,off96,r3
-	VPERM(v14,v14,v16,byteswap)
-	VPMSUMW(v14,v14,v16)
-	bdz	.Lv14
-
-	lvx	v15,off112,r4
-	lvx	v17,off112,r3
-	VPERM(v15,v15,v17,byteswap)
-	VPMSUMW(v15,v15,v17)
-
-.Lv15:	vxor	v19,v19,v15
-.Lv14:	vxor	v20,v20,v14
-.Lv13:	vxor	v19,v19,v13
-.Lv12:	vxor	v20,v20,v12
-.Lv11:	vxor	v19,v19,v11
-.Lv10:	vxor	v20,v20,v10
-.Lv9:	vxor	v19,v19,v9
-.Lv8:	vxor	v20,v20,v8
-.Lv7:	vxor	v19,v19,v7
-.Lv6:	vxor	v20,v20,v6
-.Lv5:	vxor	v19,v19,v5
-.Lv4:	vxor	v20,v20,v4
-.Lv3:	vxor	v19,v19,v3
-.Lv2:	vxor	v20,v20,v2
-.Lv1:	vxor	v19,v19,v1
-.Lv0:	vxor	v20,v20,v0
-
-	vxor	v0,v19,v20
-
-	b	.Lbarrett_reduction
-
-.Lzero:
-	mr	r3,r10
-	b	.Lout
-
-FUNC_END(CRC_FUNCTION_NAME)
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_asm.S b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
deleted file mode 100644
index bf442004ea1f..000000000000
--- a/arch/powerpc/crypto/crc32c-vpmsum_asm.S
+++ /dev/null
@@ -1,842 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Calculate a crc32c with vpmsum acceleration
- *
- * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
- */
-	.section	.rodata
-.balign 16
-
-.byteswap_constant:
-	/* byte reverse permute constant */
-	.octa 0x0F0E0D0C0B0A09080706050403020100
-
-.constants:
-
-	/* Reduce 262144 kbits to 1024 bits */
-	/* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
-	.octa 0x00000000b6ca9e20000000009c37c408
-
-	/* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
-	.octa 0x00000000350249a800000001b51df26c
-
-	/* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
-	.octa 0x00000001862dac54000000000724b9d0
-
-	/* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
-	.octa 0x00000001d87fb48c00000001c00532fe
-
-	/* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
-	.octa 0x00000001f39b699e00000000f05a9362
-
-	/* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
-	.octa 0x0000000101da11b400000001e1007970
-
-	/* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
-	.octa 0x00000001cab571e000000000a57366ee
-
-	/* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
-	.octa 0x00000000c7020cfe0000000192011284
-
-	/* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
-	.octa 0x00000000cdaed1ae0000000162716d9a
-
-	/* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
-	.octa 0x00000001e804effc00000000cd97ecde
-
-	/* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
-	.octa 0x0000000077c3ea3a0000000058812bc0
-
-	/* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
-	.octa 0x0000000068df31b40000000088b8c12e
-
-	/* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
-	.octa 0x00000000b059b6c200000001230b234c
-
-	/* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
-	.octa 0x0000000145fb8ed800000001120b416e
-
-	/* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
-	.octa 0x00000000cbc0916800000001974aecb0
-
-	/* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
-	.octa 0x000000005ceeedc2000000008ee3f226
-
-	/* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
-	.octa 0x0000000047d74e8600000001089aba9a
-
-	/* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
-	.octa 0x00000001407e9e220000000065113872
-
-	/* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
-	.octa 0x00000001da967bda000000005c07ec10
-
-	/* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
-	.octa 0x000000006c8983680000000187590924
-
-	/* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
-	.octa 0x00000000f2d14c9800000000e35da7c6
-
-	/* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
-	.octa 0x00000001993c6ad4000000000415855a
-
-	/* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
-	.octa 0x000000014683d1ac0000000073617758
-
-	/* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
-	.octa 0x00000001a7c93e6c0000000176021d28
-
-	/* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
-	.octa 0x000000010211e90a00000001c358fd0a
-
-	/* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
-	.octa 0x000000001119403e00000001ff7a2c18
-
-	/* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
-	.octa 0x000000001c3261aa00000000f2d9f7e4
-
-	/* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
-	.octa 0x000000014e37a634000000016cf1f9c8
-
-	/* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
-	.octa 0x0000000073786c0c000000010af9279a
-
-	/* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
-	.octa 0x000000011dc037f80000000004f101e8
-
-	/* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
-	.octa 0x0000000031433dfc0000000070bcf184
-
-	/* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
-	.octa 0x000000009cde8348000000000a8de642
-
-	/* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
-	.octa 0x0000000038d3c2a60000000062ea130c
-
-	/* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
-	.octa 0x000000011b25f26000000001eb31cbb2
-
-	/* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
-	.octa 0x000000001629e6f00000000170783448
-
-	/* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
-	.octa 0x0000000160838b4c00000001a684b4c6
-
-	/* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
-	.octa 0x000000007a44011c00000000253ca5b4
-
-	/* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
-	.octa 0x00000000226f417a0000000057b4b1e2
-
-	/* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
-	.octa 0x0000000045eb2eb400000000b6bd084c
-
-	/* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
-	.octa 0x000000014459d70c0000000123c2d592
-
-	/* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
-	.octa 0x00000001d406ed8200000000159dafce
-
-	/* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
-	.octa 0x0000000160c8e1a80000000127e1a64e
-
-	/* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
-	.octa 0x0000000027ba80980000000056860754
-
-	/* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
-	.octa 0x000000006d92d01800000001e661aae8
-
-	/* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
-	.octa 0x000000012ed7e3f200000000f82c6166
-
-	/* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
-	.octa 0x000000002dc8778800000000c4f9c7ae
-
-	/* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
-	.octa 0x0000000018240bb80000000074203d20
-
-	/* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
-	.octa 0x000000001ad381580000000198173052
-
-	/* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
-	.octa 0x00000001396b78f200000001ce8aba54
-
-	/* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
-	.octa 0x000000011a68133400000001850d5d94
-
-	/* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
-	.octa 0x000000012104732e00000001d609239c
-
-	/* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
-	.octa 0x00000000a140d90c000000001595f048
-
-	/* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
-	.octa 0x00000001b7215eda0000000042ccee08
-
-	/* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
-	.octa 0x00000001aaf1df3c000000010a389d74
-
-	/* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
-	.octa 0x0000000029d15b8a000000012a840da6
-
-	/* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
-	.octa 0x00000000f1a96922000000001d181c0c
-
-	/* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
-	.octa 0x00000001ac80d03c0000000068b7d1f6
-
-	/* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
-	.octa 0x000000000f11d56a000000005b0f14fc
-
-	/* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
-	.octa 0x00000001f1c022a20000000179e9e730
-
-	/* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
-	.octa 0x0000000173d00ae200000001ce1368d6
-
-	/* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
-	.octa 0x00000001d4ffe4ac0000000112c3a84c
-
-	/* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
-	.octa 0x000000016edc5ae400000000de940fee
-
-	/* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
-	.octa 0x00000001f1a0214000000000fe896b7e
-
-	/* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
-	.octa 0x00000000ca0b28a000000001f797431c
-
-	/* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
-	.octa 0x00000001928e30a20000000053e989ba
-
-	/* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
-	.octa 0x0000000097b1b002000000003920cd16
-
-	/* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
-	.octa 0x00000000b15bf90600000001e6f579b8
-
-	/* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
-	.octa 0x00000000411c5d52000000007493cb0a
-
-	/* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
-	.octa 0x00000001c36f330000000001bdd376d8
-
-	/* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
-	.octa 0x00000001119227e0000000016badfee6
-
-	/* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
-	.octa 0x00000000114d47020000000071de5c58
-
-	/* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
-	.octa 0x00000000458b5b9800000000453f317c
-
-	/* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
-	.octa 0x000000012e31fb8e0000000121675cce
-
-	/* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
-	.octa 0x000000005cf619d800000001f409ee92
-
-	/* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
-	.octa 0x0000000063f4d8b200000000f36b9c88
-
-	/* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
-	.octa 0x000000004138dc8a0000000036b398f4
-
-	/* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
-	.octa 0x00000001d29ee8e000000001748f9adc
-
-	/* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
-	.octa 0x000000006a08ace800000001be94ec00
-
-	/* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
-	.octa 0x0000000127d4201000000000b74370d6
-
-	/* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
-	.octa 0x0000000019d76b6200000001174d0b98
-
-	/* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
-	.octa 0x00000001b1471f6e00000000befc06a4
-
-	/* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
-	.octa 0x00000001f64c19cc00000001ae125288
-
-	/* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
-	.octa 0x00000000003c0ea00000000095c19b34
-
-	/* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
-	.octa 0x000000014d73abf600000001a78496f2
-
-	/* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
-	.octa 0x00000001620eb84400000001ac5390a0
-
-	/* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
-	.octa 0x0000000147655048000000002a80ed6e
-
-	/* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
-	.octa 0x0000000067b5077e00000001fa9b0128
-
-	/* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
-	.octa 0x0000000010ffe20600000001ea94929e
-
-	/* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
-	.octa 0x000000000fee8f1e0000000125f4305c
-
-	/* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
-	.octa 0x00000001da26fbae00000001471e2002
-
-	/* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
-	.octa 0x00000001b3a8bd880000000132d2253a
-
-	/* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
-	.octa 0x00000000e8f3898e00000000f26b3592
-
-	/* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
-	.octa 0x00000000b0d0d28c00000000bc8b67b0
-
-	/* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
-	.octa 0x0000000030f2a798000000013a826ef2
-
-	/* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
-	.octa 0x000000000fba10020000000081482c84
-
-	/* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
-	.octa 0x00000000bdb9bd7200000000e77307c2
-
-	/* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
-	.octa 0x0000000075d3bf5a00000000d4a07ec8
-
-	/* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
-	.octa 0x00000000ef1f98a00000000017102100
-
-	/* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
-	.octa 0x00000000689c760200000000db406486
-
-	/* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
-	.octa 0x000000016d5fa5fe0000000192db7f88
-
-	/* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
-	.octa 0x00000001d0d2b9ca000000018bf67b1e
-
-	/* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
-	.octa 0x0000000041e7b470000000007c09163e
-
-	/* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
-	.octa 0x00000001cbb6495e000000000adac060
-
-	/* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
-	.octa 0x000000010052a0b000000000bd8316ae
-
-	/* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
-	.octa 0x00000001d8effb5c000000019f09ab54
-
-	/* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
-	.octa 0x00000001d969853c0000000125155542
-
-	/* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
-	.octa 0x00000000523ccce2000000018fdb5882
-
-	/* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
-	.octa 0x000000001e2436bc00000000e794b3f4
-
-	/* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
-	.octa 0x00000000ddd1c3a2000000016f9bb022
-
-	/* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
-	.octa 0x0000000019fcfe3800000000290c9978
-
-	/* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
-	.octa 0x00000001ce95db640000000083c0f350
-
-	/* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
-	.octa 0x00000000af5828060000000173ea6628
-
-	/* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
-	.octa 0x00000001006388f600000001c8b4e00a
-
-	/* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
-	.octa 0x0000000179eca00a00000000de95d6aa
-
-	/* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
-	.octa 0x0000000122410a6a000000010b7f7248
-
-	/* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
-	.octa 0x000000004288e87c00000001326e3a06
-
-	/* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
-	.octa 0x000000016c5490da00000000bb62c2e6
-
-	/* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
-	.octa 0x00000000d1c71f6e0000000156a4b2c2
-
-	/* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
-	.octa 0x00000001b4ce08a6000000011dfe763a
-
-	/* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
-	.octa 0x00000001466ba60c000000007bcca8e2
-
-	/* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
-	.octa 0x00000001f6c488a40000000186118faa
-
-	/* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
-	.octa 0x000000013bfb06820000000111a65a88
-
-	/* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
-	.octa 0x00000000690e9e54000000003565e1c4
-
-	/* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
-	.octa 0x00000000281346b6000000012ed02a82
-
-	/* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
-	.octa 0x000000015646402400000000c486ecfc
-
-	/* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
-	.octa 0x000000016063a8dc0000000001b951b2
-
-	/* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
-	.octa 0x0000000116a663620000000048143916
-
-	/* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
-	.octa 0x000000017e8aa4d200000001dc2ae124
-
-	/* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
-	.octa 0x00000001728eb10c00000001416c58d6
-
-	/* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
-	.octa 0x00000001b08fd7fa00000000a479744a
-
-	/* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
-	.octa 0x00000001092a16e80000000096ca3a26
-
-	/* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
-	.octa 0x00000000a505637c00000000ff223d4e
-
-	/* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
-	.octa 0x00000000d94869b2000000010e84da42
-
-	/* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
-	.octa 0x00000001c8b203ae00000001b61ba3d0
-
-	/* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
-	.octa 0x000000005704aea000000000680f2de8
-
-	/* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
-	.octa 0x000000012e295fa2000000008772a9a8
-
-	/* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
-	.octa 0x000000011d0908bc0000000155f295bc
-
-	/* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
-	.octa 0x0000000193ed97ea00000000595f9282
-
-	/* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
-	.octa 0x000000013a0f1c520000000164b1c25a
-
-	/* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
-	.octa 0x000000010c2c40c000000000fbd67c50
-
-	/* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
-	.octa 0x00000000ff6fac3e0000000096076268
-
-	/* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
-	.octa 0x000000017b3609c000000001d288e4cc
-
-	/* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
-	.octa 0x0000000088c8c92200000001eaac1bdc
-
-	/* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
-	.octa 0x00000001751baae600000001f1ea39e2
-
-	/* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
-	.octa 0x000000010795297200000001eb6506fc
-
-	/* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
-	.octa 0x0000000162b00abe000000010f806ffe
-
-	/* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
-	.octa 0x000000000d7b404c000000010408481e
-
-	/* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
-	.octa 0x00000000763b13d40000000188260534
-
-	/* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
-	.octa 0x00000000f6dc22d80000000058fc73e0
-
-	/* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
-	.octa 0x000000007daae06000000000391c59b8
-
-	/* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
-	.octa 0x000000013359ab7c000000018b638400
-
-	/* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
-	.octa 0x000000008add438a000000011738f5c4
-
-	/* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
-	.octa 0x00000001edbefdea000000008cf7c6da
-
-	/* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
-	.octa 0x000000004104e0f800000001ef97fb16
-
-	/* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
-	.octa 0x00000000b48a82220000000102130e20
-
-	/* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
-	.octa 0x00000001bcb4684400000000db968898
-
-	/* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
-	.octa 0x000000013293ce0a00000000b5047b5e
-
-	/* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
-	.octa 0x00000001710d0844000000010b90fdb2
-
-	/* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
-	.octa 0x0000000117907f6e000000004834a32e
-
-	/* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
-	.octa 0x0000000087ddf93e0000000059c8f2b0
-
-	/* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
-	.octa 0x000000005970e9b00000000122cec508
-
-	/* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
-	.octa 0x0000000185b2b7d0000000000a330cda
-
-	/* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
-	.octa 0x00000001dcee0efc000000014a47148c
-
-	/* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
-	.octa 0x0000000030da27220000000042c61cb8
-
-	/* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
-	.octa 0x000000012f925a180000000012fe6960
-
-	/* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
-	.octa 0x00000000dd2e357c00000000dbda2c20
-
-	/* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
-	.octa 0x00000000071c80de000000011122410c
-
-	/* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
-	.octa 0x000000011513140a00000000977b2070
-
-	/* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
-	.octa 0x00000001df876e8e000000014050438e
-
-	/* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
-	.octa 0x000000015f81d6ce0000000147c840e8
-
-	/* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
-	.octa 0x000000019dd94dbe00000001cc7c88ce
-
-	/* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
-	.octa 0x00000001373d206e00000001476b35a4
-
-	/* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
-	.octa 0x00000000668ccade000000013d52d508
-
-	/* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
-	.octa 0x00000001b192d268000000008e4be32e
-
-	/* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
-	.octa 0x00000000e30f3a7800000000024120fe
-
-	/* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
-	.octa 0x000000010ef1f7bc00000000ddecddb4
-
-	/* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
-	.octa 0x00000001f5ac738000000000d4d403bc
-
-	/* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
-	.octa 0x000000011822ea7000000001734b89aa
-
-	/* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
-	.octa 0x00000000c3a33848000000010e7a58d6
-
-	/* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
-	.octa 0x00000001bd151c2400000001f9f04e9c
-
-	/* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
-	.octa 0x0000000056002d7600000000b692225e
-
-	/* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
-	.octa 0x000000014657c4f4000000019b8d3f3e
-
-	/* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
-	.octa 0x0000000113742d7c00000001a874f11e
-
-	/* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
-	.octa 0x000000019c5920ba000000010d5a4254
-
-	/* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
-	.octa 0x000000005216d2d600000000bbb2f5d6
-
-	/* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
-	.octa 0x0000000136f5ad8a0000000179cc0e36
-
-	/* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
-	.octa 0x000000018b07beb600000001dca1da4a
-
-	/* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
-	.octa 0x00000000db1e93b000000000feb1a192
-
-	/* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
-	.octa 0x000000000b96fa3a00000000d1eeedd6
-
-	/* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
-	.octa 0x00000001d9968af0000000008fad9bb4
-
-	/* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
-	.octa 0x000000000e4a77a200000001884938e4
-
-	/* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
-	.octa 0x00000000508c2ac800000001bc2e9bc0
-
-	/* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
-	.octa 0x0000000021572a8000000001f9658a68
-
-	/* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
-	.octa 0x00000001b859daf2000000001b9224fc
-
-	/* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
-	.octa 0x000000016f7884740000000055b2fb84
-
-	/* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
-	.octa 0x00000001b438810e000000018b090348
-
-	/* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
-	.octa 0x0000000095ddc6f2000000011ccbd5ea
-
-	/* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
-	.octa 0x00000001d977c20c0000000007ae47f8
-
-	/* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
-	.octa 0x00000000ebedb99a0000000172acbec0
-
-	/* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
-	.octa 0x00000001df9e9e9200000001c6e3ff20
-
-	/* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
-	.octa 0x00000001a4a3f95200000000e1b38744
-
-	/* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
-	.octa 0x00000000e2f5122000000000791585b2
-
-	/* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
-	.octa 0x000000004aa01f3e00000000ac53b894
-
-	/* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
-	.octa 0x00000000b3e90a5800000001ed5f2cf4
-
-	/* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
-	.octa 0x000000000c9ca2aa00000001df48b2e0
-
-	/* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
-	.octa 0x000000015168231600000000049c1c62
-
-	/* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
-	.octa 0x0000000036fce78c000000017c460c12
-
-	/* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
-	.octa 0x000000009037dc10000000015be4da7e
-
-	/* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
-	.octa 0x00000000d3298582000000010f38f668
-
-	/* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
-	.octa 0x00000001b42e8ad60000000039f40a00
-
-	/* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
-	.octa 0x00000000142a983800000000bd4c10c4
-
-	/* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
-	.octa 0x0000000109c7f1900000000042db1d98
-
-	/* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
-	.octa 0x0000000056ff931000000001c905bae6
-
-	/* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
-	.octa 0x00000001594513aa00000000069d40ea
-
-	/* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
-	.octa 0x00000001e3b5b1e8000000008e4fbad0
-
-	/* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
-	.octa 0x000000011dd5fc080000000047bedd46
-
-	/* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
-	.octa 0x00000001675f0cc20000000026396bf8
-
-	/* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
-	.octa 0x00000000d1c8dd4400000000379beb92
-
-	/* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
-	.octa 0x0000000115ebd3d8000000000abae54a
-
-	/* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
-	.octa 0x00000001ecbd0dac0000000007e6a128
-
-	/* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
-	.octa 0x00000000cdf67af2000000000ade29d2
-
-	/* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
-	.octa 0x000000004c01ff4c00000000f974c45c
-
-	/* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
-	.octa 0x00000000f2d8657e00000000e77ac60a
-
-	/* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
-	.octa 0x000000006bae74c40000000145895816
-
-	/* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
-	.octa 0x0000000152af8aa00000000038e362be
-
-	/* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
-	.octa 0x0000000004663802000000007f991a64
-
-	/* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
-	.octa 0x00000001ab2f5afc00000000fa366d3a
-
-	/* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
-	.octa 0x0000000074a4ebd400000001a2bb34f0
-
-	/* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
-	.octa 0x00000001d7ab3a4c0000000028a9981e
-
-	/* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
-	.octa 0x00000001a8da60c600000001dbc672be
-
-	/* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
-	.octa 0x000000013cf6382000000000b04d77f6
-
-	/* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
-	.octa 0x00000000bec12e1e0000000124400d96
-
-	/* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
-	.octa 0x00000001c6368010000000014ca4b414
-
-	/* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
-	.octa 0x00000001e6e78758000000012fe2c938
-
-	/* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
-	.octa 0x000000008d7f2b3c00000001faed01e6
-
-	/* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
-	.octa 0x000000016b4a156e000000007e80ecfe
-
-	/* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
-	.octa 0x00000001c63cfeb60000000098daee94
-
-	/* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
-	.octa 0x000000015f902670000000010a04edea
-
-	/* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
-	.octa 0x00000001cd5de11e00000001c00b4524
-
-	/* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
-	.octa 0x000000001acaec540000000170296550
-
-	/* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
-	.octa 0x000000002bd0ca780000000181afaa48
-
-	/* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
-	.octa 0x0000000032d63d5c0000000185a31ffa
-
-	/* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
-	.octa 0x000000001c6d4e4c000000002469f608
-
-	/* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
-	.octa 0x0000000106a60b92000000006980102a
-
-	/* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
-	.octa 0x00000000d3855e120000000111ea9ca8
-
-	/* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
-	.octa 0x00000000e312563600000001bd1d29ce
-
-	/* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
-	.octa 0x000000009e8f7ea400000001b34b9580
-
-	/* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
-	.octa 0x00000001c82e562c000000003076054e
-
-	/* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
-	.octa 0x00000000ca9f09ce000000012a608ea4
-
-	/* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
-	.octa 0x00000000c63764e600000000784d05fe
-
-	/* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
-	.octa 0x0000000168d2e49e000000016ef0d82a
-
-	/* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
-	.octa 0x00000000e986c1480000000075bda454
-
-	/* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
-	.octa 0x00000000cfb65894000000003dc0a1c4
-
-	/* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
-	.octa 0x0000000111cadee400000000e9a5d8be
-
-	/* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
-	.octa 0x0000000171fb63ce00000001609bc4b4
-
-.short_constants:
-
-	/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
-	/* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
-	.octa 0x7fec2963e5bf80485cf015c388e56f72
-
-	/* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
-	.octa 0x38e888d4844752a9963a18920246e2e6
-
-	/* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
-	.octa 0x42316c00730206ad419a441956993a31
-
-	/* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
-	.octa 0x543d5c543e65ddf9924752ba2b830011
-
-	/* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
-	.octa 0x78e87aaf56767c9255bd7f9518e4a304
-
-	/* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
-	.octa 0x8f68fcec1903da7f6d76739fe0553f1e
-
-	/* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
-	.octa 0x3f4840246791d588c133722b1fe0b5c3
-
-	/* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
-	.octa 0x34c96751b04de25a64b67ee0e55ef1f3
-
-	/* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
-	.octa 0x156c8e180b4a395b069db049b8fdb1e7
-
-	/* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
-	.octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
-
-	/* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
-	.octa 0x041d37768cd75659817cdc5119b29a35
-
-	/* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
-	.octa 0x3a0777818cfaa9651ce9d94b36c41f1c
-
-	/* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
-	.octa 0x0e148e8252377a554f256efcb82be955
-
-	/* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
-	.octa 0x9c25531d19e65ddeec1631edb2dea967
-
-	/* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
-	.octa 0x790606ff9957c0a65d27e147510ac59a
-
-	/* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
-	.octa 0x82f63b786ea2d55ca66805eb18b8ea18
-
-
-.barrett_constants:
-	/* 33 bit reflected Barrett constant m - (4^32)/n */
-	.octa 0x000000000000000000000000dea713f1	/* x^64 div p(x)` */
-	/* 33 bit reflected Barrett constant n */
-	.octa 0x00000000000000000000000105ec76f1
-
-#define CRC_FUNCTION_NAME __crc32c_vpmsum
-#define REFLECT
-#include "crc32-vpmsum_core.S"
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
deleted file mode 100644
index 63760b7dbb76..000000000000
--- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/crc32.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/cpufeature.h>
-#include <asm/simd.h>
-#include <asm/switch_to.h>
-
-#define CHKSUM_BLOCK_SIZE	1
-#define CHKSUM_DIGEST_SIZE	4
-
-#define VMX_ALIGN		16
-#define VMX_ALIGN_MASK		(VMX_ALIGN-1)
-
-#define VECTOR_BREAKPOINT	512
-
-u32 __crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len);
-
-static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len)
-{
-	unsigned int prealign;
-	unsigned int tail;
-
-	if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || !crypto_simd_usable())
-		return __crc32c_le(crc, p, len);
-
-	if ((unsigned long)p & VMX_ALIGN_MASK) {
-		prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
-		crc = __crc32c_le(crc, p, prealign);
-		len -= prealign;
-		p += prealign;
-	}
-
-	if (len & ~VMX_ALIGN_MASK) {
-		preempt_disable();
-		pagefault_disable();
-		enable_kernel_altivec();
-		crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
-		disable_kernel_altivec();
-		pagefault_enable();
-		preempt_enable();
-	}
-
-	tail = len & VMX_ALIGN_MASK;
-	if (tail) {
-		p += len & ~VMX_ALIGN_MASK;
-		crc = __crc32c_le(crc, p, tail);
-	}
-
-	return crc;
-}
-
-static int crc32c_vpmsum_cra_init(struct crypto_tfm *tfm)
-{
-	u32 *key = crypto_tfm_ctx(tfm);
-
-	*key = ~0;
-
-	return 0;
-}
-
-/*
- * Setting the seed allows arbitrary accumulators and flexible XOR policy
- * If your algorithm starts with ~0, then XOR with ~0 before you set
- * the seed.
- */
-static int crc32c_vpmsum_setkey(struct crypto_shash *hash, const u8 *key,
-			       unsigned int keylen)
-{
-	u32 *mctx = crypto_shash_ctx(hash);
-
-	if (keylen != sizeof(u32))
-		return -EINVAL;
-	*mctx = le32_to_cpup((__le32 *)key);
-	return 0;
-}
-
-static int crc32c_vpmsum_init(struct shash_desc *desc)
-{
-	u32 *mctx = crypto_shash_ctx(desc->tfm);
-	u32 *crcp = shash_desc_ctx(desc);
-
-	*crcp = *mctx;
-
-	return 0;
-}
-
-static int crc32c_vpmsum_update(struct shash_desc *desc, const u8 *data,
-			       unsigned int len)
-{
-	u32 *crcp = shash_desc_ctx(desc);
-
-	*crcp = crc32c_vpmsum(*crcp, data, len);
-
-	return 0;
-}
-
-static int __crc32c_vpmsum_finup(u32 *crcp, const u8 *data, unsigned int len,
-				u8 *out)
-{
-	*(__le32 *)out = ~cpu_to_le32(crc32c_vpmsum(*crcp, data, len));
-
-	return 0;
-}
-
-static int crc32c_vpmsum_finup(struct shash_desc *desc, const u8 *data,
-			      unsigned int len, u8 *out)
-{
-	return __crc32c_vpmsum_finup(shash_desc_ctx(desc), data, len, out);
-}
-
-static int crc32c_vpmsum_final(struct shash_desc *desc, u8 *out)
-{
-	u32 *crcp = shash_desc_ctx(desc);
-
-	*(__le32 *)out = ~cpu_to_le32p(crcp);
-
-	return 0;
-}
-
-static int crc32c_vpmsum_digest(struct shash_desc *desc, const u8 *data,
-			       unsigned int len, u8 *out)
-{
-	return __crc32c_vpmsum_finup(crypto_shash_ctx(desc->tfm), data, len,
-				     out);
-}
-
-static struct shash_alg alg = {
-	.setkey		= crc32c_vpmsum_setkey,
-	.init		= crc32c_vpmsum_init,
-	.update		= crc32c_vpmsum_update,
-	.final		= crc32c_vpmsum_final,
-	.finup		= crc32c_vpmsum_finup,
-	.digest		= crc32c_vpmsum_digest,
-	.descsize	= sizeof(u32),
-	.digestsize	= CHKSUM_DIGEST_SIZE,
-	.base		= {
-		.cra_name		= "crc32c",
-		.cra_driver_name	= "crc32c-vpmsum",
-		.cra_priority		= 200,
-		.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
-		.cra_blocksize		= CHKSUM_BLOCK_SIZE,
-		.cra_ctxsize		= sizeof(u32),
-		.cra_module		= THIS_MODULE,
-		.cra_init		= crc32c_vpmsum_cra_init,
-	}
-};
-
-static int __init crc32c_vpmsum_mod_init(void)
-{
-	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
-		return -ENODEV;
-
-	return crypto_register_shash(&alg);
-}
-
-static void __exit crc32c_vpmsum_mod_fini(void)
-{
-	crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crc32c_vpmsum_mod_init);
-module_exit(crc32c_vpmsum_mod_fini);
-
-MODULE_AUTHOR("Anton Blanchard <anton@samba.org>");
-MODULE_DESCRIPTION("CRC32C using vector polynomial multiply-sum instructions");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("crc32c");
-MODULE_ALIAS_CRYPTO("crc32c-vpmsum");
diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_asm.S b/arch/powerpc/crypto/crct10dif-vpmsum_asm.S
deleted file mode 100644
index f0b93a0fe168..000000000000
--- a/arch/powerpc/crypto/crct10dif-vpmsum_asm.S
+++ /dev/null
@@ -1,845 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Calculate a CRC T10DIF  with vpmsum acceleration
- *
- * Constants generated by crc32-vpmsum, available at
- * https://github.com/antonblanchard/crc32-vpmsum
- *
- * crc32-vpmsum is
- * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
- */
-	.section	.rodata
-.balign 16
-
-.byteswap_constant:
-	/* byte reverse permute constant */
-	.octa 0x0F0E0D0C0B0A09080706050403020100
-
-.constants:
-
-	/* Reduce 262144 kbits to 1024 bits */
-	/* x^261184 mod p(x), x^261120 mod p(x) */
-	.octa 0x0000000056d300000000000052550000
-
-	/* x^260160 mod p(x), x^260096 mod p(x) */
-	.octa 0x00000000ee67000000000000a1e40000
-
-	/* x^259136 mod p(x), x^259072 mod p(x) */
-	.octa 0x0000000060830000000000004ad10000
-
-	/* x^258112 mod p(x), x^258048 mod p(x) */
-	.octa 0x000000008cfe0000000000009ab40000
-
-	/* x^257088 mod p(x), x^257024 mod p(x) */
-	.octa 0x000000003e93000000000000fdb50000
-
-	/* x^256064 mod p(x), x^256000 mod p(x) */
-	.octa 0x000000003c2000000000000045480000
-
-	/* x^255040 mod p(x), x^254976 mod p(x) */
-	.octa 0x00000000b1fc0000000000008d690000
-
-	/* x^254016 mod p(x), x^253952 mod p(x) */
-	.octa 0x00000000f82b00000000000024ad0000
-
-	/* x^252992 mod p(x), x^252928 mod p(x) */
-	.octa 0x0000000044420000000000009f1a0000
-
-	/* x^251968 mod p(x), x^251904 mod p(x) */
-	.octa 0x00000000e88c00000000000066ec0000
-
-	/* x^250944 mod p(x), x^250880 mod p(x) */
-	.octa 0x00000000385c000000000000c87d0000
-
-	/* x^249920 mod p(x), x^249856 mod p(x) */
-	.octa 0x000000003227000000000000c8ff0000
-
-	/* x^248896 mod p(x), x^248832 mod p(x) */
-	.octa 0x00000000a9a900000000000033440000
-
-	/* x^247872 mod p(x), x^247808 mod p(x) */
-	.octa 0x00000000abaa00000000000066eb0000
-
-	/* x^246848 mod p(x), x^246784 mod p(x) */
-	.octa 0x000000001ac3000000000000c4ef0000
-
-	/* x^245824 mod p(x), x^245760 mod p(x) */
-	.octa 0x0000000063f000000000000056f30000
-
-	/* x^244800 mod p(x), x^244736 mod p(x) */
-	.octa 0x0000000032cc00000000000002050000
-
-	/* x^243776 mod p(x), x^243712 mod p(x) */
-	.octa 0x00000000f8b5000000000000568e0000
-
-	/* x^242752 mod p(x), x^242688 mod p(x) */
-	.octa 0x000000008db100000000000064290000
-
-	/* x^241728 mod p(x), x^241664 mod p(x) */
-	.octa 0x0000000059ca0000000000006b660000
-
-	/* x^240704 mod p(x), x^240640 mod p(x) */
-	.octa 0x000000005f5c00000000000018f80000
-
-	/* x^239680 mod p(x), x^239616 mod p(x) */
-	.octa 0x0000000061af000000000000b6090000
-
-	/* x^238656 mod p(x), x^238592 mod p(x) */
-	.octa 0x00000000e29e000000000000099a0000
-
-	/* x^237632 mod p(x), x^237568 mod p(x) */
-	.octa 0x000000000975000000000000a8360000
-
-	/* x^236608 mod p(x), x^236544 mod p(x) */
-	.octa 0x0000000043900000000000004f570000
-
-	/* x^235584 mod p(x), x^235520 mod p(x) */
-	.octa 0x00000000f9cd000000000000134c0000
-
-	/* x^234560 mod p(x), x^234496 mod p(x) */
-	.octa 0x000000007c29000000000000ec380000
-
-	/* x^233536 mod p(x), x^233472 mod p(x) */
-	.octa 0x000000004c6a000000000000b0d10000
-
-	/* x^232512 mod p(x), x^232448 mod p(x) */
-	.octa 0x00000000e7290000000000007d3e0000
-
-	/* x^231488 mod p(x), x^231424 mod p(x) */
-	.octa 0x00000000f1ab000000000000f0b20000
-
-	/* x^230464 mod p(x), x^230400 mod p(x) */
-	.octa 0x0000000039db0000000000009c270000
-
-	/* x^229440 mod p(x), x^229376 mod p(x) */
-	.octa 0x000000005e2800000000000092890000
-
-	/* x^228416 mod p(x), x^228352 mod p(x) */
-	.octa 0x00000000d44e000000000000d5ee0000
-
-	/* x^227392 mod p(x), x^227328 mod p(x) */
-	.octa 0x00000000cd0a00000000000041f50000
-
-	/* x^226368 mod p(x), x^226304 mod p(x) */
-	.octa 0x00000000c5b400000000000010520000
-
-	/* x^225344 mod p(x), x^225280 mod p(x) */
-	.octa 0x00000000fd2100000000000042170000
-
-	/* x^224320 mod p(x), x^224256 mod p(x) */
-	.octa 0x000000002f2500000000000095c20000
-
-	/* x^223296 mod p(x), x^223232 mod p(x) */
-	.octa 0x000000001b0100000000000001ce0000
-
-	/* x^222272 mod p(x), x^222208 mod p(x) */
-	.octa 0x000000000d430000000000002aca0000
-
-	/* x^221248 mod p(x), x^221184 mod p(x) */
-	.octa 0x0000000030a6000000000000385e0000
-
-	/* x^220224 mod p(x), x^220160 mod p(x) */
-	.octa 0x00000000e37b0000000000006f7a0000
-
-	/* x^219200 mod p(x), x^219136 mod p(x) */
-	.octa 0x00000000873600000000000024320000
-
-	/* x^218176 mod p(x), x^218112 mod p(x) */
-	.octa 0x00000000e9fb000000000000bd9c0000
-
-	/* x^217152 mod p(x), x^217088 mod p(x) */
-	.octa 0x000000003b9500000000000054bc0000
-
-	/* x^216128 mod p(x), x^216064 mod p(x) */
-	.octa 0x00000000133e000000000000a4660000
-
-	/* x^215104 mod p(x), x^215040 mod p(x) */
-	.octa 0x00000000784500000000000079930000
-
-	/* x^214080 mod p(x), x^214016 mod p(x) */
-	.octa 0x00000000b9800000000000001bb80000
-
-	/* x^213056 mod p(x), x^212992 mod p(x) */
-	.octa 0x00000000687600000000000024400000
-
-	/* x^212032 mod p(x), x^211968 mod p(x) */
-	.octa 0x00000000aff300000000000029e10000
-
-	/* x^211008 mod p(x), x^210944 mod p(x) */
-	.octa 0x0000000024b50000000000005ded0000
-
-	/* x^209984 mod p(x), x^209920 mod p(x) */
-	.octa 0x0000000017e8000000000000b12e0000
-
-	/* x^208960 mod p(x), x^208896 mod p(x) */
-	.octa 0x00000000128400000000000026d20000
-
-	/* x^207936 mod p(x), x^207872 mod p(x) */
-	.octa 0x000000002115000000000000a32a0000
-
-	/* x^206912 mod p(x), x^206848 mod p(x) */
-	.octa 0x000000009595000000000000a1210000
-
-	/* x^205888 mod p(x), x^205824 mod p(x) */
-	.octa 0x00000000281e000000000000ee8b0000
-
-	/* x^204864 mod p(x), x^204800 mod p(x) */
-	.octa 0x0000000006010000000000003d0d0000
-
-	/* x^203840 mod p(x), x^203776 mod p(x) */
-	.octa 0x00000000e2b600000000000034e90000
-
-	/* x^202816 mod p(x), x^202752 mod p(x) */
-	.octa 0x000000001bd40000000000004cdb0000
-
-	/* x^201792 mod p(x), x^201728 mod p(x) */
-	.octa 0x00000000df2800000000000030e90000
-
-	/* x^200768 mod p(x), x^200704 mod p(x) */
-	.octa 0x0000000049c200000000000042590000
-
-	/* x^199744 mod p(x), x^199680 mod p(x) */
-	.octa 0x000000009b97000000000000df950000
-
-	/* x^198720 mod p(x), x^198656 mod p(x) */
-	.octa 0x000000006184000000000000da7b0000
-
-	/* x^197696 mod p(x), x^197632 mod p(x) */
-	.octa 0x00000000461700000000000012510000
-
-	/* x^196672 mod p(x), x^196608 mod p(x) */
-	.octa 0x000000009b40000000000000f37e0000
-
-	/* x^195648 mod p(x), x^195584 mod p(x) */
-	.octa 0x00000000eeb2000000000000ecf10000
-
-	/* x^194624 mod p(x), x^194560 mod p(x) */
-	.octa 0x00000000b2e800000000000050f20000
-
-	/* x^193600 mod p(x), x^193536 mod p(x) */
-	.octa 0x00000000f59a000000000000e0b30000
-
-	/* x^192576 mod p(x), x^192512 mod p(x) */
-	.octa 0x00000000467f0000000000004d5a0000
-
-	/* x^191552 mod p(x), x^191488 mod p(x) */
-	.octa 0x00000000da92000000000000bb010000
-
-	/* x^190528 mod p(x), x^190464 mod p(x) */
-	.octa 0x000000001e1000000000000022a40000
-
-	/* x^189504 mod p(x), x^189440 mod p(x) */
-	.octa 0x0000000058fe000000000000836f0000
-
-	/* x^188480 mod p(x), x^188416 mod p(x) */
-	.octa 0x00000000b9ce000000000000d78d0000
-
-	/* x^187456 mod p(x), x^187392 mod p(x) */
-	.octa 0x0000000022210000000000004f8d0000
-
-	/* x^186432 mod p(x), x^186368 mod p(x) */
-	.octa 0x00000000744600000000000033760000
-
-	/* x^185408 mod p(x), x^185344 mod p(x) */
-	.octa 0x000000001c2e000000000000a1e50000
-
-	/* x^184384 mod p(x), x^184320 mod p(x) */
-	.octa 0x00000000dcc8000000000000a1a40000
-
-	/* x^183360 mod p(x), x^183296 mod p(x) */
-	.octa 0x00000000910f00000000000019a20000
-
-	/* x^182336 mod p(x), x^182272 mod p(x) */
-	.octa 0x0000000055d5000000000000f6ae0000
-
-	/* x^181312 mod p(x), x^181248 mod p(x) */
-	.octa 0x00000000c8ba000000000000a7ac0000
-
-	/* x^180288 mod p(x), x^180224 mod p(x) */
-	.octa 0x0000000031f8000000000000eea20000
-
-	/* x^179264 mod p(x), x^179200 mod p(x) */
-	.octa 0x000000001966000000000000c4d90000
-
-	/* x^178240 mod p(x), x^178176 mod p(x) */
-	.octa 0x00000000b9810000000000002b470000
-
-	/* x^177216 mod p(x), x^177152 mod p(x) */
-	.octa 0x000000008303000000000000f7cf0000
-
-	/* x^176192 mod p(x), x^176128 mod p(x) */
-	.octa 0x000000002ce500000000000035b30000
-
-	/* x^175168 mod p(x), x^175104 mod p(x) */
-	.octa 0x000000002fae0000000000000c7c0000
-
-	/* x^174144 mod p(x), x^174080 mod p(x) */
-	.octa 0x00000000f50c0000000000009edf0000
-
-	/* x^173120 mod p(x), x^173056 mod p(x) */
-	.octa 0x00000000714f00000000000004cd0000
-
-	/* x^172096 mod p(x), x^172032 mod p(x) */
-	.octa 0x00000000c161000000000000541b0000
-
-	/* x^171072 mod p(x), x^171008 mod p(x) */
-	.octa 0x0000000021c8000000000000e2700000
-
-	/* x^170048 mod p(x), x^169984 mod p(x) */
-	.octa 0x00000000b93d00000000000009a60000
-
-	/* x^169024 mod p(x), x^168960 mod p(x) */
-	.octa 0x00000000fbcf000000000000761c0000
-
-	/* x^168000 mod p(x), x^167936 mod p(x) */
-	.octa 0x0000000026350000000000009db30000
-
-	/* x^166976 mod p(x), x^166912 mod p(x) */
-	.octa 0x00000000b64f0000000000003e9f0000
-
-	/* x^165952 mod p(x), x^165888 mod p(x) */
-	.octa 0x00000000bd0e00000000000078590000
-
-	/* x^164928 mod p(x), x^164864 mod p(x) */
-	.octa 0x00000000d9360000000000008bc80000
-
-	/* x^163904 mod p(x), x^163840 mod p(x) */
-	.octa 0x000000002f140000000000008c9f0000
-
-	/* x^162880 mod p(x), x^162816 mod p(x) */
-	.octa 0x000000006a270000000000006af70000
-
-	/* x^161856 mod p(x), x^161792 mod p(x) */
-	.octa 0x000000006685000000000000e5210000
-
-	/* x^160832 mod p(x), x^160768 mod p(x) */
-	.octa 0x0000000062da00000000000008290000
-
-	/* x^159808 mod p(x), x^159744 mod p(x) */
-	.octa 0x00000000bb4b000000000000e4d00000
-
-	/* x^158784 mod p(x), x^158720 mod p(x) */
-	.octa 0x00000000d2490000000000004ae10000
-
-	/* x^157760 mod p(x), x^157696 mod p(x) */
-	.octa 0x00000000c85b00000000000000e70000
-
-	/* x^156736 mod p(x), x^156672 mod p(x) */
-	.octa 0x00000000c37a00000000000015650000
-
-	/* x^155712 mod p(x), x^155648 mod p(x) */
-	.octa 0x0000000018530000000000001c2f0000
-
-	/* x^154688 mod p(x), x^154624 mod p(x) */
-	.octa 0x00000000b46600000000000037bd0000
-
-	/* x^153664 mod p(x), x^153600 mod p(x) */
-	.octa 0x00000000439b00000000000012190000
-
-	/* x^152640 mod p(x), x^152576 mod p(x) */
-	.octa 0x00000000b1260000000000005ece0000
-
-	/* x^151616 mod p(x), x^151552 mod p(x) */
-	.octa 0x00000000d8110000000000002a5e0000
-
-	/* x^150592 mod p(x), x^150528 mod p(x) */
-	.octa 0x00000000099f00000000000052330000
-
-	/* x^149568 mod p(x), x^149504 mod p(x) */
-	.octa 0x00000000f9f9000000000000f9120000
-
-	/* x^148544 mod p(x), x^148480 mod p(x) */
-	.octa 0x000000005cc00000000000000ddc0000
-
-	/* x^147520 mod p(x), x^147456 mod p(x) */
-	.octa 0x00000000343b00000000000012200000
-
-	/* x^146496 mod p(x), x^146432 mod p(x) */
-	.octa 0x000000009222000000000000d12b0000
-
-	/* x^145472 mod p(x), x^145408 mod p(x) */
-	.octa 0x00000000d781000000000000eb2d0000
-
-	/* x^144448 mod p(x), x^144384 mod p(x) */
-	.octa 0x000000000bf400000000000058970000
-
-	/* x^143424 mod p(x), x^143360 mod p(x) */
-	.octa 0x00000000094200000000000013690000
-
-	/* x^142400 mod p(x), x^142336 mod p(x) */
-	.octa 0x00000000d55100000000000051950000
-
-	/* x^141376 mod p(x), x^141312 mod p(x) */
-	.octa 0x000000008f11000000000000954b0000
-
-	/* x^140352 mod p(x), x^140288 mod p(x) */
-	.octa 0x00000000140f000000000000b29e0000
-
-	/* x^139328 mod p(x), x^139264 mod p(x) */
-	.octa 0x00000000c6db000000000000db5d0000
-
-	/* x^138304 mod p(x), x^138240 mod p(x) */
-	.octa 0x00000000715b000000000000dfaf0000
-
-	/* x^137280 mod p(x), x^137216 mod p(x) */
-	.octa 0x000000000dea000000000000e3b60000
-
-	/* x^136256 mod p(x), x^136192 mod p(x) */
-	.octa 0x000000006f94000000000000ddaf0000
-
-	/* x^135232 mod p(x), x^135168 mod p(x) */
-	.octa 0x0000000024e1000000000000e4f70000
-
-	/* x^134208 mod p(x), x^134144 mod p(x) */
-	.octa 0x000000008810000000000000aa110000
-
-	/* x^133184 mod p(x), x^133120 mod p(x) */
-	.octa 0x0000000030c2000000000000a8e60000
-
-	/* x^132160 mod p(x), x^132096 mod p(x) */
-	.octa 0x00000000e6d0000000000000ccf30000
-
-	/* x^131136 mod p(x), x^131072 mod p(x) */
-	.octa 0x000000004da000000000000079bf0000
-
-	/* x^130112 mod p(x), x^130048 mod p(x) */
-	.octa 0x000000007759000000000000b3a30000
-
-	/* x^129088 mod p(x), x^129024 mod p(x) */
-	.octa 0x00000000597400000000000028790000
-
-	/* x^128064 mod p(x), x^128000 mod p(x) */
-	.octa 0x000000007acd000000000000b5820000
-
-	/* x^127040 mod p(x), x^126976 mod p(x) */
-	.octa 0x00000000e6e400000000000026ad0000
-
-	/* x^126016 mod p(x), x^125952 mod p(x) */
-	.octa 0x000000006d49000000000000985b0000
-
-	/* x^124992 mod p(x), x^124928 mod p(x) */
-	.octa 0x000000000f0800000000000011520000
-
-	/* x^123968 mod p(x), x^123904 mod p(x) */
-	.octa 0x000000002c7f000000000000846c0000
-
-	/* x^122944 mod p(x), x^122880 mod p(x) */
-	.octa 0x000000005ce7000000000000ae1d0000
-
-	/* x^121920 mod p(x), x^121856 mod p(x) */
-	.octa 0x00000000d4cb000000000000e21d0000
-
-	/* x^120896 mod p(x), x^120832 mod p(x) */
-	.octa 0x000000003a2300000000000019bb0000
-
-	/* x^119872 mod p(x), x^119808 mod p(x) */
-	.octa 0x000000000e1700000000000095290000
-
-	/* x^118848 mod p(x), x^118784 mod p(x) */
-	.octa 0x000000006e6400000000000050d20000
-
-	/* x^117824 mod p(x), x^117760 mod p(x) */
-	.octa 0x000000008d5c0000000000000cd10000
-
-	/* x^116800 mod p(x), x^116736 mod p(x) */
-	.octa 0x00000000ef310000000000007b570000
-
-	/* x^115776 mod p(x), x^115712 mod p(x) */
-	.octa 0x00000000645d00000000000053d60000
-
-	/* x^114752 mod p(x), x^114688 mod p(x) */
-	.octa 0x0000000018fc00000000000077510000
-
-	/* x^113728 mod p(x), x^113664 mod p(x) */
-	.octa 0x000000000cb3000000000000a7b70000
-
-	/* x^112704 mod p(x), x^112640 mod p(x) */
-	.octa 0x00000000991b000000000000d0780000
-
-	/* x^111680 mod p(x), x^111616 mod p(x) */
-	.octa 0x00000000845a000000000000be3c0000
-
-	/* x^110656 mod p(x), x^110592 mod p(x) */
-	.octa 0x00000000d3a9000000000000df020000
-
-	/* x^109632 mod p(x), x^109568 mod p(x) */
-	.octa 0x0000000017d7000000000000063e0000
-
-	/* x^108608 mod p(x), x^108544 mod p(x) */
-	.octa 0x000000007a860000000000008ab40000
-
-	/* x^107584 mod p(x), x^107520 mod p(x) */
-	.octa 0x00000000fd7c000000000000c7bd0000
-
-	/* x^106560 mod p(x), x^106496 mod p(x) */
-	.octa 0x00000000a56b000000000000efd60000
-
-	/* x^105536 mod p(x), x^105472 mod p(x) */
-	.octa 0x0000000010e400000000000071380000
-
-	/* x^104512 mod p(x), x^104448 mod p(x) */
-	.octa 0x00000000994500000000000004d30000
-
-	/* x^103488 mod p(x), x^103424 mod p(x) */
-	.octa 0x00000000b83c0000000000003b0e0000
-
-	/* x^102464 mod p(x), x^102400 mod p(x) */
-	.octa 0x00000000d6c10000000000008b020000
-
-	/* x^101440 mod p(x), x^101376 mod p(x) */
-	.octa 0x000000009efc000000000000da940000
-
-	/* x^100416 mod p(x), x^100352 mod p(x) */
-	.octa 0x000000005e87000000000000f9f70000
-
-	/* x^99392 mod p(x), x^99328 mod p(x) */
-	.octa 0x000000006c9b00000000000045e40000
-
-	/* x^98368 mod p(x), x^98304 mod p(x) */
-	.octa 0x00000000178a00000000000083940000
-
-	/* x^97344 mod p(x), x^97280 mod p(x) */
-	.octa 0x00000000f0c8000000000000f0a00000
-
-	/* x^96320 mod p(x), x^96256 mod p(x) */
-	.octa 0x00000000f699000000000000b74b0000
-
-	/* x^95296 mod p(x), x^95232 mod p(x) */
-	.octa 0x00000000316d000000000000c1cf0000
-
-	/* x^94272 mod p(x), x^94208 mod p(x) */
-	.octa 0x00000000987e00000000000072680000
-
-	/* x^93248 mod p(x), x^93184 mod p(x) */
-	.octa 0x00000000acff000000000000e0ab0000
-
-	/* x^92224 mod p(x), x^92160 mod p(x) */
-	.octa 0x00000000a1f6000000000000c5a80000
-
-	/* x^91200 mod p(x), x^91136 mod p(x) */
-	.octa 0x0000000061bd000000000000cf690000
-
-	/* x^90176 mod p(x), x^90112 mod p(x) */
-	.octa 0x00000000c9f2000000000000cbcc0000
-
-	/* x^89152 mod p(x), x^89088 mod p(x) */
-	.octa 0x000000005a33000000000000de050000
-
-	/* x^88128 mod p(x), x^88064 mod p(x) */
-	.octa 0x00000000e416000000000000ccd70000
-
-	/* x^87104 mod p(x), x^87040 mod p(x) */
-	.octa 0x0000000058930000000000002f670000
-
-	/* x^86080 mod p(x), x^86016 mod p(x) */
-	.octa 0x00000000a9d3000000000000152f0000
-
-	/* x^85056 mod p(x), x^84992 mod p(x) */
-	.octa 0x00000000c114000000000000ecc20000
-
-	/* x^84032 mod p(x), x^83968 mod p(x) */
-	.octa 0x00000000b9270000000000007c890000
-
-	/* x^83008 mod p(x), x^82944 mod p(x) */
-	.octa 0x000000002e6000000000000006ee0000
-
-	/* x^81984 mod p(x), x^81920 mod p(x) */
-	.octa 0x00000000dfc600000000000009100000
-
-	/* x^80960 mod p(x), x^80896 mod p(x) */
-	.octa 0x000000004911000000000000ad4e0000
-
-	/* x^79936 mod p(x), x^79872 mod p(x) */
-	.octa 0x00000000ae1b000000000000b04d0000
-
-	/* x^78912 mod p(x), x^78848 mod p(x) */
-	.octa 0x0000000005fa000000000000e9900000
-
-	/* x^77888 mod p(x), x^77824 mod p(x) */
-	.octa 0x0000000004a1000000000000cc6f0000
-
-	/* x^76864 mod p(x), x^76800 mod p(x) */
-	.octa 0x00000000af73000000000000ed110000
-
-	/* x^75840 mod p(x), x^75776 mod p(x) */
-	.octa 0x0000000082530000000000008f7e0000
-
-	/* x^74816 mod p(x), x^74752 mod p(x) */
-	.octa 0x00000000cfdc000000000000594f0000
-
-	/* x^73792 mod p(x), x^73728 mod p(x) */
-	.octa 0x00000000a6b6000000000000a8750000
-
-	/* x^72768 mod p(x), x^72704 mod p(x) */
-	.octa 0x00000000fd76000000000000aa0c0000
-
-	/* x^71744 mod p(x), x^71680 mod p(x) */
-	.octa 0x0000000006f500000000000071db0000
-
-	/* x^70720 mod p(x), x^70656 mod p(x) */
-	.octa 0x0000000037ca000000000000ab0c0000
-
-	/* x^69696 mod p(x), x^69632 mod p(x) */
-	.octa 0x00000000d7ab000000000000b7a00000
-
-	/* x^68672 mod p(x), x^68608 mod p(x) */
-	.octa 0x00000000440800000000000090d30000
-
-	/* x^67648 mod p(x), x^67584 mod p(x) */
-	.octa 0x00000000186100000000000054730000
-
-	/* x^66624 mod p(x), x^66560 mod p(x) */
-	.octa 0x000000007368000000000000a3a20000
-
-	/* x^65600 mod p(x), x^65536 mod p(x) */
-	.octa 0x0000000026d0000000000000f9040000
-
-	/* x^64576 mod p(x), x^64512 mod p(x) */
-	.octa 0x00000000fe770000000000009c0a0000
-
-	/* x^63552 mod p(x), x^63488 mod p(x) */
-	.octa 0x000000002cba000000000000d1e70000
-
-	/* x^62528 mod p(x), x^62464 mod p(x) */
-	.octa 0x00000000f8bd0000000000005ac10000
-
-	/* x^61504 mod p(x), x^61440 mod p(x) */
-	.octa 0x000000007372000000000000d68d0000
-
-	/* x^60480 mod p(x), x^60416 mod p(x) */
-	.octa 0x00000000f37f00000000000089f60000
-
-	/* x^59456 mod p(x), x^59392 mod p(x) */
-	.octa 0x00000000078400000000000008a90000
-
-	/* x^58432 mod p(x), x^58368 mod p(x) */
-	.octa 0x00000000d3e400000000000042360000
-
-	/* x^57408 mod p(x), x^57344 mod p(x) */
-	.octa 0x00000000eba800000000000092d50000
-
-	/* x^56384 mod p(x), x^56320 mod p(x) */
-	.octa 0x00000000afbe000000000000b4d50000
-
-	/* x^55360 mod p(x), x^55296 mod p(x) */
-	.octa 0x00000000d8ca000000000000c9060000
-
-	/* x^54336 mod p(x), x^54272 mod p(x) */
-	.octa 0x00000000c2d00000000000008f4f0000
-
-	/* x^53312 mod p(x), x^53248 mod p(x) */
-	.octa 0x00000000373200000000000028690000
-
-	/* x^52288 mod p(x), x^52224 mod p(x) */
-	.octa 0x0000000046ae000000000000c3b30000
-
-	/* x^51264 mod p(x), x^51200 mod p(x) */
-	.octa 0x00000000b243000000000000f8700000
-
-	/* x^50240 mod p(x), x^50176 mod p(x) */
-	.octa 0x00000000f7f500000000000029eb0000
-
-	/* x^49216 mod p(x), x^49152 mod p(x) */
-	.octa 0x000000000c7e000000000000fe730000
-
-	/* x^48192 mod p(x), x^48128 mod p(x) */
-	.octa 0x00000000c38200000000000096000000
-
-	/* x^47168 mod p(x), x^47104 mod p(x) */
-	.octa 0x000000008956000000000000683c0000
-
-	/* x^46144 mod p(x), x^46080 mod p(x) */
-	.octa 0x00000000422d0000000000005f1e0000
-
-	/* x^45120 mod p(x), x^45056 mod p(x) */
-	.octa 0x00000000ac0f0000000000006f810000
-
-	/* x^44096 mod p(x), x^44032 mod p(x) */
-	.octa 0x00000000ce30000000000000031f0000
-
-	/* x^43072 mod p(x), x^43008 mod p(x) */
-	.octa 0x000000003d43000000000000455a0000
-
-	/* x^42048 mod p(x), x^41984 mod p(x) */
-	.octa 0x000000007ebe000000000000a6050000
-
-	/* x^41024 mod p(x), x^40960 mod p(x) */
-	.octa 0x00000000976e00000000000077eb0000
-
-	/* x^40000 mod p(x), x^39936 mod p(x) */
-	.octa 0x000000000872000000000000389c0000
-
-	/* x^38976 mod p(x), x^38912 mod p(x) */
-	.octa 0x000000008979000000000000c7b20000
-
-	/* x^37952 mod p(x), x^37888 mod p(x) */
-	.octa 0x000000005c1e0000000000001d870000
-
-	/* x^36928 mod p(x), x^36864 mod p(x) */
-	.octa 0x00000000aebb00000000000045810000
-
-	/* x^35904 mod p(x), x^35840 mod p(x) */
-	.octa 0x000000004f7e0000000000006d4a0000
-
-	/* x^34880 mod p(x), x^34816 mod p(x) */
-	.octa 0x00000000ea98000000000000b9200000
-
-	/* x^33856 mod p(x), x^33792 mod p(x) */
-	.octa 0x00000000f39600000000000022f20000
-
-	/* x^32832 mod p(x), x^32768 mod p(x) */
-	.octa 0x000000000bc500000000000041ca0000
-
-	/* x^31808 mod p(x), x^31744 mod p(x) */
-	.octa 0x00000000786400000000000078500000
-
-	/* x^30784 mod p(x), x^30720 mod p(x) */
-	.octa 0x00000000be970000000000009e7e0000
-
-	/* x^29760 mod p(x), x^29696 mod p(x) */
-	.octa 0x00000000dd6d000000000000a53c0000
-
-	/* x^28736 mod p(x), x^28672 mod p(x) */
-	.octa 0x000000004c3f00000000000039340000
-
-	/* x^27712 mod p(x), x^27648 mod p(x) */
-	.octa 0x0000000093a4000000000000b58e0000
-
-	/* x^26688 mod p(x), x^26624 mod p(x) */
-	.octa 0x0000000050fb00000000000062d40000
-
-	/* x^25664 mod p(x), x^25600 mod p(x) */
-	.octa 0x00000000f505000000000000a26f0000
-
-	/* x^24640 mod p(x), x^24576 mod p(x) */
-	.octa 0x0000000064f900000000000065e60000
-
-	/* x^23616 mod p(x), x^23552 mod p(x) */
-	.octa 0x00000000e8c2000000000000aad90000
-
-	/* x^22592 mod p(x), x^22528 mod p(x) */
-	.octa 0x00000000720b000000000000a3b00000
-
-	/* x^21568 mod p(x), x^21504 mod p(x) */
-	.octa 0x00000000e992000000000000d2680000
-
-	/* x^20544 mod p(x), x^20480 mod p(x) */
-	.octa 0x000000009132000000000000cf4c0000
-
-	/* x^19520 mod p(x), x^19456 mod p(x) */
-	.octa 0x00000000608a00000000000076610000
-
-	/* x^18496 mod p(x), x^18432 mod p(x) */
-	.octa 0x000000009948000000000000fb9f0000
-
-	/* x^17472 mod p(x), x^17408 mod p(x) */
-	.octa 0x00000000173000000000000003770000
-
-	/* x^16448 mod p(x), x^16384 mod p(x) */
-	.octa 0x000000006fe300000000000004880000
-
-	/* x^15424 mod p(x), x^15360 mod p(x) */
-	.octa 0x00000000e15300000000000056a70000
-
-	/* x^14400 mod p(x), x^14336 mod p(x) */
-	.octa 0x0000000092d60000000000009dfd0000
-
-	/* x^13376 mod p(x), x^13312 mod p(x) */
-	.octa 0x0000000002fd00000000000074c80000
-
-	/* x^12352 mod p(x), x^12288 mod p(x) */
-	.octa 0x00000000c78b000000000000a3ec0000
-
-	/* x^11328 mod p(x), x^11264 mod p(x) */
-	.octa 0x000000009262000000000000b3530000
-
-	/* x^10304 mod p(x), x^10240 mod p(x) */
-	.octa 0x0000000084f200000000000047bf0000
-
-	/* x^9280 mod p(x), x^9216 mod p(x) */
-	.octa 0x0000000067ee000000000000e97c0000
-
-	/* x^8256 mod p(x), x^8192 mod p(x) */
-	.octa 0x00000000535b00000000000091e10000
-
-	/* x^7232 mod p(x), x^7168 mod p(x) */
-	.octa 0x000000007ebb00000000000055060000
-
-	/* x^6208 mod p(x), x^6144 mod p(x) */
-	.octa 0x00000000c6a1000000000000fd360000
-
-	/* x^5184 mod p(x), x^5120 mod p(x) */
-	.octa 0x000000001be500000000000055860000
-
-	/* x^4160 mod p(x), x^4096 mod p(x) */
-	.octa 0x00000000ae0e0000000000005bd00000
-
-	/* x^3136 mod p(x), x^3072 mod p(x) */
-	.octa 0x0000000022040000000000008db20000
-
-	/* x^2112 mod p(x), x^2048 mod p(x) */
-	.octa 0x00000000c9eb000000000000efe20000
-
-	/* x^1088 mod p(x), x^1024 mod p(x) */
-	.octa 0x0000000039b400000000000051d10000
-
-.short_constants:
-
-	/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
-	/* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */
-	.octa 0xefe20000dccf00009440000033590000
-
-	/* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */
-	.octa 0xee6300002f3f000062180000e0ed0000
-
-	/* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */
-	.octa 0xcf5f000017ef0000ccbe000023d30000
-
-	/* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */
-	.octa 0x6d0c0000a30e00000920000042630000
-
-	/* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */
-	.octa 0x21d30000932b0000a7a00000efcc0000
-
-	/* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */
-	.octa 0x10be00000b310000666f00000d1c0000
-
-	/* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */
-	.octa 0x1f240000ce9e0000caad0000589e0000
-
-	/* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */
-	.octa 0x29610000d02b000039b400007cf50000
-
-	/* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */
-	.octa 0x51d100009d9d00003c0e0000bfd60000
-
-	/* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */
-	.octa 0xda390000ceae000013830000713c0000
-
-	/* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */
-	.octa 0xb67800001e16000085c0000080a60000
-
-	/* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */
-	.octa 0x0db40000f7f90000371d0000e6580000
-
-	/* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */
-	.octa 0x87e70000044c0000aadb0000a4970000
-
-	/* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */
-	.octa 0x1f990000ad180000d8b30000e7b50000
-
-	/* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */
-	.octa 0xbe6c00006ee300004c1a000006df0000
-
-	/* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */
-	.octa 0xfb0b00002d560000136800008bb70000
-
-
-.barrett_constants:
-	/* Barrett constant m - (4^32)/n */
-	.octa 0x000000000000000000000001f65a57f8	/* x^64 div p(x) */
-	/* Barrett constant n */
-	.octa 0x0000000000000000000000018bb70000
-
-#define CRC_FUNCTION_NAME __crct10dif_vpmsum
-#include "crc32-vpmsum_core.S"
diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
deleted file mode 100644
index 1dc8b6915178..000000000000
--- a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
+++ /dev/null
@@ -1,126 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Calculate a CRC T10-DIF with vpmsum acceleration
- *
- * Copyright 2017, Daniel Axtens, IBM Corporation.
- * [based on crc32c-vpmsum_glue.c]
- */
-
-#include <linux/crc-t10dif.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/cpufeature.h>
-#include <asm/simd.h>
-#include <asm/switch_to.h>
-
-#define VMX_ALIGN		16
-#define VMX_ALIGN_MASK		(VMX_ALIGN-1)
-
-#define VECTOR_BREAKPOINT	64
-
-u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
-
-static u16 crct10dif_vpmsum(u16 crci, unsigned char const *p, size_t len)
-{
-	unsigned int prealign;
-	unsigned int tail;
-	u32 crc = crci;
-
-	if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || !crypto_simd_usable())
-		return crc_t10dif_generic(crc, p, len);
-
-	if ((unsigned long)p & VMX_ALIGN_MASK) {
-		prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
-		crc = crc_t10dif_generic(crc, p, prealign);
-		len -= prealign;
-		p += prealign;
-	}
-
-	if (len & ~VMX_ALIGN_MASK) {
-		crc <<= 16;
-		preempt_disable();
-		pagefault_disable();
-		enable_kernel_altivec();
-		crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
-		disable_kernel_altivec();
-		pagefault_enable();
-		preempt_enable();
-		crc >>= 16;
-	}
-
-	tail = len & VMX_ALIGN_MASK;
-	if (tail) {
-		p += len & ~VMX_ALIGN_MASK;
-		crc = crc_t10dif_generic(crc, p, tail);
-	}
-
-	return crc & 0xffff;
-}
-
-static int crct10dif_vpmsum_init(struct shash_desc *desc)
-{
-	u16 *crc = shash_desc_ctx(desc);
-
-	*crc = 0;
-	return 0;
-}
-
-static int crct10dif_vpmsum_update(struct shash_desc *desc, const u8 *data,
-			    unsigned int length)
-{
-	u16 *crc = shash_desc_ctx(desc);
-
-	*crc = crct10dif_vpmsum(*crc, data, length);
-
-	return 0;
-}
-
-
-static int crct10dif_vpmsum_final(struct shash_desc *desc, u8 *out)
-{
-	u16 *crcp = shash_desc_ctx(desc);
-
-	*(u16 *)out = *crcp;
-	return 0;
-}
-
-static struct shash_alg alg = {
-	.init		= crct10dif_vpmsum_init,
-	.update		= crct10dif_vpmsum_update,
-	.final		= crct10dif_vpmsum_final,
-	.descsize	= CRC_T10DIF_DIGEST_SIZE,
-	.digestsize	= CRC_T10DIF_DIGEST_SIZE,
-	.base		= {
-		.cra_name		= "crct10dif",
-		.cra_driver_name	= "crct10dif-vpmsum",
-		.cra_priority		= 200,
-		.cra_blocksize		= CRC_T10DIF_BLOCK_SIZE,
-		.cra_module		= THIS_MODULE,
-	}
-};
-
-static int __init crct10dif_vpmsum_mod_init(void)
-{
-	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
-		return -ENODEV;
-
-	return crypto_register_shash(&alg);
-}
-
-static void __exit crct10dif_vpmsum_mod_fini(void)
-{
-	crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crct10dif_vpmsum_mod_init);
-module_exit(crct10dif_vpmsum_mod_fini);
-
-MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
-MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("crct10dif");
-MODULE_ALIAS_CRYPTO("crct10dif-vpmsum");
diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c
new file mode 100644
index 000000000000..f7810be0b292
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le-core.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2024- IBM Corp.
+ *
+ * X25519 scalar multiplication with 51 bits limbs for PPC64le.
+ *   Based on RFC7748 and AArch64 optimized implementation for X25519
+ *     - Algorithm 1 Scalar multiplication of a variable point
+ */
+
+#include <crypto/curve25519.h>
+#include <crypto/internal/kpp.h>
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+
+#include <linux/cpufeature.h>
+#include <linux/processor.h>
+
+typedef uint64_t fe51[5];
+
+asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f);
+asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f);
+asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n);
+asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s);
+asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h);
+asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit);
+
+#define fmul x25519_fe51_mul
+#define fsqr x25519_fe51_sqr
+#define fmul121666 x25519_fe51_mul121666
+#define fe51_tobytes x25519_fe51_tobytes
+
+static void fadd(fe51 h, const fe51 f, const fe51 g)
+{
+	h[0] = f[0] + g[0];
+	h[1] = f[1] + g[1];
+	h[2] = f[2] + g[2];
+	h[3] = f[3] + g[3];
+	h[4] = f[4] + g[4];
+}
+
+/*
+ * Prime = 2 ** 255 - 19, 255 bits
+ *    (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed)
+ *
+ * Prime in 5 51-bit limbs
+ */
+static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff};
+
+static void fsub(fe51 h, const fe51 f, const fe51 g)
+{
+	h[0] = (f[0] + ((prime51[0] * 2))) - g[0];
+	h[1] = (f[1] + ((prime51[1] * 2))) - g[1];
+	h[2] = (f[2] + ((prime51[2] * 2))) - g[2];
+	h[3] = (f[3] + ((prime51[3] * 2))) - g[3];
+	h[4] = (f[4] + ((prime51[4] * 2))) - g[4];
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+	/*
+	 * Make sure 64-bit aligned.
+	 */
+	unsigned char sbuf[32+8];
+	unsigned char *sb = PTR_ALIGN((void *)sbuf, 8);
+
+	memcpy(sb, s, 32);
+	x25519_fe51_frombytes(h, sb);
+}
+
+static void finv(fe51 o, const fe51 i)
+{
+	fe51 a0, b, c, t00;
+
+	fsqr(a0, i);
+	x25519_fe51_sqr_times(t00, a0, 2);
+
+	fmul(b, t00, i);
+	fmul(a0, b, a0);
+
+	fsqr(t00, a0);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 5);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 10);
+
+	fmul(c, t00, b);
+	x25519_fe51_sqr_times(t00, c, 20);
+
+	fmul(t00, t00, c);
+	x25519_fe51_sqr_times(t00, t00, 10);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 50);
+
+	fmul(c, t00, b);
+	x25519_fe51_sqr_times(t00, c, 100);
+
+	fmul(t00, t00, c);
+	x25519_fe51_sqr_times(t00, t00, 50);
+
+	fmul(t00, t00, b);
+	x25519_fe51_sqr_times(t00, t00, 5);
+
+	fmul(o, t00, a0);
+}
+
+static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32],
+			    const uint8_t point[32])
+{
+	fe51 x1, x2, z2, x3, z3;
+	uint8_t s[32];
+	unsigned int swap = 0;
+	int i;
+
+	memcpy(s, scalar, 32);
+	s[0]  &= 0xf8;
+	s[31] &= 0x7f;
+	s[31] |= 0x40;
+	fe51_frombytes(x1, point);
+
+	z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0;
+	x3[0] = x1[0];
+	x3[1] = x1[1];
+	x3[2] = x1[2];
+	x3[3] = x1[3];
+	x3[4] = x1[4];
+
+	x2[0] = z3[0] = 1;
+	x2[1] = z3[1] = 0;
+	x2[2] = z3[2] = 0;
+	x2[3] = z3[3] = 0;
+	x2[4] = z3[4] = 0;
+
+	for (i = 254; i >= 0; --i) {
+		unsigned int k_t = 1 & (s[i / 8] >> (i & 7));
+		fe51 a, b, c, d, e;
+		fe51 da, cb, aa, bb;
+		fe51 dacb_p, dacb_m;
+
+		swap ^= k_t;
+		x25519_cswap(x2, x3, swap);
+		x25519_cswap(z2, z3, swap);
+		swap = k_t;
+
+		fsub(b, x2, z2);		// B = x_2 - z_2
+		fadd(a, x2, z2);		// A = x_2 + z_2
+		fsub(d, x3, z3);		// D = x_3 - z_3
+		fadd(c, x3, z3);		// C = x_3 + z_3
+
+		fsqr(bb, b);			// BB = B^2
+		fsqr(aa, a);			// AA = A^2
+		fmul(da, d, a);			// DA = D * A
+		fmul(cb, c, b);			// CB = C * B
+
+		fsub(e, aa, bb);		// E = AA - BB
+		fmul(x2, aa, bb);		// x2 = AA * BB
+		fadd(dacb_p, da, cb);		// DA + CB
+		fsub(dacb_m, da, cb);		// DA - CB
+
+		fmul121666(z3, e);		// 121666 * E
+		fsqr(z2, dacb_m);		// (DA - CB)^2
+		fsqr(x3, dacb_p);		// x3 = (DA + CB)^2
+		fadd(b, bb, z3);		// BB + 121666 * E
+		fmul(z3, x1, z2);		// z3 = x1 * (DA - CB)^2
+		fmul(z2, e, b);		// z2 = e * (BB + (DA + CB)^2)
+	}
+
+	finv(z2, z2);
+	fmul(x2, x2, z2);
+	fe51_tobytes(out, x2);
+}
+
+void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+		     const u8 secret[CURVE25519_KEY_SIZE],
+		     const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+	curve25519_fe51(mypublic, secret, basepoint);
+}
+EXPORT_SYMBOL(curve25519_arch);
+
+void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+			  const u8 secret[CURVE25519_KEY_SIZE])
+{
+	curve25519_fe51(pub, secret, curve25519_base_point);
+}
+EXPORT_SYMBOL(curve25519_base_arch);
+
+static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
+				 unsigned int len)
+{
+	u8 *secret = kpp_tfm_ctx(tfm);
+
+	if (!len)
+		curve25519_generate_secret(secret);
+	else if (len == CURVE25519_KEY_SIZE &&
+		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
+		memcpy(secret, buf, CURVE25519_KEY_SIZE);
+	else
+		return -EINVAL;
+	return 0;
+}
+
+static int curve25519_generate_public_key(struct kpp_request *req)
+{
+	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
+	const u8 *secret = kpp_tfm_ctx(tfm);
+	u8 buf[CURVE25519_KEY_SIZE];
+	int copied, nbytes;
+
+	if (req->src)
+		return -EINVAL;
+
+	curve25519_base_arch(buf, secret);
+
+	/* might want less than we've got */
+	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
+	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
+								nbytes),
+				     buf, nbytes);
+	if (copied != nbytes)
+		return -EINVAL;
+	return 0;
+}
+
+static int curve25519_compute_shared_secret(struct kpp_request *req)
+{
+	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
+	const u8 *secret = kpp_tfm_ctx(tfm);
+	u8 public_key[CURVE25519_KEY_SIZE];
+	u8 buf[CURVE25519_KEY_SIZE];
+	int copied, nbytes;
+
+	if (!req->src)
+		return -EINVAL;
+
+	copied = sg_copy_to_buffer(req->src,
+				   sg_nents_for_len(req->src,
+						    CURVE25519_KEY_SIZE),
+				   public_key, CURVE25519_KEY_SIZE);
+	if (copied != CURVE25519_KEY_SIZE)
+		return -EINVAL;
+
+	curve25519_arch(buf, secret, public_key);
+
+	/* might want less than we've got */
+	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
+	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
+								nbytes),
+				     buf, nbytes);
+	if (copied != nbytes)
+		return -EINVAL;
+	return 0;
+}
+
+static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
+{
+	return CURVE25519_KEY_SIZE;
+}
+
+static struct kpp_alg curve25519_alg = {
+	.base.cra_name		= "curve25519",
+	.base.cra_driver_name	= "curve25519-ppc64le",
+	.base.cra_priority	= 200,
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,
+
+	.set_secret		= curve25519_set_secret,
+	.generate_public_key	= curve25519_generate_public_key,
+	.compute_shared_secret	= curve25519_compute_shared_secret,
+	.max_size		= curve25519_max_size,
+};
+
+
+static int __init curve25519_mod_init(void)
+{
+	return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
+		crypto_register_kpp(&curve25519_alg) : 0;
+}
+
+static void __exit curve25519_mod_exit(void)
+{
+	if (IS_REACHABLE(CONFIG_CRYPTO_KPP))
+		crypto_unregister_kpp(&curve25519_alg);
+}
+
+module_init(curve25519_mod_init);
+module_exit(curve25519_mod_exit);
+
+MODULE_ALIAS_CRYPTO("curve25519");
+MODULE_ALIAS_CRYPTO("curve25519-ppc64le");
+MODULE_DESCRIPTION("PPC64le Curve25519 scalar multiplication with 51 bits limbs");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Danny Tsen <dtsen@us.ibm.com>");
diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
new file mode 100644
index 000000000000..06c1febe24b9
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
@@ -0,0 +1,671 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://github.com/dot-asm/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#       * Redistributions of source code must retain copyright notices,
+#         this list of conditions and the following disclaimer.
+#
+#       * Redistributions in binary form must reproduce the above
+#         copyright notice, this list of conditions and the following
+#         disclaimer in the documentation and/or other materials
+#         provided with the distribution.
+#
+#       * Neither the name of the CRYPTOGAMS nor the names of its
+#         copyright holder and contributors may be used to endorse or
+#         promote products derived from this software without specific
+#         prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+#
+# ====================================================================
+# Written and Modified by Danny Tsen <dtsen@us.ibm.com>
+# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes
+#   and x25519_cswap
+#
+# Copyright 2024- IBM Corp.
+#
+# X25519 lower-level primitives for PPC64.
+#
+
+#include <linux/linkage.h>
+
+.text
+
+.align	5
+SYM_FUNC_START(x25519_fe51_mul)
+
+	stdu	1,-144(1)
+	std	21,56(1)
+	std	22,64(1)
+	std	23,72(1)
+	std	24,80(1)
+	std	25,88(1)
+	std	26,96(1)
+	std	27,104(1)
+	std	28,112(1)
+	std	29,120(1)
+	std	30,128(1)
+	std	31,136(1)
+
+	ld	6,0(5)
+	ld	7,0(4)
+	ld	8,8(4)
+	ld	9,16(4)
+	ld	10,24(4)
+	ld	11,32(4)
+
+	mulld	22,7,6
+	mulhdu	23,7,6
+
+	mulld	24,8,6
+	mulhdu	25,8,6
+
+	mulld	30,11,6
+	mulhdu	31,11,6
+	ld	4,8(5)
+	mulli	11,11,19
+
+	mulld	26,9,6
+	mulhdu	27,9,6
+
+	mulld	28,10,6
+	mulhdu	29,10,6
+	mulld	12,11,4
+	mulhdu	21,11,4
+	addc	22,22,12
+	adde	23,23,21
+
+	mulld	12,7,4
+	mulhdu	21,7,4
+	addc	24,24,12
+	adde	25,25,21
+
+	mulld	12,10,4
+	mulhdu	21,10,4
+	ld	6,16(5)
+	mulli	10,10,19
+	addc	30,30,12
+	adde	31,31,21
+
+	mulld	12,8,4
+	mulhdu	21,8,4
+	addc	26,26,12
+	adde	27,27,21
+
+	mulld	12,9,4
+	mulhdu	21,9,4
+	addc	28,28,12
+	adde	29,29,21
+	mulld	12,10,6
+	mulhdu	21,10,6
+	addc	22,22,12
+	adde	23,23,21
+
+	mulld	12,11,6
+	mulhdu	21,11,6
+	addc	24,24,12
+	adde	25,25,21
+
+	mulld	12,9,6
+	mulhdu	21,9,6
+	ld	4,24(5)
+	mulli	9,9,19
+	addc	30,30,12
+	adde	31,31,21
+
+	mulld	12,7,6
+	mulhdu	21,7,6
+	addc	26,26,12
+	adde	27,27,21
+
+	mulld	12,8,6
+	mulhdu	21,8,6
+	addc	28,28,12
+	adde	29,29,21
+	mulld	12,9,4
+	mulhdu	21,9,4
+	addc	22,22,12
+	adde	23,23,21
+
+	mulld	12,10,4
+	mulhdu	21,10,4
+	addc	24,24,12
+	adde	25,25,21
+
+	mulld	12,8,4
+	mulhdu	21,8,4
+	ld	6,32(5)
+	mulli	8,8,19
+	addc	30,30,12
+	adde	31,31,21
+
+	mulld	12,11,4
+	mulhdu	21,11,4
+	addc	26,26,12
+	adde	27,27,21
+
+	mulld	12,7,4
+	mulhdu	21,7,4
+	addc	28,28,12
+	adde	29,29,21
+	mulld	12,8,6
+	mulhdu	21,8,6
+	addc	22,22,12
+	adde	23,23,21
+
+	mulld	12,9,6
+	mulhdu	21,9,6
+	addc	24,24,12
+	adde	25,25,21
+
+	mulld	12,10,6
+	mulhdu	21,10,6
+	addc	26,26,12
+	adde	27,27,21
+
+	mulld	12,11,6
+	mulhdu	21,11,6
+	addc	28,28,12
+	adde	29,29,21
+
+	mulld	12,7,6
+	mulhdu	21,7,6
+	addc	30,30,12
+	adde	31,31,21
+
+.Lfe51_reduce:
+	li	0,-1
+	srdi	0,0,13
+
+	srdi	12,26,51
+	and	9,26,0
+	insrdi	12,27,51,0
+	srdi	21,22,51
+	and	7,22,0
+	insrdi	21,23,51,0
+	addc	28,28,12
+	addze	29,29
+	addc	24,24,21
+	addze	25,25
+
+	srdi	12,28,51
+	and	10,28,0
+	insrdi	12,29,51,0
+	srdi	21,24,51
+	and	8,24,0
+	insrdi	21,25,51,0
+	addc	30,30,12
+	addze	31,31
+	add	9,9,21
+
+	srdi	12,30,51
+	and	11,30,0
+	insrdi	12,31,51,0
+	mulli	12,12,19
+
+	add	7,7,12
+
+	srdi	21,9,51
+	and	9,9,0
+	add	10,10,21
+
+	srdi	12,7,51
+	and	7,7,0
+	add	8,8,12
+
+	std	9,16(3)
+	std	10,24(3)
+	std	11,32(3)
+	std	7,0(3)
+	std	8,8(3)
+
+	ld	21,56(1)
+	ld	22,64(1)
+	ld	23,72(1)
+	ld	24,80(1)
+	ld	25,88(1)
+	ld	26,96(1)
+	ld	27,104(1)
+	ld	28,112(1)
+	ld	29,120(1)
+	ld	30,128(1)
+	ld	31,136(1)
+	addi	1,1,144
+	blr
+SYM_FUNC_END(x25519_fe51_mul)
+
+.align	5
+SYM_FUNC_START(x25519_fe51_sqr)
+
+	stdu	1,-144(1)
+	std	21,56(1)
+	std	22,64(1)
+	std	23,72(1)
+	std	24,80(1)
+	std	25,88(1)
+	std	26,96(1)
+	std	27,104(1)
+	std	28,112(1)
+	std	29,120(1)
+	std	30,128(1)
+	std	31,136(1)
+
+	ld	7,0(4)
+	ld	8,8(4)
+	ld	9,16(4)
+	ld	10,24(4)
+	ld	11,32(4)
+
+	add	6,7,7
+	mulli	21,11,19
+
+	mulld	22,7,7
+	mulhdu	23,7,7
+	mulld	24,8,6
+	mulhdu	25,8,6
+	mulld	26,9,6
+	mulhdu	27,9,6
+	mulld	28,10,6
+	mulhdu	29,10,6
+	mulld	30,11,6
+	mulhdu	31,11,6
+	add	6,8,8
+	mulld	12,11,21
+	mulhdu	11,11,21
+	addc	28,28,12
+	adde	29,29,11
+
+	mulli	5,10,19
+
+	mulld	12,8,8
+	mulhdu	11,8,8
+	addc	26,26,12
+	adde	27,27,11
+	mulld	12,9,6
+	mulhdu	11,9,6
+	addc	28,28,12
+	adde	29,29,11
+	mulld	12,10,6
+	mulhdu	11,10,6
+	addc	30,30,12
+	adde	31,31,11
+	mulld	12,21,6
+	mulhdu	11,21,6
+	add	6,10,10
+	addc	22,22,12
+	adde	23,23,11
+	mulld	12,10,5
+	mulhdu	10,10,5
+	addc	24,24,12
+	adde	25,25,10
+	mulld	12,6,21
+	mulhdu	10,6,21
+	add	6,9,9
+	addc	26,26,12
+	adde	27,27,10
+
+	mulld	12,9,9
+	mulhdu	10,9,9
+	addc	30,30,12
+	adde	31,31,10
+	mulld	12,5,6
+	mulhdu	10,5,6
+	addc	22,22,12
+	adde	23,23,10
+	mulld	12,21,6
+	mulhdu	10,21,6
+	addc	24,24,12
+	adde	25,25,10
+
+	b	.Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_sqr)
+
+.align	5
+SYM_FUNC_START(x25519_fe51_mul121666)
+
+	stdu	1,-144(1)
+	std	21,56(1)
+	std	22,64(1)
+	std	23,72(1)
+	std	24,80(1)
+	std	25,88(1)
+	std	26,96(1)
+	std	27,104(1)
+	std	28,112(1)
+	std	29,120(1)
+	std	30,128(1)
+	std	31,136(1)
+
+	lis	6,1
+	ori	6,6,56130
+	ld	7,0(4)
+	ld	8,8(4)
+	ld	9,16(4)
+	ld	10,24(4)
+	ld	11,32(4)
+
+	mulld	22,7,6
+	mulhdu	23,7,6
+	mulld	24,8,6
+	mulhdu	25,8,6
+	mulld	26,9,6
+	mulhdu	27,9,6
+	mulld	28,10,6
+	mulhdu	29,10,6
+	mulld	30,11,6
+	mulhdu	31,11,6
+
+	b	.Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_mul121666)
+
+.align	5
+SYM_FUNC_START(x25519_fe51_sqr_times)
+
+	stdu	1,-144(1)
+	std	21,56(1)
+	std	22,64(1)
+	std	23,72(1)
+	std	24,80(1)
+	std	25,88(1)
+	std	26,96(1)
+	std	27,104(1)
+	std	28,112(1)
+	std	29,120(1)
+	std	30,128(1)
+	std	31,136(1)
+
+	ld	7,0(4)
+	ld	8,8(4)
+	ld	9,16(4)
+	ld	10,24(4)
+	ld	11,32(4)
+
+	mtctr	5
+
+.Lsqr_times_loop:
+	add	6,7,7
+	mulli	21,11,19
+
+	mulld	22,7,7
+	mulhdu	23,7,7
+	mulld	24,8,6
+	mulhdu	25,8,6
+	mulld	26,9,6
+	mulhdu	27,9,6
+	mulld	28,10,6
+	mulhdu	29,10,6
+	mulld	30,11,6
+	mulhdu	31,11,6
+	add	6,8,8
+	mulld	12,11,21
+	mulhdu	11,11,21
+	addc	28,28,12
+	adde	29,29,11
+
+	mulli	5,10,19
+
+	mulld	12,8,8
+	mulhdu	11,8,8
+	addc	26,26,12
+	adde	27,27,11
+	mulld	12,9,6
+	mulhdu	11,9,6
+	addc	28,28,12
+	adde	29,29,11
+	mulld	12,10,6
+	mulhdu	11,10,6
+	addc	30,30,12
+	adde	31,31,11
+	mulld	12,21,6
+	mulhdu	11,21,6
+	add	6,10,10
+	addc	22,22,12
+	adde	23,23,11
+	mulld	12,10,5
+	mulhdu	10,10,5
+	addc	24,24,12
+	adde	25,25,10
+	mulld	12,6,21
+	mulhdu	10,6,21
+	add	6,9,9
+	addc	26,26,12
+	adde	27,27,10
+
+	mulld	12,9,9
+	mulhdu	10,9,9
+	addc	30,30,12
+	adde	31,31,10
+	mulld	12,5,6
+	mulhdu	10,5,6
+	addc	22,22,12
+	adde	23,23,10
+	mulld	12,21,6
+	mulhdu	10,21,6
+	addc	24,24,12
+	adde	25,25,10
+
+	# fe51_reduce
+	li	0,-1
+	srdi	0,0,13
+
+	srdi	12,26,51
+	and	9,26,0
+	insrdi	12,27,51,0
+	srdi	21,22,51
+	and	7,22,0
+	insrdi	21,23,51,0
+	addc	28,28,12
+	addze	29,29
+	addc	24,24,21
+	addze	25,25
+
+	srdi	12,28,51
+	and	10,28,0
+	insrdi	12,29,51,0
+	srdi	21,24,51
+	and	8,24,0
+	insrdi	21,25,51,0
+	addc	30,30,12
+	addze	31,31
+	add	9,9,21
+
+	srdi	12,30,51
+	and	11,30,0
+	insrdi	12,31,51,0
+	mulli	12,12,19
+
+	add	7,7,12
+
+	srdi	21,9,51
+	and	9,9,0
+	add	10,10,21
+
+	srdi	12,7,51
+	and	7,7,0
+	add	8,8,12
+
+	bdnz	.Lsqr_times_loop
+
+	std	9,16(3)
+	std	10,24(3)
+	std	11,32(3)
+	std	7,0(3)
+	std	8,8(3)
+
+	ld	21,56(1)
+	ld	22,64(1)
+	ld	23,72(1)
+	ld	24,80(1)
+	ld	25,88(1)
+	ld	26,96(1)
+	ld	27,104(1)
+	ld	28,112(1)
+	ld	29,120(1)
+	ld	30,128(1)
+	ld	31,136(1)
+	addi	1,1,144
+	blr
+SYM_FUNC_END(x25519_fe51_sqr_times)
+
+.align	5
+SYM_FUNC_START(x25519_fe51_frombytes)
+
+	li	12, -1
+	srdi	12, 12, 13	# 0x7ffffffffffff
+
+	ld	5, 0(4)
+	ld	6, 8(4)
+	ld	7, 16(4)
+	ld	8, 24(4)
+
+	srdi	10, 5, 51
+	and	5, 5, 12	# h0
+
+	sldi	11, 6, 13
+	or	11, 10, 11	# h1t
+	srdi	10, 6, 38
+	and	6, 11, 12	# h1
+
+	sldi	11, 7, 26
+	or	10, 10, 11	# h2t
+
+	srdi	11, 7, 25
+	and	7, 10, 12	# h2
+	sldi	10, 8, 39
+	or	11, 11, 10	# h3t
+
+	srdi	9, 8, 12
+	and	8, 11, 12	# h3
+	and	9, 9, 12	# h4
+
+	std	5, 0(3)
+	std	6, 8(3)
+	std	7, 16(3)
+	std	8, 24(3)
+	std	9, 32(3)
+
+	blr
+SYM_FUNC_END(x25519_fe51_frombytes)
+
+.align	5
+SYM_FUNC_START(x25519_fe51_tobytes)
+
+	ld	5, 0(4)
+	ld	6, 8(4)
+	ld	7, 16(4)
+	ld	8, 24(4)
+	ld	9, 32(4)
+
+	li	12, -1
+	srdi	12, 12, 13	# 0x7ffffffffffff
+
+	# Full reducuction
+	addi	10, 5, 19
+	srdi	10, 10, 51
+	add	10, 10, 6
+	srdi	10, 10, 51
+	add	10, 10, 7
+	srdi	10, 10, 51
+	add	10, 10, 8
+	srdi	10, 10, 51
+	add	10, 10, 9
+	srdi	10, 10, 51
+
+	mulli	10, 10, 19
+	add	5, 5, 10
+	srdi	11, 5, 51
+	add	6, 6, 11
+	srdi	11, 6, 51
+	add	7, 7, 11
+	srdi	11, 7, 51
+	add	8, 8, 11
+	srdi	11, 8, 51
+	add	9, 9, 11
+
+	and	5, 5, 12
+	and	6, 6, 12
+	and	7, 7, 12
+	and	8, 8, 12
+	and	9, 9, 12
+
+	sldi	10, 6, 51
+	or	5, 5, 10	# s0
+
+	srdi	11, 6, 13
+	sldi	10, 7, 38
+	or	6, 11, 10	# s1
+
+	srdi	11, 7, 26
+	sldi	10, 8, 25
+	or	7, 11, 10	# s2
+
+	srdi	11, 8, 39
+	sldi	10, 9, 12
+	or	8, 11, 10	# s4
+
+	std	5, 0(3)
+	std	6, 8(3)
+	std	7, 16(3)
+	std	8, 24(3)
+
+	blr
+SYM_FUNC_END(x25519_fe51_tobytes)
+
+.align	5
+SYM_FUNC_START(x25519_cswap)
+
+	li	7, 5
+	neg	6, 5
+	mtctr	7
+
+.Lswap_loop:
+	ld	8, 0(3)
+	ld	9, 0(4)
+	xor	10, 8, 9
+	and	10, 10, 6
+	xor	11, 8, 10
+	xor	12, 9, 10
+	std	11, 0(3)
+	addi	3, 3, 8
+	std	12, 0(4)
+	addi	4, 4, 8
+	bdnz	.Lswap_loop
+
+	blr
+SYM_FUNC_END(x25519_cswap)
diff --git a/arch/powerpc/crypto/ghash.c b/arch/powerpc/crypto/ghash.c
new file mode 100644
index 000000000000..77eca20bc7ac
--- /dev/null
+++ b/arch/powerpc/crypto/ghash.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * GHASH routines supporting VMX instructions on the Power 8
+ *
+ * Copyright (C) 2015, 2019 International Business Machines Inc.
+ *
+ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ *
+ * Extended by Daniel Axtens <dja@axtens.net> to replace the fallback
+ * mechanism. The new approach is based on arm64 code, which is:
+ *   Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/crypto.h>
+#include <linux/delay.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <crypto/aes.h>
+#include <crypto/ghash.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/b128ops.h>
+#include "aesp8-ppc.h"
+
+void gcm_init_p8(u128 htable[16], const u64 Xi[2]);
+void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]);
+void gcm_ghash_p8(u64 Xi[2], const u128 htable[16],
+		  const u8 *in, size_t len);
+
+struct p8_ghash_ctx {
+	/* key used by vector asm */
+	u128 htable[16];
+	/* key used by software fallback */
+	be128 key;
+};
+
+struct p8_ghash_desc_ctx {
+	u64 shash[2];
+	u8 buffer[GHASH_DIGEST_SIZE];
+	int bytes;
+};
+
+static int p8_ghash_init(struct shash_desc *desc)
+{
+	struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	dctx->bytes = 0;
+	memset(dctx->shash, 0, GHASH_DIGEST_SIZE);
+	return 0;
+}
+
+static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
+			   unsigned int keylen)
+{
+	struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(tfm));
+
+	if (keylen != GHASH_BLOCK_SIZE)
+		return -EINVAL;
+
+	preempt_disable();
+	pagefault_disable();
+	enable_kernel_vsx();
+	gcm_init_p8(ctx->htable, (const u64 *) key);
+	disable_kernel_vsx();
+	pagefault_enable();
+	preempt_enable();
+
+	memcpy(&ctx->key, key, GHASH_BLOCK_SIZE);
+
+	return 0;
+}
+
+static inline void __ghash_block(struct p8_ghash_ctx *ctx,
+				 struct p8_ghash_desc_ctx *dctx)
+{
+	if (crypto_simd_usable()) {
+		preempt_disable();
+		pagefault_disable();
+		enable_kernel_vsx();
+		gcm_ghash_p8(dctx->shash, ctx->htable,
+				dctx->buffer, GHASH_DIGEST_SIZE);
+		disable_kernel_vsx();
+		pagefault_enable();
+		preempt_enable();
+	} else {
+		crypto_xor((u8 *)dctx->shash, dctx->buffer, GHASH_BLOCK_SIZE);
+		gf128mul_lle((be128 *)dctx->shash, &ctx->key);
+	}
+}
+
+static inline void __ghash_blocks(struct p8_ghash_ctx *ctx,
+				  struct p8_ghash_desc_ctx *dctx,
+				  const u8 *src, unsigned int srclen)
+{
+	if (crypto_simd_usable()) {
+		preempt_disable();
+		pagefault_disable();
+		enable_kernel_vsx();
+		gcm_ghash_p8(dctx->shash, ctx->htable,
+				src, srclen);
+		disable_kernel_vsx();
+		pagefault_enable();
+		preempt_enable();
+	} else {
+		while (srclen >= GHASH_BLOCK_SIZE) {
+			crypto_xor((u8 *)dctx->shash, src, GHASH_BLOCK_SIZE);
+			gf128mul_lle((be128 *)dctx->shash, &ctx->key);
+			srclen -= GHASH_BLOCK_SIZE;
+			src += GHASH_BLOCK_SIZE;
+		}
+	}
+}
+
+static int p8_ghash_update(struct shash_desc *desc,
+			   const u8 *src, unsigned int srclen)
+{
+	unsigned int len;
+	struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
+	struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	if (dctx->bytes) {
+		if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
+			memcpy(dctx->buffer + dctx->bytes, src,
+				srclen);
+			dctx->bytes += srclen;
+			return 0;
+		}
+		memcpy(dctx->buffer + dctx->bytes, src,
+			GHASH_DIGEST_SIZE - dctx->bytes);
+
+		__ghash_block(ctx, dctx);
+
+		src += GHASH_DIGEST_SIZE - dctx->bytes;
+		srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
+		dctx->bytes = 0;
+	}
+	len = srclen & ~(GHASH_DIGEST_SIZE - 1);
+	if (len) {
+		__ghash_blocks(ctx, dctx, src, len);
+		src += len;
+		srclen -= len;
+	}
+	if (srclen) {
+		memcpy(dctx->buffer, src, srclen);
+		dctx->bytes = srclen;
+	}
+	return 0;
+}
+
+static int p8_ghash_final(struct shash_desc *desc, u8 *out)
+{
+	int i;
+	struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
+	struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	if (dctx->bytes) {
+		for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
+			dctx->buffer[i] = 0;
+		__ghash_block(ctx, dctx);
+		dctx->bytes = 0;
+	}
+	memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
+	return 0;
+}
+
+struct shash_alg p8_ghash_alg = {
+	.digestsize = GHASH_DIGEST_SIZE,
+	.init = p8_ghash_init,
+	.update = p8_ghash_update,
+	.final = p8_ghash_final,
+	.setkey = p8_ghash_setkey,
+	.descsize = sizeof(struct p8_ghash_desc_ctx)
+		+ sizeof(struct ghash_desc_ctx),
+	.base = {
+		 .cra_name = "ghash",
+		 .cra_driver_name = "p8_ghash",
+		 .cra_priority = 1000,
+		 .cra_blocksize = GHASH_BLOCK_SIZE,
+		 .cra_ctxsize = sizeof(struct p8_ghash_ctx),
+		 .cra_module = THIS_MODULE,
+	},
+};
diff --git a/arch/powerpc/crypto/ghashp8-ppc.pl b/arch/powerpc/crypto/ghashp8-ppc.pl
new file mode 100644
index 000000000000..041e633c214f
--- /dev/null
+++ b/arch/powerpc/crypto/ghashp8-ppc.pl
@@ -0,0 +1,243 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for PowerISA v2.07.
+#
+# July 2014
+#
+# Accurate performance measurements are problematic, because it's
+# always virtualized setup with possibly throttled processor.
+# Relative comparison is therefore more informative. This initial
+# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
+# faster than "4-bit" integer-only compiler-generated 64-bit code.
+# "Initial version" means that there is room for futher improvement.
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+	$SIZE_T=8;
+	$LRSAVE=2*$SIZE_T;
+	$STU="stdu";
+	$POP="ld";
+	$PUSH="std";
+} elsif ($flavour =~ /32/) {
+	$SIZE_T=4;
+	$LRSAVE=$SIZE_T;
+	$STU="stwu";
+	$POP="lwz";
+	$PUSH="stw";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
+
+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block
+
+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my $vrsave="r12";
+
+$code=<<___;
+.machine	"any"
+
+.text
+
+.globl	.gcm_init_p8
+	lis		r0,0xfff0
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$H,0,r4			# load H
+	le?xor		r7,r7,r7
+	le?addi		r7,r7,0x8		# need a vperm start with 08
+	le?lvsr		5,0,r7
+	le?vspltisb	6,0x0f
+	le?vxor		5,5,6			# set a b-endian mask
+	le?vperm	$H,$H,$H,5
+
+	vspltisb	$xC2,-16		# 0xf0
+	vspltisb	$t0,1			# one
+	vaddubm		$xC2,$xC2,$xC2		# 0xe0
+	vxor		$zero,$zero,$zero
+	vor		$xC2,$xC2,$t0		# 0xe1
+	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
+	vsldoi		$t1,$zero,$t0,1		# ...1
+	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
+	vspltisb	$t2,7
+	vor		$xC2,$xC2,$t1		# 0xc2....01
+	vspltb		$t1,$H,0		# most significant byte
+	vsl		$H,$H,$t0		# H<<=1
+	vsrab		$t1,$t1,$t2		# broadcast carry bit
+	vand		$t1,$t1,$xC2
+	vxor		$H,$H,$t1		# twisted H
+
+	vsldoi		$H,$H,$H,8		# twist even more ...
+	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
+	vsldoi		$Hl,$zero,$H,8		# ... and split
+	vsldoi		$Hh,$H,$zero,8
+
+	stvx_u		$xC2,0,r3		# save pre-computed table
+	stvx_u		$Hl,r8,r3
+	stvx_u		$H, r9,r3
+	stvx_u		$Hh,r10,r3
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,2,0
+	.long		0
+.size	.gcm_init_p8,.-.gcm_init_p8
+
+.globl	.gcm_gmult_p8
+	lis		r0,0xfff8
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$IN,0,$Xip		# load Xi
+
+	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
+	 le?lvsl	$lemask,r0,r0
+	lvx_u		$H, r9,$Htbl
+	 le?vspltisb	$t0,0x07
+	lvx_u		$Hh,r10,$Htbl
+	 le?vxor	$lemask,$lemask,$t0
+	lvx_u		$xC2,0,$Htbl
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$zero,$zero,$zero
+
+	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+	vxor		$Xl,$Xl,$t1
+
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,2,0
+	.long		0
+.size	.gcm_gmult_p8,.-.gcm_gmult_p8
+
+.globl	.gcm_ghash_p8
+	lis		r0,0xfff8
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$Xl,0,$Xip		# load Xi
+
+	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
+	 le?lvsl	$lemask,r0,r0
+	lvx_u		$H, r9,$Htbl
+	 le?vspltisb	$t0,0x07
+	lvx_u		$Hh,r10,$Htbl
+	 le?vxor	$lemask,$lemask,$t0
+	lvx_u		$xC2,0,$Htbl
+	 le?vperm	$Xl,$Xl,$Xl,$lemask
+	vxor		$zero,$zero,$zero
+
+	lvx_u		$IN,0,$inp
+	addi		$inp,$inp,16
+	subi		$len,$len,16
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$IN,$IN,$Xl
+	b		Loop
+
+.align	5
+Loop:
+	 subic		$len,$len,16
+	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
+	 subfe.		r0,r0,r0		# borrow?-1:0
+	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
+	 and		r0,r0,$len
+	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
+	 add		$inp,$inp,r0
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+	 lvx_u		$IN,0,$inp
+	 addi		$inp,$inp,16
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
+	vpmsumd		$Xl,$Xl,$xC2
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$t1,$t1,$Xh
+	vxor		$IN,$IN,$t1
+	vxor		$IN,$IN,$Xl
+	beq		Loop			# did $len-=16 borrow?
+
+	vxor		$Xl,$Xl,$t1
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,4,0
+	.long		0
+.size	.gcm_ghash_p8,.-.gcm_ghash_p8
+
+.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+foreach (split("\n",$code)) {
+	if ($flavour =~ /le$/o) {	# little-endian
+	    s/le\?//o		or
+	    s/be\?/#be#/o;
+	} else {
+	    s/le\?/#le#/o	or
+	    s/be\?//o;
+	}
+	print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/powerpc/crypto/poly1305-p10-glue.c b/arch/powerpc/crypto/poly1305-p10-glue.c
index 95dd708573ee..369686e9370b 100644
--- a/arch/powerpc/crypto/poly1305-p10-glue.c
+++ b/arch/powerpc/crypto/poly1305-p10-glue.c
@@ -14,7 +14,7 @@
 #include <crypto/internal/poly1305.h>
 #include <crypto/internal/simd.h>
 #include <linux/cpufeature.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
 #include <asm/simd.h>
 #include <asm/switch_to.h>
 
diff --git a/arch/powerpc/crypto/vmx.c b/arch/powerpc/crypto/vmx.c
new file mode 100644
index 000000000000..0b725e826388
--- /dev/null
+++ b/arch/powerpc/crypto/vmx.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Routines supporting VMX instructions on the Power 8
+ *
+ * Copyright (C) 2015 International Business Machines Inc.
+ *
+ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <asm/cputable.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/skcipher.h>
+
+#include "aesp8-ppc.h"
+
+static int __init p8_init(void)
+{
+	int ret;
+
+	ret = crypto_register_shash(&p8_ghash_alg);
+	if (ret)
+		goto err;
+
+	ret = crypto_register_alg(&p8_aes_alg);
+	if (ret)
+		goto err_unregister_ghash;
+
+	ret = crypto_register_skcipher(&p8_aes_cbc_alg);
+	if (ret)
+		goto err_unregister_aes;
+
+	ret = crypto_register_skcipher(&p8_aes_ctr_alg);
+	if (ret)
+		goto err_unregister_aes_cbc;
+
+	ret = crypto_register_skcipher(&p8_aes_xts_alg);
+	if (ret)
+		goto err_unregister_aes_ctr;
+
+	return 0;
+
+err_unregister_aes_ctr:
+	crypto_unregister_skcipher(&p8_aes_ctr_alg);
+err_unregister_aes_cbc:
+	crypto_unregister_skcipher(&p8_aes_cbc_alg);
+err_unregister_aes:
+	crypto_unregister_alg(&p8_aes_alg);
+err_unregister_ghash:
+	crypto_unregister_shash(&p8_ghash_alg);
+err:
+	return ret;
+}
+
+static void __exit p8_exit(void)
+{
+	crypto_unregister_skcipher(&p8_aes_xts_alg);
+	crypto_unregister_skcipher(&p8_aes_ctr_alg);
+	crypto_unregister_skcipher(&p8_aes_cbc_alg);
+	crypto_unregister_alg(&p8_aes_alg);
+	crypto_unregister_shash(&p8_ghash_alg);
+}
+
+module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, p8_init);
+module_exit(p8_exit);
+
+MODULE_AUTHOR("Marcelo Cerri<mhcerri@br.ibm.com>");
+MODULE_DESCRIPTION("IBM VMX cryptographic acceleration instructions "
+		   "support on Power 8");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0.0");
+MODULE_IMPORT_NS("CRYPTO_INTERNAL");