summaryrefslogtreecommitdiff
path: root/arch/powerpc/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/crypto')
-rw-r--r--arch/powerpc/crypto/.gitignore3
-rw-r--r--arch/powerpc/crypto/Kconfig160
-rw-r--r--arch/powerpc/crypto/Makefile33
-rw-r--r--arch/powerpc/crypto/aes-gcm-p10-glue.c343
-rw-r--r--arch/powerpc/crypto/aes-gcm-p10.S1521
-rw-r--r--arch/powerpc/crypto/aes-spe-glue.c22
-rw-r--r--arch/powerpc/crypto/aes.c134
-rw-r--r--arch/powerpc/crypto/aes_cbc.c133
-rw-r--r--arch/powerpc/crypto/aes_ctr.c149
-rw-r--r--arch/powerpc/crypto/aes_xts.c162
-rw-r--r--arch/powerpc/crypto/aesp10-ppc.pl585
-rw-r--r--arch/powerpc/crypto/aesp8-ppc.h30
-rw-r--r--arch/powerpc/crypto/aesp8-ppc.pl3889
-rw-r--r--arch/powerpc/crypto/chacha-p10-glue.c221
-rw-r--r--arch/powerpc/crypto/chacha-p10le-8x.S842
-rw-r--r--arch/powerpc/crypto/crc-vpmsum_test.c12
-rw-r--r--arch/powerpc/crypto/crc32-vpmsum_core.S15
-rw-r--r--arch/powerpc/crypto/crc32c-vpmsum_glue.c4
-rw-r--r--arch/powerpc/crypto/ghash.c185
-rw-r--r--arch/powerpc/crypto/ghashp10-ppc.pl370
-rw-r--r--arch/powerpc/crypto/ghashp8-ppc.pl243
-rw-r--r--arch/powerpc/crypto/md5-asm.S10
-rw-r--r--arch/powerpc/crypto/md5-glue.c1
-rw-r--r--arch/powerpc/crypto/poly1305-p10-glue.c186
-rw-r--r--arch/powerpc/crypto/poly1305-p10le_64.S1075
-rw-r--r--arch/powerpc/crypto/ppc-xlate.pl229
-rw-r--r--arch/powerpc/crypto/sha1-powerpc-asm.S6
-rw-r--r--arch/powerpc/crypto/sha1-spe-glue.c22
-rw-r--r--arch/powerpc/crypto/sha1.c45
-rw-r--r--arch/powerpc/crypto/sha256-spe-glue.c46
-rw-r--r--arch/powerpc/crypto/vmx.c77
31 files changed, 10620 insertions, 133 deletions
diff --git a/arch/powerpc/crypto/.gitignore b/arch/powerpc/crypto/.gitignore
new file mode 100644
index 000000000000..e1094f08f713
--- /dev/null
+++ b/arch/powerpc/crypto/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+aesp10-ppc.S
+ghashp10-ppc.S
diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
new file mode 100644
index 000000000000..1e201b7ae2fc
--- /dev/null
+++ b/arch/powerpc/crypto/Kconfig
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "Accelerated Cryptographic Algorithms for CPU (powerpc)"
+
+config CRYPTO_CRC32C_VPMSUM
+ tristate "CRC32c"
+ depends on PPC64 && ALTIVEC
+ select CRYPTO_HASH
+ select CRC32
+ help
+ CRC32c CRC algorithm with the iSCSI polynomial (RFC 3385 and RFC 3720)
+
+ Architecture: powerpc64 using
+ - AltiVec extensions
+
+ Enable on POWER8 and newer processors for improved performance.
+
+config CRYPTO_CRCT10DIF_VPMSUM
+ tristate "CRC32T10DIF"
+ depends on PPC64 && ALTIVEC && CRC_T10DIF
+ select CRYPTO_HASH
+ help
+ CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF)
+
+ Architecture: powerpc64 using
+ - AltiVec extensions
+
+ Enable on POWER8 and newer processors for improved performance.
+
+config CRYPTO_VPMSUM_TESTER
+ tristate "CRC32c and CRC32T10DIF hardware acceleration tester"
+ depends on CRYPTO_CRCT10DIF_VPMSUM && CRYPTO_CRC32C_VPMSUM
+ help
+ Stress test for CRC32c and CRCT10DIF algorithms implemented with
+ powerpc64 AltiVec extensions (POWER8 vpmsum instructions).
+ Unless you are testing these algorithms, you don't need this.
+
+config CRYPTO_MD5_PPC
+ tristate "Digests: MD5"
+ depends on PPC
+ select CRYPTO_HASH
+ help
+ MD5 message digest algorithm (RFC1321)
+
+ Architecture: powerpc
+
+config CRYPTO_SHA1_PPC
+ tristate "Hash functions: SHA-1"
+ depends on PPC
+ help
+ SHA-1 secure hash algorithm (FIPS 180)
+
+ Architecture: powerpc
+
+config CRYPTO_SHA1_PPC_SPE
+ tristate "Hash functions: SHA-1 (SPE)"
+ depends on PPC && SPE
+ help
+ SHA-1 secure hash algorithm (FIPS 180)
+
+ Architecture: powerpc using
+ - SPE (Signal Processing Engine) extensions
+
+config CRYPTO_SHA256_PPC_SPE
+ tristate "Hash functions: SHA-224 and SHA-256 (SPE)"
+ depends on PPC && SPE
+ select CRYPTO_SHA256
+ select CRYPTO_HASH
+ help
+ SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
+
+ Architecture: powerpc using
+ - SPE (Signal Processing Engine) extensions
+
+config CRYPTO_AES_PPC_SPE
+ tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)"
+ depends on PPC && SPE
+ select CRYPTO_SKCIPHER
+ help
+ Block ciphers: AES cipher algorithms (FIPS-197)
+ Length-preserving ciphers: AES with ECB, CBC, CTR, and XTS modes
+
+ Architecture: powerpc using:
+ - SPE (Signal Processing Engine) extensions
+
+ SPE is available for:
+ - Processor Type: Freescale 8500
+ - CPU selection: e500 (8540)
+
+ This module should only be used for low power (router) devices
+ without hardware AES acceleration (e.g. caam crypto). It reduces the
+ size of the AES tables from 16KB to 8KB + 256 bytes and mitigates
+ timining attacks. Nevertheless it might be not as secure as other
+ architecture specific assembler implementations that work on 1KB
+ tables or 256 bytes S-boxes.
+
+config CRYPTO_AES_GCM_P10
+ tristate "Stitched AES/GCM acceleration support on P10 or later CPU (PPC)"
+ depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
+ select CRYPTO_LIB_AES
+ select CRYPTO_ALGAPI
+ select CRYPTO_AEAD
+ select CRYPTO_SKCIPHER
+ help
+ AEAD cipher: AES cipher algorithms (FIPS-197)
+ GCM (Galois/Counter Mode) authenticated encryption mode (NIST SP800-38D)
+ Architecture: powerpc64 using:
+ - little-endian
+ - Power10 or later features
+
+ Support for cryptographic acceleration instructions on Power10 or
+ later CPU. This module supports stitched acceleration for AES/GCM.
+
+config CRYPTO_CHACHA20_P10
+ tristate "Ciphers: ChaCha20, XChacha20, XChacha12 (P10 or later)"
+ depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ help
+ Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
+ stream cipher algorithms
+
+ Architecture: PowerPC64
+ - Power10 or later
+ - Little-endian
+
+config CRYPTO_POLY1305_P10
+ tristate "Hash functions: Poly1305 (P10 or later)"
+ depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
+ select CRYPTO_HASH
+ select CRYPTO_LIB_POLY1305_GENERIC
+ help
+ Poly1305 authenticator algorithm (RFC7539)
+
+ Architecture: PowerPC64
+ - Power10 or later
+ - Little-endian
+
+config CRYPTO_DEV_VMX
+ bool "Support for VMX cryptographic acceleration instructions"
+ depends on PPC64 && VSX
+ help
+ Support for VMX cryptographic acceleration instructions.
+
+config CRYPTO_DEV_VMX_ENCRYPT
+ tristate "Encryption acceleration support on P8 CPU"
+ depends on CRYPTO_DEV_VMX
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO_CTR
+ select CRYPTO_GHASH
+ select CRYPTO_XTS
+ default m
+ help
+ Support for VMX cryptographic acceleration instructions on Power8 CPU.
+ This module supports acceleration for AES and GHASH in hardware. If you
+ choose 'M' here, this module will be called vmx-crypto.
+
+endmenu
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index 4808d97fede5..fca0e9739869 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -13,6 +13,10 @@ obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
+obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
+obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
+obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
+obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
md5-ppc-y := md5-asm.o md5-glue.o
@@ -21,3 +25,32 @@ sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
+aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
+chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
+poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
+vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
+
+ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
+override flavour := linux-ppc64le
+else
+ifdef CONFIG_PPC64_ELF_ABI_V2
+override flavour := linux-ppc64-elfv2
+else
+override flavour := linux-ppc64
+endif
+endif
+
+quiet_cmd_perl = PERL $@
+ cmd_perl = $(PERL) $< $(flavour) > $@
+
+targets += aesp10-ppc.S ghashp10-ppc.S aesp8-ppc.S ghashp8-ppc.S
+
+$(obj)/aesp10-ppc.S $(obj)/ghashp10-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE
+ $(call if_changed,perl)
+
+$(obj)/aesp8-ppc.S $(obj)/ghashp8-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE
+ $(call if_changed,perl)
+
+OBJECT_FILES_NON_STANDARD_aesp10-ppc.o := y
+OBJECT_FILES_NON_STANDARD_ghashp10-ppc.o := y
+OBJECT_FILES_NON_STANDARD_aesp8-ppc.o := y
diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c
new file mode 100644
index 000000000000..f62ee54076c0
--- /dev/null
+++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue code for accelerated AES-GCM stitched implementation for ppc64le.
+ *
+ * Copyright 2022- IBM Inc. All rights reserved
+ */
+
+#include <asm/unaligned.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/b128ops.h>
+#include <crypto/gf128mul.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define PPC_ALIGN 16
+#define GCM_IV_SIZE 12
+
+MODULE_DESCRIPTION("PPC64le AES-GCM with Stitched implementation");
+MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("aes");
+
+asmlinkage int aes_p10_set_encrypt_key(const u8 *userKey, const int bits,
+ void *key);
+asmlinkage void aes_p10_encrypt(const u8 *in, u8 *out, const void *key);
+asmlinkage void aes_p10_gcm_encrypt(u8 *in, u8 *out, size_t len,
+ void *rkey, u8 *iv, void *Xi);
+asmlinkage void aes_p10_gcm_decrypt(u8 *in, u8 *out, size_t len,
+ void *rkey, u8 *iv, void *Xi);
+asmlinkage void gcm_init_htable(unsigned char htable[], unsigned char Xi[]);
+asmlinkage void gcm_ghash_p10(unsigned char *Xi, unsigned char *Htable,
+ unsigned char *aad, unsigned int alen);
+
+struct aes_key {
+ u8 key[AES_MAX_KEYLENGTH];
+ u64 rounds;
+};
+
+struct gcm_ctx {
+ u8 iv[16];
+ u8 ivtag[16];
+ u8 aad_hash[16];
+ u64 aadLen;
+ u64 Plen; /* offset 56 - used in aes_p10_gcm_{en/de}crypt */
+};
+struct Hash_ctx {
+ u8 H[16]; /* subkey */
+ u8 Htable[256]; /* Xi, Hash table(offset 32) */
+};
+
+struct p10_aes_gcm_ctx {
+ struct aes_key enc_key;
+};
+
+static void vsx_begin(void)
+{
+ preempt_disable();
+ enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+ disable_kernel_vsx();
+ preempt_enable();
+}
+
+static void set_subkey(unsigned char *hash)
+{
+ *(u64 *)&hash[0] = be64_to_cpup((__be64 *)&hash[0]);
+ *(u64 *)&hash[8] = be64_to_cpup((__be64 *)&hash[8]);
+}
+
+/*
+ * Compute aad if any.
+ * - Hash aad and copy to Xi.
+ */
+static void set_aad(struct gcm_ctx *gctx, struct Hash_ctx *hash,
+ unsigned char *aad, int alen)
+{
+ int i;
+ u8 nXi[16] = {0, };
+
+ gctx->aadLen = alen;
+ i = alen & ~0xf;
+ if (i) {
+ gcm_ghash_p10(nXi, hash->Htable+32, aad, i);
+ aad += i;
+ alen -= i;
+ }
+ if (alen) {
+ for (i = 0; i < alen; i++)
+ nXi[i] ^= aad[i];
+
+ memset(gctx->aad_hash, 0, 16);
+ gcm_ghash_p10(gctx->aad_hash, hash->Htable+32, nXi, 16);
+ } else {
+ memcpy(gctx->aad_hash, nXi, 16);
+ }
+
+ memcpy(hash->Htable, gctx->aad_hash, 16);
+}
+
+static void gcmp10_init(struct gcm_ctx *gctx, u8 *iv, unsigned char *rdkey,
+ struct Hash_ctx *hash, u8 *assoc, unsigned int assoclen)
+{
+ __be32 counter = cpu_to_be32(1);
+
+ aes_p10_encrypt(hash->H, hash->H, rdkey);
+ set_subkey(hash->H);
+ gcm_init_htable(hash->Htable+32, hash->H);
+
+ *((__be32 *)(iv+12)) = counter;
+
+ gctx->Plen = 0;
+
+ /*
+ * Encrypt counter vector as iv tag and increment counter.
+ */
+ aes_p10_encrypt(iv, gctx->ivtag, rdkey);
+
+ counter = cpu_to_be32(2);
+ *((__be32 *)(iv+12)) = counter;
+ memcpy(gctx->iv, iv, 16);
+
+ gctx->aadLen = assoclen;
+ memset(gctx->aad_hash, 0, 16);
+ if (assoclen)
+ set_aad(gctx, hash, assoc, assoclen);
+}
+
+static void finish_tag(struct gcm_ctx *gctx, struct Hash_ctx *hash, int len)
+{
+ int i;
+ unsigned char len_ac[16 + PPC_ALIGN];
+ unsigned char *aclen = PTR_ALIGN((void *)len_ac, PPC_ALIGN);
+ __be64 clen = cpu_to_be64(len << 3);
+ __be64 alen = cpu_to_be64(gctx->aadLen << 3);
+
+ if (len == 0 && gctx->aadLen == 0) {
+ memcpy(hash->Htable, gctx->ivtag, 16);
+ return;
+ }
+
+ /*
+ * Len is in bits.
+ */
+ *((__be64 *)(aclen)) = alen;
+ *((__be64 *)(aclen+8)) = clen;
+
+ /*
+ * hash (AAD len and len)
+ */
+ gcm_ghash_p10(hash->Htable, hash->Htable+32, aclen, 16);
+
+ for (i = 0; i < 16; i++)
+ hash->Htable[i] ^= gctx->ivtag[i];
+}
+
+static int set_authsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+ switch (authsize) {
+ case 4:
+ case 8:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
+ unsigned int keylen)
+{
+ struct crypto_tfm *tfm = crypto_aead_tfm(aead);
+ struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
+ int ret;
+
+ vsx_begin();
+ ret = aes_p10_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
+ vsx_end();
+
+ return ret ? -EINVAL : 0;
+}
+
+static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
+{
+ struct crypto_tfm *tfm = req->base.tfm;
+ struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
+ u8 databuf[sizeof(struct gcm_ctx) + PPC_ALIGN];
+ struct gcm_ctx *gctx = PTR_ALIGN((void *)databuf, PPC_ALIGN);
+ u8 hashbuf[sizeof(struct Hash_ctx) + PPC_ALIGN];
+ struct Hash_ctx *hash = PTR_ALIGN((void *)hashbuf, PPC_ALIGN);
+ struct scatter_walk assoc_sg_walk;
+ struct skcipher_walk walk;
+ u8 *assocmem = NULL;
+ u8 *assoc;
+ unsigned int assoclen = req->assoclen;
+ unsigned int cryptlen = req->cryptlen;
+ unsigned char ivbuf[AES_BLOCK_SIZE+PPC_ALIGN];
+ unsigned char *iv = PTR_ALIGN((void *)ivbuf, PPC_ALIGN);
+ int ret;
+ unsigned long auth_tag_len = crypto_aead_authsize(__crypto_aead_cast(tfm));
+ u8 otag[16];
+ int total_processed = 0;
+
+ memset(databuf, 0, sizeof(databuf));
+ memset(hashbuf, 0, sizeof(hashbuf));
+ memset(ivbuf, 0, sizeof(ivbuf));
+ memcpy(iv, req->iv, GCM_IV_SIZE);
+
+ /* Linearize assoc, if not already linear */
+ if (req->src->length >= assoclen && req->src->length) {
+ scatterwalk_start(&assoc_sg_walk, req->src);
+ assoc = scatterwalk_map(&assoc_sg_walk);
+ } else {
+ gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
+ GFP_KERNEL : GFP_ATOMIC;
+
+ /* assoc can be any length, so must be on heap */
+ assocmem = kmalloc(assoclen, flags);
+ if (unlikely(!assocmem))
+ return -ENOMEM;
+ assoc = assocmem;
+
+ scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
+ }
+
+ vsx_begin();
+ gcmp10_init(gctx, iv, (unsigned char *) &ctx->enc_key, hash, assoc, assoclen);
+ vsx_end();
+
+ if (!assocmem)
+ scatterwalk_unmap(assoc);
+ else
+ kfree(assocmem);
+
+ if (enc)
+ ret = skcipher_walk_aead_encrypt(&walk, req, false);
+ else
+ ret = skcipher_walk_aead_decrypt(&walk, req, false);
+ if (ret)
+ return ret;
+
+ while (walk.nbytes > 0 && ret == 0) {
+
+ vsx_begin();
+ if (enc)
+ aes_p10_gcm_encrypt(walk.src.virt.addr,
+ walk.dst.virt.addr,
+ walk.nbytes,
+ &ctx->enc_key, gctx->iv, hash->Htable);
+ else
+ aes_p10_gcm_decrypt(walk.src.virt.addr,
+ walk.dst.virt.addr,
+ walk.nbytes,
+ &ctx->enc_key, gctx->iv, hash->Htable);
+ vsx_end();
+
+ total_processed += walk.nbytes;
+ ret = skcipher_walk_done(&walk, 0);
+ }
+
+ if (ret)
+ return ret;
+
+ /* Finalize hash */
+ vsx_begin();
+ finish_tag(gctx, hash, total_processed);
+ vsx_end();
+
+ /* copy Xi to end of dst */
+ if (enc)
+ scatterwalk_map_and_copy(hash->Htable, req->dst, req->assoclen + cryptlen,
+ auth_tag_len, 1);
+ else {
+ scatterwalk_map_and_copy(otag, req->src,
+ req->assoclen + cryptlen - auth_tag_len,
+ auth_tag_len, 0);
+
+ if (crypto_memneq(otag, hash->Htable, auth_tag_len)) {
+ memzero_explicit(hash->Htable, 16);
+ return -EBADMSG;
+ }
+ }
+
+ return 0;
+}
+
+static int p10_aes_gcm_encrypt(struct aead_request *req)
+{
+ return p10_aes_gcm_crypt(req, 1);
+}
+
+static int p10_aes_gcm_decrypt(struct aead_request *req)
+{
+ return p10_aes_gcm_crypt(req, 0);
+}
+
+static struct aead_alg gcm_aes_alg = {
+ .ivsize = GCM_IV_SIZE,
+ .maxauthsize = 16,
+
+ .setauthsize = set_authsize,
+ .setkey = p10_aes_gcm_setkey,
+ .encrypt = p10_aes_gcm_encrypt,
+ .decrypt = p10_aes_gcm_decrypt,
+
+ .base.cra_name = "gcm(aes)",
+ .base.cra_driver_name = "aes_gcm_p10",
+ .base.cra_priority = 2100,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx),
+ .base.cra_module = THIS_MODULE,
+};
+
+static int __init p10_init(void)
+{
+ return crypto_register_aead(&gcm_aes_alg);
+}
+
+static void __exit p10_exit(void)
+{
+ crypto_unregister_aead(&gcm_aes_alg);
+}
+
+module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init);
+module_exit(p10_exit);
diff --git a/arch/powerpc/crypto/aes-gcm-p10.S b/arch/powerpc/crypto/aes-gcm-p10.S
new file mode 100644
index 000000000000..a51f4b265308
--- /dev/null
+++ b/arch/powerpc/crypto/aes-gcm-p10.S
@@ -0,0 +1,1521 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+ #
+ # Accelerated AES-GCM stitched implementation for ppc64le.
+ #
+ # Copyright 2022- IBM Inc. All rights reserved
+ #
+ #===================================================================================
+ # Written by Danny Tsen <dtsen@linux.ibm.com>
+ #
+ # GHASH is based on the Karatsuba multiplication method.
+ #
+ # Xi xor X1
+ #
+ # X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
+ # (X1.h * H4.h + xX.l * H4.l + X1 * H4) +
+ # (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
+ # (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
+ # (X4.h * H.h + X4.l * H.l + X4 * H)
+ #
+ # Xi = v0
+ # H Poly = v2
+ # Hash keys = v3 - v14
+ # ( H.l, H, H.h)
+ # ( H^2.l, H^2, H^2.h)
+ # ( H^3.l, H^3, H^3.h)
+ # ( H^4.l, H^4, H^4.h)
+ #
+ # v30 is IV
+ # v31 - counter 1
+ #
+ # AES used,
+ # vs0 - vs14 for round keys
+ # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
+ #
+ # This implementation uses stitched AES-GCM approach to improve overall performance.
+ # AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
+ #
+ # ===================================================================================
+ #
+
+#include <asm/ppc_asm.h>
+#include <linux/linkage.h>
+
+.machine "any"
+.text
+
+ # 4x loops
+ # v15 - v18 - input states
+ # vs1 - vs9 - round keys
+ #
+.macro Loop_aes_middle4x
+ xxlor 19+32, 1, 1
+ xxlor 20+32, 2, 2
+ xxlor 21+32, 3, 3
+ xxlor 22+32, 4, 4
+
+ vcipher 15, 15, 19
+ vcipher 16, 16, 19
+ vcipher 17, 17, 19
+ vcipher 18, 18, 19
+
+ vcipher 15, 15, 20
+ vcipher 16, 16, 20
+ vcipher 17, 17, 20
+ vcipher 18, 18, 20
+
+ vcipher 15, 15, 21
+ vcipher 16, 16, 21
+ vcipher 17, 17, 21
+ vcipher 18, 18, 21
+
+ vcipher 15, 15, 22
+ vcipher 16, 16, 22
+ vcipher 17, 17, 22
+ vcipher 18, 18, 22
+
+ xxlor 19+32, 5, 5
+ xxlor 20+32, 6, 6
+ xxlor 21+32, 7, 7
+ xxlor 22+32, 8, 8
+
+ vcipher 15, 15, 19
+ vcipher 16, 16, 19
+ vcipher 17, 17, 19
+ vcipher 18, 18, 19
+
+ vcipher 15, 15, 20
+ vcipher 16, 16, 20
+ vcipher 17, 17, 20
+ vcipher 18, 18, 20
+
+ vcipher 15, 15, 21
+ vcipher 16, 16, 21
+ vcipher 17, 17, 21
+ vcipher 18, 18, 21
+
+ vcipher 15, 15, 22
+ vcipher 16, 16, 22
+ vcipher 17, 17, 22
+ vcipher 18, 18, 22
+
+ xxlor 23+32, 9, 9
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+.endm
+
+ # 8x loops
+ # v15 - v22 - input states
+ # vs1 - vs9 - round keys
+ #
+.macro Loop_aes_middle8x
+ xxlor 23+32, 1, 1
+ xxlor 24+32, 2, 2
+ xxlor 25+32, 3, 3
+ xxlor 26+32, 4, 4
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ vcipher 15, 15, 25
+ vcipher 16, 16, 25
+ vcipher 17, 17, 25
+ vcipher 18, 18, 25
+ vcipher 19, 19, 25
+ vcipher 20, 20, 25
+ vcipher 21, 21, 25
+ vcipher 22, 22, 25
+
+ vcipher 15, 15, 26
+ vcipher 16, 16, 26
+ vcipher 17, 17, 26
+ vcipher 18, 18, 26
+ vcipher 19, 19, 26
+ vcipher 20, 20, 26
+ vcipher 21, 21, 26
+ vcipher 22, 22, 26
+
+ xxlor 23+32, 5, 5
+ xxlor 24+32, 6, 6
+ xxlor 25+32, 7, 7
+ xxlor 26+32, 8, 8
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ vcipher 15, 15, 25
+ vcipher 16, 16, 25
+ vcipher 17, 17, 25
+ vcipher 18, 18, 25
+ vcipher 19, 19, 25
+ vcipher 20, 20, 25
+ vcipher 21, 21, 25
+ vcipher 22, 22, 25
+
+ vcipher 15, 15, 26
+ vcipher 16, 16, 26
+ vcipher 17, 17, 26
+ vcipher 18, 18, 26
+ vcipher 19, 19, 26
+ vcipher 20, 20, 26
+ vcipher 21, 21, 26
+ vcipher 22, 22, 26
+
+ xxlor 23+32, 9, 9
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+.endm
+
+.macro Loop_aes_middle_1x
+ xxlor 19+32, 1, 1
+ xxlor 20+32, 2, 2
+ xxlor 21+32, 3, 3
+ xxlor 22+32, 4, 4
+
+ vcipher 15, 15, 19
+ vcipher 15, 15, 20
+ vcipher 15, 15, 21
+ vcipher 15, 15, 22
+
+ xxlor 19+32, 5, 5
+ xxlor 20+32, 6, 6
+ xxlor 21+32, 7, 7
+ xxlor 22+32, 8, 8
+
+ vcipher 15, 15, 19
+ vcipher 15, 15, 20
+ vcipher 15, 15, 21
+ vcipher 15, 15, 22
+
+ xxlor 19+32, 9, 9
+ vcipher 15, 15, 19
+.endm
+
+ #
+ # Compute 4x hash values based on Karatsuba method.
+ #
+.macro ppc_aes_gcm_ghash
+ vxor 15, 15, 0
+
+ vpmsumd 23, 12, 15 # H4.L * X.L
+ vpmsumd 24, 9, 16
+ vpmsumd 25, 6, 17
+ vpmsumd 26, 3, 18
+
+ vxor 23, 23, 24
+ vxor 23, 23, 25
+ vxor 23, 23, 26 # L
+
+ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
+ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 26, 7, 17
+ vpmsumd 27, 4, 18
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+ vxor 24, 24, 27 # M
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 29, 29, 29
+ vsldoi 26, 24, 29, 8 # mL
+ vsldoi 29, 29, 24, 8 # mH
+ vxor 23, 23, 26 # mL + L
+
+ vsldoi 23, 23, 23, 8 # swap
+ vxor 23, 23, 28
+
+ vpmsumd 24, 14, 15 # H4.H * X.H
+ vpmsumd 25, 11, 16
+ vpmsumd 26, 8, 17
+ vpmsumd 27, 5, 18
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+ vxor 24, 24, 27
+
+ vxor 24, 24, 29
+
+ # sum hash and reduction with H Poly
+ vsldoi 27, 23, 23, 8 # swap
+ vpmsumd 23, 23, 2
+ vxor 27, 27, 24
+ vxor 23, 23, 27
+
+ xxlor 32, 23+32, 23+32 # update hash
+
+.endm
+
+ #
+ # Combine two 4x ghash
+ # v15 - v22 - input blocks
+ #
+.macro ppc_aes_gcm_ghash2_4x
+ # first 4x hash
+ vxor 15, 15, 0 # Xi + X
+
+ vpmsumd 23, 12, 15 # H4.L * X.L
+ vpmsumd 24, 9, 16
+ vpmsumd 25, 6, 17
+ vpmsumd 26, 3, 18
+
+ vxor 23, 23, 24
+ vxor 23, 23, 25
+ vxor 23, 23, 26 # L
+
+ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
+ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 26, 7, 17
+ vpmsumd 27, 4, 18
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 29, 29, 29
+
+ vxor 24, 24, 27 # M
+ vsldoi 26, 24, 29, 8 # mL
+ vsldoi 29, 29, 24, 8 # mH
+ vxor 23, 23, 26 # mL + L
+
+ vsldoi 23, 23, 23, 8 # swap
+ vxor 23, 23, 28
+
+ vpmsumd 24, 14, 15 # H4.H * X.H
+ vpmsumd 25, 11, 16
+ vpmsumd 26, 8, 17
+ vpmsumd 27, 5, 18
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+ vxor 24, 24, 27 # H
+
+ vxor 24, 24, 29 # H + mH
+
+ # sum hash and reduction with H Poly
+ vsldoi 27, 23, 23, 8 # swap
+ vpmsumd 23, 23, 2
+ vxor 27, 27, 24
+ vxor 27, 23, 27 # 1st Xi
+
+ # 2nd 4x hash
+ vpmsumd 24, 9, 20
+ vpmsumd 25, 6, 21
+ vpmsumd 26, 3, 22
+ vxor 19, 19, 27 # Xi + X
+ vpmsumd 23, 12, 19 # H4.L * X.L
+
+ vxor 23, 23, 24
+ vxor 23, 23, 25
+ vxor 23, 23, 26 # L
+
+ vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
+ vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 26, 7, 21
+ vpmsumd 27, 4, 22
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 29, 29, 29
+
+ vxor 24, 24, 27 # M
+ vsldoi 26, 24, 29, 8 # mL
+ vsldoi 29, 29, 24, 8 # mH
+ vxor 23, 23, 26 # mL + L
+
+ vsldoi 23, 23, 23, 8 # swap
+ vxor 23, 23, 28
+
+ vpmsumd 24, 14, 19 # H4.H * X.H
+ vpmsumd 25, 11, 20
+ vpmsumd 26, 8, 21
+ vpmsumd 27, 5, 22
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+ vxor 24, 24, 27 # H
+
+ vxor 24, 24, 29 # H + mH
+
+ # sum hash and reduction with H Poly
+ vsldoi 27, 23, 23, 8 # swap
+ vpmsumd 23, 23, 2
+ vxor 27, 27, 24
+ vxor 23, 23, 27
+
+ xxlor 32, 23+32, 23+32 # update hash
+
+.endm
+
+ #
+ # Compute update single hash
+ #
+.macro ppc_update_hash_1x
+ vxor 28, 28, 0
+
+ vxor 19, 19, 19
+
+ vpmsumd 22, 3, 28 # L
+ vpmsumd 23, 4, 28 # M
+ vpmsumd 24, 5, 28 # H
+
+ vpmsumd 27, 22, 2 # reduction
+
+ vsldoi 25, 23, 19, 8 # mL
+ vsldoi 26, 19, 23, 8 # mH
+ vxor 22, 22, 25 # LL + LL
+ vxor 24, 24, 26 # HH + HH
+
+ vsldoi 22, 22, 22, 8 # swap
+ vxor 22, 22, 27
+
+ vsldoi 20, 22, 22, 8 # swap
+ vpmsumd 22, 22, 2 # reduction
+ vxor 20, 20, 24
+ vxor 22, 22, 20
+
+ vmr 0, 22 # update hash
+
+.endm
+
+.macro SAVE_REGS
+ stdu 1,-640(1)
+ mflr 0
+
+ std 14,112(1)
+ std 15,120(1)
+ std 16,128(1)
+ std 17,136(1)
+ std 18,144(1)
+ std 19,152(1)
+ std 20,160(1)
+ std 21,168(1)
+ li 9, 256
+ stvx 20, 9, 1
+ addi 9, 9, 16
+ stvx 21, 9, 1
+ addi 9, 9, 16
+ stvx 22, 9, 1
+ addi 9, 9, 16
+ stvx 23, 9, 1
+ addi 9, 9, 16
+ stvx 24, 9, 1
+ addi 9, 9, 16
+ stvx 25, 9, 1
+ addi 9, 9, 16
+ stvx 26, 9, 1
+ addi 9, 9, 16
+ stvx 27, 9, 1
+ addi 9, 9, 16
+ stvx 28, 9, 1
+ addi 9, 9, 16
+ stvx 29, 9, 1
+ addi 9, 9, 16
+ stvx 30, 9, 1
+ addi 9, 9, 16
+ stvx 31, 9, 1
+ stxv 14, 464(1)
+ stxv 15, 480(1)
+ stxv 16, 496(1)
+ stxv 17, 512(1)
+ stxv 18, 528(1)
+ stxv 19, 544(1)
+ stxv 20, 560(1)
+ stxv 21, 576(1)
+ stxv 22, 592(1)
+ std 0, 656(1)
+.endm
+
+.macro RESTORE_REGS
+ lxv 14, 464(1)
+ lxv 15, 480(1)
+ lxv 16, 496(1)
+ lxv 17, 512(1)
+ lxv 18, 528(1)
+ lxv 19, 544(1)
+ lxv 20, 560(1)
+ lxv 21, 576(1)
+ lxv 22, 592(1)
+ li 9, 256
+ lvx 20, 9, 1
+ addi 9, 9, 16
+ lvx 21, 9, 1
+ addi 9, 9, 16
+ lvx 22, 9, 1
+ addi 9, 9, 16
+ lvx 23, 9, 1
+ addi 9, 9, 16
+ lvx 24, 9, 1
+ addi 9, 9, 16
+ lvx 25, 9, 1
+ addi 9, 9, 16
+ lvx 26, 9, 1
+ addi 9, 9, 16
+ lvx 27, 9, 1
+ addi 9, 9, 16
+ lvx 28, 9, 1
+ addi 9, 9, 16
+ lvx 29, 9, 1
+ addi 9, 9, 16
+ lvx 30, 9, 1
+ addi 9, 9, 16
+ lvx 31, 9, 1
+
+ ld 0, 656(1)
+ ld 14,112(1)
+ ld 15,120(1)
+ ld 16,128(1)
+ ld 17,136(1)
+ ld 18,144(1)
+ ld 19,152(1)
+ ld 20,160(1)
+ ld 21,168(1)
+
+ mtlr 0
+ addi 1, 1, 640
+.endm
+
+.macro LOAD_HASH_TABLE
+ # Load Xi
+ lxvb16x 32, 0, 8 # load Xi
+
+ # load Hash - h^4, h^3, h^2, h
+ li 10, 32
+ lxvd2x 2+32, 10, 8 # H Poli
+ li 10, 48
+ lxvd2x 3+32, 10, 8 # Hl
+ li 10, 64
+ lxvd2x 4+32, 10, 8 # H
+ li 10, 80
+ lxvd2x 5+32, 10, 8 # Hh
+
+ li 10, 96
+ lxvd2x 6+32, 10, 8 # H^2l
+ li 10, 112
+ lxvd2x 7+32, 10, 8 # H^2
+ li 10, 128
+ lxvd2x 8+32, 10, 8 # H^2h
+
+ li 10, 144
+ lxvd2x 9+32, 10, 8 # H^3l
+ li 10, 160
+ lxvd2x 10+32, 10, 8 # H^3
+ li 10, 176
+ lxvd2x 11+32, 10, 8 # H^3h
+
+ li 10, 192
+ lxvd2x 12+32, 10, 8 # H^4l
+ li 10, 208
+ lxvd2x 13+32, 10, 8 # H^4
+ li 10, 224
+ lxvd2x 14+32, 10, 8 # H^4h
+.endm
+
+ #
+ # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
+ # const char *rk, unsigned char iv[16], void *Xip);
+ #
+ # r3 - inp
+ # r4 - out
+ # r5 - len
+ # r6 - AES round keys
+ # r7 - iv and other data
+ # r8 - Xi, HPoli, hash keys
+ #
+ # rounds is at offset 240 in rk
+ # Xi is at 0 in gcm_table (Xip).
+ #
+_GLOBAL(aes_p10_gcm_encrypt)
+.align 5
+
+ SAVE_REGS
+
+ LOAD_HASH_TABLE
+
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
+
+ mr 12, 5 # length
+ li 11, 0 # block index
+
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
+
+ # load round key to VSR
+ lxv 0, 0(6)
+ lxv 1, 0x10(6)
+ lxv 2, 0x20(6)
+ lxv 3, 0x30(6)
+ lxv 4, 0x40(6)
+ lxv 5, 0x50(6)
+ lxv 6, 0x60(6)
+ lxv 7, 0x70(6)
+ lxv 8, 0x80(6)
+ lxv 9, 0x90(6)
+ lxv 10, 0xa0(6)
+
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 9,240(6)
+
+ #
+ # vxor state, state, w # addroundkey
+ xxlor 32+29, 0, 0
+ vxor 15, 30, 29 # IV + round key - add round key 0
+
+ cmpdi 9, 10
+ beq Loop_aes_gcm_8x
+
+ # load 2 more round keys (v11, v12)
+ lxv 11, 0xb0(6)
+ lxv 12, 0xc0(6)
+
+ cmpdi 9, 12
+ beq Loop_aes_gcm_8x
+
+ # load 2 more round keys (v11, v12, v13, v14)
+ lxv 13, 0xd0(6)
+ lxv 14, 0xe0(6)
+ cmpdi 9, 14
+ beq Loop_aes_gcm_8x
+
+ b aes_gcm_out
+
+.align 5
+Loop_aes_gcm_8x:
+ mr 14, 3
+ mr 9, 4
+
+ #
+ # check partial block
+ #
+Continue_partial_check:
+ ld 15, 56(7)
+ cmpdi 15, 0
+ beq Continue
+ bgt Final_block
+ cmpdi 15, 16
+ blt Final_block
+
+Continue:
+ # n blcoks
+ li 10, 128
+ divdu 10, 12, 10 # n 128 bytes-blocks
+ cmpdi 10, 0
+ beq Loop_last_block
+
+ vaddudm 30, 30, 31 # IV + counter
+ vxor 16, 30, 29
+ vaddudm 30, 30, 31
+ vxor 17, 30, 29
+ vaddudm 30, 30, 31
+ vxor 18, 30, 29
+ vaddudm 30, 30, 31
+ vxor 19, 30, 29
+ vaddudm 30, 30, 31
+ vxor 20, 30, 29
+ vaddudm 30, 30, 31
+ vxor 21, 30, 29
+ vaddudm 30, 30, 31
+ vxor 22, 30, 29
+
+ mtctr 10
+
+ li 15, 16
+ li 16, 32
+ li 17, 48
+ li 18, 64
+ li 19, 80
+ li 20, 96
+ li 21, 112
+
+ lwz 10, 240(6)
+
+Loop_8x_block:
+
+ lxvb16x 15, 0, 14 # load block
+ lxvb16x 16, 15, 14 # load block
+ lxvb16x 17, 16, 14 # load block
+ lxvb16x 18, 17, 14 # load block
+ lxvb16x 19, 18, 14 # load block
+ lxvb16x 20, 19, 14 # load block
+ lxvb16x 21, 20, 14 # load block
+ lxvb16x 22, 21, 14 # load block
+ addi 14, 14, 128
+
+ Loop_aes_middle8x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_next_ghash
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_next_ghash
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_next_ghash
+ b aes_gcm_out
+
+Do_next_ghash:
+
+ #
+ # last round
+ vcipherlast 15, 15, 23
+ vcipherlast 16, 16, 23
+
+ xxlxor 47, 47, 15
+ stxvb16x 47, 0, 9 # store output
+ xxlxor 48, 48, 16
+ stxvb16x 48, 15, 9 # store output
+
+ vcipherlast 17, 17, 23
+ vcipherlast 18, 18, 23
+
+ xxlxor 49, 49, 17
+ stxvb16x 49, 16, 9 # store output
+ xxlxor 50, 50, 18
+ stxvb16x 50, 17, 9 # store output
+
+ vcipherlast 19, 19, 23
+ vcipherlast 20, 20, 23
+
+ xxlxor 51, 51, 19
+ stxvb16x 51, 18, 9 # store output
+ xxlxor 52, 52, 20
+ stxvb16x 52, 19, 9 # store output
+
+ vcipherlast 21, 21, 23
+ vcipherlast 22, 22, 23
+
+ xxlxor 53, 53, 21
+ stxvb16x 53, 20, 9 # store output
+ xxlxor 54, 54, 22
+ stxvb16x 54, 21, 9 # store output
+
+ addi 9, 9, 128
+
+ # ghash here
+ ppc_aes_gcm_ghash2_4x
+
+ xxlor 27+32, 0, 0
+ vaddudm 30, 30, 31 # IV + counter
+ vmr 29, 30
+ vxor 15, 30, 27 # add round key
+ vaddudm 30, 30, 31
+ vxor 16, 30, 27
+ vaddudm 30, 30, 31
+ vxor 17, 30, 27
+ vaddudm 30, 30, 31
+ vxor 18, 30, 27
+ vaddudm 30, 30, 31
+ vxor 19, 30, 27
+ vaddudm 30, 30, 31
+ vxor 20, 30, 27
+ vaddudm 30, 30, 31
+ vxor 21, 30, 27
+ vaddudm 30, 30, 31
+ vxor 22, 30, 27
+
+ addi 12, 12, -128
+ addi 11, 11, 128
+
+ bdnz Loop_8x_block
+
+ vmr 30, 29
+ stxvb16x 30+32, 0, 7 # update IV
+
+Loop_last_block:
+ cmpdi 12, 0
+ beq aes_gcm_out
+
+ # loop last few blocks
+ li 10, 16
+ divdu 10, 12, 10
+
+ mtctr 10
+
+ lwz 10, 240(6)
+
+ cmpdi 12, 16
+ blt Final_block
+
+Next_rem_block:
+ lxvb16x 15, 0, 14 # load block
+
+ Loop_aes_middle_1x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_next_1x
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_next_1x
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_next_1x
+
+Do_next_1x:
+ vcipherlast 15, 15, 23
+
+ xxlxor 47, 47, 15
+ stxvb16x 47, 0, 9 # store output
+ addi 14, 14, 16
+ addi 9, 9, 16
+
+ vmr 28, 15
+ ppc_update_hash_1x
+
+ addi 12, 12, -16
+ addi 11, 11, 16
+ xxlor 19+32, 0, 0
+ vaddudm 30, 30, 31 # IV + counter
+ vxor 15, 30, 19 # add round key
+
+ bdnz Next_rem_block
+
+ li 15, 0
+ std 15, 56(7) # clear partial?
+ stxvb16x 30+32, 0, 7 # update IV
+ cmpdi 12, 0
+ beq aes_gcm_out
+
+Final_block:
+ lwz 10, 240(6)
+ Loop_aes_middle_1x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_final_1x
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_final_1x
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_final_1x
+
+Do_final_1x:
+ vcipherlast 15, 15, 23
+
+ # check partial block
+ li 21, 0 # encrypt
+ ld 15, 56(7) # partial?
+ cmpdi 15, 0
+ beq Normal_block
+ bl Do_partial_block
+
+ cmpdi 12, 0
+ ble aes_gcm_out
+
+ b Continue_partial_check
+
+Normal_block:
+ lxvb16x 15, 0, 14 # load last block
+ xxlxor 47, 47, 15
+
+ # create partial block mask
+ li 15, 16
+ sub 15, 15, 12 # index to the mask
+
+ vspltisb 16, -1 # first 16 bytes - 0xffff...ff
+ vspltisb 17, 0 # second 16 bytes - 0x0000...00
+ li 10, 192
+ stvx 16, 10, 1
+ addi 10, 10, 16
+ stvx 17, 10, 1
+
+ addi 10, 1, 192
+ lxvb16x 16, 15, 10 # load partial block mask
+ xxland 47, 47, 16
+
+ vmr 28, 15
+ ppc_update_hash_1x
+
+ # * should store only the remaining bytes.
+ bl Write_partial_block
+
+ stxvb16x 30+32, 0, 7 # update IV
+ std 12, 56(7) # update partial?
+ li 16, 16
+
+ stxvb16x 32, 0, 8 # write out Xi
+ stxvb16x 32, 16, 8 # write out Xi
+ b aes_gcm_out
+
+ #
+ # Compute data mask
+ #
+.macro GEN_MASK _mask _start _end
+ vspltisb 16, -1 # first 16 bytes - 0xffff...ff
+ vspltisb 17, 0 # second 16 bytes - 0x0000...00
+ li 10, 192
+ stxvb16x 17+32, 10, 1
+ add 10, 10, \_start
+ stxvb16x 16+32, 10, 1
+ add 10, 10, \_end
+ stxvb16x 17+32, 10, 1
+
+ addi 10, 1, 192
+ lxvb16x \_mask, 0, 10 # load partial block mask
+.endm
+
+ #
+ # Handle multiple partial blocks for encrypt and decrypt
+ # operations.
+ #
+SYM_FUNC_START_LOCAL(Do_partial_block)
+ add 17, 15, 5
+ cmpdi 17, 16
+ bgt Big_block
+ GEN_MASK 18, 15, 5
+ b _Partial
+SYM_FUNC_END(Do_partial_block)
+Big_block:
+ li 16, 16
+ GEN_MASK 18, 15, 16
+
+_Partial:
+ lxvb16x 17+32, 0, 14 # load last block
+ sldi 16, 15, 3
+ mtvsrdd 32+16, 0, 16
+ vsro 17, 17, 16
+ xxlxor 47, 47, 17+32
+ xxland 47, 47, 18
+
+ vxor 0, 0, 0 # clear Xi
+ vmr 28, 15
+
+ cmpdi 21, 0 # encrypt/decrypt ops?
+ beq Skip_decrypt
+ xxland 32+28, 32+17, 18
+
+Skip_decrypt:
+
+ ppc_update_hash_1x
+
+ li 16, 16
+ lxvb16x 32+29, 16, 8
+ vxor 0, 0, 29
+ stxvb16x 32, 0, 8 # save Xi
+ stxvb16x 32, 16, 8 # save Xi
+
+ # store partial block
+ # loop the rest of the stream if any
+ sldi 16, 15, 3
+ mtvsrdd 32+16, 0, 16
+ vslo 15, 15, 16
+ #stxvb16x 15+32, 0, 9 # last block
+
+ li 16, 16
+ sub 17, 16, 15 # 16 - partial
+
+ add 16, 15, 5
+ cmpdi 16, 16
+ bgt Larger_16
+ mr 17, 5
+Larger_16:
+
+ # write partial
+ li 10, 192
+ stxvb16x 15+32, 10, 1 # save current block
+
+ addi 10, 9, -1
+ addi 16, 1, 191
+ mtctr 17 # move partial byte count
+
+Write_last_partial:
+ lbzu 18, 1(16)
+ stbu 18, 1(10)
+ bdnz Write_last_partial
+ # Complete loop partial
+
+ add 14, 14, 17
+ add 9, 9, 17
+ sub 12, 12, 17
+ add 11, 11, 17
+
+ add 15, 15, 5
+ cmpdi 15, 16
+ blt Save_partial
+
+ vaddudm 30, 30, 31
+ stxvb16x 30+32, 0, 7 # update IV
+ xxlor 32+29, 0, 0
+ vxor 15, 30, 29 # IV + round key - add round key 0
+ li 15, 0
+ std 15, 56(7) # partial done - clear
+ b Partial_done
+Save_partial:
+ std 15, 56(7) # partial
+
+Partial_done:
+ blr
+
+ #
+ # Write partial block
+ # r9 - output
+ # r12 - remaining bytes
+ # v15 - partial input data
+ #
+SYM_FUNC_START_LOCAL(Write_partial_block)
+ li 10, 192
+ stxvb16x 15+32, 10, 1 # last block
+
+ addi 10, 9, -1
+ addi 16, 1, 191
+
+ mtctr 12 # remaining bytes
+ li 15, 0
+
+Write_last_byte:
+ lbzu 14, 1(16)
+ stbu 14, 1(10)
+ bdnz Write_last_byte
+ blr
+SYM_FUNC_END(Write_partial_block)
+
+aes_gcm_out:
+ # out = state
+ stxvb16x 32, 0, 8 # write out Xi
+ add 3, 11, 12 # return count
+
+ RESTORE_REGS
+ blr
+
+ #
+ # 8x Decrypt
+ #
+_GLOBAL(aes_p10_gcm_decrypt)
+.align 5
+
+ SAVE_REGS
+
+ LOAD_HASH_TABLE
+
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
+
+ mr 12, 5 # length
+ li 11, 0 # block index
+
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
+
+ # load round key to VSR
+ lxv 0, 0(6)
+ lxv 1, 0x10(6)
+ lxv 2, 0x20(6)
+ lxv 3, 0x30(6)
+ lxv 4, 0x40(6)
+ lxv 5, 0x50(6)
+ lxv 6, 0x60(6)
+ lxv 7, 0x70(6)
+ lxv 8, 0x80(6)
+ lxv 9, 0x90(6)
+ lxv 10, 0xa0(6)
+
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 9,240(6)
+
+ #
+ # vxor state, state, w # addroundkey
+ xxlor 32+29, 0, 0
+ vxor 15, 30, 29 # IV + round key - add round key 0
+
+ cmpdi 9, 10
+ beq Loop_aes_gcm_8x_dec
+
+ # load 2 more round keys (v11, v12)
+ lxv 11, 0xb0(6)
+ lxv 12, 0xc0(6)
+
+ cmpdi 9, 12
+ beq Loop_aes_gcm_8x_dec
+
+ # load 2 more round keys (v11, v12, v13, v14)
+ lxv 13, 0xd0(6)
+ lxv 14, 0xe0(6)
+ cmpdi 9, 14
+ beq Loop_aes_gcm_8x_dec
+
+ b aes_gcm_out
+
+.align 5
+Loop_aes_gcm_8x_dec:
+ mr 14, 3
+ mr 9, 4
+
+ #
+ # check partial block
+ #
+Continue_partial_check_dec:
+ ld 15, 56(7)
+ cmpdi 15, 0
+ beq Continue_dec
+ bgt Final_block_dec
+ cmpdi 15, 16
+ blt Final_block_dec
+
+Continue_dec:
+ # n blcoks
+ li 10, 128
+ divdu 10, 12, 10 # n 128 bytes-blocks
+ cmpdi 10, 0
+ beq Loop_last_block_dec
+
+ vaddudm 30, 30, 31 # IV + counter
+ vxor 16, 30, 29
+ vaddudm 30, 30, 31
+ vxor 17, 30, 29
+ vaddudm 30, 30, 31
+ vxor 18, 30, 29
+ vaddudm 30, 30, 31
+ vxor 19, 30, 29
+ vaddudm 30, 30, 31
+ vxor 20, 30, 29
+ vaddudm 30, 30, 31
+ vxor 21, 30, 29
+ vaddudm 30, 30, 31
+ vxor 22, 30, 29
+
+ mtctr 10
+
+ li 15, 16
+ li 16, 32
+ li 17, 48
+ li 18, 64
+ li 19, 80
+ li 20, 96
+ li 21, 112
+
+ lwz 10, 240(6)
+
+Loop_8x_block_dec:
+
+ lxvb16x 15, 0, 14 # load block
+ lxvb16x 16, 15, 14 # load block
+ lxvb16x 17, 16, 14 # load block
+ lxvb16x 18, 17, 14 # load block
+ lxvb16x 19, 18, 14 # load block
+ lxvb16x 20, 19, 14 # load block
+ lxvb16x 21, 20, 14 # load block
+ lxvb16x 22, 21, 14 # load block
+ addi 14, 14, 128
+
+ Loop_aes_middle8x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_next_ghash_dec
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_next_ghash_dec
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_next_ghash_dec
+ b aes_gcm_out
+
+Do_next_ghash_dec:
+
+ #
+ # last round
+ vcipherlast 15, 15, 23
+ vcipherlast 16, 16, 23
+
+ xxlxor 47, 47, 15
+ stxvb16x 47, 0, 9 # store output
+ xxlxor 48, 48, 16
+ stxvb16x 48, 15, 9 # store output
+
+ vcipherlast 17, 17, 23
+ vcipherlast 18, 18, 23
+
+ xxlxor 49, 49, 17
+ stxvb16x 49, 16, 9 # store output
+ xxlxor 50, 50, 18
+ stxvb16x 50, 17, 9 # store output
+
+ vcipherlast 19, 19, 23
+ vcipherlast 20, 20, 23
+
+ xxlxor 51, 51, 19
+ stxvb16x 51, 18, 9 # store output
+ xxlxor 52, 52, 20
+ stxvb16x 52, 19, 9 # store output
+
+ vcipherlast 21, 21, 23
+ vcipherlast 22, 22, 23
+
+ xxlxor 53, 53, 21
+ stxvb16x 53, 20, 9 # store output
+ xxlxor 54, 54, 22
+ stxvb16x 54, 21, 9 # store output
+
+ addi 9, 9, 128
+
+ xxlor 15+32, 15, 15
+ xxlor 16+32, 16, 16
+ xxlor 17+32, 17, 17
+ xxlor 18+32, 18, 18
+ xxlor 19+32, 19, 19
+ xxlor 20+32, 20, 20
+ xxlor 21+32, 21, 21
+ xxlor 22+32, 22, 22
+
+ # ghash here
+ ppc_aes_gcm_ghash2_4x
+
+ xxlor 27+32, 0, 0
+ vaddudm 30, 30, 31 # IV + counter
+ vmr 29, 30
+ vxor 15, 30, 27 # add round key
+ vaddudm 30, 30, 31
+ vxor 16, 30, 27
+ vaddudm 30, 30, 31
+ vxor 17, 30, 27
+ vaddudm 30, 30, 31
+ vxor 18, 30, 27
+ vaddudm 30, 30, 31
+ vxor 19, 30, 27
+ vaddudm 30, 30, 31
+ vxor 20, 30, 27
+ vaddudm 30, 30, 31
+ vxor 21, 30, 27
+ vaddudm 30, 30, 31
+ vxor 22, 30, 27
+
+ addi 12, 12, -128
+ addi 11, 11, 128
+
+ bdnz Loop_8x_block_dec
+
+ vmr 30, 29
+ stxvb16x 30+32, 0, 7 # update IV
+
+Loop_last_block_dec:
+ cmpdi 12, 0
+ beq aes_gcm_out
+
+ # loop last few blocks
+ li 10, 16
+ divdu 10, 12, 10
+
+ mtctr 10
+
+ lwz 10, 240(6)
+
+ cmpdi 12, 16
+ blt Final_block_dec
+
+Next_rem_block_dec:
+ lxvb16x 15, 0, 14 # load block
+
+ Loop_aes_middle_1x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_next_1x_dec
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_next_1x_dec
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_next_1x_dec
+
+Do_next_1x_dec:
+ vcipherlast 15, 15, 23
+
+ xxlxor 47, 47, 15
+ stxvb16x 47, 0, 9 # store output
+ addi 14, 14, 16
+ addi 9, 9, 16
+
+ xxlor 28+32, 15, 15
+ #vmr 28, 15
+ ppc_update_hash_1x
+
+ addi 12, 12, -16
+ addi 11, 11, 16
+ xxlor 19+32, 0, 0
+ vaddudm 30, 30, 31 # IV + counter
+ vxor 15, 30, 19 # add round key
+
+ bdnz Next_rem_block_dec
+
+ li 15, 0
+ std 15, 56(7) # clear partial?
+ stxvb16x 30+32, 0, 7 # update IV
+ cmpdi 12, 0
+ beq aes_gcm_out
+
+Final_block_dec:
+ lwz 10, 240(6)
+ Loop_aes_middle_1x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_final_1x_dec
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_final_1x_dec
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_final_1x_dec
+
+Do_final_1x_dec:
+ vcipherlast 15, 15, 23
+
+ # check partial block
+ li 21, 1 # decrypt
+ ld 15, 56(7) # partial?
+ cmpdi 15, 0
+ beq Normal_block_dec
+ bl Do_partial_block
+ cmpdi 12, 0
+ ble aes_gcm_out
+
+ b Continue_partial_check_dec
+
+Normal_block_dec:
+ lxvb16x 15, 0, 14 # load last block
+ xxlxor 47, 47, 15
+
+ # create partial block mask
+ li 15, 16
+ sub 15, 15, 12 # index to the mask
+
+ vspltisb 16, -1 # first 16 bytes - 0xffff...ff
+ vspltisb 17, 0 # second 16 bytes - 0x0000...00
+ li 10, 192
+ stvx 16, 10, 1
+ addi 10, 10, 16
+ stvx 17, 10, 1
+
+ addi 10, 1, 192
+ lxvb16x 16, 15, 10 # load partial block mask
+ xxland 47, 47, 16
+
+ xxland 32+28, 15, 16
+ #vmr 28, 15
+ ppc_update_hash_1x
+
+ # * should store only the remaining bytes.
+ bl Write_partial_block
+
+ stxvb16x 30+32, 0, 7 # update IV
+ std 12, 56(7) # update partial?
+ li 16, 16
+
+ stxvb16x 32, 0, 8 # write out Xi
+ stxvb16x 32, 16, 8 # write out Xi
+ b aes_gcm_out
diff --git a/arch/powerpc/crypto/aes-spe-glue.c b/arch/powerpc/crypto/aes-spe-glue.c
index 1fad5d4c658d..efab78a3a8f6 100644
--- a/arch/powerpc/crypto/aes-spe-glue.c
+++ b/arch/powerpc/crypto/aes-spe-glue.c
@@ -28,7 +28,7 @@
* instructions per clock cycle using one 32/64 bit unit (SU1) and one 32
* bit unit (SU2). One of these can be a memory access that is executed via
* a single load and store unit (LSU). XTS-AES-256 takes ~780 operations per
- * 16 byte block block or 25 cycles per byte. Thus 768 bytes of input data
+ * 16 byte block or 25 cycles per byte. Thus 768 bytes of input data
* will need an estimated maximum of 20,000 cycles. Headroom for cache misses
* included. Even with the low end model clocked at 667 MHz this equals to a
* critical time window of less than 30us. The value has been chosen to
@@ -94,13 +94,6 @@ static int ppc_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key,
{
struct ppc_aes_ctx *ctx = crypto_tfm_ctx(tfm);
- if (key_len != AES_KEYSIZE_128 &&
- key_len != AES_KEYSIZE_192 &&
- key_len != AES_KEYSIZE_256) {
- tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
-
switch (key_len) {
case AES_KEYSIZE_128:
ctx->rounds = 4;
@@ -114,6 +107,8 @@ static int ppc_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key,
ctx->rounds = 6;
ppc_expand_key_256(ctx->key_enc, in_key);
break;
+ default:
+ return -EINVAL;
}
ppc_generate_decrypt_key(ctx->key_dec, ctx->key_enc, key_len);
@@ -139,13 +134,6 @@ static int ppc_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
key_len >>= 1;
- if (key_len != AES_KEYSIZE_128 &&
- key_len != AES_KEYSIZE_192 &&
- key_len != AES_KEYSIZE_256) {
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
-
switch (key_len) {
case AES_KEYSIZE_128:
ctx->rounds = 4;
@@ -162,6 +150,8 @@ static int ppc_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
ppc_expand_key_256(ctx->key_enc, in_key);
ppc_expand_key_256(ctx->key_twk, in_key + AES_KEYSIZE_256);
break;
+ default:
+ return -EINVAL;
}
ppc_generate_decrypt_key(ctx->key_dec, ctx->key_enc, key_len);
@@ -414,7 +404,7 @@ static int ppc_xts_decrypt(struct skcipher_request *req)
/*
* Algorithm definitions. Disabling alignment (cra_alignmask=0) was chosen
- * because the e500 platform can handle unaligned reads/writes very efficently.
+ * because the e500 platform can handle unaligned reads/writes very efficiently.
* This improves IPsec thoughput by another few percent. Additionally we assume
* that AES context is always aligned to at least 8 bytes because it is created
* with kmalloc() in the crypto infrastructure
diff --git a/arch/powerpc/crypto/aes.c b/arch/powerpc/crypto/aes.c
new file mode 100644
index 000000000000..ec06189fbf99
--- /dev/null
+++ b/arch/powerpc/crypto/aes.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AES routines supporting VMX instructions on the Power 8
+ *
+ * Copyright (C) 2015 International Business Machines Inc.
+ *
+ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/crypto.h>
+#include <linux/delay.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <crypto/aes.h>
+#include <crypto/internal/cipher.h>
+#include <crypto/internal/simd.h>
+
+#include "aesp8-ppc.h"
+
+struct p8_aes_ctx {
+ struct crypto_cipher *fallback;
+ struct aes_key enc_key;
+ struct aes_key dec_key;
+};
+
+static int p8_aes_init(struct crypto_tfm *tfm)
+{
+ const char *alg = crypto_tfm_alg_name(tfm);
+ struct crypto_cipher *fallback;
+ struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ fallback = crypto_alloc_cipher(alg, 0, CRYPTO_ALG_NEED_FALLBACK);
+ if (IS_ERR(fallback)) {
+ printk(KERN_ERR
+ "Failed to allocate transformation for '%s': %ld\n",
+ alg, PTR_ERR(fallback));
+ return PTR_ERR(fallback);
+ }
+
+ crypto_cipher_set_flags(fallback,
+ crypto_cipher_get_flags((struct
+ crypto_cipher *)
+ tfm));
+ ctx->fallback = fallback;
+
+ return 0;
+}
+
+static void p8_aes_exit(struct crypto_tfm *tfm)
+{
+ struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (ctx->fallback) {
+ crypto_free_cipher(ctx->fallback);
+ ctx->fallback = NULL;
+ }
+}
+
+static int p8_aes_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ int ret;
+ struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
+ ret |= aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+
+ ret |= crypto_cipher_setkey(ctx->fallback, key, keylen);
+
+ return ret ? -EINVAL : 0;
+}
+
+static void p8_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (!crypto_simd_usable()) {
+ crypto_cipher_encrypt_one(ctx->fallback, dst, src);
+ } else {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ aes_p8_encrypt(src, dst, &ctx->enc_key);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+ }
+}
+
+static void p8_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (!crypto_simd_usable()) {
+ crypto_cipher_decrypt_one(ctx->fallback, dst, src);
+ } else {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ aes_p8_decrypt(src, dst, &ctx->dec_key);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+ }
+}
+
+struct crypto_alg p8_aes_alg = {
+ .cra_name = "aes",
+ .cra_driver_name = "p8_aes",
+ .cra_module = THIS_MODULE,
+ .cra_priority = 1000,
+ .cra_type = NULL,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_NEED_FALLBACK,
+ .cra_alignmask = 0,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct p8_aes_ctx),
+ .cra_init = p8_aes_init,
+ .cra_exit = p8_aes_exit,
+ .cra_cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = p8_aes_setkey,
+ .cia_encrypt = p8_aes_encrypt,
+ .cia_decrypt = p8_aes_decrypt,
+ },
+};
diff --git a/arch/powerpc/crypto/aes_cbc.c b/arch/powerpc/crypto/aes_cbc.c
new file mode 100644
index 000000000000..ed0debc7acb5
--- /dev/null
+++ b/arch/powerpc/crypto/aes_cbc.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AES CBC routines supporting VMX instructions on the Power 8
+ *
+ * Copyright (C) 2015 International Business Machines Inc.
+ *
+ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+
+#include "aesp8-ppc.h"
+
+struct p8_aes_cbc_ctx {
+ struct crypto_skcipher *fallback;
+ struct aes_key enc_key;
+ struct aes_key dec_key;
+};
+
+static int p8_aes_cbc_init(struct crypto_skcipher *tfm)
+{
+ struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_skcipher *fallback;
+
+ fallback = crypto_alloc_skcipher("cbc(aes)", 0,
+ CRYPTO_ALG_NEED_FALLBACK |
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(fallback)) {
+ pr_err("Failed to allocate cbc(aes) fallback: %ld\n",
+ PTR_ERR(fallback));
+ return PTR_ERR(fallback);
+ }
+
+ crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
+ crypto_skcipher_reqsize(fallback));
+ ctx->fallback = fallback;
+ return 0;
+}
+
+static void p8_aes_cbc_exit(struct crypto_skcipher *tfm)
+{
+ struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ crypto_free_skcipher(ctx->fallback);
+}
+
+static int p8_aes_cbc_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int ret;
+
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
+ ret |= aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+
+ ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen);
+
+ return ret ? -EINVAL : 0;
+}
+
+static int p8_aes_cbc_crypt(struct skcipher_request *req, int enc)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int ret;
+
+ if (!crypto_simd_usable()) {
+ struct skcipher_request *subreq = skcipher_request_ctx(req);
+
+ *subreq = *req;
+ skcipher_request_set_tfm(subreq, ctx->fallback);
+ return enc ? crypto_skcipher_encrypt(subreq) :
+ crypto_skcipher_decrypt(subreq);
+ }
+
+ ret = skcipher_walk_virt(&walk, req, false);
+ while ((nbytes = walk.nbytes) != 0) {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ aes_p8_cbc_encrypt(walk.src.virt.addr,
+ walk.dst.virt.addr,
+ round_down(nbytes, AES_BLOCK_SIZE),
+ enc ? &ctx->enc_key : &ctx->dec_key,
+ walk.iv, enc);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+
+ ret = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE);
+ }
+ return ret;
+}
+
+static int p8_aes_cbc_encrypt(struct skcipher_request *req)
+{
+ return p8_aes_cbc_crypt(req, 1);
+}
+
+static int p8_aes_cbc_decrypt(struct skcipher_request *req)
+{
+ return p8_aes_cbc_crypt(req, 0);
+}
+
+struct skcipher_alg p8_aes_cbc_alg = {
+ .base.cra_name = "cbc(aes)",
+ .base.cra_driver_name = "p8_aes_cbc",
+ .base.cra_module = THIS_MODULE,
+ .base.cra_priority = 2000,
+ .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct p8_aes_cbc_ctx),
+ .setkey = p8_aes_cbc_setkey,
+ .encrypt = p8_aes_cbc_encrypt,
+ .decrypt = p8_aes_cbc_decrypt,
+ .init = p8_aes_cbc_init,
+ .exit = p8_aes_cbc_exit,
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+};
diff --git a/arch/powerpc/crypto/aes_ctr.c b/arch/powerpc/crypto/aes_ctr.c
new file mode 100644
index 000000000000..9a3da8cd62f3
--- /dev/null
+++ b/arch/powerpc/crypto/aes_ctr.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AES CTR routines supporting VMX instructions on the Power 8
+ *
+ * Copyright (C) 2015 International Business Machines Inc.
+ *
+ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+
+#include "aesp8-ppc.h"
+
+struct p8_aes_ctr_ctx {
+ struct crypto_skcipher *fallback;
+ struct aes_key enc_key;
+};
+
+static int p8_aes_ctr_init(struct crypto_skcipher *tfm)
+{
+ struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_skcipher *fallback;
+
+ fallback = crypto_alloc_skcipher("ctr(aes)", 0,
+ CRYPTO_ALG_NEED_FALLBACK |
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(fallback)) {
+ pr_err("Failed to allocate ctr(aes) fallback: %ld\n",
+ PTR_ERR(fallback));
+ return PTR_ERR(fallback);
+ }
+
+ crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
+ crypto_skcipher_reqsize(fallback));
+ ctx->fallback = fallback;
+ return 0;
+}
+
+static void p8_aes_ctr_exit(struct crypto_skcipher *tfm)
+{
+ struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ crypto_free_skcipher(ctx->fallback);
+}
+
+static int p8_aes_ctr_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int ret;
+
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+
+ ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen);
+
+ return ret ? -EINVAL : 0;
+}
+
+static void p8_aes_ctr_final(const struct p8_aes_ctr_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[AES_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ aes_p8_encrypt(ctrblk, keystream, &ctx->enc_key);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+
+ crypto_xor_cpy(dst, keystream, src, nbytes);
+ crypto_inc(ctrblk, AES_BLOCK_SIZE);
+}
+
+static int p8_aes_ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int ret;
+
+ if (!crypto_simd_usable()) {
+ struct skcipher_request *subreq = skcipher_request_ctx(req);
+
+ *subreq = *req;
+ skcipher_request_set_tfm(subreq, ctx->fallback);
+ return crypto_skcipher_encrypt(subreq);
+ }
+
+ ret = skcipher_walk_virt(&walk, req, false);
+ while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ aes_p8_ctr32_encrypt_blocks(walk.src.virt.addr,
+ walk.dst.virt.addr,
+ nbytes / AES_BLOCK_SIZE,
+ &ctx->enc_key, walk.iv);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+
+ do {
+ crypto_inc(walk.iv, AES_BLOCK_SIZE);
+ } while ((nbytes -= AES_BLOCK_SIZE) >= AES_BLOCK_SIZE);
+
+ ret = skcipher_walk_done(&walk, nbytes);
+ }
+ if (nbytes) {
+ p8_aes_ctr_final(ctx, &walk);
+ ret = skcipher_walk_done(&walk, 0);
+ }
+ return ret;
+}
+
+struct skcipher_alg p8_aes_ctr_alg = {
+ .base.cra_name = "ctr(aes)",
+ .base.cra_driver_name = "p8_aes_ctr",
+ .base.cra_module = THIS_MODULE,
+ .base.cra_priority = 2000,
+ .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct p8_aes_ctr_ctx),
+ .setkey = p8_aes_ctr_setkey,
+ .encrypt = p8_aes_ctr_crypt,
+ .decrypt = p8_aes_ctr_crypt,
+ .init = p8_aes_ctr_init,
+ .exit = p8_aes_ctr_exit,
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+};
diff --git a/arch/powerpc/crypto/aes_xts.c b/arch/powerpc/crypto/aes_xts.c
new file mode 100644
index 000000000000..dabbccb41550
--- /dev/null
+++ b/arch/powerpc/crypto/aes_xts.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AES XTS routines supporting VMX In-core instructions on Power 8
+ *
+ * Copyright (C) 2015 International Business Machines Inc.
+ *
+ * Author: Leonidas S. Barbosa <leosilva@linux.vnet.ibm.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/xts.h>
+
+#include "aesp8-ppc.h"
+
+struct p8_aes_xts_ctx {
+ struct crypto_skcipher *fallback;
+ struct aes_key enc_key;
+ struct aes_key dec_key;
+ struct aes_key tweak_key;
+};
+
+static int p8_aes_xts_init(struct crypto_skcipher *tfm)
+{
+ struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_skcipher *fallback;
+
+ fallback = crypto_alloc_skcipher("xts(aes)", 0,
+ CRYPTO_ALG_NEED_FALLBACK |
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(fallback)) {
+ pr_err("Failed to allocate xts(aes) fallback: %ld\n",
+ PTR_ERR(fallback));
+ return PTR_ERR(fallback);
+ }
+
+ crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
+ crypto_skcipher_reqsize(fallback));
+ ctx->fallback = fallback;
+ return 0;
+}
+
+static void p8_aes_xts_exit(struct crypto_skcipher *tfm)
+{
+ struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ crypto_free_skcipher(ctx->fallback);
+}
+
+static int p8_aes_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int ret;
+
+ ret = xts_verify_key(tfm, key, keylen);
+ if (ret)
+ return ret;
+
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ ret = aes_p8_set_encrypt_key(key + keylen/2, (keylen/2) * 8, &ctx->tweak_key);
+ ret |= aes_p8_set_encrypt_key(key, (keylen/2) * 8, &ctx->enc_key);
+ ret |= aes_p8_set_decrypt_key(key, (keylen/2) * 8, &ctx->dec_key);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+
+ ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen);
+
+ return ret ? -EINVAL : 0;
+}
+
+static int p8_aes_xts_crypt(struct skcipher_request *req, int enc)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ u8 tweak[AES_BLOCK_SIZE];
+ int ret;
+
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+
+ if (!crypto_simd_usable() || (req->cryptlen % XTS_BLOCK_SIZE) != 0) {
+ struct skcipher_request *subreq = skcipher_request_ctx(req);
+
+ *subreq = *req;
+ skcipher_request_set_tfm(subreq, ctx->fallback);
+ return enc ? crypto_skcipher_encrypt(subreq) :
+ crypto_skcipher_decrypt(subreq);
+ }
+
+ ret = skcipher_walk_virt(&walk, req, false);
+ if (ret)
+ return ret;
+
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+
+ aes_p8_encrypt(walk.iv, tweak, &ctx->tweak_key);
+
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+
+ while ((nbytes = walk.nbytes) != 0) {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ if (enc)
+ aes_p8_xts_encrypt(walk.src.virt.addr,
+ walk.dst.virt.addr,
+ round_down(nbytes, AES_BLOCK_SIZE),
+ &ctx->enc_key, NULL, tweak);
+ else
+ aes_p8_xts_decrypt(walk.src.virt.addr,
+ walk.dst.virt.addr,
+ round_down(nbytes, AES_BLOCK_SIZE),
+ &ctx->dec_key, NULL, tweak);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+
+ ret = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE);
+ }
+ return ret;
+}
+
+static int p8_aes_xts_encrypt(struct skcipher_request *req)
+{
+ return p8_aes_xts_crypt(req, 1);
+}
+
+static int p8_aes_xts_decrypt(struct skcipher_request *req)
+{
+ return p8_aes_xts_crypt(req, 0);
+}
+
+struct skcipher_alg p8_aes_xts_alg = {
+ .base.cra_name = "xts(aes)",
+ .base.cra_driver_name = "p8_aes_xts",
+ .base.cra_module = THIS_MODULE,
+ .base.cra_priority = 2000,
+ .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct p8_aes_xts_ctx),
+ .setkey = p8_aes_xts_setkey,
+ .encrypt = p8_aes_xts_encrypt,
+ .decrypt = p8_aes_xts_decrypt,
+ .init = p8_aes_xts_init,
+ .exit = p8_aes_xts_exit,
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+};
diff --git a/arch/powerpc/crypto/aesp10-ppc.pl b/arch/powerpc/crypto/aesp10-ppc.pl
new file mode 100644
index 000000000000..2c06ce2a2c7c
--- /dev/null
+++ b/arch/powerpc/crypto/aesp10-ppc.pl
@@ -0,0 +1,585 @@
+#! /usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://www.openssl.org/~appro/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the CRYPTOGAMS nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for AES instructions as per PowerISA
+# specification version 2.07, first implemented by POWER8 processor.
+# The module is endian-agnostic in sense that it supports both big-
+# and little-endian cases. Data alignment in parallelizable modes is
+# handled with VSX loads and stores, which implies MSR.VSX flag being
+# set. It should also be noted that ISA specification doesn't prohibit
+# alignment exceptions for these instructions on page boundaries.
+# Initially alignment was handled in pure AltiVec/VMX way [when data
+# is aligned programmatically, which in turn guarantees exception-
+# free execution], but it turned to hamper performance when vcipher
+# instructions are interleaved. It's reckoned that eventual
+# misalignment penalties at page boundaries are in average lower
+# than additional overhead in pure AltiVec approach.
+#
+# May 2016
+#
+# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
+# systems were measured.
+#
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+# CBC en-/decrypt CTR XTS
+# POWER8[le] 3.96/0.72 0.74 1.1
+# POWER8[be] 3.75/0.65 0.66 1.0
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+ $UCMP ="cmpld";
+ $SHL ="sldi";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T =4;
+ $LRSAVE =$SIZE_T;
+ $STU ="stwu";
+ $POP ="lwz";
+ $PUSH ="stw";
+ $UCMP ="cmplw";
+ $SHL ="slwi";
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=8*$SIZE_T;
+$prefix="aes_p10";
+
+$sp="r1";
+$vrsave="r12";
+
+#########################################################################
+{{{ # Key setup procedures #
+my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
+my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
+my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
+
+$code.=<<___;
+.machine "any"
+
+.text
+
+.align 7
+rcon:
+.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev
+.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
+.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
+.long 0,0,0,0 ?asis
+Lconsts:
+ mflr r0
+ bcl 20,31,\$+4
+ mflr $ptr #vvvvv "distance between . and rcon
+ addi $ptr,$ptr,-0x48
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+
+.globl .${prefix}_set_encrypt_key
+Lset_encrypt_key:
+ mflr r11
+ $PUSH r11,$LRSAVE($sp)
+
+ li $ptr,-1
+ ${UCMP}i $inp,0
+ beq- Lenc_key_abort # if ($inp==0) return -1;
+ ${UCMP}i $out,0
+ beq- Lenc_key_abort # if ($out==0) return -1;
+ li $ptr,-2
+ cmpwi $bits,128
+ blt- Lenc_key_abort
+ cmpwi $bits,256
+ bgt- Lenc_key_abort
+ andi. r0,$bits,0x3f
+ bne- Lenc_key_abort
+
+ lis r0,0xfff0
+ mfspr $vrsave,256
+ mtspr 256,r0
+
+ bl Lconsts
+ mtlr r11
+
+ neg r9,$inp
+ lvx $in0,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ lvsr $key,0,r9 # borrow $key
+ li r8,0x20
+ cmpwi $bits,192
+ lvx $in1,0,$inp
+ le?vspltisb $mask,0x0f # borrow $mask
+ lvx $rcon,0,$ptr
+ le?vxor $key,$key,$mask # adjust for byte swap
+ lvx $mask,r8,$ptr
+ addi $ptr,$ptr,0x10
+ vperm $in0,$in0,$in1,$key # align [and byte swap in LE]
+ li $cnt,8
+ vxor $zero,$zero,$zero
+ mtctr $cnt
+
+ ?lvsr $outperm,0,$out
+ vspltisb $outmask,-1
+ lvx $outhead,0,$out
+ ?vperm $outmask,$zero,$outmask,$outperm
+
+ blt Loop128
+ addi $inp,$inp,8
+ beq L192
+ addi $inp,$inp,8
+ b L256
+
+.align 4
+Loop128:
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+ bdnz Loop128
+
+ lvx $rcon,0,$ptr # last two round keys
+
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vxor $in0,$in0,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,0x50
+
+ li $rounds,10
+ b Ldone
+
+.align 4
+L192:
+ lvx $tmp,0,$inp
+ li $cnt,4
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $out,$out,16
+ vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
+ vspltisb $key,8 # borrow $key
+ mtctr $cnt
+ vsububm $mask,$mask,$key # adjust the mask
+
+Loop192:
+ vperm $key,$in1,$in1,$mask # roate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vcipherlast $key,$key,$rcon
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+
+ vsldoi $stage,$zero,$in1,8
+ vspltw $tmp,$in0,3
+ vxor $tmp,$tmp,$in1
+ vsldoi $in1,$zero,$in1,12 # >>32
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in1,$in1,$tmp
+ vxor $in0,$in0,$key
+ vxor $in1,$in1,$key
+ vsldoi $stage,$stage,$in0,8
+
+ vperm $key,$in1,$in1,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$stage,$stage,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vsldoi $stage,$in0,$in1,8
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vperm $outtail,$stage,$stage,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vspltw $tmp,$in0,3
+ vxor $tmp,$tmp,$in1
+ vsldoi $in1,$zero,$in1,12 # >>32
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in1,$in1,$tmp
+ vxor $in0,$in0,$key
+ vxor $in1,$in1,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,16
+ bdnz Loop192
+
+ li $rounds,12
+ addi $out,$out,0x20
+ b Ldone
+
+.align 4
+L256:
+ lvx $tmp,0,$inp
+ li $cnt,7
+ li $rounds,14
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $out,$out,16
+ vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
+ mtctr $cnt
+
+Loop256:
+ vperm $key,$in1,$in1,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in1,$in1,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,16
+ bdz Ldone
+
+ vspltw $key,$in0,3 # just splat
+ vsldoi $tmp,$zero,$in1,12 # >>32
+ vsbox $key,$key
+
+ vxor $in1,$in1,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in1,$in1,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in1,$in1,$tmp
+
+ vxor $in1,$in1,$key
+ b Loop256
+
+.align 4
+Ldone:
+ lvx $in1,0,$inp # redundant in aligned case
+ vsel $in1,$outhead,$in1,$outmask
+ stvx $in1,0,$inp
+ li $ptr,0
+ mtspr 256,$vrsave
+ stw $rounds,0($out)
+
+Lenc_key_abort:
+ mr r3,$ptr
+ blr
+ .long 0
+ .byte 0,12,0x14,1,0,0,3,0
+ .long 0
+.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
+
+.globl .${prefix}_set_decrypt_key
+ $STU $sp,-$FRAME($sp)
+ mflr r10
+ $PUSH r10,$FRAME+$LRSAVE($sp)
+ bl Lset_encrypt_key
+ mtlr r10
+
+ cmpwi r3,0
+ bne- Ldec_key_abort
+
+ slwi $cnt,$rounds,4
+ subi $inp,$out,240 # first round key
+ srwi $rounds,$rounds,1
+ add $out,$inp,$cnt # last round key
+ mtctr $rounds
+
+Ldeckey:
+ lwz r0, 0($inp)
+ lwz r6, 4($inp)
+ lwz r7, 8($inp)
+ lwz r8, 12($inp)
+ addi $inp,$inp,16
+ lwz r9, 0($out)
+ lwz r10,4($out)
+ lwz r11,8($out)
+ lwz r12,12($out)
+ stw r0, 0($out)
+ stw r6, 4($out)
+ stw r7, 8($out)
+ stw r8, 12($out)
+ subi $out,$out,16
+ stw r9, -16($inp)
+ stw r10,-12($inp)
+ stw r11,-8($inp)
+ stw r12,-4($inp)
+ bdnz Ldeckey
+
+ xor r3,r3,r3 # return value
+Ldec_key_abort:
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,0,3,0
+ .long 0
+.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
+___
+}}}
+#########################################################################
+{{{ # Single block en- and decrypt procedures #
+sub gen_block () {
+my $dir = shift;
+my $n = $dir eq "de" ? "n" : "";
+my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
+
+$code.=<<___;
+.globl .${prefix}_${dir}crypt
+ lwz $rounds,240($key)
+ lis r0,0xfc00
+ mfspr $vrsave,256
+ li $idx,15 # 15 is not typo
+ mtspr 256,r0
+
+ lvx v0,0,$inp
+ neg r11,$out
+ lvx v1,$idx,$inp
+ lvsl v2,0,$inp # inpperm
+ le?vspltisb v4,0x0f
+ ?lvsl v3,0,r11 # outperm
+ le?vxor v2,v2,v4
+ li $idx,16
+ vperm v0,v0,v1,v2 # align [and byte swap in LE]
+ lvx v1,0,$key
+ ?lvsl v5,0,$key # keyperm
+ srwi $rounds,$rounds,1
+ lvx v2,$idx,$key
+ addi $idx,$idx,16
+ subi $rounds,$rounds,1
+ ?vperm v1,v1,v2,v5 # align round key
+
+ vxor v0,v0,v1
+ lvx v1,$idx,$key
+ addi $idx,$idx,16
+ mtctr $rounds
+
+Loop_${dir}c:
+ ?vperm v2,v2,v1,v5
+ v${n}cipher v0,v0,v2
+ lvx v2,$idx,$key
+ addi $idx,$idx,16
+ ?vperm v1,v1,v2,v5
+ v${n}cipher v0,v0,v1
+ lvx v1,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_${dir}c
+
+ ?vperm v2,v2,v1,v5
+ v${n}cipher v0,v0,v2
+ lvx v2,$idx,$key
+ ?vperm v1,v1,v2,v5
+ v${n}cipherlast v0,v0,v1
+
+ vspltisb v2,-1
+ vxor v1,v1,v1
+ li $idx,15 # 15 is not typo
+ ?vperm v2,v1,v2,v3 # outmask
+ le?vxor v3,v3,v4
+ lvx v1,0,$out # outhead
+ vperm v0,v0,v0,v3 # rotate [and byte swap in LE]
+ vsel v1,v1,v0,v2
+ lvx v4,$idx,$out
+ stvx v1,0,$out
+ vsel v0,v0,v4,v2
+ stvx v0,$idx,$out
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
+.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+
+my $consts=1;
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ # constants table endian-specific conversion
+ if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
+ my $conv=$3;
+ my @bytes=();
+
+ # convert to endian-agnostic format
+ if ($1 eq "long") {
+ foreach (split(/,\s*/,$2)) {
+ my $l = /^0/?oct:int;
+ push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+ }
+ } else {
+ @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
+ }
+
+ # little-endian conversion
+ if ($flavour =~ /le$/o) {
+ SWITCH: for($conv) {
+ /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
+ /\?rev/ && do { @bytes=reverse(@bytes); last; };
+ }
+ }
+
+ #emit
+ print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+ next;
+ }
+ $consts=0 if (m/Lconsts:/o); # end of table
+
+ # instructions prefixed with '?' are endian-specific and need
+ # to be adjusted accordingly...
+ if ($flavour =~ /le$/o) { # little-endian
+ s/le\?//o or
+ s/be\?/#be#/o or
+ s/\?lvsr/lvsl/o or
+ s/\?lvsl/lvsr/o or
+ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+ s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+ s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+ } else { # big-endian
+ s/le\?/#le#/o or
+ s/be\?//o or
+ s/\?([a-z]+)/$1/o;
+ }
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/arch/powerpc/crypto/aesp8-ppc.h b/arch/powerpc/crypto/aesp8-ppc.h
new file mode 100644
index 000000000000..5764d4438388
--- /dev/null
+++ b/arch/powerpc/crypto/aesp8-ppc.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <crypto/aes.h>
+
+struct aes_key {
+ u8 key[AES_MAX_KEYLENGTH];
+ int rounds;
+};
+
+extern struct shash_alg p8_ghash_alg;
+extern struct crypto_alg p8_aes_alg;
+extern struct skcipher_alg p8_aes_cbc_alg;
+extern struct skcipher_alg p8_aes_ctr_alg;
+extern struct skcipher_alg p8_aes_xts_alg;
+
+int aes_p8_set_encrypt_key(const u8 *userKey, const int bits,
+ struct aes_key *key);
+int aes_p8_set_decrypt_key(const u8 *userKey, const int bits,
+ struct aes_key *key);
+void aes_p8_encrypt(const u8 *in, u8 *out, const struct aes_key *key);
+void aes_p8_decrypt(const u8 *in, u8 *out, const struct aes_key *key);
+void aes_p8_cbc_encrypt(const u8 *in, u8 *out, size_t len,
+ const struct aes_key *key, u8 *iv, const int enc);
+void aes_p8_ctr32_encrypt_blocks(const u8 *in, u8 *out,
+ size_t len, const struct aes_key *key,
+ const u8 *iv);
+void aes_p8_xts_encrypt(const u8 *in, u8 *out, size_t len,
+ const struct aes_key *key1, const struct aes_key *key2, u8 *iv);
+void aes_p8_xts_decrypt(const u8 *in, u8 *out, size_t len,
+ const struct aes_key *key1, const struct aes_key *key2, u8 *iv);
diff --git a/arch/powerpc/crypto/aesp8-ppc.pl b/arch/powerpc/crypto/aesp8-ppc.pl
new file mode 100644
index 000000000000..f729589d792e
--- /dev/null
+++ b/arch/powerpc/crypto/aesp8-ppc.pl
@@ -0,0 +1,3889 @@
+#! /usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://www.openssl.org/~appro/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the CRYPTOGAMS nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for AES instructions as per PowerISA
+# specification version 2.07, first implemented by POWER8 processor.
+# The module is endian-agnostic in sense that it supports both big-
+# and little-endian cases. Data alignment in parallelizable modes is
+# handled with VSX loads and stores, which implies MSR.VSX flag being
+# set. It should also be noted that ISA specification doesn't prohibit
+# alignment exceptions for these instructions on page boundaries.
+# Initially alignment was handled in pure AltiVec/VMX way [when data
+# is aligned programmatically, which in turn guarantees exception-
+# free execution], but it turned to hamper performance when vcipher
+# instructions are interleaved. It's reckoned that eventual
+# misalignment penalties at page boundaries are in average lower
+# than additional overhead in pure AltiVec approach.
+#
+# May 2016
+#
+# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
+# systems were measured.
+#
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+# CBC en-/decrypt CTR XTS
+# POWER8[le] 3.96/0.72 0.74 1.1
+# POWER8[be] 3.75/0.65 0.66 1.0
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+ $UCMP ="cmpld";
+ $SHL ="sldi";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T =4;
+ $LRSAVE =$SIZE_T;
+ $STU ="stwu";
+ $POP ="lwz";
+ $PUSH ="stw";
+ $UCMP ="cmplw";
+ $SHL ="slwi";
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=8*$SIZE_T;
+$prefix="aes_p8";
+
+$sp="r1";
+$vrsave="r12";
+
+#########################################################################
+{{{ # Key setup procedures #
+my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
+my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
+my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
+
+$code.=<<___;
+.machine "any"
+
+.text
+
+.align 7
+rcon:
+.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev
+.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
+.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
+.long 0,0,0,0 ?asis
+.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
+Lconsts:
+ mflr r0
+ bcl 20,31,\$+4
+ mflr $ptr #vvvvv "distance between . and rcon
+ addi $ptr,$ptr,-0x58
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+
+.globl .${prefix}_set_encrypt_key
+Lset_encrypt_key:
+ mflr r11
+ $PUSH r11,$LRSAVE($sp)
+
+ li $ptr,-1
+ ${UCMP}i $inp,0
+ beq- Lenc_key_abort # if ($inp==0) return -1;
+ ${UCMP}i $out,0
+ beq- Lenc_key_abort # if ($out==0) return -1;
+ li $ptr,-2
+ cmpwi $bits,128
+ blt- Lenc_key_abort
+ cmpwi $bits,256
+ bgt- Lenc_key_abort
+ andi. r0,$bits,0x3f
+ bne- Lenc_key_abort
+
+ lis r0,0xfff0
+ mfspr $vrsave,256
+ mtspr 256,r0
+
+ bl Lconsts
+ mtlr r11
+
+ neg r9,$inp
+ lvx $in0,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ lvsr $key,0,r9 # borrow $key
+ li r8,0x20
+ cmpwi $bits,192
+ lvx $in1,0,$inp
+ le?vspltisb $mask,0x0f # borrow $mask
+ lvx $rcon,0,$ptr
+ le?vxor $key,$key,$mask # adjust for byte swap
+ lvx $mask,r8,$ptr
+ addi $ptr,$ptr,0x10
+ vperm $in0,$in0,$in1,$key # align [and byte swap in LE]
+ li $cnt,8
+ vxor $zero,$zero,$zero
+ mtctr $cnt
+
+ ?lvsr $outperm,0,$out
+ vspltisb $outmask,-1
+ lvx $outhead,0,$out
+ ?vperm $outmask,$zero,$outmask,$outperm
+
+ blt Loop128
+ addi $inp,$inp,8
+ beq L192
+ addi $inp,$inp,8
+ b L256
+
+.align 4
+Loop128:
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+ bdnz Loop128
+
+ lvx $rcon,0,$ptr # last two round keys
+
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vxor $in0,$in0,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,0x50
+
+ li $rounds,10
+ b Ldone
+
+.align 4
+L192:
+ lvx $tmp,0,$inp
+ li $cnt,4
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $out,$out,16
+ vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
+ vspltisb $key,8 # borrow $key
+ mtctr $cnt
+ vsububm $mask,$mask,$key # adjust the mask
+
+Loop192:
+ vperm $key,$in1,$in1,$mask # roate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vcipherlast $key,$key,$rcon
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+
+ vsldoi $stage,$zero,$in1,8
+ vspltw $tmp,$in0,3
+ vxor $tmp,$tmp,$in1
+ vsldoi $in1,$zero,$in1,12 # >>32
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in1,$in1,$tmp
+ vxor $in0,$in0,$key
+ vxor $in1,$in1,$key
+ vsldoi $stage,$stage,$in0,8
+
+ vperm $key,$in1,$in1,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$stage,$stage,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vsldoi $stage,$in0,$in1,8
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vperm $outtail,$stage,$stage,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vspltw $tmp,$in0,3
+ vxor $tmp,$tmp,$in1
+ vsldoi $in1,$zero,$in1,12 # >>32
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in1,$in1,$tmp
+ vxor $in0,$in0,$key
+ vxor $in1,$in1,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,16
+ bdnz Loop192
+
+ li $rounds,12
+ addi $out,$out,0x20
+ b Ldone
+
+.align 4
+L256:
+ lvx $tmp,0,$inp
+ li $cnt,7
+ li $rounds,14
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $out,$out,16
+ vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
+ mtctr $cnt
+
+Loop256:
+ vperm $key,$in1,$in1,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in1,$in1,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,16
+ bdz Ldone
+
+ vspltw $key,$in0,3 # just splat
+ vsldoi $tmp,$zero,$in1,12 # >>32
+ vsbox $key,$key
+
+ vxor $in1,$in1,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in1,$in1,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in1,$in1,$tmp
+
+ vxor $in1,$in1,$key
+ b Loop256
+
+.align 4
+Ldone:
+ lvx $in1,0,$inp # redundant in aligned case
+ vsel $in1,$outhead,$in1,$outmask
+ stvx $in1,0,$inp
+ li $ptr,0
+ mtspr 256,$vrsave
+ stw $rounds,0($out)
+
+Lenc_key_abort:
+ mr r3,$ptr
+ blr
+ .long 0
+ .byte 0,12,0x14,1,0,0,3,0
+ .long 0
+.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
+
+.globl .${prefix}_set_decrypt_key
+ $STU $sp,-$FRAME($sp)
+ mflr r10
+ $PUSH r10,$FRAME+$LRSAVE($sp)
+ bl Lset_encrypt_key
+ mtlr r10
+
+ cmpwi r3,0
+ bne- Ldec_key_abort
+
+ slwi $cnt,$rounds,4
+ subi $inp,$out,240 # first round key
+ srwi $rounds,$rounds,1
+ add $out,$inp,$cnt # last round key
+ mtctr $rounds
+
+Ldeckey:
+ lwz r0, 0($inp)
+ lwz r6, 4($inp)
+ lwz r7, 8($inp)
+ lwz r8, 12($inp)
+ addi $inp,$inp,16
+ lwz r9, 0($out)
+ lwz r10,4($out)
+ lwz r11,8($out)
+ lwz r12,12($out)
+ stw r0, 0($out)
+ stw r6, 4($out)
+ stw r7, 8($out)
+ stw r8, 12($out)
+ subi $out,$out,16
+ stw r9, -16($inp)
+ stw r10,-12($inp)
+ stw r11,-8($inp)
+ stw r12,-4($inp)
+ bdnz Ldeckey
+
+ xor r3,r3,r3 # return value
+Ldec_key_abort:
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,0,3,0
+ .long 0
+.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
+___
+}}}
+#########################################################################
+{{{ # Single block en- and decrypt procedures #
+sub gen_block () {
+my $dir = shift;
+my $n = $dir eq "de" ? "n" : "";
+my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
+
+$code.=<<___;
+.globl .${prefix}_${dir}crypt
+ lwz $rounds,240($key)
+ lis r0,0xfc00
+ mfspr $vrsave,256
+ li $idx,15 # 15 is not typo
+ mtspr 256,r0
+
+ lvx v0,0,$inp
+ neg r11,$out
+ lvx v1,$idx,$inp
+ lvsl v2,0,$inp # inpperm
+ le?vspltisb v4,0x0f
+ ?lvsl v3,0,r11 # outperm
+ le?vxor v2,v2,v4
+ li $idx,16
+ vperm v0,v0,v1,v2 # align [and byte swap in LE]
+ lvx v1,0,$key
+ ?lvsl v5,0,$key # keyperm
+ srwi $rounds,$rounds,1
+ lvx v2,$idx,$key
+ addi $idx,$idx,16
+ subi $rounds,$rounds,1
+ ?vperm v1,v1,v2,v5 # align round key
+
+ vxor v0,v0,v1
+ lvx v1,$idx,$key
+ addi $idx,$idx,16
+ mtctr $rounds
+
+Loop_${dir}c:
+ ?vperm v2,v2,v1,v5
+ v${n}cipher v0,v0,v2
+ lvx v2,$idx,$key
+ addi $idx,$idx,16
+ ?vperm v1,v1,v2,v5
+ v${n}cipher v0,v0,v1
+ lvx v1,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_${dir}c
+
+ ?vperm v2,v2,v1,v5
+ v${n}cipher v0,v0,v2
+ lvx v2,$idx,$key
+ ?vperm v1,v1,v2,v5
+ v${n}cipherlast v0,v0,v1
+
+ vspltisb v2,-1
+ vxor v1,v1,v1
+ li $idx,15 # 15 is not typo
+ ?vperm v2,v1,v2,v3 # outmask
+ le?vxor v3,v3,v4
+ lvx v1,0,$out # outhead
+ vperm v0,v0,v0,v3 # rotate [and byte swap in LE]
+ vsel v1,v1,v0,v2
+ lvx v4,$idx,$out
+ stvx v1,0,$out
+ vsel v0,v0,v4,v2
+ stvx v0,$idx,$out
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
+.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+#########################################################################
+{{{ # CBC en- and decrypt procedures #
+my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
+ map("v$_",(4..10));
+$code.=<<___;
+.globl .${prefix}_cbc_encrypt
+ ${UCMP}i $len,16
+ bltlr-
+
+ cmpwi $enc,0 # test direction
+ lis r0,0xffe0
+ mfspr $vrsave,256
+ mtspr 256,r0
+
+ li $idx,15
+ vxor $rndkey0,$rndkey0,$rndkey0
+ le?vspltisb $tmp,0x0f
+
+ lvx $ivec,0,$ivp # load [unaligned] iv
+ lvsl $inpperm,0,$ivp
+ lvx $inptail,$idx,$ivp
+ le?vxor $inpperm,$inpperm,$tmp
+ vperm $ivec,$ivec,$inptail,$inpperm
+
+ neg r11,$inp
+ ?lvsl $keyperm,0,$key # prepare for unaligned key
+ lwz $rounds,240($key)
+
+ lvsr $inpperm,0,r11 # prepare for unaligned load
+ lvx $inptail,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ le?vxor $inpperm,$inpperm,$tmp
+
+ ?lvsr $outperm,0,$out # prepare for unaligned store
+ vspltisb $outmask,-1
+ lvx $outhead,0,$out
+ ?vperm $outmask,$rndkey0,$outmask,$outperm
+ le?vxor $outperm,$outperm,$tmp
+
+ srwi $rounds,$rounds,1
+ li $idx,16
+ subi $rounds,$rounds,1
+ beq Lcbc_dec
+
+Lcbc_enc:
+ vmr $inout,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ mtctr $rounds
+ subi $len,$len,16 # len-=16
+
+ lvx $rndkey0,0,$key
+ vperm $inout,$inout,$inptail,$inpperm
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ vxor $inout,$inout,$ivec
+
+Loop_cbc_enc:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_cbc_enc
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipherlast $ivec,$inout,$rndkey0
+ ${UCMP}i $len,16
+
+ vperm $tmp,$ivec,$ivec,$outperm
+ vsel $inout,$outhead,$tmp,$outmask
+ vmr $outhead,$tmp
+ stvx $inout,0,$out
+ addi $out,$out,16
+ bge Lcbc_enc
+
+ b Lcbc_done
+
+.align 4
+Lcbc_dec:
+ ${UCMP}i $len,128
+ bge _aesp8_cbc_decrypt8x
+ vmr $tmp,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ mtctr $rounds
+ subi $len,$len,16 # len-=16
+
+ lvx $rndkey0,0,$key
+ vperm $tmp,$tmp,$inptail,$inpperm
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$tmp,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+
+Loop_cbc_dec:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vncipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_cbc_dec
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vncipherlast $inout,$inout,$rndkey0
+ ${UCMP}i $len,16
+
+ vxor $inout,$inout,$ivec
+ vmr $ivec,$tmp
+ vperm $tmp,$inout,$inout,$outperm
+ vsel $inout,$outhead,$tmp,$outmask
+ vmr $outhead,$tmp
+ stvx $inout,0,$out
+ addi $out,$out,16
+ bge Lcbc_dec
+
+Lcbc_done:
+ addi $out,$out,-1
+ lvx $inout,0,$out # redundant in aligned case
+ vsel $inout,$outhead,$inout,$outmask
+ stvx $inout,0,$out
+
+ neg $enc,$ivp # write [unaligned] iv
+ li $idx,15 # 15 is not typo
+ vxor $rndkey0,$rndkey0,$rndkey0
+ vspltisb $outmask,-1
+ le?vspltisb $tmp,0x0f
+ ?lvsl $outperm,0,$enc
+ ?vperm $outmask,$rndkey0,$outmask,$outperm
+ le?vxor $outperm,$outperm,$tmp
+ lvx $outhead,0,$ivp
+ vperm $ivec,$ivec,$ivec,$outperm
+ vsel $inout,$outhead,$ivec,$outmask
+ lvx $inptail,$idx,$ivp
+ stvx $inout,0,$ivp
+ vsel $inout,$ivec,$inptail,$outmask
+ stvx $inout,$idx,$ivp
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,6,0
+ .long 0
+___
+#########################################################################
+{{ # Optimized CBC decrypt procedure #
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
+my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
+ # v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
+
+$code.=<<___;
+.align 5
+_aesp8_cbc_decrypt8x:
+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ li r10,`$FRAME+8*16+15`
+ li r11,`$FRAME+8*16+31`
+ stvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li $x10,0x10
+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ li $x20,0x20
+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ li $x30,0x30
+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ li $x40,0x40
+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ li $x50,0x50
+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ li $x60,0x60
+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ li $x70,0x70
+ mtspr 256,r0
+
+ subi $rounds,$rounds,3 # -4 in total
+ subi $len,$len,128 # bias
+
+ lvx $rndkey0,$x00,$key # load key schedule
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ lvx v31,$x00,$key
+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
+ addi $key_,$sp,$FRAME+15
+ mtctr $rounds
+
+Load_cbc_dec_key:
+ ?vperm v24,v30,v31,$keyperm
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ stvx v24,$x00,$key_ # off-load round[1]
+ ?vperm v25,v31,v30,$keyperm
+ lvx v31,$x00,$key
+ stvx v25,$x10,$key_ # off-load round[2]
+ addi $key_,$key_,0x20
+ bdnz Load_cbc_dec_key
+
+ lvx v26,$x10,$key
+ ?vperm v24,v30,v31,$keyperm
+ lvx v27,$x20,$key
+ stvx v24,$x00,$key_ # off-load round[3]
+ ?vperm v25,v31,v26,$keyperm
+ lvx v28,$x30,$key
+ stvx v25,$x10,$key_ # off-load round[4]
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ ?vperm v26,v26,v27,$keyperm
+ lvx v29,$x40,$key
+ ?vperm v27,v27,v28,$keyperm
+ lvx v30,$x50,$key
+ ?vperm v28,v28,v29,$keyperm
+ lvx v31,$x60,$key
+ ?vperm v29,v29,v30,$keyperm
+ lvx $out0,$x70,$key # borrow $out0
+ ?vperm v30,v30,v31,$keyperm
+ lvx v24,$x00,$key_ # pre-load round[1]
+ ?vperm v31,v31,$out0,$keyperm
+ lvx v25,$x10,$key_ # pre-load round[2]
+
+ #lvx $inptail,0,$inp # "caller" already did this
+ #addi $inp,$inp,15 # 15 is not typo
+ subi $inp,$inp,15 # undo "caller"
+
+ le?li $idx,8
+ lvx_u $in0,$x00,$inp # load first 8 "words"
+ le?lvsl $inpperm,0,$idx
+ le?vspltisb $tmp,0x0f
+ lvx_u $in1,$x10,$inp
+ le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
+ lvx_u $in2,$x20,$inp
+ le?vperm $in0,$in0,$in0,$inpperm
+ lvx_u $in3,$x30,$inp
+ le?vperm $in1,$in1,$in1,$inpperm
+ lvx_u $in4,$x40,$inp
+ le?vperm $in2,$in2,$in2,$inpperm
+ vxor $out0,$in0,$rndkey0
+ lvx_u $in5,$x50,$inp
+ le?vperm $in3,$in3,$in3,$inpperm
+ vxor $out1,$in1,$rndkey0
+ lvx_u $in6,$x60,$inp
+ le?vperm $in4,$in4,$in4,$inpperm
+ vxor $out2,$in2,$rndkey0
+ lvx_u $in7,$x70,$inp
+ addi $inp,$inp,0x80
+ le?vperm $in5,$in5,$in5,$inpperm
+ vxor $out3,$in3,$rndkey0
+ le?vperm $in6,$in6,$in6,$inpperm
+ vxor $out4,$in4,$rndkey0
+ le?vperm $in7,$in7,$in7,$inpperm
+ vxor $out5,$in5,$rndkey0
+ vxor $out6,$in6,$rndkey0
+ vxor $out7,$in7,$rndkey0
+
+ mtctr $rounds
+ b Loop_cbc_dec8x
+.align 5
+Loop_cbc_dec8x:
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_cbc_dec8x
+
+ subic $len,$len,128 # $len-=128
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+
+ subfe. r0,r0,r0 # borrow?-1:0
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+
+ and r0,r0,$len
+ vncipher $out0,$out0,v26
+ vncipher $out1,$out1,v26
+ vncipher $out2,$out2,v26
+ vncipher $out3,$out3,v26
+ vncipher $out4,$out4,v26
+ vncipher $out5,$out5,v26
+ vncipher $out6,$out6,v26
+ vncipher $out7,$out7,v26
+
+ add $inp,$inp,r0 # $inp is adjusted in such
+ # way that at exit from the
+ # loop inX-in7 are loaded
+ # with last "words"
+ vncipher $out0,$out0,v27
+ vncipher $out1,$out1,v27
+ vncipher $out2,$out2,v27
+ vncipher $out3,$out3,v27
+ vncipher $out4,$out4,v27
+ vncipher $out5,$out5,v27
+ vncipher $out6,$out6,v27
+ vncipher $out7,$out7,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vncipher $out0,$out0,v28
+ vncipher $out1,$out1,v28
+ vncipher $out2,$out2,v28
+ vncipher $out3,$out3,v28
+ vncipher $out4,$out4,v28
+ vncipher $out5,$out5,v28
+ vncipher $out6,$out6,v28
+ vncipher $out7,$out7,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ vncipher $out0,$out0,v29
+ vncipher $out1,$out1,v29
+ vncipher $out2,$out2,v29
+ vncipher $out3,$out3,v29
+ vncipher $out4,$out4,v29
+ vncipher $out5,$out5,v29
+ vncipher $out6,$out6,v29
+ vncipher $out7,$out7,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+
+ vncipher $out0,$out0,v30
+ vxor $ivec,$ivec,v31 # xor with last round key
+ vncipher $out1,$out1,v30
+ vxor $in0,$in0,v31
+ vncipher $out2,$out2,v30
+ vxor $in1,$in1,v31
+ vncipher $out3,$out3,v30
+ vxor $in2,$in2,v31
+ vncipher $out4,$out4,v30
+ vxor $in3,$in3,v31
+ vncipher $out5,$out5,v30
+ vxor $in4,$in4,v31
+ vncipher $out6,$out6,v30
+ vxor $in5,$in5,v31
+ vncipher $out7,$out7,v30
+ vxor $in6,$in6,v31
+
+ vncipherlast $out0,$out0,$ivec
+ vncipherlast $out1,$out1,$in0
+ lvx_u $in0,$x00,$inp # load next input block
+ vncipherlast $out2,$out2,$in1
+ lvx_u $in1,$x10,$inp
+ vncipherlast $out3,$out3,$in2
+ le?vperm $in0,$in0,$in0,$inpperm
+ lvx_u $in2,$x20,$inp
+ vncipherlast $out4,$out4,$in3
+ le?vperm $in1,$in1,$in1,$inpperm
+ lvx_u $in3,$x30,$inp
+ vncipherlast $out5,$out5,$in4
+ le?vperm $in2,$in2,$in2,$inpperm
+ lvx_u $in4,$x40,$inp
+ vncipherlast $out6,$out6,$in5
+ le?vperm $in3,$in3,$in3,$inpperm
+ lvx_u $in5,$x50,$inp
+ vncipherlast $out7,$out7,$in6
+ le?vperm $in4,$in4,$in4,$inpperm
+ lvx_u $in6,$x60,$inp
+ vmr $ivec,$in7
+ le?vperm $in5,$in5,$in5,$inpperm
+ lvx_u $in7,$x70,$inp
+ addi $inp,$inp,0x80
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $in6,$in6,$in6,$inpperm
+ vxor $out0,$in0,$rndkey0
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $in7,$in7,$in7,$inpperm
+ vxor $out1,$in1,$rndkey0
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ vxor $out2,$in2,$rndkey0
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ vxor $out3,$in3,$rndkey0
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ vxor $out4,$in4,$rndkey0
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x50,$out
+ vxor $out5,$in5,$rndkey0
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x60,$out
+ vxor $out6,$in6,$rndkey0
+ stvx_u $out7,$x70,$out
+ addi $out,$out,0x80
+ vxor $out7,$in7,$rndkey0
+
+ mtctr $rounds
+ beq Loop_cbc_dec8x # did $len-=128 borrow?
+
+ addic. $len,$len,128
+ beq Lcbc_dec8x_done
+ nop
+ nop
+
+Loop_cbc_dec8x_tail: # up to 7 "words" tail...
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_cbc_dec8x_tail
+
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+
+ vncipher $out1,$out1,v26
+ vncipher $out2,$out2,v26
+ vncipher $out3,$out3,v26
+ vncipher $out4,$out4,v26
+ vncipher $out5,$out5,v26
+ vncipher $out6,$out6,v26
+ vncipher $out7,$out7,v26
+
+ vncipher $out1,$out1,v27
+ vncipher $out2,$out2,v27
+ vncipher $out3,$out3,v27
+ vncipher $out4,$out4,v27
+ vncipher $out5,$out5,v27
+ vncipher $out6,$out6,v27
+ vncipher $out7,$out7,v27
+
+ vncipher $out1,$out1,v28
+ vncipher $out2,$out2,v28
+ vncipher $out3,$out3,v28
+ vncipher $out4,$out4,v28
+ vncipher $out5,$out5,v28
+ vncipher $out6,$out6,v28
+ vncipher $out7,$out7,v28
+
+ vncipher $out1,$out1,v29
+ vncipher $out2,$out2,v29
+ vncipher $out3,$out3,v29
+ vncipher $out4,$out4,v29
+ vncipher $out5,$out5,v29
+ vncipher $out6,$out6,v29
+ vncipher $out7,$out7,v29
+
+ vncipher $out1,$out1,v30
+ vxor $ivec,$ivec,v31 # last round key
+ vncipher $out2,$out2,v30
+ vxor $in1,$in1,v31
+ vncipher $out3,$out3,v30
+ vxor $in2,$in2,v31
+ vncipher $out4,$out4,v30
+ vxor $in3,$in3,v31
+ vncipher $out5,$out5,v30
+ vxor $in4,$in4,v31
+ vncipher $out6,$out6,v30
+ vxor $in5,$in5,v31
+ vncipher $out7,$out7,v30
+ vxor $in6,$in6,v31
+
+ cmplwi $len,32 # switch($len)
+ blt Lcbc_dec8x_one
+ nop
+ beq Lcbc_dec8x_two
+ cmplwi $len,64
+ blt Lcbc_dec8x_three
+ nop
+ beq Lcbc_dec8x_four
+ cmplwi $len,96
+ blt Lcbc_dec8x_five
+ nop
+ beq Lcbc_dec8x_six
+
+Lcbc_dec8x_seven:
+ vncipherlast $out1,$out1,$ivec
+ vncipherlast $out2,$out2,$in1
+ vncipherlast $out3,$out3,$in2
+ vncipherlast $out4,$out4,$in3
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out1,$out1,$out1,$inpperm
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x00,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x10,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x20,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x30,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x40,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x50,$out
+ stvx_u $out7,$x60,$out
+ addi $out,$out,0x70
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_six:
+ vncipherlast $out2,$out2,$ivec
+ vncipherlast $out3,$out3,$in2
+ vncipherlast $out4,$out4,$in3
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out2,$out2,$out2,$inpperm
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x00,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x10,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x20,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x30,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x40,$out
+ stvx_u $out7,$x50,$out
+ addi $out,$out,0x60
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_five:
+ vncipherlast $out3,$out3,$ivec
+ vncipherlast $out4,$out4,$in3
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out3,$out3,$out3,$inpperm
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x00,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x10,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x20,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x30,$out
+ stvx_u $out7,$x40,$out
+ addi $out,$out,0x50
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_four:
+ vncipherlast $out4,$out4,$ivec
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out4,$out4,$out4,$inpperm
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x00,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x10,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x20,$out
+ stvx_u $out7,$x30,$out
+ addi $out,$out,0x40
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_three:
+ vncipherlast $out5,$out5,$ivec
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out5,$out5,$out5,$inpperm
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x00,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x10,$out
+ stvx_u $out7,$x20,$out
+ addi $out,$out,0x30
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_two:
+ vncipherlast $out6,$out6,$ivec
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out6,$out6,$out6,$inpperm
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x00,$out
+ stvx_u $out7,$x10,$out
+ addi $out,$out,0x20
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_one:
+ vncipherlast $out7,$out7,$ivec
+ vmr $ivec,$in7
+
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out7,0,$out
+ addi $out,$out,0x10
+
+Lcbc_dec8x_done:
+ le?vperm $ivec,$ivec,$ivec,$inpperm
+ stvx_u $ivec,0,$ivp # write [unaligned] iv
+
+ li r10,`$FRAME+15`
+ li r11,`$FRAME+31`
+ stvx $inpperm,r10,$sp # wipe copies of round keys
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0x80,6,6,0
+ .long 0
+.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
+___
+}} }}}
+
+#########################################################################
+{{{ # CTR procedure[s] #
+
+####################### WARNING: Here be dragons! #######################
+#
+# This code is written as 'ctr32', based on a 32-bit counter used
+# upstream. The kernel does *not* use a 32-bit counter. The kernel uses
+# a 128-bit counter.
+#
+# This leads to subtle changes from the upstream code: the counter
+# is incremented with vaddu_q_m rather than vaddu_w_m. This occurs in
+# both the bulk (8 blocks at a time) path, and in the individual block
+# path. Be aware of this when doing updates.
+#
+# See:
+# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug")
+# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword")
+# https://github.com/openssl/openssl/pull/8942
+#
+#########################################################################
+my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
+ map("v$_",(4..11));
+my $dat=$tmp;
+
+$code.=<<___;
+.globl .${prefix}_ctr32_encrypt_blocks
+ ${UCMP}i $len,1
+ bltlr-
+
+ lis r0,0xfff0
+ mfspr $vrsave,256
+ mtspr 256,r0
+
+ li $idx,15
+ vxor $rndkey0,$rndkey0,$rndkey0
+ le?vspltisb $tmp,0x0f
+
+ lvx $ivec,0,$ivp # load [unaligned] iv
+ lvsl $inpperm,0,$ivp
+ lvx $inptail,$idx,$ivp
+ vspltisb $one,1
+ le?vxor $inpperm,$inpperm,$tmp
+ vperm $ivec,$ivec,$inptail,$inpperm
+ vsldoi $one,$rndkey0,$one,1
+
+ neg r11,$inp
+ ?lvsl $keyperm,0,$key # prepare for unaligned key
+ lwz $rounds,240($key)
+
+ lvsr $inpperm,0,r11 # prepare for unaligned load
+ lvx $inptail,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ le?vxor $inpperm,$inpperm,$tmp
+
+ srwi $rounds,$rounds,1
+ li $idx,16
+ subi $rounds,$rounds,1
+
+ ${UCMP}i $len,8
+ bge _aesp8_ctr32_encrypt8x
+
+ ?lvsr $outperm,0,$out # prepare for unaligned store
+ vspltisb $outmask,-1
+ lvx $outhead,0,$out
+ ?vperm $outmask,$rndkey0,$outmask,$outperm
+ le?vxor $outperm,$outperm,$tmp
+
+ lvx $rndkey0,0,$key
+ mtctr $rounds
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$ivec,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ b Loop_ctr32_enc
+
+.align 5
+Loop_ctr32_enc:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_ctr32_enc
+
+ vadduqm $ivec,$ivec,$one # Kernel change for 128-bit
+ vmr $dat,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ subic. $len,$len,1 # blocks--
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ vperm $dat,$dat,$inptail,$inpperm
+ li $idx,16
+ ?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm
+ lvx $rndkey0,0,$key
+ vxor $dat,$dat,$rndkey1 # last round key
+ vcipherlast $inout,$inout,$dat
+
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ vperm $inout,$inout,$inout,$outperm
+ vsel $dat,$outhead,$inout,$outmask
+ mtctr $rounds
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vmr $outhead,$inout
+ vxor $inout,$ivec,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ stvx $dat,0,$out
+ addi $out,$out,16
+ bne Loop_ctr32_enc
+
+ addi $out,$out,-1
+ lvx $inout,0,$out # redundant in aligned case
+ vsel $inout,$outhead,$inout,$outmask
+ stvx $inout,0,$out
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,6,0
+ .long 0
+___
+#########################################################################
+{{ # Optimized CTR procedure #
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
+my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
+ # v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
+my ($two,$three,$four)=($outhead,$outperm,$outmask);
+
+$code.=<<___;
+.align 5
+_aesp8_ctr32_encrypt8x:
+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ li r10,`$FRAME+8*16+15`
+ li r11,`$FRAME+8*16+31`
+ stvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li $x10,0x10
+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ li $x20,0x20
+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ li $x30,0x30
+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ li $x40,0x40
+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ li $x50,0x50
+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ li $x60,0x60
+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ li $x70,0x70
+ mtspr 256,r0
+
+ subi $rounds,$rounds,3 # -4 in total
+
+ lvx $rndkey0,$x00,$key # load key schedule
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ lvx v31,$x00,$key
+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
+ addi $key_,$sp,$FRAME+15
+ mtctr $rounds
+
+Load_ctr32_enc_key:
+ ?vperm v24,v30,v31,$keyperm
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ stvx v24,$x00,$key_ # off-load round[1]
+ ?vperm v25,v31,v30,$keyperm
+ lvx v31,$x00,$key
+ stvx v25,$x10,$key_ # off-load round[2]
+ addi $key_,$key_,0x20
+ bdnz Load_ctr32_enc_key
+
+ lvx v26,$x10,$key
+ ?vperm v24,v30,v31,$keyperm
+ lvx v27,$x20,$key
+ stvx v24,$x00,$key_ # off-load round[3]
+ ?vperm v25,v31,v26,$keyperm
+ lvx v28,$x30,$key
+ stvx v25,$x10,$key_ # off-load round[4]
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ ?vperm v26,v26,v27,$keyperm
+ lvx v29,$x40,$key
+ ?vperm v27,v27,v28,$keyperm
+ lvx v30,$x50,$key
+ ?vperm v28,v28,v29,$keyperm
+ lvx v31,$x60,$key
+ ?vperm v29,v29,v30,$keyperm
+ lvx $out0,$x70,$key # borrow $out0
+ ?vperm v30,v30,v31,$keyperm
+ lvx v24,$x00,$key_ # pre-load round[1]
+ ?vperm v31,v31,$out0,$keyperm
+ lvx v25,$x10,$key_ # pre-load round[2]
+
+ vadduqm $two,$one,$one
+ subi $inp,$inp,15 # undo "caller"
+ $SHL $len,$len,4
+
+ vadduqm $out1,$ivec,$one # counter values ...
+ vadduqm $out2,$ivec,$two # (do all ctr adds as 128-bit)
+ vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
+ le?li $idx,8
+ vadduqm $out3,$out1,$two
+ vxor $out1,$out1,$rndkey0
+ le?lvsl $inpperm,0,$idx
+ vadduqm $out4,$out2,$two
+ vxor $out2,$out2,$rndkey0
+ le?vspltisb $tmp,0x0f
+ vadduqm $out5,$out3,$two
+ vxor $out3,$out3,$rndkey0
+ le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
+ vadduqm $out6,$out4,$two
+ vxor $out4,$out4,$rndkey0
+ vadduqm $out7,$out5,$two
+ vxor $out5,$out5,$rndkey0
+ vadduqm $ivec,$out6,$two # next counter value
+ vxor $out6,$out6,$rndkey0
+ vxor $out7,$out7,$rndkey0
+
+ mtctr $rounds
+ b Loop_ctr32_enc8x
+.align 5
+Loop_ctr32_enc8x:
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+ vcipher $out5,$out5,v24
+ vcipher $out6,$out6,v24
+ vcipher $out7,$out7,v24
+Loop_ctr32_enc8x_middle:
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ vcipher $out5,$out5,v25
+ vcipher $out6,$out6,v25
+ vcipher $out7,$out7,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_ctr32_enc8x
+
+ subic r11,$len,256 # $len-256, borrow $key_
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+ vcipher $out5,$out5,v24
+ vcipher $out6,$out6,v24
+ vcipher $out7,$out7,v24
+
+ subfe r0,r0,r0 # borrow?-1:0
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ vcipher $out5,$out5,v25
+ vcipher $out6,$out6,v25
+ vcipher $out7,$out7,v25
+
+ and r0,r0,r11
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vcipher $out0,$out0,v26
+ vcipher $out1,$out1,v26
+ vcipher $out2,$out2,v26
+ vcipher $out3,$out3,v26
+ vcipher $out4,$out4,v26
+ vcipher $out5,$out5,v26
+ vcipher $out6,$out6,v26
+ vcipher $out7,$out7,v26
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ subic $len,$len,129 # $len-=129
+ vcipher $out0,$out0,v27
+ addi $len,$len,1 # $len-=128 really
+ vcipher $out1,$out1,v27
+ vcipher $out2,$out2,v27
+ vcipher $out3,$out3,v27
+ vcipher $out4,$out4,v27
+ vcipher $out5,$out5,v27
+ vcipher $out6,$out6,v27
+ vcipher $out7,$out7,v27
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+
+ vcipher $out0,$out0,v28
+ lvx_u $in0,$x00,$inp # load input
+ vcipher $out1,$out1,v28
+ lvx_u $in1,$x10,$inp
+ vcipher $out2,$out2,v28
+ lvx_u $in2,$x20,$inp
+ vcipher $out3,$out3,v28
+ lvx_u $in3,$x30,$inp
+ vcipher $out4,$out4,v28
+ lvx_u $in4,$x40,$inp
+ vcipher $out5,$out5,v28
+ lvx_u $in5,$x50,$inp
+ vcipher $out6,$out6,v28
+ lvx_u $in6,$x60,$inp
+ vcipher $out7,$out7,v28
+ lvx_u $in7,$x70,$inp
+ addi $inp,$inp,0x80
+
+ vcipher $out0,$out0,v29
+ le?vperm $in0,$in0,$in0,$inpperm
+ vcipher $out1,$out1,v29
+ le?vperm $in1,$in1,$in1,$inpperm
+ vcipher $out2,$out2,v29
+ le?vperm $in2,$in2,$in2,$inpperm
+ vcipher $out3,$out3,v29
+ le?vperm $in3,$in3,$in3,$inpperm
+ vcipher $out4,$out4,v29
+ le?vperm $in4,$in4,$in4,$inpperm
+ vcipher $out5,$out5,v29
+ le?vperm $in5,$in5,$in5,$inpperm
+ vcipher $out6,$out6,v29
+ le?vperm $in6,$in6,$in6,$inpperm
+ vcipher $out7,$out7,v29
+ le?vperm $in7,$in7,$in7,$inpperm
+
+ add $inp,$inp,r0 # $inp is adjusted in such
+ # way that at exit from the
+ # loop inX-in7 are loaded
+ # with last "words"
+ subfe. r0,r0,r0 # borrow?-1:0
+ vcipher $out0,$out0,v30
+ vxor $in0,$in0,v31 # xor with last round key
+ vcipher $out1,$out1,v30
+ vxor $in1,$in1,v31
+ vcipher $out2,$out2,v30
+ vxor $in2,$in2,v31
+ vcipher $out3,$out3,v30
+ vxor $in3,$in3,v31
+ vcipher $out4,$out4,v30
+ vxor $in4,$in4,v31
+ vcipher $out5,$out5,v30
+ vxor $in5,$in5,v31
+ vcipher $out6,$out6,v30
+ vxor $in6,$in6,v31
+ vcipher $out7,$out7,v30
+ vxor $in7,$in7,v31
+
+ bne Lctr32_enc8x_break # did $len-129 borrow?
+
+ vcipherlast $in0,$out0,$in0
+ vcipherlast $in1,$out1,$in1
+ vadduqm $out1,$ivec,$one # counter values ...
+ vcipherlast $in2,$out2,$in2
+ vadduqm $out2,$ivec,$two
+ vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
+ vcipherlast $in3,$out3,$in3
+ vadduqm $out3,$out1,$two
+ vxor $out1,$out1,$rndkey0
+ vcipherlast $in4,$out4,$in4
+ vadduqm $out4,$out2,$two
+ vxor $out2,$out2,$rndkey0
+ vcipherlast $in5,$out5,$in5
+ vadduqm $out5,$out3,$two
+ vxor $out3,$out3,$rndkey0
+ vcipherlast $in6,$out6,$in6
+ vadduqm $out6,$out4,$two
+ vxor $out4,$out4,$rndkey0
+ vcipherlast $in7,$out7,$in7
+ vadduqm $out7,$out5,$two
+ vxor $out5,$out5,$rndkey0
+ le?vperm $in0,$in0,$in0,$inpperm
+ vadduqm $ivec,$out6,$two # next counter value
+ vxor $out6,$out6,$rndkey0
+ le?vperm $in1,$in1,$in1,$inpperm
+ vxor $out7,$out7,$rndkey0
+ mtctr $rounds
+
+ vcipher $out0,$out0,v24
+ stvx_u $in0,$x00,$out
+ le?vperm $in2,$in2,$in2,$inpperm
+ vcipher $out1,$out1,v24
+ stvx_u $in1,$x10,$out
+ le?vperm $in3,$in3,$in3,$inpperm
+ vcipher $out2,$out2,v24
+ stvx_u $in2,$x20,$out
+ le?vperm $in4,$in4,$in4,$inpperm
+ vcipher $out3,$out3,v24
+ stvx_u $in3,$x30,$out
+ le?vperm $in5,$in5,$in5,$inpperm
+ vcipher $out4,$out4,v24
+ stvx_u $in4,$x40,$out
+ le?vperm $in6,$in6,$in6,$inpperm
+ vcipher $out5,$out5,v24
+ stvx_u $in5,$x50,$out
+ le?vperm $in7,$in7,$in7,$inpperm
+ vcipher $out6,$out6,v24
+ stvx_u $in6,$x60,$out
+ vcipher $out7,$out7,v24
+ stvx_u $in7,$x70,$out
+ addi $out,$out,0x80
+
+ b Loop_ctr32_enc8x_middle
+
+.align 5
+Lctr32_enc8x_break:
+ cmpwi $len,-0x60
+ blt Lctr32_enc8x_one
+ nop
+ beq Lctr32_enc8x_two
+ cmpwi $len,-0x40
+ blt Lctr32_enc8x_three
+ nop
+ beq Lctr32_enc8x_four
+ cmpwi $len,-0x20
+ blt Lctr32_enc8x_five
+ nop
+ beq Lctr32_enc8x_six
+ cmpwi $len,0x00
+ blt Lctr32_enc8x_seven
+
+Lctr32_enc8x_eight:
+ vcipherlast $out0,$out0,$in0
+ vcipherlast $out1,$out1,$in1
+ vcipherlast $out2,$out2,$in2
+ vcipherlast $out3,$out3,$in3
+ vcipherlast $out4,$out4,$in4
+ vcipherlast $out5,$out5,$in5
+ vcipherlast $out6,$out6,$in6
+ vcipherlast $out7,$out7,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x50,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x60,$out
+ stvx_u $out7,$x70,$out
+ addi $out,$out,0x80
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_seven:
+ vcipherlast $out0,$out0,$in1
+ vcipherlast $out1,$out1,$in2
+ vcipherlast $out2,$out2,$in3
+ vcipherlast $out3,$out3,$in4
+ vcipherlast $out4,$out4,$in5
+ vcipherlast $out5,$out5,$in6
+ vcipherlast $out6,$out6,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x50,$out
+ stvx_u $out6,$x60,$out
+ addi $out,$out,0x70
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_six:
+ vcipherlast $out0,$out0,$in2
+ vcipherlast $out1,$out1,$in3
+ vcipherlast $out2,$out2,$in4
+ vcipherlast $out3,$out3,$in5
+ vcipherlast $out4,$out4,$in6
+ vcipherlast $out5,$out5,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ stvx_u $out5,$x50,$out
+ addi $out,$out,0x60
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_five:
+ vcipherlast $out0,$out0,$in3
+ vcipherlast $out1,$out1,$in4
+ vcipherlast $out2,$out2,$in5
+ vcipherlast $out3,$out3,$in6
+ vcipherlast $out4,$out4,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ stvx_u $out4,$x40,$out
+ addi $out,$out,0x50
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_four:
+ vcipherlast $out0,$out0,$in4
+ vcipherlast $out1,$out1,$in5
+ vcipherlast $out2,$out2,$in6
+ vcipherlast $out3,$out3,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ stvx_u $out3,$x30,$out
+ addi $out,$out,0x40
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_three:
+ vcipherlast $out0,$out0,$in5
+ vcipherlast $out1,$out1,$in6
+ vcipherlast $out2,$out2,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ stvx_u $out2,$x20,$out
+ addi $out,$out,0x30
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_two:
+ vcipherlast $out0,$out0,$in6
+ vcipherlast $out1,$out1,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ stvx_u $out1,$x10,$out
+ addi $out,$out,0x20
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_one:
+ vcipherlast $out0,$out0,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ stvx_u $out0,0,$out
+ addi $out,$out,0x10
+
+Lctr32_enc8x_done:
+ li r10,`$FRAME+15`
+ li r11,`$FRAME+31`
+ stvx $inpperm,r10,$sp # wipe copies of round keys
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0x80,6,6,0
+ .long 0
+.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
+___
+}} }}}
+
+#########################################################################
+{{{ # XTS procedures #
+# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len, #
+# const AES_KEY *key1, const AES_KEY *key2, #
+# [const] unsigned char iv[16]); #
+# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which #
+# input tweak value is assumed to be encrypted already, and last tweak #
+# value, one suitable for consecutive call on same chunk of data, is #
+# written back to original buffer. In addition, in "tweak chaining" #
+# mode only complete input blocks are processed. #
+
+my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2));
+my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7));
+my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12));
+my $taillen = $key2;
+
+ ($inp,$idx) = ($idx,$inp); # reassign
+
+$code.=<<___;
+.globl .${prefix}_xts_encrypt
+ mr $inp,r3 # reassign
+ li r3,-1
+ ${UCMP}i $len,16
+ bltlr-
+
+ lis r0,0xfff0
+ mfspr r12,256 # save vrsave
+ li r11,0
+ mtspr 256,r0
+
+ vspltisb $seven,0x07 # 0x070707..07
+ le?lvsl $leperm,r11,r11
+ le?vspltisb $tmp,0x0f
+ le?vxor $leperm,$leperm,$seven
+
+ li $idx,15
+ lvx $tweak,0,$ivp # load [unaligned] iv
+ lvsl $inpperm,0,$ivp
+ lvx $inptail,$idx,$ivp
+ le?vxor $inpperm,$inpperm,$tmp
+ vperm $tweak,$tweak,$inptail,$inpperm
+
+ neg r11,$inp
+ lvsr $inpperm,0,r11 # prepare for unaligned load
+ lvx $inout,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ le?vxor $inpperm,$inpperm,$tmp
+
+ ${UCMP}i $key2,0 # key2==NULL?
+ beq Lxts_enc_no_key2
+
+ ?lvsl $keyperm,0,$key2 # prepare for unaligned key
+ lwz $rounds,240($key2)
+ srwi $rounds,$rounds,1
+ subi $rounds,$rounds,1
+ li $idx,16
+
+ lvx $rndkey0,0,$key2
+ lvx $rndkey1,$idx,$key2
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $tweak,$tweak,$rndkey0
+ lvx $rndkey0,$idx,$key2
+ addi $idx,$idx,16
+ mtctr $rounds
+
+Ltweak_xts_enc:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $tweak,$tweak,$rndkey1
+ lvx $rndkey1,$idx,$key2
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $tweak,$tweak,$rndkey0
+ lvx $rndkey0,$idx,$key2
+ addi $idx,$idx,16
+ bdnz Ltweak_xts_enc
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $tweak,$tweak,$rndkey1
+ lvx $rndkey1,$idx,$key2
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipherlast $tweak,$tweak,$rndkey0
+
+ li $ivp,0 # don't chain the tweak
+ b Lxts_enc
+
+Lxts_enc_no_key2:
+ li $idx,-16
+ and $len,$len,$idx # in "tweak chaining"
+ # mode only complete
+ # blocks are processed
+Lxts_enc:
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+
+ ?lvsl $keyperm,0,$key1 # prepare for unaligned key
+ lwz $rounds,240($key1)
+ srwi $rounds,$rounds,1
+ subi $rounds,$rounds,1
+ li $idx,16
+
+ vslb $eighty7,$seven,$seven # 0x808080..80
+ vor $eighty7,$eighty7,$seven # 0x878787..87
+ vspltisb $tmp,1 # 0x010101..01
+ vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
+
+ ${UCMP}i $len,96
+ bge _aesp8_xts_encrypt6x
+
+ andi. $taillen,$len,15
+ subic r0,$len,32
+ subi $taillen,$taillen,16
+ subfe r0,r0,r0
+ and r0,r0,$taillen
+ add $inp,$inp,r0
+
+ lvx $rndkey0,0,$key1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ vperm $inout,$inout,$inptail,$inpperm
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$inout,$tweak
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+ mtctr $rounds
+ b Loop_xts_enc
+
+.align 5
+Loop_xts_enc:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+ bdnz Loop_xts_enc
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $rndkey0,$rndkey0,$tweak
+ vcipherlast $output,$inout,$rndkey0
+
+ le?vperm $tmp,$output,$output,$leperm
+ be?nop
+ le?stvx_u $tmp,0,$out
+ be?stvx_u $output,0,$out
+ addi $out,$out,16
+
+ subic. $len,$len,16
+ beq Lxts_enc_done
+
+ vmr $inout,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ lvx $rndkey0,0,$key1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+
+ subic r0,$len,32
+ subfe r0,r0,r0
+ and r0,r0,$taillen
+ add $inp,$inp,r0
+
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $tweak,$tweak,$tmp
+
+ vperm $inout,$inout,$inptail,$inpperm
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$inout,$tweak
+ vxor $output,$output,$rndkey0 # just in case $len<16
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+
+ mtctr $rounds
+ ${UCMP}i $len,16
+ bge Loop_xts_enc
+
+ vxor $output,$output,$tweak
+ lvsr $inpperm,0,$len # $inpperm is no longer needed
+ vxor $inptail,$inptail,$inptail # $inptail is no longer needed
+ vspltisb $tmp,-1
+ vperm $inptail,$inptail,$tmp,$inpperm
+ vsel $inout,$inout,$output,$inptail
+
+ subi r11,$out,17
+ subi $out,$out,16
+ mtctr $len
+ li $len,16
+Loop_xts_enc_steal:
+ lbzu r0,1(r11)
+ stb r0,16(r11)
+ bdnz Loop_xts_enc_steal
+
+ mtctr $rounds
+ b Loop_xts_enc # one more time...
+
+Lxts_enc_done:
+ ${UCMP}i $ivp,0
+ beq Lxts_enc_ret
+
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $tweak,$tweak,$tmp
+
+ le?vperm $tweak,$tweak,$tweak,$leperm
+ stvx_u $tweak,0,$ivp
+
+Lxts_enc_ret:
+ mtspr 256,r12 # restore vrsave
+ li r3,0
+ blr
+ .long 0
+ .byte 0,12,0x04,0,0x80,6,6,0
+ .long 0
+.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
+
+.globl .${prefix}_xts_decrypt
+ mr $inp,r3 # reassign
+ li r3,-1
+ ${UCMP}i $len,16
+ bltlr-
+
+ lis r0,0xfff8
+ mfspr r12,256 # save vrsave
+ li r11,0
+ mtspr 256,r0
+
+ andi. r0,$len,15
+ neg r0,r0
+ andi. r0,r0,16
+ sub $len,$len,r0
+
+ vspltisb $seven,0x07 # 0x070707..07
+ le?lvsl $leperm,r11,r11
+ le?vspltisb $tmp,0x0f
+ le?vxor $leperm,$leperm,$seven
+
+ li $idx,15
+ lvx $tweak,0,$ivp # load [unaligned] iv
+ lvsl $inpperm,0,$ivp
+ lvx $inptail,$idx,$ivp
+ le?vxor $inpperm,$inpperm,$tmp
+ vperm $tweak,$tweak,$inptail,$inpperm
+
+ neg r11,$inp
+ lvsr $inpperm,0,r11 # prepare for unaligned load
+ lvx $inout,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ le?vxor $inpperm,$inpperm,$tmp
+
+ ${UCMP}i $key2,0 # key2==NULL?
+ beq Lxts_dec_no_key2
+
+ ?lvsl $keyperm,0,$key2 # prepare for unaligned key
+ lwz $rounds,240($key2)
+ srwi $rounds,$rounds,1
+ subi $rounds,$rounds,1
+ li $idx,16
+
+ lvx $rndkey0,0,$key2
+ lvx $rndkey1,$idx,$key2
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $tweak,$tweak,$rndkey0
+ lvx $rndkey0,$idx,$key2
+ addi $idx,$idx,16
+ mtctr $rounds
+
+Ltweak_xts_dec:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $tweak,$tweak,$rndkey1
+ lvx $rndkey1,$idx,$key2
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $tweak,$tweak,$rndkey0
+ lvx $rndkey0,$idx,$key2
+ addi $idx,$idx,16
+ bdnz Ltweak_xts_dec
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $tweak,$tweak,$rndkey1
+ lvx $rndkey1,$idx,$key2
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipherlast $tweak,$tweak,$rndkey0
+
+ li $ivp,0 # don't chain the tweak
+ b Lxts_dec
+
+Lxts_dec_no_key2:
+ neg $idx,$len
+ andi. $idx,$idx,15
+ add $len,$len,$idx # in "tweak chaining"
+ # mode only complete
+ # blocks are processed
+Lxts_dec:
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+
+ ?lvsl $keyperm,0,$key1 # prepare for unaligned key
+ lwz $rounds,240($key1)
+ srwi $rounds,$rounds,1
+ subi $rounds,$rounds,1
+ li $idx,16
+
+ vslb $eighty7,$seven,$seven # 0x808080..80
+ vor $eighty7,$eighty7,$seven # 0x878787..87
+ vspltisb $tmp,1 # 0x010101..01
+ vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
+
+ ${UCMP}i $len,96
+ bge _aesp8_xts_decrypt6x
+
+ lvx $rndkey0,0,$key1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ vperm $inout,$inout,$inptail,$inpperm
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$inout,$tweak
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+ mtctr $rounds
+
+ ${UCMP}i $len,16
+ blt Ltail_xts_dec
+ be?b Loop_xts_dec
+
+.align 5
+Loop_xts_dec:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vncipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+ bdnz Loop_xts_dec
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $rndkey0,$rndkey0,$tweak
+ vncipherlast $output,$inout,$rndkey0
+
+ le?vperm $tmp,$output,$output,$leperm
+ be?nop
+ le?stvx_u $tmp,0,$out
+ be?stvx_u $output,0,$out
+ addi $out,$out,16
+
+ subic. $len,$len,16
+ beq Lxts_dec_done
+
+ vmr $inout,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ lvx $rndkey0,0,$key1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $tweak,$tweak,$tmp
+
+ vperm $inout,$inout,$inptail,$inpperm
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$inout,$tweak
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+
+ mtctr $rounds
+ ${UCMP}i $len,16
+ bge Loop_xts_dec
+
+Ltail_xts_dec:
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak1,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $tweak1,$tweak1,$tmp
+
+ subi $inp,$inp,16
+ add $inp,$inp,$len
+
+ vxor $inout,$inout,$tweak # :-(
+ vxor $inout,$inout,$tweak1 # :-)
+
+Loop_xts_dec_short:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vncipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+ bdnz Loop_xts_dec_short
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $rndkey0,$rndkey0,$tweak1
+ vncipherlast $output,$inout,$rndkey0
+
+ le?vperm $tmp,$output,$output,$leperm
+ be?nop
+ le?stvx_u $tmp,0,$out
+ be?stvx_u $output,0,$out
+
+ vmr $inout,$inptail
+ lvx $inptail,0,$inp
+ #addi $inp,$inp,16
+ lvx $rndkey0,0,$key1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ vperm $inout,$inout,$inptail,$inpperm
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+
+ lvsr $inpperm,0,$len # $inpperm is no longer needed
+ vxor $inptail,$inptail,$inptail # $inptail is no longer needed
+ vspltisb $tmp,-1
+ vperm $inptail,$inptail,$tmp,$inpperm
+ vsel $inout,$inout,$output,$inptail
+
+ vxor $rndkey0,$rndkey0,$tweak
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+
+ subi r11,$out,1
+ mtctr $len
+ li $len,16
+Loop_xts_dec_steal:
+ lbzu r0,1(r11)
+ stb r0,16(r11)
+ bdnz Loop_xts_dec_steal
+
+ mtctr $rounds
+ b Loop_xts_dec # one more time...
+
+Lxts_dec_done:
+ ${UCMP}i $ivp,0
+ beq Lxts_dec_ret
+
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $tweak,$tweak,$tmp
+
+ le?vperm $tweak,$tweak,$tweak,$leperm
+ stvx_u $tweak,0,$ivp
+
+Lxts_dec_ret:
+ mtspr 256,r12 # restore vrsave
+ li r3,0
+ blr
+ .long 0
+ .byte 0,12,0x04,0,0x80,6,6,0
+ .long 0
+.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
+___
+#########################################################################
+{{ # Optimized XTS procedures #
+my $key_=$key2;
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
+ $x00=0 if ($flavour =~ /osx/);
+my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5));
+my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
+my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
+my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
+ # v26-v31 last 6 round keys
+my ($keyperm)=($out0); # aliases with "caller", redundant assignment
+my $taillen=$x70;
+
+$code.=<<___;
+.align 5
+_aesp8_xts_encrypt6x:
+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ mflr r11
+ li r7,`$FRAME+8*16+15`
+ li r3,`$FRAME+8*16+31`
+ $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+ stvx v20,r7,$sp # ABI says so
+ addi r7,r7,32
+ stvx v21,r3,$sp
+ addi r3,r3,32
+ stvx v22,r7,$sp
+ addi r7,r7,32
+ stvx v23,r3,$sp
+ addi r3,r3,32
+ stvx v24,r7,$sp
+ addi r7,r7,32
+ stvx v25,r3,$sp
+ addi r3,r3,32
+ stvx v26,r7,$sp
+ addi r7,r7,32
+ stvx v27,r3,$sp
+ addi r3,r3,32
+ stvx v28,r7,$sp
+ addi r7,r7,32
+ stvx v29,r3,$sp
+ addi r3,r3,32
+ stvx v30,r7,$sp
+ stvx v31,r3,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li $x10,0x10
+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ li $x20,0x20
+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ li $x30,0x30
+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ li $x40,0x40
+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ li $x50,0x50
+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ li $x60,0x60
+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ li $x70,0x70
+ mtspr 256,r0
+
+ xxlor 2, 32+$eighty7, 32+$eighty7
+ vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
+ xxlor 1, 32+$eighty7, 32+$eighty7
+
+ # Load XOR Lconsts.
+ mr $x70, r6
+ bl Lconsts
+ lxvw4x 0, $x40, r6 # load XOR contents
+ mr r6, $x70
+ li $x70,0x70
+
+ subi $rounds,$rounds,3 # -4 in total
+
+ lvx $rndkey0,$x00,$key1 # load key schedule
+ lvx v30,$x10,$key1
+ addi $key1,$key1,0x20
+ lvx v31,$x00,$key1
+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
+ addi $key_,$sp,$FRAME+15
+ mtctr $rounds
+
+Load_xts_enc_key:
+ ?vperm v24,v30,v31,$keyperm
+ lvx v30,$x10,$key1
+ addi $key1,$key1,0x20
+ stvx v24,$x00,$key_ # off-load round[1]
+ ?vperm v25,v31,v30,$keyperm
+ lvx v31,$x00,$key1
+ stvx v25,$x10,$key_ # off-load round[2]
+ addi $key_,$key_,0x20
+ bdnz Load_xts_enc_key
+
+ lvx v26,$x10,$key1
+ ?vperm v24,v30,v31,$keyperm
+ lvx v27,$x20,$key1
+ stvx v24,$x00,$key_ # off-load round[3]
+ ?vperm v25,v31,v26,$keyperm
+ lvx v28,$x30,$key1
+ stvx v25,$x10,$key_ # off-load round[4]
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ ?vperm v26,v26,v27,$keyperm
+ lvx v29,$x40,$key1
+ ?vperm v27,v27,v28,$keyperm
+ lvx v30,$x50,$key1
+ ?vperm v28,v28,v29,$keyperm
+ lvx v31,$x60,$key1
+ ?vperm v29,v29,v30,$keyperm
+ lvx $twk5,$x70,$key1 # borrow $twk5
+ ?vperm v30,v30,v31,$keyperm
+ lvx v24,$x00,$key_ # pre-load round[1]
+ ?vperm v31,v31,$twk5,$keyperm
+ lvx v25,$x10,$key_ # pre-load round[2]
+
+ # Switch to use the following codes with 0x010101..87 to generate tweak.
+ # eighty7 = 0x010101..87
+ # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits
+ # vand tmp, tmp, eighty7 # last byte with carry
+ # vaddubm tweak, tweak, tweak # left shift 1 bit (x2)
+ # xxlor vsx, 0, 0
+ # vpermxor tweak, tweak, tmp, vsx
+
+ vperm $in0,$inout,$inptail,$inpperm
+ subi $inp,$inp,31 # undo "caller"
+ vxor $twk0,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vand $tmp,$tmp,$eighty7
+ vxor $out0,$in0,$twk0
+ xxlor 32+$in1, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in1
+
+ lvx_u $in1,$x10,$inp
+ vxor $twk1,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ le?vperm $in1,$in1,$in1,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out1,$in1,$twk1
+ xxlor 32+$in2, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in2
+
+ lvx_u $in2,$x20,$inp
+ andi. $taillen,$len,15
+ vxor $twk2,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ le?vperm $in2,$in2,$in2,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out2,$in2,$twk2
+ xxlor 32+$in3, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in3
+
+ lvx_u $in3,$x30,$inp
+ sub $len,$len,$taillen
+ vxor $twk3,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ le?vperm $in3,$in3,$in3,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out3,$in3,$twk3
+ xxlor 32+$in4, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in4
+
+ lvx_u $in4,$x40,$inp
+ subi $len,$len,0x60
+ vxor $twk4,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ le?vperm $in4,$in4,$in4,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out4,$in4,$twk4
+ xxlor 32+$in5, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in5
+
+ lvx_u $in5,$x50,$inp
+ addi $inp,$inp,0x60
+ vxor $twk5,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ le?vperm $in5,$in5,$in5,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out5,$in5,$twk5
+ xxlor 32+$in0, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in0
+
+ vxor v31,v31,$rndkey0
+ mtctr $rounds
+ b Loop_xts_enc6x
+
+.align 5
+Loop_xts_enc6x:
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+ vcipher $out5,$out5,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ vcipher $out5,$out5,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_xts_enc6x
+
+ xxlor 32+$eighty7, 1, 1 # 0x010101..87
+
+ subic $len,$len,96 # $len-=96
+ vxor $in0,$twk0,v31 # xor with last round key
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk0,$tweak,$rndkey0
+ vaddubm $tweak,$tweak,$tweak
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+ vcipher $out5,$out5,v24
+
+ subfe. r0,r0,r0 # borrow?-1:0
+ vand $tmp,$tmp,$eighty7
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ xxlor 32+$in1, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in1
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vxor $in1,$twk1,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk1,$tweak,$rndkey0
+ vcipher $out4,$out4,v25
+ vcipher $out5,$out5,v25
+
+ and r0,r0,$len
+ vaddubm $tweak,$tweak,$tweak
+ vcipher $out0,$out0,v26
+ vcipher $out1,$out1,v26
+ vand $tmp,$tmp,$eighty7
+ vcipher $out2,$out2,v26
+ vcipher $out3,$out3,v26
+ xxlor 32+$in2, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in2
+ vcipher $out4,$out4,v26
+ vcipher $out5,$out5,v26
+
+ add $inp,$inp,r0 # $inp is adjusted in such
+ # way that at exit from the
+ # loop inX-in5 are loaded
+ # with last "words"
+ vxor $in2,$twk2,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk2,$tweak,$rndkey0
+ vaddubm $tweak,$tweak,$tweak
+ vcipher $out0,$out0,v27
+ vcipher $out1,$out1,v27
+ vcipher $out2,$out2,v27
+ vcipher $out3,$out3,v27
+ vand $tmp,$tmp,$eighty7
+ vcipher $out4,$out4,v27
+ vcipher $out5,$out5,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ xxlor 32+$in3, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in3
+ vcipher $out0,$out0,v28
+ vcipher $out1,$out1,v28
+ vxor $in3,$twk3,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk3,$tweak,$rndkey0
+ vcipher $out2,$out2,v28
+ vcipher $out3,$out3,v28
+ vaddubm $tweak,$tweak,$tweak
+ vcipher $out4,$out4,v28
+ vcipher $out5,$out5,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+ vand $tmp,$tmp,$eighty7
+
+ vcipher $out0,$out0,v29
+ vcipher $out1,$out1,v29
+ xxlor 32+$in4, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in4
+ vcipher $out2,$out2,v29
+ vcipher $out3,$out3,v29
+ vxor $in4,$twk4,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk4,$tweak,$rndkey0
+ vcipher $out4,$out4,v29
+ vcipher $out5,$out5,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vaddubm $tweak,$tweak,$tweak
+
+ vcipher $out0,$out0,v30
+ vcipher $out1,$out1,v30
+ vand $tmp,$tmp,$eighty7
+ vcipher $out2,$out2,v30
+ vcipher $out3,$out3,v30
+ xxlor 32+$in5, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in5
+ vcipher $out4,$out4,v30
+ vcipher $out5,$out5,v30
+ vxor $in5,$twk5,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk5,$tweak,$rndkey0
+
+ vcipherlast $out0,$out0,$in0
+ lvx_u $in0,$x00,$inp # load next input block
+ vaddubm $tweak,$tweak,$tweak
+ vcipherlast $out1,$out1,$in1
+ lvx_u $in1,$x10,$inp
+ vcipherlast $out2,$out2,$in2
+ le?vperm $in0,$in0,$in0,$leperm
+ lvx_u $in2,$x20,$inp
+ vand $tmp,$tmp,$eighty7
+ vcipherlast $out3,$out3,$in3
+ le?vperm $in1,$in1,$in1,$leperm
+ lvx_u $in3,$x30,$inp
+ vcipherlast $out4,$out4,$in4
+ le?vperm $in2,$in2,$in2,$leperm
+ lvx_u $in4,$x40,$inp
+ xxlor 10, 32+$in0, 32+$in0
+ xxlor 32+$in0, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in0
+ xxlor 32+$in0, 10, 10
+ vcipherlast $tmp,$out5,$in5 # last block might be needed
+ # in stealing mode
+ le?vperm $in3,$in3,$in3,$leperm
+ lvx_u $in5,$x50,$inp
+ addi $inp,$inp,0x60
+ le?vperm $in4,$in4,$in4,$leperm
+ le?vperm $in5,$in5,$in5,$leperm
+
+ le?vperm $out0,$out0,$out0,$leperm
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk0
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ vxor $out1,$in1,$twk1
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ vxor $out2,$in2,$twk2
+ le?vperm $out4,$out4,$out4,$leperm
+ stvx_u $out3,$x30,$out
+ vxor $out3,$in3,$twk3
+ le?vperm $out5,$tmp,$tmp,$leperm
+ stvx_u $out4,$x40,$out
+ vxor $out4,$in4,$twk4
+ le?stvx_u $out5,$x50,$out
+ be?stvx_u $tmp, $x50,$out
+ vxor $out5,$in5,$twk5
+ addi $out,$out,0x60
+
+ mtctr $rounds
+ beq Loop_xts_enc6x # did $len-=96 borrow?
+
+ xxlor 32+$eighty7, 2, 2 # 0x010101..87
+
+ addic. $len,$len,0x60
+ beq Lxts_enc6x_zero
+ cmpwi $len,0x20
+ blt Lxts_enc6x_one
+ nop
+ beq Lxts_enc6x_two
+ cmpwi $len,0x40
+ blt Lxts_enc6x_three
+ nop
+ beq Lxts_enc6x_four
+
+Lxts_enc6x_five:
+ vxor $out0,$in1,$twk0
+ vxor $out1,$in2,$twk1
+ vxor $out2,$in3,$twk2
+ vxor $out3,$in4,$twk3
+ vxor $out4,$in5,$twk4
+
+ bl _aesp8_xts_enc5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk5 # unused tweak
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ vxor $tmp,$out4,$twk5 # last block prep for stealing
+ le?vperm $out4,$out4,$out4,$leperm
+ stvx_u $out3,$x30,$out
+ stvx_u $out4,$x40,$out
+ addi $out,$out,0x50
+ bne Lxts_enc6x_steal
+ b Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_four:
+ vxor $out0,$in2,$twk0
+ vxor $out1,$in3,$twk1
+ vxor $out2,$in4,$twk2
+ vxor $out3,$in5,$twk3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_enc5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk4 # unused tweak
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ vxor $tmp,$out3,$twk4 # last block prep for stealing
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ stvx_u $out3,$x30,$out
+ addi $out,$out,0x40
+ bne Lxts_enc6x_steal
+ b Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_three:
+ vxor $out0,$in3,$twk0
+ vxor $out1,$in4,$twk1
+ vxor $out2,$in5,$twk2
+ vxor $out3,$out3,$out3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_enc5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk3 # unused tweak
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $tmp,$out2,$twk3 # last block prep for stealing
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ stvx_u $out2,$x20,$out
+ addi $out,$out,0x30
+ bne Lxts_enc6x_steal
+ b Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_two:
+ vxor $out0,$in4,$twk0
+ vxor $out1,$in5,$twk1
+ vxor $out2,$out2,$out2
+ vxor $out3,$out3,$out3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_enc5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk2 # unused tweak
+ vxor $tmp,$out1,$twk2 # last block prep for stealing
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ stvx_u $out1,$x10,$out
+ addi $out,$out,0x20
+ bne Lxts_enc6x_steal
+ b Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_one:
+ vxor $out0,$in5,$twk0
+ nop
+Loop_xts_enc1x:
+ vcipher $out0,$out0,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vcipher $out0,$out0,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_xts_enc1x
+
+ add $inp,$inp,$taillen
+ cmpwi $taillen,0
+ vcipher $out0,$out0,v24
+
+ subi $inp,$inp,16
+ vcipher $out0,$out0,v25
+
+ lvsr $inpperm,0,$taillen
+ vcipher $out0,$out0,v26
+
+ lvx_u $in0,0,$inp
+ vcipher $out0,$out0,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vcipher $out0,$out0,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ vcipher $out0,$out0,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vxor $twk0,$twk0,v31
+
+ le?vperm $in0,$in0,$in0,$leperm
+ vcipher $out0,$out0,v30
+
+ vperm $in0,$in0,$in0,$inpperm
+ vcipherlast $out0,$out0,$twk0
+
+ vmr $twk0,$twk1 # unused tweak
+ vxor $tmp,$out0,$twk1 # last block prep for stealing
+ le?vperm $out0,$out0,$out0,$leperm
+ stvx_u $out0,$x00,$out # store output
+ addi $out,$out,0x10
+ bne Lxts_enc6x_steal
+ b Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_zero:
+ cmpwi $taillen,0
+ beq Lxts_enc6x_done
+
+ add $inp,$inp,$taillen
+ subi $inp,$inp,16
+ lvx_u $in0,0,$inp
+ lvsr $inpperm,0,$taillen # $in5 is no more
+ le?vperm $in0,$in0,$in0,$leperm
+ vperm $in0,$in0,$in0,$inpperm
+ vxor $tmp,$tmp,$twk0
+Lxts_enc6x_steal:
+ vxor $in0,$in0,$twk0
+ vxor $out0,$out0,$out0
+ vspltisb $out1,-1
+ vperm $out0,$out0,$out1,$inpperm
+ vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember?
+
+ subi r30,$out,17
+ subi $out,$out,16
+ mtctr $taillen
+Loop_xts_enc6x_steal:
+ lbzu r0,1(r30)
+ stb r0,16(r30)
+ bdnz Loop_xts_enc6x_steal
+
+ li $taillen,0
+ mtctr $rounds
+ b Loop_xts_enc1x # one more time...
+
+.align 4
+Lxts_enc6x_done:
+ ${UCMP}i $ivp,0
+ beq Lxts_enc6x_ret
+
+ vxor $tweak,$twk0,$rndkey0
+ le?vperm $tweak,$tweak,$tweak,$leperm
+ stvx_u $tweak,0,$ivp
+
+Lxts_enc6x_ret:
+ mtlr r11
+ li r10,`$FRAME+15`
+ li r11,`$FRAME+31`
+ stvx $seven,r10,$sp # wipe copies of round keys
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,6,6,0
+ .long 0
+
+.align 5
+_aesp8_xts_enc5x:
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz _aesp8_xts_enc5x
+
+ add $inp,$inp,$taillen
+ cmpwi $taillen,0
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+
+ subi $inp,$inp,16
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ vxor $twk0,$twk0,v31
+
+ vcipher $out0,$out0,v26
+ lvsr $inpperm,r0,$taillen # $in5 is no more
+ vcipher $out1,$out1,v26
+ vcipher $out2,$out2,v26
+ vcipher $out3,$out3,v26
+ vcipher $out4,$out4,v26
+ vxor $in1,$twk1,v31
+
+ vcipher $out0,$out0,v27
+ lvx_u $in0,0,$inp
+ vcipher $out1,$out1,v27
+ vcipher $out2,$out2,v27
+ vcipher $out3,$out3,v27
+ vcipher $out4,$out4,v27
+ vxor $in2,$twk2,v31
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vcipher $out0,$out0,v28
+ vcipher $out1,$out1,v28
+ vcipher $out2,$out2,v28
+ vcipher $out3,$out3,v28
+ vcipher $out4,$out4,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+ vxor $in3,$twk3,v31
+
+ vcipher $out0,$out0,v29
+ le?vperm $in0,$in0,$in0,$leperm
+ vcipher $out1,$out1,v29
+ vcipher $out2,$out2,v29
+ vcipher $out3,$out3,v29
+ vcipher $out4,$out4,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vxor $in4,$twk4,v31
+
+ vcipher $out0,$out0,v30
+ vperm $in0,$in0,$in0,$inpperm
+ vcipher $out1,$out1,v30
+ vcipher $out2,$out2,v30
+ vcipher $out3,$out3,v30
+ vcipher $out4,$out4,v30
+
+ vcipherlast $out0,$out0,$twk0
+ vcipherlast $out1,$out1,$in1
+ vcipherlast $out2,$out2,$in2
+ vcipherlast $out3,$out3,$in3
+ vcipherlast $out4,$out4,$in4
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.align 5
+_aesp8_xts_decrypt6x:
+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ mflr r11
+ li r7,`$FRAME+8*16+15`
+ li r3,`$FRAME+8*16+31`
+ $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+ stvx v20,r7,$sp # ABI says so
+ addi r7,r7,32
+ stvx v21,r3,$sp
+ addi r3,r3,32
+ stvx v22,r7,$sp
+ addi r7,r7,32
+ stvx v23,r3,$sp
+ addi r3,r3,32
+ stvx v24,r7,$sp
+ addi r7,r7,32
+ stvx v25,r3,$sp
+ addi r3,r3,32
+ stvx v26,r7,$sp
+ addi r7,r7,32
+ stvx v27,r3,$sp
+ addi r3,r3,32
+ stvx v28,r7,$sp
+ addi r7,r7,32
+ stvx v29,r3,$sp
+ addi r3,r3,32
+ stvx v30,r7,$sp
+ stvx v31,r3,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li $x10,0x10
+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ li $x20,0x20
+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ li $x30,0x30
+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ li $x40,0x40
+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ li $x50,0x50
+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ li $x60,0x60
+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ li $x70,0x70
+ mtspr 256,r0
+
+ xxlor 2, 32+$eighty7, 32+$eighty7
+ vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
+ xxlor 1, 32+$eighty7, 32+$eighty7
+
+ # Load XOR Lconsts.
+ mr $x70, r6
+ bl Lconsts
+ lxvw4x 0, $x40, r6 # load XOR contents
+ mr r6, $x70
+ li $x70,0x70
+
+ subi $rounds,$rounds,3 # -4 in total
+
+ lvx $rndkey0,$x00,$key1 # load key schedule
+ lvx v30,$x10,$key1
+ addi $key1,$key1,0x20
+ lvx v31,$x00,$key1
+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
+ addi $key_,$sp,$FRAME+15
+ mtctr $rounds
+
+Load_xts_dec_key:
+ ?vperm v24,v30,v31,$keyperm
+ lvx v30,$x10,$key1
+ addi $key1,$key1,0x20
+ stvx v24,$x00,$key_ # off-load round[1]
+ ?vperm v25,v31,v30,$keyperm
+ lvx v31,$x00,$key1
+ stvx v25,$x10,$key_ # off-load round[2]
+ addi $key_,$key_,0x20
+ bdnz Load_xts_dec_key
+
+ lvx v26,$x10,$key1
+ ?vperm v24,v30,v31,$keyperm
+ lvx v27,$x20,$key1
+ stvx v24,$x00,$key_ # off-load round[3]
+ ?vperm v25,v31,v26,$keyperm
+ lvx v28,$x30,$key1
+ stvx v25,$x10,$key_ # off-load round[4]
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ ?vperm v26,v26,v27,$keyperm
+ lvx v29,$x40,$key1
+ ?vperm v27,v27,v28,$keyperm
+ lvx v30,$x50,$key1
+ ?vperm v28,v28,v29,$keyperm
+ lvx v31,$x60,$key1
+ ?vperm v29,v29,v30,$keyperm
+ lvx $twk5,$x70,$key1 # borrow $twk5
+ ?vperm v30,v30,v31,$keyperm
+ lvx v24,$x00,$key_ # pre-load round[1]
+ ?vperm v31,v31,$twk5,$keyperm
+ lvx v25,$x10,$key_ # pre-load round[2]
+
+ vperm $in0,$inout,$inptail,$inpperm
+ subi $inp,$inp,31 # undo "caller"
+ vxor $twk0,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vand $tmp,$tmp,$eighty7
+ vxor $out0,$in0,$twk0
+ xxlor 32+$in1, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in1
+
+ lvx_u $in1,$x10,$inp
+ vxor $twk1,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ le?vperm $in1,$in1,$in1,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out1,$in1,$twk1
+ xxlor 32+$in2, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in2
+
+ lvx_u $in2,$x20,$inp
+ andi. $taillen,$len,15
+ vxor $twk2,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ le?vperm $in2,$in2,$in2,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out2,$in2,$twk2
+ xxlor 32+$in3, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in3
+
+ lvx_u $in3,$x30,$inp
+ sub $len,$len,$taillen
+ vxor $twk3,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ le?vperm $in3,$in3,$in3,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out3,$in3,$twk3
+ xxlor 32+$in4, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in4
+
+ lvx_u $in4,$x40,$inp
+ subi $len,$len,0x60
+ vxor $twk4,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ le?vperm $in4,$in4,$in4,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out4,$in4,$twk4
+ xxlor 32+$in5, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in5
+
+ lvx_u $in5,$x50,$inp
+ addi $inp,$inp,0x60
+ vxor $twk5,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ le?vperm $in5,$in5,$in5,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out5,$in5,$twk5
+ xxlor 32+$in0, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in0
+
+ vxor v31,v31,$rndkey0
+ mtctr $rounds
+ b Loop_xts_dec6x
+
+.align 5
+Loop_xts_dec6x:
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_xts_dec6x
+
+ xxlor 32+$eighty7, 1, 1 # 0x010101..87
+
+ subic $len,$len,96 # $len-=96
+ vxor $in0,$twk0,v31 # xor with last round key
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk0,$tweak,$rndkey0
+ vaddubm $tweak,$tweak,$tweak
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+
+ subfe. r0,r0,r0 # borrow?-1:0
+ vand $tmp,$tmp,$eighty7
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ xxlor 32+$in1, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in1
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vxor $in1,$twk1,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk1,$tweak,$rndkey0
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+
+ and r0,r0,$len
+ vaddubm $tweak,$tweak,$tweak
+ vncipher $out0,$out0,v26
+ vncipher $out1,$out1,v26
+ vand $tmp,$tmp,$eighty7
+ vncipher $out2,$out2,v26
+ vncipher $out3,$out3,v26
+ xxlor 32+$in2, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in2
+ vncipher $out4,$out4,v26
+ vncipher $out5,$out5,v26
+
+ add $inp,$inp,r0 # $inp is adjusted in such
+ # way that at exit from the
+ # loop inX-in5 are loaded
+ # with last "words"
+ vxor $in2,$twk2,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk2,$tweak,$rndkey0
+ vaddubm $tweak,$tweak,$tweak
+ vncipher $out0,$out0,v27
+ vncipher $out1,$out1,v27
+ vncipher $out2,$out2,v27
+ vncipher $out3,$out3,v27
+ vand $tmp,$tmp,$eighty7
+ vncipher $out4,$out4,v27
+ vncipher $out5,$out5,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ xxlor 32+$in3, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in3
+ vncipher $out0,$out0,v28
+ vncipher $out1,$out1,v28
+ vxor $in3,$twk3,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk3,$tweak,$rndkey0
+ vncipher $out2,$out2,v28
+ vncipher $out3,$out3,v28
+ vaddubm $tweak,$tweak,$tweak
+ vncipher $out4,$out4,v28
+ vncipher $out5,$out5,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+ vand $tmp,$tmp,$eighty7
+
+ vncipher $out0,$out0,v29
+ vncipher $out1,$out1,v29
+ xxlor 32+$in4, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in4
+ vncipher $out2,$out2,v29
+ vncipher $out3,$out3,v29
+ vxor $in4,$twk4,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk4,$tweak,$rndkey0
+ vncipher $out4,$out4,v29
+ vncipher $out5,$out5,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vaddubm $tweak,$tweak,$tweak
+
+ vncipher $out0,$out0,v30
+ vncipher $out1,$out1,v30
+ vand $tmp,$tmp,$eighty7
+ vncipher $out2,$out2,v30
+ vncipher $out3,$out3,v30
+ xxlor 32+$in5, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in5
+ vncipher $out4,$out4,v30
+ vncipher $out5,$out5,v30
+ vxor $in5,$twk5,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk5,$tweak,$rndkey0
+
+ vncipherlast $out0,$out0,$in0
+ lvx_u $in0,$x00,$inp # load next input block
+ vaddubm $tweak,$tweak,$tweak
+ vncipherlast $out1,$out1,$in1
+ lvx_u $in1,$x10,$inp
+ vncipherlast $out2,$out2,$in2
+ le?vperm $in0,$in0,$in0,$leperm
+ lvx_u $in2,$x20,$inp
+ vand $tmp,$tmp,$eighty7
+ vncipherlast $out3,$out3,$in3
+ le?vperm $in1,$in1,$in1,$leperm
+ lvx_u $in3,$x30,$inp
+ vncipherlast $out4,$out4,$in4
+ le?vperm $in2,$in2,$in2,$leperm
+ lvx_u $in4,$x40,$inp
+ xxlor 10, 32+$in0, 32+$in0
+ xxlor 32+$in0, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in0
+ xxlor 32+$in0, 10, 10
+ vncipherlast $out5,$out5,$in5
+ le?vperm $in3,$in3,$in3,$leperm
+ lvx_u $in5,$x50,$inp
+ addi $inp,$inp,0x60
+ le?vperm $in4,$in4,$in4,$leperm
+ le?vperm $in5,$in5,$in5,$leperm
+
+ le?vperm $out0,$out0,$out0,$leperm
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk0
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ vxor $out1,$in1,$twk1
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ vxor $out2,$in2,$twk2
+ le?vperm $out4,$out4,$out4,$leperm
+ stvx_u $out3,$x30,$out
+ vxor $out3,$in3,$twk3
+ le?vperm $out5,$out5,$out5,$leperm
+ stvx_u $out4,$x40,$out
+ vxor $out4,$in4,$twk4
+ stvx_u $out5,$x50,$out
+ vxor $out5,$in5,$twk5
+ addi $out,$out,0x60
+
+ mtctr $rounds
+ beq Loop_xts_dec6x # did $len-=96 borrow?
+
+ xxlor 32+$eighty7, 2, 2 # 0x010101..87
+
+ addic. $len,$len,0x60
+ beq Lxts_dec6x_zero
+ cmpwi $len,0x20
+ blt Lxts_dec6x_one
+ nop
+ beq Lxts_dec6x_two
+ cmpwi $len,0x40
+ blt Lxts_dec6x_three
+ nop
+ beq Lxts_dec6x_four
+
+Lxts_dec6x_five:
+ vxor $out0,$in1,$twk0
+ vxor $out1,$in2,$twk1
+ vxor $out2,$in3,$twk2
+ vxor $out3,$in4,$twk3
+ vxor $out4,$in5,$twk4
+
+ bl _aesp8_xts_dec5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk5 # unused tweak
+ vxor $twk1,$tweak,$rndkey0
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk1
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$leperm
+ stvx_u $out3,$x30,$out
+ stvx_u $out4,$x40,$out
+ addi $out,$out,0x50
+ bne Lxts_dec6x_steal
+ b Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_four:
+ vxor $out0,$in2,$twk0
+ vxor $out1,$in3,$twk1
+ vxor $out2,$in4,$twk2
+ vxor $out3,$in5,$twk3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_dec5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk4 # unused tweak
+ vmr $twk1,$twk5
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk5
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ stvx_u $out3,$x30,$out
+ addi $out,$out,0x40
+ bne Lxts_dec6x_steal
+ b Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_three:
+ vxor $out0,$in3,$twk0
+ vxor $out1,$in4,$twk1
+ vxor $out2,$in5,$twk2
+ vxor $out3,$out3,$out3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_dec5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk3 # unused tweak
+ vmr $twk1,$twk4
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk4
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ stvx_u $out2,$x20,$out
+ addi $out,$out,0x30
+ bne Lxts_dec6x_steal
+ b Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_two:
+ vxor $out0,$in4,$twk0
+ vxor $out1,$in5,$twk1
+ vxor $out2,$out2,$out2
+ vxor $out3,$out3,$out3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_dec5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk2 # unused tweak
+ vmr $twk1,$twk3
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk3
+ stvx_u $out1,$x10,$out
+ addi $out,$out,0x20
+ bne Lxts_dec6x_steal
+ b Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_one:
+ vxor $out0,$in5,$twk0
+ nop
+Loop_xts_dec1x:
+ vncipher $out0,$out0,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out0,$out0,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_xts_dec1x
+
+ subi r0,$taillen,1
+ vncipher $out0,$out0,v24
+
+ andi. r0,r0,16
+ cmpwi $taillen,0
+ vncipher $out0,$out0,v25
+
+ sub $inp,$inp,r0
+ vncipher $out0,$out0,v26
+
+ lvx_u $in0,0,$inp
+ vncipher $out0,$out0,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vncipher $out0,$out0,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ vncipher $out0,$out0,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vxor $twk0,$twk0,v31
+
+ le?vperm $in0,$in0,$in0,$leperm
+ vncipher $out0,$out0,v30
+
+ mtctr $rounds
+ vncipherlast $out0,$out0,$twk0
+
+ vmr $twk0,$twk1 # unused tweak
+ vmr $twk1,$twk2
+ le?vperm $out0,$out0,$out0,$leperm
+ stvx_u $out0,$x00,$out # store output
+ addi $out,$out,0x10
+ vxor $out0,$in0,$twk2
+ bne Lxts_dec6x_steal
+ b Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_zero:
+ cmpwi $taillen,0
+ beq Lxts_dec6x_done
+
+ lvx_u $in0,0,$inp
+ le?vperm $in0,$in0,$in0,$leperm
+ vxor $out0,$in0,$twk1
+Lxts_dec6x_steal:
+ vncipher $out0,$out0,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out0,$out0,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Lxts_dec6x_steal
+
+ add $inp,$inp,$taillen
+ vncipher $out0,$out0,v24
+
+ cmpwi $taillen,0
+ vncipher $out0,$out0,v25
+
+ lvx_u $in0,0,$inp
+ vncipher $out0,$out0,v26
+
+ lvsr $inpperm,0,$taillen # $in5 is no more
+ vncipher $out0,$out0,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vncipher $out0,$out0,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ vncipher $out0,$out0,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vxor $twk1,$twk1,v31
+
+ le?vperm $in0,$in0,$in0,$leperm
+ vncipher $out0,$out0,v30
+
+ vperm $in0,$in0,$in0,$inpperm
+ vncipherlast $tmp,$out0,$twk1
+
+ le?vperm $out0,$tmp,$tmp,$leperm
+ le?stvx_u $out0,0,$out
+ be?stvx_u $tmp,0,$out
+
+ vxor $out0,$out0,$out0
+ vspltisb $out1,-1
+ vperm $out0,$out0,$out1,$inpperm
+ vsel $out0,$in0,$tmp,$out0
+ vxor $out0,$out0,$twk0
+
+ subi r30,$out,1
+ mtctr $taillen
+Loop_xts_dec6x_steal:
+ lbzu r0,1(r30)
+ stb r0,16(r30)
+ bdnz Loop_xts_dec6x_steal
+
+ li $taillen,0
+ mtctr $rounds
+ b Loop_xts_dec1x # one more time...
+
+.align 4
+Lxts_dec6x_done:
+ ${UCMP}i $ivp,0
+ beq Lxts_dec6x_ret
+
+ vxor $tweak,$twk0,$rndkey0
+ le?vperm $tweak,$tweak,$tweak,$leperm
+ stvx_u $tweak,0,$ivp
+
+Lxts_dec6x_ret:
+ mtlr r11
+ li r10,`$FRAME+15`
+ li r11,`$FRAME+31`
+ stvx $seven,r10,$sp # wipe copies of round keys
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,6,6,0
+ .long 0
+
+.align 5
+_aesp8_xts_dec5x:
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz _aesp8_xts_dec5x
+
+ subi r0,$taillen,1
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+
+ andi. r0,r0,16
+ cmpwi $taillen,0
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vxor $twk0,$twk0,v31
+
+ sub $inp,$inp,r0
+ vncipher $out0,$out0,v26
+ vncipher $out1,$out1,v26
+ vncipher $out2,$out2,v26
+ vncipher $out3,$out3,v26
+ vncipher $out4,$out4,v26
+ vxor $in1,$twk1,v31
+
+ vncipher $out0,$out0,v27
+ lvx_u $in0,0,$inp
+ vncipher $out1,$out1,v27
+ vncipher $out2,$out2,v27
+ vncipher $out3,$out3,v27
+ vncipher $out4,$out4,v27
+ vxor $in2,$twk2,v31
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vncipher $out0,$out0,v28
+ vncipher $out1,$out1,v28
+ vncipher $out2,$out2,v28
+ vncipher $out3,$out3,v28
+ vncipher $out4,$out4,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+ vxor $in3,$twk3,v31
+
+ vncipher $out0,$out0,v29
+ le?vperm $in0,$in0,$in0,$leperm
+ vncipher $out1,$out1,v29
+ vncipher $out2,$out2,v29
+ vncipher $out3,$out3,v29
+ vncipher $out4,$out4,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vxor $in4,$twk4,v31
+
+ vncipher $out0,$out0,v30
+ vncipher $out1,$out1,v30
+ vncipher $out2,$out2,v30
+ vncipher $out3,$out3,v30
+ vncipher $out4,$out4,v30
+
+ vncipherlast $out0,$out0,$twk0
+ vncipherlast $out1,$out1,$in1
+ vncipherlast $out2,$out2,$in2
+ vncipherlast $out3,$out3,$in3
+ vncipherlast $out4,$out4,$in4
+ mtctr $rounds
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+___
+}} }}}
+
+my $consts=1;
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ # constants table endian-specific conversion
+ if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
+ my $conv=$3;
+ my @bytes=();
+
+ # convert to endian-agnostic format
+ if ($1 eq "long") {
+ foreach (split(/,\s*/,$2)) {
+ my $l = /^0/?oct:int;
+ push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+ }
+ } else {
+ @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
+ }
+
+ # little-endian conversion
+ if ($flavour =~ /le$/o) {
+ SWITCH: for($conv) {
+ /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
+ /\?rev/ && do { @bytes=reverse(@bytes); last; };
+ }
+ }
+
+ #emit
+ print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+ next;
+ }
+ $consts=0 if (m/Lconsts:/o); # end of table
+
+ # instructions prefixed with '?' are endian-specific and need
+ # to be adjusted accordingly...
+ if ($flavour =~ /le$/o) { # little-endian
+ s/le\?//o or
+ s/be\?/#be#/o or
+ s/\?lvsr/lvsl/o or
+ s/\?lvsl/lvsr/o or
+ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+ s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+ s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+ } else { # big-endian
+ s/le\?/#le#/o or
+ s/be\?//o or
+ s/\?([a-z]+)/$1/o;
+ }
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/arch/powerpc/crypto/chacha-p10-glue.c b/arch/powerpc/crypto/chacha-p10-glue.c
new file mode 100644
index 000000000000..74fb86b0d209
--- /dev/null
+++ b/arch/powerpc/crypto/chacha-p10-glue.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerPC P10 (ppc64le) accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/chacha.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/sizes.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+
+asmlinkage void chacha_p10le_8x(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+ preempt_disable();
+ enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+ disable_kernel_vsx();
+ preempt_enable();
+}
+
+static void chacha_p10_do_8x(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ unsigned int l = bytes & ~0x0FF;
+
+ if (l > 0) {
+ chacha_p10le_8x(state, dst, src, l, nrounds);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state[12] += l / CHACHA_BLOCK_SIZE;
+ }
+
+ if (bytes > 0)
+ chacha_crypt_generic(state, dst, src, bytes, nrounds);
+}
+
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+ hchacha_block_generic(state, stream, nrounds);
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ int nrounds)
+{
+ if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
+ !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ vsx_begin();
+ chacha_p10_do_8x(state, dst, src, todo, nrounds);
+ vsx_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+static int chacha_p10_stream_xor(struct skcipher_request *req,
+ const struct chacha_ctx *ctx, const u8 *iv)
+{
+ struct skcipher_walk walk;
+ u32 state[16];
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
+
+ chacha_init_generic(state, ctx->key, iv);
+
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes = rounddown(nbytes, walk.stride);
+
+ if (!crypto_simd_usable()) {
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ } else {
+ vsx_begin();
+ chacha_p10_do_8x(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
+ vsx_end();
+ }
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+static int chacha_p10(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return chacha_p10_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha_p10(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct chacha_ctx subctx;
+ u32 state[16];
+ u8 real_iv[16];
+
+ chacha_init_generic(state, ctx->key, req->iv);
+ hchacha_block_arch(state, subctx.key, ctx->nrounds);
+ subctx.nrounds = ctx->nrounds;
+
+ memcpy(&real_iv[0], req->iv + 24, 8);
+ memcpy(&real_iv[8], req->iv + 16, 8);
+ return chacha_p10_stream_xor(req, &subctx, real_iv);
+}
+
+static struct skcipher_alg algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-p10",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = chacha_p10,
+ .decrypt = chacha_p10,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-p10",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = xchacha_p10,
+ .decrypt = xchacha_p10,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-p10",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha12_setkey,
+ .encrypt = xchacha_p10,
+ .decrypt = xchacha_p10,
+ }
+};
+
+static int __init chacha_p10_init(void)
+{
+ static_branch_enable(&have_p10);
+
+ return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_p10_exit(void)
+{
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_cpu_feature_match(PPC_MODULE_FEATURE_P10, chacha_p10_init);
+module_exit(chacha_p10_exit);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (P10 accelerated)");
+MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-p10");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-p10");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-p10");
diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S b/arch/powerpc/crypto/chacha-p10le-8x.S
new file mode 100644
index 000000000000..17bedb66b822
--- /dev/null
+++ b/arch/powerpc/crypto/chacha-p10le-8x.S
@@ -0,0 +1,842 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated chacha20 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
+# size_t len, int nrounds);
+#
+# do rounds, 8 quarter rounds
+# 1. a += b; d ^= a; d <<<= 16;
+# 2. c += d; b ^= c; b <<<= 12;
+# 3. a += b; d ^= a; d <<<= 8;
+# 4. c += d; b ^= c; b <<<= 7
+#
+# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
+# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
+# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
+# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
+#
+# 4 blocks (a b c d)
+#
+# a0 b0 c0 d0
+# a1 b1 c1 d1
+# ...
+# a4 b4 c4 d4
+# ...
+# a8 b8 c8 d8
+# ...
+# a12 b12 c12 d12
+# a13 ...
+# a14 ...
+# a15 b15 c15 d15
+#
+# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+#
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+#include <linux/linkage.h>
+
+.machine "any"
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+ std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ stvx \VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ stxvx \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+ ld \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ lxvx \VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-752(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ addi 9, 1, 256
+ SAVE_VRS 20, 0, 9
+ SAVE_VRS 21, 16, 9
+ SAVE_VRS 22, 32, 9
+ SAVE_VRS 23, 48, 9
+ SAVE_VRS 24, 64, 9
+ SAVE_VRS 25, 80, 9
+ SAVE_VRS 26, 96, 9
+ SAVE_VRS 27, 112, 9
+ SAVE_VRS 28, 128, 9
+ SAVE_VRS 29, 144, 9
+ SAVE_VRS 30, 160, 9
+ SAVE_VRS 31, 176, 9
+
+ SAVE_VSX 14, 192, 9
+ SAVE_VSX 15, 208, 9
+ SAVE_VSX 16, 224, 9
+ SAVE_VSX 17, 240, 9
+ SAVE_VSX 18, 256, 9
+ SAVE_VSX 19, 272, 9
+ SAVE_VSX 20, 288, 9
+ SAVE_VSX 21, 304, 9
+ SAVE_VSX 22, 320, 9
+ SAVE_VSX 23, 336, 9
+ SAVE_VSX 24, 352, 9
+ SAVE_VSX 25, 368, 9
+ SAVE_VSX 26, 384, 9
+ SAVE_VSX 27, 400, 9
+ SAVE_VSX 28, 416, 9
+ SAVE_VSX 29, 432, 9
+ SAVE_VSX 30, 448, 9
+ SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+ addi 9, 1, 256
+ RESTORE_VRS 20, 0, 9
+ RESTORE_VRS 21, 16, 9
+ RESTORE_VRS 22, 32, 9
+ RESTORE_VRS 23, 48, 9
+ RESTORE_VRS 24, 64, 9
+ RESTORE_VRS 25, 80, 9
+ RESTORE_VRS 26, 96, 9
+ RESTORE_VRS 27, 112, 9
+ RESTORE_VRS 28, 128, 9
+ RESTORE_VRS 29, 144, 9
+ RESTORE_VRS 30, 160, 9
+ RESTORE_VRS 31, 176, 9
+
+ RESTORE_VSX 14, 192, 9
+ RESTORE_VSX 15, 208, 9
+ RESTORE_VSX 16, 224, 9
+ RESTORE_VSX 17, 240, 9
+ RESTORE_VSX 18, 256, 9
+ RESTORE_VSX 19, 272, 9
+ RESTORE_VSX 20, 288, 9
+ RESTORE_VSX 21, 304, 9
+ RESTORE_VSX 22, 320, 9
+ RESTORE_VSX 23, 336, 9
+ RESTORE_VSX 24, 352, 9
+ RESTORE_VSX 25, 368, 9
+ RESTORE_VSX 26, 384, 9
+ RESTORE_VSX 27, 400, 9
+ RESTORE_VSX 28, 416, 9
+ RESTORE_VSX 29, 432, 9
+ RESTORE_VSX 30, 448, 9
+ RESTORE_VSX 31, 464, 9
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 752
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+.macro QT_loop_8x
+ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 20, 20
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vadduwm 16, 16, 20
+ vadduwm 17, 17, 21
+ vadduwm 18, 18, 22
+ vadduwm 19, 19, 23
+
+ vpermxor 12, 12, 0, 25
+ vpermxor 13, 13, 1, 25
+ vpermxor 14, 14, 2, 25
+ vpermxor 15, 15, 3, 25
+ vpermxor 28, 28, 16, 25
+ vpermxor 29, 29, 17, 25
+ vpermxor 30, 30, 18, 25
+ vpermxor 31, 31, 19, 25
+ xxlor 32+25, 0, 0
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vadduwm 24, 24, 28
+ vadduwm 25, 25, 29
+ vadduwm 26, 26, 30
+ vadduwm 27, 27, 31
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vxor 20, 20, 24
+ vxor 21, 21, 25
+ vxor 22, 22, 26
+ vxor 23, 23, 27
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 21, 21
+ vrlw 4, 4, 25 #
+ vrlw 5, 5, 25
+ vrlw 6, 6, 25
+ vrlw 7, 7, 25
+ vrlw 20, 20, 25 #
+ vrlw 21, 21, 25
+ vrlw 22, 22, 25
+ vrlw 23, 23, 25
+ xxlor 32+25, 0, 0
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vadduwm 16, 16, 20
+ vadduwm 17, 17, 21
+ vadduwm 18, 18, 22
+ vadduwm 19, 19, 23
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 22, 22
+ vpermxor 12, 12, 0, 25
+ vpermxor 13, 13, 1, 25
+ vpermxor 14, 14, 2, 25
+ vpermxor 15, 15, 3, 25
+ vpermxor 28, 28, 16, 25
+ vpermxor 29, 29, 17, 25
+ vpermxor 30, 30, 18, 25
+ vpermxor 31, 31, 19, 25
+ xxlor 32+25, 0, 0
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vadduwm 24, 24, 28
+ vadduwm 25, 25, 29
+ vadduwm 26, 26, 30
+ vadduwm 27, 27, 31
+ xxlor 0, 32+28, 32+28
+ xxlor 32+28, 23, 23
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vxor 20, 20, 24
+ vxor 21, 21, 25
+ vxor 22, 22, 26
+ vxor 23, 23, 27
+ vrlw 4, 4, 28 #
+ vrlw 5, 5, 28
+ vrlw 6, 6, 28
+ vrlw 7, 7, 28
+ vrlw 20, 20, 28 #
+ vrlw 21, 21, 28
+ vrlw 22, 22, 28
+ vrlw 23, 23, 28
+ xxlor 32+28, 0, 0
+
+ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 20, 20
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vadduwm 16, 16, 21
+ vadduwm 17, 17, 22
+ vadduwm 18, 18, 23
+ vadduwm 19, 19, 20
+
+ vpermxor 15, 15, 0, 25
+ vpermxor 12, 12, 1, 25
+ vpermxor 13, 13, 2, 25
+ vpermxor 14, 14, 3, 25
+ vpermxor 31, 31, 16, 25
+ vpermxor 28, 28, 17, 25
+ vpermxor 29, 29, 18, 25
+ vpermxor 30, 30, 19, 25
+
+ xxlor 32+25, 0, 0
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vadduwm 26, 26, 31
+ vadduwm 27, 27, 28
+ vadduwm 24, 24, 29
+ vadduwm 25, 25, 30
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vxor 21, 21, 26
+ vxor 22, 22, 27
+ vxor 23, 23, 24
+ vxor 20, 20, 25
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 21, 21
+ vrlw 5, 5, 25
+ vrlw 6, 6, 25
+ vrlw 7, 7, 25
+ vrlw 4, 4, 25
+ vrlw 21, 21, 25
+ vrlw 22, 22, 25
+ vrlw 23, 23, 25
+ vrlw 20, 20, 25
+ xxlor 32+25, 0, 0
+
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vadduwm 16, 16, 21
+ vadduwm 17, 17, 22
+ vadduwm 18, 18, 23
+ vadduwm 19, 19, 20
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 22, 22
+ vpermxor 15, 15, 0, 25
+ vpermxor 12, 12, 1, 25
+ vpermxor 13, 13, 2, 25
+ vpermxor 14, 14, 3, 25
+ vpermxor 31, 31, 16, 25
+ vpermxor 28, 28, 17, 25
+ vpermxor 29, 29, 18, 25
+ vpermxor 30, 30, 19, 25
+ xxlor 32+25, 0, 0
+
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vadduwm 26, 26, 31
+ vadduwm 27, 27, 28
+ vadduwm 24, 24, 29
+ vadduwm 25, 25, 30
+
+ xxlor 0, 32+28, 32+28
+ xxlor 32+28, 23, 23
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vxor 21, 21, 26
+ vxor 22, 22, 27
+ vxor 23, 23, 24
+ vxor 20, 20, 25
+ vrlw 5, 5, 28
+ vrlw 6, 6, 28
+ vrlw 7, 7, 28
+ vrlw 4, 4, 28
+ vrlw 21, 21, 28
+ vrlw 22, 22, 28
+ vrlw 23, 23, 28
+ vrlw 20, 20, 28
+ xxlor 32+28, 0, 0
+.endm
+
+.macro QT_loop_4x
+ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vpermxor 12, 12, 0, 20
+ vpermxor 13, 13, 1, 20
+ vpermxor 14, 14, 2, 20
+ vpermxor 15, 15, 3, 20
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vrlw 4, 4, 21
+ vrlw 5, 5, 21
+ vrlw 6, 6, 21
+ vrlw 7, 7, 21
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vpermxor 12, 12, 0, 22
+ vpermxor 13, 13, 1, 22
+ vpermxor 14, 14, 2, 22
+ vpermxor 15, 15, 3, 22
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vrlw 4, 4, 23
+ vrlw 5, 5, 23
+ vrlw 6, 6, 23
+ vrlw 7, 7, 23
+
+ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vpermxor 15, 15, 0, 20
+ vpermxor 12, 12, 1, 20
+ vpermxor 13, 13, 2, 20
+ vpermxor 14, 14, 3, 20
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vrlw 5, 5, 21
+ vrlw 6, 6, 21
+ vrlw 7, 7, 21
+ vrlw 4, 4, 21
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vpermxor 15, 15, 0, 22
+ vpermxor 12, 12, 1, 22
+ vpermxor 13, 13, 2, 22
+ vpermxor 14, 14, 3, 22
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vrlw 5, 5, 23
+ vrlw 6, 6, 23
+ vrlw 7, 7, 23
+ vrlw 4, 4, 23
+.endm
+
+# Transpose
+.macro TP_4x a0 a1 a2 a3
+ xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1
+ xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3
+ xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1
+ xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3
+ xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3
+ xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3
+ xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3
+ xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3
+.endm
+
+# key stream = working state + state
+.macro Add_state S
+ vadduwm \S+0, \S+0, 16-\S
+ vadduwm \S+4, \S+4, 17-\S
+ vadduwm \S+8, \S+8, 18-\S
+ vadduwm \S+12, \S+12, 19-\S
+
+ vadduwm \S+1, \S+1, 16-\S
+ vadduwm \S+5, \S+5, 17-\S
+ vadduwm \S+9, \S+9, 18-\S
+ vadduwm \S+13, \S+13, 19-\S
+
+ vadduwm \S+2, \S+2, 16-\S
+ vadduwm \S+6, \S+6, 17-\S
+ vadduwm \S+10, \S+10, 18-\S
+ vadduwm \S+14, \S+14, 19-\S
+
+ vadduwm \S+3, \S+3, 16-\S
+ vadduwm \S+7, \S+7, 17-\S
+ vadduwm \S+11, \S+11, 18-\S
+ vadduwm \S+15, \S+15, 19-\S
+.endm
+
+#
+# write 256 bytes
+#
+.macro Write_256 S
+ add 9, 14, 5
+ add 16, 14, 4
+ lxvw4x 0, 0, 9
+ lxvw4x 1, 17, 9
+ lxvw4x 2, 18, 9
+ lxvw4x 3, 19, 9
+ lxvw4x 4, 20, 9
+ lxvw4x 5, 21, 9
+ lxvw4x 6, 22, 9
+ lxvw4x 7, 23, 9
+ lxvw4x 8, 24, 9
+ lxvw4x 9, 25, 9
+ lxvw4x 10, 26, 9
+ lxvw4x 11, 27, 9
+ lxvw4x 12, 28, 9
+ lxvw4x 13, 29, 9
+ lxvw4x 14, 30, 9
+ lxvw4x 15, 31, 9
+
+ xxlxor \S+32, \S+32, 0
+ xxlxor \S+36, \S+36, 1
+ xxlxor \S+40, \S+40, 2
+ xxlxor \S+44, \S+44, 3
+ xxlxor \S+33, \S+33, 4
+ xxlxor \S+37, \S+37, 5
+ xxlxor \S+41, \S+41, 6
+ xxlxor \S+45, \S+45, 7
+ xxlxor \S+34, \S+34, 8
+ xxlxor \S+38, \S+38, 9
+ xxlxor \S+42, \S+42, 10
+ xxlxor \S+46, \S+46, 11
+ xxlxor \S+35, \S+35, 12
+ xxlxor \S+39, \S+39, 13
+ xxlxor \S+43, \S+43, 14
+ xxlxor \S+47, \S+47, 15
+
+ stxvw4x \S+32, 0, 16
+ stxvw4x \S+36, 17, 16
+ stxvw4x \S+40, 18, 16
+ stxvw4x \S+44, 19, 16
+
+ stxvw4x \S+33, 20, 16
+ stxvw4x \S+37, 21, 16
+ stxvw4x \S+41, 22, 16
+ stxvw4x \S+45, 23, 16
+
+ stxvw4x \S+34, 24, 16
+ stxvw4x \S+38, 25, 16
+ stxvw4x \S+42, 26, 16
+ stxvw4x \S+46, 27, 16
+
+ stxvw4x \S+35, 28, 16
+ stxvw4x \S+39, 29, 16
+ stxvw4x \S+43, 30, 16
+ stxvw4x \S+47, 31, 16
+
+.endm
+
+#
+# chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
+#
+SYM_FUNC_START(chacha_p10le_8x)
+.align 5
+ cmpdi 6, 0
+ ble Out_no_chacha
+
+ SAVE_REGS
+
+ # r17 - r31 mainly for Write_256 macro.
+ li 17, 16
+ li 18, 32
+ li 19, 48
+ li 20, 64
+ li 21, 80
+ li 22, 96
+ li 23, 112
+ li 24, 128
+ li 25, 144
+ li 26, 160
+ li 27, 176
+ li 28, 192
+ li 29, 208
+ li 30, 224
+ li 31, 240
+
+ mr 15, 6 # len
+ li 14, 0 # offset to inp and outp
+
+ lxvw4x 48, 0, 3 # vr16, constants
+ lxvw4x 49, 17, 3 # vr17, key 1
+ lxvw4x 50, 18, 3 # vr18, key 2
+ lxvw4x 51, 19, 3 # vr19, counter, nonce
+
+ # create (0, 1, 2, 3) counters
+ vspltisw 0, 0
+ vspltisw 1, 1
+ vspltisw 2, 2
+ vspltisw 3, 3
+ vmrghw 4, 0, 1
+ vmrglw 5, 2, 3
+ vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3)
+
+ vspltisw 21, 12
+ vspltisw 23, 7
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxvw4x 32+20, 0, 11
+ lxvw4x 32+22, 17, 11
+
+ sradi 8, 7, 1
+
+ mtctr 8
+
+ # save constants to vsx
+ xxlor 16, 48, 48
+ xxlor 17, 49, 49
+ xxlor 18, 50, 50
+ xxlor 19, 51, 51
+
+ vspltisw 25, 4
+ vspltisw 26, 8
+
+ xxlor 25, 32+26, 32+26
+ xxlor 24, 32+25, 32+25
+
+ vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4)
+ xxlor 30, 32+30, 32+30
+ xxlor 31, 32+31, 32+31
+
+ xxlor 20, 32+20, 32+20
+ xxlor 21, 32+21, 32+21
+ xxlor 22, 32+22, 32+22
+ xxlor 23, 32+23, 32+23
+
+ cmpdi 6, 512
+ blt Loop_last
+
+Loop_8x:
+ xxspltw 32+0, 16, 0
+ xxspltw 32+1, 16, 1
+ xxspltw 32+2, 16, 2
+ xxspltw 32+3, 16, 3
+
+ xxspltw 32+4, 17, 0
+ xxspltw 32+5, 17, 1
+ xxspltw 32+6, 17, 2
+ xxspltw 32+7, 17, 3
+ xxspltw 32+8, 18, 0
+ xxspltw 32+9, 18, 1
+ xxspltw 32+10, 18, 2
+ xxspltw 32+11, 18, 3
+ xxspltw 32+12, 19, 0
+ xxspltw 32+13, 19, 1
+ xxspltw 32+14, 19, 2
+ xxspltw 32+15, 19, 3
+ vadduwm 12, 12, 30 # increase counter
+
+ xxspltw 32+16, 16, 0
+ xxspltw 32+17, 16, 1
+ xxspltw 32+18, 16, 2
+ xxspltw 32+19, 16, 3
+
+ xxspltw 32+20, 17, 0
+ xxspltw 32+21, 17, 1
+ xxspltw 32+22, 17, 2
+ xxspltw 32+23, 17, 3
+ xxspltw 32+24, 18, 0
+ xxspltw 32+25, 18, 1
+ xxspltw 32+26, 18, 2
+ xxspltw 32+27, 18, 3
+ xxspltw 32+28, 19, 0
+ xxspltw 32+29, 19, 1
+ vadduwm 28, 28, 31 # increase counter
+ xxspltw 32+30, 19, 2
+ xxspltw 32+31, 19, 3
+
+.align 5
+quarter_loop_8x:
+ QT_loop_8x
+
+ bdnz quarter_loop_8x
+
+ xxlor 0, 32+30, 32+30
+ xxlor 32+30, 30, 30
+ vadduwm 12, 12, 30
+ xxlor 32+30, 0, 0
+ TP_4x 0, 1, 2, 3
+ TP_4x 4, 5, 6, 7
+ TP_4x 8, 9, 10, 11
+ TP_4x 12, 13, 14, 15
+
+ xxlor 0, 48, 48
+ xxlor 1, 49, 49
+ xxlor 2, 50, 50
+ xxlor 3, 51, 51
+ xxlor 48, 16, 16
+ xxlor 49, 17, 17
+ xxlor 50, 18, 18
+ xxlor 51, 19, 19
+ Add_state 0
+ xxlor 48, 0, 0
+ xxlor 49, 1, 1
+ xxlor 50, 2, 2
+ xxlor 51, 3, 3
+ Write_256 0
+ addi 14, 14, 256 # offset +=256
+ addi 15, 15, -256 # len -=256
+
+ xxlor 5, 32+31, 32+31
+ xxlor 32+31, 31, 31
+ vadduwm 28, 28, 31
+ xxlor 32+31, 5, 5
+ TP_4x 16+0, 16+1, 16+2, 16+3
+ TP_4x 16+4, 16+5, 16+6, 16+7
+ TP_4x 16+8, 16+9, 16+10, 16+11
+ TP_4x 16+12, 16+13, 16+14, 16+15
+
+ xxlor 32, 16, 16
+ xxlor 33, 17, 17
+ xxlor 34, 18, 18
+ xxlor 35, 19, 19
+ Add_state 16
+ Write_256 16
+ addi 14, 14, 256 # offset +=256
+ addi 15, 15, -256 # len +=256
+
+ xxlor 32+24, 24, 24
+ xxlor 32+25, 25, 25
+ xxlor 32+30, 30, 30
+ vadduwm 30, 30, 25
+ vadduwm 31, 30, 24
+ xxlor 30, 32+30, 32+30
+ xxlor 31, 32+31, 32+31
+
+ cmpdi 15, 0
+ beq Out_loop
+
+ cmpdi 15, 512
+ blt Loop_last
+
+ mtctr 8
+ b Loop_8x
+
+Loop_last:
+ lxvw4x 48, 0, 3 # vr16, constants
+ lxvw4x 49, 17, 3 # vr17, key 1
+ lxvw4x 50, 18, 3 # vr18, key 2
+ lxvw4x 51, 19, 3 # vr19, counter, nonce
+
+ vspltisw 21, 12
+ vspltisw 23, 7
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxvw4x 32+20, 0, 11
+ lxvw4x 32+22, 17, 11
+
+ sradi 8, 7, 1
+ mtctr 8
+
+Loop_4x:
+ vspltw 0, 16, 0
+ vspltw 1, 16, 1
+ vspltw 2, 16, 2
+ vspltw 3, 16, 3
+
+ vspltw 4, 17, 0
+ vspltw 5, 17, 1
+ vspltw 6, 17, 2
+ vspltw 7, 17, 3
+ vspltw 8, 18, 0
+ vspltw 9, 18, 1
+ vspltw 10, 18, 2
+ vspltw 11, 18, 3
+ vspltw 12, 19, 0
+ vadduwm 12, 12, 30 # increase counter
+ vspltw 13, 19, 1
+ vspltw 14, 19, 2
+ vspltw 15, 19, 3
+
+.align 5
+quarter_loop:
+ QT_loop_4x
+
+ bdnz quarter_loop
+
+ vadduwm 12, 12, 30
+ TP_4x 0, 1, 2, 3
+ TP_4x 4, 5, 6, 7
+ TP_4x 8, 9, 10, 11
+ TP_4x 12, 13, 14, 15
+
+ Add_state 0
+ Write_256 0
+ addi 14, 14, 256 # offset += 256
+ addi 15, 15, -256 # len += 256
+
+ # Update state counter
+ vspltisw 25, 4
+ vadduwm 30, 30, 25
+
+ cmpdi 15, 0
+ beq Out_loop
+ cmpdi 15, 256
+ blt Out_loop
+
+ mtctr 8
+ b Loop_4x
+
+Out_loop:
+ RESTORE_REGS
+ blr
+
+Out_no_chacha:
+ li 3, 0
+ blr
+SYM_FUNC_END(chacha_p10le_8x)
+
+SYM_DATA_START_LOCAL(PERMX)
+.align 5
+permx:
+.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
+.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
+SYM_DATA_END(PERMX)
diff --git a/arch/powerpc/crypto/crc-vpmsum_test.c b/arch/powerpc/crypto/crc-vpmsum_test.c
index dce86e75f1a8..c61a874a3a5c 100644
--- a/arch/powerpc/crypto/crc-vpmsum_test.c
+++ b/arch/powerpc/crypto/crc-vpmsum_test.c
@@ -9,6 +9,7 @@
#include <crypto/internal/hash.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/random.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/cpufeature.h>
@@ -22,10 +23,11 @@ static unsigned long iterations = 10000;
static int __init crc_test_init(void)
{
u16 crc16 = 0, verify16 = 0;
- u32 crc32 = 0, verify32 = 0;
__le32 verify32le = 0;
unsigned char *data;
+ u32 verify32 = 0;
unsigned long i;
+ __le32 crc32;
int ret;
struct crypto_shash *crct10dif_tfm;
@@ -75,12 +77,12 @@ static int __init crc_test_init(void)
pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations);
for (i=0; i<iterations; i++) {
- size_t offset = prandom_u32_max(16);
- size_t len = prandom_u32_max(MAX_CRC_LENGTH);
+ size_t offset = get_random_u32_below(16);
+ size_t len = get_random_u32_below(MAX_CRC_LENGTH);
if (len <= offset)
continue;
- prandom_bytes(data, len);
+ get_random_bytes(data, len);
len -= offset;
crypto_shash_update(crct10dif_shash, data+offset, len);
@@ -98,7 +100,7 @@ static int __init crc_test_init(void)
crypto_shash_final(crc32c_shash, (u8 *)(&crc32));
verify32 = le32_to_cpu(verify32le);
verify32le = ~cpu_to_le32(__crc32c_le(~verify32, data+offset, len));
- if (crc32 != (u32)verify32le) {
+ if (crc32 != verify32le) {
pr_err("FAILURE in CRC32: got 0x%08x expected 0x%08x (len %lu)\n",
crc32, verify32, len);
break;
diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S
index c3524eba4d0d..b0f87f595b26 100644
--- a/arch/powerpc/crypto/crc32-vpmsum_core.S
+++ b/arch/powerpc/crypto/crc32-vpmsum_core.S
@@ -19,7 +19,7 @@
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
* for n = CRC using POWER8 instructions. We use x = 32.
*
- * http://en.wikipedia.org/wiki/Barrett_reduction
+ * https://en.wikipedia.org/wiki/Barrett_reduction
*
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
*/
@@ -113,9 +113,7 @@ FUNC_START(CRC_FUNCTION_NAME)
#endif
#ifdef BYTESWAP_DATA
- addis r3,r2,.byteswap_constant@toc@ha
- addi r3,r3,.byteswap_constant@toc@l
-
+ LOAD_REG_ADDR(r3, .byteswap_constant)
lvx byteswap,0,r3
addi r3,r3,16
#endif
@@ -150,8 +148,7 @@ FUNC_START(CRC_FUNCTION_NAME)
addi r7,r7,-1
mtctr r7
- addis r3,r2,.constants@toc@ha
- addi r3,r3,.constants@toc@l
+ LOAD_REG_ADDR(r3, .constants)
/* Find the start of our constants */
add r3,r3,r8
@@ -506,8 +503,7 @@ FUNC_START(CRC_FUNCTION_NAME)
.Lbarrett_reduction:
/* Barrett constants */
- addis r3,r2,.barrett_constants@toc@ha
- addi r3,r3,.barrett_constants@toc@l
+ LOAD_REG_ADDR(r3, .barrett_constants)
lvx const1,0,r3
lvx const2,off16,r3
@@ -610,8 +606,7 @@ FUNC_START(CRC_FUNCTION_NAME)
cmpdi r5,0
beq .Lzero
- addis r3,r2,.short_constants@toc@ha
- addi r3,r3,.short_constants@toc@l
+ LOAD_REG_ADDR(r3, .short_constants)
/* Calculate where in the constant table we need to start */
subfic r6,r5,256
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
index 2c232898b933..63760b7dbb76 100644
--- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c
+++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
@@ -73,10 +73,8 @@ static int crc32c_vpmsum_setkey(struct crypto_shash *hash, const u8 *key,
{
u32 *mctx = crypto_shash_ctx(hash);
- if (keylen != sizeof(u32)) {
- crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (keylen != sizeof(u32))
return -EINVAL;
- }
*mctx = le32_to_cpup((__le32 *)key);
return 0;
}
diff --git a/arch/powerpc/crypto/ghash.c b/arch/powerpc/crypto/ghash.c
new file mode 100644
index 000000000000..77eca20bc7ac
--- /dev/null
+++ b/arch/powerpc/crypto/ghash.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * GHASH routines supporting VMX instructions on the Power 8
+ *
+ * Copyright (C) 2015, 2019 International Business Machines Inc.
+ *
+ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ *
+ * Extended by Daniel Axtens <dja@axtens.net> to replace the fallback
+ * mechanism. The new approach is based on arm64 code, which is:
+ * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/crypto.h>
+#include <linux/delay.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <crypto/aes.h>
+#include <crypto/ghash.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/b128ops.h>
+#include "aesp8-ppc.h"
+
+void gcm_init_p8(u128 htable[16], const u64 Xi[2]);
+void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]);
+void gcm_ghash_p8(u64 Xi[2], const u128 htable[16],
+ const u8 *in, size_t len);
+
+struct p8_ghash_ctx {
+ /* key used by vector asm */
+ u128 htable[16];
+ /* key used by software fallback */
+ be128 key;
+};
+
+struct p8_ghash_desc_ctx {
+ u64 shash[2];
+ u8 buffer[GHASH_DIGEST_SIZE];
+ int bytes;
+};
+
+static int p8_ghash_init(struct shash_desc *desc)
+{
+ struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ dctx->bytes = 0;
+ memset(dctx->shash, 0, GHASH_DIGEST_SIZE);
+ return 0;
+}
+
+static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(tfm));
+
+ if (keylen != GHASH_BLOCK_SIZE)
+ return -EINVAL;
+
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ gcm_init_p8(ctx->htable, (const u64 *) key);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+
+ memcpy(&ctx->key, key, GHASH_BLOCK_SIZE);
+
+ return 0;
+}
+
+static inline void __ghash_block(struct p8_ghash_ctx *ctx,
+ struct p8_ghash_desc_ctx *dctx)
+{
+ if (crypto_simd_usable()) {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ gcm_ghash_p8(dctx->shash, ctx->htable,
+ dctx->buffer, GHASH_DIGEST_SIZE);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+ } else {
+ crypto_xor((u8 *)dctx->shash, dctx->buffer, GHASH_BLOCK_SIZE);
+ gf128mul_lle((be128 *)dctx->shash, &ctx->key);
+ }
+}
+
+static inline void __ghash_blocks(struct p8_ghash_ctx *ctx,
+ struct p8_ghash_desc_ctx *dctx,
+ const u8 *src, unsigned int srclen)
+{
+ if (crypto_simd_usable()) {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_vsx();
+ gcm_ghash_p8(dctx->shash, ctx->htable,
+ src, srclen);
+ disable_kernel_vsx();
+ pagefault_enable();
+ preempt_enable();
+ } else {
+ while (srclen >= GHASH_BLOCK_SIZE) {
+ crypto_xor((u8 *)dctx->shash, src, GHASH_BLOCK_SIZE);
+ gf128mul_lle((be128 *)dctx->shash, &ctx->key);
+ srclen -= GHASH_BLOCK_SIZE;
+ src += GHASH_BLOCK_SIZE;
+ }
+ }
+}
+
+static int p8_ghash_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ unsigned int len;
+ struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
+ struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (dctx->bytes) {
+ if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
+ memcpy(dctx->buffer + dctx->bytes, src,
+ srclen);
+ dctx->bytes += srclen;
+ return 0;
+ }
+ memcpy(dctx->buffer + dctx->bytes, src,
+ GHASH_DIGEST_SIZE - dctx->bytes);
+
+ __ghash_block(ctx, dctx);
+
+ src += GHASH_DIGEST_SIZE - dctx->bytes;
+ srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
+ dctx->bytes = 0;
+ }
+ len = srclen & ~(GHASH_DIGEST_SIZE - 1);
+ if (len) {
+ __ghash_blocks(ctx, dctx, src, len);
+ src += len;
+ srclen -= len;
+ }
+ if (srclen) {
+ memcpy(dctx->buffer, src, srclen);
+ dctx->bytes = srclen;
+ }
+ return 0;
+}
+
+static int p8_ghash_final(struct shash_desc *desc, u8 *out)
+{
+ int i;
+ struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
+ struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (dctx->bytes) {
+ for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
+ dctx->buffer[i] = 0;
+ __ghash_block(ctx, dctx);
+ dctx->bytes = 0;
+ }
+ memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
+ return 0;
+}
+
+struct shash_alg p8_ghash_alg = {
+ .digestsize = GHASH_DIGEST_SIZE,
+ .init = p8_ghash_init,
+ .update = p8_ghash_update,
+ .final = p8_ghash_final,
+ .setkey = p8_ghash_setkey,
+ .descsize = sizeof(struct p8_ghash_desc_ctx)
+ + sizeof(struct ghash_desc_ctx),
+ .base = {
+ .cra_name = "ghash",
+ .cra_driver_name = "p8_ghash",
+ .cra_priority = 1000,
+ .cra_blocksize = GHASH_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct p8_ghash_ctx),
+ .cra_module = THIS_MODULE,
+ },
+};
diff --git a/arch/powerpc/crypto/ghashp10-ppc.pl b/arch/powerpc/crypto/ghashp10-ppc.pl
new file mode 100644
index 000000000000..27a6b0bec645
--- /dev/null
+++ b/arch/powerpc/crypto/ghashp10-ppc.pl
@@ -0,0 +1,370 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for PowerISA v2.07.
+#
+# July 2014
+#
+# Accurate performance measurements are problematic, because it's
+# always virtualized setup with possibly throttled processor.
+# Relative comparison is therefore more informative. This initial
+# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
+# faster than "4-bit" integer-only compiler-generated 64-bit code.
+# "Initial version" means that there is room for futher improvement.
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T=8;
+ $LRSAVE=2*$SIZE_T;
+ $STU="stdu";
+ $POP="ld";
+ $PUSH="std";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T=4;
+ $LRSAVE=$SIZE_T;
+ $STU="stwu";
+ $POP="lwz";
+ $PUSH="stw";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
+
+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
+
+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
+my $vrsave="r12";
+my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
+
+$code=<<___;
+.machine "any"
+
+.text
+
+.globl .gcm_init_p10
+ lis r0,0xfff0
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $H,0,r4 # load H
+ le?xor r7,r7,r7
+ le?addi r7,r7,0x8 # need a vperm start with 08
+ le?lvsr 5,0,r7
+ le?vspltisb 6,0x0f
+ le?vxor 5,5,6 # set a b-endian mask
+ le?vperm $H,$H,$H,5
+
+ vspltisb $xC2,-16 # 0xf0
+ vspltisb $t0,1 # one
+ vaddubm $xC2,$xC2,$xC2 # 0xe0
+ vxor $zero,$zero,$zero
+ vor $xC2,$xC2,$t0 # 0xe1
+ vsldoi $xC2,$xC2,$zero,15 # 0xe1...
+ vsldoi $t1,$zero,$t0,1 # ...1
+ vaddubm $xC2,$xC2,$xC2 # 0xc2...
+ vspltisb $t2,7
+ vor $xC2,$xC2,$t1 # 0xc2....01
+ vspltb $t1,$H,0 # most significant byte
+ vsl $H,$H,$t0 # H<<=1
+ vsrab $t1,$t1,$t2 # broadcast carry bit
+ vand $t1,$t1,$xC2
+ vxor $H,$H,$t1 # twisted H
+
+ vsldoi $H,$H,$H,8 # twist even more ...
+ vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
+ vsldoi $Hl,$zero,$H,8 # ... and split
+ vsldoi $Hh,$H,$zero,8
+
+ stvx_u $xC2,0,r3 # save pre-computed table
+ stvx_u $Hl,r8,r3
+ stvx_u $H, r9,r3
+ stvx_u $Hh,r10,r3
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_init_p10,.-.gcm_init_p10
+
+.globl .gcm_init_htable
+ lis r0,0xfff0
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $H,0,r4 # load H
+
+ vspltisb $xC2,-16 # 0xf0
+ vspltisb $t0,1 # one
+ vaddubm $xC2,$xC2,$xC2 # 0xe0
+ vxor $zero,$zero,$zero
+ vor $xC2,$xC2,$t0 # 0xe1
+ vsldoi $xC2,$xC2,$zero,15 # 0xe1...
+ vsldoi $t1,$zero,$t0,1 # ...1
+ vaddubm $xC2,$xC2,$xC2 # 0xc2...
+ vspltisb $t2,7
+ vor $xC2,$xC2,$t1 # 0xc2....01
+ vspltb $t1,$H,0 # most significant byte
+ vsl $H,$H,$t0 # H<<=1
+ vsrab $t1,$t1,$t2 # broadcast carry bit
+ vand $t1,$t1,$xC2
+ vxor $IN,$H,$t1 # twisted H
+
+ vsldoi $H,$IN,$IN,8 # twist even more ...
+ vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
+ vsldoi $Hl,$zero,$H,8 # ... and split
+ vsldoi $Hh,$H,$zero,8
+
+ stvx_u $xC2,0,r3 # save pre-computed table
+ stvx_u $Hl,r8,r3
+ li r8,0x40
+ stvx_u $H, r9,r3
+ li r9,0x50
+ stvx_u $Hh,r10,r3
+ li r10,0x60
+
+ vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
+ vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
+ vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $IN1,$Xl,$t1
+
+ vsldoi $H2,$IN1,$IN1,8
+ vsldoi $H2l,$zero,$H2,8
+ vsldoi $H2h,$H2,$zero,8
+
+ stvx_u $H2l,r8,r3 # save H^2
+ li r8,0x70
+ stvx_u $H2,r9,r3
+ li r9,0x80
+ stvx_u $H2h,r10,r3
+ li r10,0x90
+
+ vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
+ vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
+ vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
+ vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
+ vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
+ vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+ vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vsldoi $t4,$Xm1,$zero,8
+ vsldoi $t5,$zero,$Xm1,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+ vxor $Xl1,$Xl1,$t4
+ vxor $Xh1,$Xh1,$t5
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vsldoi $Xl1,$Xl1,$Xl1,8
+ vxor $Xl,$Xl,$t2
+ vxor $Xl1,$Xl1,$t6
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ vpmsumd $Xl1,$Xl1,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $t5,$t5,$Xh1
+ vxor $Xl,$Xl,$t1
+ vxor $Xl1,$Xl1,$t5
+
+ vsldoi $H,$Xl,$Xl,8
+ vsldoi $H2,$Xl1,$Xl1,8
+ vsldoi $Hl,$zero,$H,8
+ vsldoi $Hh,$H,$zero,8
+ vsldoi $H2l,$zero,$H2,8
+ vsldoi $H2h,$H2,$zero,8
+
+ stvx_u $Hl,r8,r3 # save H^3
+ li r8,0xa0
+ stvx_u $H,r9,r3
+ li r9,0xb0
+ stvx_u $Hh,r10,r3
+ li r10,0xc0
+ stvx_u $H2l,r8,r3 # save H^4
+ stvx_u $H2,r9,r3
+ stvx_u $H2h,r10,r3
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_init_htable,.-.gcm_init_htable
+
+.globl .gcm_gmult_p10
+ lis r0,0xfff8
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $IN,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $zero,$zero,$zero
+
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $Xl,$Xl,$t1
+
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_gmult_p10,.-.gcm_gmult_p10
+
+.globl .gcm_ghash_p10
+ lis r0,0xfff8
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $Xl,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ vxor $zero,$zero,$zero
+
+ lvx_u $IN,0,$inp
+ addi $inp,$inp,16
+ subi $len,$len,16
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $IN,$IN,$Xl
+ b Loop
+
+.align 5
+Loop:
+ subic $len,$len,16
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ subfe. r0,r0,r0 # borrow?-1:0
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ and r0,r0,$len
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+ add $inp,$inp,r0
+
+ vpmsumd $t2,$Xl,$xC2 # 1st phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+ lvx_u $IN,0,$inp
+ addi $inp,$inp,16
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vpmsumd $Xl,$Xl,$xC2
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $t1,$t1,$Xh
+ vxor $IN,$IN,$t1
+ vxor $IN,$IN,$Xl
+ beq Loop # did $len-=16 borrow?
+
+ vxor $Xl,$Xl,$t1
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
+.size .gcm_ghash_p10,.-.gcm_ghash_p10
+
+.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ if ($flavour =~ /le$/o) { # little-endian
+ s/le\?//o or
+ s/be\?/#be#/o;
+ } else {
+ s/le\?/#le#/o or
+ s/be\?//o;
+ }
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/powerpc/crypto/ghashp8-ppc.pl b/arch/powerpc/crypto/ghashp8-ppc.pl
new file mode 100644
index 000000000000..041e633c214f
--- /dev/null
+++ b/arch/powerpc/crypto/ghashp8-ppc.pl
@@ -0,0 +1,243 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for PowerISA v2.07.
+#
+# July 2014
+#
+# Accurate performance measurements are problematic, because it's
+# always virtualized setup with possibly throttled processor.
+# Relative comparison is therefore more informative. This initial
+# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
+# faster than "4-bit" integer-only compiler-generated 64-bit code.
+# "Initial version" means that there is room for futher improvement.
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T=8;
+ $LRSAVE=2*$SIZE_T;
+ $STU="stdu";
+ $POP="ld";
+ $PUSH="std";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T=4;
+ $LRSAVE=$SIZE_T;
+ $STU="stwu";
+ $POP="lwz";
+ $PUSH="stw";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
+
+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
+
+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my $vrsave="r12";
+
+$code=<<___;
+.machine "any"
+
+.text
+
+.globl .gcm_init_p8
+ lis r0,0xfff0
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $H,0,r4 # load H
+ le?xor r7,r7,r7
+ le?addi r7,r7,0x8 # need a vperm start with 08
+ le?lvsr 5,0,r7
+ le?vspltisb 6,0x0f
+ le?vxor 5,5,6 # set a b-endian mask
+ le?vperm $H,$H,$H,5
+
+ vspltisb $xC2,-16 # 0xf0
+ vspltisb $t0,1 # one
+ vaddubm $xC2,$xC2,$xC2 # 0xe0
+ vxor $zero,$zero,$zero
+ vor $xC2,$xC2,$t0 # 0xe1
+ vsldoi $xC2,$xC2,$zero,15 # 0xe1...
+ vsldoi $t1,$zero,$t0,1 # ...1
+ vaddubm $xC2,$xC2,$xC2 # 0xc2...
+ vspltisb $t2,7
+ vor $xC2,$xC2,$t1 # 0xc2....01
+ vspltb $t1,$H,0 # most significant byte
+ vsl $H,$H,$t0 # H<<=1
+ vsrab $t1,$t1,$t2 # broadcast carry bit
+ vand $t1,$t1,$xC2
+ vxor $H,$H,$t1 # twisted H
+
+ vsldoi $H,$H,$H,8 # twist even more ...
+ vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
+ vsldoi $Hl,$zero,$H,8 # ... and split
+ vsldoi $Hh,$H,$zero,8
+
+ stvx_u $xC2,0,r3 # save pre-computed table
+ stvx_u $Hl,r8,r3
+ stvx_u $H, r9,r3
+ stvx_u $Hh,r10,r3
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_init_p8,.-.gcm_init_p8
+
+.globl .gcm_gmult_p8
+ lis r0,0xfff8
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $IN,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $zero,$zero,$zero
+
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $Xl,$Xl,$t1
+
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_gmult_p8,.-.gcm_gmult_p8
+
+.globl .gcm_ghash_p8
+ lis r0,0xfff8
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $Xl,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ vxor $zero,$zero,$zero
+
+ lvx_u $IN,0,$inp
+ addi $inp,$inp,16
+ subi $len,$len,16
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $IN,$IN,$Xl
+ b Loop
+
+.align 5
+Loop:
+ subic $len,$len,16
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ subfe. r0,r0,r0 # borrow?-1:0
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ and r0,r0,$len
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+ add $inp,$inp,r0
+
+ vpmsumd $t2,$Xl,$xC2 # 1st phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+ lvx_u $IN,0,$inp
+ addi $inp,$inp,16
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vpmsumd $Xl,$Xl,$xC2
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $t1,$t1,$Xh
+ vxor $IN,$IN,$t1
+ vxor $IN,$IN,$Xl
+ beq Loop # did $len-=16 borrow?
+
+ vxor $Xl,$Xl,$t1
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
+.size .gcm_ghash_p8,.-.gcm_ghash_p8
+
+.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ if ($flavour =~ /le$/o) { # little-endian
+ s/le\?//o or
+ s/be\?/#be#/o;
+ } else {
+ s/le\?/#le#/o or
+ s/be\?//o;
+ }
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/powerpc/crypto/md5-asm.S b/arch/powerpc/crypto/md5-asm.S
index 948d100a2934..fa6bc440cf4a 100644
--- a/arch/powerpc/crypto/md5-asm.S
+++ b/arch/powerpc/crypto/md5-asm.S
@@ -38,15 +38,11 @@
#define INITIALIZE \
PPC_STLU r1,-INT_FRAME_SIZE(r1); \
- SAVE_8GPRS(14, r1); /* push registers onto stack */ \
- SAVE_4GPRS(22, r1); \
- SAVE_GPR(26, r1)
+ SAVE_GPRS(14, 26, r1) /* push registers onto stack */
#define FINALIZE \
- REST_8GPRS(14, r1); /* pop registers from stack */ \
- REST_4GPRS(22, r1); \
- REST_GPR(26, r1); \
- addi r1,r1,INT_FRAME_SIZE;
+ REST_GPRS(14, 26, r1); /* pop registers from stack */ \
+ addi r1,r1,INT_FRAME_SIZE
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c
index 7d1bf2fcf668..c24f605033bd 100644
--- a/arch/powerpc/crypto/md5-glue.c
+++ b/arch/powerpc/crypto/md5-glue.c
@@ -11,7 +11,6 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
#include <crypto/md5.h>
#include <asm/byteorder.h>
diff --git a/arch/powerpc/crypto/poly1305-p10-glue.c b/arch/powerpc/crypto/poly1305-p10-glue.c
new file mode 100644
index 000000000000..95dd708573ee
--- /dev/null
+++ b/arch/powerpc/crypto/poly1305-p10-glue.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Poly1305 authenticator algorithm, RFC7539.
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+
+#include <crypto/algapi.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/jump_label.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
+#include <crypto/internal/simd.h>
+#include <linux/cpufeature.h>
+#include <asm/unaligned.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+
+asmlinkage void poly1305_p10le_4blocks(void *h, const u8 *m, u32 mlen);
+asmlinkage void poly1305_64s(void *h, const u8 *m, u32 mlen, int highbit);
+asmlinkage void poly1305_emit_64(void *h, void *s, u8 *dst);
+
+static void vsx_begin(void)
+{
+ preempt_disable();
+ enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+ disable_kernel_vsx();
+ preempt_enable();
+}
+
+static int crypto_poly1305_p10_init(struct shash_desc *desc)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ poly1305_core_init(&dctx->h);
+ dctx->buflen = 0;
+ dctx->rset = 0;
+ dctx->sset = false;
+
+ return 0;
+}
+
+static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
+ const u8 *inp, unsigned int len)
+{
+ unsigned int acc = 0;
+
+ if (unlikely(!dctx->sset)) {
+ if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
+ struct poly1305_core_key *key = &dctx->core_r;
+
+ key->key.r64[0] = get_unaligned_le64(&inp[0]);
+ key->key.r64[1] = get_unaligned_le64(&inp[8]);
+ inp += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ acc += POLY1305_BLOCK_SIZE;
+ dctx->rset = 1;
+ }
+ if (len >= POLY1305_BLOCK_SIZE) {
+ dctx->s[0] = get_unaligned_le32(&inp[0]);
+ dctx->s[1] = get_unaligned_le32(&inp[4]);
+ dctx->s[2] = get_unaligned_le32(&inp[8]);
+ dctx->s[3] = get_unaligned_le32(&inp[12]);
+ acc += POLY1305_BLOCK_SIZE;
+ dctx->sset = true;
+ }
+ }
+ return acc;
+}
+
+static int crypto_poly1305_p10_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+ unsigned int bytes, used;
+
+ if (unlikely(dctx->buflen)) {
+ bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ srclen -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf,
+ POLY1305_BLOCK_SIZE))) {
+ vsx_begin();
+ poly1305_64s(&dctx->h, dctx->buf,
+ POLY1305_BLOCK_SIZE, 1);
+ vsx_end();
+ }
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+ bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
+ used = crypto_poly1305_setdctxkey(dctx, src, bytes);
+ if (likely(used)) {
+ srclen -= used;
+ src += used;
+ }
+ if (crypto_simd_usable() && (srclen >= POLY1305_BLOCK_SIZE*4)) {
+ vsx_begin();
+ poly1305_p10le_4blocks(&dctx->h, src, srclen);
+ vsx_end();
+ src += srclen - (srclen % (POLY1305_BLOCK_SIZE * 4));
+ srclen %= POLY1305_BLOCK_SIZE * 4;
+ }
+ while (srclen >= POLY1305_BLOCK_SIZE) {
+ vsx_begin();
+ poly1305_64s(&dctx->h, src, POLY1305_BLOCK_SIZE, 1);
+ vsx_end();
+ srclen -= POLY1305_BLOCK_SIZE;
+ src += POLY1305_BLOCK_SIZE;
+ }
+ }
+
+ if (unlikely(srclen)) {
+ dctx->buflen = srclen;
+ memcpy(dctx->buf, src, srclen);
+ }
+
+ return 0;
+}
+
+static int crypto_poly1305_p10_final(struct shash_desc *desc, u8 *dst)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (unlikely(!dctx->sset))
+ return -ENOKEY;
+
+ if ((dctx->buflen)) {
+ dctx->buf[dctx->buflen++] = 1;
+ memset(dctx->buf + dctx->buflen, 0,
+ POLY1305_BLOCK_SIZE - dctx->buflen);
+ vsx_begin();
+ poly1305_64s(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+ vsx_end();
+ dctx->buflen = 0;
+ }
+
+ poly1305_emit_64(&dctx->h, &dctx->s, dst);
+ return 0;
+}
+
+static struct shash_alg poly1305_alg = {
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_poly1305_p10_init,
+ .update = crypto_poly1305_p10_update,
+ .final = crypto_poly1305_p10_final,
+ .descsize = sizeof(struct poly1305_desc_ctx),
+ .base = {
+ .cra_name = "poly1305",
+ .cra_driver_name = "poly1305-p10",
+ .cra_priority = 300,
+ .cra_blocksize = POLY1305_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+};
+
+static int __init poly1305_p10_init(void)
+{
+ return crypto_register_shash(&poly1305_alg);
+}
+
+static void __exit poly1305_p10_exit(void)
+{
+ crypto_unregister_shash(&poly1305_alg);
+}
+
+module_cpu_feature_match(PPC_MODULE_FEATURE_P10, poly1305_p10_init);
+module_exit(poly1305_p10_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
+MODULE_DESCRIPTION("Optimized Poly1305 for P10");
+MODULE_ALIAS_CRYPTO("poly1305");
+MODULE_ALIAS_CRYPTO("poly1305-p10");
diff --git a/arch/powerpc/crypto/poly1305-p10le_64.S b/arch/powerpc/crypto/poly1305-p10le_64.S
new file mode 100644
index 000000000000..a3c1987f1ecd
--- /dev/null
+++ b/arch/powerpc/crypto/poly1305-p10le_64.S
@@ -0,0 +1,1075 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated poly1305 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# Poly1305 - this version mainly using vector/VSX/Scalar
+# - 26 bits limbs
+# - Handle multiple 64 byte blcok.
+#
+# Block size 16 bytes
+# key = (r, s)
+# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
+# p = 2^130 - 5
+# a += m
+# a = (r + a) % p
+# a += s
+#
+# Improve performance by breaking down polynominal to the sum of products with
+# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+#
+# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0
+# to 9 vectors for multiplications.
+#
+# setup r^4, r^3, r^2, r vectors
+# vs [r^1, r^3, r^2, r^4]
+# vs0 = [r0,.....]
+# vs1 = [r1,.....]
+# vs2 = [r2,.....]
+# vs3 = [r3,.....]
+# vs4 = [r4,.....]
+# vs5 = [r1*5,...]
+# vs6 = [r2*5,...]
+# vs7 = [r2*5,...]
+# vs8 = [r4*5,...]
+#
+# Each word in a vector consists a member of a "r/s" in [a * r/s].
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0, r4*5, r3*5, r2*5;
+# r2, r1, r0, r4*5, r3*5;
+# r3, r2, r1, r0, r4*5;
+# r4, r3, r2, r1, r0 ;
+#
+#
+# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+# k = 32 bytes key
+# r3 = k (r, s)
+# r4 = mlen
+# r5 = m
+#
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+#include <linux/linkage.h>
+
+.machine "any"
+
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+ std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ stvx \VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ stxvx \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+ ld \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ lxvx \VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-752(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ addi 9, 1, 256
+ SAVE_VRS 20, 0, 9
+ SAVE_VRS 21, 16, 9
+ SAVE_VRS 22, 32, 9
+ SAVE_VRS 23, 48, 9
+ SAVE_VRS 24, 64, 9
+ SAVE_VRS 25, 80, 9
+ SAVE_VRS 26, 96, 9
+ SAVE_VRS 27, 112, 9
+ SAVE_VRS 28, 128, 9
+ SAVE_VRS 29, 144, 9
+ SAVE_VRS 30, 160, 9
+ SAVE_VRS 31, 176, 9
+
+ SAVE_VSX 14, 192, 9
+ SAVE_VSX 15, 208, 9
+ SAVE_VSX 16, 224, 9
+ SAVE_VSX 17, 240, 9
+ SAVE_VSX 18, 256, 9
+ SAVE_VSX 19, 272, 9
+ SAVE_VSX 20, 288, 9
+ SAVE_VSX 21, 304, 9
+ SAVE_VSX 22, 320, 9
+ SAVE_VSX 23, 336, 9
+ SAVE_VSX 24, 352, 9
+ SAVE_VSX 25, 368, 9
+ SAVE_VSX 26, 384, 9
+ SAVE_VSX 27, 400, 9
+ SAVE_VSX 28, 416, 9
+ SAVE_VSX 29, 432, 9
+ SAVE_VSX 30, 448, 9
+ SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+ addi 9, 1, 256
+ RESTORE_VRS 20, 0, 9
+ RESTORE_VRS 21, 16, 9
+ RESTORE_VRS 22, 32, 9
+ RESTORE_VRS 23, 48, 9
+ RESTORE_VRS 24, 64, 9
+ RESTORE_VRS 25, 80, 9
+ RESTORE_VRS 26, 96, 9
+ RESTORE_VRS 27, 112, 9
+ RESTORE_VRS 28, 128, 9
+ RESTORE_VRS 29, 144, 9
+ RESTORE_VRS 30, 160, 9
+ RESTORE_VRS 31, 176, 9
+
+ RESTORE_VSX 14, 192, 9
+ RESTORE_VSX 15, 208, 9
+ RESTORE_VSX 16, 224, 9
+ RESTORE_VSX 17, 240, 9
+ RESTORE_VSX 18, 256, 9
+ RESTORE_VSX 19, 272, 9
+ RESTORE_VSX 20, 288, 9
+ RESTORE_VSX 21, 304, 9
+ RESTORE_VSX 22, 320, 9
+ RESTORE_VSX 23, 336, 9
+ RESTORE_VSX 24, 352, 9
+ RESTORE_VSX 25, 368, 9
+ RESTORE_VSX 26, 384, 9
+ RESTORE_VSX 27, 400, 9
+ RESTORE_VSX 28, 416, 9
+ RESTORE_VSX 29, 432, 9
+ RESTORE_VSX 30, 448, 9
+ RESTORE_VSX 31, 464, 9
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 752
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+#
+# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
+# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5;
+# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5;
+# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5;
+# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ;
+#
+# [r^2, r^3, r^1, r^4]
+# [m3, m2, m4, m1]
+#
+# multiply odd and even words
+.macro mul_odd
+ vmulouw 14, 4, 26
+ vmulouw 10, 5, 3
+ vmulouw 11, 6, 2
+ vmulouw 12, 7, 1
+ vmulouw 13, 8, 0
+ vmulouw 15, 4, 27
+ vaddudm 14, 14, 10
+ vaddudm 14, 14, 11
+ vmulouw 10, 5, 26
+ vmulouw 11, 6, 3
+ vaddudm 14, 14, 12
+ vaddudm 14, 14, 13 # x0
+ vaddudm 15, 15, 10
+ vaddudm 15, 15, 11
+ vmulouw 12, 7, 2
+ vmulouw 13, 8, 1
+ vaddudm 15, 15, 12
+ vaddudm 15, 15, 13 # x1
+ vmulouw 16, 4, 28
+ vmulouw 10, 5, 27
+ vmulouw 11, 6, 26
+ vaddudm 16, 16, 10
+ vaddudm 16, 16, 11
+ vmulouw 12, 7, 3
+ vmulouw 13, 8, 2
+ vaddudm 16, 16, 12
+ vaddudm 16, 16, 13 # x2
+ vmulouw 17, 4, 29
+ vmulouw 10, 5, 28
+ vmulouw 11, 6, 27
+ vaddudm 17, 17, 10
+ vaddudm 17, 17, 11
+ vmulouw 12, 7, 26
+ vmulouw 13, 8, 3
+ vaddudm 17, 17, 12
+ vaddudm 17, 17, 13 # x3
+ vmulouw 18, 4, 30
+ vmulouw 10, 5, 29
+ vmulouw 11, 6, 28
+ vaddudm 18, 18, 10
+ vaddudm 18, 18, 11
+ vmulouw 12, 7, 27
+ vmulouw 13, 8, 26
+ vaddudm 18, 18, 12
+ vaddudm 18, 18, 13 # x4
+.endm
+
+.macro mul_even
+ vmuleuw 9, 4, 26
+ vmuleuw 10, 5, 3
+ vmuleuw 11, 6, 2
+ vmuleuw 12, 7, 1
+ vmuleuw 13, 8, 0
+ vaddudm 14, 14, 9
+ vaddudm 14, 14, 10
+ vaddudm 14, 14, 11
+ vaddudm 14, 14, 12
+ vaddudm 14, 14, 13 # x0
+
+ vmuleuw 9, 4, 27
+ vmuleuw 10, 5, 26
+ vmuleuw 11, 6, 3
+ vmuleuw 12, 7, 2
+ vmuleuw 13, 8, 1
+ vaddudm 15, 15, 9
+ vaddudm 15, 15, 10
+ vaddudm 15, 15, 11
+ vaddudm 15, 15, 12
+ vaddudm 15, 15, 13 # x1
+
+ vmuleuw 9, 4, 28
+ vmuleuw 10, 5, 27
+ vmuleuw 11, 6, 26
+ vmuleuw 12, 7, 3
+ vmuleuw 13, 8, 2
+ vaddudm 16, 16, 9
+ vaddudm 16, 16, 10
+ vaddudm 16, 16, 11
+ vaddudm 16, 16, 12
+ vaddudm 16, 16, 13 # x2
+
+ vmuleuw 9, 4, 29
+ vmuleuw 10, 5, 28
+ vmuleuw 11, 6, 27
+ vmuleuw 12, 7, 26
+ vmuleuw 13, 8, 3
+ vaddudm 17, 17, 9
+ vaddudm 17, 17, 10
+ vaddudm 17, 17, 11
+ vaddudm 17, 17, 12
+ vaddudm 17, 17, 13 # x3
+
+ vmuleuw 9, 4, 30
+ vmuleuw 10, 5, 29
+ vmuleuw 11, 6, 28
+ vmuleuw 12, 7, 27
+ vmuleuw 13, 8, 26
+ vaddudm 18, 18, 9
+ vaddudm 18, 18, 10
+ vaddudm 18, 18, 11
+ vaddudm 18, 18, 12
+ vaddudm 18, 18, 13 # x4
+.endm
+
+#
+# poly1305_setup_r
+#
+# setup r^4, r^3, r^2, r vectors
+# [r, r^3, r^2, r^4]
+# vs0 = [r0,...]
+# vs1 = [r1,...]
+# vs2 = [r2,...]
+# vs3 = [r3,...]
+# vs4 = [r4,...]
+# vs5 = [r4*5,...]
+# vs6 = [r3*5,...]
+# vs7 = [r2*5,...]
+# vs8 = [r1*5,...]
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0, r4*5, r3*5, r2*5;
+# r2, r1, r0, r4*5, r3*5;
+# r3, r2, r1, r0, r4*5;
+# r4, r3, r2, r1, r0 ;
+#
+.macro poly1305_setup_r
+
+ # save r
+ xxlor 26, 58, 58
+ xxlor 27, 59, 59
+ xxlor 28, 60, 60
+ xxlor 29, 61, 61
+ xxlor 30, 62, 62
+
+ xxlxor 31, 31, 31
+
+# [r, r^3, r^2, r^4]
+ # compute r^2
+ vmr 4, 26
+ vmr 5, 27
+ vmr 6, 28
+ vmr 7, 29
+ vmr 8, 30
+ bl do_mul # r^2 r^1
+ xxpermdi 58, 58, 36, 0x3 # r0
+ xxpermdi 59, 59, 37, 0x3 # r1
+ xxpermdi 60, 60, 38, 0x3 # r2
+ xxpermdi 61, 61, 39, 0x3 # r3
+ xxpermdi 62, 62, 40, 0x3 # r4
+ xxpermdi 36, 36, 36, 0x3
+ xxpermdi 37, 37, 37, 0x3
+ xxpermdi 38, 38, 38, 0x3
+ xxpermdi 39, 39, 39, 0x3
+ xxpermdi 40, 40, 40, 0x3
+ vspltisb 13, 2
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+
+ bl do_mul # r^4 r^3
+ vmrgow 26, 26, 4
+ vmrgow 27, 27, 5
+ vmrgow 28, 28, 6
+ vmrgow 29, 29, 7
+ vmrgow 30, 30, 8
+ vspltisb 13, 2
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+
+ # r^2 r^4
+ xxlor 0, 58, 58
+ xxlor 1, 59, 59
+ xxlor 2, 60, 60
+ xxlor 3, 61, 61
+ xxlor 4, 62, 62
+ xxlor 5, 32, 32
+ xxlor 6, 33, 33
+ xxlor 7, 34, 34
+ xxlor 8, 35, 35
+
+ vspltw 9, 26, 3
+ vspltw 10, 26, 2
+ vmrgow 26, 10, 9
+ vspltw 9, 27, 3
+ vspltw 10, 27, 2
+ vmrgow 27, 10, 9
+ vspltw 9, 28, 3
+ vspltw 10, 28, 2
+ vmrgow 28, 10, 9
+ vspltw 9, 29, 3
+ vspltw 10, 29, 2
+ vmrgow 29, 10, 9
+ vspltw 9, 30, 3
+ vspltw 10, 30, 2
+ vmrgow 30, 10, 9
+
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+.endm
+
+SYM_FUNC_START_LOCAL(do_mul)
+ mul_odd
+
+ # do reduction ( h %= p )
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 14, 31
+ vsrd 11, 17, 31
+ vand 7, 17, 25
+ vand 4, 14, 25
+ vaddudm 18, 18, 11
+ vsrd 12, 18, 31
+ vaddudm 15, 15, 10
+
+ vsrd 11, 15, 31
+ vand 8, 18, 25
+ vand 5, 15, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 16, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vaddudm 8, 8, 11
+ blr
+SYM_FUNC_END(do_mul)
+
+#
+# init key
+#
+.macro do_poly1305_init
+ addis 10, 2, rmask@toc@ha
+ addi 10, 10, rmask@toc@l
+
+ ld 11, 0(10)
+ ld 12, 8(10)
+
+ li 14, 16
+ li 15, 32
+ addis 10, 2, cnum@toc@ha
+ addi 10, 10, cnum@toc@l
+ lvx 25, 0, 10 # v25 - mask
+ lvx 31, 14, 10 # v31 = 1a
+ lvx 19, 15, 10 # v19 = 1 << 24
+ lxv 24, 48(10) # vs24
+ lxv 25, 64(10) # vs25
+
+ # initialize
+ # load key from r3 to vectors
+ ld 9, 24(3)
+ ld 10, 32(3)
+ and. 9, 9, 11
+ and. 10, 10, 12
+
+ # break 26 bits
+ extrdi 14, 9, 26, 38
+ extrdi 15, 9, 26, 12
+ extrdi 16, 9, 12, 0
+ mtvsrdd 58, 0, 14
+ insrdi 16, 10, 14, 38
+ mtvsrdd 59, 0, 15
+ extrdi 17, 10, 26, 24
+ mtvsrdd 60, 0, 16
+ extrdi 18, 10, 24, 0
+ mtvsrdd 61, 0, 17
+ mtvsrdd 62, 0, 18
+
+ # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
+ li 9, 5
+ mtvsrdd 36, 0, 9
+ vmulouw 0, 27, 4 # v0 = rr0
+ vmulouw 1, 28, 4 # v1 = rr1
+ vmulouw 2, 29, 4 # v2 = rr2
+ vmulouw 3, 30, 4 # v3 = rr3
+.endm
+
+#
+# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+# k = 32 bytes key
+# r3 = k (r, s)
+# r4 = mlen
+# r5 = m
+#
+SYM_FUNC_START(poly1305_p10le_4blocks)
+.align 5
+ cmpdi 5, 64
+ blt Out_no_poly1305
+
+ SAVE_REGS
+
+ do_poly1305_init
+
+ li 21, 0 # counter to message
+
+ poly1305_setup_r
+
+ # load previous H state
+ # break/convert r6 to 26 bits
+ ld 9, 0(3)
+ ld 10, 8(3)
+ ld 19, 16(3)
+ sldi 19, 19, 24
+ mtvsrdd 41, 0, 19
+ extrdi 14, 9, 26, 38
+ extrdi 15, 9, 26, 12
+ extrdi 16, 9, 12, 0
+ mtvsrdd 36, 0, 14
+ insrdi 16, 10, 14, 38
+ mtvsrdd 37, 0, 15
+ extrdi 17, 10, 26, 24
+ mtvsrdd 38, 0, 16
+ extrdi 18, 10, 24, 0
+ mtvsrdd 39, 0, 17
+ mtvsrdd 40, 0, 18
+ vor 8, 8, 9
+
+ # input m1 m2
+ add 20, 4, 21
+ xxlor 49, 24, 24
+ xxlor 50, 25, 25
+ lxvw4x 43, 0, 20
+ addi 17, 20, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ vand 9, 14, 25 # a0
+ vsrd 10, 14, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+ vand 10, 10, 25 # a1
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 12, 16, 13
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vspltisb 13, 14
+ vsrd 12, 15, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ vaddudm 20, 4, 9
+ vaddudm 21, 5, 10
+ vaddudm 22, 6, 11
+ vaddudm 23, 7, 12
+ vaddudm 24, 8, 13
+
+ # m3 m4
+ addi 17, 17, 16
+ lxvw4x 43, 0, 17
+ addi 17, 17, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ vand 9, 14, 25 # a0
+ vsrd 10, 14, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+ vand 10, 10, 25 # a1
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 12, 16, 13
+ vspltisb 13, 14
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vsrd 12, 15, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
+ vmrgow 4, 9, 20
+ vmrgow 5, 10, 21
+ vmrgow 6, 11, 22
+ vmrgow 7, 12, 23
+ vmrgow 8, 13, 24
+ vaddudm 8, 8, 19
+
+ addi 5, 5, -64 # len -= 64
+ addi 21, 21, 64 # offset += 64
+
+ li 9, 64
+ divdu 31, 5, 9
+
+ cmpdi 31, 0
+ ble Skip_block_loop
+
+ mtctr 31
+
+# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+# Rewrite the polynominal sum of product as follows,
+# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2
+# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2
+# .... Repeat
+# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
+# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
+#
+loop_4blocks:
+
+ # Multiply odd words and even words
+ mul_odd
+ mul_even
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 14, 31
+ vsrd 11, 17, 31
+ vand 7, 17, 25
+ vand 4, 14, 25
+ vaddudm 18, 18, 11
+ vsrd 12, 18, 31
+ vaddudm 15, 15, 10
+
+ vsrd 11, 15, 31
+ vand 8, 18, 25
+ vand 5, 15, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 16, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vaddudm 8, 8, 11
+
+ # input m1 m2 m3 m4
+ add 20, 4, 21
+ xxlor 49, 24, 24
+ xxlor 50, 25, 25
+ lxvw4x 43, 0, 20
+ addi 17, 20, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ addi 17, 17, 16
+ lxvw4x 43, 0, 17
+ addi 17, 17, 16
+ lxvw4x 44, 0, 17
+ vperm 17, 11, 12, 17
+ vperm 18, 11, 12, 18
+
+ vand 20, 14, 25 # a0
+ vand 9, 17, 25 # a0
+ vsrd 21, 14, 31 # >> 26
+ vsrd 22, 21, 31 # 12 bits left
+ vsrd 10, 17, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+
+ vand 21, 21, 25 # a1
+ vand 10, 10, 25 # a1
+
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 23, 16, 13
+ vor 22, 22, 23
+ vand 22, 22, 25 # a2
+ vand 16, 18, 25
+ vsld 12, 16, 13
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vspltisb 13, 14
+ vsrd 23, 15, 13 # >> 14
+ vsrd 24, 23, 31 # >> 26, a4
+ vand 23, 23, 25 # a3
+ vsrd 12, 18, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ vaddudm 4, 4, 20
+ vaddudm 5, 5, 21
+ vaddudm 6, 6, 22
+ vaddudm 7, 7, 23
+ vaddudm 8, 8, 24
+
+ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
+ vmrgow 4, 9, 4
+ vmrgow 5, 10, 5
+ vmrgow 6, 11, 6
+ vmrgow 7, 12, 7
+ vmrgow 8, 13, 8
+ vaddudm 8, 8, 19
+
+ addi 5, 5, -64 # len -= 64
+ addi 21, 21, 64 # offset += 64
+
+ bdnz loop_4blocks
+
+Skip_block_loop:
+ xxlor 58, 0, 0
+ xxlor 59, 1, 1
+ xxlor 60, 2, 2
+ xxlor 61, 3, 3
+ xxlor 62, 4, 4
+ xxlor 32, 5, 5
+ xxlor 33, 6, 6
+ xxlor 34, 7, 7
+ xxlor 35, 8, 8
+
+ # Multiply odd words and even words
+ mul_odd
+ mul_even
+
+ # Sum the products.
+ xxpermdi 41, 31, 46, 0
+ xxpermdi 42, 31, 47, 0
+ vaddudm 4, 14, 9
+ xxpermdi 36, 31, 36, 3
+ vaddudm 5, 15, 10
+ xxpermdi 37, 31, 37, 3
+ xxpermdi 43, 31, 48, 0
+ vaddudm 6, 16, 11
+ xxpermdi 38, 31, 38, 3
+ xxpermdi 44, 31, 49, 0
+ vaddudm 7, 17, 12
+ xxpermdi 39, 31, 39, 3
+ xxpermdi 45, 31, 50, 0
+ vaddudm 8, 18, 13
+ xxpermdi 40, 31, 40, 3
+
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 4, 31
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 8, 8, 11
+ vsrd 12, 8, 31
+ vaddudm 5, 5, 10
+
+ vsrd 11, 5, 31
+ vand 8, 8, 25
+ vand 5, 5, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 6, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vsrd 10, 5, 31
+ vand 5, 5, 25
+ vaddudm 6, 6, 10
+ vaddudm 8, 8, 11
+
+ b do_final_update
+
+do_final_update:
+ # combine 26 bit limbs
+ # v4, v5, v6, v7 and v8 are 26 bit vectors
+ vsld 5, 5, 31
+ vor 20, 4, 5
+ vspltisb 11, 12
+ vsrd 12, 6, 11
+ vsld 6, 6, 31
+ vsld 6, 6, 31
+ vor 20, 20, 6
+ vspltisb 11, 14
+ vsld 7, 7, 11
+ vor 21, 7, 12
+ mfvsrld 16, 40 # save last 2 bytes
+ vsld 8, 8, 11
+ vsld 8, 8, 31
+ vor 21, 21, 8
+ mfvsrld 17, 52
+ mfvsrld 19, 53
+ srdi 16, 16, 24
+
+ std 17, 0(3)
+ std 19, 8(3)
+ stw 16, 16(3)
+
+Out_loop:
+ li 3, 0
+
+ RESTORE_REGS
+
+ blr
+
+Out_no_poly1305:
+ li 3, 0
+ blr
+SYM_FUNC_END(poly1305_p10le_4blocks)
+
+#
+# =======================================================================
+# The following functions implement 64 x 64 bits multiplication poly1305.
+#
+SYM_FUNC_START_LOCAL(Poly1305_init_64)
+ # mask 0x0FFFFFFC0FFFFFFC
+ # mask 0x0FFFFFFC0FFFFFFF
+ addis 10, 2, rmask@toc@ha
+ addi 10, 10, rmask@toc@l
+ ld 11, 0(10)
+ ld 12, 8(10)
+
+ # initialize
+ # load key from r3
+ ld 9, 24(3)
+ ld 10, 32(3)
+ and. 9, 9, 11 # cramp mask r0
+ and. 10, 10, 12 # cramp mask r1
+
+ srdi 21, 10, 2
+ add 19, 21, 10 # s1: r19 - (r1 >> 2) *5
+
+ # setup r and s
+ li 25, 0
+ mtvsrdd 32+0, 9, 19 # r0, s1
+ mtvsrdd 32+1, 10, 9 # r1, r0
+ mtvsrdd 32+2, 19, 25 # s1
+ mtvsrdd 32+3, 9, 25 # r0
+
+ blr
+SYM_FUNC_END(Poly1305_init_64)
+
+# Poly1305_mult
+# v6 = (h0, h1), v8 = h2
+# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
+#
+# Output: v7, v10, v11
+#
+SYM_FUNC_START_LOCAL(Poly1305_mult)
+ #
+ # d0 = h0 * r0 + h1 * s1
+ vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1
+
+ # d1 = h0 * r1 + h1 * r0 + h2 * s1
+ vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0
+ vmsumudm 10, 8, 2, 11 # d1 += h2 * s1
+
+ # d2 = r0
+ vmsumudm 11, 8, 3, 9 # d2 = h2 * r0
+ blr
+SYM_FUNC_END(Poly1305_mult)
+
+#
+# carry reduction
+# h %=p
+#
+# Input: v7, v10, v11
+# Output: r27, r28, r29
+#
+SYM_FUNC_START_LOCAL(Carry_reduction)
+ mfvsrld 27, 32+7
+ mfvsrld 28, 32+10
+ mfvsrld 29, 32+11
+ mfvsrd 20, 32+7 # h0.h
+ mfvsrd 21, 32+10 # h1.h
+
+ addc 28, 28, 20
+ adde 29, 29, 21
+ srdi 22, 29, 0x2
+ sldi 23, 22, 0x2
+ add 23, 23, 22 # (h2 & 3) * 5
+ addc 27, 27, 23 # h0
+ addze 28, 28 # h1
+ andi. 29, 29, 0x3 # h2
+ blr
+SYM_FUNC_END(Carry_reduction)
+
+#
+# poly1305 multiplication
+# h *= r, h %= p
+# d0 = h0 * r0 + h1 * s1
+# d1 = h0 * r1 + h1 * r0 + h2 * s1
+# d2 = h0 * r0
+#
+#
+# unsigned int poly1305_test_64s(unisgned char *state, const byte *src, size_t len, highbit)
+# - no highbit if final leftover block (highbit = 0)
+#
+SYM_FUNC_START(poly1305_64s)
+ cmpdi 5, 0
+ ble Out_no_poly1305_64
+
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-400(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ # Init poly1305
+ bl Poly1305_init_64
+
+ li 25, 0 # offset to inp and outp
+
+ add 11, 25, 4
+
+ # load h
+ # h0, h1, h2?
+ ld 27, 0(3)
+ ld 28, 8(3)
+ lwz 29, 16(3)
+
+ li 30, 16
+ divdu 31, 5, 30
+
+ mtctr 31
+
+ mr 24, 6 # highbit
+
+Loop_block_64:
+ vxor 9, 9, 9
+
+ ld 20, 0(11)
+ ld 21, 8(11)
+ addi 11, 11, 16
+
+ addc 27, 27, 20
+ adde 28, 28, 21
+ adde 29, 29, 24
+
+ li 22, 0
+ mtvsrdd 32+6, 27, 28 # h0, h1
+ mtvsrdd 32+8, 29, 22 # h2
+
+ bl Poly1305_mult
+
+ bl Carry_reduction
+
+ bdnz Loop_block_64
+
+ std 27, 0(3)
+ std 28, 8(3)
+ stw 29, 16(3)
+
+ li 3, 0
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 400
+ ld 0, 16(1)
+ mtlr 0
+
+ blr
+
+Out_no_poly1305_64:
+ li 3, 0
+ blr
+SYM_FUNC_END(poly1305_64s)
+
+#
+# Input: r3 = h, r4 = s, r5 = mac
+# mac = h + s
+#
+SYM_FUNC_START(poly1305_emit_64)
+ ld 10, 0(3)
+ ld 11, 8(3)
+ ld 12, 16(3)
+
+ # compare modulus
+ # h + 5 + (-p)
+ mr 6, 10
+ mr 7, 11
+ mr 8, 12
+ addic. 6, 6, 5
+ addze 7, 7
+ addze 8, 8
+ srdi 9, 8, 2 # overflow?
+ cmpdi 9, 0
+ beq Skip_h64
+ mr 10, 6
+ mr 11, 7
+ mr 12, 8
+
+Skip_h64:
+ ld 6, 0(4)
+ ld 7, 8(4)
+ addc 10, 10, 6
+ adde 11, 11, 7
+ addze 12, 12
+
+ std 10, 0(5)
+ std 11, 8(5)
+ blr
+SYM_FUNC_END(poly1305_emit_64)
+
+SYM_DATA_START_LOCAL(RMASK)
+.align 5
+rmask:
+.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
+cnum:
+.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
+.long 0x1a, 0x00, 0x1a, 0x00
+.long 0x01000000, 0x01000000, 0x01000000, 0x01000000
+.long 0x00010203, 0x04050607, 0x10111213, 0x14151617
+.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
+SYM_DATA_END(RMASK)
diff --git a/arch/powerpc/crypto/ppc-xlate.pl b/arch/powerpc/crypto/ppc-xlate.pl
new file mode 100644
index 000000000000..23cca703ce29
--- /dev/null
+++ b/arch/powerpc/crypto/ppc-xlate.pl
@@ -0,0 +1,229 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# PowerPC assembler distiller by <appro>.
+
+my $flavour = shift;
+my $output = shift;
+open STDOUT,">$output" || die "can't open $output: $!";
+
+my %GLOBALS;
+my $dotinlocallabels=($flavour=~/linux/)?1:0;
+
+################################################################
+# directives which need special treatment on different platforms
+################################################################
+my $globl = sub {
+ my $junk = shift;
+ my $name = shift;
+ my $global = \$GLOBALS{$name};
+ my $ret;
+
+ $name =~ s|^[\.\_]||;
+
+ SWITCH: for ($flavour) {
+ /aix/ && do { $name = ".$name";
+ last;
+ };
+ /osx/ && do { $name = "_$name";
+ last;
+ };
+ /linux/
+ && do { $ret = "_GLOBAL($name)";
+ last;
+ };
+ }
+
+ $ret = ".globl $name\nalign 5\n$name:" if (!$ret);
+ $$global = $name;
+ $ret;
+};
+my $text = sub {
+ my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text";
+ $ret = ".abiversion 2\n".$ret if ($flavour =~ /linux.*64le/);
+ $ret;
+};
+my $machine = sub {
+ my $junk = shift;
+ my $arch = shift;
+ if ($flavour =~ /osx/)
+ { $arch =~ s/\"//g;
+ $arch = ($flavour=~/64/) ? "ppc970-64" : "ppc970" if ($arch eq "any");
+ }
+ ".machine $arch";
+};
+my $size = sub {
+ if ($flavour =~ /linux/)
+ { shift;
+ my $name = shift; $name =~ s|^[\.\_]||;
+ my $ret = ".size $name,.-".($flavour=~/64$/?".":"").$name;
+ $ret .= "\n.size .$name,.-.$name" if ($flavour=~/64$/);
+ $ret;
+ }
+ else
+ { ""; }
+};
+my $asciz = sub {
+ shift;
+ my $line = join(",",@_);
+ if ($line =~ /^"(.*)"$/)
+ { ".byte " . join(",",unpack("C*",$1),0) . "\n.align 2"; }
+ else
+ { ""; }
+};
+my $quad = sub {
+ shift;
+ my @ret;
+ my ($hi,$lo);
+ for (@_) {
+ if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io)
+ { $hi=$1?"0x$1":"0"; $lo="0x$2"; }
+ elsif (/^([0-9]+)$/o)
+ { $hi=$1>>32; $lo=$1&0xffffffff; } # error-prone with 32-bit perl
+ else
+ { $hi=undef; $lo=$_; }
+
+ if (defined($hi))
+ { push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo"); }
+ else
+ { push(@ret,".quad $lo"); }
+ }
+ join("\n",@ret);
+};
+
+################################################################
+# simplified mnemonics not handled by at least one assembler
+################################################################
+my $cmplw = sub {
+ my $f = shift;
+ my $cr = 0; $cr = shift if ($#_>1);
+ # Some out-of-date 32-bit GNU assembler just can't handle cmplw...
+ ($flavour =~ /linux.*32/) ?
+ " .long ".sprintf "0x%x",31<<26|$cr<<23|$_[0]<<16|$_[1]<<11|64 :
+ " cmplw ".join(',',$cr,@_);
+};
+my $bdnz = sub {
+ my $f = shift;
+ my $bo = $f=~/[\+\-]/ ? 16+9 : 16; # optional "to be taken" hint
+ " bc $bo,0,".shift;
+} if ($flavour!~/linux/);
+my $bltlr = sub {
+ my $f = shift;
+ my $bo = $f=~/\-/ ? 12+2 : 12; # optional "not to be taken" hint
+ ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints
+ " .long ".sprintf "0x%x",19<<26|$bo<<21|16<<1 :
+ " bclr $bo,0";
+};
+my $bnelr = sub {
+ my $f = shift;
+ my $bo = $f=~/\-/ ? 4+2 : 4; # optional "not to be taken" hint
+ ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints
+ " .long ".sprintf "0x%x",19<<26|$bo<<21|2<<16|16<<1 :
+ " bclr $bo,2";
+};
+my $beqlr = sub {
+ my $f = shift;
+ my $bo = $f=~/-/ ? 12+2 : 12; # optional "not to be taken" hint
+ ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints
+ " .long ".sprintf "0x%X",19<<26|$bo<<21|2<<16|16<<1 :
+ " bclr $bo,2";
+};
+# GNU assembler can't handle extrdi rA,rS,16,48, or when sum of last two
+# arguments is 64, with "operand out of range" error.
+my $extrdi = sub {
+ my ($f,$ra,$rs,$n,$b) = @_;
+ $b = ($b+$n)&63; $n = 64-$n;
+ " rldicl $ra,$rs,$b,$n";
+};
+my $vmr = sub {
+ my ($f,$vx,$vy) = @_;
+ " vor $vx,$vy,$vy";
+};
+
+# Some ABIs specify vrsave, special-purpose register #256, as reserved
+# for system use.
+my $no_vrsave = ($flavour =~ /linux-ppc64le/);
+my $mtspr = sub {
+ my ($f,$idx,$ra) = @_;
+ if ($idx == 256 && $no_vrsave) {
+ " or $ra,$ra,$ra";
+ } else {
+ " mtspr $idx,$ra";
+ }
+};
+my $mfspr = sub {
+ my ($f,$rd,$idx) = @_;
+ if ($idx == 256 && $no_vrsave) {
+ " li $rd,-1";
+ } else {
+ " mfspr $rd,$idx";
+ }
+};
+
+# PowerISA 2.06 stuff
+sub vsxmem_op {
+ my ($f, $vrt, $ra, $rb, $op) = @_;
+ " .long ".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1);
+}
+# made-up unaligned memory reference AltiVec/VMX instructions
+my $lvx_u = sub { vsxmem_op(@_, 844); }; # lxvd2x
+my $stvx_u = sub { vsxmem_op(@_, 972); }; # stxvd2x
+my $lvdx_u = sub { vsxmem_op(@_, 588); }; # lxsdx
+my $stvdx_u = sub { vsxmem_op(@_, 716); }; # stxsdx
+my $lvx_4w = sub { vsxmem_op(@_, 780); }; # lxvw4x
+my $stvx_4w = sub { vsxmem_op(@_, 908); }; # stxvw4x
+
+# PowerISA 2.07 stuff
+sub vcrypto_op {
+ my ($f, $vrt, $vra, $vrb, $op) = @_;
+ " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op;
+}
+my $vcipher = sub { vcrypto_op(@_, 1288); };
+my $vcipherlast = sub { vcrypto_op(@_, 1289); };
+my $vncipher = sub { vcrypto_op(@_, 1352); };
+my $vncipherlast= sub { vcrypto_op(@_, 1353); };
+my $vsbox = sub { vcrypto_op(@_, 0, 1480); };
+my $vshasigmad = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); };
+my $vshasigmaw = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); };
+my $vpmsumb = sub { vcrypto_op(@_, 1032); };
+my $vpmsumd = sub { vcrypto_op(@_, 1224); };
+my $vpmsubh = sub { vcrypto_op(@_, 1096); };
+my $vpmsumw = sub { vcrypto_op(@_, 1160); };
+my $vaddudm = sub { vcrypto_op(@_, 192); };
+my $vadduqm = sub { vcrypto_op(@_, 256); };
+
+my $mtsle = sub {
+ my ($f, $arg) = @_;
+ " .long ".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2);
+};
+
+print "#include <asm/ppc_asm.h>\n" if $flavour =~ /linux/;
+
+while($line=<>) {
+
+ $line =~ s|[#!;].*$||; # get rid of asm-style comments...
+ $line =~ s|/\*.*\*/||; # ... and C-style comments...
+ $line =~ s|^\s+||; # ... and skip white spaces in beginning...
+ $line =~ s|\s+$||; # ... and at the end
+
+ {
+ $line =~ s|\b\.L(\w+)|L$1|g; # common denominator for Locallabel
+ $line =~ s|\bL(\w+)|\.L$1|g if ($dotinlocallabels);
+ }
+
+ {
+ $line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||;
+ my $c = $1; $c = "\t" if ($c eq "");
+ my $mnemonic = $2;
+ my $f = $3;
+ my $opcode = eval("\$$mnemonic");
+ $line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." and $flavour !~ /osx/);
+ if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); }
+ elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; }
+ }
+
+ print $line if ($line);
+ print "\n";
+}
+
+close STDOUT;
diff --git a/arch/powerpc/crypto/sha1-powerpc-asm.S b/arch/powerpc/crypto/sha1-powerpc-asm.S
index 23e248beff71..f0d5ed557ab1 100644
--- a/arch/powerpc/crypto/sha1-powerpc-asm.S
+++ b/arch/powerpc/crypto/sha1-powerpc-asm.S
@@ -125,8 +125,7 @@
_GLOBAL(powerpc_sha_transform)
PPC_STLU r1,-INT_FRAME_SIZE(r1)
- SAVE_8GPRS(14, r1)
- SAVE_10GPRS(22, r1)
+ SAVE_GPRS(14, 31, r1)
/* Load up A - E */
lwz RA(0),0(r3) /* A */
@@ -184,7 +183,6 @@ _GLOBAL(powerpc_sha_transform)
stw RD(0),12(r3)
stw RE(0),16(r3)
- REST_8GPRS(14, r1)
- REST_10GPRS(22, r1)
+ REST_GPRS(14, 31, r1)
addi r1,r1,INT_FRAME_SIZE
blr
diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c
index 6379990bd604..9170892a8557 100644
--- a/arch/powerpc/crypto/sha1-spe-glue.c
+++ b/arch/powerpc/crypto/sha1-spe-glue.c
@@ -11,9 +11,9 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
+#include <crypto/sha1_base.h>
#include <asm/byteorder.h>
#include <asm/switch_to.h>
#include <linux/hardirq.h>
@@ -56,20 +56,6 @@ static inline void ppc_sha1_clear_context(struct sha1_state *sctx)
do { *ptr++ = 0; } while (--count);
}
-static int ppc_spe_sha1_init(struct shash_desc *desc)
-{
- struct sha1_state *sctx = shash_desc_ctx(desc);
-
- sctx->state[0] = SHA1_H0;
- sctx->state[1] = SHA1_H1;
- sctx->state[2] = SHA1_H2;
- sctx->state[3] = SHA1_H3;
- sctx->state[4] = SHA1_H4;
- sctx->count = 0;
-
- return 0;
-}
-
static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
@@ -108,7 +94,7 @@ static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data,
src += bytes;
len -= bytes;
- };
+ }
memcpy((char *)sctx->buffer, src, len);
return 0;
@@ -169,7 +155,7 @@ static int ppc_spe_sha1_import(struct shash_desc *desc, const void *in)
static struct shash_alg alg = {
.digestsize = SHA1_DIGEST_SIZE,
- .init = ppc_spe_sha1_init,
+ .init = sha1_base_init,
.update = ppc_spe_sha1_update,
.final = ppc_spe_sha1_final,
.export = ppc_spe_sha1_export,
diff --git a/arch/powerpc/crypto/sha1.c b/arch/powerpc/crypto/sha1.c
index 7b43fc352089..f283bbd3f121 100644
--- a/arch/powerpc/crypto/sha1.c
+++ b/arch/powerpc/crypto/sha1.c
@@ -16,26 +16,15 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
+#include <crypto/sha1_base.h>
#include <asm/byteorder.h>
-extern void powerpc_sha_transform(u32 *state, const u8 *src, u32 *temp);
+void powerpc_sha_transform(u32 *state, const u8 *src);
-static int sha1_init(struct shash_desc *desc)
-{
- struct sha1_state *sctx = shash_desc_ctx(desc);
-
- *sctx = (struct sha1_state){
- .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
- };
-
- return 0;
-}
-
-static int sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+static int powerpc_sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
unsigned int partial, done;
@@ -47,7 +36,6 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
src = data;
if ((partial + len) > 63) {
- u32 temp[SHA_WORKSPACE_WORDS];
if (partial) {
done = -partial;
@@ -56,12 +44,11 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
}
do {
- powerpc_sha_transform(sctx->state, src, temp);
+ powerpc_sha_transform(sctx->state, src);
done += 64;
src = data + done;
} while (done + 63 < len);
- memzero_explicit(temp, sizeof(temp));
partial = 0;
}
memcpy(sctx->buffer + partial, src, len - done);
@@ -71,7 +58,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
/* Add padding and return the message digest. */
-static int sha1_final(struct shash_desc *desc, u8 *out)
+static int powerpc_sha1_final(struct shash_desc *desc, u8 *out)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
__be32 *dst = (__be32 *)out;
@@ -84,10 +71,10 @@ static int sha1_final(struct shash_desc *desc, u8 *out)
/* Pad out to 56 mod 64 */
index = sctx->count & 0x3f;
padlen = (index < 56) ? (56 - index) : ((64+56) - index);
- sha1_update(desc, padding, padlen);
+ powerpc_sha1_update(desc, padding, padlen);
/* Append length */
- sha1_update(desc, (const u8 *)&bits, sizeof(bits));
+ powerpc_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
/* Store state in digest */
for (i = 0; i < 5; i++)
@@ -99,7 +86,7 @@ static int sha1_final(struct shash_desc *desc, u8 *out)
return 0;
}
-static int sha1_export(struct shash_desc *desc, void *out)
+static int powerpc_sha1_export(struct shash_desc *desc, void *out)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
@@ -107,7 +94,7 @@ static int sha1_export(struct shash_desc *desc, void *out)
return 0;
}
-static int sha1_import(struct shash_desc *desc, const void *in)
+static int powerpc_sha1_import(struct shash_desc *desc, const void *in)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
@@ -117,11 +104,11 @@ static int sha1_import(struct shash_desc *desc, const void *in)
static struct shash_alg alg = {
.digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_init,
- .update = sha1_update,
- .final = sha1_final,
- .export = sha1_export,
- .import = sha1_import,
+ .init = sha1_base_init,
+ .update = powerpc_sha1_update,
+ .final = powerpc_sha1_final,
+ .export = powerpc_sha1_export,
+ .import = powerpc_sha1_import,
.descsize = sizeof(struct sha1_state),
.statesize = sizeof(struct sha1_state),
.base = {
diff --git a/arch/powerpc/crypto/sha256-spe-glue.c b/arch/powerpc/crypto/sha256-spe-glue.c
index 84939e563b81..2997d13236e0 100644
--- a/arch/powerpc/crypto/sha256-spe-glue.c
+++ b/arch/powerpc/crypto/sha256-spe-glue.c
@@ -12,9 +12,9 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
+#include <crypto/sha256_base.h>
#include <asm/byteorder.h>
#include <asm/switch_to.h>
#include <linux/hardirq.h>
@@ -57,40 +57,6 @@ static inline void ppc_sha256_clear_context(struct sha256_state *sctx)
do { *ptr++ = 0; } while (--count);
}
-static int ppc_spe_sha256_init(struct shash_desc *desc)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
-
- sctx->state[0] = SHA256_H0;
- sctx->state[1] = SHA256_H1;
- sctx->state[2] = SHA256_H2;
- sctx->state[3] = SHA256_H3;
- sctx->state[4] = SHA256_H4;
- sctx->state[5] = SHA256_H5;
- sctx->state[6] = SHA256_H6;
- sctx->state[7] = SHA256_H7;
- sctx->count = 0;
-
- return 0;
-}
-
-static int ppc_spe_sha224_init(struct shash_desc *desc)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
-
- sctx->state[0] = SHA224_H0;
- sctx->state[1] = SHA224_H1;
- sctx->state[2] = SHA224_H2;
- sctx->state[3] = SHA224_H3;
- sctx->state[4] = SHA224_H4;
- sctx->state[5] = SHA224_H5;
- sctx->state[6] = SHA224_H6;
- sctx->state[7] = SHA224_H7;
- sctx->count = 0;
-
- return 0;
-}
-
static int ppc_spe_sha256_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
@@ -130,7 +96,7 @@ static int ppc_spe_sha256_update(struct shash_desc *desc, const u8 *data,
src += bytes;
len -= bytes;
- };
+ }
memcpy((char *)sctx->buf, src, len);
return 0;
@@ -178,7 +144,7 @@ static int ppc_spe_sha256_final(struct shash_desc *desc, u8 *out)
static int ppc_spe_sha224_final(struct shash_desc *desc, u8 *out)
{
- u32 D[SHA256_DIGEST_SIZE >> 2];
+ __be32 D[SHA256_DIGEST_SIZE >> 2];
__be32 *dst = (__be32 *)out;
ppc_spe_sha256_final(desc, (u8 *)D);
@@ -215,7 +181,7 @@ static int ppc_spe_sha256_import(struct shash_desc *desc, const void *in)
static struct shash_alg algs[2] = { {
.digestsize = SHA256_DIGEST_SIZE,
- .init = ppc_spe_sha256_init,
+ .init = sha256_base_init,
.update = ppc_spe_sha256_update,
.final = ppc_spe_sha256_final,
.export = ppc_spe_sha256_export,
@@ -231,7 +197,7 @@ static struct shash_alg algs[2] = { {
}
}, {
.digestsize = SHA224_DIGEST_SIZE,
- .init = ppc_spe_sha224_init,
+ .init = sha224_base_init,
.update = ppc_spe_sha256_update,
.final = ppc_spe_sha224_final,
.export = ppc_spe_sha256_export,
diff --git a/arch/powerpc/crypto/vmx.c b/arch/powerpc/crypto/vmx.c
new file mode 100644
index 000000000000..7eb713cc87c8
--- /dev/null
+++ b/arch/powerpc/crypto/vmx.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Routines supporting VMX instructions on the Power 8
+ *
+ * Copyright (C) 2015 International Business Machines Inc.
+ *
+ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <asm/cputable.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/skcipher.h>
+
+#include "aesp8-ppc.h"
+
+static int __init p8_init(void)
+{
+ int ret;
+
+ ret = crypto_register_shash(&p8_ghash_alg);
+ if (ret)
+ goto err;
+
+ ret = crypto_register_alg(&p8_aes_alg);
+ if (ret)
+ goto err_unregister_ghash;
+
+ ret = crypto_register_skcipher(&p8_aes_cbc_alg);
+ if (ret)
+ goto err_unregister_aes;
+
+ ret = crypto_register_skcipher(&p8_aes_ctr_alg);
+ if (ret)
+ goto err_unregister_aes_cbc;
+
+ ret = crypto_register_skcipher(&p8_aes_xts_alg);
+ if (ret)
+ goto err_unregister_aes_ctr;
+
+ return 0;
+
+err_unregister_aes_ctr:
+ crypto_unregister_skcipher(&p8_aes_ctr_alg);
+err_unregister_aes_cbc:
+ crypto_unregister_skcipher(&p8_aes_cbc_alg);
+err_unregister_aes:
+ crypto_unregister_alg(&p8_aes_alg);
+err_unregister_ghash:
+ crypto_unregister_shash(&p8_ghash_alg);
+err:
+ return ret;
+}
+
+static void __exit p8_exit(void)
+{
+ crypto_unregister_skcipher(&p8_aes_xts_alg);
+ crypto_unregister_skcipher(&p8_aes_ctr_alg);
+ crypto_unregister_skcipher(&p8_aes_cbc_alg);
+ crypto_unregister_alg(&p8_aes_alg);
+ crypto_unregister_shash(&p8_ghash_alg);
+}
+
+module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, p8_init);
+module_exit(p8_exit);
+
+MODULE_AUTHOR("Marcelo Cerri<mhcerri@br.ibm.com>");
+MODULE_DESCRIPTION("IBM VMX cryptographic acceleration instructions "
+ "support on Power 8");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0.0");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);