Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r--  arch/arm64/crypto/Kconfig               |  11
-rw-r--r--  arch/arm64/crypto/Makefile              |   3
-rw-r--r--  arch/arm64/crypto/aes-ce-ccm-core.S     | 265
-rw-r--r--  arch/arm64/crypto/aes-ce-ccm-glue.c     | 156
-rw-r--r--  arch/arm64/crypto/aes-ce-glue.c         |   2
-rw-r--r--  arch/arm64/crypto/aes-ce.S              |  34
-rw-r--r--  arch/arm64/crypto/aes-glue.c            |   1
-rw-r--r--  arch/arm64/crypto/aes-neon.S            |  20
-rw-r--r--  arch/arm64/crypto/aes-neonbs-glue.c     |   1
-rw-r--r--  arch/arm64/crypto/crct10dif-ce-core.S   | 514
-rw-r--r--  arch/arm64/crypto/crct10dif-ce-glue.c   | 143
-rw-r--r--  arch/arm64/crypto/ghash-ce-glue.c       |   2
-rw-r--r--  arch/arm64/crypto/poly1305-armv8.pl     |   6
-rw-r--r--  arch/arm64/crypto/poly1305-glue.c       |   3
-rw-r--r--  arch/arm64/crypto/sha1-ce-glue.c        |   2
-rw-r--r--  arch/arm64/crypto/sha2-ce-glue.c        |   2
-rw-r--r--  arch/arm64/crypto/sha3-ce-glue.c        |   2
-rw-r--r--  arch/arm64/crypto/sha512-ce-glue.c      |   2
-rw-r--r--  arch/arm64/crypto/sm3-ce-glue.c         |   2
-rw-r--r--  arch/arm64/crypto/sm3-neon-glue.c       |   2
20 files changed, 241 insertions(+), 932 deletions(-)
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index eb7b423ba463..5636ab83f22a 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -268,6 +268,7 @@ config CRYPTO_AES_ARM64_CE_CCM depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_ALGAPI select CRYPTO_AES_ARM64_CE + select CRYPTO_AES_ARM64_CE_BLK select CRYPTO_AEAD select CRYPTO_LIB_AES help @@ -311,15 +312,5 @@ config CRYPTO_SM4_ARM64_CE_GCM - PMULL (Polynomial Multiply Long) instructions - NEON (Advanced SIMD) extensions -config CRYPTO_CRCT10DIF_ARM64_CE - tristate "CRCT10DIF (PMULL)" - depends on KERNEL_MODE_NEON && CRC_T10DIF - select CRYPTO_HASH - help - CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF) - - Architecture: arm64 using - - PMULL (Polynomial Multiply Long) instructions - endmenu diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index fbe64dce66e0..e7139c4768ce 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -44,9 +44,6 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o -obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o -crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o - obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S index b03f7f71f893..f2624238fd95 100644 --- a/arch/arm64/crypto/aes-ce-ccm-core.S +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -1,8 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions + * aes-ce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions * - * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd. + * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel <ardb@kernel.org> */ #include <linux/linkage.h> @@ -11,211 +14,129 @@ .text .arch armv8-a+crypto - /* - * u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, - * u32 macp, u8 const rk[], u32 rounds); - */ -SYM_FUNC_START(ce_aes_ccm_auth_data) - ld1 {v0.16b}, [x0] /* load mac */ - cbz w3, 1f - sub w3, w3, #16 - eor v1.16b, v1.16b, v1.16b -0: ldrb w7, [x1], #1 /* get 1 byte of input */ - subs w2, w2, #1 - add w3, w3, #1 - ins v1.b[0], w7 - ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ - beq 8f /* out of input? */ - cbnz w3, 0b - eor v0.16b, v0.16b, v1.16b -1: ld1 {v3.4s}, [x4] /* load first round key */ - prfm pldl1strm, [x1] - cmp w5, #12 /* which key size? */ - add x6, x4, #16 - sub w7, w5, #2 /* modified # of rounds */ - bmi 2f - bne 5f - mov v5.16b, v3.16b - b 4f -2: mov v4.16b, v3.16b - ld1 {v5.4s}, [x6], #16 /* load 2nd round key */ -3: aese v0.16b, v4.16b - aesmc v0.16b, v0.16b -4: ld1 {v3.4s}, [x6], #16 /* load next round key */ - aese v0.16b, v5.16b - aesmc v0.16b, v0.16b -5: ld1 {v4.4s}, [x6], #16 /* load next round key */ - subs w7, w7, #3 - aese v0.16b, v3.16b - aesmc v0.16b, v0.16b - ld1 {v5.4s}, [x6], #16 /* load next round key */ - bpl 3b - aese v0.16b, v4.16b - subs w2, w2, #16 /* last data? 
*/ - eor v0.16b, v0.16b, v5.16b /* final round */ - bmi 6f - ld1 {v1.16b}, [x1], #16 /* load next input block */ - eor v0.16b, v0.16b, v1.16b /* xor with mac */ - bne 1b -6: st1 {v0.16b}, [x0] /* store mac */ - beq 10f - adds w2, w2, #16 - beq 10f - mov w3, w2 -7: ldrb w7, [x1], #1 - umov w6, v0.b[0] - eor w6, w6, w7 - strb w6, [x0], #1 - subs w2, w2, #1 - beq 10f - ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ - b 7b -8: cbz w3, 91f - mov w7, w3 - add w3, w3, #16 -9: ext v1.16b, v1.16b, v1.16b, #1 - adds w7, w7, #1 - bne 9b -91: eor v0.16b, v0.16b, v1.16b - st1 {v0.16b}, [x0] -10: mov w0, w3 - ret -SYM_FUNC_END(ce_aes_ccm_auth_data) + .macro load_round_keys, rk, nr, tmp + sub w\tmp, \nr, #10 + add \tmp, \rk, w\tmp, sxtw #4 + ld1 {v10.4s-v13.4s}, [\rk] + ld1 {v14.4s-v17.4s}, [\tmp], #64 + ld1 {v18.4s-v21.4s}, [\tmp], #64 + ld1 {v3.4s-v5.4s}, [\tmp] + .endm - /* - * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], - * u32 rounds); - */ -SYM_FUNC_START(ce_aes_ccm_final) - ld1 {v3.4s}, [x2], #16 /* load first round key */ - ld1 {v0.16b}, [x0] /* load mac */ - cmp w3, #12 /* which key size? */ - sub w3, w3, #2 /* modified # of rounds */ - ld1 {v1.16b}, [x1] /* load 1st ctriv */ - bmi 0f - bne 3f - mov v5.16b, v3.16b - b 2f -0: mov v4.16b, v3.16b -1: ld1 {v5.4s}, [x2], #16 /* load next round key */ - aese v0.16b, v4.16b - aesmc v0.16b, v0.16b - aese v1.16b, v4.16b - aesmc v1.16b, v1.16b -2: ld1 {v3.4s}, [x2], #16 /* load next round key */ - aese v0.16b, v5.16b - aesmc v0.16b, v0.16b - aese v1.16b, v5.16b - aesmc v1.16b, v1.16b -3: ld1 {v4.4s}, [x2], #16 /* load next round key */ - subs w3, w3, #3 - aese v0.16b, v3.16b - aesmc v0.16b, v0.16b - aese v1.16b, v3.16b - aesmc v1.16b, v1.16b - bpl 1b - aese v0.16b, v4.16b - aese v1.16b, v4.16b - /* final round key cancels out */ - eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ - st1 {v0.16b}, [x0] /* store result */ - ret -SYM_FUNC_END(ce_aes_ccm_final) + .macro dround, va, vb, vk + aese \va\().16b, \vk\().16b + aesmc \va\().16b, \va\().16b + aese \vb\().16b, \vk\().16b + aesmc \vb\().16b, \vb\().16b + .endm + + .macro aes_encrypt, va, vb, nr + tbz \nr, #2, .L\@ + dround \va, \vb, v10 + dround \va, \vb, v11 + tbz \nr, #1, .L\@ + dround \va, \vb, v12 + dround \va, \vb, v13 +.L\@: .irp v, v14, v15, v16, v17, v18, v19, v20, v21, v3 + dround \va, \vb, \v + .endr + aese \va\().16b, v4.16b + aese \vb\().16b, v4.16b + .endm .macro aes_ccm_do_crypt,enc - cbz x2, 5f - ldr x8, [x6, #8] /* load lower ctr */ + load_round_keys x3, w4, x10 + ld1 {v0.16b}, [x5] /* load mac */ + cbz x2, ce_aes_ccm_final + ldr x8, [x6, #8] /* load lower ctr */ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ 0: /* outer loop */ ld1 {v1.8b}, [x6] /* load upper ctr */ prfm pldl1strm, [x1] add x8, x8, #1 rev x9, x8 - cmp w4, #12 /* which key size? 
*/ - sub w7, w4, #2 /* get modified # of rounds */ ins v1.d[1], x9 /* no carry in lower ctr */ - ld1 {v3.4s}, [x3] /* load first round key */ - add x10, x3, #16 - bmi 1f - bne 4f - mov v5.16b, v3.16b - b 3f -1: mov v4.16b, v3.16b - ld1 {v5.4s}, [x10], #16 /* load 2nd round key */ -2: /* inner loop: 3 rounds, 2x interleaved */ - aese v0.16b, v4.16b - aesmc v0.16b, v0.16b - aese v1.16b, v4.16b - aesmc v1.16b, v1.16b -3: ld1 {v3.4s}, [x10], #16 /* load next round key */ - aese v0.16b, v5.16b - aesmc v0.16b, v0.16b - aese v1.16b, v5.16b - aesmc v1.16b, v1.16b -4: ld1 {v4.4s}, [x10], #16 /* load next round key */ - subs w7, w7, #3 - aese v0.16b, v3.16b - aesmc v0.16b, v0.16b - aese v1.16b, v3.16b - aesmc v1.16b, v1.16b - ld1 {v5.4s}, [x10], #16 /* load next round key */ - bpl 2b - aese v0.16b, v4.16b - aese v1.16b, v4.16b + + aes_encrypt v0, v1, w4 + subs w2, w2, #16 - bmi 6f /* partial block? */ + bmi ce_aes_ccm_crypt_tail ld1 {v2.16b}, [x1], #16 /* load next input block */ .if \enc == 1 eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ - eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ + eor v6.16b, v1.16b, v2.16b /* xor with crypted ctr */ .else eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */ - eor v1.16b, v2.16b, v5.16b /* final round enc */ + eor v6.16b, v2.16b, v5.16b /* final round enc */ .endif eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ - st1 {v1.16b}, [x0], #16 /* write output block */ + st1 {v6.16b}, [x0], #16 /* write output block */ bne 0b CPU_LE( rev x8, x8 ) - st1 {v0.16b}, [x5] /* store mac */ str x8, [x6, #8] /* store lsb end of ctr (BE) */ -5: ret - -6: eor v0.16b, v0.16b, v5.16b /* final round mac */ - eor v1.16b, v1.16b, v5.16b /* final round enc */ + cbnz x7, ce_aes_ccm_final st1 {v0.16b}, [x5] /* store mac */ - add w2, w2, #16 /* process partial tail block */ -7: ldrb w9, [x1], #1 /* get 1 byte of input */ - umov w6, v1.b[0] /* get top crypted ctr byte */ - umov w7, v0.b[0] /* get top mac byte */ - .if \enc == 1 - eor w7, w7, w9 - eor w9, w9, w6 - .else - eor w9, w9, w6 - eor w7, w7, w9 - .endif - strb w9, [x0], #1 /* store out byte */ - strb w7, [x5], #1 /* store mac byte */ - subs w2, w2, #1 - beq 5b - ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ - ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ - b 7b + ret .endm +SYM_FUNC_START_LOCAL(ce_aes_ccm_crypt_tail) + eor v0.16b, v0.16b, v5.16b /* final round mac */ + eor v1.16b, v1.16b, v5.16b /* final round enc */ + + add x1, x1, w2, sxtw /* rewind the input pointer (w2 < 0) */ + add x0, x0, w2, sxtw /* rewind the output pointer */ + + adr_l x8, .Lpermute /* load permute vectors */ + add x9, x8, w2, sxtw + sub x8, x8, w2, sxtw + ld1 {v7.16b-v8.16b}, [x9] + ld1 {v9.16b}, [x8] + + ld1 {v2.16b}, [x1] /* load a full block of input */ + tbl v1.16b, {v1.16b}, v7.16b /* move keystream to end of register */ + eor v7.16b, v2.16b, v1.16b /* encrypt partial input block */ + bif v2.16b, v7.16b, v22.16b /* select plaintext */ + tbx v7.16b, {v6.16b}, v8.16b /* insert output from previous iteration */ + tbl v2.16b, {v2.16b}, v9.16b /* copy plaintext to start of v2 */ + eor v0.16b, v0.16b, v2.16b /* fold plaintext into mac */ + + st1 {v7.16b}, [x0] /* store output block */ + cbz x7, 0f + +SYM_INNER_LABEL(ce_aes_ccm_final, SYM_L_LOCAL) + ld1 {v1.16b}, [x7] /* load 1st ctriv */ + + aes_encrypt v0, v1, w4 + + /* final round key cancels out */ + eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ +0: st1 {v0.16b}, [x5] /* store result */ + ret +SYM_FUNC_END(ce_aes_ccm_crypt_tail) + /* * void 
ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, * u8 const rk[], u32 rounds, u8 mac[], - * u8 ctr[]); + * u8 ctr[], u8 const final_iv[]); * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, * u8 const rk[], u32 rounds, u8 mac[], - * u8 ctr[]); + * u8 ctr[], u8 const final_iv[]); */ SYM_FUNC_START(ce_aes_ccm_encrypt) + movi v22.16b, #255 aes_ccm_do_crypt 1 SYM_FUNC_END(ce_aes_ccm_encrypt) SYM_FUNC_START(ce_aes_ccm_decrypt) + movi v22.16b, #0 aes_ccm_do_crypt 0 SYM_FUNC_END(ce_aes_ccm_decrypt) + + .section ".rodata", "a" + .align 6 + .fill 15, 1, 0xff +.Lpermute: + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .fill 15, 1, 0xff diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index 25cd3808ecbe..a2b5d6f20f4d 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -1,12 +1,15 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions + * aes-ce-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions * - * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd. + * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel <ardb@kernel.org> */ #include <asm/neon.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/aes.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> @@ -15,6 +18,8 @@ #include "aes-ce-setkey.h" +MODULE_IMPORT_NS("CRYPTO_INTERNAL"); + static int num_rounds(struct crypto_aes_ctx *ctx) { /* @@ -27,19 +32,17 @@ static int num_rounds(struct crypto_aes_ctx *ctx) return 6 + ctx->key_length / 4; } -asmlinkage u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, - u32 macp, u32 const rk[], u32 rounds); +asmlinkage u32 ce_aes_mac_update(u8 const in[], u32 const rk[], int rounds, + int blocks, u8 dg[], int enc_before, + int enc_after); asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, u32 const rk[], u32 rounds, u8 mac[], - u8 ctr[]); + u8 ctr[], u8 const final_iv[]); asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, u32 const rk[], u32 rounds, u8 mac[], - u8 ctr[]); - -asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], - u32 rounds); + u8 ctr[], u8 const final_iv[]); static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, unsigned int key_len) @@ -94,6 +97,41 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) return 0; } +static u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, + u32 macp, u32 const rk[], u32 rounds) +{ + int enc_after = (macp + abytes) % AES_BLOCK_SIZE; + + do { + u32 blocks = abytes / AES_BLOCK_SIZE; + + if (macp == AES_BLOCK_SIZE || (!macp && blocks > 0)) { + u32 rem = ce_aes_mac_update(in, rk, rounds, blocks, mac, + macp, enc_after); + u32 adv = (blocks - rem) * AES_BLOCK_SIZE; + + macp = enc_after ? 
0 : AES_BLOCK_SIZE; + in += adv; + abytes -= adv; + + if (unlikely(rem)) { + kernel_neon_end(); + kernel_neon_begin(); + macp = 0; + } + } else { + u32 l = min(AES_BLOCK_SIZE - macp, abytes); + + crypto_xor(&mac[macp], in, l); + in += l; + macp += l; + abytes -= l; + } + } while (abytes > 0); + + return macp; +} + static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) { struct crypto_aead *aead = crypto_aead_reqtfm(req); @@ -101,7 +139,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) struct __packed { __be16 l; __be32 h; u16 len; } ltag; struct scatter_walk walk; u32 len = req->assoclen; - u32 macp = 0; + u32 macp = AES_BLOCK_SIZE; /* prepend the AAD with a length tag */ if (len < 0xff00) { @@ -125,16 +163,11 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) scatterwalk_start(&walk, sg_next(walk.sg)); n = scatterwalk_clamp(&walk, len); } - n = min_t(u32, n, SZ_4K); /* yield NEON at least every 4k */ p = scatterwalk_map(&walk); macp = ce_aes_ccm_auth_data(mac, p, n, macp, ctx->key_enc, num_rounds(ctx)); - if (len / SZ_4K > (len - n) / SZ_4K) { - kernel_neon_end(); - kernel_neon_begin(); - } len -= n; scatterwalk_unmap(p); @@ -149,7 +182,7 @@ static int ccm_encrypt(struct aead_request *req) struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); struct skcipher_walk walk; u8 __aligned(8) mac[AES_BLOCK_SIZE]; - u8 buf[AES_BLOCK_SIZE]; + u8 orig_iv[AES_BLOCK_SIZE]; u32 len = req->cryptlen; int err; @@ -158,42 +191,55 @@ static int ccm_encrypt(struct aead_request *req) return err; /* preserve the original iv for the final round */ - memcpy(buf, req->iv, AES_BLOCK_SIZE); + memcpy(orig_iv, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_encrypt(&walk, req, false); + if (unlikely(err)) + return err; kernel_neon_begin(); if (req->assoclen) ccm_calculate_auth_mac(req, mac); - while (walk.nbytes) { + do { u32 tail = walk.nbytes % AES_BLOCK_SIZE; - bool final = walk.nbytes == walk.total; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + u8 *final_iv = NULL; - if (final) + if (walk.nbytes == walk.total) { tail = 0; + final_iv = orig_iv; + } - ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes - tail, ctx->key_enc, - num_rounds(ctx), mac, walk.iv); + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(&buf[sizeof(buf) - walk.nbytes], + src, walk.nbytes); - if (!final) - kernel_neon_end(); - err = skcipher_walk_done(&walk, tail); - if (!final) - kernel_neon_begin(); - } + ce_aes_ccm_encrypt(dst, src, walk.nbytes - tail, + ctx->key_enc, num_rounds(ctx), + mac, walk.iv, final_iv); + + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, dst, walk.nbytes); - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + if (walk.nbytes) { + err = skcipher_walk_done(&walk, tail); + } + } while (walk.nbytes); kernel_neon_end(); + if (unlikely(err)) + return err; + /* copy authtag to end of dst */ scatterwalk_map_and_copy(mac, req->dst, req->assoclen + req->cryptlen, crypto_aead_authsize(aead), 1); - return err; + return 0; } static int ccm_decrypt(struct aead_request *req) @@ -203,7 +249,7 @@ static int ccm_decrypt(struct aead_request *req) unsigned int authsize = crypto_aead_authsize(aead); struct skcipher_walk walk; u8 __aligned(8) mac[AES_BLOCK_SIZE]; - u8 buf[AES_BLOCK_SIZE]; + u8 orig_iv[AES_BLOCK_SIZE]; u32 len = req->cryptlen - authsize; int err; @@ -212,34 +258,44 @@ static int ccm_decrypt(struct aead_request *req) return err; /* 
preserve the original iv for the final round */ - memcpy(buf, req->iv, AES_BLOCK_SIZE); + memcpy(orig_iv, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_decrypt(&walk, req, false); + if (unlikely(err)) + return err; kernel_neon_begin(); if (req->assoclen) ccm_calculate_auth_mac(req, mac); - while (walk.nbytes) { + do { u32 tail = walk.nbytes % AES_BLOCK_SIZE; - bool final = walk.nbytes == walk.total; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + u8 *final_iv = NULL; - if (final) + if (walk.nbytes == walk.total) { tail = 0; + final_iv = orig_iv; + } - ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes - tail, ctx->key_enc, - num_rounds(ctx), mac, walk.iv); + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(&buf[sizeof(buf) - walk.nbytes], + src, walk.nbytes); - if (!final) - kernel_neon_end(); - err = skcipher_walk_done(&walk, tail); - if (!final) - kernel_neon_begin(); - } + ce_aes_ccm_decrypt(dst, src, walk.nbytes - tail, + ctx->key_enc, num_rounds(ctx), + mac, walk.iv, final_iv); + + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, dst, walk.nbytes); - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + if (walk.nbytes) { + err = skcipher_walk_done(&walk, tail); + } + } while (walk.nbytes); kernel_neon_end(); @@ -247,11 +303,11 @@ static int ccm_decrypt(struct aead_request *req) return err; /* compare calculated auth tag with the stored one */ - scatterwalk_map_and_copy(buf, req->src, + scatterwalk_map_and_copy(orig_iv, req->src, req->assoclen + req->cryptlen - authsize, authsize, 0); - if (crypto_memneq(mac, buf, authsize)) + if (crypto_memneq(mac, orig_iv, authsize)) return -EBADMSG; return 0; } @@ -290,6 +346,6 @@ module_init(aes_mod_init); module_exit(aes_mod_exit); MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ccm(aes)"); diff --git a/arch/arm64/crypto/aes-ce-glue.c b/arch/arm64/crypto/aes-ce-glue.c index e921823ca103..00b8749013c5 100644 --- a/arch/arm64/crypto/aes-ce-glue.c +++ b/arch/arm64/crypto/aes-ce-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/aes.h> #include <crypto/algapi.h> #include <crypto/internal/simd.h> diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S index 1dc5bbbfeed2..b262eaa9170c 100644 --- a/arch/arm64/crypto/aes-ce.S +++ b/arch/arm64/crypto/aes-ce.S @@ -25,33 +25,28 @@ .endm /* preload all round keys */ - .macro load_round_keys, rounds, rk - cmp \rounds, #12 - blo 2222f /* 128 bits */ - beq 1111f /* 192 bits */ - ld1 {v17.4s-v18.4s}, [\rk], #32 -1111: ld1 {v19.4s-v20.4s}, [\rk], #32 -2222: ld1 {v21.4s-v24.4s}, [\rk], #64 - ld1 {v25.4s-v28.4s}, [\rk], #64 - ld1 {v29.4s-v31.4s}, [\rk] + .macro load_round_keys, rk, nr, tmp + add \tmp, \rk, \nr, sxtw #4 + sub \tmp, \tmp, #160 + ld1 {v17.4s-v20.4s}, [\rk] + ld1 {v21.4s-v24.4s}, [\tmp], #64 + ld1 {v25.4s-v28.4s}, [\tmp], #64 + ld1 {v29.4s-v31.4s}, [\tmp] .endm /* prepare for encryption with key in rk[] */ .macro enc_prepare, rounds, rk, temp - mov \temp, \rk - load_round_keys \rounds, \temp + load_round_keys \rk, \rounds, \temp .endm /* prepare for encryption (again) but with new key in rk[] */ .macro enc_switch_key, rounds, rk, temp - mov \temp, \rk - load_round_keys \rounds, \temp + 
load_round_keys \rk, \rounds, \temp .endm /* prepare for decryption with key in rk[] */ .macro dec_prepare, rounds, rk, temp - mov \temp, \rk - load_round_keys \rounds, \temp + load_round_keys \rk, \rounds, \temp .endm .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4 @@ -110,14 +105,13 @@ /* up to 5 interleaved blocks */ .macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4 - cmp \rounds, #12 - blo 2222f /* 128 bits */ - beq 1111f /* 192 bits */ + tbz \rounds, #2, .L\@ /* 128 bits */ round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4 round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4 -1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4 + tbz \rounds, #1, .L\@ /* 192 bits */ + round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4 round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4 -2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 +.L\@: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4 .endr fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4 diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 162787c7aa86..b0150999743f 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -1048,6 +1048,7 @@ unregister_ciphers: #ifdef USE_V8_CRYPTO_EXTENSIONS module_cpu_feature_match(AES, aes_init); +EXPORT_SYMBOL_NS(ce_aes_mac_update, "CRYPTO_INTERNAL"); #else module_init(aes_init); EXPORT_SYMBOL(neon_aes_ecb_encrypt); diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S index 9de7fbc797af..3a8961b6ea51 100644 --- a/arch/arm64/crypto/aes-neon.S +++ b/arch/arm64/crypto/aes-neon.S @@ -99,16 +99,16 @@ ld1 {v15.4s}, [\rk] add \rkp, \rk, #16 mov \i, \rounds -1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ +.La\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ movi v15.16b, #0x40 tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ sub_bytes \in - subs \i, \i, #1 + sub \i, \i, #1 ld1 {v15.4s}, [\rkp], #16 - beq 2222f + cbz \i, .Lb\@ mix_columns \in, \enc - b 1111b -2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ + b .La\@ +.Lb\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ .endm .macro encrypt_block, in, rounds, rk, rkp, i @@ -206,7 +206,7 @@ ld1 {v15.4s}, [\rk] add \rkp, \rk, #16 mov \i, \rounds -1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ +.La\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ @@ -216,13 +216,13 @@ tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ sub_bytes_4x \in0, \in1, \in2, \in3 - subs \i, \i, #1 + sub \i, \i, #1 ld1 {v15.4s}, [\rkp], #16 - beq 2222f + cbz \i, .Lb\@ mix_columns_2x \in0, \in1, \enc mix_columns_2x \in2, \in3, \enc - b 1111b -2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ + b .La\@ +.Lb\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index 467ac2f768ac..46425e7b9755 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -16,6 +16,7 @@ #include <linux/module.h> MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_DESCRIPTION("Bit sliced AES using NEON instructions"); 
MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ecb(aes)"); diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S deleted file mode 100644 index 5604de61d06d..000000000000 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ /dev/null @@ -1,514 +0,0 @@ -// -// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions -// -// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> -// Copyright (C) 2019 Google LLC <ebiggers@google.com> -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License version 2 as -// published by the Free Software Foundation. -// - -// Derived from the x86 version: -// -// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions -// -// Copyright (c) 2013, Intel Corporation -// -// Authors: -// Erdinc Ozturk <erdinc.ozturk@intel.com> -// Vinodh Gopal <vinodh.gopal@intel.com> -// James Guilford <james.guilford@intel.com> -// Tim Chen <tim.c.chen@linux.intel.com> -// -// This software is available to you under a choice of one of two -// licenses. You may choose to be licensed under the terms of the GNU -// General Public License (GPL) Version 2, available from the file -// COPYING in the main directory of this source tree, or the -// OpenIB.org BSD license below: -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// * Neither the name of the Intel Corporation nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// -// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// -// Reference paper titled "Fast CRC Computation for Generic -// Polynomials Using PCLMULQDQ Instruction" -// URL: http://www.intel.com/content/dam/www/public/us/en/documents -// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf -// - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .text - .arch armv8-a+crypto - - init_crc .req w0 - buf .req x1 - len .req x2 - fold_consts_ptr .req x3 - - fold_consts .req v10 - - ad .req v14 - - k00_16 .req v15 - k32_48 .req v16 - - t3 .req v17 - t4 .req v18 - t5 .req v19 - t6 .req v20 - t7 .req v21 - t8 .req v22 - t9 .req v23 - - perm1 .req v24 - perm2 .req v25 - perm3 .req v26 - perm4 .req v27 - - bd1 .req v28 - bd2 .req v29 - bd3 .req v30 - bd4 .req v31 - - .macro __pmull_init_p64 - .endm - - .macro __pmull_pre_p64, bd - .endm - - .macro __pmull_init_p8 - // k00_16 := 0x0000000000000000_000000000000ffff - // k32_48 := 0x00000000ffffffff_0000ffffffffffff - movi k32_48.2d, #0xffffffff - mov k32_48.h[2], k32_48.h[0] - ushr k00_16.2d, k32_48.2d, #32 - - // prepare the permutation vectors - mov_q x5, 0x080f0e0d0c0b0a09 - movi perm4.8b, #8 - dup perm1.2d, x5 - eor perm1.16b, perm1.16b, perm4.16b - ushr perm2.2d, perm1.2d, #8 - ushr perm3.2d, perm1.2d, #16 - ushr perm4.2d, perm1.2d, #24 - sli perm2.2d, perm1.2d, #56 - sli perm3.2d, perm1.2d, #48 - sli perm4.2d, perm1.2d, #40 - .endm - - .macro __pmull_pre_p8, bd - tbl bd1.16b, {\bd\().16b}, perm1.16b - tbl bd2.16b, {\bd\().16b}, perm2.16b - tbl bd3.16b, {\bd\().16b}, perm3.16b - tbl bd4.16b, {\bd\().16b}, perm4.16b - .endm - -SYM_FUNC_START_LOCAL(__pmull_p8_core) -.L__pmull_p8_core: - ext t4.8b, ad.8b, ad.8b, #1 // A1 - ext t5.8b, ad.8b, ad.8b, #2 // A2 - ext t6.8b, ad.8b, ad.8b, #3 // A3 - - pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B - pmull t8.8h, ad.8b, bd1.8b // E = A*B1 - pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B - pmull t7.8h, ad.8b, bd2.8b // G = A*B2 - pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B - pmull t9.8h, ad.8b, bd3.8b // I = A*B3 - pmull t3.8h, ad.8b, bd4.8b // K = A*B4 - b 0f - -.L__pmull_p8_core2: - tbl t4.16b, {ad.16b}, perm1.16b // A1 - tbl t5.16b, {ad.16b}, perm2.16b // A2 - tbl t6.16b, {ad.16b}, perm3.16b // A3 - - pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B - pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1 - pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B - pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2 - pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B - pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3 - pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4 - -0: eor t4.16b, t4.16b, t8.16b // L = E + F - eor t5.16b, t5.16b, t7.16b // M = G + H - eor t6.16b, t6.16b, t9.16b // N = I + J - - uzp1 t8.2d, t4.2d, t5.2d - uzp2 t4.2d, t4.2d, t5.2d - uzp1 t7.2d, t6.2d, t3.2d - uzp2 t6.2d, t6.2d, t3.2d - - // t4 = (L) (P0 + P1) << 8 - // t5 = (M) (P2 + P3) << 16 - eor t8.16b, t8.16b, t4.16b - and t4.16b, t4.16b, k32_48.16b - - // t6 = (N) (P4 + P5) << 24 - // t7 = (K) (P6 + P7) << 32 - eor t7.16b, t7.16b, t6.16b - and t6.16b, t6.16b, k00_16.16b - - eor t8.16b, t8.16b, t4.16b - eor t7.16b, t7.16b, t6.16b - - zip2 t5.2d, t8.2d, t4.2d - zip1 t4.2d, t8.2d, t4.2d - zip2 t3.2d, t7.2d, t6.2d - zip1 t6.2d, t7.2d, t6.2d - - ext t4.16b, t4.16b, t4.16b, #15 - ext t5.16b, t5.16b, t5.16b, #14 - ext t6.16b, t6.16b, t6.16b, #13 - ext t3.16b, t3.16b, t3.16b, #12 - - eor t4.16b, t4.16b, t5.16b - eor t6.16b, t6.16b, t3.16b - ret -SYM_FUNC_END(__pmull_p8_core) - - .macro __pmull_p8, rq, ad, bd, i - .ifnc \bd, fold_consts - .err - .endif - mov ad.16b, \ad\().16b - .ifb \i - pmull 
\rq\().8h, \ad\().8b, \bd\().8b // D = A*B - .else - pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B - .endif - - bl .L__pmull_p8_core\i - - eor \rq\().16b, \rq\().16b, t4.16b - eor \rq\().16b, \rq\().16b, t6.16b - .endm - - // Fold reg1, reg2 into the next 32 data bytes, storing the result back - // into reg1, reg2. - .macro fold_32_bytes, p, reg1, reg2 - ldp q11, q12, [buf], #0x20 - - __pmull_\p v8, \reg1, fold_consts, 2 - __pmull_\p \reg1, \reg1, fold_consts - -CPU_LE( rev64 v11.16b, v11.16b ) -CPU_LE( rev64 v12.16b, v12.16b ) - - __pmull_\p v9, \reg2, fold_consts, 2 - __pmull_\p \reg2, \reg2, fold_consts - -CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) -CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) - - eor \reg1\().16b, \reg1\().16b, v8.16b - eor \reg2\().16b, \reg2\().16b, v9.16b - eor \reg1\().16b, \reg1\().16b, v11.16b - eor \reg2\().16b, \reg2\().16b, v12.16b - .endm - - // Fold src_reg into dst_reg, optionally loading the next fold constants - .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts - __pmull_\p v8, \src_reg, fold_consts - __pmull_\p \src_reg, \src_reg, fold_consts, 2 - .ifnb \load_next_consts - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts - .endif - eor \dst_reg\().16b, \dst_reg\().16b, v8.16b - eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b - .endm - - .macro __pmull_p64, rd, rn, rm, n - .ifb \n - pmull \rd\().1q, \rn\().1d, \rm\().1d - .else - pmull2 \rd\().1q, \rn\().2d, \rm\().2d - .endif - .endm - - .macro crc_t10dif_pmull, p - __pmull_init_\p - - // For sizes less than 256 bytes, we can't fold 128 bytes at a time. - cmp len, #256 - b.lt .Lless_than_256_bytes_\@ - - adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts - - // Load the first 128 data bytes. Byte swapping is necessary to make - // the bit order match the polynomial coefficient order. - ldp q0, q1, [buf] - ldp q2, q3, [buf, #0x20] - ldp q4, q5, [buf, #0x40] - ldp q6, q7, [buf, #0x60] - add buf, buf, #0x80 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( rev64 v1.16b, v1.16b ) -CPU_LE( rev64 v2.16b, v2.16b ) -CPU_LE( rev64 v3.16b, v3.16b ) -CPU_LE( rev64 v4.16b, v4.16b ) -CPU_LE( rev64 v5.16b, v5.16b ) -CPU_LE( rev64 v6.16b, v6.16b ) -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) -CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) -CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) -CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) -CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) -CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) -CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) - - // XOR the first 16 data *bits* with the initial CRC value. - movi v8.16b, #0 - mov v8.h[7], init_crc - eor v0.16b, v0.16b, v8.16b - - // Load the constants for folding across 128 bytes. - ld1 {fold_consts.2d}, [fold_consts_ptr] - __pmull_pre_\p fold_consts - - // Subtract 128 for the 128 data bytes just consumed. Subtract another - // 128 to simplify the termination condition of the following loop. - sub len, len, #256 - - // While >= 128 data bytes remain (not counting v0-v7), fold the 128 - // bytes v0-v7 into them, storing the result back into v0-v7. -.Lfold_128_bytes_loop_\@: - fold_32_bytes \p, v0, v1 - fold_32_bytes \p, v2, v3 - fold_32_bytes \p, v4, v5 - fold_32_bytes \p, v6, v7 - - subs len, len, #128 - b.ge .Lfold_128_bytes_loop_\@ - - // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7. - - // Fold across 64 bytes. 
- add fold_consts_ptr, fold_consts_ptr, #16 - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts - fold_16_bytes \p, v0, v4 - fold_16_bytes \p, v1, v5 - fold_16_bytes \p, v2, v6 - fold_16_bytes \p, v3, v7, 1 - // Fold across 32 bytes. - fold_16_bytes \p, v4, v6 - fold_16_bytes \p, v5, v7, 1 - // Fold across 16 bytes. - fold_16_bytes \p, v6, v7 - - // Add 128 to get the correct number of data bytes remaining in 0...127 - // (not counting v7), following the previous extra subtraction by 128. - // Then subtract 16 to simplify the termination condition of the - // following loop. - adds len, len, #(128-16) - - // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7 - // into them, storing the result back into v7. - b.lt .Lfold_16_bytes_loop_done_\@ -.Lfold_16_bytes_loop_\@: - __pmull_\p v8, v7, fold_consts - __pmull_\p v7, v7, fold_consts, 2 - eor v7.16b, v7.16b, v8.16b - ldr q0, [buf], #16 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) - eor v7.16b, v7.16b, v0.16b - subs len, len, #16 - b.ge .Lfold_16_bytes_loop_\@ - -.Lfold_16_bytes_loop_done_\@: - // Add 16 to get the correct number of data bytes remaining in 0...15 - // (not counting v7), following the previous extra subtraction by 16. - adds len, len, #16 - b.eq .Lreduce_final_16_bytes_\@ - -.Lhandle_partial_segment_\@: - // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first - // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To - // do this without needing a fold constant for each possible 'len', - // redivide the bytes into a first chunk of 'len' bytes and a second - // chunk of 16 bytes, then fold the first chunk into the second. - - // v0 = last 16 original data bytes - add buf, buf, len - ldr q0, [buf, #-16] -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) - - // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. - adr_l x4, .Lbyteshift_table + 16 - sub x4, x4, len - ld1 {v2.16b}, [x4] - tbl v1.16b, {v7.16b}, v2.16b - - // v3 = first chunk: v7 right-shifted by '16-len' bytes. - movi v3.16b, #0x80 - eor v2.16b, v2.16b, v3.16b - tbl v3.16b, {v7.16b}, v2.16b - - // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. - sshr v2.16b, v2.16b, #7 - - // v2 = second chunk: 'len' bytes from v0 (low-order bytes), - // then '16-len' bytes from v1 (high-order bytes). - bsl v2.16b, v1.16b, v0.16b - - // Fold the first chunk into the second chunk, storing the result in v7. - __pmull_\p v0, v3, fold_consts - __pmull_\p v7, v3, fold_consts, 2 - eor v7.16b, v7.16b, v0.16b - eor v7.16b, v7.16b, v2.16b - -.Lreduce_final_16_bytes_\@: - // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. - - movi v2.16b, #0 // init zero register - - // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts - - // Fold the high 64 bits into the low 64 bits, while also multiplying by - // x^64. This produces a 128-bit value congruent to x^64 * M(x) and - // whose low 48 bits are 0. - ext v0.16b, v2.16b, v7.16b, #8 - __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x)) - eor v0.16b, v0.16b, v7.16b // + low bits * x^64 - - // Fold the high 32 bits into the low 96 bits. This produces a 96-bit - // value congruent to x^64 * M(x) and whose low 48 bits are 0. 
- ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits - mov v0.s[3], v2.s[0] // zero high 32 bits - __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x)) - eor v0.16b, v0.16b, v1.16b // + low bits - - // Load G(x) and floor(x^48 / G(x)). - ld1 {fold_consts.2d}, [fold_consts_ptr] - __pmull_pre_\p fold_consts - - // Use Barrett reduction to compute the final CRC value. - __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x)) - ushr v1.2d, v1.2d, #32 // /= x^32 - __pmull_\p v1, v1, fold_consts // *= G(x) - ushr v0.2d, v0.2d, #48 - eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits - // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. - - umov w0, v0.h[0] - .ifc \p, p8 - frame_pop - .endif - ret - -.Lless_than_256_bytes_\@: - // Checksumming a buffer of length 16...255 bytes - - adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts - - // Load the first 16 data bytes. - ldr q7, [buf], #0x10 -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) - - // XOR the first 16 data *bits* with the initial CRC value. - movi v0.16b, #0 - mov v0.h[7], init_crc - eor v7.16b, v7.16b, v0.16b - - // Load the fold-across-16-bytes constants. - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts - - cmp len, #16 - b.eq .Lreduce_final_16_bytes_\@ // len == 16 - subs len, len, #32 - b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255 - add len, len, #16 - b .Lhandle_partial_segment_\@ // 17 <= len <= 31 - .endm - -// -// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); -// -// Assumes len >= 16. -// -SYM_FUNC_START(crc_t10dif_pmull_p8) - frame_push 1 - crc_t10dif_pmull p8 -SYM_FUNC_END(crc_t10dif_pmull_p8) - - .align 5 -// -// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); -// -// Assumes len >= 16. -// -SYM_FUNC_START(crc_t10dif_pmull_p64) - crc_t10dif_pmull p64 -SYM_FUNC_END(crc_t10dif_pmull_p64) - - .section ".rodata", "a" - .align 4 - -// Fold constants precomputed from the polynomial 0x18bb7 -// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 -.Lfold_across_128_bytes_consts: - .quad 0x0000000000006123 // x^(8*128) mod G(x) - .quad 0x0000000000002295 // x^(8*128+64) mod G(x) -// .Lfold_across_64_bytes_consts: - .quad 0x0000000000001069 // x^(4*128) mod G(x) - .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) -// .Lfold_across_32_bytes_consts: - .quad 0x000000000000857d // x^(2*128) mod G(x) - .quad 0x0000000000007acc // x^(2*128+64) mod G(x) -.Lfold_across_16_bytes_consts: - .quad 0x000000000000a010 // x^(1*128) mod G(x) - .quad 0x0000000000001faa // x^(1*128+64) mod G(x) -// .Lfinal_fold_consts: - .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) - .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) -// .Lbarrett_reduction_consts: - .quad 0x0000000000018bb7 // G(x) - .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) - -// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - -// len] is the index vector to shift left by 'len' bytes, and is also {0x80, -// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. 
-.Lbyteshift_table: - .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 - .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f - .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 - .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c deleted file mode 100644 index 09eb1456aed4..000000000000 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ /dev/null @@ -1,143 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions - * - * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <linux/cpufeature.h> -#include <linux/crc-t10dif.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> - -#include <asm/neon.h> -#include <asm/simd.h> - -#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U - -asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); -asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); - -static int crct10dif_init(struct shash_desc *desc) -{ - u16 *crc = shash_desc_ctx(desc); - - *crc = 0; - return 0; -} - -static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - u16 *crc = shash_desc_ctx(desc); - - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - do { - unsigned int chunk = length; - - if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) - chunk = SZ_4K; - - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p8(*crc, data, chunk); - kernel_neon_end(); - data += chunk; - length -= chunk; - } while (length); - } else { - *crc = crc_t10dif_generic(*crc, data, length); - } - - return 0; -} - -static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - u16 *crc = shash_desc_ctx(desc); - - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - do { - unsigned int chunk = length; - - if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) - chunk = SZ_4K; - - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p64(*crc, data, chunk); - kernel_neon_end(); - data += chunk; - length -= chunk; - } while (length); - } else { - *crc = crc_t10dif_generic(*crc, data, length); - } - - return 0; -} - -static int crct10dif_final(struct shash_desc *desc, u8 *out) -{ - u16 *crc = shash_desc_ctx(desc); - - *(u16 *)out = *crc; - return 0; -} - -static struct shash_alg crc_t10dif_alg[] = {{ - .digestsize = CRC_T10DIF_DIGEST_SIZE, - .init = crct10dif_init, - .update = crct10dif_update_pmull_p8, - .final = crct10dif_final, - .descsize = CRC_T10DIF_DIGEST_SIZE, - - .base.cra_name = "crct10dif", - .base.cra_driver_name = "crct10dif-arm64-neon", - .base.cra_priority = 100, - .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = CRC_T10DIF_DIGEST_SIZE, - .init = crct10dif_init, - .update = crct10dif_update_pmull_p64, - .final = crct10dif_final, - .descsize = CRC_T10DIF_DIGEST_SIZE, - - .base.cra_name = "crct10dif", - .base.cra_driver_name = "crct10dif-arm64-ce", - .base.cra_priority = 200, - .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}}; - -static int __init crc_t10dif_mod_init(void) -{ - if (cpu_have_named_feature(PMULL)) - return crypto_register_shashes(crc_t10dif_alg, - ARRAY_SIZE(crc_t10dif_alg)); - else - /* only register the first array element */ - return 
crypto_register_shash(crc_t10dif_alg); -} - -static void __exit crc_t10dif_mod_exit(void) -{ - if (cpu_have_named_feature(PMULL)) - crypto_unregister_shashes(crc_t10dif_alg, - ARRAY_SIZE(crc_t10dif_alg)); - else - crypto_unregister_shash(crc_t10dif_alg); -} - -module_cpu_feature_match(ASIMD, crc_t10dif_mod_init); -module_exit(crc_t10dif_mod_exit); - -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("crct10dif"); -MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce"); diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 97331b454ea8..da7b7ec1a664 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/aes.h> #include <crypto/gcm.h> #include <crypto/algapi.h> diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl index cbc980fb02e3..22c9069c0650 100644 --- a/arch/arm64/crypto/poly1305-armv8.pl +++ b/arch/arm64/crypto/poly1305-armv8.pl @@ -473,7 +473,8 @@ poly1305_blocks_neon: subs $len,$len,#64 ldp x9,x13,[$inp,#48] add $in2,$inp,#96 - adr $zeros,.Lzeros + adrp $zeros,.Lzeros + add $zeros,$zeros,#:lo12:.Lzeros lsl $padbit,$padbit,#24 add x15,$ctx,#48 @@ -885,10 +886,13 @@ poly1305_blocks_neon: ret .size poly1305_blocks_neon,.-poly1305_blocks_neon +.pushsection .rodata .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 .asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" +.popsection + .align 2 #if !defined(__KERNEL__) && !defined(_WIN64) .comm OPENSSL_armcap_P,4,4 diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index 1fae18ba11ed..18883ea438f3 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -8,7 +8,7 @@ #include <asm/hwcap.h> #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/algapi.h> #include <crypto/internal/hash.h> #include <crypto/internal/poly1305.h> @@ -226,6 +226,7 @@ static void __exit neon_poly1305_mod_exit(void) module_init(neon_poly1305_mod_init); module_exit(neon_poly1305_mod_exit); +MODULE_DESCRIPTION("Poly1305 transform using NEON instructions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-neon"); diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index 1dd93e1fcb39..cbd14f208f83 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sha1.h> diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 0a44d2e7ee1f..6b4866a88ded 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sha2.h> diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index 250e1377c481..5662c3ac49e9 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -12,7 +12,7 @@ #include <asm/hwcap.h> #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include 
<linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sha3.h> diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index f3431fc62315..071f64293227 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -11,7 +11,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sha2.h> diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index 54bf6ebcfffb..1a71788c4cda 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sm3.h> diff --git a/arch/arm64/crypto/sm3-neon-glue.c b/arch/arm64/crypto/sm3-neon-glue.c index 7182ee683f14..8dd71ce79b69 100644 --- a/arch/arm64/crypto/sm3-neon-glue.c +++ b/arch/arm64/crypto/sm3-neon-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sm3.h> |