Diffstat (limited to 'arch/arm/crypto/ghash-ce-core.S')
 arch/arm/crypto/ghash-ce-core.S | 387 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 372 insertions(+), 15 deletions(-)
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index 534c9647726d..858c0d66798b 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -2,12 +2,16 @@
/*
* Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
*
- * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 - 2017 Linaro Ltd.
+ * Copyright (C) 2023 Google LLC. <ardb@google.com>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+ .arch armv8-a
+ .fpu crypto-neon-fp-armv8
+
SHASH .req q0
T1 .req q1
XL .req q2
@@ -41,7 +45,7 @@
t2q .req q7
t3q .req q8
t4q .req q9
- T2 .req q9
+ XH2 .req q9
s1l .req d20
s1h .req d21
@@ -77,7 +81,7 @@
XL2 .req q5
XM2 .req q6
- XH2 .req q7
+ T2 .req q7
T3 .req q8
XL2_L .req d10
@@ -88,8 +92,6 @@
T3_H .req d17
.text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
.macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4
vmull.p64 \rd, \rn, \rm
@@ -191,9 +193,10 @@
vshr.u64 XL, XL, #1
.endm
- .macro ghash_update, pn
+ .macro ghash_update, pn, enc, aggregate=1, head=1
vld1.64 {XL}, [r1]
+ .if \head
/* do the head block first, if supplied */
ldr ip, [sp]
teq ip, #0
@@ -201,13 +204,32 @@
vld1.64 {T1}, [ip]
teq r0, #0
b 3f
+ .endif
0: .ifc \pn, p64
+ .if \aggregate
tst r0, #3 // skip until #blocks is a
bne 2f // round multiple of 4
vld1.8 {XL2-XM2}, [r2]!
-1: vld1.8 {T3-T2}, [r2]!
+1: vld1.8 {T2-T3}, [r2]!
+
+ .ifnb \enc
+ \enc\()_4x XL2, XM2, T2, T3
+
+ add ip, r3, #16
+ vld1.64 {HH}, [ip, :128]!
+ vld1.64 {HH3-HH4}, [ip, :128]
+
+ veor SHASH2_p64, SHASH_L, SHASH_H
+ veor SHASH2_H, HH_L, HH_H
+ veor HH34_L, HH3_L, HH3_H
+ veor HH34_H, HH4_L, HH4_H
+
+ vmov.i8 MASK, #0xe1
+ vshl.u64 MASK, MASK, #57
+ .endif
+
vrev64.8 XL2, XL2
vrev64.8 XM2, XM2
@@ -217,8 +239,8 @@
veor XL2_H, XL2_H, XL_L
veor XL, XL, T1
- vrev64.8 T3, T3
- vrev64.8 T1, T2
+ vrev64.8 T1, T3
+ vrev64.8 T3, T2
vmull.p64 XH, HH4_H, XL_H // a1 * b1
veor XL2_H, XL2_H, XL_H
@@ -266,14 +288,22 @@
b 1b
.endif
+ .endif
+
+2: vld1.8 {T1}, [r2]!
+
+ .ifnb \enc
+ \enc\()_1x T1
+ veor SHASH2_p64, SHASH_L, SHASH_H
+ vmov.i8 MASK, #0xe1
+ vshl.u64 MASK, MASK, #57
+ .endif
-2: vld1.64 {T1}, [r2]!
subs r0, r0, #1
3: /* multiply XL by SHASH in GF(2^128) */
-#ifndef CONFIG_CPU_BIG_ENDIAN
vrev64.8 T1, T1
-#endif
+
vext.8 IN1, T1, T1, #8
veor T1_L, T1_L, XL_H
veor XL, XL, IN1
@@ -292,9 +322,6 @@
veor XL, XL, T1
bne 0b
-
- vst1.64 {XL}, [r1]
- bx lr
.endm
/*
@@ -315,6 +342,9 @@ ENTRY(pmull_ghash_update_p64)
vshl.u64 MASK, MASK, #57
ghash_update p64
+ vst1.64 {XL}, [r1]
+
+ bx lr
ENDPROC(pmull_ghash_update_p64)
ENTRY(pmull_ghash_update_p8)
@@ -335,4 +365,331 @@ ENTRY(pmull_ghash_update_p8)
vmov.i64 k48, #0xffffffffffff
ghash_update p8
+ vst1.64 {XL}, [r1]
+
+ bx lr
ENDPROC(pmull_ghash_update_p8)
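
For reference, the per-block operation ghash_update implements is dg = (dg ^ X) * H in GF(2^128), in GCM's reflected bit order. Below is a minimal portable C sketch of the classic bitwise algorithm from NIST SP 800-38D, for illustration only; it is not the kernel's code. The NEON paths above compute the same product with vmull.p64 (or a vmull.p8 synthesis on the p8 path), aggregated over four blocks at a time.

#include <stdint.h>

struct u128 { uint64_t hi, lo; };	/* big-endian 128-bit value */

static struct u128 gf128_mul(struct u128 x, struct u128 y)
{
	const uint64_t R = 0xe100000000000000ULL;	/* GCM reduction poly */
	struct u128 z = { 0, 0 };
	uint64_t bit, carry;
	int i;

	for (i = 0; i < 128; i++) {
		/* bit i of x, MSB first, as GCM numbers its bits */
		bit = (i < 64 ? x.hi >> (63 - i) : x.lo >> (127 - i)) & 1;
		z.hi ^= y.hi & -bit;
		z.lo ^= y.lo & -bit;

		/* y >>= 1 over 128 bits, reducing when a bit falls off */
		carry = y.lo & 1;
		y.lo = (y.lo >> 1) | (y.hi << 63);
		y.hi = (y.hi >> 1) ^ (R & -carry);
	}
	return z;
}

static void ghash_block(struct u128 *dg, struct u128 h,
			const uint8_t src[16])
{
	struct u128 x = { 0, 0 };
	int i;

	/* GHASH treats the block as a big-endian 128-bit value */
	for (i = 0; i < 8; i++) {
		x.hi = (x.hi << 8) | src[i];
		x.lo = (x.lo << 8) | src[8 + i];
	}
	dg->hi ^= x.hi;
	dg->lo ^= x.lo;
	*dg = gf128_mul(*dg, h);
}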
+
+ e0 .req q9
+ e1 .req q10
+ e2 .req q11
+ e3 .req q12
+ e0l .req d18
+ e0h .req d19
+ e2l .req d22
+ e2h .req d23
+ e3l .req d24
+ e3h .req d25
+ ctr .req q13
+ ctr0 .req d26
+ ctr1 .req d27
+
+ ek0 .req q14
+ ek1 .req q15
+
+ .macro round, rk:req, regs:vararg
+ .irp r, \regs
+ aese.8 \r, \rk
+ aesmc.8 \r, \r
+ .endr
+ .endm
+
+ .macro aes_encrypt, rkp, rounds, regs:vararg
+ vld1.8 {ek0-ek1}, [\rkp, :128]!
+ cmp \rounds, #12
+ blt .L\@ // AES-128
+
+ round ek0, \regs
+ vld1.8 {ek0}, [\rkp, :128]!
+ round ek1, \regs
+ vld1.8 {ek1}, [\rkp, :128]!
+
+ beq .L\@ // AES-192
+
+ round ek0, \regs
+ vld1.8 {ek0}, [\rkp, :128]!
+ round ek1, \regs
+ vld1.8 {ek1}, [\rkp, :128]!
+
+.L\@: .rept 4
+ round ek0, \regs
+ vld1.8 {ek0}, [\rkp, :128]!
+ round ek1, \regs
+ vld1.8 {ek1}, [\rkp, :128]!
+ .endr
+
+ round ek0, \regs
+ vld1.8 {ek0}, [\rkp, :128]
+
+ .irp r, \regs
+ aese.8 \r, ek1
+ .endr
+ .irp r, \regs
+ veor \r, \r, ek0
+ .endr
+ .endm
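
The aes_encrypt macro above dispatches on the round count: \rounds is 10, 12 or 14 for AES-128/192/256 (blt takes the AES-128 shortcut, beq the AES-192 one), the .rept 4 supplies eight full rounds, and the last round is an aese.8 followed by a plain veor, i.e. without MixColumns. A control-flow sketch in C follows; it is illustrative only, with only the AddRoundKey XOR modelled and the SubBytes/ShiftRows/MixColumns work of aese.8/aesmc.8 elided, so it shows the round structure, not a working AES.

#include <stdint.h>

struct aes_block { uint8_t b[16]; };

static void add_round_key(struct aes_block *st, const struct aes_block *rk)
{
	int i;

	for (i = 0; i < 16; i++)
		st->b[i] ^= rk->b[i];
}

static void aes_encrypt_shape(struct aes_block *st,
			      const struct aes_block *rk,
			      int rounds /* 10, 12 or 14 */)
{
	int i;

	/* rounds - 1 full rounds (aese.8 + aesmc.8 pairs); the macro
	 * unrolls these, double-buffering key loads through ek0/ek1 */
	for (i = 0; i < rounds - 1; i++)
		add_round_key(st, &rk[i]);	/* + Sub/Shift/Mix (elided) */

	/* final round: aese.8 without aesmc.8 (no MixColumns), then a
	 * plain veor with the last of the rounds + 1 round keys */
	add_round_key(st, &rk[rounds - 1]);	/* + Sub/Shift (elided) */
	add_round_key(st, &rk[rounds]);
}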
+
+pmull_aes_encrypt:
+ add ip, r5, #4
+ vld1.8 {ctr0}, [r5] // load 12-byte IV
+ vld1.8 {ctr1}, [ip]
+ rev r8, r7
+ vext.8 ctr1, ctr1, ctr1, #4
+ add r7, r7, #1
+ vmov.32 ctr1[1], r8
+ vmov e0, ctr
+
+ add ip, r3, #64
+ aes_encrypt ip, r6, e0
+ bx lr
+ENDPROC(pmull_aes_encrypt)
+
+pmull_aes_encrypt_4x:
+ add ip, r5, #4
+ vld1.8 {ctr0}, [r5]
+ vld1.8 {ctr1}, [ip]
+ rev r8, r7
+ vext.8 ctr1, ctr1, ctr1, #4
+ add r7, r7, #1
+ vmov.32 ctr1[1], r8
+ rev ip, r7
+ vmov e0, ctr
+ add r7, r7, #1
+ vmov.32 ctr1[1], ip
+ rev r8, r7
+ vmov e1, ctr
+ add r7, r7, #1
+ vmov.32 ctr1[1], r8
+ rev ip, r7
+ vmov e2, ctr
+ add r7, r7, #1
+ vmov.32 ctr1[1], ip
+ vmov e3, ctr
+
+ add ip, r3, #64
+ aes_encrypt ip, r6, e0, e1, e2, e3
+ bx lr
+ENDPROC(pmull_aes_encrypt_4x)
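
pmull_aes_encrypt_4x builds four consecutive CTR blocks, each the 12-byte IV followed by the 32-bit block counter stored big-endian (the rev/vmov.32 pairs above), and leaves the counter pointing at the next block for the following call. A hedged C sketch of the same setup; the helper name is hypothetical:

#include <stdint.h>
#include <string.h>

static void gcm_ctr_blocks(uint8_t out[4][16], const uint8_t iv[12],
			   uint32_t *counter)
{
	int i;

	for (i = 0; i < 4; i++) {
		uint32_t c = (*counter)++;

		memcpy(out[i], iv, 12);
		out[i][12] = c >> 24;	/* rev: counter is big-endian */
		out[i][13] = c >> 16;
		out[i][14] = c >> 8;
		out[i][15] = c;
	}
}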
+
+pmull_aes_encrypt_final:
+ add ip, r5, #4
+ vld1.8 {ctr0}, [r5]
+ vld1.8 {ctr1}, [ip]
+ rev r8, r7
+ vext.8 ctr1, ctr1, ctr1, #4
+ mov r7, #1 << 24 // BE #1 for the tag
+ vmov.32 ctr1[1], r8
+ vmov e0, ctr
+ vmov.32 ctr1[1], r7
+ vmov e1, ctr
+
+ add ip, r3, #64
+ aes_encrypt ip, r6, e0, e1
+ bx lr
+ENDPROC(pmull_aes_encrypt_final)
+
+ .macro enc_1x, in0
+ bl pmull_aes_encrypt
+ veor \in0, \in0, e0
+ vst1.8 {\in0}, [r4]!
+ .endm
+
+ .macro dec_1x, in0
+ bl pmull_aes_encrypt
+ veor e0, e0, \in0
+ vst1.8 {e0}, [r4]!
+ .endm
+
+ .macro enc_4x, in0, in1, in2, in3
+ bl pmull_aes_encrypt_4x
+
+ veor \in0, \in0, e0
+ veor \in1, \in1, e1
+ veor \in2, \in2, e2
+ veor \in3, \in3, e3
+
+ vst1.8 {\in0-\in1}, [r4]!
+ vst1.8 {\in2-\in3}, [r4]!
+ .endm
+
+ .macro dec_4x, in0, in1, in2, in3
+ bl pmull_aes_encrypt_4x
+
+ veor e0, e0, \in0
+ veor e1, e1, \in1
+ veor e2, e2, \in2
+ veor e3, e3, \in3
+
+ vst1.8 {e0-e1}, [r4]!
+ vst1.8 {e2-e3}, [r4]!
+ .endm
+
+ /*
+ * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
+ * struct gcm_key const *k, char *dst,
+ * char *iv, int rounds, u32 counter)
+ */
+ENTRY(pmull_gcm_encrypt)
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+
+ vld1.64 {SHASH}, [r3]
+
+ ghash_update p64, enc, head=0
+ vst1.64 {XL}, [r1]
+
+ pop {r4-r8, pc}
+ENDPROC(pmull_gcm_encrypt)
+
+ /*
+ * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
+ * struct gcm_key const *k, char *dst,
+ * char *iv, int rounds, u32 counter)
+ */
+ENTRY(pmull_gcm_decrypt)
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+
+ vld1.64 {SHASH}, [r3]
+
+ ghash_update p64, dec, head=0
+ vst1.64 {XL}, [r1]
+
+ pop {r4-r8, pc}
+ENDPROC(pmull_gcm_decrypt)
+
+ /*
+ * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
+ * struct gcm_key const *k, char *head,
+ * char *iv, int rounds, u32 counter)
+ */
+ENTRY(pmull_gcm_enc_final)
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+
+ bl pmull_aes_encrypt_final
+
+ cmp r0, #0
+ beq .Lenc_final
+
+ mov_l ip, .Lpermute
+ sub r4, r4, #16
+ add r8, ip, r0
+ add ip, ip, #32
+ add r4, r4, r0
+ sub ip, ip, r0
+
+ vld1.8 {e3}, [r8] // permute vector for key stream
+ vld1.8 {e2}, [ip] // permute vector for ghash input
+
+ vtbl.8 e3l, {e0}, e3l
+ vtbl.8 e3h, {e0}, e3h
+
+ vld1.8 {e0}, [r4] // encrypt tail block
+ veor e0, e0, e3
+ vst1.8 {e0}, [r4]
+
+ vtbl.8 T1_L, {e0}, e2l
+ vtbl.8 T1_H, {e0}, e2h
+
+ vld1.64 {XL}, [r1]
+.Lenc_final:
+ vld1.64 {SHASH}, [r3, :128]
+ vmov.i8 MASK, #0xe1
+ veor SHASH2_p64, SHASH_L, SHASH_H
+ vshl.u64 MASK, MASK, #57
+ mov r0, #1
+ bne 3f // process head block first
+ ghash_update p64, aggregate=0, head=0
+
+ vrev64.8 XL, XL
+ vext.8 XL, XL, XL, #8
+ veor XL, XL, e1
+
+ sub r2, r2, #16 // rewind src pointer
+ vst1.8 {XL}, [r2] // store tag
+
+ pop {r4-r8, pc}
+ENDPROC(pmull_gcm_enc_final)
+
+ /*
+ * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
+ * struct gcm_key const *k, char *head,
+ * char *iv, int rounds, u32 counter,
+ * const char *otag, int authsize)
+ */
+ENTRY(pmull_gcm_dec_final)
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+
+ bl pmull_aes_encrypt_final
+
+ cmp r0, #0
+ beq .Ldec_final
+
+ mov_l ip, .Lpermute
+ sub r4, r4, #16
+ add r8, ip, r0
+ add ip, ip, #32
+ add r4, r4, r0
+ sub ip, ip, r0
+
+ vld1.8 {e3}, [r8] // permute vector for key stream
+ vld1.8 {e2}, [ip] // permute vector for ghash input
+
+ vtbl.8 e3l, {e0}, e3l
+ vtbl.8 e3h, {e0}, e3h
+
+ vld1.8 {e0}, [r4]
+
+ vtbl.8 T1_L, {e0}, e2l
+ vtbl.8 T1_H, {e0}, e2h
+
+ veor e0, e0, e3
+ vst1.8 {e0}, [r4]
+
+ vld1.64 {XL}, [r1]
+.Ldec_final:
+ vld1.64 {SHASH}, [r3]
+ vmov.i8 MASK, #0xe1
+ veor SHASH2_p64, SHASH_L, SHASH_H
+ vshl.u64 MASK, MASK, #57
+ mov r0, #1
+ bne 3f // process head block first
+ ghash_update p64, aggregate=0, head=0
+
+ vrev64.8 XL, XL
+ vext.8 XL, XL, XL, #8
+ veor XL, XL, e1
+
+ mov_l ip, .Lpermute
+ ldrd r2, r3, [sp, #40] // otag and authsize
+ vld1.8 {T1}, [r2]
+ add ip, ip, r3
+ vceq.i8 T1, T1, XL // compare tags
+ vmvn T1, T1 // 0 for eq, -1 for ne
+
+ vld1.8 {e0}, [ip]
+ vtbl.8 XL_L, {T1}, e0l // keep authsize bytes only
+ vtbl.8 XL_H, {T1}, e0h
+
+ vpmin.s8 XL_L, XL_L, XL_H // take the minimum s8 across the vector
+ vpmin.s8 XL_L, XL_L, XL_L
+ vmov.32 r0, XL_L[0] // fail if != 0x0
+
+ pop {r4-r8, pc}
+ENDPROC(pmull_gcm_dec_final)
+
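
The tag check above is constant-time: vceq.i8 and vmvn turn the comparison into per-byte 0x00/0xff, the vtbl with the permute vector zeroes every lane past authsize, and the two vpmin.s8 folds reduce the vector to one word that is 0 only if every kept byte matched. A portable C sketch of the same contract, illustrative only:

#include <stdint.h>

static int gcm_tag_check(const uint8_t *computed, const uint8_t *otag,
			 int authsize)
{
	uint8_t diff = 0;
	int i;

	/* accumulate differences without branching on secret data */
	for (i = 0; i < authsize; i++)
		diff |= computed[i] ^ otag[i];

	return diff;	/* 0 iff the truncated tags are equal */
}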
+ .section ".rodata", "a", %progbits
+ .align 5
+.Lpermute:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
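
The .Lpermute table drives the partial-block handling in both _final routines. A 16-byte load at .Lpermute + n (0 < n < 16) yields the vtbl index vector [0xff x (16 - n), 0 .. n-1], which shifts the first n bytes of a vector to its end (used to align the keystream with the tail bytes); a load at .Lpermute + 32 - n yields [16-n .. 15, 0xff x (16 - n)], which pulls the last n bytes to the front (used to zero-pad the final ciphertext block for GHASH). Both rely on vtbl returning zero for out-of-range indices. A C model of that vtbl.8 behaviour, sketch only:

#include <stdint.h>

static void vtbl16(uint8_t dst[16], const uint8_t tbl[16],
		   const uint8_t idx[16])
{
	int i;

	for (i = 0; i < 16; i++)
		dst[i] = idx[i] < 16 ? tbl[idx[i]] : 0;	/* 0xff -> 0 */
}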