summaryrefslogtreecommitdiff
path: root/arch/arm64/crypto/aes-neonbs-core.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/crypto/aes-neonbs-core.S')
-rw-r--r--arch/arm64/crypto/aes-neonbs-core.S307
1 files changed, 84 insertions, 223 deletions
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
index 65982039fa36..baf450717b24 100644
--- a/arch/arm64/crypto/aes-neonbs-core.S
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -15,6 +15,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/assembler.h>
.text
@@ -380,7 +381,7 @@ ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f
/*
* void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
*/
-ENTRY(aesbs_convert_key)
+SYM_FUNC_START(aesbs_convert_key)
ld1 {v7.4s}, [x1], #16 // load round 0 key
ld1 {v17.4s}, [x1], #16 // load round 1 key
@@ -425,10 +426,10 @@ ENTRY(aesbs_convert_key)
eor v17.16b, v17.16b, v7.16b
str q17, [x0]
ret
-ENDPROC(aesbs_convert_key)
+SYM_FUNC_END(aesbs_convert_key)
.align 4
-aesbs_encrypt8:
+SYM_FUNC_START_LOCAL(aesbs_encrypt8)
ldr q9, [bskey], #16 // round 0 key
ldr q8, M0SR
ldr q24, SR
@@ -488,10 +489,10 @@ aesbs_encrypt8:
eor v2.16b, v2.16b, v12.16b
eor v5.16b, v5.16b, v12.16b
ret
-ENDPROC(aesbs_encrypt8)
+SYM_FUNC_END(aesbs_encrypt8)
.align 4
-aesbs_decrypt8:
+SYM_FUNC_START_LOCAL(aesbs_decrypt8)
lsl x9, rounds, #7
add bskey, bskey, x9
@@ -553,7 +554,7 @@ aesbs_decrypt8:
eor v3.16b, v3.16b, v12.16b
eor v5.16b, v5.16b, v12.16b
ret
-ENDPROC(aesbs_decrypt8)
+SYM_FUNC_END(aesbs_decrypt8)
/*
* aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
@@ -613,7 +614,6 @@ ENDPROC(aesbs_decrypt8)
st1 {\o7\().16b}, [x19], #16
cbz x23, 1f
- cond_yield_neon
b 99b
1: frame_pop
@@ -621,21 +621,21 @@ ENDPROC(aesbs_decrypt8)
.endm
.align 4
-ENTRY(aesbs_ecb_encrypt)
+SYM_TYPED_FUNC_START(aesbs_ecb_encrypt)
__ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
-ENDPROC(aesbs_ecb_encrypt)
+SYM_FUNC_END(aesbs_ecb_encrypt)
.align 4
-ENTRY(aesbs_ecb_decrypt)
+SYM_TYPED_FUNC_START(aesbs_ecb_decrypt)
__ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
-ENDPROC(aesbs_ecb_decrypt)
+SYM_FUNC_END(aesbs_ecb_decrypt)
/*
* aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
*/
.align 4
-ENTRY(aesbs_cbc_decrypt)
+SYM_FUNC_START(aesbs_cbc_decrypt)
frame_push 6
mov x19, x0
@@ -715,12 +715,11 @@ ENTRY(aesbs_cbc_decrypt)
1: st1 {v24.16b}, [x24] // store IV
cbz x23, 2f
- cond_yield_neon
b 99b
2: frame_pop
ret
-ENDPROC(aesbs_cbc_decrypt)
+SYM_FUNC_END(aesbs_cbc_decrypt)
.macro next_tweak, out, in, const, tmp
sshr \tmp\().2d, \in\().2d, #63
@@ -736,131 +735,78 @@ ENDPROC(aesbs_cbc_decrypt)
* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
*/
-__xts_crypt8:
- mov x6, #1
- lsl x6, x6, x23
- subs w23, w23, #8
- csel x23, x23, xzr, pl
- csel x6, x6, xzr, mi
+SYM_FUNC_START_LOCAL(__xts_crypt8)
+ movi v18.2s, #0x1
+ movi v19.2s, #0x87
+ uzp1 v18.4s, v18.4s, v19.4s
+
+ ld1 {v0.16b-v3.16b}, [x1], #64
+ ld1 {v4.16b-v7.16b}, [x1], #64
+
+ next_tweak v26, v25, v18, v19
+ next_tweak v27, v26, v18, v19
+ next_tweak v28, v27, v18, v19
+ next_tweak v29, v28, v18, v19
+ next_tweak v30, v29, v18, v19
+ next_tweak v31, v30, v18, v19
+ next_tweak v16, v31, v18, v19
+ next_tweak v17, v16, v18, v19
- ld1 {v0.16b}, [x20], #16
- next_tweak v26, v25, v30, v31
eor v0.16b, v0.16b, v25.16b
- tbnz x6, #1, 0f
-
- ld1 {v1.16b}, [x20], #16
- next_tweak v27, v26, v30, v31
eor v1.16b, v1.16b, v26.16b
- tbnz x6, #2, 0f
-
- ld1 {v2.16b}, [x20], #16
- next_tweak v28, v27, v30, v31
eor v2.16b, v2.16b, v27.16b
- tbnz x6, #3, 0f
-
- ld1 {v3.16b}, [x20], #16
- next_tweak v29, v28, v30, v31
eor v3.16b, v3.16b, v28.16b
- tbnz x6, #4, 0f
-
- ld1 {v4.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset]
eor v4.16b, v4.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #5, 0f
-
- ld1 {v5.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset + 16]
- eor v5.16b, v5.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #6, 0f
-
- ld1 {v6.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset + 32]
- eor v6.16b, v6.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #7, 0f
+ eor v5.16b, v5.16b, v30.16b
+ eor v6.16b, v6.16b, v31.16b
+ eor v7.16b, v7.16b, v16.16b
- ld1 {v7.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset + 48]
- eor v7.16b, v7.16b, v29.16b
- next_tweak v29, v29, v30, v31
+ stp q16, q17, [x6]
-0: mov bskey, x21
- mov rounds, x22
- br x7
-ENDPROC(__xts_crypt8)
+ mov bskey, x2
+ mov rounds, x3
+ br x16
+SYM_FUNC_END(__xts_crypt8)
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
- frame_push 6, 64
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x5
+ frame_push 0, 32
+ add x6, sp, #.Lframe_local_offset
-0: movi v30.2s, #0x1
- movi v25.2s, #0x87
- uzp1 v30.4s, v30.4s, v25.4s
- ld1 {v25.16b}, [x24]
+ ld1 {v25.16b}, [x5]
-99: adr x7, \do8
+0: adr x16, \do8
bl __xts_crypt8
- ldp q16, q17, [sp, #.Lframe_local_offset]
- ldp q18, q19, [sp, #.Lframe_local_offset + 32]
-
- eor \o0\().16b, \o0\().16b, v25.16b
- eor \o1\().16b, \o1\().16b, v26.16b
- eor \o2\().16b, \o2\().16b, v27.16b
- eor \o3\().16b, \o3\().16b, v28.16b
-
- st1 {\o0\().16b}, [x19], #16
- mov v25.16b, v26.16b
- tbnz x6, #1, 1f
- st1 {\o1\().16b}, [x19], #16
- mov v25.16b, v27.16b
- tbnz x6, #2, 1f
- st1 {\o2\().16b}, [x19], #16
- mov v25.16b, v28.16b
- tbnz x6, #3, 1f
- st1 {\o3\().16b}, [x19], #16
- mov v25.16b, v29.16b
- tbnz x6, #4, 1f
+ eor v16.16b, \o0\().16b, v25.16b
+ eor v17.16b, \o1\().16b, v26.16b
+ eor v18.16b, \o2\().16b, v27.16b
+ eor v19.16b, \o3\().16b, v28.16b
- eor \o4\().16b, \o4\().16b, v16.16b
- eor \o5\().16b, \o5\().16b, v17.16b
- eor \o6\().16b, \o6\().16b, v18.16b
- eor \o7\().16b, \o7\().16b, v19.16b
+ ldp q24, q25, [x6]
- st1 {\o4\().16b}, [x19], #16
- tbnz x6, #5, 1f
- st1 {\o5\().16b}, [x19], #16
- tbnz x6, #6, 1f
- st1 {\o6\().16b}, [x19], #16
- tbnz x6, #7, 1f
- st1 {\o7\().16b}, [x19], #16
+ eor v20.16b, \o4\().16b, v29.16b
+ eor v21.16b, \o5\().16b, v30.16b
+ eor v22.16b, \o6\().16b, v31.16b
+ eor v23.16b, \o7\().16b, v24.16b
- cbz x23, 1f
- st1 {v25.16b}, [x24]
+ st1 {v16.16b-v19.16b}, [x0], #64
+ st1 {v20.16b-v23.16b}, [x0], #64
- cond_yield_neon 0b
- b 99b
+ subs x4, x4, #8
+ b.gt 0b
-1: st1 {v25.16b}, [x24]
+ st1 {v25.16b}, [x5]
frame_pop
ret
.endm
-ENTRY(aesbs_xts_encrypt)
+SYM_TYPED_FUNC_START(aesbs_xts_encrypt)
__xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
-ENDPROC(aesbs_xts_encrypt)
+SYM_FUNC_END(aesbs_xts_encrypt)
-ENTRY(aesbs_xts_decrypt)
+SYM_TYPED_FUNC_START(aesbs_xts_decrypt)
__xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
-ENDPROC(aesbs_xts_decrypt)
+SYM_FUNC_END(aesbs_xts_decrypt)
.macro next_ctr, v
mov \v\().d[1], x8
@@ -872,134 +818,49 @@ ENDPROC(aesbs_xts_decrypt)
/*
* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- * int rounds, int blocks, u8 iv[], u8 final[])
+ * int rounds, int blocks, u8 iv[])
*/
-ENTRY(aesbs_ctr_encrypt)
- frame_push 8
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x5
- mov x25, x6
-
- cmp x25, #0
- cset x26, ne
- add x23, x23, x26 // do one extra block if final
-
-98: ldp x7, x8, [x24]
- ld1 {v0.16b}, [x24]
+SYM_FUNC_START(aesbs_ctr_encrypt)
+ frame_push 0
+ ldp x7, x8, [x5]
+ ld1 {v0.16b}, [x5]
CPU_LE( rev x7, x7 )
CPU_LE( rev x8, x8 )
adds x8, x8, #1
adc x7, x7, xzr
-99: mov x9, #1
- lsl x9, x9, x23
- subs w23, w23, #8
- csel x23, x23, xzr, pl
- csel x9, x9, xzr, le
-
- tbnz x9, #1, 0f
- next_ctr v1
- tbnz x9, #2, 0f
+0: next_ctr v1
next_ctr v2
- tbnz x9, #3, 0f
next_ctr v3
- tbnz x9, #4, 0f
next_ctr v4
- tbnz x9, #5, 0f
next_ctr v5
- tbnz x9, #6, 0f
next_ctr v6
- tbnz x9, #7, 0f
next_ctr v7
-0: mov bskey, x21
- mov rounds, x22
+ mov bskey, x2
+ mov rounds, x3
bl aesbs_encrypt8
- lsr x9, x9, x26 // disregard the extra block
- tbnz x9, #0, 0f
-
- ld1 {v8.16b}, [x20], #16
- eor v0.16b, v0.16b, v8.16b
- st1 {v0.16b}, [x19], #16
- tbnz x9, #1, 1f
-
- ld1 {v9.16b}, [x20], #16
- eor v1.16b, v1.16b, v9.16b
- st1 {v1.16b}, [x19], #16
- tbnz x9, #2, 2f
-
- ld1 {v10.16b}, [x20], #16
- eor v4.16b, v4.16b, v10.16b
- st1 {v4.16b}, [x19], #16
- tbnz x9, #3, 3f
-
- ld1 {v11.16b}, [x20], #16
- eor v6.16b, v6.16b, v11.16b
- st1 {v6.16b}, [x19], #16
- tbnz x9, #4, 4f
-
- ld1 {v12.16b}, [x20], #16
- eor v3.16b, v3.16b, v12.16b
- st1 {v3.16b}, [x19], #16
- tbnz x9, #5, 5f
-
- ld1 {v13.16b}, [x20], #16
- eor v7.16b, v7.16b, v13.16b
- st1 {v7.16b}, [x19], #16
- tbnz x9, #6, 6f
+ ld1 { v8.16b-v11.16b}, [x1], #64
+ ld1 {v12.16b-v15.16b}, [x1], #64
- ld1 {v14.16b}, [x20], #16
- eor v2.16b, v2.16b, v14.16b
- st1 {v2.16b}, [x19], #16
- tbnz x9, #7, 7f
+ eor v8.16b, v0.16b, v8.16b
+ eor v9.16b, v1.16b, v9.16b
+ eor v10.16b, v4.16b, v10.16b
+ eor v11.16b, v6.16b, v11.16b
+ eor v12.16b, v3.16b, v12.16b
+ eor v13.16b, v7.16b, v13.16b
+ eor v14.16b, v2.16b, v14.16b
+ eor v15.16b, v5.16b, v15.16b
- ld1 {v15.16b}, [x20], #16
- eor v5.16b, v5.16b, v15.16b
- st1 {v5.16b}, [x19], #16
+ st1 { v8.16b-v11.16b}, [x0], #64
+ st1 {v12.16b-v15.16b}, [x0], #64
-8: next_ctr v0
- st1 {v0.16b}, [x24]
- cbz x23, .Lctr_done
+ next_ctr v0
+ subs x4, x4, #8
+ b.gt 0b
- cond_yield_neon 98b
- b 99b
-
-.Lctr_done:
+ st1 {v0.16b}, [x5]
frame_pop
ret
-
- /*
- * If we are handling the tail of the input (x6 != NULL), return the
- * final keystream block back to the caller.
- */
-0: cbz x25, 8b
- st1 {v0.16b}, [x25]
- b 8b
-1: cbz x25, 8b
- st1 {v1.16b}, [x25]
- b 8b
-2: cbz x25, 8b
- st1 {v4.16b}, [x25]
- b 8b
-3: cbz x25, 8b
- st1 {v6.16b}, [x25]
- b 8b
-4: cbz x25, 8b
- st1 {v3.16b}, [x25]
- b 8b
-5: cbz x25, 8b
- st1 {v7.16b}, [x25]
- b 8b
-6: cbz x25, 8b
- st1 {v2.16b}, [x25]
- b 8b
-7: cbz x25, 8b
- st1 {v5.16b}, [x25]
- b 8b
-ENDPROC(aesbs_ctr_encrypt)
+SYM_FUNC_END(aesbs_ctr_encrypt)