summaryrefslogtreecommitdiff
path: root/arch/arm/crypto/sha1-armv4-large.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm/crypto/sha1-armv4-large.S')
-rw-r--r--arch/arm/crypto/sha1-armv4-large.S507
1 files changed, 0 insertions, 507 deletions
diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S
deleted file mode 100644
index 1c8b685149f2..000000000000
--- a/arch/arm/crypto/sha1-armv4-large.S
+++ /dev/null
@@ -1,507 +0,0 @@
-#define __ARM_ARCH__ __LINUX_ARM_ARCH__
-@ SPDX-License-Identifier: GPL-2.0
-
-@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
-@ has relicensed it under the GPLv2. Therefore this program is free software;
-@ you can redistribute it and/or modify it under the terms of the GNU General
-@ Public License version 2 as published by the Free Software Foundation.
-@
-@ The original headers, including the original license headers, are
-@ included below for completeness.
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see https://www.openssl.org/~appro/cryptogams/.
-@ ====================================================================
-
-@ sha1_block procedure for ARMv4.
-@
-@ January 2007.
-
-@ Size/performance trade-off
-@ ====================================================================
-@ impl size in bytes comp cycles[*] measured performance
-@ ====================================================================
-@ thumb 304 3212 4420
-@ armv4-small 392/+29% 1958/+64% 2250/+96%
-@ armv4-compact 740/+89% 1552/+26% 1840/+22%
-@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
-@ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
-@ ====================================================================
-@ thumb = same as 'small' but in Thumb instructions[**] and
-@ with recurring code in two private functions;
-@ small = detached Xload/update, loops are folded;
-@ compact = detached Xload/update, 5x unroll;
-@ large = interleaved Xload/update, 5x unroll;
-@ full unroll = interleaved Xload/update, full unroll, estimated[!];
-@
-@ [*] Manually counted instructions in "grand" loop body. Measured
-@ performance is affected by prologue and epilogue overhead,
-@ i-cache availability, branch penalties, etc.
-@ [**] While each Thumb instruction is twice smaller, they are not as
-@ diverse as ARM ones: e.g., there are only two arithmetic
-@ instructions with 3 arguments, no [fixed] rotate, addressing
-@ modes are limited. As result it takes more instructions to do
-@ the same job in Thumb, therefore the code is never twice as
-@ small and always slower.
-@ [***] which is also ~35% better than compiler generated code. Dual-
-@ issue Cortex A8 core was measured to process input block in
-@ ~990 cycles.
-
-@ August 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
-@ Cortex A8 core and in absolute terms ~870 cycles per input block
-@ [or 13.6 cycles per byte].
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 10%
-@ improvement on Cortex A8 core and 12.2 cycles per byte.
-
-#include <linux/linkage.h>
-
-.text
-
-.align 2
-ENTRY(sha1_block_data_order)
- stmdb sp!,{r4-r12,lr}
- add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
- ldmia r0,{r3,r4,r5,r6,r7}
-.Lloop:
- ldr r8,.LK_00_19
- mov r14,sp
- sub sp,sp,#15*4
- mov r5,r5,ror#30
- mov r6,r6,ror#30
- mov r7,r7,ror#30 @ [6]
-.L_00_15:
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r7,r8,r7,ror#2 @ E+=K_00_19
- eor r10,r5,r6 @ F_xx_xx
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r4,r10,ror#2
- add r7,r7,r9 @ E+=X[i]
- eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r7,r7,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r6,r8,r6,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r4,r5 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r6,r8,r6,ror#2 @ E+=K_00_19
- eor r10,r4,r5 @ F_xx_xx
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r3,r10,ror#2
- add r6,r6,r9 @ E+=X[i]
- eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r6,r6,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r5,r8,r5,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r3,r4 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r5,r8,r5,ror#2 @ E+=K_00_19
- eor r10,r3,r4 @ F_xx_xx
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r7,r10,ror#2
- add r5,r5,r9 @ E+=X[i]
- eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r5,r5,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r4,r8,r4,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r7,r3 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r4,r8,r4,ror#2 @ E+=K_00_19
- eor r10,r7,r3 @ F_xx_xx
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r6,r10,ror#2
- add r4,r4,r9 @ E+=X[i]
- eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r4,r4,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r3,r8,r3,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r6,r7 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r3,r8,r3,ror#2 @ E+=K_00_19
- eor r10,r6,r7 @ F_xx_xx
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r5,r10,ror#2
- add r3,r3,r9 @ E+=X[i]
- eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r3,r3,r10 @ E+=F_00_19(B,C,D)
- cmp r14,sp
- bne .L_00_15 @ [((11+4)*5+2)*3]
- sub sp,sp,#25*4
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r7,r8,r7,ror#2 @ E+=K_00_19
- eor r10,r5,r6 @ F_xx_xx
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r4,r10,ror#2
- add r7,r7,r9 @ E+=X[i]
- eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r7,r7,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r3,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
- add r6,r6,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r7,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
- add r5,r5,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r6,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
- add r4,r4,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r5,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
- add r3,r3,r10 @ E+=F_00_19(B,C,D)
-
- ldr r8,.LK_20_39 @ [+15+16*4]
- cmn sp,#0 @ [+3], clear carry to denote 20_39
-.L_20_39_or_60_79:
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r7,r8,r7,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r5,r6 @ F_xx_xx
- mov r9,r9,ror#31
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r4,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r7,r7,r9 @ E+=X[i]
- add r7,r7,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r3,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- add r6,r6,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r7,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- add r5,r5,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r6,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- add r4,r4,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r5,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- add r3,r3,r10 @ E+=F_20_39(B,C,D)
- ARM( teq r14,sp ) @ preserve carry
- THUMB( mov r11,sp )
- THUMB( teq r14,r11 ) @ preserve carry
- bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
- bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
-
- ldr r8,.LK_40_59
- sub sp,sp,#20*4 @ [+2]
-.L_40_59:
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r7,r8,r7,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r5,r6 @ F_xx_xx
- mov r9,r9,ror#31
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r4,r10,ror#2 @ F_xx_xx
- and r11,r5,r6 @ F_xx_xx
- add r7,r7,r9 @ E+=X[i]
- add r7,r7,r10 @ E+=F_40_59(B,C,D)
- add r7,r7,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r3,r10,ror#2 @ F_xx_xx
- and r11,r4,r5 @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- add r6,r6,r10 @ E+=F_40_59(B,C,D)
- add r6,r6,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r7,r10,ror#2 @ F_xx_xx
- and r11,r3,r4 @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- add r5,r5,r10 @ E+=F_40_59(B,C,D)
- add r5,r5,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r6,r10,ror#2 @ F_xx_xx
- and r11,r7,r3 @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- add r4,r4,r10 @ E+=F_40_59(B,C,D)
- add r4,r4,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r5,r10,ror#2 @ F_xx_xx
- and r11,r6,r7 @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- add r3,r3,r10 @ E+=F_40_59(B,C,D)
- add r3,r3,r11,ror#2
- cmp r14,sp
- bne .L_40_59 @ [+((12+5)*5+2)*4]
-
- ldr r8,.LK_60_79
- sub sp,sp,#20*4
- cmp sp,#0 @ set carry to denote 60_79
- b .L_20_39_or_60_79 @ [+4], spare 300 bytes
-.L_done:
- add sp,sp,#80*4 @ "deallocate" stack frame
- ldmia r0,{r8,r9,r10,r11,r12}
- add r3,r8,r3
- add r4,r9,r4
- add r5,r10,r5,ror#2
- add r6,r11,r6,ror#2
- add r7,r12,r7,ror#2
- stmia r0,{r3,r4,r5,r6,r7}
- teq r1,r2
- bne .Lloop @ [+18], total 1307
-
- ldmia sp!,{r4-r12,pc}
-.align 2
-.LK_00_19: .word 0x5a827999
-.LK_20_39: .word 0x6ed9eba1
-.LK_40_59: .word 0x8f1bbcdc
-.LK_60_79: .word 0xca62c1d6
-ENDPROC(sha1_block_data_order)
-.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
-.align 2