Diffstat (limited to 'arch/powerpc/crypto/chacha-p10le-8x.S')
-rw-r--r--  arch/powerpc/crypto/chacha-p10le-8x.S  842
1 file changed, 0 insertions, 842 deletions
diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S b/arch/powerpc/crypto/chacha-p10le-8x.S
deleted file mode 100644
index 17bedb66b822..000000000000
--- a/arch/powerpc/crypto/chacha-p10le-8x.S
+++ /dev/null
@@ -1,842 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#
-# Accelerated chacha20 implementation for ppc64le.
-#
-# Copyright 2023- IBM Corp. All rights reserved
-#
-#===================================================================================
-# Written by Danny Tsen <dtsen@us.ibm.com>
-#
-# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
-# size_t len, int nrounds);
-#
-# do rounds; each double round is 8 quarter rounds, and one quarter round is:
-# 1. a += b; d ^= a; d <<<= 16;
-# 2. c += d; b ^= c; b <<<= 12;
-# 3. a += b; d ^= a; d <<<= 8;
-# 4. c += d; b ^= c; b <<<= 7
-#
-# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
-# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
-# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
-# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
-#
-# 4 blocks (a b c d)
-#
-# a0 b0 c0 d0
-# a1 b1 c1 d1
-# ...
-# a4 b4 c4 d4
-# ...
-# a8 b8 c8 d8
-# ...
-# a12 b12 c12 d12
-# a13 ...
-# a14 ...
-# a15 b15 c15 d15
-#
-# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
-# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
-#
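
For reference, the quarter-round operations and the column/diagonal ordering described in the header above correspond to the following scalar C, given only as a sketch (ROTL32, QR and chacha_doubleround are illustrative names, not part of this file or the kernel glue code):

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    /* One quarter round: the four steps listed in the header comment. */
    #define QR(a, b, c, d) do {                    \
            a += b; d ^= a; d = ROTL32(d, 16);     \
            c += d; b ^= c; b = ROTL32(b, 12);     \
            a += b; d ^= a; d = ROTL32(d, 8);      \
            c += d; b ^= c; b = ROTL32(b, 7);      \
    } while (0)

    /* One double round on a 16-word state: 4 column + 4 diagonal quarter rounds. */
    static void chacha_doubleround(uint32_t x[16])
    {
            QR(x[0], x[4], x[8],  x[12]);   /* column round   */
            QR(x[1], x[5], x[9],  x[13]);
            QR(x[2], x[6], x[10], x[14]);
            QR(x[3], x[7], x[11], x[15]);
            QR(x[0], x[5], x[10], x[15]);   /* diagonal round */
            QR(x[1], x[6], x[11], x[12]);
            QR(x[2], x[7], x[8],  x[13]);
            QR(x[3], x[4], x[9],  x[14]);
    }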
-
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/asm-compat.h>
-#include <linux/linkage.h>
-
-.machine "any"
-.text
-
-.macro SAVE_GPR GPR OFFSET FRAME
- std \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro SAVE_VRS VRS OFFSET FRAME
- li 16, \OFFSET
- stvx \VRS, 16, \FRAME
-.endm
-
-.macro SAVE_VSX VSX OFFSET FRAME
- li 16, \OFFSET
- stxvx \VSX, 16, \FRAME
-.endm
-
-.macro RESTORE_GPR GPR OFFSET FRAME
- ld \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro RESTORE_VRS VRS OFFSET FRAME
- li 16, \OFFSET
- lvx \VRS, 16, \FRAME
-.endm
-
-.macro RESTORE_VSX VSX OFFSET FRAME
- li 16, \OFFSET
- lxvx \VSX, 16, \FRAME
-.endm
-
-.macro SAVE_REGS
- mflr 0
- std 0, 16(1)
- stdu 1,-752(1)
-
- SAVE_GPR 14, 112, 1
- SAVE_GPR 15, 120, 1
- SAVE_GPR 16, 128, 1
- SAVE_GPR 17, 136, 1
- SAVE_GPR 18, 144, 1
- SAVE_GPR 19, 152, 1
- SAVE_GPR 20, 160, 1
- SAVE_GPR 21, 168, 1
- SAVE_GPR 22, 176, 1
- SAVE_GPR 23, 184, 1
- SAVE_GPR 24, 192, 1
- SAVE_GPR 25, 200, 1
- SAVE_GPR 26, 208, 1
- SAVE_GPR 27, 216, 1
- SAVE_GPR 28, 224, 1
- SAVE_GPR 29, 232, 1
- SAVE_GPR 30, 240, 1
- SAVE_GPR 31, 248, 1
-
- addi 9, 1, 256
- SAVE_VRS 20, 0, 9
- SAVE_VRS 21, 16, 9
- SAVE_VRS 22, 32, 9
- SAVE_VRS 23, 48, 9
- SAVE_VRS 24, 64, 9
- SAVE_VRS 25, 80, 9
- SAVE_VRS 26, 96, 9
- SAVE_VRS 27, 112, 9
- SAVE_VRS 28, 128, 9
- SAVE_VRS 29, 144, 9
- SAVE_VRS 30, 160, 9
- SAVE_VRS 31, 176, 9
-
- SAVE_VSX 14, 192, 9
- SAVE_VSX 15, 208, 9
- SAVE_VSX 16, 224, 9
- SAVE_VSX 17, 240, 9
- SAVE_VSX 18, 256, 9
- SAVE_VSX 19, 272, 9
- SAVE_VSX 20, 288, 9
- SAVE_VSX 21, 304, 9
- SAVE_VSX 22, 320, 9
- SAVE_VSX 23, 336, 9
- SAVE_VSX 24, 352, 9
- SAVE_VSX 25, 368, 9
- SAVE_VSX 26, 384, 9
- SAVE_VSX 27, 400, 9
- SAVE_VSX 28, 416, 9
- SAVE_VSX 29, 432, 9
- SAVE_VSX 30, 448, 9
- SAVE_VSX 31, 464, 9
-.endm # SAVE_REGS
-
-.macro RESTORE_REGS
- addi 9, 1, 256
- RESTORE_VRS 20, 0, 9
- RESTORE_VRS 21, 16, 9
- RESTORE_VRS 22, 32, 9
- RESTORE_VRS 23, 48, 9
- RESTORE_VRS 24, 64, 9
- RESTORE_VRS 25, 80, 9
- RESTORE_VRS 26, 96, 9
- RESTORE_VRS 27, 112, 9
- RESTORE_VRS 28, 128, 9
- RESTORE_VRS 29, 144, 9
- RESTORE_VRS 30, 160, 9
- RESTORE_VRS 31, 176, 9
-
- RESTORE_VSX 14, 192, 9
- RESTORE_VSX 15, 208, 9
- RESTORE_VSX 16, 224, 9
- RESTORE_VSX 17, 240, 9
- RESTORE_VSX 18, 256, 9
- RESTORE_VSX 19, 272, 9
- RESTORE_VSX 20, 288, 9
- RESTORE_VSX 21, 304, 9
- RESTORE_VSX 22, 320, 9
- RESTORE_VSX 23, 336, 9
- RESTORE_VSX 24, 352, 9
- RESTORE_VSX 25, 368, 9
- RESTORE_VSX 26, 384, 9
- RESTORE_VSX 27, 400, 9
- RESTORE_VSX 28, 416, 9
- RESTORE_VSX 29, 432, 9
- RESTORE_VSX 30, 448, 9
- RESTORE_VSX 31, 464, 9
-
- RESTORE_GPR 14, 112, 1
- RESTORE_GPR 15, 120, 1
- RESTORE_GPR 16, 128, 1
- RESTORE_GPR 17, 136, 1
- RESTORE_GPR 18, 144, 1
- RESTORE_GPR 19, 152, 1
- RESTORE_GPR 20, 160, 1
- RESTORE_GPR 21, 168, 1
- RESTORE_GPR 22, 176, 1
- RESTORE_GPR 23, 184, 1
- RESTORE_GPR 24, 192, 1
- RESTORE_GPR 25, 200, 1
- RESTORE_GPR 26, 208, 1
- RESTORE_GPR 27, 216, 1
- RESTORE_GPR 28, 224, 1
- RESTORE_GPR 29, 232, 1
- RESTORE_GPR 30, 240, 1
- RESTORE_GPR 31, 248, 1
-
- addi 1, 1, 752
- ld 0, 16(1)
- mtlr 0
-.endm # RESTORE_REGS
-
-.macro QT_loop_8x
- # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 20, 20
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vadduwm 16, 16, 20
- vadduwm 17, 17, 21
- vadduwm 18, 18, 22
- vadduwm 19, 19, 23
-
- vpermxor 12, 12, 0, 25
- vpermxor 13, 13, 1, 25
- vpermxor 14, 14, 2, 25
- vpermxor 15, 15, 3, 25
- vpermxor 28, 28, 16, 25
- vpermxor 29, 29, 17, 25
- vpermxor 30, 30, 18, 25
- vpermxor 31, 31, 19, 25
- xxlor 32+25, 0, 0
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vadduwm 24, 24, 28
- vadduwm 25, 25, 29
- vadduwm 26, 26, 30
- vadduwm 27, 27, 31
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vxor 20, 20, 24
- vxor 21, 21, 25
- vxor 22, 22, 26
- vxor 23, 23, 27
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 21, 21
- vrlw 4, 4, 25 #
- vrlw 5, 5, 25
- vrlw 6, 6, 25
- vrlw 7, 7, 25
- vrlw 20, 20, 25 #
- vrlw 21, 21, 25
- vrlw 22, 22, 25
- vrlw 23, 23, 25
- xxlor 32+25, 0, 0
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vadduwm 16, 16, 20
- vadduwm 17, 17, 21
- vadduwm 18, 18, 22
- vadduwm 19, 19, 23
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 22, 22
- vpermxor 12, 12, 0, 25
- vpermxor 13, 13, 1, 25
- vpermxor 14, 14, 2, 25
- vpermxor 15, 15, 3, 25
- vpermxor 28, 28, 16, 25
- vpermxor 29, 29, 17, 25
- vpermxor 30, 30, 18, 25
- vpermxor 31, 31, 19, 25
- xxlor 32+25, 0, 0
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vadduwm 24, 24, 28
- vadduwm 25, 25, 29
- vadduwm 26, 26, 30
- vadduwm 27, 27, 31
- xxlor 0, 32+28, 32+28
- xxlor 32+28, 23, 23
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vxor 20, 20, 24
- vxor 21, 21, 25
- vxor 22, 22, 26
- vxor 23, 23, 27
- vrlw 4, 4, 28 #
- vrlw 5, 5, 28
- vrlw 6, 6, 28
- vrlw 7, 7, 28
- vrlw 20, 20, 28 #
- vrlw 21, 21, 28
- vrlw 22, 22, 28
- vrlw 23, 23, 28
- xxlor 32+28, 0, 0
-
- # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 20, 20
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vadduwm 16, 16, 21
- vadduwm 17, 17, 22
- vadduwm 18, 18, 23
- vadduwm 19, 19, 20
-
- vpermxor 15, 15, 0, 25
- vpermxor 12, 12, 1, 25
- vpermxor 13, 13, 2, 25
- vpermxor 14, 14, 3, 25
- vpermxor 31, 31, 16, 25
- vpermxor 28, 28, 17, 25
- vpermxor 29, 29, 18, 25
- vpermxor 30, 30, 19, 25
-
- xxlor 32+25, 0, 0
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vadduwm 26, 26, 31
- vadduwm 27, 27, 28
- vadduwm 24, 24, 29
- vadduwm 25, 25, 30
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vxor 21, 21, 26
- vxor 22, 22, 27
- vxor 23, 23, 24
- vxor 20, 20, 25
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 21, 21
- vrlw 5, 5, 25
- vrlw 6, 6, 25
- vrlw 7, 7, 25
- vrlw 4, 4, 25
- vrlw 21, 21, 25
- vrlw 22, 22, 25
- vrlw 23, 23, 25
- vrlw 20, 20, 25
- xxlor 32+25, 0, 0
-
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vadduwm 16, 16, 21
- vadduwm 17, 17, 22
- vadduwm 18, 18, 23
- vadduwm 19, 19, 20
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 22, 22
- vpermxor 15, 15, 0, 25
- vpermxor 12, 12, 1, 25
- vpermxor 13, 13, 2, 25
- vpermxor 14, 14, 3, 25
- vpermxor 31, 31, 16, 25
- vpermxor 28, 28, 17, 25
- vpermxor 29, 29, 18, 25
- vpermxor 30, 30, 19, 25
- xxlor 32+25, 0, 0
-
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vadduwm 26, 26, 31
- vadduwm 27, 27, 28
- vadduwm 24, 24, 29
- vadduwm 25, 25, 30
-
- xxlor 0, 32+28, 32+28
- xxlor 32+28, 23, 23
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vxor 21, 21, 26
- vxor 22, 22, 27
- vxor 23, 23, 24
- vxor 20, 20, 25
- vrlw 5, 5, 28
- vrlw 6, 6, 28
- vrlw 7, 7, 28
- vrlw 4, 4, 28
- vrlw 21, 21, 28
- vrlw 22, 22, 28
- vrlw 23, 23, 28
- vrlw 20, 20, 28
- xxlor 32+28, 0, 0
-.endm
-
-.macro QT_loop_4x
- # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vpermxor 12, 12, 0, 20
- vpermxor 13, 13, 1, 20
- vpermxor 14, 14, 2, 20
- vpermxor 15, 15, 3, 20
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vrlw 4, 4, 21
- vrlw 5, 5, 21
- vrlw 6, 6, 21
- vrlw 7, 7, 21
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vpermxor 12, 12, 0, 22
- vpermxor 13, 13, 1, 22
- vpermxor 14, 14, 2, 22
- vpermxor 15, 15, 3, 22
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vrlw 4, 4, 23
- vrlw 5, 5, 23
- vrlw 6, 6, 23
- vrlw 7, 7, 23
-
- # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vpermxor 15, 15, 0, 20
- vpermxor 12, 12, 1, 20
- vpermxor 13, 13, 2, 20
- vpermxor 14, 14, 3, 20
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vrlw 5, 5, 21
- vrlw 6, 6, 21
- vrlw 7, 7, 21
- vrlw 4, 4, 21
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vpermxor 15, 15, 0, 22
- vpermxor 12, 12, 1, 22
- vpermxor 13, 13, 2, 22
- vpermxor 14, 14, 3, 22
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vrlw 5, 5, 23
- vrlw 6, 6, 23
- vrlw 7, 7, 23
- vrlw 4, 4, 23
-.endm
-
-# Transpose
-.macro TP_4x a0 a1 a2 a3
- xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1
- xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3
- xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1
- xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3
- xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3
- xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3
- xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3
- xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3
-.endm
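
Per its inline comments, TP_4x transposes a 4x4 matrix of 32-bit words held in four vector registers, regrouping the lane-sliced working state (one block per lane) so that each output register holds four consecutive state words of a single block. A scalar sketch of the same operation (tp_4x is an illustrative name):

    #include <stdint.h>

    /* In-place 4x4 transpose of 32-bit words, the scalar equivalent of TP_4x. */
    static void tp_4x(uint32_t v[4][4])
    {
            for (int i = 0; i < 4; i++) {
                    for (int j = i + 1; j < 4; j++) {
                            uint32_t t = v[i][j];

                            v[i][j] = v[j][i];
                            v[j][i] = t;
                    }
            }
    }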
-
-# key stream = working state + state
-.macro Add_state S
- vadduwm \S+0, \S+0, 16-\S
- vadduwm \S+4, \S+4, 17-\S
- vadduwm \S+8, \S+8, 18-\S
- vadduwm \S+12, \S+12, 19-\S
-
- vadduwm \S+1, \S+1, 16-\S
- vadduwm \S+5, \S+5, 17-\S
- vadduwm \S+9, \S+9, 18-\S
- vadduwm \S+13, \S+13, 19-\S
-
- vadduwm \S+2, \S+2, 16-\S
- vadduwm \S+6, \S+6, 17-\S
- vadduwm \S+10, \S+10, 18-\S
- vadduwm \S+14, \S+14, 19-\S
-
- vadduwm \S+3, \S+3, 16-\S
- vadduwm \S+7, \S+7, 17-\S
- vadduwm \S+11, \S+11, 18-\S
- vadduwm \S+15, \S+15, 19-\S
-.endm
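
In scalar terms Add_state is the standard ChaCha finalization: the keystream block is the working state plus the original input state, word by word. A minimal sketch (add_state is an illustrative name):

    #include <stdint.h>

    /* Keystream = working state + input state, as the macro's comment says. */
    static void add_state(uint32_t x[16], const uint32_t state[16])
    {
            for (int i = 0; i < 16; i++)
                    x[i] += state[i];
    }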
-
-#
-# write 256 bytes
-#
-.macro Write_256 S
- add 9, 14, 5
- add 16, 14, 4
- lxvw4x 0, 0, 9
- lxvw4x 1, 17, 9
- lxvw4x 2, 18, 9
- lxvw4x 3, 19, 9
- lxvw4x 4, 20, 9
- lxvw4x 5, 21, 9
- lxvw4x 6, 22, 9
- lxvw4x 7, 23, 9
- lxvw4x 8, 24, 9
- lxvw4x 9, 25, 9
- lxvw4x 10, 26, 9
- lxvw4x 11, 27, 9
- lxvw4x 12, 28, 9
- lxvw4x 13, 29, 9
- lxvw4x 14, 30, 9
- lxvw4x 15, 31, 9
-
- xxlxor \S+32, \S+32, 0
- xxlxor \S+36, \S+36, 1
- xxlxor \S+40, \S+40, 2
- xxlxor \S+44, \S+44, 3
- xxlxor \S+33, \S+33, 4
- xxlxor \S+37, \S+37, 5
- xxlxor \S+41, \S+41, 6
- xxlxor \S+45, \S+45, 7
- xxlxor \S+34, \S+34, 8
- xxlxor \S+38, \S+38, 9
- xxlxor \S+42, \S+42, 10
- xxlxor \S+46, \S+46, 11
- xxlxor \S+35, \S+35, 12
- xxlxor \S+39, \S+39, 13
- xxlxor \S+43, \S+43, 14
- xxlxor \S+47, \S+47, 15
-
- stxvw4x \S+32, 0, 16
- stxvw4x \S+36, 17, 16
- stxvw4x \S+40, 18, 16
- stxvw4x \S+44, 19, 16
-
- stxvw4x \S+33, 20, 16
- stxvw4x \S+37, 21, 16
- stxvw4x \S+41, 22, 16
- stxvw4x \S+45, 23, 16
-
- stxvw4x \S+34, 24, 16
- stxvw4x \S+38, 25, 16
- stxvw4x \S+42, 26, 16
- stxvw4x \S+46, 27, 16
-
- stxvw4x \S+35, 28, 16
- stxvw4x \S+39, 29, 16
- stxvw4x \S+43, 30, 16
- stxvw4x \S+47, 31, 16
-
-.endm
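
Write_256 then consumes the keystream: it loads 256 bytes of source, XORs them with the 16 keystream vectors, and stores 256 bytes to the destination at the current offset. A byte-wise sketch of the same step (write_256 is an illustrative name; the real macro works on sixteen 16-byte vectors):

    #include <stdint.h>

    /* dst[0..255] = src[0..255] ^ keystream[0..255], the scalar view of Write_256. */
    static void write_256(uint8_t *dst, const uint8_t *src,
                          const uint8_t keystream[256])
    {
            for (int i = 0; i < 256; i++)
                    dst[i] = src[i] ^ keystream[i];
    }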
-
-#
-# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
-#
-SYM_FUNC_START(chacha_p10le_8x)
-.align 5
- cmpdi 6, 0
- ble Out_no_chacha
-
- SAVE_REGS
-
- # r17 - r31 mainly for Write_256 macro.
- li 17, 16
- li 18, 32
- li 19, 48
- li 20, 64
- li 21, 80
- li 22, 96
- li 23, 112
- li 24, 128
- li 25, 144
- li 26, 160
- li 27, 176
- li 28, 192
- li 29, 208
- li 30, 224
- li 31, 240
-
- mr 15, 6 # len
- li 14, 0 # offset to inp and outp
-
- lxvw4x 48, 0, 3 # vr16, constants
- lxvw4x 49, 17, 3 # vr17, key 1
- lxvw4x 50, 18, 3 # vr18, key 2
- lxvw4x 51, 19, 3 # vr19, counter, nonce
-
- # create (0, 1, 2, 3) counters
- vspltisw 0, 0
- vspltisw 1, 1
- vspltisw 2, 2
- vspltisw 3, 3
- vmrghw 4, 0, 1
- vmrglw 5, 2, 3
- vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3)
-
- vspltisw 21, 12
- vspltisw 23, 7
-
- addis 11, 2, permx@toc@ha
- addi 11, 11, permx@toc@l
- lxvw4x 32+20, 0, 11
- lxvw4x 32+22, 17, 11
-
- sradi 8, 7, 1
-
- mtctr 8
-
- # save constants to vsx
- xxlor 16, 48, 48
- xxlor 17, 49, 49
- xxlor 18, 50, 50
- xxlor 19, 51, 51
-
- vspltisw 25, 4
- vspltisw 26, 8
-
- xxlor 25, 32+26, 32+26
- xxlor 24, 32+25, 32+25
-
- vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4)
- xxlor 30, 32+30, 32+30
- xxlor 31, 32+31, 32+31
-
- xxlor 20, 32+20, 32+20
- xxlor 21, 32+21, 32+21
- xxlor 22, 32+22, 32+22
- xxlor 23, 32+23, 32+23
-
- cmpdi 6, 512
- blt Loop_last
-
-Loop_8x:
- xxspltw 32+0, 16, 0
- xxspltw 32+1, 16, 1
- xxspltw 32+2, 16, 2
- xxspltw 32+3, 16, 3
-
- xxspltw 32+4, 17, 0
- xxspltw 32+5, 17, 1
- xxspltw 32+6, 17, 2
- xxspltw 32+7, 17, 3
- xxspltw 32+8, 18, 0
- xxspltw 32+9, 18, 1
- xxspltw 32+10, 18, 2
- xxspltw 32+11, 18, 3
- xxspltw 32+12, 19, 0
- xxspltw 32+13, 19, 1
- xxspltw 32+14, 19, 2
- xxspltw 32+15, 19, 3
- vadduwm 12, 12, 30 # increase counter
-
- xxspltw 32+16, 16, 0
- xxspltw 32+17, 16, 1
- xxspltw 32+18, 16, 2
- xxspltw 32+19, 16, 3
-
- xxspltw 32+20, 17, 0
- xxspltw 32+21, 17, 1
- xxspltw 32+22, 17, 2
- xxspltw 32+23, 17, 3
- xxspltw 32+24, 18, 0
- xxspltw 32+25, 18, 1
- xxspltw 32+26, 18, 2
- xxspltw 32+27, 18, 3
- xxspltw 32+28, 19, 0
- xxspltw 32+29, 19, 1
- vadduwm 28, 28, 31 # increase counter
- xxspltw 32+30, 19, 2
- xxspltw 32+31, 19, 3
-
-.align 5
-quarter_loop_8x:
- QT_loop_8x
-
- bdnz quarter_loop_8x
-
- xxlor 0, 32+30, 32+30
- xxlor 32+30, 30, 30
- vadduwm 12, 12, 30
- xxlor 32+30, 0, 0
- TP_4x 0, 1, 2, 3
- TP_4x 4, 5, 6, 7
- TP_4x 8, 9, 10, 11
- TP_4x 12, 13, 14, 15
-
- xxlor 0, 48, 48
- xxlor 1, 49, 49
- xxlor 2, 50, 50
- xxlor 3, 51, 51
- xxlor 48, 16, 16
- xxlor 49, 17, 17
- xxlor 50, 18, 18
- xxlor 51, 19, 19
- Add_state 0
- xxlor 48, 0, 0
- xxlor 49, 1, 1
- xxlor 50, 2, 2
- xxlor 51, 3, 3
- Write_256 0
- addi 14, 14, 256 # offset +=256
- addi 15, 15, -256 # len -=256
-
- xxlor 5, 32+31, 32+31
- xxlor 32+31, 31, 31
- vadduwm 28, 28, 31
- xxlor 32+31, 5, 5
- TP_4x 16+0, 16+1, 16+2, 16+3
- TP_4x 16+4, 16+5, 16+6, 16+7
- TP_4x 16+8, 16+9, 16+10, 16+11
- TP_4x 16+12, 16+13, 16+14, 16+15
-
- xxlor 32, 16, 16
- xxlor 33, 17, 17
- xxlor 34, 18, 18
- xxlor 35, 19, 19
- Add_state 16
- Write_256 16
- addi 14, 14, 256 # offset +=256
- addi 15, 15, -256 # len -=256
-
- xxlor 32+24, 24, 24
- xxlor 32+25, 25, 25
- xxlor 32+30, 30, 30
- vadduwm 30, 30, 25
- vadduwm 31, 30, 24
- xxlor 30, 32+30, 32+30
- xxlor 31, 32+31, 32+31
-
- cmpdi 15, 0
- beq Out_loop
-
- cmpdi 15, 512
- blt Loop_last
-
- mtctr 8
- b Loop_8x
-
-Loop_last:
- lxvw4x 48, 0, 3 # vr16, constants
- lxvw4x 49, 17, 3 # vr17, key 1
- lxvw4x 50, 18, 3 # vr18, key 2
- lxvw4x 51, 19, 3 # vr19, counter, nonce
-
- vspltisw 21, 12
- vspltisw 23, 7
- addis 11, 2, permx@toc@ha
- addi 11, 11, permx@toc@l
- lxvw4x 32+20, 0, 11
- lxvw4x 32+22, 17, 11
-
- sradi 8, 7, 1
- mtctr 8
-
-Loop_4x:
- vspltw 0, 16, 0
- vspltw 1, 16, 1
- vspltw 2, 16, 2
- vspltw 3, 16, 3
-
- vspltw 4, 17, 0
- vspltw 5, 17, 1
- vspltw 6, 17, 2
- vspltw 7, 17, 3
- vspltw 8, 18, 0
- vspltw 9, 18, 1
- vspltw 10, 18, 2
- vspltw 11, 18, 3
- vspltw 12, 19, 0
- vadduwm 12, 12, 30 # increase counter
- vspltw 13, 19, 1
- vspltw 14, 19, 2
- vspltw 15, 19, 3
-
-.align 5
-quarter_loop:
- QT_loop_4x
-
- bdnz quarter_loop
-
- vadduwm 12, 12, 30
- TP_4x 0, 1, 2, 3
- TP_4x 4, 5, 6, 7
- TP_4x 8, 9, 10, 11
- TP_4x 12, 13, 14, 15
-
- Add_state 0
- Write_256 0
- addi 14, 14, 256 # offset += 256
- addi 15, 15, -256 # len -= 256
-
- # Update state counter
- vspltisw 25, 4
- vadduwm 30, 30, 25
-
- cmpdi 15, 0
- beq Out_loop
- cmpdi 15, 256
- blt Out_loop
-
- mtctr 8
- b Loop_4x
-
-Out_loop:
- RESTORE_REGS
- blr
-
-Out_no_chacha:
- li 3, 0
- blr
-SYM_FUNC_END(chacha_p10le_8x)
-
-SYM_DATA_START_LOCAL(PERMX)
-.align 5
-permx:
-.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
-.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
-SYM_DATA_END(PERMX)
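
The two PERMX rows are the byte-selector tables loaded into vr20 and vr22 and used by the vpermxor instructions above: because the ChaCha rotations by 16 and by 8 bits are byte-aligned, the XOR (d ^= a) and the rotate can be fused into a single byte permute, while the rotations by 12 and by 7 use vrlw instead. A scalar sketch of the fused xor-and-byte-rotate, assuming little-endian 32-bit words (xor_rotl_bytes is an illustrative name):

    #include <stdint.h>
    #include <string.h>

    /* (d ^ a) rotated left by 8*nbytes bits, expressed as a byte shuffle;
     * nbytes = 2 corresponds to <<< 16 (first PERMX row), nbytes = 1 to
     * <<< 8 (second PERMX row). */
    static uint32_t xor_rotl_bytes(uint32_t d, uint32_t a, int nbytes)
    {
            uint8_t in[4], out[4];
            uint32_t x = d ^ a, r;
            int i;

            memcpy(in, &x, 4);                      /* little-endian byte view */
            for (i = 0; i < 4; i++)
                    out[(i + nbytes) & 3] = in[i];  /* rotate left by 8*nbytes */
            memcpy(&r, out, 4);
            return r;
    }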