Diffstat (limited to 'arch/powerpc/crypto/chacha-p10le-8x.S')
-rw-r--r-- | arch/powerpc/crypto/chacha-p10le-8x.S | 842 |
1 file changed, 0 insertions, 842 deletions
diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S b/arch/powerpc/crypto/chacha-p10le-8x.S
deleted file mode 100644
index 17bedb66b822..000000000000
--- a/arch/powerpc/crypto/chacha-p10le-8x.S
+++ /dev/null
@@ -1,842 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#
-# Accelerated chacha20 implementation for ppc64le.
-#
-# Copyright 2023- IBM Corp. All rights reserved
-#
-#===================================================================================
-# Written by Danny Tsen <dtsen@us.ibm.com>
-#
-# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
-#		  size_t len, int nrounds);
-#
-# do rounds, 8 quarter rounds
-# 1. a += b; d ^= a; d <<<= 16;
-# 2. c += d; b ^= c; b <<<= 12;
-# 3. a += b; d ^= a; d <<<= 8;
-# 4. c += d; b ^= c; b <<<= 7
-#
-# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
-# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
-# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
-# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
-#
-# 4 blocks (a b c d)
-#
-# a0 b0 c0 d0
-# a1 b1 c1 d1
-# ...
-# a4 b4 c4 d4
-# ...
-# a8 b8 c8 d8
-# ...
-# a12 b12 c12 d12
-# a13 ...
-# a14 ...
-# a15 b15 c15 d15
-#
-# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
-# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
-#
-
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/asm-compat.h>
-#include <linux/linkage.h>
-
-.machine "any"
-.text
-
-.macro SAVE_GPR GPR OFFSET FRAME
-	std \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro SAVE_VRS VRS OFFSET FRAME
-	li 16, \OFFSET
-	stvx \VRS, 16, \FRAME
-.endm
-
-.macro SAVE_VSX VSX OFFSET FRAME
-	li 16, \OFFSET
-	stxvx \VSX, 16, \FRAME
-.endm
-
-.macro RESTORE_GPR GPR OFFSET FRAME
-	ld \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro RESTORE_VRS VRS OFFSET FRAME
-	li 16, \OFFSET
-	lvx \VRS, 16, \FRAME
-.endm
-
-.macro RESTORE_VSX VSX OFFSET FRAME
-	li 16, \OFFSET
-	lxvx \VSX, 16, \FRAME
-.endm
-
-.macro SAVE_REGS
-	mflr 0
-	std 0, 16(1)
-	stdu 1,-752(1)
-
-	SAVE_GPR 14, 112, 1
-	SAVE_GPR 15, 120, 1
-	SAVE_GPR 16, 128, 1
-	SAVE_GPR 17, 136, 1
-	SAVE_GPR 18, 144, 1
-	SAVE_GPR 19, 152, 1
-	SAVE_GPR 20, 160, 1
-	SAVE_GPR 21, 168, 1
-	SAVE_GPR 22, 176, 1
-	SAVE_GPR 23, 184, 1
-	SAVE_GPR 24, 192, 1
-	SAVE_GPR 25, 200, 1
-	SAVE_GPR 26, 208, 1
-	SAVE_GPR 27, 216, 1
-	SAVE_GPR 28, 224, 1
-	SAVE_GPR 29, 232, 1
-	SAVE_GPR 30, 240, 1
-	SAVE_GPR 31, 248, 1
-
-	addi 9, 1, 256
-	SAVE_VRS 20, 0, 9
-	SAVE_VRS 21, 16, 9
-	SAVE_VRS 22, 32, 9
-	SAVE_VRS 23, 48, 9
-	SAVE_VRS 24, 64, 9
-	SAVE_VRS 25, 80, 9
-	SAVE_VRS 26, 96, 9
-	SAVE_VRS 27, 112, 9
-	SAVE_VRS 28, 128, 9
-	SAVE_VRS 29, 144, 9
-	SAVE_VRS 30, 160, 9
-	SAVE_VRS 31, 176, 9
-
-	SAVE_VSX 14, 192, 9
-	SAVE_VSX 15, 208, 9
-	SAVE_VSX 16, 224, 9
-	SAVE_VSX 17, 240, 9
-	SAVE_VSX 18, 256, 9
-	SAVE_VSX 19, 272, 9
-	SAVE_VSX 20, 288, 9
-	SAVE_VSX 21, 304, 9
-	SAVE_VSX 22, 320, 9
-	SAVE_VSX 23, 336, 9
-	SAVE_VSX 24, 352, 9
-	SAVE_VSX 25, 368, 9
-	SAVE_VSX 26, 384, 9
-	SAVE_VSX 27, 400, 9
-	SAVE_VSX 28, 416, 9
-	SAVE_VSX 29, 432, 9
-	SAVE_VSX 30, 448, 9
-	SAVE_VSX 31, 464, 9
-.endm # SAVE_REGS
-
-.macro RESTORE_REGS
-	addi 9, 1, 256
-	RESTORE_VRS 20, 0, 9
-	RESTORE_VRS 21, 16, 9
-	RESTORE_VRS 22, 32, 9
-	RESTORE_VRS 23, 48, 9
-	RESTORE_VRS 24, 64, 9
-	RESTORE_VRS 25, 80, 9
-	RESTORE_VRS 26, 96, 9
-	RESTORE_VRS 27, 112, 9
-	RESTORE_VRS 28, 128, 9
-	RESTORE_VRS 29, 144, 9
-	RESTORE_VRS 30, 160, 9
-	RESTORE_VRS 31, 176, 9
-
-	RESTORE_VSX 14, 192, 9
-	RESTORE_VSX 15, 208, 9
-	RESTORE_VSX 16, 224, 9
-	RESTORE_VSX 17, 240, 9
-	RESTORE_VSX 18, 256, 9
-	RESTORE_VSX 19, 272, 9
-	RESTORE_VSX 20, 288, 9
-	RESTORE_VSX 21, 304, 9
-	RESTORE_VSX 22, 320, 9
-	RESTORE_VSX 23, 336, 9
-	RESTORE_VSX 24, 352, 9
-	RESTORE_VSX 25, 368, 9
-	RESTORE_VSX 26, 384, 9
-	RESTORE_VSX 27, 400, 9
-	RESTORE_VSX 28, 416, 9
-	RESTORE_VSX 29, 432, 9
-	RESTORE_VSX 30, 448, 9
-	RESTORE_VSX 31, 464, 9
-
-	RESTORE_GPR 14, 112, 1
-	RESTORE_GPR 15, 120, 1
-	RESTORE_GPR 16, 128, 1
-	RESTORE_GPR 17, 136, 1
-	RESTORE_GPR 18, 144, 1
-	RESTORE_GPR 19, 152, 1
-	RESTORE_GPR 20, 160, 1
-	RESTORE_GPR 21, 168, 1
-	RESTORE_GPR 22, 176, 1
-	RESTORE_GPR 23, 184, 1
-	RESTORE_GPR 24, 192, 1
-	RESTORE_GPR 25, 200, 1
-	RESTORE_GPR 26, 208, 1
-	RESTORE_GPR 27, 216, 1
-	RESTORE_GPR 28, 224, 1
-	RESTORE_GPR 29, 232, 1
-	RESTORE_GPR 30, 240, 1
-	RESTORE_GPR 31, 248, 1
-
-	addi 1, 1, 752
-	ld 0, 16(1)
-	mtlr 0
-.endm # RESTORE_REGS
-
-.macro QT_loop_8x
-	# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
-	xxlor 0, 32+25, 32+25
-	xxlor 32+25, 20, 20
-	vadduwm 0, 0, 4
-	vadduwm 1, 1, 5
-	vadduwm 2, 2, 6
-	vadduwm 3, 3, 7
-	vadduwm 16, 16, 20
-	vadduwm 17, 17, 21
-	vadduwm 18, 18, 22
-	vadduwm 19, 19, 23
-
-	vpermxor 12, 12, 0, 25
-	vpermxor 13, 13, 1, 25
-	vpermxor 14, 14, 2, 25
-	vpermxor 15, 15, 3, 25
-	vpermxor 28, 28, 16, 25
-	vpermxor 29, 29, 17, 25
-	vpermxor 30, 30, 18, 25
-	vpermxor 31, 31, 19, 25
-	xxlor 32+25, 0, 0
-	vadduwm 8, 8, 12
-	vadduwm 9, 9, 13
-	vadduwm 10, 10, 14
-	vadduwm 11, 11, 15
-	vadduwm 24, 24, 28
-	vadduwm 25, 25, 29
-	vadduwm 26, 26, 30
-	vadduwm 27, 27, 31
-	vxor 4, 4, 8
-	vxor 5, 5, 9
-	vxor 6, 6, 10
-	vxor 7, 7, 11
-	vxor 20, 20, 24
-	vxor 21, 21, 25
-	vxor 22, 22, 26
-	vxor 23, 23, 27
-
-	xxlor 0, 32+25, 32+25
-	xxlor 32+25, 21, 21
-	vrlw 4, 4, 25 #
-	vrlw 5, 5, 25
-	vrlw 6, 6, 25
-	vrlw 7, 7, 25
-	vrlw 20, 20, 25 #
-	vrlw 21, 21, 25
-	vrlw 22, 22, 25
-	vrlw 23, 23, 25
-	xxlor 32+25, 0, 0
-	vadduwm 0, 0, 4
-	vadduwm 1, 1, 5
-	vadduwm 2, 2, 6
-	vadduwm 3, 3, 7
-	vadduwm 16, 16, 20
-	vadduwm 17, 17, 21
-	vadduwm 18, 18, 22
-	vadduwm 19, 19, 23
-
-	xxlor 0, 32+25, 32+25
-	xxlor 32+25, 22, 22
-	vpermxor 12, 12, 0, 25
-	vpermxor 13, 13, 1, 25
-	vpermxor 14, 14, 2, 25
-	vpermxor 15, 15, 3, 25
-	vpermxor 28, 28, 16, 25
-	vpermxor 29, 29, 17, 25
-	vpermxor 30, 30, 18, 25
-	vpermxor 31, 31, 19, 25
-	xxlor 32+25, 0, 0
-	vadduwm 8, 8, 12
-	vadduwm 9, 9, 13
-	vadduwm 10, 10, 14
-	vadduwm 11, 11, 15
-	vadduwm 24, 24, 28
-	vadduwm 25, 25, 29
-	vadduwm 26, 26, 30
-	vadduwm 27, 27, 31
-	xxlor 0, 32+28, 32+28
-	xxlor 32+28, 23, 23
-	vxor 4, 4, 8
-	vxor 5, 5, 9
-	vxor 6, 6, 10
-	vxor 7, 7, 11
-	vxor 20, 20, 24
-	vxor 21, 21, 25
-	vxor 22, 22, 26
-	vxor 23, 23, 27
-	vrlw 4, 4, 28 #
-	vrlw 5, 5, 28
-	vrlw 6, 6, 28
-	vrlw 7, 7, 28
-	vrlw 20, 20, 28 #
-	vrlw 21, 21, 28
-	vrlw 22, 22, 28
-	vrlw 23, 23, 28
-	xxlor 32+28, 0, 0
-
-	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
-	xxlor 0, 32+25, 32+25
-	xxlor 32+25, 20, 20
-	vadduwm 0, 0, 5
-	vadduwm 1, 1, 6
-	vadduwm 2, 2, 7
-	vadduwm 3, 3, 4
-	vadduwm 16, 16, 21
-	vadduwm 17, 17, 22
-	vadduwm 18, 18, 23
-	vadduwm 19, 19, 20
-
-	vpermxor 15, 15, 0, 25
-	vpermxor 12, 12, 1, 25
-	vpermxor 13, 13, 2, 25
-	vpermxor 14, 14, 3, 25
-	vpermxor 31, 31, 16, 25
-	vpermxor 28, 28, 17, 25
-	vpermxor 29, 29, 18, 25
-	vpermxor 30, 30, 19, 25
-
-	xxlor 32+25, 0, 0
-	vadduwm 10, 10, 15
-	vadduwm 11, 11, 12
-	vadduwm 8, 8, 13
-	vadduwm 9, 9, 14
-	vadduwm 26, 26, 31
-	vadduwm 27, 27, 28
-	vadduwm 24, 24, 29
-	vadduwm 25, 25, 30
-	vxor 5, 5, 10
-	vxor 6, 6, 11
-	vxor 7, 7, 8
-	vxor 4, 4, 9
-	vxor 21, 21, 26
-	vxor 22, 22, 27
-	vxor 23, 23, 24
-	vxor 20, 20, 25
-
-	xxlor 0, 32+25, 32+25
-	xxlor 32+25, 21, 21
-	vrlw 5, 5, 25
-	vrlw 6, 6, 25
-	vrlw 7, 7, 25
-	vrlw 4, 4, 25
-	vrlw 21, 21, 25
-	vrlw 22, 22, 25
-	vrlw 23, 23, 25
-	vrlw 20, 20, 25
-	xxlor 32+25, 0, 0
-
-	vadduwm 0, 0, 5
-	vadduwm 1, 1, 6
-	vadduwm 2, 2, 7
-	vadduwm 3, 3, 4
-	vadduwm 16, 16, 21
-	vadduwm 17, 17, 22
-	vadduwm 18, 18, 23
-	vadduwm 19, 19, 20
-
-	xxlor 0, 32+25, 32+25
-	xxlor 32+25, 22, 22
-	vpermxor 15, 15, 0, 25
-	vpermxor 12, 12, 1, 25
-	vpermxor 13, 13, 2, 25
-	vpermxor 14, 14, 3, 25
-	vpermxor 31, 31, 16, 25
-	vpermxor 28, 28, 17, 25
-	vpermxor 29, 29, 18, 25
-	vpermxor 30, 30, 19, 25
-	xxlor 32+25, 0, 0
-
-	vadduwm 10, 10, 15
-	vadduwm 11, 11, 12
-	vadduwm 8, 8, 13
-	vadduwm 9, 9, 14
-	vadduwm 26, 26, 31
-	vadduwm 27, 27, 28
-	vadduwm 24, 24, 29
-	vadduwm 25, 25, 30
-
-	xxlor 0, 32+28, 32+28
-	xxlor 32+28, 23, 23
-	vxor 5, 5, 10
-	vxor 6, 6, 11
-	vxor 7, 7, 8
-	vxor 4, 4, 9
-	vxor 21, 21, 26
-	vxor 22, 22, 27
-	vxor 23, 23, 24
-	vxor 20, 20, 25
-	vrlw 5, 5, 28
-	vrlw 6, 6, 28
-	vrlw 7, 7, 28
-	vrlw 4, 4, 28
-	vrlw 21, 21, 28
-	vrlw 22, 22, 28
-	vrlw 23, 23, 28
-	vrlw 20, 20, 28
-	xxlor 32+28, 0, 0
-.endm
-
-.macro QT_loop_4x
-	# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
-	vadduwm 0, 0, 4
-	vadduwm 1, 1, 5
-	vadduwm 2, 2, 6
-	vadduwm 3, 3, 7
-	vpermxor 12, 12, 0, 20
-	vpermxor 13, 13, 1, 20
-	vpermxor 14, 14, 2, 20
-	vpermxor 15, 15, 3, 20
-	vadduwm 8, 8, 12
-	vadduwm 9, 9, 13
-	vadduwm 10, 10, 14
-	vadduwm 11, 11, 15
-	vxor 4, 4, 8
-	vxor 5, 5, 9
-	vxor 6, 6, 10
-	vxor 7, 7, 11
-	vrlw 4, 4, 21
-	vrlw 5, 5, 21
-	vrlw 6, 6, 21
-	vrlw 7, 7, 21
-	vadduwm 0, 0, 4
-	vadduwm 1, 1, 5
-	vadduwm 2, 2, 6
-	vadduwm 3, 3, 7
-	vpermxor 12, 12, 0, 22
-	vpermxor 13, 13, 1, 22
-	vpermxor 14, 14, 2, 22
-	vpermxor 15, 15, 3, 22
-	vadduwm 8, 8, 12
-	vadduwm 9, 9, 13
-	vadduwm 10, 10, 14
-	vadduwm 11, 11, 15
-	vxor 4, 4, 8
-	vxor 5, 5, 9
-	vxor 6, 6, 10
-	vxor 7, 7, 11
-	vrlw 4, 4, 23
-	vrlw 5, 5, 23
-	vrlw 6, 6, 23
-	vrlw 7, 7, 23
-
-	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
-	vadduwm 0, 0, 5
-	vadduwm 1, 1, 6
-	vadduwm 2, 2, 7
-	vadduwm 3, 3, 4
-	vpermxor 15, 15, 0, 20
-	vpermxor 12, 12, 1, 20
-	vpermxor 13, 13, 2, 20
-	vpermxor 14, 14, 3, 20
-	vadduwm 10, 10, 15
-	vadduwm 11, 11, 12
-	vadduwm 8, 8, 13
-	vadduwm 9, 9, 14
-	vxor 5, 5, 10
-	vxor 6, 6, 11
-	vxor 7, 7, 8
-	vxor 4, 4, 9
-	vrlw 5, 5, 21
-	vrlw 6, 6, 21
-	vrlw 7, 7, 21
-	vrlw 4, 4, 21
-	vadduwm 0, 0, 5
-	vadduwm 1, 1, 6
-	vadduwm 2, 2, 7
-	vadduwm 3, 3, 4
-	vpermxor 15, 15, 0, 22
-	vpermxor 12, 12, 1, 22
-	vpermxor 13, 13, 2, 22
-	vpermxor 14, 14, 3, 22
-	vadduwm 10, 10, 15
-	vadduwm 11, 11, 12
-	vadduwm 8, 8, 13
-	vadduwm 9, 9, 14
-	vxor 5, 5, 10
-	vxor 6, 6, 11
-	vxor 7, 7, 8
-	vxor 4, 4, 9
-	vrlw 5, 5, 23
-	vrlw 6, 6, 23
-	vrlw 7, 7, 23
-	vrlw 4, 4, 23
-.endm
-
-# Transpose
-.macro TP_4x a0 a1 a2 a3
-	xxmrghw 10, 32+\a0, 32+\a1	# a0, a1, b0, b1
-	xxmrghw 11, 32+\a2, 32+\a3	# a2, a3, b2, b3
-	xxmrglw 12, 32+\a0, 32+\a1	# c0, c1, d0, d1
-	xxmrglw 13, 32+\a2, 32+\a3	# c2, c3, d2, d3
-	xxpermdi 32+\a0, 10, 11, 0	# a0, a1, a2, a3
-	xxpermdi 32+\a1, 10, 11, 3	# b0, b1, b2, b3
-	xxpermdi 32+\a2, 12, 13, 0	# c0, c1, c2, c3
-	xxpermdi 32+\a3, 12, 13, 3	# d0, d1, d2, d3
-.endm
-
-# key stream = working state + state
-.macro Add_state S
-	vadduwm \S+0, \S+0, 16-\S
-	vadduwm \S+4, \S+4, 17-\S
-	vadduwm \S+8, \S+8, 18-\S
-	vadduwm \S+12, \S+12, 19-\S
-
-	vadduwm \S+1, \S+1, 16-\S
-	vadduwm \S+5, \S+5, 17-\S
-	vadduwm \S+9, \S+9, 18-\S
-	vadduwm \S+13, \S+13, 19-\S
-
-	vadduwm \S+2, \S+2, 16-\S
-	vadduwm \S+6, \S+6, 17-\S
-	vadduwm \S+10, \S+10, 18-\S
-	vadduwm \S+14, \S+14, 19-\S
-
-	vadduwm \S+3, \S+3, 16-\S
-	vadduwm \S+7, \S+7, 17-\S
-	vadduwm \S+11, \S+11, 18-\S
-	vadduwm \S+15, \S+15, 19-\S
-.endm
-
-#
-# write 256 bytes
-#
-.macro Write_256 S
-	add 9, 14, 5
-	add 16, 14, 4
-	lxvw4x 0, 0, 9
-	lxvw4x 1, 17, 9
-	lxvw4x 2, 18, 9
-	lxvw4x 3, 19, 9
-	lxvw4x 4, 20, 9
-	lxvw4x 5, 21, 9
-	lxvw4x 6, 22, 9
-	lxvw4x 7, 23, 9
-	lxvw4x 8, 24, 9
-	lxvw4x 9, 25, 9
-	lxvw4x 10, 26, 9
-	lxvw4x 11, 27, 9
-	lxvw4x 12, 28, 9
-	lxvw4x 13, 29, 9
-	lxvw4x 14, 30, 9
-	lxvw4x 15, 31, 9
-
-	xxlxor \S+32, \S+32, 0
-	xxlxor \S+36, \S+36, 1
-	xxlxor \S+40, \S+40, 2
-	xxlxor \S+44, \S+44, 3
-	xxlxor \S+33, \S+33, 4
-	xxlxor \S+37, \S+37, 5
-	xxlxor \S+41, \S+41, 6
-	xxlxor \S+45, \S+45, 7
-	xxlxor \S+34, \S+34, 8
-	xxlxor \S+38, \S+38, 9
-	xxlxor \S+42, \S+42, 10
-	xxlxor \S+46, \S+46, 11
-	xxlxor \S+35, \S+35, 12
-	xxlxor \S+39, \S+39, 13
-	xxlxor \S+43, \S+43, 14
-	xxlxor \S+47, \S+47, 15
-
-	stxvw4x \S+32, 0, 16
-	stxvw4x \S+36, 17, 16
-	stxvw4x \S+40, 18, 16
-	stxvw4x \S+44, 19, 16
-
-	stxvw4x \S+33, 20, 16
-	stxvw4x \S+37, 21, 16
-	stxvw4x \S+41, 22, 16
-	stxvw4x \S+45, 23, 16
-
-	stxvw4x \S+34, 24, 16
-	stxvw4x \S+38, 25, 16
-	stxvw4x \S+42, 26, 16
-	stxvw4x \S+46, 27, 16
-
-	stxvw4x \S+35, 28, 16
-	stxvw4x \S+39, 29, 16
-	stxvw4x \S+43, 30, 16
-	stxvw4x \S+47, 31, 16
-
-.endm
-
-#
-# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
-#
-SYM_FUNC_START(chacha_p10le_8x)
-.align 5
-	cmpdi 6, 0
-	ble Out_no_chacha
-
-	SAVE_REGS
-
-	# r17 - r31 mainly for Write_256 macro.
-	li 17, 16
-	li 18, 32
-	li 19, 48
-	li 20, 64
-	li 21, 80
-	li 22, 96
-	li 23, 112
-	li 24, 128
-	li 25, 144
-	li 26, 160
-	li 27, 176
-	li 28, 192
-	li 29, 208
-	li 30, 224
-	li 31, 240
-
-	mr 15, 6			# len
-	li 14, 0			# offset to inp and outp
-
-	lxvw4x 48, 0, 3		# vr16, constants
-	lxvw4x 49, 17, 3	# vr17, key 1
-	lxvw4x 50, 18, 3	# vr18, key 2
-	lxvw4x 51, 19, 3	# vr19, counter, nonce
-
-	# create (0, 1, 2, 3) counters
-	vspltisw 0, 0
-	vspltisw 1, 1
-	vspltisw 2, 2
-	vspltisw 3, 3
-	vmrghw 4, 0, 1
-	vmrglw 5, 2, 3
-	vsldoi 30, 4, 5, 8	# vr30 counter, 4 (0, 1, 2, 3)
-
-	vspltisw 21, 12
-	vspltisw 23, 7
-
-	addis 11, 2, permx@toc@ha
-	addi 11, 11, permx@toc@l
-	lxvw4x 32+20, 0, 11
-	lxvw4x 32+22, 17, 11
-
-	sradi 8, 7, 1
-
-	mtctr 8
-
-	# save constants to vsx
-	xxlor 16, 48, 48
-	xxlor 17, 49, 49
-	xxlor 18, 50, 50
-	xxlor 19, 51, 51
-
-	vspltisw 25, 4
-	vspltisw 26, 8
-
-	xxlor 25, 32+26, 32+26
-	xxlor 24, 32+25, 32+25
-
-	vadduwm 31, 30, 25	# counter = (0, 1, 2, 3) + (4, 4, 4, 4)
-	xxlor 30, 32+30, 32+30
-	xxlor 31, 32+31, 32+31
-
-	xxlor 20, 32+20, 32+20
-	xxlor 21, 32+21, 32+21
-	xxlor 22, 32+22, 32+22
-	xxlor 23, 32+23, 32+23
-
-	cmpdi 6, 512
-	blt Loop_last
-
-Loop_8x:
-	xxspltw 32+0, 16, 0
-	xxspltw 32+1, 16, 1
-	xxspltw 32+2, 16, 2
-	xxspltw 32+3, 16, 3
-
-	xxspltw 32+4, 17, 0
-	xxspltw 32+5, 17, 1
-	xxspltw 32+6, 17, 2
-	xxspltw 32+7, 17, 3
-	xxspltw 32+8, 18, 0
-	xxspltw 32+9, 18, 1
-	xxspltw 32+10, 18, 2
-	xxspltw 32+11, 18, 3
-	xxspltw 32+12, 19, 0
-	xxspltw 32+13, 19, 1
-	xxspltw 32+14, 19, 2
-	xxspltw 32+15, 19, 3
-	vadduwm 12, 12, 30	# increase counter
-
-	xxspltw 32+16, 16, 0
-	xxspltw 32+17, 16, 1
-	xxspltw 32+18, 16, 2
-	xxspltw 32+19, 16, 3
-
-	xxspltw 32+20, 17, 0
-	xxspltw 32+21, 17, 1
-	xxspltw 32+22, 17, 2
-	xxspltw 32+23, 17, 3
-	xxspltw 32+24, 18, 0
-	xxspltw 32+25, 18, 1
-	xxspltw 32+26, 18, 2
-	xxspltw 32+27, 18, 3
-	xxspltw 32+28, 19, 0
-	xxspltw 32+29, 19, 1
-	vadduwm 28, 28, 31	# increase counter
-	xxspltw 32+30, 19, 2
-	xxspltw 32+31, 19, 3
-
-.align 5
-quarter_loop_8x:
-	QT_loop_8x
-
-	bdnz quarter_loop_8x
-
-	xxlor 0, 32+30, 32+30
-	xxlor 32+30, 30, 30
-	vadduwm 12, 12, 30
-	xxlor 32+30, 0, 0
-	TP_4x 0, 1, 2, 3
-	TP_4x 4, 5, 6, 7
-	TP_4x 8, 9, 10, 11
-	TP_4x 12, 13, 14, 15
-
-	xxlor 0, 48, 48
-	xxlor 1, 49, 49
-	xxlor 2, 50, 50
-	xxlor 3, 51, 51
-	xxlor 48, 16, 16
-	xxlor 49, 17, 17
-	xxlor 50, 18, 18
-	xxlor 51, 19, 19
-	Add_state 0
-	xxlor 48, 0, 0
-	xxlor 49, 1, 1
-	xxlor 50, 2, 2
-	xxlor 51, 3, 3
-	Write_256 0
-	addi 14, 14, 256	# offset += 256
-	addi 15, 15, -256	# len -= 256
-
-	xxlor 5, 32+31, 32+31
-	xxlor 32+31, 31, 31
-	vadduwm 28, 28, 31
-	xxlor 32+31, 5, 5
-	TP_4x 16+0, 16+1, 16+2, 16+3
-	TP_4x 16+4, 16+5, 16+6, 16+7
-	TP_4x 16+8, 16+9, 16+10, 16+11
-	TP_4x 16+12, 16+13, 16+14, 16+15
-
-	xxlor 32, 16, 16
-	xxlor 33, 17, 17
-	xxlor 34, 18, 18
-	xxlor 35, 19, 19
-	Add_state 16
-	Write_256 16
-	addi 14, 14, 256	# offset += 256
-	addi 15, 15, -256	# len -= 256
-
-	xxlor 32+24, 24, 24
-	xxlor 32+25, 25, 25
-	xxlor 32+30, 30, 30
-	vadduwm 30, 30, 25
-	vadduwm 31, 30, 24
-	xxlor 30, 32+30, 32+30
-	xxlor 31, 32+31, 32+31
-
-	cmpdi 15, 0
-	beq Out_loop
-
-	cmpdi 15, 512
-	blt Loop_last
-
-	mtctr 8
-	b Loop_8x
-
-Loop_last:
-	lxvw4x 48, 0, 3		# vr16, constants
-	lxvw4x 49, 17, 3	# vr17, key 1
-	lxvw4x 50, 18, 3	# vr18, key 2
-	lxvw4x 51, 19, 3	# vr19, counter, nonce
-
-	vspltisw 21, 12
-	vspltisw 23, 7
-	addis 11, 2, permx@toc@ha
-	addi 11, 11, permx@toc@l
-	lxvw4x 32+20, 0, 11
-	lxvw4x 32+22, 17, 11
-
-	sradi 8, 7, 1
-	mtctr 8
-
-Loop_4x:
-	vspltw 0, 16, 0
-	vspltw 1, 16, 1
-	vspltw 2, 16, 2
-	vspltw 3, 16, 3
-
-	vspltw 4, 17, 0
-	vspltw 5, 17, 1
-	vspltw 6, 17, 2
-	vspltw 7, 17, 3
-	vspltw 8, 18, 0
-	vspltw 9, 18, 1
-	vspltw 10, 18, 2
-	vspltw 11, 18, 3
-	vspltw 12, 19, 0
-	vadduwm 12, 12, 30	# increase counter
-	vspltw 13, 19, 1
-	vspltw 14, 19, 2
-	vspltw 15, 19, 3
-
-.align 5
-quarter_loop:
-	QT_loop_4x
-
-	bdnz quarter_loop
-
-	vadduwm 12, 12, 30
-	TP_4x 0, 1, 2, 3
-	TP_4x 4, 5, 6, 7
-	TP_4x 8, 9, 10, 11
-	TP_4x 12, 13, 14, 15
-
-	Add_state 0
-	Write_256 0
-	addi 14, 14, 256	# offset += 256
-	addi 15, 15, -256	# len -= 256
-
-	# Update state counter
-	vspltisw 25, 4
-	vadduwm 30, 30, 25
-
-	cmpdi 15, 0
-	beq Out_loop
-	cmpdi 15, 256
-	blt Out_loop
-
-	mtctr 8
-	b Loop_4x
-
-Out_loop:
-	RESTORE_REGS
-	blr
-
-Out_no_chacha:
-	li 3, 0
-	blr
-SYM_FUNC_END(chacha_p10le_8x)
-
-SYM_DATA_START_LOCAL(PERMX)
-.align 5
-permx:
-.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
-.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
-SYM_DATA_END(PERMX)
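For reference, the round structure that the deleted QT_loop_8x/QT_loop_4x macros unroll across vector registers is the standard ChaCha double round described in the file's header comment. A minimal scalar C sketch (illustrative only, not code from this patch):

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* The four add/xor/rotate steps from the header comment:
 * rotates by 16, 12, 8, and 7. */
static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
	*a += *b; *d ^= *a; *d = ROTL32(*d, 8);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}

/* One double round over the 4x4-word state: quarter rounds down the
 * columns, then down the diagonals.  nrounds/2 iterations give
 * ChaCha20 for nrounds == 20, which is why the assembly halves
 * nrounds ("sradi 8, 7, 1") before loading the count into CTR. */
static void chacha_double_round(uint32_t x[16])
{
	chacha_qr(&x[0], &x[4], &x[8],  &x[12]);	/* column round */
	chacha_qr(&x[1], &x[5], &x[9],  &x[13]);
	chacha_qr(&x[2], &x[6], &x[10], &x[14]);
	chacha_qr(&x[3], &x[7], &x[11], &x[15]);
	chacha_qr(&x[0], &x[5], &x[10], &x[15]);	/* diagonal round */
	chacha_qr(&x[1], &x[6], &x[11], &x[12]);
	chacha_qr(&x[2], &x[7], &x[8],  &x[13]);
	chacha_qr(&x[3], &x[4], &x[9],  &x[14]);
}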
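Because each vector register in the assembly holds one state word across four interleaved blocks, the finished key stream must be transposed back into per-block order before Add_state and Write_256; that is what TP_4x computes with its xxmrghw/xxmrglw/xxpermdi sequence. The same reshuffle in plain C (a hypothetical helper, assuming a 4x4 word matrix):

#include <stdint.h>

/* Rows are the four vectors (one state word each); columns are the
 * four interleaved blocks.  Swapping them regroups the words so each
 * row holds one block's contiguous words. */
static void transpose_4x4(uint32_t m[4][4])
{
	for (int i = 0; i < 4; i++)
		for (int j = i + 1; j < 4; j++) {
			uint32_t t = m[i][j];
			m[i][j] = m[j][i];
			m[j][i] = t;
		}
}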
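Note that the routine consumes only whole 256-byte chunks (512 bytes per Loop_8x pass, 256 per Loop_4x pass) and never stores the updated counter back through the state pointer, so a caller has to advance state[12] itself and hand any sub-256-byte tail to a scalar implementation. A hedged sketch of such glue; chacha_crypt_sketch and chacha_crypt_scalar are illustrative names, not the kernel's actual API:

#include <stddef.h>
#include <stdint.h>

/* Signature of the deleted routine, from its header comment. */
void chacha_p10le_8x(uint32_t *state, uint8_t *dst, const uint8_t *src,
		     size_t len, int nrounds);

/* Assumed scalar fallback for the tail (hypothetical). */
void chacha_crypt_scalar(uint32_t *state, uint8_t *dst, const uint8_t *src,
			 size_t len, int nrounds);

static void chacha_crypt_sketch(uint32_t state[16], uint8_t *dst,
				const uint8_t *src, size_t len)
{
	size_t bulk = len & ~(size_t)255;	/* whole 256-byte chunks */

	if (bulk) {
		chacha_p10le_8x(state, dst, src, bulk, 20);
		state[12] += bulk / 64;		/* 64 bytes per block */
		dst += bulk;
		src += bulk;
		len -= bulk;
	}
	if (len)
		chacha_crypt_scalar(state, dst, src, len, 20);
}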