Diffstat (limited to 'arch/s390/crypto/chacha-s390.S')
-rw-r--r--  arch/s390/crypto/chacha-s390.S | 907
1 file changed, 907 insertions(+), 0 deletions(-)
diff --git a/arch/s390/crypto/chacha-s390.S b/arch/s390/crypto/chacha-s390.S
new file mode 100644
index 000000000000..badf5c49717d
--- /dev/null
+++ b/arch/s390/crypto/chacha-s390.S
@@ -0,0 +1,907 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Original implementation written by Andy Polyakov, @dot-asm.
+ * This is an adaptation of the original code for kernel use.
+ *
+ * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/nospec-insn.h>
+#include <asm/vx-insn.h>
+
+#define SP %r15
+#define FRAME (16 * 8 + 4 * 8)
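+# 160 bytes: the standard s390 stack frame size (back chain plus the
+# register save area)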
+
+.data
+.align 32
+
+.Lsigma:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
+.long 1,0,0,0
+.long 2,0,0,0
+.long 3,0,0,0
+.long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap
+
+.long 0,1,2,3
+.long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma
+.long 0x3320646e,0x3320646e,0x3320646e,0x3320646e
+.long 0x79622d32,0x79622d32,0x79622d32,0x79622d32
+.long 0x6b206574,0x6b206574,0x6b206574,0x6b206574
+
+.previous
+
+ GEN_BR_THUNK %r14
+
+.text
+
+#############################################################################
+# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
+#                     const u32 *key, const u32 *counter)
+
+#define OUT %r2
+#define INP %r3
+#define LEN %r4
+#define KEY %r5
+#define COUNTER %r6
+
+#define BEPERM %v31
+#define CTR %v26
+
+#define K0 %v16
+#define K1 %v17
+#define K2 %v18
+#define K3 %v19
+
+#define XA0 %v0
+#define XA1 %v1
+#define XA2 %v2
+#define XA3 %v3
+
+#define XB0 %v4
+#define XB1 %v5
+#define XB2 %v6
+#define XB3 %v7
+
+#define XC0 %v8
+#define XC1 %v9
+#define XC2 %v10
+#define XC3 %v11
+
+#define XD0 %v12
+#define XD1 %v13
+#define XD2 %v14
+#define XD3 %v15
+
+#define XT0 %v27
+#define XT1 %v28
+#define XT2 %v29
+#define XT3 %v30
+
+ENTRY(chacha20_vx_4x)
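+	# Compute four 64-byte ChaCha blocks in parallel. The state is kept
+	# "smashed": each vector register holds one state word replicated
+	# across all four blocks, with per-block counters supplied via CTR.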
+ stmg %r6,%r7,6*8(SP)
+
+ larl %r7,.Lsigma
+ lhi %r0,10
+ lhi %r1,0
+
+ VL K0,0,,%r7 # load sigma
+ VL K1,0,,KEY # load key
+ VL K2,16,,KEY
+ VL K3,0,,COUNTER # load counter
+
+ VL BEPERM,0x40,,%r7
+ VL CTR,0x50,,%r7
+
+ VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma
+
+ VREPF XB0,K1,0 # smash the key
+ VREPF XB1,K1,1
+ VREPF XB2,K1,2
+ VREPF XB3,K1,3
+
+ VREPF XD0,K3,0
+ VREPF XD1,K3,1
+ VREPF XD2,K3,2
+ VREPF XD3,K3,3
+ VAF XD0,XD0,CTR
+
+ VREPF XC0,K2,0
+ VREPF XC1,K2,1
+ VREPF XC2,K2,2
+ VREPF XC3,K2,3
+
+.Loop_4x:
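+	# One double round per iteration (ten in total): a column round
+	# followed by a diagonal round. Each quarter round is the usual
+	# ChaCha add/xor/rotate sequence with rotates of 16, 12, 8 and 7.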
+ VAF XA0,XA0,XB0
+ VX XD0,XD0,XA0
+ VERLLF XD0,XD0,16
+
+ VAF XA1,XA1,XB1
+ VX XD1,XD1,XA1
+ VERLLF XD1,XD1,16
+
+ VAF XA2,XA2,XB2
+ VX XD2,XD2,XA2
+ VERLLF XD2,XD2,16
+
+ VAF XA3,XA3,XB3
+ VX XD3,XD3,XA3
+ VERLLF XD3,XD3,16
+
+ VAF XC0,XC0,XD0
+ VX XB0,XB0,XC0
+ VERLLF XB0,XB0,12
+
+ VAF XC1,XC1,XD1
+ VX XB1,XB1,XC1
+ VERLLF XB1,XB1,12
+
+ VAF XC2,XC2,XD2
+ VX XB2,XB2,XC2
+ VERLLF XB2,XB2,12
+
+ VAF XC3,XC3,XD3
+ VX XB3,XB3,XC3
+ VERLLF XB3,XB3,12
+
+ VAF XA0,XA0,XB0
+ VX XD0,XD0,XA0
+ VERLLF XD0,XD0,8
+
+ VAF XA1,XA1,XB1
+ VX XD1,XD1,XA1
+ VERLLF XD1,XD1,8
+
+ VAF XA2,XA2,XB2
+ VX XD2,XD2,XA2
+ VERLLF XD2,XD2,8
+
+ VAF XA3,XA3,XB3
+ VX XD3,XD3,XA3
+ VERLLF XD3,XD3,8
+
+ VAF XC0,XC0,XD0
+ VX XB0,XB0,XC0
+ VERLLF XB0,XB0,7
+
+ VAF XC1,XC1,XD1
+ VX XB1,XB1,XC1
+ VERLLF XB1,XB1,7
+
+ VAF XC2,XC2,XD2
+ VX XB2,XB2,XC2
+ VERLLF XB2,XB2,7
+
+ VAF XC3,XC3,XD3
+ VX XB3,XB3,XC3
+ VERLLF XB3,XB3,7
+
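+	# Diagonal round: with the smashed layout no lane shuffling is
+	# needed; the quarter rounds simply re-pair the registers
+	# (XA0 with XB1/XC2/XD3, and so on cyclically).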
+ VAF XA0,XA0,XB1
+ VX XD3,XD3,XA0
+ VERLLF XD3,XD3,16
+
+ VAF XA1,XA1,XB2
+ VX XD0,XD0,XA1
+ VERLLF XD0,XD0,16
+
+ VAF XA2,XA2,XB3
+ VX XD1,XD1,XA2
+ VERLLF XD1,XD1,16
+
+ VAF XA3,XA3,XB0
+ VX XD2,XD2,XA3
+ VERLLF XD2,XD2,16
+
+ VAF XC2,XC2,XD3
+ VX XB1,XB1,XC2
+ VERLLF XB1,XB1,12
+
+ VAF XC3,XC3,XD0
+ VX XB2,XB2,XC3
+ VERLLF XB2,XB2,12
+
+ VAF XC0,XC0,XD1
+ VX XB3,XB3,XC0
+ VERLLF XB3,XB3,12
+
+ VAF XC1,XC1,XD2
+ VX XB0,XB0,XC1
+ VERLLF XB0,XB0,12
+
+ VAF XA0,XA0,XB1
+ VX XD3,XD3,XA0
+ VERLLF XD3,XD3,8
+
+ VAF XA1,XA1,XB2
+ VX XD0,XD0,XA1
+ VERLLF XD0,XD0,8
+
+ VAF XA2,XA2,XB3
+ VX XD1,XD1,XA2
+ VERLLF XD1,XD1,8
+
+ VAF XA3,XA3,XB0
+ VX XD2,XD2,XA3
+ VERLLF XD2,XD2,8
+
+ VAF XC2,XC2,XD3
+ VX XB1,XB1,XC2
+ VERLLF XB1,XB1,7
+
+ VAF XC3,XC3,XD0
+ VX XB2,XB2,XC3
+ VERLLF XB2,XB2,7
+
+ VAF XC0,XC0,XD1
+ VX XB3,XB3,XC0
+ VERLLF XB3,XB3,7
+
+ VAF XC1,XC1,XD2
+ VX XB0,XB0,XC1
+ VERLLF XB0,XB0,7
+ brct %r0,.Loop_4x
+
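+	# Feed-forward for the counter lane: the per-block increments 0..3
+	# are part of the input state; the remaining input words (K0..K3)
+	# are added back after the transpose below.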
+ VAF XD0,XD0,CTR
+
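+	# 4x4 word transpose (merge high/low plus doubleword permutes)
+	# converts the word-per-lane layout back into whole blocks.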
+ VMRHF XT0,XA0,XA1 # transpose data
+ VMRHF XT1,XA2,XA3
+ VMRLF XT2,XA0,XA1
+ VMRLF XT3,XA2,XA3
+ VPDI XA0,XT0,XT1,0b0000
+ VPDI XA1,XT0,XT1,0b0101
+ VPDI XA2,XT2,XT3,0b0000
+ VPDI XA3,XT2,XT3,0b0101
+
+ VMRHF XT0,XB0,XB1
+ VMRHF XT1,XB2,XB3
+ VMRLF XT2,XB0,XB1
+ VMRLF XT3,XB2,XB3
+ VPDI XB0,XT0,XT1,0b0000
+ VPDI XB1,XT0,XT1,0b0101
+ VPDI XB2,XT2,XT3,0b0000
+ VPDI XB3,XT2,XT3,0b0101
+
+ VMRHF XT0,XC0,XC1
+ VMRHF XT1,XC2,XC3
+ VMRLF XT2,XC0,XC1
+ VMRLF XT3,XC2,XC3
+ VPDI XC0,XT0,XT1,0b0000
+ VPDI XC1,XT0,XT1,0b0101
+ VPDI XC2,XT2,XT3,0b0000
+ VPDI XC3,XT2,XT3,0b0101
+
+ VMRHF XT0,XD0,XD1
+ VMRHF XT1,XD2,XD3
+ VMRLF XT2,XD0,XD1
+ VMRLF XT3,XD2,XD3
+ VPDI XD0,XT0,XT1,0b0000
+ VPDI XD1,XT0,XT1,0b0101
+ VPDI XD2,XT2,XT3,0b0000
+ VPDI XD3,XT2,XT3,0b0101
+
+ VAF XA0,XA0,K0
+ VAF XB0,XB0,K1
+ VAF XC0,XC0,K2
+ VAF XD0,XD0,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+
+ VAF XA0,XA1,K0
+ VAF XB0,XB1,K1
+ VAF XC0,XC1,K2
+ VAF XD0,XD1,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_4x
+
+ VAF XA0,XA2,K0
+ VAF XB0,XB2,K1
+ VAF XC0,XC2,K2
+ VAF XD0,XD2,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_4x
+
+ VAF XA0,XA3,K0
+ VAF XB0,XB3,K1
+ VAF XC0,XC3,K2
+ VAF XD0,XD3,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+.Ldone_4x:
+ lmg %r6,%r7,6*8(SP)
+ BR_EX %r14
+
+.Ltail_4x:
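+	# Fewer than 64 bytes remain: spill one block of keystream into the
+	# register save area and XOR it into the output byte by byte.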
+ VLR XT0,XC0
+ VLR XT1,XD0
+
+ VST XA0,8*8+0x00,,SP
+ VST XB0,8*8+0x10,,SP
+ VST XT0,8*8+0x20,,SP
+ VST XT1,8*8+0x30,,SP
+
+ lghi %r1,0
+
+.Loop_tail_4x:
+ llgc %r5,0(%r1,INP)
+ llgc %r6,8*8(%r1,SP)
+ xr %r6,%r5
+ stc %r6,0(%r1,OUT)
+ la %r1,1(%r1)
+ brct LEN,.Loop_tail_4x
+
+ lmg %r6,%r7,6*8(SP)
+ BR_EX %r14
+ENDPROC(chacha20_vx_4x)
+
+#undef OUT
+#undef INP
+#undef LEN
+#undef KEY
+#undef COUNTER
+
+#undef BEPERM
+
+#undef K0
+#undef K1
+#undef K2
+#undef K3
+
+
+#############################################################################
+# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
+#                  const u32 *key, const u32 *counter)
+
+#define OUT %r2
+#define INP %r3
+#define LEN %r4
+#define KEY %r5
+#define COUNTER %r6
+
+#define BEPERM %v31
+
+#define K0 %v27
+#define K1 %v24
+#define K2 %v25
+#define K3 %v26
+
+#define A0 %v0
+#define B0 %v1
+#define C0 %v2
+#define D0 %v3
+
+#define A1 %v4
+#define B1 %v5
+#define C1 %v6
+#define D1 %v7
+
+#define A2 %v8
+#define B2 %v9
+#define C2 %v10
+#define D2 %v11
+
+#define A3 %v12
+#define B3 %v13
+#define C3 %v14
+#define D3 %v15
+
+#define A4 %v16
+#define B4 %v17
+#define C4 %v18
+#define D4 %v19
+
+#define A5 %v20
+#define B5 %v21
+#define C5 %v22
+#define D5 %v23
+
+#define T0 %v27
+#define T1 %v28
+#define T2 %v29
+#define T3 %v30
+
+ENTRY(chacha20_vx)
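+	# Requests of up to 256 bytes are handed to the four-block path.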
+ .insn rilu,0xc20e00000000,LEN,256 # clgfi LEN,256
+ jle chacha20_vx_4x
+ stmg %r6,%r7,6*8(SP)
+
+ lghi %r1,-FRAME
+ lgr %r0,SP
+ la SP,0(%r1,SP)
+ stg %r0,0(SP) # back-chain
+
+ larl %r7,.Lsigma
+ lhi %r0,10
+
+ VLM K1,K2,0,KEY,0 # load key
+ VL K3,0,,COUNTER # load counter
+
+ VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ...
+
+.Loop_outer_vx:
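+	# Set up six independent states; each outer iteration produces
+	# 6 x 64 = 384 bytes of keystream. Here a register quartet
+	# (An,Bn,Cn,Dn) holds one whole block, so the rounds below need
+	# explicit diagonalization.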
+ VLR A0,K0
+ VLR B0,K1
+ VLR A1,K0
+ VLR B1,K1
+ VLR A2,K0
+ VLR B2,K1
+ VLR A3,K0
+ VLR B3,K1
+ VLR A4,K0
+ VLR B4,K1
+ VLR A5,K0
+ VLR B5,K1
+
+ VLR D0,K3
+ VAF D1,K3,T1 # K[3]+1
+ VAF D2,K3,T2 # K[3]+2
+ VAF D3,K3,T3 # K[3]+3
+ VAF D4,D2,T2 # K[3]+4
+ VAF D5,D2,T3 # K[3]+5
+
+ VLR C0,K2
+ VLR C1,K2
+ VLR C2,K2
+ VLR C3,K2
+ VLR C4,K2
+ VLR C5,K2
+
+ VLR T1,D1
+ VLR T2,D2
+ VLR T3,D3
+
+.Loop_vx:
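+	# Column round over all six states.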
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,16
+ VERLLF D1,D1,16
+ VERLLF D2,D2,16
+ VERLLF D3,D3,16
+ VERLLF D4,D4,16
+ VERLLF D5,D5,16
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,12
+ VERLLF B1,B1,12
+ VERLLF B2,B2,12
+ VERLLF B3,B3,12
+ VERLLF B4,B4,12
+ VERLLF B5,B5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,8
+ VERLLF D1,D1,8
+ VERLLF D2,D2,8
+ VERLLF D3,D3,8
+ VERLLF D4,D4,8
+ VERLLF D5,D5,8
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,7
+ VERLLF B1,B1,7
+ VERLLF B2,B2,7
+ VERLLF B3,B3,7
+ VERLLF B4,B4,7
+ VERLLF B5,B5,7
+
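+	# Diagonalize: rotate B, C and D left by one, two and three words
+	# respectively so the diagonals line up as columns.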
+ VSLDB C0,C0,C0,8
+ VSLDB C1,C1,C1,8
+ VSLDB C2,C2,C2,8
+ VSLDB C3,C3,C3,8
+ VSLDB C4,C4,C4,8
+ VSLDB C5,C5,C5,8
+ VSLDB B0,B0,B0,4
+ VSLDB B1,B1,B1,4
+ VSLDB B2,B2,B2,4
+ VSLDB B3,B3,B3,4
+ VSLDB B4,B4,B4,4
+ VSLDB B5,B5,B5,4
+ VSLDB D0,D0,D0,12
+ VSLDB D1,D1,D1,12
+ VSLDB D2,D2,D2,12
+ VSLDB D3,D3,D3,12
+ VSLDB D4,D4,D4,12
+ VSLDB D5,D5,D5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,16
+ VERLLF D1,D1,16
+ VERLLF D2,D2,16
+ VERLLF D3,D3,16
+ VERLLF D4,D4,16
+ VERLLF D5,D5,16
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,12
+ VERLLF B1,B1,12
+ VERLLF B2,B2,12
+ VERLLF B3,B3,12
+ VERLLF B4,B4,12
+ VERLLF B5,B5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,8
+ VERLLF D1,D1,8
+ VERLLF D2,D2,8
+ VERLLF D3,D3,8
+ VERLLF D4,D4,8
+ VERLLF D5,D5,8
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,7
+ VERLLF B1,B1,7
+ VERLLF B2,B2,7
+ VERLLF B3,B3,7
+ VERLLF B4,B4,7
+ VERLLF B5,B5,7
+
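+	# Un-diagonalize: rotate B, C and D back (by three, two and one
+	# words) to restore column order for the next double round.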
+ VSLDB C0,C0,C0,8
+ VSLDB C1,C1,C1,8
+ VSLDB C2,C2,C2,8
+ VSLDB C3,C3,C3,8
+ VSLDB C4,C4,C4,8
+ VSLDB C5,C5,C5,8
+ VSLDB B0,B0,B0,12
+ VSLDB B1,B1,B1,12
+ VSLDB B2,B2,B2,12
+ VSLDB B3,B3,B3,12
+ VSLDB B4,B4,B4,12
+ VSLDB B5,B5,B5,12
+ VSLDB D0,D0,D0,4
+ VSLDB D1,D1,D1,4
+ VSLDB D2,D2,D2,4
+ VSLDB D3,D3,D3,4
+ VSLDB D4,D4,D4,4
+ VSLDB D5,D5,D5,4
+ brct %r0,.Loop_vx
+
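+	# Feed-forward and output one block at a time, stopping as soon as
+	# LEN is exhausted; D1..D5 pick up their counter increments along
+	# the way.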
+ VAF A0,A0,K0
+ VAF B0,B0,K1
+ VAF C0,C0,K2
+ VAF D0,D0,K3
+ VAF A1,A1,K0
+ VAF D1,D1,T1 # +K[3]+1
+
+ VPERM A0,A0,A0,BEPERM
+ VPERM B0,B0,B0,BEPERM
+ VPERM C0,C0,C0,BEPERM
+ VPERM D0,D0,D0,BEPERM
+
+ .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VAF D2,D2,T2 # +K[3]+2
+ VAF D3,D3,T3 # +K[3]+3
+ VLM T0,T3,0,INP,0
+
+ VX A0,A0,T0
+ VX B0,B0,T1
+ VX C0,C0,T2
+ VX D0,D0,T3
+
+ VLM K0,T3,0,%r7,4 # re-load sigma and increments
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF B1,B1,K1
+ VAF C1,C1,K2
+
+ VPERM A0,A1,A1,BEPERM
+ VPERM B0,B1,B1,BEPERM
+ VPERM C0,C1,C1,BEPERM
+ VPERM D0,D1,D1,BEPERM
+
+ .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A2,A2,K0
+ VAF B2,B2,K1
+ VAF C2,C2,K2
+
+ VPERM A0,A2,A2,BEPERM
+ VPERM B0,B2,B2,BEPERM
+ VPERM C0,C2,C2,BEPERM
+ VPERM D0,D2,D2,BEPERM
+
+ .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A3,A3,K0
+ VAF B3,B3,K1
+ VAF C3,C3,K2
+ VAF D2,K3,T3 # K[3]+3
+
+ VPERM A0,A3,A3,BEPERM
+ VPERM B0,B3,B3,BEPERM
+ VPERM C0,C3,C3,BEPERM
+ VPERM D0,D3,D3,BEPERM
+
+ .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VAF D3,D2,T1 # K[3]+4
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A4,A4,K0
+ VAF B4,B4,K1
+ VAF C4,C4,K2
+ VAF D4,D4,D3 # +K[3]+4
+ VAF D3,D3,T1 # K[3]+5
+ VAF K3,D2,T3 # K[3]+=6
+
+ VPERM A0,A4,A4,BEPERM
+ VPERM B0,B4,B4,BEPERM
+ VPERM C0,C4,C4,BEPERM
+ VPERM D0,D4,D4,BEPERM
+
+ .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A5,A5,K0
+ VAF B5,B5,K1
+ VAF C5,C5,K2
+ VAF D5,D5,D3 # +K[3]+5
+
+ VPERM A0,A5,A5,BEPERM
+ VPERM B0,B5,B5,BEPERM
+ VPERM C0,C5,C5,BEPERM
+ VPERM D0,D5,D5,BEPERM
+
+ .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ lhi %r0,10
+ aghi LEN,-0x40
+ jne .Loop_outer_vx
+
+.Ldone_vx:
+ lmg %r6,%r7,FRAME+6*8(SP)
+ la SP,FRAME(SP)
+ BR_EX %r14
+
+.Ltail_vx:
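+	# Partial final block: store one block of keystream on the stack
+	# and XOR it into the output byte by byte.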
+ VSTM A0,D0,8*8,SP,3
+ lghi %r1,0
+
+.Loop_tail_vx:
+ llgc %r5,0(%r1,INP)
+ llgc %r6,8*8(%r1,SP)
+ xr %r6,%r5
+ stc %r6,0(%r1,OUT)
+ la %r1,1(%r1)
+ brct LEN,.Loop_tail_vx
+
+ lmg %r6,%r7,FRAME+6*8(SP)
+ la SP,FRAME(SP)
+ BR_EX %r14
+ENDPROC(chacha20_vx)
+
+.previous