1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
|
// SPDX-License-Identifier: GPL-2.0
#include <linux/linkage.h>
#include <asm/cache.h>
#include <asm/assembler.h>
.text
/*
 * Register allocation:
 *  - state0-state3 (v0-v3): the four rows of the ChaCha state matrix
 *    being permuted.
 *  - copy0-copy3 (v4-v7): pristine pre-round state, kept for the final
 *    feed-forward addition; copy3_d is the low 64 bits (the counter).
 *  - one_d/one_q/one_v (d16/q16/v16): holds the constant 1 used to
 *    increment the 64-bit block counter.
 *  - tmp (v17), rot8 (v18): scratch.
 * All registers used are outside v8-v15, which are callee-saved in user
 * space and deliberately avoided here (see header comment below).
 */
#define state0 v0
#define state1 v1
#define state2 v2
#define state3 v3
#define copy0 v4
#define copy0_q q4
#define copy1 v5
#define copy2 v6
#define copy3 v7
#define copy3_d d7
#define one_d d16
#define one_q q16
#define one_v v16
#define tmp v17
#define rot8 v18
/*
 * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
 * number of blocks of output with nonce 0, taking an input key and an
 * 8-byte counter. Importantly, it does not spill to the stack.
*
* This implementation avoids d8-d15 because they are callee-save in user
* space.
*
* void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
* const uint8_t *key,
* uint32_t *counter,
* size_t nblocks)
*
* x0: output bytes
* x1: 32-byte key input
* x2: 8-byte counter input/output
 * x3: number of 64-byte blocks to write to output
*/
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
/* copy0 = "expand 32-byte k" (the fixed ChaCha constant row) */
mov_q x8, 0x3320646e61707865
mov_q x9, 0x6b20657479622d32
mov copy0.d[0], x8
mov copy0.d[1], x9
/* copy1,copy2 = key */
ld1 { copy1.4s, copy2.4s }, [x1]
/* copy3 = counter || zero nonce */
ld1 { copy3.2s }, [x2]
/*
 * Build one_v = { 1, 0, 1, 0 } (.4s): movi sets both 32-bit lanes of the
 * low 64 bits to 1, then uzp1 interleaves the even lanes so that one_d
 * reads as the 64-bit value 1, used below to increment the counter.
 */
movi one_v.2s, #1
uzp1 one_v.4s, one_v.4s, one_v.4s
.Lblock:
/* copy state to auxiliary vectors for the final add after the permute. */
mov state0.16b, copy0.16b
mov state1.16b, copy1.16b
mov state2.16b, copy2.16b
mov state3.16b, copy3.16b
/* w4 = round counter; 20 rounds consumed two at a time per iteration. */
mov w4, 20
.Lpermute:
/*
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers state0-state3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 */
.Ldoubleround:
/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
add state0.4s, state0.4s, state1.4s
eor state3.16b, state3.16b, state0.16b
/* rev32 on halfwords swaps the 16-bit halves of each word == rotl32 by 16 */
rev32 state3.8h, state3.8h
/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
/* shl+sri pair implements a 32-bit rotate without a dedicated rotate insn */
shl state1.4s, tmp.4s, #12
sri state1.4s, tmp.4s, #20
/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
add state0.4s, state0.4s, state1.4s
eor tmp.16b, state3.16b, state0.16b
shl state3.4s, tmp.4s, #8
sri state3.4s, tmp.4s, #24
/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #7
sri state1.4s, tmp.4s, #25
/* Diagonalize for the second (diagonal) round of the pair: */
/* state1[0,1,2,3] = state1[1,2,3,0] */
ext state1.16b, state1.16b, state1.16b, #4
/* state2[0,1,2,3] = state2[2,3,0,1] */
ext state2.16b, state2.16b, state2.16b, #8
/* state3[0,1,2,3] = state3[1,2,3,0] */
ext state3.16b, state3.16b, state3.16b, #12
/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
add state0.4s, state0.4s, state1.4s
eor state3.16b, state3.16b, state0.16b
rev32 state3.8h, state3.8h
/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #12
sri state1.4s, tmp.4s, #20
/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
add state0.4s, state0.4s, state1.4s
eor tmp.16b, state3.16b, state0.16b
shl state3.4s, tmp.4s, #8
sri state3.4s, tmp.4s, #24
/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #7
sri state1.4s, tmp.4s, #25
/* Undo the diagonalization (inverse rotations of the three rows): */
/* state1[0,1,2,3] = state1[3,0,1,2] */
ext state1.16b, state1.16b, state1.16b, #12
/* state2[0,1,2,3] = state2[2,3,0,1] */
ext state2.16b, state2.16b, state2.16b, #8
/* state3[0,1,2,3] = state3[1,2,3,0] */
ext state3.16b, state3.16b, state3.16b, #4
subs w4, w4, #2
b.ne .Ldoubleround
/* Feed-forward: add the saved input state to the permuted state. */
/* output0 = state0 + copy0 */
add state0.4s, state0.4s, copy0.4s
/* output1 = state1 + copy1 */
add state1.4s, state1.4s, copy1.4s
/* output2 = state2 + copy2 */
add state2.4s, state2.4s, copy2.4s
/* output3 = state3 + copy3 */
add state3.4s, state3.4s, copy3.4s
st1 { state0.16b - state3.16b }, [x0]
/*
 * ++copy3.counter, the 'add' clears the upper half of the SIMD register
 * which is the expected behaviour here.
 */
add copy3_d, copy3_d, one_d
/* output += 64, --nblocks */
add x0, x0, 64
subs x3, x3, #1
b.ne .Lblock
/* counter = copy3.counter */
st1 { copy3.2s }, [x2]
/* Zero out the potentially sensitive regs, in case nothing uses these again. */
movi state0.16b, #0
movi state1.16b, #0
movi state2.16b, #0
movi state3.16b, #0
movi copy1.16b, #0
movi copy2.16b, #0
ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
emit_aarch64_feature_1_and
|