1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
|
// SPDX-License-Identifier: GPL-2.0
#include <linux/linkage.h>
#include <asm/cache.h>
#include <asm/assembler.h>
.text
/*
 * Register allocation:
 *  - state0-state3 (v0-v3): the four rows of the ChaCha state matrix
 *    being permuted.
 *  - copy0-copy3 (v4-v7): pristine pre-round state, kept for the final
 *    feed-forward addition; copy3_d is the low 64 bits (the counter).
 *  - one_d/one_q/one_v (d16/q16/v16): holds the constant 1 used to
 *    increment the 64-bit block counter.
 *  - tmp (v17), rot8 (v18): scratch.
 * All registers used are outside v8-v15, which are callee-saved in user
 * space and deliberately avoided here (see header comment below).
 */
#define state0 v0
#define state1 v1
#define state2 v2
#define state3 v3
#define copy0 v4
#define copy0_q q4
#define copy1 v5
#define copy2 v6
#define copy3 v7
#define copy3_d d7
#define one_d d16
#define one_q q16
#define one_v v16
#define tmp v17
#define rot8 v18
/*
 * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
 * number of blocks of output with nonce 0, taking an input key and an
 * 8-byte counter. Importantly, it does not spill to the stack.
*
* This implementation avoids d8-d15 because they are callee-save in user
* space.
*
* void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
* const uint8_t *key,
* uint32_t *counter,
* size_t nblocks)
*
* x0: output bytes
* x1: 32-byte key input
* x2: 8-byte counter input/output
 * x3: number of 64-byte blocks to write to output
*/
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
/* copy0 = "expand 32-byte k" (the fixed ChaCha constant row) */
mov_q x8, 0x3320646e61707865
mov_q x9, 0x6b20657479622d32
mov copy0.d[0], x8
mov copy0.d[1], x9
/* copy1,copy2 = key */
ld1 { copy1.4s, copy2.4s }, [x1]
/* copy3 = counter || zero nonce */
ld1 { copy3.2s }, [x2]
/*
 * Build one_v = { 1, 0, 1, 0 } (.4s): movi sets both 32-bit lanes of the
 * low 64 bits to 1, then uzp1 interleaves the even lanes so that one_d
 * reads as the 64-bit value 1, used below to increment the counter.
 */
movi one_v.2s, #1
uzp1 one_v.4s, one_v.4s, one_v.4s
.Lblock:
/* copy state to auxiliary vectors for the final add after the permute. */
mov state0.16b, copy0.16b
mov state1.16b, copy1.16b
mov state2.16b, copy2.16b
mov state3.16b, copy3.16b
/* w4 = round counter; 20 rounds consumed two at a time per iteration. */
mov w4, 20
.Lpermute:
/*
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers state0-state3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 */
.Ldoubleround:
/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
add state0.4s, state0.4s, state1.4s
eor state3.16b, state3.16b, state0.16b
/* rev32 on halfwords swaps the 16-bit halves of each word == rotl32 by 16 */
rev32 state3.8h, state3.8h
/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
/* shl+sri pair implements a 32-bit rotate without a dedicated rotate insn */
shl state1.4s, tmp.4s, #12
sri state1.4s, tmp.4s, #20
/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
add state0.4s, state0.4s, state1.4s
eor tmp.16b, state3.16b, state0.16b
shl state3.4s, tmp.4s, #8
sri state3.4s, tmp.4s, #24
/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #7
sri state1.4s, tmp.4s, #25
/* Diagonalize for the second (diagonal) round of the pair: */
/* state1[0,1,2,3] = state1[1,2,3,0] */
ext state1.16b, state1.16b, state1.16b, #4
/* state2[0,1,2,3] = state2[2,3,0,1] */
ext state2.16b, state2.16b, state2.16b, #8
/* state3[0,1,2,3] = state3[1,2,3,0] */
ext state3.16b, state3.16b, state3.16b, #12
/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
add state0.4s, state0.4s, state1.4s
eor state3.16b, state3.16b, state0.16b
rev32 state3.8h, state3.8h
/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #12
sri state1.4s, tmp.4s, #20
/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
add state0.4s, state0.4s, state1.4s
eor tmp.16b, state3.16b, state0.16b
shl state3.4s, tmp.4s, #8
sri state3.4s, tmp.4s, #24
/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #7
sri state1.4s, tmp.4s, #25
/* Undo the diagonalization (inverse rotations of the three rows): */
/* state1[0,1,2,3] = state1[3,0,1,2] */
ext state1.16b, state1.16b, state1.16b, #12
/* state2[0,1,2,3] = state2[2,3,0,1] */
ext state2.16b, state2.16b, state2.16b, #8
/* state3[0,1,2,3] = state3[1,2,3,0] */
ext state3.16b, state3.16b, state3.16b, #4
subs w4, w4, #2
b.ne .Ldoubleround
/* Feed-forward: add the saved input state to the permuted state. */
/* output0 = state0 + copy0 */
add state0.4s, state0.4s, copy0.4s
/* output1 = state1 + copy1 */
add state1.4s, state1.4s, copy1.4s
/* output2 = state2 + copy2 */
add state2.4s, state2.4s, copy2.4s
/* output3 = state3 + copy3 */
add state3.4s, state3.4s, copy3.4s
st1 { state0.16b - state3.16b }, [x0]
/*
 * ++copy3.counter, the 'add' clears the upper half of the SIMD register
 * which is the expected behaviour here.
 */
add copy3_d, copy3_d, one_d
/* output += 64, --nblocks */
add x0, x0, 64
subs x3, x3, #1
b.ne .Lblock
/* counter = copy3.counter */
st1 { copy3.2s }, [x2]
/* Zero out the potentially sensitive regs, in case nothing uses these again. */
movi state0.16b, #0
movi state1.16b, #0
movi state2.16b, #0
movi state3.16b, #0
movi copy1.16b, #0
movi copy2.16b, #0
ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
emit_aarch64_feature_1_and
|