/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
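/*
 * For reference, one .Ldoubleround iteration below corresponds roughly to the
 * following C, with x[0..15] being the 16 state words (v0 = x[0..3],
 * v1 = x[4..7], v2 = x[8..11], v3 = x[12..15]). Illustration only, not part
 * of the build; rol32() as in <linux/bitops.h>.
 *
 *	#define QR(a, b, c, d) (		\
 *		a += b, d = rol32(d ^ a, 16),	\
 *		c += d, b = rol32(b ^ c, 12),	\
 *		a += b, d = rol32(d ^ a,  8),	\
 *		c += d, b = rol32(b ^ c,  7))
 *
 *	// column round ...
 *	QR(x[0], x[4], x[ 8], x[12]); QR(x[1], x[5], x[ 9], x[13]);
 *	QR(x[2], x[6], x[10], x[14]); QR(x[3], x[7], x[11], x[15]);
 *	// ... followed by a diagonal round
 *	QR(x[0], x[5], x[10], x[15]); QR(x[1], x[6], x[11], x[12]);
 *	QR(x[2], x[7], x[ 8], x[13]); QR(x[3], x[4], x[ 9], x[14]);
 */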
SYM_FUNC_START_LOCAL(chacha_permute)

	adr_l	x10, ROT8
	ld1	{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	rev32	v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #12
	sri	v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	tbl	v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #7
	sri	v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext	v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext	v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext	v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	rev32	v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #12
	sri	v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	tbl	v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #7
	sri	v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext	v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext	v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext	v3.16b, v3.16b, v3.16b, #4

	subs	w3, w3, #2
	b.ne	.Ldoubleround

	ret
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds
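	//
	// Corresponding C prototype (roughly; see the NEON glue code for the
	// authoritative declaration):
	//
	//	void chacha_block_xor_neon(u32 *state, u8 *dst,
	//				   const u8 *src, int nrounds);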

	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	// x0..3 = s0..3
	ld1	{v0.4s-v3.4s}, [x0]
	ld1	{v8.4s-v11.4s}, [x0]

	bl	chacha_permute

	ld1	{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add	v0.4s, v0.4s, v8.4s
	eor	v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add	v1.4s, v1.4s, v9.4s
	eor	v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add	v2.4s, v2.4s, v10.4s
	eor	v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add	v3.4s, v3.4s, v11.4s
	eor	v3.16b, v3.16b, v7.16b

	st1	{v0.16b-v3.16b}, [x1]

	ldp	x29, x30, [sp], #16
	ret
SYM_FUNC_END(chacha_block_xor_neon)

SYM_FUNC_START(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds
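	//
	// Corresponding C prototype (roughly; the exact declaration lives in
	// the NEON glue code):
	//
	//	void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);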

	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	ld1	{v0.4s-v3.4s}, [x0]

	mov	w3, w2
	bl	chacha_permute

	st1	{v0.4s}, [x1], #16
	st1	{v3.4s}, [x1]

	ldp	x29, x30, [sp], #16
	ret
SYM_FUNC_END(hchacha_block_neon)

	a0	.req	w12
	a1	.req	w13
	a2	.req	w14
	a3	.req	w15
	a4	.req	w16
	a5	.req	w17
	a6	.req	w19
	a7	.req	w20
	a8	.req	w21
	a9	.req	w22
	a10	.req	w23
	a11	.req	w24
	a12	.req	w25
	a13	.req	w26
	a14	.req	w27
	a15	.req	w28

	.align		6
SYM_FUNC_START(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count
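	//
	// Corresponding C prototype (roughly; see the NEON glue code):
	//
	//	void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
	//				    int nrounds, int bytes);
	//
	// Despite the name, up to five blocks (320 bytes) are produced per
	// call: four in NEON registers plus one in scalar registers, as
	// described below.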

	adr_l	x10, .Lpermute
	and	x5, x4, #63
	add	x10, x10, x5

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, and hence requires no word shuffling. For the final XOR
	// step we transpose the matrix by interleaving 32-bit and then 64-bit
	// words, which allows us to do the XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers.
	//
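	// Register layout during the rounds (sketch): for n in 0..15, lane b
	// of vN holds word n of NEON block b (counter offsets 1-4), while
	// a0-a15 hold the 16 words of the scalar block (counter offset 0).
	//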
	adr_l	x9, CTRINC		// ... and ROT8
	ld1	{v30.4s-v31.4s}, [x9]

	// x0..15[0-3] = s0..3[0..3]
	add	x8, x0, #16
	ld4r	{ v0.4s- v3.4s}, [x0]
	ld4r	{ v4.4s- v7.4s}, [x8], #16
	ld4r	{ v8.4s-v11.4s}, [x8], #16
	ld4r	{v12.4s-v15.4s}, [x8]

	mov	a0, v0.s[0]
	mov	a1, v1.s[0]
	mov	a2, v2.s[0]
	mov	a3, v3.s[0]
	mov	a4, v4.s[0]
	mov	a5, v5.s[0]
	mov	a6, v6.s[0]
	mov	a7, v7.s[0]
	mov	a8, v8.s[0]
	mov	a9, v9.s[0]
	mov	a10, v10.s[0]
	mov	a11, v11.s[0]
	mov	a12, v12.s[0]
	mov	a13, v13.s[0]
	mov	a14, v14.s[0]
	mov	a15, v15.s[0]

	// x12 += counter values 1-4
	add	v12.4s, v12.4s, v30.4s

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add	v0.4s, v0.4s, v4.4s
	add	a0, a0, a4
	add	v1.4s, v1.4s, v5.4s
	add	a1, a1, a5
	add	v2.4s, v2.4s, v6.4s
	add	a2, a2, a6
	add	v3.4s, v3.4s, v7.4s
	add	a3, a3, a7

	eor	v12.16b, v12.16b, v0.16b
	eor	a12, a12, a0
	eor	v13.16b, v13.16b, v1.16b
	eor	a13, a13, a1
	eor	v14.16b, v14.16b, v2.16b
	eor	a14, a14, a2
	eor	v15.16b, v15.16b, v3.16b
	eor	a15, a15, a3

	rev32	v12.8h, v12.8h
	ror	a12, a12, #16
	rev32	v13.8h, v13.8h
	ror	a13, a13, #16
	rev32	v14.8h, v14.8h
	ror	a14, a14, #16
	rev32	v15.8h, v15.8h
	ror	a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add	v8.4s, v8.4s, v12.4s
	add	a8, a8, a12
	add	v9.4s, v9.4s, v13.4s
	add	a9, a9, a13
	add	v10.4s, v10.4s, v14.4s
	add	a10, a10, a14
	add	v11.4s, v11.4s, v15.4s
	add	a11, a11, a15

	eor	v16.16b, v4.16b, v8.16b
	eor	a4, a4, a8
	eor	v17.16b, v5.16b, v9.16b
	eor	a5, a5, a9
	eor	v18.16b, v6.16b, v10.16b
	eor	a6, a6, a10
	eor	v19.16b, v7.16b, v11.16b
	eor	a7, a7, a11

	shl	v4.4s, v16.4s, #12
	shl	v5.4s, v17.4s, #12
	shl	v6.4s, v18.4s, #12
	shl	v7.4s, v19.4s, #12

	sri	v4.4s, v16.4s, #20
	ror	a4, a4, #20
	sri	v5.4s, v17.4s, #20
	ror	a5, a5, #20
	sri	v6.4s, v18.4s, #20
	ror	a6, a6, #20
	sri	v7.4s, v19.4s, #20
	ror	a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add	v0.4s, v0.4s, v4.4s
	add	a0, a0, a4
	add	v1.4s, v1.4s, v5.4s
	add	a1, a1, a5
	add	v2.4s, v2.4s, v6.4s
	add	a2, a2, a6
	add	v3.4s, v3.4s, v7.4s
	add	a3, a3, a7

	eor	v12.16b, v12.16b, v0.16b
	eor	a12, a12, a0
	eor	v13.16b, v13.16b, v1.16b
	eor	a13, a13, a1
	eor	v14.16b, v14.16b, v2.16b
	eor	a14, a14, a2
	eor	v15.16b, v15.16b, v3.16b
	eor	a15, a15, a3

	tbl	v12.16b, {v12.16b}, v31.16b
	ror	a12, a12, #24
	tbl	v13.16b, {v13.16b}, v31.16b
	ror	a13, a13, #24
	tbl	v14.16b, {v14.16b}, v31.16b
	ror	a14, a14, #24
	tbl	v15.16b, {v15.16b}, v31.16b
	ror	a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add	v8.4s, v8.4s, v12.4s
	add	a8, a8, a12
	add	v9.4s, v9.4s, v13.4s
	add	a9, a9, a13
	add	v10.4s, v10.4s, v14.4s
	add	a10, a10, a14
	add	v11.4s, v11.4s, v15.4s
	add	a11, a11, a15

	eor	v16.16b, v4.16b, v8.16b
	eor	a4, a4, a8
	eor	v17.16b, v5.16b, v9.16b
	eor	a5, a5, a9
	eor	v18.16b, v6.16b, v10.16b
	eor	a6, a6, a10
	eor	v19.16b, v7.16b, v11.16b
	eor	a7, a7, a11

	shl	v4.4s, v16.4s, #7
	shl	v5.4s, v17.4s, #7
	shl	v6.4s, v18.4s, #7
	shl	v7.4s, v19.4s, #7

	sri	v4.4s, v16.4s, #25
	ror	a4, a4, #25
	sri	v5.4s, v17.4s, #25
	ror	a5, a5, #25
	sri	v6.4s, v18.4s, #25
	ror	a6, a6, #25
	sri	v7.4s, v19.4s, #25
	ror	a7, a7, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add	v0.4s, v0.4s, v5.4s
	add	a0, a0, a5
	add	v1.4s, v1.4s, v6.4s
	add	a1, a1, a6
	add	v2.4s, v2.4s, v7.4s
	add	a2, a2, a7
	add	v3.4s, v3.4s, v4.4s
	add	a3, a3, a4

	eor	v15.16b, v15.16b, v0.16b
	eor	a15, a15, a0
	eor	v12.16b, v12.16b, v1.16b
	eor	a12, a12, a1
	eor	v13.16b, v13.16b, v2.16b
	eor	a13, a13, a2
	eor	v14.16b, v14.16b, v3.16b
	eor	a14, a14, a3

	rev32	v15.8h, v15.8h
	ror	a15, a15, #16
	rev32	v12.8h, v12.8h
	ror	a12, a12, #16
	rev32	v13.8h, v13.8h
	ror	a13, a13, #16
	rev32	v14.8h, v14.8h
	ror	a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add	v10.4s, v10.4s, v15.4s
	add	a10, a10, a15
	add	v11.4s, v11.4s, v12.4s
	add	a11, a11, a12
	add	v8.4s, v8.4s, v13.4s
	add	a8, a8, a13
	add	v9.4s, v9.4s, v14.4s
	add	a9, a9, a14

	eor	v16.16b, v5.16b, v10.16b
	eor	a5, a5, a10
	eor	v17.16b, v6.16b, v11.16b
	eor	a6, a6, a11
	eor	v18.16b, v7.16b, v8.16b
	eor	a7, a7, a8
	eor	v19.16b, v4.16b, v9.16b
	eor	a4, a4, a9

	shl	v5.4s, v16.4s, #12
	shl	v6.4s, v17.4s, #12
	shl	v7.4s, v18.4s, #12
	shl	v4.4s, v19.4s, #12

	sri	v5.4s, v16.4s, #20
	ror	a5, a5, #20
	sri	v6.4s, v17.4s, #20
	ror	a6, a6, #20
	sri	v7.4s, v18.4s, #20
	ror	a7, a7, #20
	sri	v4.4s, v19.4s, #20
	ror	a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add	v0.4s, v0.4s, v5.4s
	add	a0, a0, a5
	add	v1.4s, v1.4s, v6.4s
	add	a1, a1, a6
	add	v2.4s, v2.4s, v7.4s
	add	a2, a2, a7
	add	v3.4s, v3.4s, v4.4s
	add	a3, a3, a4

	eor	v15.16b, v15.16b, v0.16b
	eor	a15, a15, a0
	eor	v12.16b, v12.16b, v1.16b
	eor	a12, a12, a1
	eor	v13.16b, v13.16b, v2.16b
	eor	a13, a13, a2
	eor	v14.16b, v14.16b, v3.16b
	eor	a14, a14, a3

	tbl	v15.16b, {v15.16b}, v31.16b
	ror	a15, a15, #24
	tbl	v12.16b, {v12.16b}, v31.16b
	ror	a12, a12, #24
	tbl	v13.16b, {v13.16b}, v31.16b
	ror	a13, a13, #24
	tbl	v14.16b, {v14.16b}, v31.16b
	ror	a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add	v10.4s, v10.4s, v15.4s
	add	a10, a10, a15
	add	v11.4s, v11.4s, v12.4s
	add	a11, a11, a12
	add	v8.4s, v8.4s, v13.4s
	add	a8, a8, a13
	add	v9.4s, v9.4s, v14.4s
	add	a9, a9, a14

	eor	v16.16b, v5.16b, v10.16b
	eor	a5, a5, a10
	eor	v17.16b, v6.16b, v11.16b
	eor	a6, a6, a11
	eor	v18.16b, v7.16b, v8.16b
	eor	a7, a7, a8
	eor	v19.16b, v4.16b, v9.16b
	eor	a4, a4, a9

	shl	v5.4s, v16.4s, #7
	shl	v6.4s, v17.4s, #7
	shl	v7.4s, v18.4s, #7
	shl	v4.4s, v19.4s, #7

	sri	v5.4s, v16.4s, #25
	ror	a5, a5, #25
	sri	v6.4s, v17.4s, #25
	ror	a6, a6, #25
	sri	v7.4s, v18.4s, #25
	ror	a7, a7, #25
	sri	v4.4s, v19.4s, #25
	ror	a4, a4, #25

	subs	w3, w3, #2
	b.ne	.Ldoubleround4

	ld4r	{v16.4s-v19.4s}, [x0], #16
	ld4r	{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 1-4
	add	v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add	v0.4s, v0.4s, v16.4s
	mov	w6, v16.s[0]
	mov	w7, v17.s[0]
	add	v1.4s, v1.4s, v17.4s
	mov	w8, v18.s[0]
	mov	w9, v19.s[0]
	add	v2.4s, v2.4s, v18.4s
	add	a0, a0, w6
	add	a1, a1, w7
	add	v3.4s, v3.4s, v19.4s
	add	a2, a2, w8
	add	a3, a3, w9
CPU_BE(	rev	a0, a0	)
CPU_BE(	rev	a1, a1	)
CPU_BE(	rev	a2, a2	)
CPU_BE(	rev	a3, a3	)

	ld4r	{v24.4s-v27.4s}, [x0], #16
	ld4r	{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add	v4.4s, v4.4s, v20.4s
	mov	w6, v20.s[0]
	mov	w7, v21.s[0]
	add	v5.4s, v5.4s, v21.4s
	mov	w8, v22.s[0]
	mov	w9, v23.s[0]
	add	v6.4s, v6.4s, v22.4s
	add	a4, a4, w6
	add	a5, a5, w7
	add	v7.4s, v7.4s, v23.4s
	add	a6, a6, w8
	add	a7, a7, w9
CPU_BE(	rev	a4, a4	)
CPU_BE(	rev	a5, a5	)
CPU_BE(	rev	a6, a6	)
CPU_BE(	rev	a7, a7	)

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add	v8.4s, v8.4s, v24.4s
	mov	w6, v24.s[0]
	mov	w7, v25.s[0]
	add	v9.4s, v9.4s, v25.4s
	mov	w8, v26.s[0]
	mov	w9, v27.s[0]
	add	v10.4s, v10.4s, v26.4s
	add	a8, a8, w6
	add	a9, a9, w7
	add	v11.4s, v11.4s, v27.4s
	add	a10, a10, w8
	add	a11, a11, w9
CPU_BE(	rev	a8, a8	)
CPU_BE(	rev	a9, a9	)
CPU_BE(	rev	a10, a10	)
CPU_BE(	rev	a11, a11	)

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add	v12.4s, v12.4s, v28.4s
	mov	w6, v28.s[0]
	mov	w7, v29.s[0]
	add	v13.4s, v13.4s, v29.4s
	mov	w8, v30.s[0]
	mov	w9, v31.s[0]
	add	v14.4s, v14.4s, v30.4s
	add	a12, a12, w6
	add	a13, a13, w7
	add	v15.4s, v15.4s, v31.4s
	add	a14, a14, w8
	add	a15, a15, w9
CPU_BE(	rev	a12, a12	)
CPU_BE(	rev	a13, a13	)
CPU_BE(	rev	a14, a14	)
CPU_BE(	rev	a15, a15	)

	// interleave 32-bit words in state n, n+1
	ldp	w6, w7, [x2], #64
	zip1	v16.4s, v0.4s, v1.4s
	ldp	w8, w9, [x2, #-56]
	eor	a0, a0, w6
	zip2	v17.4s, v0.4s, v1.4s
	eor	a1, a1, w7
	zip1	v18.4s, v2.4s, v3.4s
	eor	a2, a2, w8
	zip2	v19.4s, v2.4s, v3.4s
	eor	a3, a3, w9
	ldp	w6, w7, [x2, #-48]
	zip1	v20.4s, v4.4s, v5.4s
	ldp	w8, w9, [x2, #-40]
	eor	a4, a4, w6
	zip2	v21.4s, v4.4s, v5.4s
	eor	a5, a5, w7
	zip1	v22.4s, v6.4s, v7.4s
	eor	a6, a6, w8
	zip2	v23.4s, v6.4s, v7.4s
	eor	a7, a7, w9
	ldp	w6, w7, [x2, #-32]
	zip1	v24.4s, v8.4s, v9.4s
	ldp	w8, w9, [x2, #-24]
	eor	a8, a8, w6
	zip2	v25.4s, v8.4s, v9.4s
	eor	a9, a9, w7
	zip1	v26.4s, v10.4s, v11.4s
	eor	a10, a10, w8
	zip2	v27.4s, v10.4s, v11.4s
	eor	a11, a11, w9
	ldp	w6, w7, [x2, #-16]
	zip1	v28.4s, v12.4s, v13.4s
	ldp	w8, w9, [x2, #-8]
	eor	a12, a12, w6
	zip2	v29.4s, v12.4s, v13.4s
	eor	a13, a13, w7
	zip1	v30.4s, v14.4s, v15.4s
	eor	a14, a14, w8
	zip2	v31.4s, v14.4s, v15.4s
	eor	a15, a15, w9

	add	x3, x2, x4
	sub	x3, x3, #128		// start of last block

	subs	x5, x4, #128
	csel	x2, x2, x3, ge
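
	// If fewer than 128 bytes remain (x5 < 0), the redirect above makes
	// the next 64-byte load read the last 64 bytes of the input instead
	// of running past the end. The same pattern repeats below with
	// x6/x7/x8; a partial tail is fixed up at .Lt128/.Lt192/.Lt256/.Lt320
	// using the .Lpermute table and overlapping stores.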

	// interleave 64-bit words in state n, n+2
	zip1	v0.2d, v16.2d, v18.2d
	zip2	v4.2d, v16.2d, v18.2d
	stp	a0, a1, [x1], #64
	zip1	v8.2d, v17.2d, v19.2d
	zip2	v12.2d, v17.2d, v19.2d
	stp	a2, a3, [x1, #-56]

	subs	x6, x4, #192
	ld1	{v16.16b-v19.16b}, [x2], #64
	csel	x2, x2, x3, ge

	zip1	v1.2d, v20.2d, v22.2d
	zip2	v5.2d, v20.2d, v22.2d
	stp	a4, a5, [x1, #-48]
	zip1	v9.2d, v21.2d, v23.2d
	zip2	v13.2d, v21.2d, v23.2d
	stp	a6, a7, [x1, #-40]

	subs	x7, x4, #256
	ld1	{v20.16b-v23.16b}, [x2], #64
	csel	x2, x2, x3, ge

	zip1	v2.2d, v24.2d, v26.2d
	zip2	v6.2d, v24.2d, v26.2d
	stp	a8, a9, [x1, #-32]
	zip1	v10.2d, v25.2d, v27.2d
	zip2	v14.2d, v25.2d, v27.2d
	stp	a10, a11, [x1, #-24]

	subs	x8, x4, #320
	ld1	{v24.16b-v27.16b}, [x2], #64
	csel	x2, x2, x3, ge

	zip1	v3.2d, v28.2d, v30.2d
	zip2	v7.2d, v28.2d, v30.2d
	stp	a12, a13, [x1, #-16]
	zip1	v11.2d, v29.2d, v31.2d
	zip2	v15.2d, v29.2d, v31.2d
	stp	a14, a15, [x1, #-8]

	tbnz	x5, #63, .Lt128
	ld1	{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b

	tbnz	x6, #63, .Lt192

	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b

	st1	{v16.16b-v19.16b}, [x1], #64
	tbnz	x7, #63, .Lt256

	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b

	st1	{v20.16b-v23.16b}, [x1], #64
	tbnz	x8, #63, .Lt320

	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	st1	{v24.16b-v27.16b}, [x1], #64
	st1	{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret

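	// Tail handling below (sketch): x10 points into .Lpermute at offset
	// (byte count % 64), so the 64 index bytes loaded from there shift
	// the keystream for the final partial block towards the end of a
	// 64-byte register group (tbl indices outside 0..63 yield zero).
	// XORing that with the last 64 input bytes and storing the result
	// over the last 64 output bytes writes the correct tail; the full
	// block stored right after the overlapping store then repairs the
	// bytes it clobbered.
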
	// fewer than 192 bytes of in/output
.Lt192:	cbz	x5, 1f			// exactly 128 bytes?
	ld1	{v28.16b-v31.16b}, [x10]
	add	x5, x5, x1
	tbl	v28.16b, {v4.16b-v7.16b}, v28.16b
	tbl	v29.16b, {v4.16b-v7.16b}, v29.16b
	tbl	v30.16b, {v4.16b-v7.16b}, v30.16b
	tbl	v31.16b, {v4.16b-v7.16b}, v31.16b

0:	eor	v20.16b, v20.16b, v28.16b
	eor	v21.16b, v21.16b, v29.16b
	eor	v22.16b, v22.16b, v30.16b
	eor	v23.16b, v23.16b, v31.16b
	st1	{v20.16b-v23.16b}, [x5]	// overlapping stores
1:	st1	{v16.16b-v19.16b}, [x1]
	b	.Lout

	// fewer than 128 bytes of in/output
.Lt128:	ld1	{v28.16b-v31.16b}, [x10]
	add	x5, x5, x1
	sub	x1, x1, #64
	tbl	v28.16b, {v0.16b-v3.16b}, v28.16b
	tbl	v29.16b, {v0.16b-v3.16b}, v29.16b
	tbl	v30.16b, {v0.16b-v3.16b}, v30.16b
	tbl	v31.16b, {v0.16b-v3.16b}, v31.16b
	ld1	{v16.16b-v19.16b}, [x1]	// reload first output block
	b	0b

	// fewer than 256 bytes of in/output
.Lt256:	cbz	x6, 2f			// exactly 192 bytes?
	ld1	{v4.16b-v7.16b}, [x10]
	add	x6, x6, x1
	tbl	v0.16b, {v8.16b-v11.16b}, v4.16b
	tbl	v1.16b, {v8.16b-v11.16b}, v5.16b
	tbl	v2.16b, {v8.16b-v11.16b}, v6.16b
	tbl	v3.16b, {v8.16b-v11.16b}, v7.16b

	eor	v28.16b, v28.16b, v0.16b
	eor	v29.16b, v29.16b, v1.16b
	eor	v30.16b, v30.16b, v2.16b
	eor	v31.16b, v31.16b, v3.16b
	st1	{v28.16b-v31.16b}, [x6]	// overlapping stores
2:	st1	{v20.16b-v23.16b}, [x1]
	b	.Lout

	// fewer than 320 bytes of in/output
.Lt320:	cbz	x7, 3f			// exactly 256 bytes?
	ld1	{v4.16b-v7.16b}, [x10]
	add	x7, x7, x1
	tbl	v0.16b, {v12.16b-v15.16b}, v4.16b
	tbl	v1.16b, {v12.16b-v15.16b}, v5.16b
	tbl	v2.16b, {v12.16b-v15.16b}, v6.16b
	tbl	v3.16b, {v12.16b-v15.16b}, v7.16b

	eor	v28.16b, v28.16b, v0.16b
	eor	v29.16b, v29.16b, v1.16b
	eor	v30.16b, v30.16b, v2.16b
	eor	v31.16b, v31.16b, v3.16b
	st1	{v28.16b-v31.16b}, [x7]	// overlapping stores
3:	st1	{v24.16b-v27.16b}, [x1]
	b	.Lout
SYM_FUNC_END(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
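// .Lpermute holds the byte values -64..63. Loading 64 of those bytes starting
// at offset (byte count % 64) yields a tbl index vector that moves a partial
// block's keystream bytes to the end of a 64-byte register group, with
// out-of-range indices producing zero bytes; the tail handling above relies
// on this.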
.Lpermute:
	.set		.Li, 0
	.rept		128
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

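// CTRINC supplies the per-block counter offsets 1-4 used for the four NEON
// blocks (the scalar block uses offset 0). ROT8 is a tbl index vector that
// rotates each 32-bit word left by 8 bits as a byte shuffle.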
CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
