/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#define MASK_U32 0x3c
#define CHACHA20_BLOCK_SIZE 64
#define STACK_SIZE 32
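
/* MASK_U32 keeps bits 2-5 of a byte count (values 0-60), i.e. it rounds a
 * count below 64 down to whole 32-bit words. STACK_SIZE is the 32 bytes
 * needed to spill the eight callee-saved registers $s0-$s7.
 */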

#define X0 $t0
#define X1 $t1
#define X2 $t2
#define X3 $t3
#define X4 $t4
#define X5 $t5
#define X6 $t6
#define X7 $t7
#define X8 $t8
#define X9 $t9
#define X10 $v1
#define X11 $s6
#define X12 $s5
#define X13 $s4
#define X14 $s3
#define X15 $s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0 $s1
#define T1 $s0
#define T(n) T ## n
#define X(n) X ## n
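
/* T(n) and X(n) paste their argument onto the register names above, so
 * macros like AXR below can refer to registers by numeric index.
 */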

/* Input arguments */
#define STATE $a0
#define OUT $a1
#define IN $a2
#define BYTES $a3

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch the original value in memory.
 * Must be incremented every loop iteration.
 */
#define NONCE_0 $v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used to handle the last bytes, which are not a multiple of 4.
 */
#define SAVED_X X15
#define SAVED_CA $s7

#define IS_UNALIGNED $s7
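
/* SAVED_CA and IS_UNALIGNED can share $s7: the alignment flag is only
 * consulted while full blocks remain, and SAVED_CA is only written in the
 * jump table entries that handle the final partial block.
 */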

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif
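
/* On big endian, CPU_TO_LE32 byte-swaps a register: wsbh swaps the bytes
 * within each halfword and rotr 16 then swaps the halfwords. On little
 * endian the keystream is already in the right byte order, so it expands
 * to nothing.
 */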

#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

#define PLUS_ONE_0 1
#define PLUS_ONE_1 2
#define PLUS_ONE_2 3
#define PLUS_ONE_3 4
#define PLUS_ONE_4 5
#define PLUS_ONE_5 6
#define PLUS_ONE_6 7
#define PLUS_ONE_7 8
#define PLUS_ONE_8 9
#define PLUS_ONE_9 10
#define PLUS_ONE_10 11
#define PLUS_ONE_11 12
#define PLUS_ONE_12 13
#define PLUS_ONE_13 14
#define PLUS_ONE_14 15
#define PLUS_ONE_15 16
#define PLUS_ONE(x) PLUS_ONE_ ## x
#define _CONCAT3(a,b,c) a ## b ## c
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
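
/* The preprocessor cannot compute x+1 while pasting tokens, so PLUS_ONE is
 * a lookup table: the store code for word x carries the label "x+1", i.e.
 * the label names the number of words still to be stored from that point.
 */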

#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
	lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
	addu	X ## x, NONCE_0; \
	.else; \
	addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
	lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
	addu	X ## x, NONCE_0; \
	.else; \
	addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);
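
/* Both store macros implement, per word x (a sketch in C, where state[] is
 * the input state and X[] the working state after the rounds):
 *
 *	out[x] = in[x] ^ cpu_to_le32(X[x] + state[x]);
 *
 * except that word 12, the block counter, is added from NONCE_0 instead of
 * being reloaded from memory. The unaligned variant uses lwl/lwr and
 * swl/swr pairs so IN and OUT may have any byte alignment.
 */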

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not a multiple of 4.
 * X15 is free at this point, so SAVED_X can use it to hold Xn.
 * Every jump table entry must be equal in size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
	addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
	addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
	addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
	addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder
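
/* Each jump table entry is exactly two instructions, a branch plus the addu
 * in its delay slot, i.e. 8 bytes. The dispatch code below relies on this
 * fixed size when it scales the word count into a table offset.
 */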

#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotl	X(V), S; \
	rotl	X(W), S; \
	rotl	X(Y), S; \
	rotl	X(Z), S;
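
/* AXR performs one add-xor-rotate step of the ChaCha quarter round on four
 * word tuples at once: X[A] += X[K]; X[V] ^= X[A]; X[V] = rol32(X[V], S),
 * and likewise for (B, L, W), (C, M, Y) and (D, N, Z). A full quarter round
 * on (a, b, c, d) is, as a sketch in C:
 *
 *	a += b; d ^= a; d = rol32(d, 16);
 *	c += d; b ^= c; b = rol32(b, 12);
 *	a += b; d ^= a; d = rol32(d, 8);
 *	c += d; b ^= c; b = rol32(b, 7);
 *
 * so four AXR invocations with S = 16, 12, 8, 7 cover all four columns (or
 * diagonals), and eight of them make one double round, which is why the
 * round counter is decremented by 2 per loop pass.
 */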

.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds (fifth argument, passed on the stack per the o32 ABI) */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Return if BYTES == 0. */
	beqz	BYTES, .Lchacha_mips_end
	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0, 0($sp)
	sw	$s1, 4($sp)
	sw	$s2, 8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test whether IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	addiu	IN, CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	lw	X0, 0(STATE)
	lw	X1, 4(STATE)
	lw	X2, 8(STATE)
	lw	X3, 12(STATE)

	lw	X4, 16(STATE)
	lw	X5, 20(STATE)
	lw	X6, 24(STATE)
	lw	X7, 28(STATE)
	lw	X8, 32(STATE)
	lw	X9, 36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)
.Loop_chacha_xor_rounds:
	addiu	$at, -2
	AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
	AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
	AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
	AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
	bnez	$at, .Loop_chacha_xor_rounds

	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? Jump to the unaligned path. */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Set number of rounds here to fill the delay slot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* If BYTES < 0, there is no full block left. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Place this here to fill the delay slot */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle the last bytes. */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0, 0($sp)
	lw	$s1, 4($sp)
	lw	$s2, 8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to the right location in state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of full words */
	andi	$at, BYTES, MASK_U32

	/* Load the upper half of the jump table address */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate the lower half jump table offset: $at * 2, 8 bytes per entry */
	ins	T0, $at, 1, 6

	/* Add the offset to STATE */
	addu	T1, STATE, $at

	/* Add the lower half of the jump table address */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read the value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte count as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0
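
	/* Dispatch arithmetic, e.g. with 7 trailing bytes:
	 *   $at    = 7 & MASK_U32 = 4   (one full word to store)
	 *   offset = $at << 1     = 8   (one 8-byte jump table entry)
	 *   BYTES  = 4 - 7        = -3  (three leftover bytes for the byte loop)
	 */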

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Set number of rounds here to fill the delay slot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* If BYTES < 0, there is no full block left. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to the right location in state */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Place this here to fill the delay slot */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done
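
	/* SAVED_X holds the keystream word covering the final 1-3 bytes. It is
	 * first brought into little endian byte order, then rotated by 8 bits
	 * between bytes so the next keystream byte is always in the low 8 bits.
	 * BYTES is the negative leftover count, so $at = BYTES + k hits zero
	 * exactly after the k-th byte has been written.
	 */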

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of full words */
	andi	$at, BYTES, MASK_U32

	/* Load the upper half of the jump table address */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate the lower half jump table offset: $at * 2, 8 bytes per entry */
	ins	T0, $at, 1, 6

	/* Add the offset to STATE */
	addu	T1, STATE, $at

	/* Add the lower half of the jump table address */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read the value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte count as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12 $a3
#define X13 $at
#define X14 $v0
#define X15 STATE
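
/* hchacha only needs one extra callee-saved register (X11/$s6): X12-X15 are
 * remapped onto argument and temporary registers. X15 aliases STATE, so the
 * word at offset 60 must be loaded last; after that only OUT is needed.
 */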

.set	noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0, 0(STATE)
	lw	X1, 4(STATE)
	lw	X2, 8(STATE)
	lw	X3, 12(STATE)
	lw	X4, 16(STATE)
	lw	X5, 20(STATE)
	lw	X6, 24(STATE)
	lw	X7, 28(STATE)
	lw	X8, 32(STATE)
	lw	X9, 36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	addiu	$a2, -2
	AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
	AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
	AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
	AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register */
	lw	X11, 0($sp)

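	/* hchacha returns words 0-3 and 12-15 of the permuted state, without
	 * the feed-forward addition of the input state that chacha performs;
	 * these eight words form the derived subkey.
	 */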
	sw	X0, 0(OUT)
	sw	X1, 4(OUT)
	sw	X2, 8(OUT)
	sw	X3, 12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at

