/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#define MASK_U32 0x3c
#define CHACHA20_BLOCK_SIZE 64
#define STACK_SIZE 32

#define X0 $t0
#define X1 $t1
#define X2 $t2
#define X3 $t3
#define X4 $t4
#define X5 $t5
#define X6 $t6
#define X7 $t7
#define X8 $t8
#define X9 $t9
#define X10 $v1
#define X11 $s6
#define X12 $s5
#define X13 $s4
#define X14 $s3
#define X15 $s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0 $s1
#define T1 $s0
#define T(n) T ## n
#define X(n) X ## n

/* Input arguments */
#define STATE $a0
#define OUT $a1
#define IN $a2
#define BYTES $a3

/* Output argument */
/* NONCE[0] is kept in a register rather than in memory.
 * We don't want to touch the original value in memory.
 * It must be incremented every loop iteration.
 */
#define NONCE_0 $v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used for handling the last bytes, which are not a multiple of 4.
 */
#define SAVED_X X15
#define SAVED_CA $s7

#define IS_UNALIGNED $s7

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define CPU_TO_LE32(n) \
        wsbh n; \
        rotr n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif
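
/* On big-endian cores CPU_TO_LE32 byte-swaps a 32-bit word: wsbh swaps the
 * bytes within each halfword and rotr 16 then swaps the halfwords, so the
 * pair behaves roughly like n = __builtin_bswap32(n) in C. On little-endian
 * cores the state words already have the wire byte order, so CPU_TO_LE32
 * (and ROTR) expand to nothing.
 */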

#define FOR_EACH_WORD(x) \
        x( 0); \
        x( 1); \
        x( 2); \
        x( 3); \
        x( 4); \
        x( 5); \
        x( 6); \
        x( 7); \
        x( 8); \
        x( 9); \
        x(10); \
        x(11); \
        x(12); \
        x(13); \
        x(14); \
        x(15);

#define FOR_EACH_WORD_REV(x) \
        x(15); \
        x(14); \
        x(13); \
        x(12); \
        x(11); \
        x(10); \
        x( 9); \
        x( 8); \
        x( 7); \
        x( 6); \
        x( 5); \
        x( 4); \
        x( 3); \
        x( 2); \
        x( 1); \
        x( 0);

#define PLUS_ONE_0 1
#define PLUS_ONE_1 2
#define PLUS_ONE_2 3
#define PLUS_ONE_3 4
#define PLUS_ONE_4 5
#define PLUS_ONE_5 6
#define PLUS_ONE_6 7
#define PLUS_ONE_7 8
#define PLUS_ONE_8 9
#define PLUS_ONE_9 10
#define PLUS_ONE_10 11
#define PLUS_ONE_11 12
#define PLUS_ONE_12 13
#define PLUS_ONE_13 14
#define PLUS_ONE_14 15
#define PLUS_ONE_15 16
#define PLUS_ONE(x) PLUS_ONE_ ## x
#define _CONCAT3(a,b,c) a ## b ## c
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
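
/* PLUS_ONE(x) and CONCAT3() build the label names used below: the store step
 * for word x gets the label number x + 1, so label _n_b is the entry point
 * from which the remaining n state words (n - 1 down to 0) are still stored.
 * The jump tables below index these labels by the number of full words left.
 */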

#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
        .if (x != 12); \
        lw T0, (x*4)(STATE); \
        .endif; \
        lwl T1, (x*4)+MSB ## (IN); \
        lwr T1, (x*4)+LSB ## (IN); \
        .if (x == 12); \
        addu X ## x, NONCE_0; \
        .else; \
        addu X ## x, T0; \
        .endif; \
        CPU_TO_LE32(X ## x); \
        xor X ## x, T1; \
        swl X ## x, (x*4)+MSB ## (OUT); \
        swr X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
        .if (x != 12); \
        lw T0, (x*4)(STATE); \
        .endif; \
        lw T1, (x*4) ## (IN); \
        .if (x == 12); \
        addu X ## x, NONCE_0; \
        .else; \
        addu X ## x, T0; \
        .endif; \
        CPU_TO_LE32(X ## x); \
        xor X ## x, T1; \
        sw X ## x, (x*4) ## (OUT);
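
/* Each STORE_* step finishes one state word of the block: roughly, in C,
 *
 *	out[x] = cpu_to_le32(X[x] + state[x]) ^ in[x];
 *
 * except that word 12 adds the locally kept NONCE_0 instead of state[12].
 * The unaligned variant is identical but uses the lwl/lwr and swl/swr pairs
 * so IN and OUT may be unaligned.
 */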

/* Jump table macros.
 * Used for setup and for handling the last bytes, which are not a multiple
 * of 4. X15 is free to store Xn.
 * Every jump table entry must be the same size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
        .set noreorder; \
        b .Lchacha_mips_xor_aligned_ ## x ## _b; \
        .if (x == 12); \
        addu SAVED_X, X ## x, NONCE_0; \
        .else; \
        addu SAVED_X, X ## x, SAVED_CA; \
        .endif; \
        .set reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
        .set noreorder; \
        b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
        .if (x == 12); \
        addu SAVED_X, X ## x, NONCE_0; \
        .else; \
        addu SAVED_X, X ## x, SAVED_CA; \
        .endif; \
        .set reorder
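
/* With .set noreorder, every jump table entry is exactly two instructions
 * (8 bytes): the branch plus the addu in its delay slot, which precomputes
 * SAVED_X, the keystream word used for the trailing partial word. The
 * dispatch code below relies on this fixed entry size.
 */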

#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
        addu X(A), X(K); \
        addu X(B), X(L); \
        addu X(C), X(M); \
        addu X(D), X(N); \
        xor X(V), X(A); \
        xor X(W), X(B); \
        xor X(Y), X(C); \
        xor X(Z), X(D); \
        rotl X(V), S; \
        rotl X(W), S; \
        rotl X(Y), S; \
        rotl X(Z), S;
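
/* AXR is one add-xor-rotate step applied to four columns (or diagonals) at
 * once. The four AXR invocations with rotate amounts 16, 12, 8 and 7 below
 * together perform four ChaCha quarter rounds in parallel; one quarter round
 * is roughly, in C:
 *
 *	a += b; d ^= a; d = rol32(d, 16);
 *	c += d; b ^= c; b = rol32(b, 12);
 *	a += b; d ^= a; d = rol32(d, 8);
 *	c += d; b ^= c; b = rol32(b, 7);
 */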

.text
.set reorder
.set noat
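
/* The C prototype assumed below is roughly:
 *
 *	void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
 *			       unsigned int bytes, int nrounds);
 *
 * STATE/OUT/IN/BYTES arrive in $a0-$a3; the fifth argument (nrounds) is
 * passed on the stack and read from 16($sp) per the o32 calling convention.
 */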
.globl chacha_crypt_arch
.ent chacha_crypt_arch
chacha_crypt_arch:
        .frame $sp, STACK_SIZE, $ra

        /* Load number of rounds */
        lw $at, 16($sp)

        addiu $sp, -STACK_SIZE

        /* Return if bytes == 0. */
        beqz BYTES, .Lchacha_mips_end

        lw NONCE_0, 48(STATE)

        /* Save s0-s7 */
        sw $s0, 0($sp)
        sw $s1, 4($sp)
        sw $s2, 8($sp)
        sw $s3, 12($sp)
        sw $s4, 16($sp)
        sw $s5, 20($sp)
        sw $s6, 24($sp)
        sw $s7, 28($sp)

        /* Test whether IN or OUT is unaligned:
         * IS_UNALIGNED = (IN | OUT) & 0x00000003
         */
        or IS_UNALIGNED, IN, OUT
        andi IS_UNALIGNED, 0x3

        b .Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
        addiu IN, CHACHA20_BLOCK_SIZE
        addiu OUT, CHACHA20_BLOCK_SIZE
        addiu NONCE_0, 1

.Lchacha_rounds_start:
        lw X0, 0(STATE)
        lw X1, 4(STATE)
        lw X2, 8(STATE)
        lw X3, 12(STATE)

        lw X4, 16(STATE)
        lw X5, 20(STATE)
        lw X6, 24(STATE)
        lw X7, 28(STATE)
        lw X8, 32(STATE)
        lw X9, 36(STATE)
        lw X10, 40(STATE)
        lw X11, 44(STATE)

        move X12, NONCE_0
        lw X13, 52(STATE)
        lw X14, 56(STATE)
        lw X15, 60(STATE)

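        /* Each pass below is one ChaCha double round: the first four AXR
         * groups are the column round, the last four the diagonal round,
         * so the round counter in $at drops by 2 per iteration.
         */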
.Loop_chacha_xor_rounds:
        addiu $at, -2
        AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
        AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
        AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
        AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
        AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
        AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
        AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
        AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
        bnez $at, .Loop_chacha_xor_rounds

        addiu BYTES, -(CHACHA20_BLOCK_SIZE)

        /* If src or dst is unaligned, take the unaligned store path. */
        bnez IS_UNALIGNED, .Loop_chacha_unaligned

        /* Reload the number of rounds here to fill the delay slot. */
        lw $at, (STACK_SIZE+16)($sp)

        /* BYTES < 0 means there is no full block left. */
        bltz BYTES, .Lchacha_mips_no_full_block_aligned

        FOR_EACH_WORD_REV(STORE_ALIGNED)

        /* BYTES > 0? Loop again. */
        bgtz BYTES, .Loop_chacha_rounds

        /* Placed here to fill the delay slot. */
        addiu NONCE_0, 1

        /* BYTES < 0? Handle the last bytes. */
        bltz BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
        /* Restore used registers */
        lw $s0, 0($sp)
        lw $s1, 4($sp)
        lw $s2, 8($sp)
        lw $s3, 12($sp)
        lw $s4, 16($sp)
        lw $s5, 20($sp)
        lw $s6, 24($sp)
        lw $s7, 28($sp)

        /* Write NONCE_0 back to right location in state */
        sw NONCE_0, 48(STATE)

.Lchacha_mips_end:
        addiu $sp, STACK_SIZE
        jr $ra

.Lchacha_mips_no_full_block_aligned:
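        /* Fewer than 64 bytes remain. Load SAVED_CA with the state word that
         * the first partial word belongs to, then jump into the store
         * sequence above: the jump table entry for n full words branches to
         * the label that stores words n-1..0, and its delay slot precomputes
         * SAVED_X, the keystream word used for any trailing bytes.
         */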
        /* Add back the block size that was subtracted from BYTES. */
        addiu BYTES, CHACHA20_BLOCK_SIZE

        /* Get the number of full words, as a byte count ($at = full words * 4). */
        andi $at, BYTES, MASK_U32

        /* Load the upper half of the jump table address. */
        lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

        /* Insert $at * 2 into T0: $at holds (full words * 4) and every jump
         * table entry is 8 bytes.
         */
        ins T0, $at, 1, 6

        /* Add the offset to STATE. */
        addu T1, STATE, $at

        /* Add the lower half of the jump table address. */
        addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

        /* Read the state word used for the trailing partial word. */
        lw SAVED_CA, 0(T1)

        /* Store the remaining byte count as a negative value. */
        subu BYTES, $at, BYTES

        jr T0

        /* Jump table */
        FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
        /* Reload the number of rounds here to fill the delay slot. */
        lw $at, (STACK_SIZE+16)($sp)

        /* BYTES < 0 means there is no full block left. */
        bltz BYTES, .Lchacha_mips_no_full_block_unaligned

        FOR_EACH_WORD_REV(STORE_UNALIGNED)

        /* BYTES > 0? Loop again. */
        bgtz BYTES, .Loop_chacha_rounds

        /* Write NONCE_0 back to right location in state */
        sw NONCE_0, 48(STATE)

        .set noreorder
        /* Fall through to byte handling */
        bgez BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
        /* Placed here to fill the delay slot. */
        addiu NONCE_0, 1
        .set reorder

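        /* XOR the remaining 1 to 3 bytes of the block with SAVED_X. $at
         * still holds the byte offset of the partial word within the block,
         * and BYTES holds minus the number of trailing bytes. ROTx rotates
         * the next keystream byte into the low byte position on either
         * endianness.
         */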
.Lchacha_mips_xor_bytes:
        addu IN, $at
        addu OUT, $at
        /* First byte */
        lbu T1, 0(IN)
        addiu $at, BYTES, 1
        CPU_TO_LE32(SAVED_X)
        ROTR(SAVED_X)
        xor T1, SAVED_X
        sb T1, 0(OUT)
        beqz $at, .Lchacha_mips_xor_done
        /* Second byte */
        lbu T1, 1(IN)
        addiu $at, BYTES, 2
        ROTx SAVED_X, 8
        xor T1, SAVED_X
        sb T1, 1(OUT)
        beqz $at, .Lchacha_mips_xor_done
        /* Third byte */
        lbu T1, 2(IN)
        ROTx SAVED_X, 8
        xor T1, SAVED_X
        sb T1, 2(OUT)
        b .Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
        /* Add back the block size that was subtracted from BYTES. */
        addiu BYTES, CHACHA20_BLOCK_SIZE

        /* Get the number of full words, as a byte count ($at = full words * 4). */
        andi $at, BYTES, MASK_U32

        /* Load the upper half of the jump table address. */
        lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

        /* Insert $at * 2 into T0: $at holds (full words * 4) and every jump
         * table entry is 8 bytes.
         */
        ins T0, $at, 1, 6

        /* Add the offset to STATE. */
        addu T1, STATE, $at

        /* Add the lower half of the jump table address. */
        addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

        /* Read the state word used for the trailing partial word. */
        lw SAVED_CA, 0(T1)

        /* Store the remaining byte count as a negative value. */
        subu BYTES, $at, BYTES

        jr T0

        /* Jump table */
        FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments
 * STATE $a0
 * OUT $a1
 * NROUND $a2
 */
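
/* The C prototype assumed here is roughly:
 *
 *	void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
 *
 * The routine runs the rounds over the full 16-word state and writes out
 * words 0-3 and 12-15 (the HChaCha output) without adding the input state
 * back in.
 */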

#undef X12
#undef X13
#undef X14
#undef X15

#define X12 $a3
#define X13 $at
#define X14 $v0
#define X15 STATE

.set noat
.globl hchacha_block_arch
.ent hchacha_block_arch
hchacha_block_arch:
        .frame $sp, STACK_SIZE, $ra

        addiu $sp, -STACK_SIZE

        /* Save X11 ($s6) */
        sw X11, 0($sp)

        lw X0, 0(STATE)
        lw X1, 4(STATE)
        lw X2, 8(STATE)
        lw X3, 12(STATE)
        lw X4, 16(STATE)
        lw X5, 20(STATE)
        lw X6, 24(STATE)
        lw X7, 28(STATE)
        lw X8, 32(STATE)
        lw X9, 36(STATE)
        lw X10, 40(STATE)
        lw X11, 44(STATE)
        lw X12, 48(STATE)
        lw X13, 52(STATE)
        lw X14, 56(STATE)
        lw X15, 60(STATE)

.Loop_hchacha_xor_rounds:
        addiu $a2, -2
        AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
        AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
        AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
        AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
        AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
        AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
        AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
        AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
        bnez $a2, .Loop_hchacha_xor_rounds

        /* Restore used register */
        lw X11, 0($sp)

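        /* Write out the HChaCha result: words 0-3 and 12-15 of the final state. */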
        sw X0, 0(OUT)
        sw X1, 4(OUT)
        sw X2, 8(OUT)
        sw X3, 12(OUT)
        sw X12, 16(OUT)
        sw X13, 20(OUT)
        sw X14, 24(OUT)
        sw X15, 28(OUT)

        addiu $sp, STACK_SIZE
        jr $ra
.end hchacha_block_arch
.set at