1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * x86_64/AVX2/AES-NI assembler implementation of Camellia |
4 | * |
5 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
6 | */ |
7 | |
8 | #include <linux/linkage.h> |
9 | #include <asm/frame.h> |
10 | |
11 | #define CAMELLIA_TABLE_BYTE_LEN 272 |
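
/*
 * 272 bytes = 34 x 8-byte subkeys: four whitening keys (kw1..kw4), up to
 * 24 round keys (k1..k24) and six FL/FL^-1 keys (ke1..ke6) for the
 * 256-bit key variant.
 */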
12 | |
13 | /* struct camellia_ctx: */ |
14 | #define key_table 0 |
15 | #define key_length CAMELLIA_TABLE_BYTE_LEN |
16 | |
17 | /* register macros */ |
18 | #define CTX %rdi |
19 | #define RIO %r8 |
20 | |
21 | /********************************************************************** |
22 | helper macros |
23 | **********************************************************************/ |
24 | #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ |
25 | vpand x, mask4bit, tmp0; \ |
26 | vpandn x, mask4bit, x; \ |
27 | vpsrld $4, x, x; \ |
28 | \ |
29 | vpshufb tmp0, lo_t, tmp0; \ |
30 | vpshufb x, hi_t, x; \ |
31 | vpxor tmp0, x, x; |
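
/*
 * filter_8bit() is the usual vpshufb-based split-table lookup. A rough
 * one-byte C sketch of what it computes (illustrative names only):
 *
 *	uint8_t filter_8bit(uint8_t x, const uint8_t lo_t[16],
 *			    const uint8_t hi_t[16])
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 *
 * mask4bit must hold 0x0f in every byte; tmp0 is clobbered.
 */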
32 | |
33 | #define ymm0_x xmm0 |
34 | #define ymm1_x xmm1 |
35 | #define ymm2_x xmm2 |
36 | #define ymm3_x xmm3 |
37 | #define ymm4_x xmm4 |
38 | #define ymm5_x xmm5 |
39 | #define ymm6_x xmm6 |
40 | #define ymm7_x xmm7 |
41 | #define ymm8_x xmm8 |
42 | #define ymm9_x xmm9 |
43 | #define ymm10_x xmm10 |
44 | #define ymm11_x xmm11 |
45 | #define ymm12_x xmm12 |
46 | #define ymm13_x xmm13 |
47 | #define ymm14_x xmm14 |
48 | #define ymm15_x xmm15 |
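
/*
 * The ymmN_x defines, combined with '##_x' token pasting in the macros
 * below, turn a %ymmN macro argument into its %xmmN form (e.g. t4##_x
 * with t4 = %ymm12 expands to %ymm12_x and then to %xmm12). This is
 * needed for 128-bit-only instructions such as vaesenclast, which has no
 * 256-bit form without the VAES extension.
 */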
49 | |
50 | /********************************************************************** |
51 | 32-way camellia |
52 | **********************************************************************/ |
53 | |
54 | /* |
55 | * IN: |
56 | * x0..x7: byte-sliced AB state |
57 | * mem_cd: register pointer storing CD state |
 * key: memory operand pointing at the 64-bit round key material
59 | * OUT: |
60 | * x0..x7: new byte-sliced CD state |
61 | */ |
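/*
 * For reference, a single-block C sketch of a textbook Camellia round
 * (the F-function):
 *
 *	cd ^= camellia_p(camellia_s(ab ^ k));
 *
 * roundsm32 computes this for 32 blocks at once in byte-sliced form,
 * evaluating the S-layer through the AES S-box via AESENCLAST; note that
 * the key material XOR is folded into the final XOR with CD rather than
 * applied before the S-layer as written above.
 */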
62 | #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ |
63 | t7, mem_cd, key) \ |
64 | /* \ |
65 | * S-function with AES subbytes \ |
66 | */ \ |
67 | vbroadcasti128 .Linv_shift_row(%rip), t4; \ |
68 | vpbroadcastd .L0f0f0f0f(%rip), t7; \ |
69 | vbroadcasti128 .Lpre_tf_lo_s1(%rip), t5; \ |
70 | vbroadcasti128 .Lpre_tf_hi_s1(%rip), t6; \ |
71 | vbroadcasti128 .Lpre_tf_lo_s4(%rip), t2; \ |
72 | vbroadcasti128 .Lpre_tf_hi_s4(%rip), t3; \ |
73 | \ |
74 | /* AES inverse shift rows */ \ |
75 | vpshufb t4, x0, x0; \ |
76 | vpshufb t4, x7, x7; \ |
77 | vpshufb t4, x3, x3; \ |
78 | vpshufb t4, x6, x6; \ |
79 | vpshufb t4, x2, x2; \ |
80 | vpshufb t4, x5, x5; \ |
81 | vpshufb t4, x1, x1; \ |
82 | vpshufb t4, x4, x4; \ |
83 | \ |
	/* prefilter sboxes 1, 2 and 3 (x0, x7, x2, x5, x1, x4) */ \
	/* prefilter sbox 4 (x3, x6) */ \
86 | filter_8bit(x0, t5, t6, t7, t4); \ |
87 | filter_8bit(x7, t5, t6, t7, t4); \ |
88 | vextracti128 $1, x0, t0##_x; \ |
89 | vextracti128 $1, x7, t1##_x; \ |
90 | filter_8bit(x3, t2, t3, t7, t4); \ |
91 | filter_8bit(x6, t2, t3, t7, t4); \ |
92 | vextracti128 $1, x3, t3##_x; \ |
93 | vextracti128 $1, x6, t2##_x; \ |
94 | filter_8bit(x2, t5, t6, t7, t4); \ |
95 | filter_8bit(x5, t5, t6, t7, t4); \ |
96 | filter_8bit(x1, t5, t6, t7, t4); \ |
97 | filter_8bit(x4, t5, t6, t7, t4); \ |
98 | \ |
99 | vpxor t4##_x, t4##_x, t4##_x; \ |
100 | \ |
101 | /* AES subbytes + AES shift rows */ \ |
102 | vextracti128 $1, x2, t6##_x; \ |
103 | vextracti128 $1, x5, t5##_x; \ |
104 | vaesenclast t4##_x, x0##_x, x0##_x; \ |
105 | vaesenclast t4##_x, t0##_x, t0##_x; \ |
106 | vinserti128 $1, t0##_x, x0, x0; \ |
107 | vaesenclast t4##_x, x7##_x, x7##_x; \ |
108 | vaesenclast t4##_x, t1##_x, t1##_x; \ |
109 | vinserti128 $1, t1##_x, x7, x7; \ |
110 | vaesenclast t4##_x, x3##_x, x3##_x; \ |
111 | vaesenclast t4##_x, t3##_x, t3##_x; \ |
112 | vinserti128 $1, t3##_x, x3, x3; \ |
113 | vaesenclast t4##_x, x6##_x, x6##_x; \ |
114 | vaesenclast t4##_x, t2##_x, t2##_x; \ |
115 | vinserti128 $1, t2##_x, x6, x6; \ |
116 | vextracti128 $1, x1, t3##_x; \ |
117 | vextracti128 $1, x4, t2##_x; \ |
118 | vbroadcasti128 .Lpost_tf_lo_s1(%rip), t0; \ |
119 | vbroadcasti128 .Lpost_tf_hi_s1(%rip), t1; \ |
120 | vaesenclast t4##_x, x2##_x, x2##_x; \ |
121 | vaesenclast t4##_x, t6##_x, t6##_x; \ |
122 | vinserti128 $1, t6##_x, x2, x2; \ |
123 | vaesenclast t4##_x, x5##_x, x5##_x; \ |
124 | vaesenclast t4##_x, t5##_x, t5##_x; \ |
125 | vinserti128 $1, t5##_x, x5, x5; \ |
126 | vaesenclast t4##_x, x1##_x, x1##_x; \ |
127 | vaesenclast t4##_x, t3##_x, t3##_x; \ |
128 | vinserti128 $1, t3##_x, x1, x1; \ |
129 | vaesenclast t4##_x, x4##_x, x4##_x; \ |
130 | vaesenclast t4##_x, t2##_x, t2##_x; \ |
131 | vinserti128 $1, t2##_x, x4, x4; \ |
132 | \ |
133 | /* postfilter sboxes 1 and 4 */ \ |
134 | vbroadcasti128 .Lpost_tf_lo_s3(%rip), t2; \ |
135 | vbroadcasti128 .Lpost_tf_hi_s3(%rip), t3; \ |
136 | filter_8bit(x0, t0, t1, t7, t6); \ |
137 | filter_8bit(x7, t0, t1, t7, t6); \ |
138 | filter_8bit(x3, t0, t1, t7, t6); \ |
139 | filter_8bit(x6, t0, t1, t7, t6); \ |
140 | \ |
141 | /* postfilter sbox 3 */ \ |
142 | vbroadcasti128 .Lpost_tf_lo_s2(%rip), t4; \ |
143 | vbroadcasti128 .Lpost_tf_hi_s2(%rip), t5; \ |
144 | filter_8bit(x2, t2, t3, t7, t6); \ |
145 | filter_8bit(x5, t2, t3, t7, t6); \ |
146 | \ |
147 | vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ |
148 | \ |
149 | /* postfilter sbox 2 */ \ |
150 | filter_8bit(x1, t4, t5, t7, t2); \ |
151 | filter_8bit(x4, t4, t5, t7, t2); \ |
152 | vpxor t7, t7, t7; \ |
153 | \ |
154 | vpsrldq $1, t0, t1; \ |
155 | vpsrldq $2, t0, t2; \ |
156 | vpshufb t7, t1, t1; \ |
157 | vpsrldq $3, t0, t3; \ |
158 | \ |
159 | /* P-function */ \ |
160 | vpxor x5, x0, x0; \ |
161 | vpxor x6, x1, x1; \ |
162 | vpxor x7, x2, x2; \ |
163 | vpxor x4, x3, x3; \ |
164 | \ |
165 | vpshufb t7, t2, t2; \ |
166 | vpsrldq $4, t0, t4; \ |
167 | vpshufb t7, t3, t3; \ |
168 | vpsrldq $5, t0, t5; \ |
169 | vpshufb t7, t4, t4; \ |
170 | \ |
171 | vpxor x2, x4, x4; \ |
172 | vpxor x3, x5, x5; \ |
173 | vpxor x0, x6, x6; \ |
174 | vpxor x1, x7, x7; \ |
175 | \ |
176 | vpsrldq $6, t0, t6; \ |
177 | vpshufb t7, t5, t5; \ |
178 | vpshufb t7, t6, t6; \ |
179 | \ |
180 | vpxor x7, x0, x0; \ |
181 | vpxor x4, x1, x1; \ |
182 | vpxor x5, x2, x2; \ |
183 | vpxor x6, x3, x3; \ |
184 | \ |
185 | vpxor x3, x4, x4; \ |
186 | vpxor x0, x5, x5; \ |
187 | vpxor x1, x6, x6; \ |
188 | vpxor x2, x7, x7; /* note: high and low parts swapped */ \ |
189 | \ |
190 | /* Add key material and result to CD (x becomes new CD) */ \ |
191 | \ |
192 | vpxor t6, x1, x1; \ |
193 | vpxor 5 * 32(mem_cd), x1, x1; \ |
194 | \ |
195 | vpsrldq $7, t0, t6; \ |
196 | vpshufb t7, t0, t0; \ |
197 | vpshufb t7, t6, t7; \ |
198 | \ |
199 | vpxor t7, x0, x0; \ |
200 | vpxor 4 * 32(mem_cd), x0, x0; \ |
201 | \ |
202 | vpxor t5, x2, x2; \ |
203 | vpxor 6 * 32(mem_cd), x2, x2; \ |
204 | \ |
205 | vpxor t4, x3, x3; \ |
206 | vpxor 7 * 32(mem_cd), x3, x3; \ |
207 | \ |
208 | vpxor t3, x4, x4; \ |
209 | vpxor 0 * 32(mem_cd), x4, x4; \ |
210 | \ |
211 | vpxor t2, x5, x5; \ |
212 | vpxor 1 * 32(mem_cd), x5, x5; \ |
213 | \ |
214 | vpxor t1, x6, x6; \ |
215 | vpxor 2 * 32(mem_cd), x6, x6; \ |
216 | \ |
217 | vpxor t0, x7, x7; \ |
218 | vpxor 3 * 32(mem_cd), x7, x7; |
219 | |
220 | /* |
 * Size optimization... with inlined roundsm32 the binary would be over 5
 * times larger and only marginally faster.
223 | */ |
224 | SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) |
225 | roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
226 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, |
227 | %rcx, (%r9)); |
228 | RET; |
229 | SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) |
230 | |
231 | SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) |
232 | roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, |
233 | %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, |
234 | %rax, (%r9)); |
235 | RET; |
236 | SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) |
237 | |
238 | /* |
239 | * IN/OUT: |
240 | * x0..x7: byte-sliced AB state preloaded |
241 | * mem_ab: byte-sliced AB state in memory |
 * mem_cd: byte-sliced CD state in memory
243 | */ |
244 | #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
245 | y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ |
246 | leaq (key_table + (i) * 8)(CTX), %r9; \ |
247 | call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ |
248 | \ |
249 | vmovdqu x0, 4 * 32(mem_cd); \ |
250 | vmovdqu x1, 5 * 32(mem_cd); \ |
251 | vmovdqu x2, 6 * 32(mem_cd); \ |
252 | vmovdqu x3, 7 * 32(mem_cd); \ |
253 | vmovdqu x4, 0 * 32(mem_cd); \ |
254 | vmovdqu x5, 1 * 32(mem_cd); \ |
255 | vmovdqu x6, 2 * 32(mem_cd); \ |
256 | vmovdqu x7, 3 * 32(mem_cd); \ |
257 | \ |
258 | leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \ |
259 | call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ |
260 | \ |
261 | store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); |
262 | |
263 | #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ |
264 | |
265 | #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ |
266 | /* Store new AB state */ \ |
267 | vmovdqu x4, 4 * 32(mem_ab); \ |
268 | vmovdqu x5, 5 * 32(mem_ab); \ |
269 | vmovdqu x6, 6 * 32(mem_ab); \ |
270 | vmovdqu x7, 7 * 32(mem_ab); \ |
271 | vmovdqu x0, 0 * 32(mem_ab); \ |
272 | vmovdqu x1, 1 * 32(mem_ab); \ |
273 | vmovdqu x2, 2 * 32(mem_ab); \ |
274 | vmovdqu x3, 3 * 32(mem_ab); |
275 | |
276 | #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
277 | y6, y7, mem_ab, mem_cd, i) \ |
278 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
279 | y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ |
280 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
281 | y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ |
282 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
283 | y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); |
284 | |
285 | #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
286 | y6, y7, mem_ab, mem_cd, i) \ |
287 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
288 | y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ |
289 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
290 | y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ |
291 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
292 | y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); |
293 | |
294 | /* |
295 | * IN: |
296 | * v0..3: byte-sliced 32-bit integers |
297 | * OUT: |
298 | * v0..3: (IN <<< 1) |
299 | */ |
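/*
 * I.e. per 32-bit word, with the word held one byte per vector:
 *
 *	w = (w << 1) | (w >> 31);
 *
 * Each byte is shifted left by one (vpaddb v, v, v); the carried-out MSB
 * is extracted with vpcmpgtb against zero (giving 0xff), mapped to 0x01
 * by vpabsb, and ORed into the next byte slice, wrapping from v3 back to
 * v0.
 */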
300 | #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ |
301 | vpcmpgtb v0, zero, t0; \ |
302 | vpaddb v0, v0, v0; \ |
303 | vpabsb t0, t0; \ |
304 | \ |
305 | vpcmpgtb v1, zero, t1; \ |
306 | vpaddb v1, v1, v1; \ |
307 | vpabsb t1, t1; \ |
308 | \ |
309 | vpcmpgtb v2, zero, t2; \ |
310 | vpaddb v2, v2, v2; \ |
311 | vpabsb t2, t2; \ |
312 | \ |
313 | vpor t0, v1, v1; \ |
314 | \ |
315 | vpcmpgtb v3, zero, t0; \ |
316 | vpaddb v3, v3, v3; \ |
317 | vpabsb t0, t0; \ |
318 | \ |
319 | vpor t1, v2, v2; \ |
320 | vpor t2, v3, v3; \ |
321 | vpor t0, v0, v0; |
322 | |
/*
 * IN:
 *  l: byte-sliced AB state in memory, preloaded into l0..l7
 *  r: byte-sliced CD state in memory
 * OUT:
 *  l0..l7: new byte-sliced AB state (also stored back to l);
 *  updated CD state stored to r
 */
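/*
 * Single-block C reference for the FL/FL^-1 layer computed here (kl =
 * kll||klr applied to the left half, kr = krl||krr to the right half):
 *
 *	lr ^= rol32(ll & kll, 1);	FL
 *	ll ^= (lr | klr);
 *
 *	rl ^= (rr | krr);		FL^-1
 *	rr ^= rol32(rl & krl, 1);
 */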
330 | #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ |
331 | tt1, tt2, tt3, kll, klr, krl, krr) \ |
332 | /* \ |
333 | * t0 = kll; \ |
334 | * t0 &= ll; \ |
335 | * lr ^= rol32(t0, 1); \ |
336 | */ \ |
337 | vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ |
338 | vpxor tt0, tt0, tt0; \ |
339 | vpshufb tt0, t0, t3; \ |
340 | vpsrldq $1, t0, t0; \ |
341 | vpshufb tt0, t0, t2; \ |
342 | vpsrldq $1, t0, t0; \ |
343 | vpshufb tt0, t0, t1; \ |
344 | vpsrldq $1, t0, t0; \ |
345 | vpshufb tt0, t0, t0; \ |
346 | \ |
347 | vpand l0, t0, t0; \ |
348 | vpand l1, t1, t1; \ |
349 | vpand l2, t2, t2; \ |
350 | vpand l3, t3, t3; \ |
351 | \ |
352 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ |
353 | \ |
354 | vpxor l4, t0, l4; \ |
355 | vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ |
356 | vmovdqu l4, 4 * 32(l); \ |
357 | vpxor l5, t1, l5; \ |
358 | vmovdqu l5, 5 * 32(l); \ |
359 | vpxor l6, t2, l6; \ |
360 | vmovdqu l6, 6 * 32(l); \ |
361 | vpxor l7, t3, l7; \ |
362 | vmovdqu l7, 7 * 32(l); \ |
363 | \ |
364 | /* \ |
365 | * t2 = krr; \ |
366 | * t2 |= rr; \ |
367 | * rl ^= t2; \ |
368 | */ \ |
369 | \ |
370 | vpshufb tt0, t0, t3; \ |
371 | vpsrldq $1, t0, t0; \ |
372 | vpshufb tt0, t0, t2; \ |
373 | vpsrldq $1, t0, t0; \ |
374 | vpshufb tt0, t0, t1; \ |
375 | vpsrldq $1, t0, t0; \ |
376 | vpshufb tt0, t0, t0; \ |
377 | \ |
378 | vpor 4 * 32(r), t0, t0; \ |
379 | vpor 5 * 32(r), t1, t1; \ |
380 | vpor 6 * 32(r), t2, t2; \ |
381 | vpor 7 * 32(r), t3, t3; \ |
382 | \ |
383 | vpxor 0 * 32(r), t0, t0; \ |
384 | vpxor 1 * 32(r), t1, t1; \ |
385 | vpxor 2 * 32(r), t2, t2; \ |
386 | vpxor 3 * 32(r), t3, t3; \ |
387 | vmovdqu t0, 0 * 32(r); \ |
388 | vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ |
389 | vmovdqu t1, 1 * 32(r); \ |
390 | vmovdqu t2, 2 * 32(r); \ |
391 | vmovdqu t3, 3 * 32(r); \ |
392 | \ |
393 | /* \ |
394 | * t2 = krl; \ |
395 | * t2 &= rl; \ |
396 | * rr ^= rol32(t2, 1); \ |
397 | */ \ |
398 | vpshufb tt0, t0, t3; \ |
399 | vpsrldq $1, t0, t0; \ |
400 | vpshufb tt0, t0, t2; \ |
401 | vpsrldq $1, t0, t0; \ |
402 | vpshufb tt0, t0, t1; \ |
403 | vpsrldq $1, t0, t0; \ |
404 | vpshufb tt0, t0, t0; \ |
405 | \ |
406 | vpand 0 * 32(r), t0, t0; \ |
407 | vpand 1 * 32(r), t1, t1; \ |
408 | vpand 2 * 32(r), t2, t2; \ |
409 | vpand 3 * 32(r), t3, t3; \ |
410 | \ |
411 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ |
412 | \ |
413 | vpxor 4 * 32(r), t0, t0; \ |
414 | vpxor 5 * 32(r), t1, t1; \ |
415 | vpxor 6 * 32(r), t2, t2; \ |
416 | vpxor 7 * 32(r), t3, t3; \ |
417 | vmovdqu t0, 4 * 32(r); \ |
418 | vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ |
419 | vmovdqu t1, 5 * 32(r); \ |
420 | vmovdqu t2, 6 * 32(r); \ |
421 | vmovdqu t3, 7 * 32(r); \ |
422 | \ |
423 | /* \ |
424 | * t0 = klr; \ |
425 | * t0 |= lr; \ |
426 | * ll ^= t0; \ |
427 | */ \ |
428 | \ |
429 | vpshufb tt0, t0, t3; \ |
430 | vpsrldq $1, t0, t0; \ |
431 | vpshufb tt0, t0, t2; \ |
432 | vpsrldq $1, t0, t0; \ |
433 | vpshufb tt0, t0, t1; \ |
434 | vpsrldq $1, t0, t0; \ |
435 | vpshufb tt0, t0, t0; \ |
436 | \ |
437 | vpor l4, t0, t0; \ |
438 | vpor l5, t1, t1; \ |
439 | vpor l6, t2, t2; \ |
440 | vpor l7, t3, t3; \ |
441 | \ |
442 | vpxor l0, t0, l0; \ |
443 | vmovdqu l0, 0 * 32(l); \ |
444 | vpxor l1, t1, l1; \ |
445 | vmovdqu l1, 1 * 32(l); \ |
446 | vpxor l2, t2, l2; \ |
447 | vmovdqu l2, 2 * 32(l); \ |
448 | vpxor l3, t3, l3; \ |
449 | vmovdqu l3, 3 * 32(l); |
450 | |
451 | #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ |
452 | vpunpckhdq x1, x0, t2; \ |
453 | vpunpckldq x1, x0, x0; \ |
454 | \ |
455 | vpunpckldq x3, x2, t1; \ |
456 | vpunpckhdq x3, x2, x2; \ |
457 | \ |
458 | vpunpckhqdq t1, x0, x1; \ |
459 | vpunpcklqdq t1, x0, x0; \ |
460 | \ |
461 | vpunpckhqdq x2, t2, x3; \ |
462 | vpunpcklqdq x2, t2, x2; |
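
/*
 * transpose_4x4 treats x0..x3 as a 4x4 matrix of 32-bit elements within
 * each 128-bit lane and transposes it with the usual unpack pattern
 * (out[i][j] = in[j][i] per lane); t1 and t2 are clobbered.
 */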
463 | |
464 | #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ |
465 | a3, b3, c3, d3, st0, st1) \ |
466 | vmovdqu d2, st0; \ |
467 | vmovdqu d3, st1; \ |
468 | transpose_4x4(a0, a1, a2, a3, d2, d3); \ |
469 | transpose_4x4(b0, b1, b2, b3, d2, d3); \ |
470 | vmovdqu st0, d2; \ |
471 | vmovdqu st1, d3; \ |
472 | \ |
473 | vmovdqu a0, st0; \ |
474 | vmovdqu a1, st1; \ |
475 | transpose_4x4(c0, c1, c2, c3, a0, a1); \ |
476 | transpose_4x4(d0, d1, d2, d3, a0, a1); \ |
477 | \ |
478 | vbroadcasti128 .Lshufb_16x16b(%rip), a0; \ |
479 | vmovdqu st1, a1; \ |
480 | vpshufb a0, a2, a2; \ |
481 | vpshufb a0, a3, a3; \ |
482 | vpshufb a0, b0, b0; \ |
483 | vpshufb a0, b1, b1; \ |
484 | vpshufb a0, b2, b2; \ |
485 | vpshufb a0, b3, b3; \ |
486 | vpshufb a0, a1, a1; \ |
487 | vpshufb a0, c0, c0; \ |
488 | vpshufb a0, c1, c1; \ |
489 | vpshufb a0, c2, c2; \ |
490 | vpshufb a0, c3, c3; \ |
491 | vpshufb a0, d0, d0; \ |
492 | vpshufb a0, d1, d1; \ |
493 | vpshufb a0, d2, d2; \ |
494 | vpshufb a0, d3, d3; \ |
495 | vmovdqu d3, st1; \ |
496 | vmovdqu st0, d3; \ |
497 | vpshufb a0, d3, a0; \ |
498 | vmovdqu d2, st0; \ |
499 | \ |
500 | transpose_4x4(a0, b0, c0, d0, d2, d3); \ |
501 | transpose_4x4(a1, b1, c1, d1, d2, d3); \ |
502 | vmovdqu st0, d2; \ |
503 | vmovdqu st1, d3; \ |
504 | \ |
505 | vmovdqu b0, st0; \ |
506 | vmovdqu b1, st1; \ |
507 | transpose_4x4(a2, b2, c2, d2, b0, b1); \ |
508 | transpose_4x4(a3, b3, c3, d3, b0, b1); \ |
509 | vmovdqu st0, b0; \ |
510 | vmovdqu st1, b1; \ |
511 | /* does not adjust output bytes inside vectors */ |
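
/*
 * After this 16x16 byte matrix transposition each register holds, roughly,
 * one byte position of all 32 blocks, which is the byte-sliced layout the
 * round macros above operate on (subject to the caveat above about byte
 * order inside each vector).
 */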
512 | |
513 | /* load blocks to registers and apply pre-whitening */ |
514 | #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
515 | y6, y7, rio, key) \ |
516 | vpbroadcastq key, x0; \ |
517 | vpshufb .Lpack_bswap(%rip), x0, x0; \ |
518 | \ |
519 | vpxor 0 * 32(rio), x0, y7; \ |
520 | vpxor 1 * 32(rio), x0, y6; \ |
521 | vpxor 2 * 32(rio), x0, y5; \ |
522 | vpxor 3 * 32(rio), x0, y4; \ |
523 | vpxor 4 * 32(rio), x0, y3; \ |
524 | vpxor 5 * 32(rio), x0, y2; \ |
525 | vpxor 6 * 32(rio), x0, y1; \ |
526 | vpxor 7 * 32(rio), x0, y0; \ |
527 | vpxor 8 * 32(rio), x0, x7; \ |
528 | vpxor 9 * 32(rio), x0, x6; \ |
529 | vpxor 10 * 32(rio), x0, x5; \ |
530 | vpxor 11 * 32(rio), x0, x4; \ |
531 | vpxor 12 * 32(rio), x0, x3; \ |
532 | vpxor 13 * 32(rio), x0, x2; \ |
533 | vpxor 14 * 32(rio), x0, x1; \ |
534 | vpxor 15 * 32(rio), x0, x0; |
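
/*
 * Note: the 64-bit whitening key is broadcast to every lane and
 * byte-reordered by .Lpack_bswap (whose 0x80 indices produce zero bytes,
 * so only half of each 128-bit block is touched by the XOR), then applied
 * to all 32 blocks as they are loaded; blocks land in reverse register
 * order (y7 receives the first two blocks).
 */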
535 | |
536 | /* byteslice pre-whitened blocks and store to temporary memory */ |
537 | #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
538 | y6, y7, mem_ab, mem_cd) \ |
539 | byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ |
540 | y4, y5, y6, y7, (mem_ab), (mem_cd)); \ |
541 | \ |
542 | vmovdqu x0, 0 * 32(mem_ab); \ |
543 | vmovdqu x1, 1 * 32(mem_ab); \ |
544 | vmovdqu x2, 2 * 32(mem_ab); \ |
545 | vmovdqu x3, 3 * 32(mem_ab); \ |
546 | vmovdqu x4, 4 * 32(mem_ab); \ |
547 | vmovdqu x5, 5 * 32(mem_ab); \ |
548 | vmovdqu x6, 6 * 32(mem_ab); \ |
549 | vmovdqu x7, 7 * 32(mem_ab); \ |
550 | vmovdqu y0, 0 * 32(mem_cd); \ |
551 | vmovdqu y1, 1 * 32(mem_cd); \ |
552 | vmovdqu y2, 2 * 32(mem_cd); \ |
553 | vmovdqu y3, 3 * 32(mem_cd); \ |
554 | vmovdqu y4, 4 * 32(mem_cd); \ |
555 | vmovdqu y5, 5 * 32(mem_cd); \ |
556 | vmovdqu y6, 6 * 32(mem_cd); \ |
557 | vmovdqu y7, 7 * 32(mem_cd); |
558 | |
559 | /* de-byteslice, apply post-whitening and store blocks */ |
560 | #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ |
561 | y5, y6, y7, key, stack_tmp0, stack_tmp1) \ |
562 | byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ |
563 | y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ |
564 | \ |
565 | vmovdqu x0, stack_tmp0; \ |
566 | \ |
567 | vpbroadcastq key, x0; \ |
568 | vpshufb .Lpack_bswap(%rip), x0, x0; \ |
569 | \ |
570 | vpxor x0, y7, y7; \ |
571 | vpxor x0, y6, y6; \ |
572 | vpxor x0, y5, y5; \ |
573 | vpxor x0, y4, y4; \ |
574 | vpxor x0, y3, y3; \ |
575 | vpxor x0, y2, y2; \ |
576 | vpxor x0, y1, y1; \ |
577 | vpxor x0, y0, y0; \ |
578 | vpxor x0, x7, x7; \ |
579 | vpxor x0, x6, x6; \ |
580 | vpxor x0, x5, x5; \ |
581 | vpxor x0, x4, x4; \ |
582 | vpxor x0, x3, x3; \ |
583 | vpxor x0, x2, x2; \ |
584 | vpxor x0, x1, x1; \ |
585 | vpxor stack_tmp0, x0, x0; |
586 | |
587 | #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ |
588 | y6, y7, rio) \ |
589 | vmovdqu x0, 0 * 32(rio); \ |
590 | vmovdqu x1, 1 * 32(rio); \ |
591 | vmovdqu x2, 2 * 32(rio); \ |
592 | vmovdqu x3, 3 * 32(rio); \ |
593 | vmovdqu x4, 4 * 32(rio); \ |
594 | vmovdqu x5, 5 * 32(rio); \ |
595 | vmovdqu x6, 6 * 32(rio); \ |
596 | vmovdqu x7, 7 * 32(rio); \ |
597 | vmovdqu y0, 8 * 32(rio); \ |
598 | vmovdqu y1, 9 * 32(rio); \ |
599 | vmovdqu y2, 10 * 32(rio); \ |
600 | vmovdqu y3, 11 * 32(rio); \ |
601 | vmovdqu y4, 12 * 32(rio); \ |
602 | vmovdqu y5, 13 * 32(rio); \ |
603 | vmovdqu y6, 14 * 32(rio); \ |
604 | vmovdqu y7, 15 * 32(rio); |
605 | |
606 | |
607 | .section .rodata.cst32.shufb_16x16b, "aM" , @progbits, 32 |
608 | .align 32 |
609 | #define SHUFB_BYTES(idx) \ |
610 | 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) |
611 | .Lshufb_16x16b: |
612 | .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) |
613 | .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) |
614 | |
615 | .section .rodata.cst32.pack_bswap, "aM" , @progbits, 32 |
616 | .align 32 |
617 | .Lpack_bswap: |
618 | .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 |
619 | .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 |
620 | |
/* NB: section is mergeable, all elements must be 16-byte aligned blocks */
622 | .section .rodata.cst16, "aM" , @progbits, 16 |
623 | .align 16 |
624 | |
625 | /* |
626 | * pre-SubByte transform |
627 | * |
628 | * pre-lookup for sbox1, sbox2, sbox3: |
629 | * swap_bitendianness( |
630 | * isom_map_camellia_to_aes( |
631 | * camellia_f( |
 * swap_bitendianness(in)
633 | * ) |
634 | * ) |
635 | * ) |
636 | * |
637 | * (note: '⊕ 0xc5' inside camellia_f()) |
638 | */ |
639 | .Lpre_tf_lo_s1: |
640 | .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 |
641 | .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 |
642 | .Lpre_tf_hi_s1: |
643 | .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a |
644 | .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 |
645 | |
646 | /* |
647 | * pre-SubByte transform |
648 | * |
649 | * pre-lookup for sbox4: |
650 | * swap_bitendianness( |
651 | * isom_map_camellia_to_aes( |
652 | * camellia_f( |
 * swap_bitendianness(in <<< 1)
654 | * ) |
655 | * ) |
656 | * ) |
657 | * |
658 | * (note: '⊕ 0xc5' inside camellia_f()) |
659 | */ |
660 | .Lpre_tf_lo_s4: |
661 | .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 |
662 | .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 |
663 | .Lpre_tf_hi_s4: |
664 | .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 |
665 | .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf |
666 | |
667 | /* |
668 | * post-SubByte transform |
669 | * |
670 | * post-lookup for sbox1, sbox4: |
671 | * swap_bitendianness( |
672 | * camellia_h( |
673 | * isom_map_aes_to_camellia( |
674 | * swap_bitendianness( |
675 | * aes_inverse_affine_transform(in) |
676 | * ) |
677 | * ) |
678 | * ) |
679 | * ) |
680 | * |
681 | * (note: '⊕ 0x6e' inside camellia_h()) |
682 | */ |
683 | .Lpost_tf_lo_s1: |
684 | .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 |
685 | .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 |
686 | .Lpost_tf_hi_s1: |
687 | .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 |
688 | .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c |
689 | |
690 | /* |
691 | * post-SubByte transform |
692 | * |
693 | * post-lookup for sbox2: |
694 | * swap_bitendianness( |
695 | * camellia_h( |
696 | * isom_map_aes_to_camellia( |
697 | * swap_bitendianness( |
698 | * aes_inverse_affine_transform(in) |
699 | * ) |
700 | * ) |
701 | * ) |
702 | * ) <<< 1 |
703 | * |
704 | * (note: '⊕ 0x6e' inside camellia_h()) |
705 | */ |
706 | .Lpost_tf_lo_s2: |
707 | .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 |
708 | .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 |
709 | .Lpost_tf_hi_s2: |
710 | .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 |
711 | .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 |
712 | |
713 | /* |
714 | * post-SubByte transform |
715 | * |
716 | * post-lookup for sbox3: |
717 | * swap_bitendianness( |
718 | * camellia_h( |
719 | * isom_map_aes_to_camellia( |
720 | * swap_bitendianness( |
721 | * aes_inverse_affine_transform(in) |
722 | * ) |
723 | * ) |
724 | * ) |
725 | * ) >>> 1 |
726 | * |
727 | * (note: '⊕ 0x6e' inside camellia_h()) |
728 | */ |
729 | .Lpost_tf_lo_s3: |
730 | .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 |
731 | .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 |
732 | .Lpost_tf_hi_s3: |
733 | .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 |
734 | .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 |
735 | |
/* Inverse ShiftRows permutation, for isolating SubBytes from AESENCLAST */
737 | .Linv_shift_row: |
738 | .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b |
739 | .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 |
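
/*
 * AESENCLAST computes SubBytes, ShiftRows and AddRoundKey. Since SubBytes
 * commutes with byte permutations, shuffling the input with the inverse
 * ShiftRows permutation above and using an all-zero round key reduces it
 * to plain SubBytes:
 *
 *	aesenclast(inv_shift_rows(x), 0) == subbytes(x)
 *
 * which is how the round macros evaluate the Camellia s-boxes through
 * their affine mapping to the AES s-box.
 */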
740 | |
741 | .section .rodata.cst4.L0f0f0f0f, "aM" , @progbits, 4 |
742 | .align 4 |
743 | /* 4-bit mask */ |
744 | .L0f0f0f0f: |
745 | .long 0x0f0f0f0f |
746 | |
747 | .text |
748 | |
749 | SYM_FUNC_START_LOCAL(__camellia_enc_blk32) |
750 | /* input: |
751 | * %rdi: ctx, CTX |
752 | * %rax: temporary storage, 512 bytes |
753 | * %ymm0..%ymm15: 32 plaintext blocks |
754 | * output: |
755 | * %ymm0..%ymm15: 32 encrypted blocks, order swapped: |
 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
757 | */ |
758 | FRAME_BEGIN |
759 | |
760 | leaq 8 * 32(%rax), %rcx; |
761 | |
762 | inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
763 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
764 | %ymm15, %rax, %rcx); |
765 | |
766 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
767 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
768 | %ymm15, %rax, %rcx, 0); |
769 | |
770 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
771 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
772 | %ymm15, |
773 | ((key_table + (8) * 8) + 0)(CTX), |
774 | ((key_table + (8) * 8) + 4)(CTX), |
775 | ((key_table + (8) * 8) + 8)(CTX), |
776 | ((key_table + (8) * 8) + 12)(CTX)); |
777 | |
778 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
779 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
780 | %ymm15, %rax, %rcx, 8); |
781 | |
782 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
783 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
784 | %ymm15, |
785 | ((key_table + (16) * 8) + 0)(CTX), |
786 | ((key_table + (16) * 8) + 4)(CTX), |
787 | ((key_table + (16) * 8) + 8)(CTX), |
788 | ((key_table + (16) * 8) + 12)(CTX)); |
789 | |
790 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
791 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
792 | %ymm15, %rax, %rcx, 16); |
793 | |
794 | movl $24, %r8d; |
795 | cmpl $16, key_length(CTX); |
796 | jne .Lenc_max32; |
797 | |
798 | .Lenc_done: |
799 | /* load CD for output */ |
800 | vmovdqu 0 * 32(%rcx), %ymm8; |
801 | vmovdqu 1 * 32(%rcx), %ymm9; |
802 | vmovdqu 2 * 32(%rcx), %ymm10; |
803 | vmovdqu 3 * 32(%rcx), %ymm11; |
804 | vmovdqu 4 * 32(%rcx), %ymm12; |
805 | vmovdqu 5 * 32(%rcx), %ymm13; |
806 | vmovdqu 6 * 32(%rcx), %ymm14; |
807 | vmovdqu 7 * 32(%rcx), %ymm15; |
808 | |
809 | outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
810 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
811 | %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); |
812 | |
813 | FRAME_END |
814 | RET; |
815 | |
816 | .align 8 |
817 | .Lenc_max32: |
818 | movl $32, %r8d; |
819 | |
820 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
821 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
822 | %ymm15, |
823 | ((key_table + (24) * 8) + 0)(CTX), |
824 | ((key_table + (24) * 8) + 4)(CTX), |
825 | ((key_table + (24) * 8) + 8)(CTX), |
826 | ((key_table + (24) * 8) + 12)(CTX)); |
827 | |
828 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
829 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
830 | %ymm15, %rax, %rcx, 24); |
831 | |
832 | jmp .Lenc_done; |
833 | SYM_FUNC_END(__camellia_enc_blk32) |
834 | |
835 | SYM_FUNC_START_LOCAL(__camellia_dec_blk32) |
836 | /* input: |
837 | * %rdi: ctx, CTX |
838 | * %rax: temporary storage, 512 bytes |
 * %r8d: 24 for 16-byte key, 32 for larger
 * %ymm0..%ymm15: 32 encrypted blocks
 * output:
 * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
844 | */ |
845 | FRAME_BEGIN |
846 | |
847 | leaq 8 * 32(%rax), %rcx; |
848 | |
849 | inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
850 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
851 | %ymm15, %rax, %rcx); |
852 | |
853 | cmpl $32, %r8d; |
854 | je .Ldec_max32; |
855 | |
856 | .Ldec_max24: |
857 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
858 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
859 | %ymm15, %rax, %rcx, 16); |
860 | |
861 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
862 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
863 | %ymm15, |
864 | ((key_table + (16) * 8) + 8)(CTX), |
865 | ((key_table + (16) * 8) + 12)(CTX), |
866 | ((key_table + (16) * 8) + 0)(CTX), |
867 | ((key_table + (16) * 8) + 4)(CTX)); |
868 | |
869 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
870 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
871 | %ymm15, %rax, %rcx, 8); |
872 | |
873 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
874 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
875 | %ymm15, |
876 | ((key_table + (8) * 8) + 8)(CTX), |
877 | ((key_table + (8) * 8) + 12)(CTX), |
878 | ((key_table + (8) * 8) + 0)(CTX), |
879 | ((key_table + (8) * 8) + 4)(CTX)); |
880 | |
881 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
882 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
883 | %ymm15, %rax, %rcx, 0); |
884 | |
885 | /* load CD for output */ |
886 | vmovdqu 0 * 32(%rcx), %ymm8; |
887 | vmovdqu 1 * 32(%rcx), %ymm9; |
888 | vmovdqu 2 * 32(%rcx), %ymm10; |
889 | vmovdqu 3 * 32(%rcx), %ymm11; |
890 | vmovdqu 4 * 32(%rcx), %ymm12; |
891 | vmovdqu 5 * 32(%rcx), %ymm13; |
892 | vmovdqu 6 * 32(%rcx), %ymm14; |
893 | vmovdqu 7 * 32(%rcx), %ymm15; |
894 | |
895 | outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
896 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
897 | %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); |
898 | |
899 | FRAME_END |
900 | RET; |
901 | |
902 | .align 8 |
903 | .Ldec_max32: |
904 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
905 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
906 | %ymm15, %rax, %rcx, 24); |
907 | |
908 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
909 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
910 | %ymm15, |
911 | ((key_table + (24) * 8) + 8)(CTX), |
912 | ((key_table + (24) * 8) + 12)(CTX), |
913 | ((key_table + (24) * 8) + 0)(CTX), |
914 | ((key_table + (24) * 8) + 4)(CTX)); |
915 | |
916 | jmp .Ldec_max24; |
917 | SYM_FUNC_END(__camellia_dec_blk32) |
918 | |
919 | SYM_FUNC_START(camellia_ecb_enc_32way) |
920 | /* input: |
921 | * %rdi: ctx, CTX |
922 | * %rsi: dst (32 blocks) |
923 | * %rdx: src (32 blocks) |
924 | */ |
925 | FRAME_BEGIN |
926 | |
927 | vzeroupper; |
928 | |
929 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
930 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
931 | %ymm15, %rdx, (key_table)(CTX)); |
932 | |
933 | /* now dst can be used as temporary buffer (even in src == dst case) */ |
934 | movq %rsi, %rax; |
935 | |
936 | call __camellia_enc_blk32; |
937 | |
938 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, |
939 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, |
940 | %ymm8, %rsi); |
941 | |
942 | vzeroupper; |
943 | |
944 | FRAME_END |
945 | RET; |
946 | SYM_FUNC_END(camellia_ecb_enc_32way) |
947 | |
948 | SYM_FUNC_START(camellia_ecb_dec_32way) |
949 | /* input: |
950 | * %rdi: ctx, CTX |
951 | * %rsi: dst (32 blocks) |
952 | * %rdx: src (32 blocks) |
953 | */ |
954 | FRAME_BEGIN |
955 | |
956 | vzeroupper; |
957 | |
958 | cmpl $16, key_length(CTX); |
959 | movl $32, %r8d; |
960 | movl $24, %eax; |
961 | cmovel %eax, %r8d; /* max */ |
962 | |
963 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
964 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
965 | %ymm15, %rdx, (key_table)(CTX, %r8, 8)); |
966 | |
967 | /* now dst can be used as temporary buffer (even in src == dst case) */ |
968 | movq %rsi, %rax; |
969 | |
970 | call __camellia_dec_blk32; |
971 | |
972 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, |
973 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, |
974 | %ymm8, %rsi); |
975 | |
976 | vzeroupper; |
977 | |
978 | FRAME_END |
979 | RET; |
980 | SYM_FUNC_END(camellia_ecb_dec_32way) |
981 | |
982 | SYM_FUNC_START(camellia_cbc_dec_32way) |
983 | /* input: |
984 | * %rdi: ctx, CTX |
985 | * %rsi: dst (32 blocks) |
986 | * %rdx: src (32 blocks) |
987 | */ |
988 | FRAME_BEGIN |
989 | subq $(16 * 32), %rsp; |
990 | |
991 | vzeroupper; |
992 | |
993 | cmpl $16, key_length(CTX); |
994 | movl $32, %r8d; |
995 | movl $24, %eax; |
996 | cmovel %eax, %r8d; /* max */ |
997 | |
998 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, |
999 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, |
1000 | %ymm15, %rdx, (key_table)(CTX, %r8, 8)); |
1001 | |
1002 | cmpq %rsi, %rdx; |
1003 | je .Lcbc_dec_use_stack; |
1004 | |
1005 | /* dst can be used as temporary storage, src is not overwritten. */ |
1006 | movq %rsi, %rax; |
1007 | jmp .Lcbc_dec_continue; |
1008 | |
1009 | .Lcbc_dec_use_stack: |
1010 | /* |
1011 | * dst still in-use (because dst == src), so use stack for temporary |
1012 | * storage. |
1013 | */ |
1014 | movq %rsp, %rax; |
1015 | |
1016 | .Lcbc_dec_continue: |
1017 | call __camellia_dec_blk32; |
1018 | |
1019 | vmovdqu %ymm7, (%rax); |
1020 | vpxor %ymm7, %ymm7, %ymm7; |
1021 | vinserti128 $1, (%rdx), %ymm7, %ymm7; |
1022 | vpxor (%rax), %ymm7, %ymm7; |
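	/*
	 * CBC chaining: block n is XORed with ciphertext block n - 1. The
	 * vinserti128 sequence above shifts the ciphertext stream by one
	 * block within %ymm7, so its low 128-bit block (block 0) is XORed
	 * with zero; XORing block 0 with the IV is left to the caller.
	 */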
1023 | vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; |
1024 | vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; |
1025 | vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; |
1026 | vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; |
1027 | vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; |
1028 | vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; |
1029 | vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; |
1030 | vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; |
1031 | vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; |
1032 | vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; |
1033 | vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; |
1034 | vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; |
1035 | vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; |
1036 | vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; |
1037 | vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; |
1038 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, |
1039 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, |
1040 | %ymm8, %rsi); |
1041 | |
1042 | vzeroupper; |
1043 | |
1044 | addq $(16 * 32), %rsp; |
1045 | FRAME_END |
1046 | RET; |
1047 | SYM_FUNC_END(camellia_cbc_dec_32way) |
1048 | |