1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * Fast AES implementation for SPE instruction set (PPC) |
4 | * |
5 | * This code makes use of the SPE SIMD instruction set as defined in |
6 | * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf |
7 | * Implementation is based on optimization guide notes from |
8 | * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf |
9 | * |
10 | * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> |
11 | */ |
12 | |
13 | #include <asm/ppc_asm.h> |
14 | #include "aes-spe-regs.h" |
15 | |
/*
 * EAD(in, bpos): index step for lookups made through rT0.
 * rlwimi rotates `in` left and inserts bits 20..27 (IBM numbering, i.e.
 * mask 0x0ff0) of the rotated value into rT0; every other bit of rT0 is
 * kept. The rotate count 28-((bpos+3)%4)*8 evaluates to 4, 28, 20, 12 for
 * bpos = 0..3, which is (4 - 8*bpos) mod 32 written without a negative
 * operand. The effect is that byte number `bpos` of `in`, counted from the
 * least-significant byte, lands in rT0 as an index scaled by 16.
 * NOTE(review): presumably rT0 holds a table base whose bits 20..27 are
 * free (e.g. 4 KiB aligned) - confirm against the table definition.
 */
16 | #define EAD(in, bpos) \ |
17 | rlwimi rT0,in,28-((bpos+3)%4)*8,20,27; |
18 | 
/*
 * DAD(in, bpos): index step for lookups made through rT1.
 * Same byte selection as EAD (rotate counts 0, 24, 16, 8 for bpos = 0..3),
 * but the byte is inserted unscaled into bits 24..31 (mask 0x00ff) of rT1,
 * leaving the upper 24 bits of rT1 untouched.
 * NOTE(review): presumably rT1 holds the base of a 256-entry byte table
 * aligned to 256 bytes - confirm against the table definition.
 */
19 | #define DAD(in, bpos) \ |
20 | rlwimi rT1,in,24-((bpos+3)%4)*8,24,31; |
21 | |
/*
 * LWH(out, off): load the 32-bit word at off(rT0) with evlwwsplat, which
 * places the word in both halves of the 64-bit register `out`. In this
 * file it is paired with a later LWL on the same register, which then
 * supplies the low half (the code evidently relies on lwz leaving the
 * high half intact).
 */
22 | #define LWH(out, off) \ |
23 | evlwwsplat out,off(rT0); /* load word high */ |
24 | 
/*
 * LWL(out, off): plain 32-bit load of the word at off(rT0) into `out`.
 */
25 | #define LWL(out, off) \ |
26 | lwz out,off(rT0); /* load word low */ |
27 | 
/*
 * LBZ(out, tab, off): zero-extending byte load from off(tab). Unlike LWH
 * and LWL the base register is a macro argument rather than fixed to rT0.
 */
28 | #define LBZ(out, tab, off) \ |
29 | lbz out,off(tab); /* load byte */ |
30 | |
/*
 * Composite helpers. The LA* forms first update the index register from
 * byte `bpos` of `in` (EAD for rT0, DAD for rT1) and then load; the LB*
 * forms only load and therefore reuse whatever index the most recent
 * EAD/DAD left behind.
 */

/* LAH: EAD + LWH - set rT0 index, then load word (splat) from off(rT0). */
31 | #define LAH(out, in, bpos, off) \ |
32 | EAD(in, bpos) /* calc addr + load word high */ \ |
33 | LWH(out, off) |
34 | 
/* LAL: EAD + LWL - set rT0 index, then lwz from off(rT0). */
35 | #define LAL(out, in, bpos, off) \ |
36 | EAD(in, bpos) /* calc addr + load word low */ \ |
37 | LWL(out, off) |
38 | 
/* LAE: EAD + byte load from fixed offset 8 of the entry addressed by rT0. */
39 | #define LAE(out, in, bpos) \ |
40 | EAD(in, bpos) /* calc addr + load enc byte */ \ |
41 | LBZ(out, rT0, 8) |
42 | 
/* LBE: byte load from 8(rT0) without touching rT0 (index set earlier). */
43 | #define LBE(out) \ |
44 | LBZ(out, rT0, 8) /* load enc byte */ |
45 | 
/* LAD: DAD + byte load from 0(rT1). */
46 | #define LAD(out, in, bpos) \ |
47 | DAD(in, bpos) /* calc addr + load dec byte */ \ |
48 | LBZ(out, rT1, 0) |
49 | 
/* LBD: byte load from 0(rT1) without touching rT1 (load dec byte). */
50 | #define LBD(out) \ |
51 | LBZ(out, rT1, 0) |
52 | |
53 | /* |
54 | * ppc_encrypt_block: The central encryption function for a single 16 bytes |
55 | * block. It does no stack handling or register saving to support fast calls |
56 | * via bl/blr. It expects that caller has pre-xored input data with first |
57 | * 4 words of encryption key into rD0-rD3. Pointer/counter registers must |
58 | * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3 |
59 | * and rW0-rW3 and caller must execute a final xor on the output registers. |
60 | * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing. |
61 | * |
62 | */ |
/*
 * Structure, as read from the code below:
 *  - Three table loads (rW4, rW6, rW3) are issued before the loop; each
 *    half of the loop body ends by issuing the same three loads again
 *    for the half that follows.
 *  - One loop pass consumes key material at 16/24(rKP) and then at
 *    32/40(rKP), adds 32 to rKP, and bdnz repeats it while the
 *    decremented CTR is non-zero (presumably two AES rounds per pass).
 *  - After the loop one more table-based step uses 16/24(rKP); its tail
 *    is interleaved with the first byte lookups of the last step.
 *  - The last step gathers single bytes from offset 8 of the table
 *    entries and merges them into rW0-rW3 with rlwimi, while the words
 *    at 32..44(rKP) are loaded into rD0-rD3. The xor of rW0-rW3 with
 *    rD0-rD3 is not done here; the header leaves it to the caller.
 *  - rT0 is rewritten by every EAD, so each load must stay directly
 *    paired with the EAD that precedes it.
 */
63 | _GLOBAL(ppc_encrypt_block) |
/* Pre-issue the three loads that the first loop half expects in flight. */
64 | LAH(rW4, rD1, 2, 4) |
65 | LAH(rW6, rD0, 3, 0) |
66 | LAH(rW3, rD0, 1, 8) |
67 | ppc_encrypt_block_loop: |
/* Loop, first half: remaining table lookups for the current rD0-rD3. */
68 | LAH(rW0, rD3, 0, 12) |
69 | LAL(rW0, rD0, 0, 12) |
70 | LAH(rW1, rD1, 0, 12) |
71 | LAH(rW2, rD2, 1, 8) |
72 | LAL(rW2, rD3, 1, 8) |
73 | LAL(rW3, rD1, 1, 8) |
74 | LAL(rW4, rD2, 2, 4) |
75 | LAL(rW6, rD1, 3, 0) |
76 | LAH(rW5, rD3, 2, 4) |
77 | LAL(rW5, rD0, 2, 4) |
78 | LAH(rW7, rD2, 3, 0) |
/*
 * Fold the lookups into the 64-bit key values loaded from 16(rKP) and
 * 24(rKP); the last two lookups of this half and the first three of the
 * next half are interleaved with the xors.
 */
79 | evldw rD1,16(rKP) |
80 | EAD(rD3, 3) |
81 | evxor rW2,rW2,rW4 |
82 | LWL(rW7, 0) |
83 | evxor rW2,rW2,rW6 |
84 | EAD(rD2, 0) |
85 | evxor rD1,rD1,rW2 |
86 | LWL(rW1, 12) |
87 | evxor rD1,rD1,rW0 |
88 | evldw rD3,24(rKP) |
/* evmergehi: low half of rD0 <- high half of rD1 (high half of rD0 kept). */
89 | evmergehi rD0,rD0,rD1 |
90 | EAD(rD1, 2) |
91 | evxor rW3,rW3,rW5 |
92 | LWH(rW4, 4) |
93 | evxor rW3,rW3,rW7 |
94 | EAD(rD0, 3) |
95 | evxor rD3,rD3,rW3 |
96 | LWH(rW6, 0) |
97 | evxor rD3,rD3,rW1 |
98 | EAD(rD0, 1) |
99 | evmergehi rD2,rD2,rD3 |
100 | LWH(rW3, 8) |
/* Loop, second half: same sequence, key values at 32(rKP) and 40(rKP). */
101 | LAH(rW0, rD3, 0, 12) |
102 | LAL(rW0, rD0, 0, 12) |
103 | LAH(rW1, rD1, 0, 12) |
104 | LAH(rW2, rD2, 1, 8) |
105 | LAL(rW2, rD3, 1, 8) |
106 | LAL(rW3, rD1, 1, 8) |
107 | LAL(rW4, rD2, 2, 4) |
108 | LAL(rW6, rD1, 3, 0) |
109 | LAH(rW5, rD3, 2, 4) |
110 | LAL(rW5, rD0, 2, 4) |
111 | LAH(rW7, rD2, 3, 0) |
112 | evldw rD1,32(rKP) |
113 | EAD(rD3, 3) |
114 | evxor rW2,rW2,rW4 |
115 | LWL(rW7, 0) |
116 | evxor rW2,rW2,rW6 |
117 | EAD(rD2, 0) |
118 | evxor rD1,rD1,rW2 |
119 | LWL(rW1, 12) |
120 | evxor rD1,rD1,rW0 |
121 | evldw rD3,40(rKP) |
122 | evmergehi rD0,rD0,rD1 |
123 | EAD(rD1, 2) |
124 | evxor rW3,rW3,rW5 |
125 | LWH(rW4, 4) |
126 | evxor rW3,rW3,rW7 |
127 | EAD(rD0, 3) |
128 | evxor rD3,rD3,rW3 |
129 | LWH(rW6, 0) |
130 | evxor rD3,rD3,rW1 |
131 | EAD(rD0, 1) |
132 | evmergehi rD2,rD2,rD3 |
133 | LWH(rW3, 8) |
/* Step rKP past the 32 bytes of key material just used; loop on CTR. */
134 | addi rKP,rKP,32 |
135 | bdnz ppc_encrypt_block_loop |
/* After the loop: one more table-based step, key values at 16/24(rKP). */
136 | LAH(rW0, rD3, 0, 12) |
137 | LAL(rW0, rD0, 0, 12) |
138 | LAH(rW1, rD1, 0, 12) |
139 | LAH(rW2, rD2, 1, 8) |
140 | LAL(rW2, rD3, 1, 8) |
141 | LAL(rW3, rD1, 1, 8) |
142 | LAL(rW4, rD2, 2, 4) |
143 | LAH(rW5, rD3, 2, 4) |
144 | LAL(rW6, rD1, 3, 0) |
145 | LAL(rW5, rD0, 2, 4) |
146 | LAH(rW7, rD2, 3, 0) |
147 | evldw rD1,16(rKP) |
148 | EAD(rD3, 3) |
149 | evxor rW2,rW2,rW4 |
150 | LWL(rW7, 0) |
151 | evxor rW2,rW2,rW6 |
152 | EAD(rD2, 0) |
153 | evxor rD1,rD1,rW2 |
154 | LWL(rW1, 12) |
155 | evxor rD1,rD1,rW0 |
156 | evldw rD3,24(rKP) |
157 | evmergehi rD0,rD0,rD1 |
/*
 * From here the interleaved loads are byte loads (LBE, offset 8 of the
 * table entry) that already belong to the last step.
 */
158 | EAD(rD1, 0) |
159 | evxor rW3,rW3,rW5 |
160 | LBE(rW2) |
161 | evxor rW3,rW3,rW7 |
162 | EAD(rD0, 1) |
163 | evxor rD3,rD3,rW3 |
164 | LBE(rW6) |
165 | evxor rD3,rD3,rW1 |
166 | EAD(rD0, 0) |
167 | evmergehi rD2,rD2,rD3 |
/*
 * NOTE(review): this loads rW1 from the rD0 / bpos 0 entry, and the
 * LAE(rW1, rD0, 0) two lines below loads the same byte again; looks
 * redundant but harmless - confirm before touching the schedule.
 */
168 | LBE(rW1) |
/* Last step: gather the remaining bytes, one byte lane at a time. */
169 | LAE(rW0, rD3, 0) |
170 | LAE(rW1, rD0, 0) |
171 | LAE(rW4, rD2, 1) |
172 | LAE(rW5, rD3, 1) |
173 | LAE(rW3, rD2, 0) |
174 | LAE(rW7, rD1, 1) |
/* Merge lane 1: rotate left 8 and insert into bits 16..23 (0x0000ff00). */
175 | rlwimi rW0,rW4,8,16,23 |
176 | rlwimi rW1,rW5,8,16,23 |
177 | LAE(rW4, rD1, 2) |
178 | LAE(rW5, rD2, 2) |
179 | rlwimi rW2,rW6,8,16,23 |
180 | rlwimi rW3,rW7,8,16,23 |
181 | LAE(rW6, rD3, 2) |
182 | LAE(rW7, rD0, 2) |
/* Merge lane 2: rotate left 16 and insert into bits 8..15 (0x00ff0000). */
183 | rlwimi rW0,rW4,16,8,15 |
184 | rlwimi rW1,rW5,16,8,15 |
185 | LAE(rW4, rD0, 3) |
186 | LAE(rW5, rD1, 3) |
187 | rlwimi rW2,rW6,16,8,15 |
/*
 * Each rDn is overwritten with a key word from 32..44(rKP) only after
 * its last use as a lookup source above/below.
 */
188 | lwz rD0,32(rKP) |
189 | rlwimi rW3,rW7,16,8,15 |
190 | lwz rD1,36(rKP) |
191 | LAE(rW6, rD2, 3) |
192 | LAE(rW7, rD3, 3) |
/* Merge lane 3: rotate left 24 and insert into bits 0..7 (0xff000000). */
193 | rlwimi rW0,rW4,24,0,7 |
194 | lwz rD2,40(rKP) |
195 | rlwimi rW1,rW5,24,0,7 |
196 | lwz rD3,44(rKP) |
197 | rlwimi rW2,rW6,24,0,7 |
198 | rlwimi rW3,rW7,24,0,7 |
/* Return with rW0-rW3 and rD0-rD3 still un-xored (caller's job). */
199 | blr |
200 | |
201 | /* |
202 | * ppc_decrypt_block: The central decryption function for a single 16 bytes |
203 | * block. It does no stack handling or register saving to support fast calls |
204 | * via bl/blr. It expects that caller has pre-xored input data with first |
205 | * 4 words of encryption key into rD0-rD3. Pointer/counter registers must |
206 | * have also been set up before (rT0, rT1, rKP, CTR). Output is stored in rD0-rD3 |
207 | * and rW0-rW3 and caller must execute a final xor on the output registers. |
208 | * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing. |
209 | * |
210 | */ |
/*
 * Structure, as read from the code below:
 *  - Three table loads (rW0, rW6, rW3) are issued before the loop; each
 *    half of the loop body ends by issuing the same three loads again
 *    for the half that follows.
 *  - One loop pass consumes key material at 16/24(rKP) and then at
 *    32/40(rKP), adds 32 to rKP, and bdnz repeats it while the
 *    decremented CTR is non-zero (presumably two AES rounds per pass).
 *  - After the loop one more table-based step uses 16/24(rKP); its tail
 *    is interleaved with the first byte lookups of the last step.
 *  - The last step reads single bytes through rT1 (DAD/LAD/LBD) rather
 *    than rT0, which is why rT1 must hold a valid base on entry. The
 *    bytes are merged into rW0-rW3 with rlwimi while the words at
 *    32..44(rKP) are loaded into rD0-rD3; the closing xor is left to
 *    the caller.
 * NOTE(review): the header speaks of the "encryption key"; which key
 * schedule rKP addresses here is decided by the caller and is not
 * visible in this file - confirm against the call sites.
 */
211 | _GLOBAL(ppc_decrypt_block) |
/* Pre-issue the three loads that the first loop half expects in flight. */
212 | LAH(rW0, rD1, 0, 12) |
213 | LAH(rW6, rD0, 3, 0) |
214 | LAH(rW3, rD0, 1, 8) |
215 | ppc_decrypt_block_loop: |
/* Loop, first half: remaining table lookups for the current rD0-rD3. */
216 | LAH(rW1, rD3, 0, 12) |
217 | LAL(rW0, rD2, 0, 12) |
218 | LAH(rW2, rD2, 1, 8) |
219 | LAL(rW2, rD3, 1, 8) |
220 | LAH(rW4, rD3, 2, 4) |
221 | LAL(rW4, rD0, 2, 4) |
222 | LAL(rW6, rD1, 3, 0) |
223 | LAH(rW5, rD1, 2, 4) |
224 | LAH(rW7, rD2, 3, 0) |
225 | LAL(rW7, rD3, 3, 0) |
226 | LAL(rW3, rD1, 1, 8) |
/*
 * Fold the lookups into the 64-bit key values loaded from 16(rKP) and
 * 24(rKP); the last two lookups of this half and the first three of the
 * next half are interleaved with the xors.
 */
227 | evldw rD1,16(rKP) |
228 | EAD(rD0, 0) |
229 | evxor rW4,rW4,rW6 |
230 | LWL(rW1, 12) |
231 | evxor rW0,rW0,rW4 |
232 | EAD(rD2, 2) |
233 | evxor rW0,rW0,rW2 |
234 | LWL(rW5, 4) |
235 | evxor rD1,rD1,rW0 |
236 | evldw rD3,24(rKP) |
/* evmergehi: low half of rD0 <- high half of rD1 (high half of rD0 kept). */
237 | evmergehi rD0,rD0,rD1 |
238 | EAD(rD1, 0) |
239 | evxor rW3,rW3,rW7 |
240 | LWH(rW0, 12) |
241 | evxor rW3,rW3,rW1 |
242 | EAD(rD0, 3) |
243 | evxor rD3,rD3,rW3 |
244 | LWH(rW6, 0) |
245 | evxor rD3,rD3,rW5 |
246 | EAD(rD0, 1) |
247 | evmergehi rD2,rD2,rD3 |
248 | LWH(rW3, 8) |
/* Loop, second half: same sequence, key values at 32(rKP) and 40(rKP). */
249 | LAH(rW1, rD3, 0, 12) |
250 | LAL(rW0, rD2, 0, 12) |
251 | LAH(rW2, rD2, 1, 8) |
252 | LAL(rW2, rD3, 1, 8) |
253 | LAH(rW4, rD3, 2, 4) |
254 | LAL(rW4, rD0, 2, 4) |
255 | LAL(rW6, rD1, 3, 0) |
256 | LAH(rW5, rD1, 2, 4) |
257 | LAH(rW7, rD2, 3, 0) |
258 | LAL(rW7, rD3, 3, 0) |
259 | LAL(rW3, rD1, 1, 8) |
260 | evldw rD1,32(rKP) |
261 | EAD(rD0, 0) |
262 | evxor rW4,rW4,rW6 |
263 | LWL(rW1, 12) |
264 | evxor rW0,rW0,rW4 |
265 | EAD(rD2, 2) |
266 | evxor rW0,rW0,rW2 |
267 | LWL(rW5, 4) |
268 | evxor rD1,rD1,rW0 |
269 | evldw rD3,40(rKP) |
270 | evmergehi rD0,rD0,rD1 |
271 | EAD(rD1, 0) |
272 | evxor rW3,rW3,rW7 |
273 | LWH(rW0, 12) |
274 | evxor rW3,rW3,rW1 |
275 | EAD(rD0, 3) |
276 | evxor rD3,rD3,rW3 |
277 | LWH(rW6, 0) |
278 | evxor rD3,rD3,rW5 |
279 | EAD(rD0, 1) |
280 | evmergehi rD2,rD2,rD3 |
281 | LWH(rW3, 8) |
/* Step rKP past the 32 bytes of key material just used; loop on CTR. */
282 | addi rKP,rKP,32 |
283 | bdnz ppc_decrypt_block_loop |
/* After the loop: one more table-based step, key values at 16/24(rKP). */
284 | LAH(rW1, rD3, 0, 12) |
285 | LAL(rW0, rD2, 0, 12) |
286 | LAH(rW2, rD2, 1, 8) |
287 | LAL(rW2, rD3, 1, 8) |
288 | LAH(rW4, rD3, 2, 4) |
289 | LAL(rW4, rD0, 2, 4) |
290 | LAL(rW6, rD1, 3, 0) |
291 | LAH(rW5, rD1, 2, 4) |
292 | LAH(rW7, rD2, 3, 0) |
293 | LAL(rW7, rD3, 3, 0) |
294 | LAL(rW3, rD1, 1, 8) |
295 | evldw rD1,16(rKP) |
296 | EAD(rD0, 0) |
297 | evxor rW4,rW4,rW6 |
298 | LWL(rW1, 12) |
299 | evxor rW0,rW0,rW4 |
300 | EAD(rD2, 2) |
301 | evxor rW0,rW0,rW2 |
302 | LWL(rW5, 4) |
303 | evxor rD1,rD1,rW0 |
304 | evldw rD3,24(rKP) |
305 | evmergehi rD0,rD0,rD1 |
/*
 * From here the interleaved lookups switch to rT1: DAD sets the byte
 * index and LBD loads one byte from 0(rT1) for the last step.
 */
306 | DAD(rD1, 0) |
307 | evxor rW3,rW3,rW7 |
308 | LBD(rW0) |
309 | evxor rW3,rW3,rW1 |
310 | DAD(rD0, 1) |
311 | evxor rD3,rD3,rW3 |
312 | LBD(rW6) |
313 | evxor rD3,rD3,rW5 |
314 | DAD(rD0, 0) |
315 | evmergehi rD2,rD2,rD3 |
316 | LBD(rW3) |
/* Last step: gather the remaining bytes, one byte lane at a time. */
317 | LAD(rW2, rD3, 0) |
318 | LAD(rW1, rD2, 0) |
319 | LAD(rW4, rD2, 1) |
320 | LAD(rW5, rD3, 1) |
321 | LAD(rW7, rD1, 1) |
/* Merge lane 1: rotate left 8 and insert into bits 16..23 (0x0000ff00). */
322 | rlwimi rW0,rW4,8,16,23 |
323 | rlwimi rW1,rW5,8,16,23 |
324 | LAD(rW4, rD3, 2) |
325 | LAD(rW5, rD0, 2) |
326 | rlwimi rW2,rW6,8,16,23 |
327 | rlwimi rW3,rW7,8,16,23 |
328 | LAD(rW6, rD1, 2) |
329 | LAD(rW7, rD2, 2) |
/* Merge lane 2: rotate left 16 and insert into bits 8..15 (0x00ff0000). */
330 | rlwimi rW0,rW4,16,8,15 |
331 | rlwimi rW1,rW5,16,8,15 |
332 | LAD(rW4, rD0, 3) |
333 | LAD(rW5, rD1, 3) |
334 | rlwimi rW2,rW6,16,8,15 |
/*
 * Each rDn is overwritten with a key word from 32..44(rKP) only after
 * its last use as a lookup source above/below.
 */
335 | lwz rD0,32(rKP) |
336 | rlwimi rW3,rW7,16,8,15 |
337 | lwz rD1,36(rKP) |
338 | LAD(rW6, rD2, 3) |
339 | LAD(rW7, rD3, 3) |
/* Merge lane 3: rotate left 24 and insert into bits 0..7 (0xff000000). */
340 | rlwimi rW0,rW4,24,0,7 |
341 | lwz rD2,40(rKP) |
342 | rlwimi rW1,rW5,24,0,7 |
343 | lwz rD3,44(rKP) |
344 | rlwimi rW2,rW6,24,0,7 |
345 | rlwimi rW3,rW7,24,0,7 |
/* Return with rW0-rW3 and rD0-rD3 still un-xored (caller's job). */
346 | blr |
347 | |