1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Fast AES implementation for SPE instruction set (PPC)
4 *
5 * This code makes use of the SPE SIMD instruction set as defined in
6 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
7 * Implementation is based on optimization guide notes from
8 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
9 *
10 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
11 */
12
13#include <asm/ppc_asm.h>
14#include "aes-spe-regs.h"
15
/*
 * EAD(in, bpos): put one byte of `in` into the index field of rT0.
 * The rotate count moves byte `bpos` of the 32-bit value in `in`
 * (0 = least significant byte ... 3 = most significant byte) into
 * bits 20-27 (PowerPC numbering, bit 0 = MSB) and rlwimi replaces only
 * those bits of rT0. Bits 20-27 have weight 16, so the byte acts as an
 * index with a 16 byte stride; every other bit of rT0 is kept.
 * NOTE(review): presumably rT0 holds a suitably aligned table base set
 * up by the caller - confirm against the callers.
 */
16#define EAD(in, bpos) \
17 rlwimi rT0,in,28-((bpos+3)%4)*8,20,27;
18
/*
 * DAD(in, bpos): put one byte of `in` into the low byte of rT1.
 * Byte `bpos` of the 32-bit value in `in` (0 = least significant byte
 * ... 3 = most significant byte) is rotated into bits 24-31 (PowerPC
 * numbering, bit 0 = MSB) and rlwimi replaces only those bits of rT1,
 * i.e. the byte is used as an unscaled index; the upper 24 bits of the
 * low word of rT1 are kept.
 * NOTE(review): presumably rT1 holds a 256 byte aligned table base set
 * up by the caller - confirm against the callers.
 */
19#define DAD(in, bpos) \
20 rlwimi rT1,in,24-((bpos+3)%4)*8,24,31;
21
/*
 * LWH(out, off): load the word at off(rT0) with evlwwsplat, which per
 * the SPE instruction definition places the loaded word in both 32-bit
 * halves of `out`. rT0 must already hold the address (see EAD).
 */
22#define LWH(out, off) \
23 evlwwsplat out,off(rT0); /* load word high */
24
/*
 * LWL(out, off): load the word at off(rT0) into `out` with a plain lwz.
 * The routines in this file issue it after an LWH on the same register
 * (e.g. LAH(rW0, ...) followed by LAL(rW0, ...)).
 * NOTE(review): that pairing presumably relies on lwz leaving the upper
 * 32 bits of the 64-bit SPE register untouched - confirm against the
 * core reference manual.
 */
25#define LWL(out, off) \
26 lwz out,off(rT0); /* load word low */
27
/*
 * LBZ(out, tab, off): load the single byte at off(tab) into `out`
 * (lbz zero-extends). `tab` is the register holding the address.
 */
28#define LBZ(out, tab, off) \
29 lbz out,off(tab); /* load byte */
30
/*
 * LAH(out, in, bpos, off): EAD followed by LWH - index rT0 with byte
 * `bpos` of `in`, then load the word at off(rT0) into both halves of
 * `out`. Modifies rT0.
 */
31#define LAH(out, in, bpos, off) \
32 EAD(in, bpos) /* calc addr + load word high */ \
33 LWH(out, off)
34
/*
 * LAL(out, in, bpos, off): EAD followed by LWL - index rT0 with byte
 * `bpos` of `in`, then load the word at off(rT0) into `out` with lwz.
 * Modifies rT0.
 */
35#define LAL(out, in, bpos, off) \
36 EAD(in, bpos) /* calc addr + load word low */ \
37 LWL(out, off)
38
/*
 * LAE(out, in, bpos): EAD followed by a byte load - index rT0 with byte
 * `bpos` of `in`, then load the byte at offset 8 of the addressed entry
 * into `out`. Used by the last pass of ppc_encrypt_block. Modifies rT0.
 */
39#define LAE(out, in, bpos) \
40 EAD(in, bpos) /* calc addr + load enc byte */ \
41 LBZ(out, rT0, 8)
42
/*
 * LBE(out): load the byte at 8(rT0) into `out` without recomputing the
 * address; an EAD must have been issued beforehand. Splitting EAD and
 * LBE lets other instructions be placed between them.
 */
43#define LBE(out) \
44 LBZ(out, rT0, 8) /* load enc byte */
45
/*
 * LAD(out, in, bpos): DAD followed by a byte load - index rT1 with byte
 * `bpos` of `in`, then load the byte at 0(rT1) into `out`. Used by the
 * last pass of ppc_decrypt_block. Modifies rT1.
 */
46#define LAD(out, in, bpos) \
47 DAD(in, bpos) /* calc addr + load dec byte */ \
48 LBZ(out, rT1, 0)
49
/*
 * LBD(out): load the byte at 0(rT1) into `out` without recomputing the
 * address; a DAD must have been issued beforehand.
 */
50#define LBD(out) \
51 LBZ(out, rT1, 0) /* load dec byte */
52
53/*
54 * ppc_encrypt_block: The central encryption function for a single 16 bytes
55 * block. It does no stack handling or register saving to support fast calls
56 * via bl/blr. It expects that caller has pre-xored input data with first
57 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
58 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
59 * and rW0-rW3 and caller must execute a final xor on the output registers.
60 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
61 *
62 */
/*
 * Structure of the routine, as visible in the code below:
 *  - three table words (rW4, rW6, rW3) are fetched ahead of the loop;
 *    the same three lookups are re-issued at the tail of every pass so
 *    the next pass starts with them already loaded,
 *  - each loop iteration runs two table-lookup passes that consume the
 *    key words at 16..31(rKP) and 32..47(rKP), then advances rKP by 32;
 *    bdnz repeats the loop while the decremented CTR is non-zero,
 *  - after the loop one more table-lookup pass uses 16..31(rKP),
 *  - the closing pass fetches single bytes (LAE/LBE, offset 8 of the
 *    table entry), packs four of them into each of rW0-rW3 with rlwimi
 *    and loads the words at 32..44(rKP) into rD0-rD3.
 * Address calculation (EAD), loads and evxor are deliberately
 * interleaved; do not reorder statements without rescheduling in mind.
 */
63_GLOBAL(ppc_encrypt_block)
 /* lookups issued ahead of the first pass */
64 LAH(rW4, rD1, 2, 4)
65 LAH(rW6, rD0, 3, 0)
66 LAH(rW3, rD0, 1, 8)
67ppc_encrypt_block_loop:
 /* pass 1 of the iteration: remaining table lookups */
68 LAH(rW0, rD3, 0, 12)
69 LAL(rW0, rD0, 0, 12)
70 LAH(rW1, rD1, 0, 12)
71 LAH(rW2, rD2, 1, 8)
72 LAL(rW2, rD3, 1, 8)
73 LAL(rW3, rD1, 1, 8)
74 LAL(rW4, rD2, 2, 4)
75 LAL(rW6, rD1, 3, 0)
76 LAH(rW5, rD3, 2, 4)
77 LAL(rW5, rD0, 2, 4)
78 LAH(rW7, rD2, 3, 0)
 /* rD1 = key[16..23] ^ rW2 ^ rW4 ^ rW6 ^ rW0 (64 bits at once) */
79 evldw rD1,16(rKP)
80 EAD(rD3, 3)
81 evxor rW2,rW2,rW4
82 LWL(rW7, 0)
83 evxor rW2,rW2,rW6
84 EAD(rD2, 0)
85 evxor rD1,rD1,rW2
86 LWL(rW1, 12)
87 evxor rD1,rD1,rW0
 /* rD3 = key[24..31] ^ rW3 ^ rW5 ^ rW7 ^ rW1 */
88 evldw rD3,24(rKP)
 /* low word of rD0 := high word of rD1 */
89 evmergehi rD0,rD0,rD1
 /* from here on the first lookups of the next pass are started */
90 EAD(rD1, 2)
91 evxor rW3,rW3,rW5
92 LWH(rW4, 4)
93 evxor rW3,rW3,rW7
94 EAD(rD0, 3)
95 evxor rD3,rD3,rW3
96 LWH(rW6, 0)
97 evxor rD3,rD3,rW1
98 EAD(rD0, 1)
 /* low word of rD2 := high word of rD3 */
99 evmergehi rD2,rD2,rD3
100 LWH(rW3, 8)
 /* pass 2 of the iteration: same sequence, key words at 32..47(rKP) */
101 LAH(rW0, rD3, 0, 12)
102 LAL(rW0, rD0, 0, 12)
103 LAH(rW1, rD1, 0, 12)
104 LAH(rW2, rD2, 1, 8)
105 LAL(rW2, rD3, 1, 8)
106 LAL(rW3, rD1, 1, 8)
107 LAL(rW4, rD2, 2, 4)
108 LAL(rW6, rD1, 3, 0)
109 LAH(rW5, rD3, 2, 4)
110 LAL(rW5, rD0, 2, 4)
111 LAH(rW7, rD2, 3, 0)
112 evldw rD1,32(rKP)
113 EAD(rD3, 3)
114 evxor rW2,rW2,rW4
115 LWL(rW7, 0)
116 evxor rW2,rW2,rW6
117 EAD(rD2, 0)
118 evxor rD1,rD1,rW2
119 LWL(rW1, 12)
120 evxor rD1,rD1,rW0
121 evldw rD3,40(rKP)
122 evmergehi rD0,rD0,rD1
123 EAD(rD1, 2)
124 evxor rW3,rW3,rW5
125 LWH(rW4, 4)
126 evxor rW3,rW3,rW7
127 EAD(rD0, 3)
128 evxor rD3,rD3,rW3
129 LWH(rW6, 0)
130 evxor rD3,rD3,rW1
131 EAD(rD0, 1)
132 evmergehi rD2,rD2,rD3
133 LWH(rW3, 8)
 /* step the key pointer past the 32 bytes consumed by both passes */
134 addi rKP,rKP,32
135 bdnz ppc_encrypt_block_loop
 /* last table-lookup pass, key words at 16..31(rKP) */
136 LAH(rW0, rD3, 0, 12)
137 LAL(rW0, rD0, 0, 12)
138 LAH(rW1, rD1, 0, 12)
139 LAH(rW2, rD2, 1, 8)
140 LAL(rW2, rD3, 1, 8)
141 LAL(rW3, rD1, 1, 8)
142 LAL(rW4, rD2, 2, 4)
143 LAH(rW5, rD3, 2, 4)
144 LAL(rW6, rD1, 3, 0)
145 LAL(rW5, rD0, 2, 4)
146 LAH(rW7, rD2, 3, 0)
147 evldw rD1,16(rKP)
148 EAD(rD3, 3)
149 evxor rW2,rW2,rW4
150 LWL(rW7, 0)
151 evxor rW2,rW2,rW6
152 EAD(rD2, 0)
153 evxor rD1,rD1,rW2
154 LWL(rW1, 12)
155 evxor rD1,rD1,rW0
156 evldw rD3,24(rKP)
157 evmergehi rD0,rD0,rD1
 /* tail of this pass already starts the byte lookups of the closing pass */
158 EAD(rD1, 0)
159 evxor rW3,rW3,rW5
160 LBE(rW2)
161 evxor rW3,rW3,rW7
162 EAD(rD0, 1)
163 evxor rD3,rD3,rW3
164 LBE(rW6)
165 evxor rD3,rD3,rW1
166 EAD(rD0, 0)
167 evmergehi rD2,rD2,rD3
 /*
  * NOTE(review): rW1 is loaded again by LAE(rW1, rD0, 0) two lines
  * further down from the same byte of rD0 and is not read in between,
  * so this load looks redundant - confirm before touching it.
  */
168 LBE(rW1)
 /* closing pass: byte lookups, four bytes packed into each of rW0-rW3 */
169 LAE(rW0, rD3, 0)
170 LAE(rW1, rD0, 0)
171 LAE(rW4, rD2, 1)
172 LAE(rW5, rD3, 1)
173 LAE(rW3, rD2, 0)
174 LAE(rW7, rD1, 1)
 /* insert the looked-up byte into bits 16-23 of the output word */
175 rlwimi rW0,rW4,8,16,23
176 rlwimi rW1,rW5,8,16,23
177 LAE(rW4, rD1, 2)
178 LAE(rW5, rD2, 2)
179 rlwimi rW2,rW6,8,16,23
180 rlwimi rW3,rW7,8,16,23
181 LAE(rW6, rD3, 2)
182 LAE(rW7, rD0, 2)
 /* insert into bits 8-15 */
183 rlwimi rW0,rW4,16,8,15
184 rlwimi rW1,rW5,16,8,15
185 LAE(rW4, rD0, 3)
186 LAE(rW5, rD1, 3)
187 rlwimi rW2,rW6,16,8,15
 /* rD0-rD3 are free once their last byte was looked up: reload them
  * with the key words at 32..44(rKP) for the caller's final xor */
188 lwz rD0,32(rKP)
189 rlwimi rW3,rW7,16,8,15
190 lwz rD1,36(rKP)
191 LAE(rW6, rD2, 3)
192 LAE(rW7, rD3, 3)
 /* insert into bits 0-7 (most significant byte) */
193 rlwimi rW0,rW4,24,0,7
194 lwz rD2,40(rKP)
195 rlwimi rW1,rW5,24,0,7
196 lwz rD3,44(rKP)
197 rlwimi rW2,rW6,24,0,7
198 rlwimi rW3,rW7,24,0,7
199 blr
200
201/*
202 * ppc_decrypt_block: The central decryption function for a single 16 bytes
203 * block. It does no stack handling or register saving to support fast calls
204 * via bl/blr. It expects that caller has pre-xored input data with first
205 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
206 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
207 * and rW0-rW3 and caller must execute a final xor on the output registers.
208 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
209 *
210 */
/*
 * Structure of the routine, as visible in the code below:
 *  - three table words (rW0, rW6, rW3) are fetched ahead of the loop;
 *    the same three lookups are re-issued at the tail of every pass so
 *    the next pass starts with them already loaded,
 *  - each loop iteration runs two table-lookup passes that consume the
 *    key words at 16..31(rKP) and 32..47(rKP), then advances rKP by 32;
 *    bdnz repeats the loop while the decremented CTR is non-zero,
 *  - after the loop one more table-lookup pass uses 16..31(rKP),
 *  - the closing pass fetches single bytes through rT1 (DAD/LAD/LBD),
 *    packs four of them into each of rW0-rW3 with rlwimi and loads the
 *    words at 32..44(rKP) into rD0-rD3.
 * Besides rT0, rKP and CTR the closing pass therefore also needs rT1 to
 * be set up by the caller (DAD only replaces its low byte).
 * Address calculation, loads and evxor are deliberately interleaved; do
 * not reorder statements without rescheduling in mind.
 */
211_GLOBAL(ppc_decrypt_block)
 /* lookups issued ahead of the first pass */
212 LAH(rW0, rD1, 0, 12)
213 LAH(rW6, rD0, 3, 0)
214 LAH(rW3, rD0, 1, 8)
215ppc_decrypt_block_loop:
 /* pass 1 of the iteration: remaining table lookups */
216 LAH(rW1, rD3, 0, 12)
217 LAL(rW0, rD2, 0, 12)
218 LAH(rW2, rD2, 1, 8)
219 LAL(rW2, rD3, 1, 8)
220 LAH(rW4, rD3, 2, 4)
221 LAL(rW4, rD0, 2, 4)
222 LAL(rW6, rD1, 3, 0)
223 LAH(rW5, rD1, 2, 4)
224 LAH(rW7, rD2, 3, 0)
225 LAL(rW7, rD3, 3, 0)
226 LAL(rW3, rD1, 1, 8)
 /* rD1 = key[16..23] ^ rW0 ^ rW4 ^ rW6 ^ rW2 (64 bits at once) */
227 evldw rD1,16(rKP)
228 EAD(rD0, 0)
229 evxor rW4,rW4,rW6
230 LWL(rW1, 12)
231 evxor rW0,rW0,rW4
232 EAD(rD2, 2)
233 evxor rW0,rW0,rW2
234 LWL(rW5, 4)
235 evxor rD1,rD1,rW0
 /* rD3 = key[24..31] ^ rW3 ^ rW7 ^ rW1 ^ rW5 */
236 evldw rD3,24(rKP)
 /* low word of rD0 := high word of rD1 */
237 evmergehi rD0,rD0,rD1
 /* from here on the first lookups of the next pass are started */
238 EAD(rD1, 0)
239 evxor rW3,rW3,rW7
240 LWH(rW0, 12)
241 evxor rW3,rW3,rW1
242 EAD(rD0, 3)
243 evxor rD3,rD3,rW3
244 LWH(rW6, 0)
245 evxor rD3,rD3,rW5
246 EAD(rD0, 1)
 /* low word of rD2 := high word of rD3 */
247 evmergehi rD2,rD2,rD3
248 LWH(rW3, 8)
 /* pass 2 of the iteration: same sequence, key words at 32..47(rKP) */
249 LAH(rW1, rD3, 0, 12)
250 LAL(rW0, rD2, 0, 12)
251 LAH(rW2, rD2, 1, 8)
252 LAL(rW2, rD3, 1, 8)
253 LAH(rW4, rD3, 2, 4)
254 LAL(rW4, rD0, 2, 4)
255 LAL(rW6, rD1, 3, 0)
256 LAH(rW5, rD1, 2, 4)
257 LAH(rW7, rD2, 3, 0)
258 LAL(rW7, rD3, 3, 0)
259 LAL(rW3, rD1, 1, 8)
260 evldw rD1,32(rKP)
261 EAD(rD0, 0)
262 evxor rW4,rW4,rW6
263 LWL(rW1, 12)
264 evxor rW0,rW0,rW4
265 EAD(rD2, 2)
266 evxor rW0,rW0,rW2
267 LWL(rW5, 4)
268 evxor rD1,rD1,rW0
269 evldw rD3,40(rKP)
270 evmergehi rD0,rD0,rD1
271 EAD(rD1, 0)
272 evxor rW3,rW3,rW7
273 LWH(rW0, 12)
274 evxor rW3,rW3,rW1
275 EAD(rD0, 3)
276 evxor rD3,rD3,rW3
277 LWH(rW6, 0)
278 evxor rD3,rD3,rW5
279 EAD(rD0, 1)
280 evmergehi rD2,rD2,rD3
281 LWH(rW3, 8)
 /* step the key pointer past the 32 bytes consumed by both passes */
282 addi rKP,rKP,32
283 bdnz ppc_decrypt_block_loop
 /* last table-lookup pass, key words at 16..31(rKP) */
284 LAH(rW1, rD3, 0, 12)
285 LAL(rW0, rD2, 0, 12)
286 LAH(rW2, rD2, 1, 8)
287 LAL(rW2, rD3, 1, 8)
288 LAH(rW4, rD3, 2, 4)
289 LAL(rW4, rD0, 2, 4)
290 LAL(rW6, rD1, 3, 0)
291 LAH(rW5, rD1, 2, 4)
292 LAH(rW7, rD2, 3, 0)
293 LAL(rW7, rD3, 3, 0)
294 LAL(rW3, rD1, 1, 8)
295 evldw rD1,16(rKP)
296 EAD(rD0, 0)
297 evxor rW4,rW4,rW6
298 LWL(rW1, 12)
299 evxor rW0,rW0,rW4
300 EAD(rD2, 2)
301 evxor rW0,rW0,rW2
302 LWL(rW5, 4)
303 evxor rD1,rD1,rW0
304 evldw rD3,24(rKP)
305 evmergehi rD0,rD0,rD1
 /* tail of this pass already starts the byte lookups (via rT1) of the
  * closing pass */
306 DAD(rD1, 0)
307 evxor rW3,rW3,rW7
308 LBD(rW0)
309 evxor rW3,rW3,rW1
310 DAD(rD0, 1)
311 evxor rD3,rD3,rW3
312 LBD(rW6)
313 evxor rD3,rD3,rW5
314 DAD(rD0, 0)
315 evmergehi rD2,rD2,rD3
316 LBD(rW3)
 /* closing pass: byte lookups, four bytes packed into each of rW0-rW3 */
317 LAD(rW2, rD3, 0)
318 LAD(rW1, rD2, 0)
319 LAD(rW4, rD2, 1)
320 LAD(rW5, rD3, 1)
321 LAD(rW7, rD1, 1)
 /* insert the looked-up byte into bits 16-23 of the output word */
322 rlwimi rW0,rW4,8,16,23
323 rlwimi rW1,rW5,8,16,23
324 LAD(rW4, rD3, 2)
325 LAD(rW5, rD0, 2)
326 rlwimi rW2,rW6,8,16,23
327 rlwimi rW3,rW7,8,16,23
328 LAD(rW6, rD1, 2)
329 LAD(rW7, rD2, 2)
 /* insert into bits 8-15 */
330 rlwimi rW0,rW4,16,8,15
331 rlwimi rW1,rW5,16,8,15
332 LAD(rW4, rD0, 3)
333 LAD(rW5, rD1, 3)
334 rlwimi rW2,rW6,16,8,15
 /* rD0-rD3 are free once their last byte was looked up: reload them
  * with the key words at 32..44(rKP) for the caller's final xor */
335 lwz rD0,32(rKP)
336 rlwimi rW3,rW7,16,8,15
337 lwz rD1,36(rKP)
338 LAD(rW6, rD2, 3)
339 LAD(rW7, rD3, 3)
 /* insert into bits 0-7 (most significant byte) */
340 rlwimi rW0,rW4,24,0,7
341 lwz rD2,40(rKP)
342 rlwimi rW1,rW5,24,0,7
343 lwz rD3,44(rKP)
344 rlwimi rW2,rW6,24,0,7
345 rlwimi rW3,rW7,24,0,7
346 blr
347

/* source code of linux/arch/powerpc/crypto/aes-spe-core.S */