/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-1 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */
12 | |
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

/*
 * Register conventions.  r3-r5 arrive holding the C arguments; r5
 * (the block count) is moved into CTR early so the register can be
 * reused as rKP.  r14-r23 are non-volatile; their full 64-bit SPE
 * contents are saved by INITIALIZE and restored by FINALIZE below.
 */
#define rHP	r3	/* pointer to hash value */
#define rWP	r4	/* pointer to input */
#define rKP	r5	/* pointer to constants */

#define rW0	r14	/* 64 bit round words */
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rH0	r6	/* 32 bit hash values */
#define rH1	r7
#define rH2	r8
#define rH3	r9
#define rH4	r10

#define rT0	r22	/* 64 bit temporary */
#define rT1	r0	/* 32 bit temporaries */
#define rT2	r11
#define rT3	r12

#define rK	r23	/* 64 bit round constant (non-volatile, saved in INITIALIZE) */

/*
 * LOAD_K<n>1 fetches round constant K[n-1] from the table at rKP and
 * splats it into both 32-bit halves of rK (evlwwsplat duplicates one
 * word across the 64-bit SPE register).  LOAD_K01 is the empty
 * variant used by rounds whose constant is already loaded.
 */
#define LOAD_K01

#define LOAD_K11 \
	evlwwsplat	rK,0(rKP);

#define LOAD_K21 \
	evlwwsplat	rK,4(rKP);

#define LOAD_K31 \
	evlwwsplat	rK,8(rKP);

#define LOAD_K41 \
	evlwwsplat	rK,12(rKP);
55 | |
/*
 * Create a 128-byte stack frame and save the non-volatile registers
 * r14-r23.  evstdw stores the full 64-bit register, so the SPE
 * (upper) halves are preserved along with the GPR halves.
 */
#define INITIALIZE \
	stwu	r1,-128(r1);	/* create stack frame	*/ \
	evstdw	r14,8(r1);	/* We must save non volatile	*/ \
	evstdw	r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw	r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw	r17,32(r1);	\
	evstdw	r18,40(r1);	\
	evstdw	r19,48(r1);	\
	evstdw	r20,56(r1);	\
	evstdw	r21,64(r1);	\
	evstdw	r22,72(r1);	\
	evstdw	r23,80(r1);
68 | |
69 | |
/*
 * Restore r14-r23 (including their SPE halves) and scrub the save
 * area so no key-derived material is left on the stack.  Each stw
 * clears only the first word of an 8-byte save slot; the inline
 * comments explain why the remaining words can be assumed to have
 * been overwritten already.
 */
#define FINALIZE \
	evldw	r14,8(r1);	/* restore SPE registers	*/ \
	evldw	r15,16(r1);	\
	evldw	r16,24(r1);	\
	evldw	r17,32(r1);	\
	evldw	r18,40(r1);	\
	evldw	r19,48(r1);	\
	evldw	r20,56(r1);	\
	evldw	r21,64(r1);	\
	evldw	r22,72(r1);	\
	evldw	r23,80(r1);	\
	xor	r0,r0,r0;	\
	stw	r0,8(r1);	/* Delete sensitive data	*/ \
	stw	r0,16(r1);	/* that we might have pushed	*/ \
	stw	r0,24(r1);	/* from other context that runs	*/ \
	stw	r0,32(r1);	/* the same code. Assume that	*/ \
	stw	r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw	r0,48(r1);	/* were already overwritten on	*/ \
	stw	r0,56(r1);	/* the way down to here		*/ \
	stw	r0,64(r1);	\
	stw	r0,72(r1);	\
	stw	r0,80(r1);	\
	addi	r1,r1,128;	/* cleanup stack frame		*/
93 | |
/*
 * SHA-1 message words are big-endian.  On a BE build they are loaded
 * directly and rWP is advanced once per 64-byte block; on LE each
 * word is loaded byte-reversed (lwbrx) and rWP is advanced per word,
 * so NEXT_BLOCK has nothing left to do.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data		*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block	*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data		*/ \
	addi		rWP,rWP,4;	/* increment per word	*/
#define NEXT_BLOCK			/* nothing to do	*/
#endif
105 | |
/*
 * R_00_15: two interleaved SHA-1 rounds of the first 16, using the
 * Ch() function F = (B and C) or (~B and D).  While the scalar
 * round instructions (tagged "1:"/"2:") execute, the two message
 * words are fetched with LOAD_DATA and finally paired into one
 * 64-bit SPE register (evmergelo) for the later message expansion.
 * Note rotrwi x,a,27 == rotlwi x,a,5 and rotrwi b,b,2 == rotl 30;
 * the rotate-right form is used throughout this file.
 */
#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
	LOAD_DATA(w0, off)		/* 1: W			*/ \
	and	rT2,b,c;		/* 1: F' = B and C 	*/ \
	LOAD_K##k##1			\
	andc	rT1,d,b;		/* 1: F" = ~B and D	*/ \
	rotrwi	rT0,a,27;		/* 1: A' = A rotl 5	*/ \
	or	rT2,rT2,rT1;		/* 1: F = F' or F"	*/ \
	add	e,e,rT0;		/* 1: E = E + A'	*/ \
	rotrwi	b,b,2;			/* 1: B = B rotl 30	*/ \
	add	e,e,w0;			/* 1: E = E + W		*/ \
	LOAD_DATA(w1, off+4)		/* 2: W			*/ \
	add	e,e,rT2;		/* 1: E = E + F		*/ \
	and	rT1,a,b;		/* 2: F' = B and C	*/ \
	add	e,e,rK;			/* 1: E = E + K		*/ \
	andc	rT2,c,a;		/* 2: F" = ~B and D	*/ \
	add	d,d,rK;			/* 2: E = E + K		*/ \
	or	rT2,rT2,rT1;		/* 2: F = F' or F"	*/ \
	rotrwi	rT0,e,27;		/* 2: A' = A rotl 5	*/ \
	add	d,d,w1;			/* 2: E = E + W		*/ \
	rotrwi	a,a,2;			/* 2: B = B rotl 30	*/ \
	add	d,d,rT0;		/* 2: E = E + A'	*/ \
	evmergelo w1,w1,w0;		/* mix W[0]/W[1]	*/ \
	add	d,d,rT2			/* 2: E = E + F		*/
129 | |
/*
 * R_16_19: two interleaved Ch() rounds with on-the-fly message
 * expansion, W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) rotl 1,
 * computed two words at a time in SPE vector form; evmergelohi
 * assembles the misaligned W[-3] pair from two neighbouring word
 * pairs.  evaddw rT0 holds both W+K values; since the scalar 'add'
 * reads only the low 32-bit half of an SPE register, evmergehi
 * moves the second round's W+K from the high half of rT0 into the
 * low half of rT1 (rT1's own high half is don't-care).
 */
#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and	rT2,b,c;		/* 1: F' = B and C	*/ \
	evmergelohi rT0,w7,w6;		/*    W[-3]		*/ \
	andc	rT1,d,b;		/* 1: F" = ~B and D	*/ \
	evxor	w0,w0,rT0;		/*    W = W[-16] xor W[-3] */ \
	or	rT1,rT1,rT2;		/* 1: F = F' or F"	*/ \
	evxor	w0,w0,w4;		/*    W = W xor W[-8]	*/ \
	add	e,e,rT1;		/* 1: E = E + F		*/ \
	evxor	w0,w0,w1;		/*    W = W xor W[-14]	*/ \
	rotrwi	rT2,a,27;		/* 1: A' = A rotl 5	*/ \
	evrlwi	w0,w0,1;		/*    W = W rotl 1	*/ \
	add	e,e,rT2;		/* 1: E = E + A'	*/ \
	evaddw	rT0,w0,rK;		/*    WK = W + K	*/ \
	rotrwi	b,b,2;			/* 1: B = B rotl 30	*/ \
	LOAD_K##k##1			\
	evmergehi rT1,rT1,rT0;		/*    WK1/WK2		*/ \
	add	e,e,rT0;		/* 1: E = E + WK	*/ \
	add	d,d,rT1;		/* 2: E = E + WK	*/ \
	and	rT2,a,b;		/* 2: F' = B and C	*/ \
	andc	rT1,c,a;		/* 2: F" = ~B and D	*/ \
	rotrwi	rT0,e,27;		/* 2: A' = A rotl 5	*/ \
	or	rT1,rT1,rT2;		/* 2: F = F' or F"	*/ \
	add	d,d,rT0;		/* 2: E = E + A'	*/ \
	rotrwi	a,a,2;			/* 2: B = B rotl 30	*/ \
	add	d,d,rT1			/* 2: E = E + F		*/
155 | |
/*
 * R_20_39: two interleaved rounds using the parity function
 * F = B xor C xor D, with the same SPE message expansion and
 * WK1/WK2 shuffle as R_16_19.  Also reused for rounds 60-79 via
 * the R_60_79 alias below.
 */
#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	evmergelohi rT0,w7,w6;		/*    W[-3]		*/ \
	xor	rT2,b,c;		/* 1: F' = B xor C	*/ \
	evxor	w0,w0,rT0;		/*    W = W[-16] xor W[-3] */ \
	xor	rT2,rT2,d;		/* 1: F = F' xor D	*/ \
	evxor	w0,w0,w4;		/*    W = W xor W[-8]	*/ \
	add	e,e,rT2;		/* 1: E = E + F		*/ \
	evxor	w0,w0,w1;		/*    W = W xor W[-14]	*/ \
	rotrwi	rT2,a,27;		/* 1: A' = A rotl 5	*/ \
	evrlwi	w0,w0,1;		/*    W = W rotl 1	*/ \
	add	e,e,rT2;		/* 1: E = E + A'	*/ \
	evaddw	rT0,w0,rK;		/*    WK = W + K	*/ \
	rotrwi	b,b,2;			/* 1: B = B rotl 30	*/ \
	LOAD_K##k##1			\
	evmergehi rT1,rT1,rT0;		/*    WK1/WK2		*/ \
	add	e,e,rT0;		/* 1: E = E + WK	*/ \
	xor	rT2,a,b;		/* 2: F' = B xor C	*/ \
	add	d,d,rT1;		/* 2: E = E + WK	*/ \
	xor	rT2,rT2,c;		/* 2: F = F' xor D	*/ \
	rotrwi	rT0,e,27;		/* 2: A' = A rotl 5	*/ \
	add	d,d,rT2;		/* 2: E = E + F		*/ \
	rotrwi	a,a,2;			/* 2: B = B rotl 30	*/ \
	add	d,d,rT0			/* 2: E = E + A'	*/
179 | |
/*
 * R_40_59: two interleaved rounds using the majority function,
 * computed here as Maj(B,C,D) = (B and C) or ((B or C) and D),
 * with the same SPE message expansion and WK1/WK2 shuffle as
 * R_16_19.
 */
#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and	rT2,b,c;		/* 1: F' = B and C	*/ \
	evmergelohi rT0,w7,w6;		/*    W[-3]		*/ \
	or	rT1,b,c;		/* 1: F" = B or C	*/ \
	evxor	w0,w0,rT0;		/*    W = W[-16] xor W[-3] */ \
	and	rT1,d,rT1;		/* 1: F" = F" and D	*/ \
	evxor	w0,w0,w4;		/*    W = W xor W[-8]	*/ \
	or	rT2,rT2,rT1;		/* 1: F = F' or F"	*/ \
	evxor	w0,w0,w1;		/*    W = W xor W[-14]	*/ \
	add	e,e,rT2;		/* 1: E = E + F		*/ \
	evrlwi	w0,w0,1;		/*    W = W rotl 1	*/ \
	rotrwi	rT2,a,27;		/* 1: A' = A rotl 5	*/ \
	evaddw	rT0,w0,rK;		/*    WK = W + K	*/ \
	add	e,e,rT2;		/* 1: E = E + A'	*/ \
	LOAD_K##k##1			\
	evmergehi rT1,rT1,rT0;		/*    WK1/WK2		*/ \
	rotrwi	b,b,2;			/* 1: B = B rotl 30	*/ \
	add	e,e,rT0;		/* 1: E = E + WK	*/ \
	and	rT2,a,b;		/* 2: F' = B and C	*/ \
	or	rT0,a,b;		/* 2: F" = B or C	*/ \
	add	d,d,rT1;		/* 2: E = E + WK	*/ \
	and	rT0,c,rT0;		/* 2: F" = F" and D	*/ \
	rotrwi	a,a,2;			/* 2: B = B rotl 30	*/ \
	or	rT2,rT2,rT0;		/* 2: F = F' or F"	*/ \
	rotrwi	rT0,e,27;		/* 2: A' = A rotl 5	*/ \
	add	d,d,rT2;		/* 2: E = E + F		*/ \
	add	d,d,rT0			/* 2: E = E + A'	*/
207 | |
/* Rounds 60-79 use the same parity function as rounds 20-39. */
#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
210 | |
/*
 * void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks)
 *
 *   r3/rHP = state:  five 32-bit hash words, updated in place
 *   r4/rWP = src:    input data, 64 bytes per block
 *   r5     = blocks: number of blocks to process; moved into CTR,
 *            after which r5 is reused as rKP (constant pointer)
 *
 * The round macros rotate the A..E working variables through
 * rH0-rH4 by argument order, so no register moves are needed
 * between rounds.
 */
_GLOBAL(ppc_spe_sha1_transform)
	INITIALIZE

	/* load the hash state, interleaved with CTR/rKP setup */
	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	mtctr		r5		/* one loop iteration per block	*/
	lwz		rH2,8(rHP)
	lis		rKP,PPC_SPE_SHA1_K@h	/* r5 is free again: point	*/
	lwz		rH3,12(rHP)
	ori		rKP,rKP,PPC_SPE_SHA1_K@l /* it at the constant table	*/
	lwz		rH4,16(rHP)

ppc_spe_sha1_main:
	/* rounds 0-15 (two per macro); k=1 in the first macro loads K1.
	 * rT3 is a scratch destination once all eight W pairs exist. */
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
	R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
	R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)

	/* rounds 16-19; k=2 preloads K2 for the next group */
	R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
	R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)

	/* rounds 20-39; k=3 preloads K3 at the end of the group */
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)

	/* rounds 40-59; k=4 preloads K4 at the end of the group */
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)

	/* rounds 60-79 */
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
	/* interleave the final rounds with reloading the previous
	 * hash value into now-free scratch registers */
	lwz		rT3,0(rHP)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
	lwz		rW1,4(rHP)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
	lwz		rW2,8(rHP)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
	lwz		rW3,12(rHP)
	NEXT_BLOCK
	lwz		rW4,16(rHP)

	/* feed the previous hash value forward and store the result */
	add		rH0,rH0,rT3
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)

	bdnz		ppc_spe_sha1_main	/* decrement CTR, loop while blocks remain */

	FINALIZE
	blr
290 | |
.data
.align 4
/* SHA-1 round constants K for rounds 0-19, 20-39, 40-59, 60-79 */
PPC_SPE_SHA1_K:
	.long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
295 | |