/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-1 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */
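
/*
 * A sketch of the expected C-side prototype (an assumption here; the exact
 * declaration lives in the accompanying glue code, sha1-spe-glue.c):
 *
 *   void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks);
 *
 * On entry r3 points to the five 32 bit hash words, r4 to the input data
 * and r5 holds the number of 64 byte blocks to process.
 */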

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

#define rHP r3 /* pointer to hash value */
#define rWP r4 /* pointer to input */
#define rKP r5 /* pointer to constants */

#define rW0 r14 /* 64 bit round words */
#define rW1 r15
#define rW2 r16
#define rW3 r17
#define rW4 r18
#define rW5 r19
#define rW6 r20
#define rW7 r21

#define rH0 r6 /* 32 bit hash values */
#define rH1 r7
#define rH2 r8
#define rH3 r9
#define rH4 r10

#define rT0 r22 /* 64 bit temporary */
#define rT1 r0 /* 32 bit temporaries */
#define rT2 r11
#define rT3 r12
#define rK r23 /* 64 bit constant in non volatile register */

#define LOAD_K01

#define LOAD_K11 \
 evlwwsplat rK,0(rKP);

#define LOAD_K21 \
 evlwwsplat rK,4(rKP);

#define LOAD_K31 \
 evlwwsplat rK,8(rKP);

#define LOAD_K41 \
 evlwwsplat rK,12(rKP);
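
/*
 * The round macros below take a parameter k and paste it into one of the
 * names above via LOAD_K##k##1. As an illustration, k = 2 expands to
 * LOAD_K21, which splats the second round constant (offset 4 in
 * PPC_SPE_SHA1_K) into both 32 bit halves of rK, while k = 0 expands to
 * the empty LOAD_K01 and leaves rK untouched.
 */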

#define INITIALIZE \
 stwu r1,-128(r1); /* create stack frame */ \
 evstdw r14,8(r1); /* We must save non volatile */ \
 evstdw r15,16(r1); /* registers. Take the chance */ \
 evstdw r16,24(r1); /* and save the SPE part too */ \
 evstdw r17,32(r1); \
 evstdw r18,40(r1); \
 evstdw r19,48(r1); \
 evstdw r20,56(r1); \
 evstdw r21,64(r1); \
 evstdw r22,72(r1); \
 evstdw r23,80(r1);


#define FINALIZE \
 evldw r14,8(r1); /* restore SPE registers */ \
 evldw r15,16(r1); \
 evldw r16,24(r1); \
 evldw r17,32(r1); \
 evldw r18,40(r1); \
 evldw r19,48(r1); \
 evldw r20,56(r1); \
 evldw r21,64(r1); \
 evldw r22,72(r1); \
 evldw r23,80(r1); \
 xor r0,r0,r0; \
 stw r0,8(r1); /* Delete sensitive data */ \
 stw r0,16(r1); /* that we might have pushed */ \
 stw r0,24(r1); /* from another context that */ \
 stw r0,32(r1); /* ran the same code. Assume */ \
 stw r0,40(r1); /* that the lower part of the */ \
 stw r0,48(r1); /* GPRs was already overwritten */ \
 stw r0,56(r1); /* on the way down to here */ \
 stw r0,64(r1); \
 stw r0,72(r1); \
 stw r0,80(r1); \
 addi r1,r1,128; /* cleanup stack frame */

#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
 lwz reg,off(rWP); /* load data */
#define NEXT_BLOCK \
 addi rWP,rWP,64; /* increment per block */
#else
#define LOAD_DATA(reg, off) \
 lwbrx reg,0,rWP; /* load data */ \
 addi rWP,rWP,4; /* increment per word */
#define NEXT_BLOCK /* nothing to do */
#endif

#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
 LOAD_DATA(w0, off) /* 1: W */ \
 and rT2,b,c; /* 1: F' = B and C */ \
 LOAD_K##k##1 \
 andc rT1,d,b; /* 1: F" = ~B and D */ \
 rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \
 or rT2,rT2,rT1; /* 1: F = F' or F" */ \
 add e,e,rT0; /* 1: E = E + A' */ \
 rotrwi b,b,2; /* 1: B = B rotl 30 */ \
 add e,e,w0; /* 1: E = E + W */ \
 LOAD_DATA(w1, off+4) /* 2: W */ \
 add e,e,rT2; /* 1: E = E + F */ \
 and rT1,a,b; /* 2: F' = B and C */ \
 add e,e,rK; /* 1: E = E + K */ \
 andc rT2,c,a; /* 2: F" = ~B and D */ \
 add d,d,rK; /* 2: E = E + K */ \
 or rT2,rT2,rT1; /* 2: F = F' or F" */ \
 rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
 add d,d,w1; /* 2: E = E + W */ \
 rotrwi a,a,2; /* 2: B = B rotl 30 */ \
 add d,d,rT0; /* 2: E = E + A' */ \
 evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \
 add d,d,rT2 /* 2: E = E + F */
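
/*
 * For reference, one R_00_15 invocation (like the other round macros below)
 * performs two SHA-1 rounds, which is why the caller rotates the five hash
 * arguments between invocations. Per round:
 *
 *	F(B,C,D) = (B and C) or ((not B) and D)
 *	E        = E + (A rotl 5) + F(B,C,D) + W + K
 *	B        = B rotl 30
 *
 * The final evmergelo also packs the two freshly loaded message words into
 * one 64 bit GPR (reading the merge operands: low half = even indexed word,
 * high half = the following odd indexed word), so the schedule macros below
 * can process word pairs with SPE instructions.
 */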

#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
 and rT2,b,c; /* 1: F' = B and C */ \
 evmergelohi rT0,w7,w6; /* W[-3] */ \
 andc rT1,d,b; /* 1: F" = ~B and D */ \
 evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
 or rT1,rT1,rT2; /* 1: F = F' or F" */ \
 evxor w0,w0,w4; /* W = W xor W[-8] */ \
 add e,e,rT1; /* 1: E = E + F */ \
 evxor w0,w0,w1; /* W = W xor W[-14] */ \
 rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
 evrlwi w0,w0,1; /* W = W rotl 1 */ \
 add e,e,rT2; /* 1: E = E + A' */ \
 evaddw rT0,w0,rK; /* WK = W + K */ \
 rotrwi b,b,2; /* 1: B = B rotl 30 */ \
 LOAD_K##k##1 \
 evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
 add e,e,rT0; /* 1: E = E + WK */ \
 add d,d,rT1; /* 2: E = E + WK */ \
 and rT2,a,b; /* 2: F' = B and C */ \
 andc rT1,c,a; /* 2: F" = ~B and D */ \
 rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
 or rT1,rT1,rT2; /* 2: F = F' or F" */ \
 add d,d,rT0; /* 2: E = E + A' */ \
 rotrwi a,a,2; /* 2: B = B rotl 30 */ \
 add d,d,rT1 /* 2: E = E + F */
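
/*
 * The interleaved ev* instructions above implement the SHA-1 message
 * schedule, two words at a time:
 *
 *	W[i] = (W[i-3] xor W[i-8] xor W[i-14] xor W[i-16]) rotl 1
 *
 * W[i-16], W[i-14] and W[i-8] are already pair aligned in w0, w1 and w4;
 * only W[i-3] straddles two 64 bit registers, which is what the
 * evmergelohi on w7/w6 fixes up.
 */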

#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
 evmergelohi rT0,w7,w6; /* W[-3] */ \
 xor rT2,b,c; /* 1: F' = B xor C */ \
 evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
 xor rT2,rT2,d; /* 1: F = F' xor D */ \
 evxor w0,w0,w4; /* W = W xor W[-8] */ \
 add e,e,rT2; /* 1: E = E + F */ \
 evxor w0,w0,w1; /* W = W xor W[-14] */ \
 rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
 evrlwi w0,w0,1; /* W = W rotl 1 */ \
 add e,e,rT2; /* 1: E = E + A' */ \
 evaddw rT0,w0,rK; /* WK = W + K */ \
 rotrwi b,b,2; /* 1: B = B rotl 30 */ \
 LOAD_K##k##1 \
 evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
 add e,e,rT0; /* 1: E = E + WK */ \
 xor rT2,a,b; /* 2: F' = B xor C */ \
 add d,d,rT1; /* 2: E = E + WK */ \
 xor rT2,rT2,c; /* 2: F = F' xor D */ \
 rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
 add d,d,rT2; /* 2: E = E + F */ \
 rotrwi a,a,2; /* 2: B = B rotl 30 */ \
 add d,d,rT0 /* 2: E = E + A' */

#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
 and rT2,b,c; /* 1: F' = B and C */ \
 evmergelohi rT0,w7,w6; /* W[-3] */ \
 or rT1,b,c; /* 1: F" = B or C */ \
 evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
 and rT1,d,rT1; /* 1: F" = F" and D */ \
 evxor w0,w0,w4; /* W = W xor W[-8] */ \
 or rT2,rT2,rT1; /* 1: F = F' or F" */ \
 evxor w0,w0,w1; /* W = W xor W[-14] */ \
 add e,e,rT2; /* 1: E = E + F */ \
 evrlwi w0,w0,1; /* W = W rotl 1 */ \
 rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
 evaddw rT0,w0,rK; /* WK = W + K */ \
 add e,e,rT2; /* 1: E = E + A' */ \
 LOAD_K##k##1 \
 evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
 rotrwi b,b,2; /* 1: B = B rotl 30 */ \
 add e,e,rT0; /* 1: E = E + WK */ \
 and rT2,a,b; /* 2: F' = B and C */ \
 or rT0,a,b; /* 2: F" = B or C */ \
 add d,d,rT1; /* 2: E = E + WK */ \
 and rT0,c,rT0; /* 2: F" = F" and D */ \
 rotrwi a,a,2; /* 2: B = B rotl 30 */ \
 or rT2,rT2,rT0; /* 2: F = F' or F" */ \
 rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
 add d,d,rT2; /* 2: E = E + F */ \
 add d,d,rT0 /* 2: E = E + A' */
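
/*
 * Rounds 40 to 59 use the majority function. The two-instruction F' / F"
 * split above relies on the identity
 *
 *	Maj(B,C,D) = (B and C) or (B and D) or (C and D)
 *	           = (B and C) or ((B or C) and D)
 *
 * which needs one logic instruction less per round half.
 */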

#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
 R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
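
/*
 * Rounds 60 to 79 use the same parity function F = B xor C xor D as rounds
 * 20 to 39; only the round constant differs (0xCA62C1D6 instead of
 * 0x6ED9EBA1), and that is supplied through rK via the k parameter, so the
 * R_20_39 macro can simply be reused.
 */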

_GLOBAL(ppc_spe_sha1_transform)
 INITIALIZE

 lwz rH0,0(rHP)
 lwz rH1,4(rHP)
 mtctr r5
 lwz rH2,8(rHP)
 lis rKP,PPC_SPE_SHA1_K@h
 lwz rH3,12(rHP)
 ori rKP,rKP,PPC_SPE_SHA1_K@l
 lwz rH4,16(rHP)

ppc_spe_sha1_main:
 R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
 R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
 R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
 R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
 R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
 R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
 R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
 R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)

 R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
 R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)

 R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
 R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
 R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
 R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
 R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
 R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
 R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
 R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
 R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
 R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)

 R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
 R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
 R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
 R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
 R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
 R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
 R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
 R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
 R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
 R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)

 R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
 R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
 R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
 R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
 R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
 R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
 R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
 lwz rT3,0(rHP)
 R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
 lwz rW1,4(rHP)
 R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
 lwz rW2,8(rHP)
 R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
 lwz rW3,12(rHP)
 NEXT_BLOCK
 lwz rW4,16(rHP)

 add rH0,rH0,rT3
 stw rH0,0(rHP)
 add rH1,rH1,rW1
 stw rH1,4(rHP)
 add rH2,rH2,rW2
 stw rH2,8(rHP)
 add rH3,rH3,rW3
 stw rH3,12(rHP)
 add rH4,rH4,rW4
 stw rH4,16(rHP)

 bdnz ppc_spe_sha1_main

 FINALIZE
 blr

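/*
 * The four values below are the standard SHA-1 round constants as defined
 * in FIPS 180 (floor(2^30 * sqrt(n)) for n = 2, 3, 5 and 10), one per
 * group of twenty rounds.
 */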
.data
.align 4
PPC_SPE_SHA1_K:
 .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6

