sha256-spe-asm.S source code [linux/arch/powerpc/crypto/sha256-spe-asm.S]

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-256 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

/*
 * Register map. These are plain preprocessor aliases; nothing is emitted.
 *
 * r14-r25 are saved by INITIALIZE and restored by FINALIZE. The remaining
 * aliases (r0, r3-r12) are used without being saved.
 */
#define rHP	r3	/* pointer to hash values in memory		*/
#define rKP	r24	/* pointer to round constants			*/
#define rWP	r4	/* pointer to input data			*/

#define rH0	r5	/* 8 32 bit hash values in 8 registers		*/
#define rH1	r6
#define rH2	r7
#define rH3	r8
#define rH4	r9
#define rH5	r10
#define rH6	r11
#define rH7	r12

#define rW0	r14	/* 64 bit registers. 16 words in 8 registers	*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rT0	r22	/* 64 bit temporaries				*/
#define rT1	r23
#define rT2	r0	/* 32 bit temporaries				*/
#define rT3	r25

/*
 * Loop test hooks, selected by the "k" argument of R_CALC_W through the
 * token paste CMP_K##k##_LOOP.
 *
 *   N - expands to nothing.
 *   C - compares the 32 bit view of rT1 with zero (signed) and sets cr0.
 *
 * R_CALC_W pastes the hook after it has loaded a pair of round constants
 * into rT1 and before it adds the message words to them, so the compare
 * sees a raw constant. The "bt gt" that follows the last R_CALC_W of a
 * 16 round group consumes the result.
 * NOTE(review): the 32 bit view should be the second constant of the pair
 * loaded by evldw - confirm against the SPE manual.
 */
#define CMP_KN_LOOP
#define CMP_KC_LOOP \
	cmpwi		rT1,0;

/*
 * INITIALIZE - function prologue.
 *
 * Opens a 128 byte stack frame and saves every register this file uses
 * from the r14-r25 range:
 *
 *   8(r1) .. 87(r1)	r14-r23, eight bytes each (evstdw stores the full
 *			64 bit register, i.e. including the SPE part)
 *   88(r1), 92(r1)	r24, r25, four bytes each
 *
 * FINALIZE is the matching epilogue and relies on this exact layout.
 */
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1);					   \
	evstdw		r18,40(r1);					   \
	evstdw		r19,48(r1);					   \
	evstdw		r20,56(r1);					   \
	evstdw		r21,64(r1);					   \
	evstdw		r22,72(r1);					   \
	evstdw		r23,80(r1);					   \
	stw		r24,88(r1);	/* save normal registers	*/ \
	stw		r25,92(r1);


/*
 * FINALIZE - function epilogue, counterpart of INITIALIZE.
 *
 * Reloads r14-r25 from the frame, then overwrites the first word of each
 * of the ten 8 byte save slots with zero before the frame is released.
 * Only one word per slot is cleared on purpose: see the comment below
 * about the lower register halves.
 * NOTE(review): on big endian the first word of a slot should be the upper
 * (SPE) half of the saved register - confirm for little endian builds.
 *
 * Clobbers r0.
 */
#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1);					   \
	evldw		r16,24(r1);					   \
	evldw		r17,32(r1);					   \
	evldw		r18,40(r1);					   \
	evldw		r19,48(r1);					   \
	evldw		r20,56(r1);					   \
	evldw		r21,64(r1);					   \
	evldw		r22,72(r1);					   \
	evldw		r23,80(r1);					   \
	lwz		r24,88(r1);	/* restore normal registers	*/ \
	lwz		r25,92(r1);					   \
	li		r0,0;		/* r0 = 0 for the scrub below	*/ \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from other context that runs	*/ \
	stw		r0,32(r1);	/* the same code. Assume that	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* was already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1);					   \
	stw		r0,72(r1);					   \
	stw		r0,80(r1);					   \
	addi		r1,r1,128;	/* cleanup stack frame		*/

/*
 * LOAD_DATA(reg, off) - fetch one 32 bit message word into reg.
 * NEXT_BLOCK          - advance rWP past a finished block, if still needed.
 *
 * Big endian:    plain load from off(rWP); rWP stays fixed while a block
 *                is read and NEXT_BLOCK steps it by 64 bytes afterwards.
 * Little endian: byte-reversed load from 0(rWP) followed by a 4 byte
 *                pointer step. "off" is ignored, and after 16 loads rWP
 *                already points at the next block, so NEXT_BLOCK is empty.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif

/*
 * R_LOAD_W - two SHA-256 rounds that take their message words straight
 * from the input buffer (used for the first 16 rounds of a block).
 *
 *   a..h  registers holding the eight working variables
 *   w     register that receives the two message words of these rounds
 *   off   byte offset of the first of the two rounds; used for the round
 *         constants at off(rKP) / off+4(rKP) and passed on to LOAD_DATA
 *
 * The two rounds are interleaved; every comment is tagged "1:" or "2:"
 * with the round it belongs to. Round 1 updates d and h. Round 2 works on
 * the variables shifted by one position (h acts as a, a as b, b as c,
 * c as d, d as e, e as f, f as g, g as h) and updates c and g. The round 2
 * comments name the roles, not the macro arguments. No registers are
 * moved: callers rotate the argument list by two positions per call.
 *
 * "evmergelo w,w,w" copies the first word into the upper half of w before
 * the second word is loaded, so w ends up holding both words.
 *
 * Clobbers rT0-rT3. The instruction order is hand scheduled; keep it.
 */
#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
	LOAD_DATA(w, off)		/* 1: W				*/ \
	rotrwi		rT0,e,6;	/* 1: S1 = e rotr 6		*/ \
	rotrwi		rT1,e,11;	/* 1: S1' = e rotr 11		*/ \
	rotrwi		rT2,e,25;	/* 1: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 1: S1 = S1 xor S1'		*/ \
	and		rT3,e,f;	/* 1: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 1: S1 = S1 xor S1"		*/ \
	andc		rT1,g,e;	/* 1: ch' = ~e and g		*/ \
	lwz		rT2,off(rKP);	/* 1: K				*/ \
	xor		rT3,rT3,rT1;	/* 1: ch = ch xor ch'		*/ \
	add		h,h,rT0;	/* 1: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 1: temp1' = ch + w		*/ \
	rotrwi		rT0,a,2;	/* 1: S0 = a rotr 2		*/ \
	add		h,h,rT3;	/* 1: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,a,13;	/* 1: S0' = a rotr 13		*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + K		*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 1: S0 = S0 xor S0'		*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	xor		rT3,rT0,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelo	w,w,w;		/*    shift W			*/ \
	or		rT2,a,b;	/* 1: maj = a or b		*/ \
	and		rT1,a,b;	/* 1: maj' = a and b		*/ \
	and		rT2,rT2,c;	/* 1: maj = maj and c		*/ \
	LOAD_DATA(w, off+4)		/* 2: W				*/ \
	or		rT2,rT1,rT2;	/* 1: maj = maj or maj'		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		rT3,rT3,rT2;	/* 1: temp2 = S0 + maj		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	add		h,h,rT3;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	lwz		rT2,off+4(rKP);	/* 2: K				*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 2: temp1' = ch + w		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	add		g,g,rT2;	/* 2: temp1 = temp1 + K		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

/*
 * R_CALC_W - two SHA-256 rounds that first extend the message schedule by
 * two words (used for rounds 16 and up).
 *
 *   a..h   registers holding the eight working variables
 *   w0     in: w[-16],w[-15]; out: the two newly computed words
 *   w1     supplies w[-14] (its upper word, merged with the lower of w0)
 *   w4,w5  supply w[-7],w[-6] (lower word of w4, upper word of w5)
 *   w7     w[-2],w[-1]
 *   k      N or C; selects CMP_KN_LOOP or CMP_KC_LOOP
 *   off    byte offset of the round constant pair relative to rKP
 *
 * The ev* instructions work on both halves of a 64 bit register at once
 * and produce both schedule words in parallel; they are interleaved with
 * the scalar instructions of round 1. Rotates are written as left rotates:
 * evrlwi by 25 is "rotr 7", by 15 is "rotr 17", by 13 is "rotr 19", and
 * the second evrlwi by 21 is applied on top of the first, giving "rotr 18".
 *
 * Round 1 adds maj in two steps, (a and b) and then ((a xor b) and c).
 * The two terms never have a bit in common, so adding them gives the same
 * value as or-ing them.
 *
 * The CMP_K hook sits between the load of the constants into rT1 and the
 * addition of the message words, so it tests the raw constant.
 *
 * Round 2 comments name the roles after the one position shift (h acts
 * as a, ..., g acts as h), as in R_LOAD_W. Round 1 updates d and h, round
 * 2 updates c and g.
 *
 * Clobbers rT0-rT3, and cr0 when k is C. The instruction order is hand
 * scheduled; keep it.
 */
#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
	rotrwi		rT2,e,6;	/* 1: S1 = e rotr 6		*/ \
	evmergelohi	rT0,w0,w1;	/*    w[-15]			*/ \
	rotrwi		rT3,e,11;	/* 1: S1' = e rotr 11		*/ \
	evsrwiu		rT1,rT0,3;	/*    s0 = w[-15] >> 3		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1'		*/ \
	evrlwi		rT0,rT0,25;	/*    s0' = w[-15] rotr	7	*/ \
	rotrwi		rT3,e,25;	/* 1: S1" = e rotr 25		*/ \
	evxor		rT1,rT1,rT0;	/*    s0 = s0 xor s0'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1"		*/ \
	evrlwi		rT0,rT0,21;	/*    s0' = w[-15] rotr 18	*/ \
	add		h,h,rT2;	/* 1: temp1 = h + S1		*/ \
	evxor		rT0,rT0,rT1;	/*    s0 = s0 xor s0'		*/ \
	and		rT2,e,f;	/* 1: ch = e and f		*/ \
	evaddw		w0,w0,rT0;	/*    w = w[-16] + s0		*/ \
	andc		rT3,g,e;	/* 1: ch' = ~e and g		*/ \
	evsrwiu		rT0,w7,10;	/*    s1 = w[-2] >> 10		*/ \
	xor		rT2,rT2,rT3;	/* 1: ch = ch xor ch'		*/ \
	evrlwi		rT1,w7,15;	/*    s1' = w[-2] rotr 17	*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + ch	*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	rotrwi		rT2,a,2;	/* 1: S0 = a rotr 2		*/ \
	evrlwi		rT1,w7,13;	/*    s1' = w[-2] rotr 19	*/ \
	rotrwi		rT3,a,13;	/* 1: S0' = a rotr 13		*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0'		*/ \
	evldw		rT1,off(rKP);	/*    k				*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + s1		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelohi	rT0,w4,w5;	/*    w[-7]			*/ \
	and		rT3,a,b;	/* 1: maj = a and b		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + w[-7]		*/ \
	CMP_K##k##_LOOP							   \
	add		rT2,rT2,rT3;	/* 1: temp2 = S0 + maj		*/ \
	evaddw		rT1,rT1,w0;	/*    wk = w + k		*/ \
	xor		rT3,a,b;	/* 1: maj = a xor b		*/ \
	evmergehi	rT0,rT1,rT1;	/*    wk1/wk2			*/ \
	and		rT3,rT3,c;	/* 1: maj = maj and c		*/ \
	add		h,h,rT0;	/* 1: temp1 = temp1 + wk	*/ \
	add		rT2,rT2,rT3;	/* 1: temp2 = temp2 + maj	*/ \
	add		g,g,rT1;	/* 2: temp1 = temp1 + wk	*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		h,h,rT2;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + ch	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

/*
 * ppc_spe_sha256_transform - run the SHA-256 compression function over a
 * sequence of 64 byte blocks.
 *
 * In:   r3 (rHP) pointer to the eight 32 bit hash words; read at entry
 *                and rewritten after every block
 *       r4 (rWP) pointer to the input data
 *       r5       number of blocks; copied to CTR, then r5 is reused as rH0
 * Out:  hash words at rHP updated; rWP advanced past the consumed data
 * Uses: r0, r5-r12, cr0 and CTR without saving them; r14-r25 are saved and
 *       restored by INITIALIZE / FINALIZE
 *
 * A block count of zero returns immediately. Without that check bdnz
 * would decrement CTR from 0 and keep looping.
 *
 * Per block: 16 rounds that load the message (R_LOAD_W), then three passes
 * of 16 schedule-extending rounds (R_CALC_W). rKP is stepped by 64 bytes
 * at the top of each pass. The pass loop has no counter: CMP_KC_LOOP in
 * the last R_CALC_W tests a round constant of the current group, and only
 * the final group of the table makes "bt gt" fall through.
 */
_GLOBAL(ppc_spe_sha256_transform)
	cmpwi		r5,0			/* no blocks requested?	*/
	beqlr					/* nothing to do	*/

	INITIALIZE

	mtctr		r5			/* CTR = block count	*/
	lwz		rH0,0(rHP)		/* load current state	*/
	lwz		rH1,4(rHP)
	lwz		rH2,8(rHP)
	lwz		rH3,12(rHP)
	lwz		rH4,16(rHP)
	lwz		rH5,20(rHP)
	lwz		rH6,24(rHP)
	lwz		rH7,28(rHP)

ppc_spe_sha256_main:
	lis		rKP,PPC_SPE_SHA256_K@ha	/* rewind K per block	*/
	addi		rKP,rKP,PPC_SPE_SHA256_K@l

	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
ppc_spe_sha256_16_rounds:
	addi		rKP,rKP,64		/* next 16 constants	*/
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW0, rW1, rW4, rW5, rW7, N, 0)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW1, rW2, rW5, rW6, rW0, N, 8)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW2, rW3, rW6, rW7, rW1, N, 16)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW3, rW4, rW7, rW0, rW2, N, 24)
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW4, rW5, rW0, rW1, rW3, N, 32)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW5, rW6, rW1, rW2, rW4, N, 40)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW6, rW7, rW2, rW3, rW5, N, 48)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW7, rW0, rW3, rW4, rW6, C, 56)
	bt		gt,ppc_spe_sha256_16_rounds

	/*
	 * The schedule is no longer needed: reuse rW0-rW7 as scratch for
	 * the previous hash words, add the working variables to them and
	 * write the sums back.
	 */
	lwz		rW0,0(rHP)
	NEXT_BLOCK
	lwz		rW1,4(rHP)
	lwz		rW2,8(rHP)
	lwz		rW3,12(rHP)
	lwz		rW4,16(rHP)
	lwz		rW5,20(rHP)
	lwz		rW6,24(rHP)
	lwz		rW7,28(rHP)

	add		rH0,rH0,rW0
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)
	add		rH5,rH5,rW5
	stw		rH5,20(rHP)
	add		rH6,rH6,rW6
	stw		rH6,24(rHP)
	add		rH7,rH7,rW7
	stw		rH7,28(rHP)

	bdnz		ppc_spe_sha256_main	/* more blocks?		*/

	FINALIZE
	blr

/*
 * The 64 SHA-256 round constants, in round order, four per line.
 *
 * The table is only ever read (lwz / evldw relative to rKP), so it is
 * placed in .rodata instead of .data.
 *
 * The transform loop depends on the values themselves: taken as signed
 * 32 bit numbers, the last constant of the second and third group of 16
 * (0x14292967, 0x106aa070) is positive and the very last one (0xc67178f2)
 * is negative. That sign is what ends the 16 round loop, so the table must
 * not be reordered or padded.
 */
.section .rodata
.align	5		/* power-of-two alignment on PowerPC: 32 bytes */
PPC_SPE_SHA256_K:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2


source code of linux/arch/powerpc/crypto/sha256-spe-asm.S