x86_64-gcc.c [openssl/crypto/bn/asm/x86_64-gcc.c]

1	#include "../bn_lcl.h"
2	#if !(defined(__GNUC__) && __GNUC__>=2)
3	# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
4	#else
5	/*
6	* x86_64 BIGNUM accelerator version 0.1, December 2002.
7	*
8	* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9	* project.
10	*
11	* Rights for redistribution and usage in source and binary forms are
12	* granted according to the OpenSSL license. Warranty of any kind is
13	* disclaimed.
14	*
15	* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16	* versions, like 1.0...
17	* A. Well, that's because this code is basically a quick-n-dirty
18	* proof-of-concept hack. As you can see it's implemented with
19	* inline assembler, which means that you're bound to GCC and that
20	* there might be enough room for further improvement.
21	*
22	* Q. Why inline assembler?
23	* A. x86_64 features own ABI which I'm not familiar with. This is
24	* why I decided to let the compiler take care of subroutine
25	* prologue/epilogue as well as register allocation. For reference.
26	* Win64 implements different ABI for AMD64, different from Linux.
27	*
28	* Q. How much faster does it get?
29	* A. 'apps/openssl speed rsa dsa' output with no-asm:
30	*
31	* sign verify sign/s verify/s
32	* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
33	* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
34	* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
35	* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
36	* sign verify sign/s verify/s
37	* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
38	* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
39	* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
40	*
41	* 'apps/openssl speed rsa dsa' output with this module:
42	*
43	* sign verify sign/s verify/s
44	* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
45	* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
46	* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
47	* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
48	* sign verify sign/s verify/s
49	* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
50	* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
51	* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
52	*
53	* For the reference. IA-32 assembler implementation performs
54	* very much like 64-bit code compiled with no-asm on the same
55	* machine.
56	*/
57
/*
 * Word type used by every routine in this file: 64 bits wide.  Any
 * definition already in effect is dropped first so that the #define
 * below is never a conflicting redefinition.
 */
#undef BN_ULONG
#ifdef _WIN64
#define BN_ULONG unsigned long long
#else
#define BN_ULONG unsigned long
#endif

/* Drop earlier definitions of the helper names that are redefined below. */
#undef mul
#undef mul_add
#undef sqr

/*
 * Notes on the operand constraints used below:
 *
 *   "m"(a), "+m"(r) is the way to favor DirectPath micro-code;
 *   "g"(0) lets the compiler decide where it wants to keep the
 *   value of zero.
 *
 * __asm__ is spelled with underscores so that the macros also work when
 * the translation unit is compiled in a strict ISO mode (-std=c99 etc.),
 * where the plain asm keyword is not available.
 */

/*
 * mul_add(r,a,word,carry):  (carry,r) = a*word + r + carry
 *
 * r must be an lvalue that lives in memory, a an lvalue, carry a
 * BN_ULONG lvalue.  On exit r holds the low word of the sum and carry
 * the high word.
 */
#define mul_add(r,a,word,carry) do { \
	BN_ULONG high,low; \
	__asm__ ("mulq %3" \
		: "=a"(low),"=d"(high) \
		: "a"(word),"m"(a) \
		: "cc"); \
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(carry),"+d"(high) \
		: "a"(low),"g"(0) \
		: "cc"); \
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+m"(r),"+d"(high) \
		: "r"(carry),"g"(0) \
		: "cc"); \
	carry=high; \
	} while (0)

/*
 * mul(r,a,word,carry):  (carry,r) = a*word + carry
 *
 * The multiplicand is constrained to "rm": mulq accepts a register or a
 * memory operand but never an immediate.
 */
#define mul(r,a,word,carry) do { \
	BN_ULONG high,low; \
	__asm__ ("mulq %3" \
		: "=a"(low),"=d"(high) \
		: "a"(word),"rm"(a) \
		: "cc"); \
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(carry),"+d"(high) \
		: "a"(low),"g"(0) \
		: "cc"); \
	(r)=carry; \
	carry=high; \
	} while (0)

/*
 * sqr(r0,r1,a):  (r1,r0) = a*a, r0 receiving the low word and r1 the
 * high word.  Wrapped in do/while so that it behaves as one statement.
 */
#define sqr(r0,r1,a) do { \
	__asm__ ("mulq %2" \
		: "=a"(r0),"=d"(r1) \
		: "a"(a) \
		: "cc"); \
	} while (0)
108
109	BN_ULONG bn_mul_add_words(BN_ULONG rp, const* BN_ULONG ap, int* num, BN_ULONG w)
110	{
111	BN_ULONG c1=`0`;
112
113	if (num <= `0`) return(c1);
114
115	while (num&~`3`)
116	{
117	mul_add(rp[`0`],ap[`0`],w,c1);
118	mul_add(rp[`1`],ap[`1`],w,c1);
119	mul_add(rp[`2`],ap[`2`],w,c1);
120	mul_add(rp[`3`],ap[`3`],w,c1);
121	ap+=`4`; rp+=`4`; num-=`4`;
122	}
123	if (num)
124	{
125	mul_add(rp[`0`],ap[`0`],w,c1); if (--num==`0`) return c1;
126	mul_add(rp[`1`],ap[`1`],w,c1); if (--num==`0`) return c1;
127	mul_add(rp[`2`],ap[`2`],w,c1); return c1;
128	}
129
130	return(c1);
131	}
132
133	BN_ULONG bn_mul_words(BN_ULONG rp, const* BN_ULONG ap, int* num, BN_ULONG w)
134	{
135	BN_ULONG c1=`0`;
136
137	if (num <= `0`) return(c1);
138
139	while (num&~`3`)
140	{
141	mul(rp[`0`],ap[`0`],w,c1);
142	mul(rp[`1`],ap[`1`],w,c1);
143	mul(rp[`2`],ap[`2`],w,c1);
144	mul(rp[`3`],ap[`3`],w,c1);
145	ap+=`4`; rp+=`4`; num-=`4`;
146	}
147	if (num)
148	{
149	mul(rp[`0`],ap[`0`],w,c1); if (--num == `0`) return c1;
150	mul(rp[`1`],ap[`1`],w,c1); if (--num == `0`) return c1;
151	mul(rp[`2`],ap[`2`],w,c1);
152	}
153	return(c1);
154	}
155
156	void bn_sqr_words(BN_ULONG r, const* BN_ULONG a, int* n)
157	{
158	if (n <= `0`) return;
159
160	while (n&~`3`)
161	{
162	sqr(r[`0`],r[`1`],a[`0`]);
163	sqr(r[`2`],r[`3`],a[`1`]);
164	sqr(r[`4`],r[`5`],a[`2`]);
165	sqr(r[`6`],r[`7`],a[`3`]);
166	a+=`4`; r+=`8`; n-=`4`;
167	}
168	if (n)
169	{
170	sqr(r[`0`],r[`1`],a[`0`]); if (--n == `0`) return;
171	sqr(r[`2`],r[`3`],a[`1`]); if (--n == `0`) return;
172	sqr(r[`4`],r[`5`],a[`2`]);
173	}
174	}
175
176	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
177	{ BN_ULONG ret,waste;
178
179	asm ("divq %4"
180	: "=a"(ret),"=d"(waste)
181	: "a"(l),"d"(h),"g"(d)
182	: "cc");
183
184	return ret;
185	}
186
187	BN_ULONG bn_add_words (BN_ULONG rp, const* BN_ULONG ap, const* BN_ULONG bp,int* n)
188	{ BN_ULONG ret=`0`,i=`0`;
189
190	if (n <= `0`) return `0`;
191
192	asm (
193	" subq %2,%2 \n"
194	".p2align 4 \n"
195	"1: movq (%4,%2,8),%0 \n"
196	" adcq (%5,%2,8),%0 \n"
197	" movq %0,(%3,%2,8) \n"
198	" leaq 1(%2),%2 \n"
199	" loop 1b \n"
200	" sbbq %0,%0 \n"
201	: "=&a"(ret),"+c"(n),"=&r"(i)
202	: "r"(rp),"r"(ap),"r"(bp)
203	: "cc"
204	);
205
206	return ret&`1`;
207	}
208
209	#ifndef SIMICS
210	BN_ULONG bn_sub_words (BN_ULONG rp, const* BN_ULONG ap, const* BN_ULONG bp,int* n)
211	{ BN_ULONG ret=`0`,i=`0`;
212
213	if (n <= `0`) return `0`;
214
215	asm (
216	" subq %2,%2 \n"
217	".p2align 4 \n"
218	"1: movq (%4,%2,8),%0 \n"
219	" sbbq (%5,%2,8),%0 \n"
220	" movq %0,(%3,%2,8) \n"
221	" leaq 1(%2),%2 \n"
222	" loop 1b \n"
223	" sbbq %0,%0 \n"
224	: "=&a"(ret),"+c"(n),"=&r"(i)
225	: "r"(rp),"r"(ap),"r"(bp)
226	: "cc"
227	);
228
229	return ret&`1`;
230	}
231	#else
232	/ Simics 1.4<7 has buggy sbbq:-( /
233	#define BN_MASK2 0xffffffffffffffffL
234	BN_ULONG bn_sub_words(BN_ULONG r, BN_ULONG a, BN_ULONG b, int* n)
235	{
236	BN_ULONG t1,t2;
237	int c=`0`;
238
239	if (n <= `0`) return((BN_ULONG)`0`);
240
241	for (;;)
242	{
243	t1=a[`0`]; t2=b[`0`];
244	r[`0`]=(t1-t2-c)&BN_MASK2;
245	if (t1 != t2) c=(t1 < t2);
246	if (--n <= `0`) break;
247
248	t1=a[`1`]; t2=b[`1`];
249	r[`1`]=(t1-t2-c)&BN_MASK2;
250	if (t1 != t2) c=(t1 < t2);
251	if (--n <= `0`) break;
252
253	t1=a[`2`]; t2=b[`2`];
254	r[`2`]=(t1-t2-c)&BN_MASK2;
255	if (t1 != t2) c=(t1 < t2);
256	if (--n <= `0`) break;
257
258	t1=a[`3`]; t2=b[`3`];
259	r[`3`]=(t1-t2-c)&BN_MASK2;
260	if (t1 != t2) c=(t1 < t2);
261	if (--n <= `0`) break;
262
263	a+=`4`;
264	b+=`4`;
265	r+=`4`;
266	}
267	return(c);
268	}
269	#endif
270
/* mul_add_c(a,b,c0,c1,c2)    -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)   -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)    -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
275
/*
 * Column-accumulation helpers for the comba routines.  All of them use
 * two scratch variables named t1 and t2 that must be declared, with the
 * word type, in the scope where the macro is expanded.  c0, c1 and c2
 * must be word-sized lvalues; b (and a[j] for sqr_add_c2) must be an
 * lvalue because it is used as a memory operand.
 */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) do { \
	BN_ULONG ta=(a),tb=(b); \
	t1 = ta * tb; \
	t2 = BN_UMULT_HIGH(ta,tb); \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	} while (0)

/* adds the product twice instead of doubling it, so no carry is lost */
#define mul_add_c2(a,b,c0,c1,c2) do { \
	BN_ULONG ta=(a),tb=(b),tt; \
	t1 = ta * tb; \
	t2 = BN_UMULT_HIGH(ta,tb); \
	c0 += t1; tt = t2 + ((c0<t1)?1:0); \
	c1 += tt; c2 += (c1<tt)?1:0; \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	} while (0)
#else
/*
 * (c2,c1,c0) += a*b.  Carrying into the high word of the product cannot
 * overflow, because that word is never all-ones.
 */
#define mul_add_c(a,b,c0,c1,c2) do { \
	__asm__ ("mulq %3" \
		: "=a"(t1),"=d"(t2) \
		: "a"(a),"m"(b) \
		: "cc"); \
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(c0),"+d"(t2) \
		: "a"(t1),"g"(0) \
		: "cc"); \
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(c1),"+r"(c2) \
		: "d"(t2),"g"(0) \
		: "cc"); \
	} while (0)

/* (c2,c1,c0) += a[i]*a[i] */
#define sqr_add_c(a,i,c0,c1,c2) do { \
	__asm__ ("mulq %2" \
		: "=a"(t1),"=d"(t2) \
		: "a"((a)[i]) \
		: "cc"); \
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(c0),"+d"(t2) \
		: "a"(t1),"g"(0) \
		: "cc"); \
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(c1),"+r"(c2) \
		: "d"(t2),"g"(0) \
		: "cc"); \
	} while (0)

/*
 * (c2,c1,c0) += 2*a*b.
 *
 * The product (t2,t1) is added into the accumulator twice, each time
 * with the carry propagated through all three words.  Doubling the
 * product in place first and then folding the carry of the low-word
 * addition into t2 is NOT safe: the doubled high word can be all-ones,
 * so that extra carry would wrap it to zero and be lost.
 */
#define mul_add_c2(a,b,c0,c1,c2) do { \
	__asm__ ("mulq %3" \
		: "=a"(t1),"=d"(t2) \
		: "a"(a),"m"(b) \
		: "cc"); \
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
		: "+r"(c0),"+r"(c1),"+r"(c2) \
		: "r"(t1),"r"(t2),"g"(0) \
		: "cc"); \
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
		: "+r"(c0),"+r"(c1),"+r"(c2) \
		: "r"(t1),"r"(t2),"g"(0) \
		: "cc"); \
	} while (0)
#endif

/* (c2,c1,c0) += 2*a[i]*a[j] */
#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
352
353	void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
354	{
355	BN_ULONG t1,t2;
356	BN_ULONG c1,c2,c3;
357
358	c1=`0`;
359	c2=`0`;
360	c3=`0`;
361	mul_add_c(a[`0`],b[`0`],c1,c2,c3);
362	r[`0`]=c1;
363	c1=`0`;
364	mul_add_c(a[`0`],b[`1`],c2,c3,c1);
365	mul_add_c(a[`1`],b[`0`],c2,c3,c1);
366	r[`1`]=c2;
367	c2=`0`;
368	mul_add_c(a[`2`],b[`0`],c3,c1,c2);
369	mul_add_c(a[`1`],b[`1`],c3,c1,c2);
370	mul_add_c(a[`0`],b[`2`],c3,c1,c2);
371	r[`2`]=c3;
372	c3=`0`;
373	mul_add_c(a[`0`],b[`3`],c1,c2,c3);
374	mul_add_c(a[`1`],b[`2`],c1,c2,c3);
375	mul_add_c(a[`2`],b[`1`],c1,c2,c3);
376	mul_add_c(a[`3`],b[`0`],c1,c2,c3);
377	r[`3`]=c1;
378	c1=`0`;
379	mul_add_c(a[`4`],b[`0`],c2,c3,c1);
380	mul_add_c(a[`3`],b[`1`],c2,c3,c1);
381	mul_add_c(a[`2`],b[`2`],c2,c3,c1);
382	mul_add_c(a[`1`],b[`3`],c2,c3,c1);
383	mul_add_c(a[`0`],b[`4`],c2,c3,c1);
384	r[`4`]=c2;
385	c2=`0`;
386	mul_add_c(a[`0`],b[`5`],c3,c1,c2);
387	mul_add_c(a[`1`],b[`4`],c3,c1,c2);
388	mul_add_c(a[`2`],b[`3`],c3,c1,c2);
389	mul_add_c(a[`3`],b[`2`],c3,c1,c2);
390	mul_add_c(a[`4`],b[`1`],c3,c1,c2);
391	mul_add_c(a[`5`],b[`0`],c3,c1,c2);
392	r[`5`]=c3;
393	c3=`0`;
394	mul_add_c(a[`6`],b[`0`],c1,c2,c3);
395	mul_add_c(a[`5`],b[`1`],c1,c2,c3);
396	mul_add_c(a[`4`],b[`2`],c1,c2,c3);
397	mul_add_c(a[`3`],b[`3`],c1,c2,c3);
398	mul_add_c(a[`2`],b[`4`],c1,c2,c3);
399	mul_add_c(a[`1`],b[`5`],c1,c2,c3);
400	mul_add_c(a[`0`],b[`6`],c1,c2,c3);
401	r[`6`]=c1;
402	c1=`0`;
403	mul_add_c(a[`0`],b[`7`],c2,c3,c1);
404	mul_add_c(a[`1`],b[`6`],c2,c3,c1);
405	mul_add_c(a[`2`],b[`5`],c2,c3,c1);
406	mul_add_c(a[`3`],b[`4`],c2,c3,c1);
407	mul_add_c(a[`4`],b[`3`],c2,c3,c1);
408	mul_add_c(a[`5`],b[`2`],c2,c3,c1);
409	mul_add_c(a[`6`],b[`1`],c2,c3,c1);
410	mul_add_c(a[`7`],b[`0`],c2,c3,c1);
411	r[`7`]=c2;
412	c2=`0`;
413	mul_add_c(a[`7`],b[`1`],c3,c1,c2);
414	mul_add_c(a[`6`],b[`2`],c3,c1,c2);
415	mul_add_c(a[`5`],b[`3`],c3,c1,c2);
416	mul_add_c(a[`4`],b[`4`],c3,c1,c2);
417	mul_add_c(a[`3`],b[`5`],c3,c1,c2);
418	mul_add_c(a[`2`],b[`6`],c3,c1,c2);
419	mul_add_c(a[`1`],b[`7`],c3,c1,c2);
420	r[`8`]=c3;
421	c3=`0`;
422	mul_add_c(a[`2`],b[`7`],c1,c2,c3);
423	mul_add_c(a[`3`],b[`6`],c1,c2,c3);
424	mul_add_c(a[`4`],b[`5`],c1,c2,c3);
425	mul_add_c(a[`5`],b[`4`],c1,c2,c3);
426	mul_add_c(a[`6`],b[`3`],c1,c2,c3);
427	mul_add_c(a[`7`],b[`2`],c1,c2,c3);
428	r[`9`]=c1;
429	c1=`0`;
430	mul_add_c(a[`7`],b[`3`],c2,c3,c1);
431	mul_add_c(a[`6`],b[`4`],c2,c3,c1);
432	mul_add_c(a[`5`],b[`5`],c2,c3,c1);
433	mul_add_c(a[`4`],b[`6`],c2,c3,c1);
434	mul_add_c(a[`3`],b[`7`],c2,c3,c1);
435	r[`10`]=c2;
436	c2=`0`;
437	mul_add_c(a[`4`],b[`7`],c3,c1,c2);
438	mul_add_c(a[`5`],b[`6`],c3,c1,c2);
439	mul_add_c(a[`6`],b[`5`],c3,c1,c2);
440	mul_add_c(a[`7`],b[`4`],c3,c1,c2);
441	r[`11`]=c3;
442	c3=`0`;
443	mul_add_c(a[`7`],b[`5`],c1,c2,c3);
444	mul_add_c(a[`6`],b[`6`],c1,c2,c3);
445	mul_add_c(a[`5`],b[`7`],c1,c2,c3);
446	r[`12`]=c1;
447	c1=`0`;
448	mul_add_c(a[`6`],b[`7`],c2,c3,c1);
449	mul_add_c(a[`7`],b[`6`],c2,c3,c1);
450	r[`13`]=c2;
451	c2=`0`;
452	mul_add_c(a[`7`],b[`7`],c3,c1,c2);
453	r[`14`]=c3;
454	r[`15`]=c1;
455	}
456
457	void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
458	{
459	BN_ULONG t1,t2;
460	BN_ULONG c1,c2,c3;
461
462	c1=`0`;
463	c2=`0`;
464	c3=`0`;
465	mul_add_c(a[`0`],b[`0`],c1,c2,c3);
466	r[`0`]=c1;
467	c1=`0`;
468	mul_add_c(a[`0`],b[`1`],c2,c3,c1);
469	mul_add_c(a[`1`],b[`0`],c2,c3,c1);
470	r[`1`]=c2;
471	c2=`0`;
472	mul_add_c(a[`2`],b[`0`],c3,c1,c2);
473	mul_add_c(a[`1`],b[`1`],c3,c1,c2);
474	mul_add_c(a[`0`],b[`2`],c3,c1,c2);
475	r[`2`]=c3;
476	c3=`0`;
477	mul_add_c(a[`0`],b[`3`],c1,c2,c3);
478	mul_add_c(a[`1`],b[`2`],c1,c2,c3);
479	mul_add_c(a[`2`],b[`1`],c1,c2,c3);
480	mul_add_c(a[`3`],b[`0`],c1,c2,c3);
481	r[`3`]=c1;
482	c1=`0`;
483	mul_add_c(a[`3`],b[`1`],c2,c3,c1);
484	mul_add_c(a[`2`],b[`2`],c2,c3,c1);
485	mul_add_c(a[`1`],b[`3`],c2,c3,c1);
486	r[`4`]=c2;
487	c2=`0`;
488	mul_add_c(a[`2`],b[`3`],c3,c1,c2);
489	mul_add_c(a[`3`],b[`2`],c3,c1,c2);
490	r[`5`]=c3;
491	c3=`0`;
492	mul_add_c(a[`3`],b[`3`],c1,c2,c3);
493	r[`6`]=c1;
494	r[`7`]=c2;
495	}
496
497	void bn_sqr_comba8(BN_ULONG r, const* BN_ULONG *a)
498	{
499	BN_ULONG t1,t2;
500	BN_ULONG c1,c2,c3;
501
502	c1=`0`;
503	c2=`0`;
504	c3=`0`;
505	sqr_add_c(a,`0`,c1,c2,c3);
506	r[`0`]=c1;
507	c1=`0`;
508	sqr_add_c2(a,`1`,`0`,c2,c3,c1);
509	r[`1`]=c2;
510	c2=`0`;
511	sqr_add_c(a,`1`,c3,c1,c2);
512	sqr_add_c2(a,`2`,`0`,c3,c1,c2);
513	r[`2`]=c3;
514	c3=`0`;
515	sqr_add_c2(a,`3`,`0`,c1,c2,c3);
516	sqr_add_c2(a,`2`,`1`,c1,c2,c3);
517	r[`3`]=c1;
518	c1=`0`;
519	sqr_add_c(a,`2`,c2,c3,c1);
520	sqr_add_c2(a,`3`,`1`,c2,c3,c1);
521	sqr_add_c2(a,`4`,`0`,c2,c3,c1);
522	r[`4`]=c2;
523	c2=`0`;
524	sqr_add_c2(a,`5`,`0`,c3,c1,c2);
525	sqr_add_c2(a,`4`,`1`,c3,c1,c2);
526	sqr_add_c2(a,`3`,`2`,c3,c1,c2);
527	r[`5`]=c3;
528	c3=`0`;
529	sqr_add_c(a,`3`,c1,c2,c3);
530	sqr_add_c2(a,`4`,`2`,c1,c2,c3);
531	sqr_add_c2(a,`5`,`1`,c1,c2,c3);
532	sqr_add_c2(a,`6`,`0`,c1,c2,c3);
533	r[`6`]=c1;
534	c1=`0`;
535	sqr_add_c2(a,`7`,`0`,c2,c3,c1);
536	sqr_add_c2(a,`6`,`1`,c2,c3,c1);
537	sqr_add_c2(a,`5`,`2`,c2,c3,c1);
538	sqr_add_c2(a,`4`,`3`,c2,c3,c1);
539	r[`7`]=c2;
540	c2=`0`;
541	sqr_add_c(a,`4`,c3,c1,c2);
542	sqr_add_c2(a,`5`,`3`,c3,c1,c2);
543	sqr_add_c2(a,`6`,`2`,c3,c1,c2);
544	sqr_add_c2(a,`7`,`1`,c3,c1,c2);
545	r[`8`]=c3;
546	c3=`0`;
547	sqr_add_c2(a,`7`,`2`,c1,c2,c3);
548	sqr_add_c2(a,`6`,`3`,c1,c2,c3);
549	sqr_add_c2(a,`5`,`4`,c1,c2,c3);
550	r[`9`]=c1;
551	c1=`0`;
552	sqr_add_c(a,`5`,c2,c3,c1);
553	sqr_add_c2(a,`6`,`4`,c2,c3,c1);
554	sqr_add_c2(a,`7`,`3`,c2,c3,c1);
555	r[`10`]=c2;
556	c2=`0`;
557	sqr_add_c2(a,`7`,`4`,c3,c1,c2);
558	sqr_add_c2(a,`6`,`5`,c3,c1,c2);
559	r[`11`]=c3;
560	c3=`0`;
561	sqr_add_c(a,`6`,c1,c2,c3);
562	sqr_add_c2(a,`7`,`5`,c1,c2,c3);
563	r[`12`]=c1;
564	c1=`0`;
565	sqr_add_c2(a,`7`,`6`,c2,c3,c1);
566	r[`13`]=c2;
567	c2=`0`;
568	sqr_add_c(a,`7`,c3,c1,c2);
569	r[`14`]=c3;
570	r[`15`]=c1;
571	}
572
573	void bn_sqr_comba4(BN_ULONG r, const* BN_ULONG *a)
574	{
575	BN_ULONG t1,t2;
576	BN_ULONG c1,c2,c3;
577
578	c1=`0`;
579	c2=`0`;
580	c3=`0`;
581	sqr_add_c(a,`0`,c1,c2,c3);
582	r[`0`]=c1;
583	c1=`0`;
584	sqr_add_c2(a,`1`,`0`,c2,c3,c1);
585	r[`1`]=c2;
586	c2=`0`;
587	sqr_add_c(a,`1`,c3,c1,c2);
588	sqr_add_c2(a,`2`,`0`,c3,c1,c2);
589	r[`2`]=c3;
590	c3=`0`;
591	sqr_add_c2(a,`3`,`0`,c1,c2,c3);
592	sqr_add_c2(a,`2`,`1`,c1,c2,c3);
593	r[`3`]=c1;
594	c1=`0`;
595	sqr_add_c(a,`2`,c2,c3,c1);
596	sqr_add_c2(a,`3`,`1`,c2,c3,c1);
597	r[`4`]=c2;
598	c2=`0`;
599	sqr_add_c2(a,`3`,`2`,c3,c1,c2);
600	r[`5`]=c3;
601	c3=`0`;
602	sqr_add_c(a,`3`,c1,c2,c3);
603	r[`6`]=c1;
604	r[`7`]=c2;
605	}
606	#endif
607