1 | #include "../bn_lcl.h" |
2 | #if !(defined(__GNUC__) && __GNUC__>=2) |
3 | # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ |
4 | #else |
5 | /* |
6 | * x86_64 BIGNUM accelerator version 0.1, December 2002. |
7 | * |
8 | * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
9 | * project. |
10 | * |
11 | * Rights for redistribution and usage in source and binary forms are |
12 | * granted according to the OpenSSL license. Warranty of any kind is |
13 | * disclaimed. |
14 | * |
15 | * Q. Version 0.1? It doesn't sound like Andy, he used to assign real |
16 | * versions, like 1.0... |
17 | * A. Well, that's because this code is basically a quick-n-dirty |
18 | * proof-of-concept hack. As you can see it's implemented with |
19 | * inline assembler, which means that you're bound to GCC and that |
20 | * there might be enough room for further improvement. |
21 | * |
22 | * Q. Why inline assembler? |
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference:
 *    Win64 implements a different ABI for AMD64 than Linux does.
27 | * |
28 | * Q. How much faster does it get? |
29 | * A. 'apps/openssl speed rsa dsa' output with no-asm: |
30 | * |
31 | * sign verify sign/s verify/s |
32 | * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 |
33 | * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 |
34 | * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 |
35 | * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 |
36 | * sign verify sign/s verify/s |
37 | * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 |
38 | * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 |
39 | * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 |
40 | * |
41 | * 'apps/openssl speed rsa dsa' output with this module: |
42 | * |
43 | * sign verify sign/s verify/s |
44 | * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 |
45 | * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 |
46 | * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 |
47 | * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 |
48 | * sign verify sign/s verify/s |
49 | * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 |
50 | * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 |
51 | * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 |
52 | * |
 * For reference: the IA-32 assembler implementation performs
 * very much like 64-bit code compiled with no-asm on the same
 * machine.
56 | */ |
57 | |
/*
 * BN_ULONG must be a 64-bit integer: Win64 is LLP64 (plain long is only
 * 32 bits there), while the other targets of this file are LP64.
 */
#ifdef _WIN64
#define BN_ULONG unsigned long long
#else
#define BN_ULONG unsigned long
#endif

/* Discard any generic definitions so the inline-asm versions below win. */
#undef mul
#undef mul_add
#undef sqr

/*
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) lets the compiler decide where it wants to keep
 * the value of zero;
 */

/*
 * mul_add(r,a,word,carry):  r += a*word + carry; the new high-word carry
 * is left in `carry`.  Three self-contained asm statements:
 *   1. (high,low) = a * word      -- mulq computes RDX:RAX = RAX * src;
 *   2. (high,low) += carry        -- note the carry is handed between the
 *      asm statements in registers, never via the CPU carry flag, since
 *      flags do not survive across asm statement boundaries;
 *   3. r += low with the carry-out folded into high; then carry = high.
 */
#define mul_add(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	asm ("mulq %3"			\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"m"(a)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+m"(r),"+d"(high)	\
		: "r"(carry),"g"(0)	\
		: "cc");		\
	carry=high;			\
	} while (0)

/*
 * mul(r,a,word,carry):  r = a*word + carry; carry = high product word
 * plus the carry-out of the low-word addition.
 * NOTE(review): the "g"(a) constraint also admits an immediate operand,
 * which mulq cannot encode; safe in practice because `a` is always an
 * array element at the call sites below -- confirm if callers change.
 */
#define mul(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	asm ("mulq %3"			\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"g"(a)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	(r)=carry, carry=high;		\
	} while (0)

/*
 * sqr(r0,r1,a):  (r1,r0) = a^2.  Operand %2 is the same register as the
 * implicit multiplicand (RAX), so mulq squares the value.  The macro body
 * already ends in ';', so call sites written as "sqr(...);" merely expand
 * to an extra, harmless empty statement.
 */
#define sqr(r0,r1,a)			\
	asm ("mulq %2"			\
		: "=a"(r0),"=d"(r1)	\
		: "a"(a)		\
		: "cc");
108 | |
/*
 * bn_mul_add_words -- rp[i] += ap[i]*w for i in [0,num); returns the
 * final carry word.  The main loop is unrolled four-fold; the tail
 * consumes the remaining 1..3 words.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
	{
	BN_ULONG c1=0;		/* running inter-word carry */

	if (num <= 0) return(c1);

	/* num&~3 is non-zero exactly while num >= 4 (num is positive here) */
	while (num&~3)
		{
		mul_add(rp[0],ap[0],w,c1);
		mul_add(rp[1],ap[1],w,c1);
		mul_add(rp[2],ap[2],w,c1);
		mul_add(rp[3],ap[3],w,c1);
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		/* 1..3 leftover words; return as soon as they are consumed */
		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
		mul_add(rp[2],ap[2],w,c1); return c1;
		}

	return(c1);
	}
132 | |
/*
 * bn_mul_words -- rp[i] = ap[i]*w + carry for i in [0,num); returns the
 * final carry word.  Unlike bn_mul_add_words() the destination words are
 * overwritten rather than accumulated into.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
	{
	BN_ULONG c1=0;		/* running inter-word carry */

	if (num <= 0) return(c1);

	/* four-fold unrolled main loop; num&~3 != 0 iff num >= 4 */
	while (num&~3)
		{
		mul(rp[0],ap[0],w,c1);
		mul(rp[1],ap[1],w,c1);
		mul(rp[2],ap[2],w,c1);
		mul(rp[3],ap[3],w,c1);
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		/* 1..3 leftover words */
		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
		mul(rp[2],ap[2],w,c1);
		}
	return(c1);
	}
155 | |
/*
 * bn_sqr_words -- r[2i],r[2i+1] = low,high word of a[i]^2 for i in [0,n).
 * The result array therefore occupies 2n words.  No carries propagate
 * between squares, so the iterations are independent.
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
	{
	if (n <= 0) return;

	/* four-fold unrolled main loop; n&~3 != 0 iff n >= 4 */
	while (n&~3)
		{
		sqr(r[0],r[1],a[0]);
		sqr(r[2],r[3],a[1]);
		sqr(r[4],r[5],a[2]);
		sqr(r[6],r[7],a[3]);
		a+=4; r+=8; n-=4;
		}
	if (n)
		{
		/* 1..3 leftover squares */
		sqr(r[0],r[1],a[0]); if (--n == 0) return;
		sqr(r[2],r[3],a[1]); if (--n == 0) return;
		sqr(r[4],r[5],a[2]);
		}
	}
175 | |
176 | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
177 | { BN_ULONG ret,waste; |
178 | |
179 | asm ("divq %4" |
180 | : "=a" (ret),"=d" (waste) |
181 | : "a" (l),"d" (h),"g" (d) |
182 | : "cc" ); |
183 | |
184 | return ret; |
185 | } |
186 | |
/*
 * bn_add_words -- rp[i] = ap[i] + bp[i] over n words with carry
 * propagation; returns the final carry (0 or 1).
 *
 * The whole loop lives inside ONE asm statement so the CPU carry flag
 * can survive from one iteration to the next:
 *   subq %2,%2   zeroes the index register AND clears CF for the first adcq;
 *   leaq 1(%2)   increments the index without touching the flags
 *                (inc/add would clobber CF);
 *   loop 1b      decrements the count in rcx, also flag-neutral;
 *   sbbq %0,%0   turns the final CF into 0 or all-ones, normalized to
 *                0/1 by the "ret&1" on return.
 */
BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
	{ BN_ULONG ret=0,i=0;	/* i is the asm's scratch index register */

	if (n <= 0) return 0;

	asm (
	"	subq	%2,%2		\n"
	".p2align 4			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	adcq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
	: "=&a" (ret),"+c" (n),"=&r" (i)
	: "r" (rp),"r" (ap),"r" (bp)
	: "cc"
	);

	return ret&1;
	}
208 | |
209 | #ifndef SIMICS |
/*
 * bn_sub_words -- rp[i] = ap[i] - bp[i] over n words with borrow
 * propagation; returns the final borrow (0 or 1).
 *
 * Identical single-asm loop structure to bn_add_words() above, with
 * sbbq (subtract-with-borrow) replacing adcq: subq %2,%2 clears the
 * index and CF, leaq/loop keep the flags intact between iterations,
 * and the final sbbq %0,%0 materializes the borrow as 0 or all-ones,
 * normalized by "ret&1".
 */
BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
	{ BN_ULONG ret=0,i=0;	/* i is the asm's scratch index register */

	if (n <= 0) return 0;

	asm (
	"	subq	%2,%2		\n"
	".p2align 4			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	sbbq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
	: "=&a" (ret),"+c" (n),"=&r" (i)
	: "r" (rp),"r" (ap),"r" (bp)
	: "cc"
	);

	return ret&1;
	}
231 | #else |
232 | /* Simics 1.4<7 has buggy sbbq:-( */ |
233 | #define BN_MASK2 0xffffffffffffffffL |
234 | BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) |
235 | { |
236 | BN_ULONG t1,t2; |
237 | int c=0; |
238 | |
239 | if (n <= 0) return((BN_ULONG)0); |
240 | |
241 | for (;;) |
242 | { |
243 | t1=a[0]; t2=b[0]; |
244 | r[0]=(t1-t2-c)&BN_MASK2; |
245 | if (t1 != t2) c=(t1 < t2); |
246 | if (--n <= 0) break; |
247 | |
248 | t1=a[1]; t2=b[1]; |
249 | r[1]=(t1-t2-c)&BN_MASK2; |
250 | if (t1 != t2) c=(t1 < t2); |
251 | if (--n <= 0) break; |
252 | |
253 | t1=a[2]; t2=b[2]; |
254 | r[2]=(t1-t2-c)&BN_MASK2; |
255 | if (t1 != t2) c=(t1 < t2); |
256 | if (--n <= 0) break; |
257 | |
258 | t1=a[3]; t2=b[3]; |
259 | r[3]=(t1-t2-c)&BN_MASK2; |
260 | if (t1 != t2) c=(t1 < t2); |
261 | if (--n <= 0) break; |
262 | |
263 | a+=4; |
264 | b+=4; |
265 | r+=4; |
266 | } |
267 | return(c); |
268 | } |
269 | #endif |
270 | |
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
/*
 * All of these expect two scratch words t1,t2 to be declared by the
 * caller (as the comba routines below do).  As elsewhere in this file,
 * carries are handed between the separate asm statements in registers,
 * never via the CPU carry flag.
 */

#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}
#else
/*
 * (t2,t1) = a*b, then c0 += t1 (carry into t2), then c1 += t2 (carry
 * into c2) -- i.e. the 128-bit product is folded into (c2,c1,c0).
 */
#define mul_add_c(a,b,c0,c1,c2) do {	\
	asm ("mulq %3"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)

/* Same as mul_add_c but squaring a[i]: mulq with both operands in RAX. */
#define sqr_add_c(a,i,c0,c1,c2) do {	\
	asm ("mulq %2"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a[i])		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)

/*
 * Folds 2*a*b into (c2,c1,c0).  The product is doubled high word first:
 * after "addq %0,%0" t2 <= 2^64-4 (its lost top bit goes to c2), so the
 * subsequent adcq of the low-word doubling's carry cannot overflow t2
 * -- no carry is lost.  The doubled (t2,t1) is then accumulated exactly
 * as in mul_add_c.
 */
#define mul_add_c2(a,b,c0,c1,c2) do {	\
	asm ("mulq %3"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	asm ("addq %0,%0; adcq %2,%1"	\
		: "+d"(t2),"+r"(c2)	\
		: "g"(0)		\
		: "cc");		\
	asm ("addq %0,%0; adcq %2,%1"	\
		: "+a"(t1),"+d"(t2)	\
		: "g"(0)		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
#endif

/* Doubled cross term for squaring: 2*a[i]*a[j] folded into (c2,c1,c0). */
#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
352 | |
/*
 * bn_mul_comba8 -- Comba (column-wise) multiplication:
 * r[0..15] = a[0..7] * b[0..7].
 *
 * Column k sums every partial product a[i]*b[j] with i+j == k.
 * (c1,c2,c3) act as a rotating three-word column accumulator for
 * mul_add_c(): after each column's low word is stored into r[], that
 * register is cleared and becomes the new high word, so the
 * low/middle/high roles rotate by one per column.  t1,t2 are scratch
 * words required by the macros.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* columns 0..15 follow; each ends with r[k]=<low word>; <low>=0 */
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[4],b[0],c2,c3,c1);
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	mul_add_c(a[0],b[4],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[0],b[5],c3,c1,c2);
	mul_add_c(a[1],b[4],c3,c1,c2);
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	mul_add_c(a[4],b[1],c3,c1,c2);
	mul_add_c(a[5],b[0],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[6],b[0],c1,c2,c3);
	mul_add_c(a[5],b[1],c1,c2,c3);
	mul_add_c(a[4],b[2],c1,c2,c3);
	mul_add_c(a[3],b[3],c1,c2,c3);
	mul_add_c(a[2],b[4],c1,c2,c3);
	mul_add_c(a[1],b[5],c1,c2,c3);
	mul_add_c(a[0],b[6],c1,c2,c3);
	r[6]=c1;
	c1=0;
	mul_add_c(a[0],b[7],c2,c3,c1);
	mul_add_c(a[1],b[6],c2,c3,c1);
	mul_add_c(a[2],b[5],c2,c3,c1);
	mul_add_c(a[3],b[4],c2,c3,c1);
	mul_add_c(a[4],b[3],c2,c3,c1);
	mul_add_c(a[5],b[2],c2,c3,c1);
	mul_add_c(a[6],b[1],c2,c3,c1);
	mul_add_c(a[7],b[0],c2,c3,c1);
	r[7]=c2;
	c2=0;
	/* from column 8 on the index pairs shrink again (i,j <= 7) */
	mul_add_c(a[7],b[1],c3,c1,c2);
	mul_add_c(a[6],b[2],c3,c1,c2);
	mul_add_c(a[5],b[3],c3,c1,c2);
	mul_add_c(a[4],b[4],c3,c1,c2);
	mul_add_c(a[3],b[5],c3,c1,c2);
	mul_add_c(a[2],b[6],c3,c1,c2);
	mul_add_c(a[1],b[7],c3,c1,c2);
	r[8]=c3;
	c3=0;
	mul_add_c(a[2],b[7],c1,c2,c3);
	mul_add_c(a[3],b[6],c1,c2,c3);
	mul_add_c(a[4],b[5],c1,c2,c3);
	mul_add_c(a[5],b[4],c1,c2,c3);
	mul_add_c(a[6],b[3],c1,c2,c3);
	mul_add_c(a[7],b[2],c1,c2,c3);
	r[9]=c1;
	c1=0;
	mul_add_c(a[7],b[3],c2,c3,c1);
	mul_add_c(a[6],b[4],c2,c3,c1);
	mul_add_c(a[5],b[5],c2,c3,c1);
	mul_add_c(a[4],b[6],c2,c3,c1);
	mul_add_c(a[3],b[7],c2,c3,c1);
	r[10]=c2;
	c2=0;
	mul_add_c(a[4],b[7],c3,c1,c2);
	mul_add_c(a[5],b[6],c3,c1,c2);
	mul_add_c(a[6],b[5],c3,c1,c2);
	mul_add_c(a[7],b[4],c3,c1,c2);
	r[11]=c3;
	c3=0;
	mul_add_c(a[7],b[5],c1,c2,c3);
	mul_add_c(a[6],b[6],c1,c2,c3);
	mul_add_c(a[5],b[7],c1,c2,c3);
	r[12]=c1;
	c1=0;
	mul_add_c(a[6],b[7],c2,c3,c1);
	mul_add_c(a[7],b[6],c2,c3,c1);
	r[13]=c2;
	c2=0;
	mul_add_c(a[7],b[7],c3,c1,c2);
	r[14]=c3;
	/* top column has no products; its word is just the leftover carry */
	r[15]=c1;
	}
456 | |
/*
 * bn_mul_comba4 -- Comba (column-wise) multiplication:
 * r[0..7] = a[0..3] * b[0..3].
 *
 * Same scheme as bn_mul_comba8(): column k sums a[i]*b[j] with i+j == k,
 * and (c1,c2,c3) rotate as the three-word column accumulator.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[3],b[3],c1,c2,c3);
	r[6]=c1;
	/* top word is the leftover carry of the last column */
	r[7]=c2;
	}
496 | |
/*
 * bn_sqr_comba8 -- Comba squaring: r[0..15] = a[0..7]^2.
 *
 * Diagonal terms a[i]^2 use sqr_add_c(); each off-diagonal product
 * a[i]*a[j] (i > j) occurs twice in the square and uses sqr_add_c2(),
 * which doubles while accumulating.  (c1,c2,c3) rotate as the
 * three-word column accumulator, exactly as in bn_mul_comba8().
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* even columns carry one diagonal term; odd columns have none */
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	sqr_add_c2(a,4,0,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,5,0,c3,c1,c2);
	sqr_add_c2(a,4,1,c3,c1,c2);
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	sqr_add_c2(a,4,2,c1,c2,c3);
	sqr_add_c2(a,5,1,c1,c2,c3);
	sqr_add_c2(a,6,0,c1,c2,c3);
	r[6]=c1;
	c1=0;
	sqr_add_c2(a,7,0,c2,c3,c1);
	sqr_add_c2(a,6,1,c2,c3,c1);
	sqr_add_c2(a,5,2,c2,c3,c1);
	sqr_add_c2(a,4,3,c2,c3,c1);
	r[7]=c2;
	c2=0;
	sqr_add_c(a,4,c3,c1,c2);
	sqr_add_c2(a,5,3,c3,c1,c2);
	sqr_add_c2(a,6,2,c3,c1,c2);
	sqr_add_c2(a,7,1,c3,c1,c2);
	r[8]=c3;
	c3=0;
	sqr_add_c2(a,7,2,c1,c2,c3);
	sqr_add_c2(a,6,3,c1,c2,c3);
	sqr_add_c2(a,5,4,c1,c2,c3);
	r[9]=c1;
	c1=0;
	sqr_add_c(a,5,c2,c3,c1);
	sqr_add_c2(a,6,4,c2,c3,c1);
	sqr_add_c2(a,7,3,c2,c3,c1);
	r[10]=c2;
	c2=0;
	sqr_add_c2(a,7,4,c3,c1,c2);
	sqr_add_c2(a,6,5,c3,c1,c2);
	r[11]=c3;
	c3=0;
	sqr_add_c(a,6,c1,c2,c3);
	sqr_add_c2(a,7,5,c1,c2,c3);
	r[12]=c1;
	c1=0;
	sqr_add_c2(a,7,6,c2,c3,c1);
	r[13]=c2;
	c2=0;
	sqr_add_c(a,7,c3,c1,c2);
	r[14]=c3;
	/* top word is the leftover carry of the last column */
	r[15]=c1;
	}
572 | |
/*
 * bn_sqr_comba4 -- Comba squaring: r[0..7] = a[0..3]^2.
 *
 * Same scheme as bn_sqr_comba8(): sqr_add_c() for the diagonal a[i]^2
 * terms, sqr_add_c2() for the doubled cross terms, with (c1,c2,c3)
 * rotating as the three-word column accumulator.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	r[6]=c1;
	/* top word is the leftover carry of the last column */
	r[7]=c2;
	}
606 | #endif |
607 | |