cryptlib  3.4.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Macros
x86_64-gcc.c
Go to the documentation of this file.
1 /*
2  * x86_64 BIGNUM accelerator version 0.1, December 2002.
3  *
4  * Implemented by Andy Polyakov <[email protected]> for the OpenSSL
5  * project.
6  *
7  * Rights for redistribution and usage in source and binary forms are
8  * granted according to the OpenSSL license. Warranty of any kind is
9  * disclaimed.
10  *
11  * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
12  * versions, like 1.0...
13  * A. Well, that's because this code is basically a quick-n-dirty
14  * proof-of-concept hack. As you can see it's implemented with
15  * inline assembler, which means that you're bound to GCC and that
16  * there might be enough room for further improvement.
17  *
18  * Q. Why inline assembler?
19  * A. x86_64 features own ABI which I'm not familiar with. This is
20  * why I decided to let the compiler take care of subroutine
21  * prologue/epilogue as well as register allocation. For reference.
22  * Win64 implements different ABI for AMD64, different from Linux.
23  *
24  * Q. How much faster does it get?
25  * A. 'apps/openssl speed rsa dsa' output with no-asm:
26  *
27  * sign verify sign/s verify/s
28  * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
29  * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
30  * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
31  * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
32  * sign verify sign/s verify/s
33  * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
34  * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
35  * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
36  *
37  * 'apps/openssl speed rsa dsa' output with this module:
38  *
39  * sign verify sign/s verify/s
40  * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
41  * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
42  * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
43  * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
44  * sign verify sign/s verify/s
45  * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
46  * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
47  * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
48  *
49  * For the reference. IA-32 assembler implementation performs
50  * very much like 64-bit code compiled with no-asm on the same
51  * machine.
52  */
53 
/* Limb type: one 64-bit machine word on x86_64 (LP64 unsigned long). */
#define BN_ULONG unsigned long
55 
/*
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) let the compiler to decide where does it
 * want to keep the value of zero;
 */
/*
 * mul_add(r,a,word,carry): r += a*word + carry, then carry = high word
 * of the 128-bit product chain.  Three asm statements: 64x64->128 mulq,
 * then two add/adc pairs to fold in the old carry and store into r.
 */
#define mul_add(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	asm ("mulq %3"			\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"m"(a)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+m"(r),"+d"(high)	\
		: "r"(carry),"g"(0)	\
		: "cc");		\
	carry=high;			\
	} while (0)
77 
/*
 * mul(r,a,word,carry): r = low64(a*word + carry), carry = high word.
 * Same shape as mul_add but r is overwritten instead of accumulated.
 */
#define mul(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	asm ("mulq %3"			\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"g"(a)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	(r)=carry, carry=high;		\
	} while (0)
90 
/*
 * sqr(r0,r1,a): (r1:r0) = a*a, one 64x64->128-bit mulq with both
 * operands in rax.  Wrapped in do/while(0) for consistency with
 * mul()/mul_add() above (the original bare form with a trailing
 * semicolon misbehaves in an unbraced if/else).
 */
#define sqr(r0,r1,a) do {		\
	asm ("mulq %2"			\
		: "=a"(r0),"=d"(r1)	\
		: "a"(a)		\
		: "cc");		\
	} while (0)
96 
98  {
99  BN_ULONG c1=0;
100 
101  if (num <= 0) return(c1);
102 
103  while (num&~3)
104  {
105  mul_add(rp[0],ap[0],w,c1);
106  mul_add(rp[1],ap[1],w,c1);
107  mul_add(rp[2],ap[2],w,c1);
108  mul_add(rp[3],ap[3],w,c1);
109  ap+=4; rp+=4; num-=4;
110  }
111  if (num)
112  {
113  mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
114  mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
115  mul_add(rp[2],ap[2],w,c1); return c1;
116  }
117 
118  return(c1);
119  }
120 
122  {
123  BN_ULONG c1=0;
124 
125  if (num <= 0) return(c1);
126 
127  while (num&~3)
128  {
129  mul(rp[0],ap[0],w,c1);
130  mul(rp[1],ap[1],w,c1);
131  mul(rp[2],ap[2],w,c1);
132  mul(rp[3],ap[3],w,c1);
133  ap+=4; rp+=4; num-=4;
134  }
135  if (num)
136  {
137  mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
138  mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
139  mul(rp[2],ap[2],w,c1);
140  }
141  return(c1);
142  }
143 
144 void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
145  {
146  if (n <= 0) return;
147 
148  while (n&~3)
149  {
150  sqr(r[0],r[1],a[0]);
151  sqr(r[2],r[3],a[1]);
152  sqr(r[4],r[5],a[2]);
153  sqr(r[6],r[7],a[3]);
154  a+=4; r+=8; n-=4;
155  }
156  if (n)
157  {
158  sqr(r[0],r[1],a[0]); if (--n == 0) return;
159  sqr(r[2],r[3],a[1]); if (--n == 0) return;
160  sqr(r[4],r[5],a[2]);
161  }
162  }
163 
165 { BN_ULONG ret,waste;
166 
167  asm ("divq %4"
168  : "=a"(ret),"=d"(waste)
169  : "a"(l),"d"(h),"g"(d)
170  : "cc");
171 
172  return ret;
173 }
174 
176 { BN_ULONG ret=0,i=0;
177 
178  if (n <= 0) return 0;
179 
180  asm (
181  " subq %2,%2 \n"
182  ".align 16 \n"
183  "1: movq (%4,%2,8),%0 \n"
184  " adcq (%5,%2,8),%0 \n"
185  " movq %0,(%3,%2,8) \n"
186  " leaq 1(%2),%2 \n"
187  " loop 1b \n"
188  " sbbq %0,%0 \n"
189  : "=&a"(ret),"+c"(n),"=&r"(i)
190  : "r"(rp),"r"(ap),"r"(bp)
191  : "cc"
192  );
193 
194  return ret&1;
195 }
196 
197 #ifndef SIMICS
199 { BN_ULONG ret=0,i=0;
200 
201  if (n <= 0) return 0;
202 
203  asm (
204  " subq %2,%2 \n"
205  ".align 16 \n"
206  "1: movq (%4,%2,8),%0 \n"
207  " sbbq (%5,%2,8),%0 \n"
208  " movq %0,(%3,%2,8) \n"
209  " leaq 1(%2),%2 \n"
210  " loop 1b \n"
211  " sbbq %0,%0 \n"
212  : "=&a"(ret),"+c"(n),"=&r"(i)
213  : "r"(rp),"r"(ap),"r"(bp)
214  : "cc"
215  );
216 
217  return ret&1;
218 }
219 #else
220 /* Simics 1.4<7 has buggy sbbq:-( */
221 #define BN_MASK2 0xffffffffffffffffL
223  {
224  BN_ULONG t1,t2;
225  int c=0;
226 
227  if (n <= 0) return((BN_ULONG)0);
228 
229  for (;;)
230  {
231  t1=a[0]; t2=b[0];
232  r[0]=(t1-t2-c)&BN_MASK2;
233  if (t1 != t2) c=(t1 < t2);
234  if (--n <= 0) break;
235 
236  t1=a[1]; t2=b[1];
237  r[1]=(t1-t2-c)&BN_MASK2;
238  if (t1 != t2) c=(t1 < t2);
239  if (--n <= 0) break;
240 
241  t1=a[2]; t2=b[2];
242  r[2]=(t1-t2-c)&BN_MASK2;
243  if (t1 != t2) c=(t1 < t2);
244  if (--n <= 0) break;
245 
246  t1=a[3]; t2=b[3];
247  r[3]=(t1-t2-c)&BN_MASK2;
248  if (t1 != t2) c=(t1 < t2);
249  if (--n <= 0) break;
250 
251  a+=4;
252  b+=4;
253  r+=4;
254  }
255  return(c);
256  }
257 #endif
258 
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) { \
	BN_ULONG ta=(a),tb=(b); \
	t1 = ta * tb; \
	t2 = BN_UMULT_HIGH(ta,tb); \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define mul_add_c2(a,b,c0,c1,c2) { \
	BN_ULONG ta=(a),tb=(b),t0; \
	t1 = BN_UMULT_HIGH(ta,tb); \
	t0 = ta * tb; \
	t2 = t1+t1; c2 += (t2<t1)?1:0; \
	t1 = t0+t0; t2 += (t1<t0)?1:0; \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}
#else
/* Inline-asm versions.  All three macros rely on caller-declared
 * scratch words t1,t2 (see the comba functions below).  The pattern is:
 * mulq into (t2:t1), then ripple the product into the three-word
 * accumulator with add/adc pairs. */
#define mul_add_c(a,b,c0,c1,c2) do { \
	asm ("mulq %3" \
		: "=a"(t1),"=d"(t2) \
		: "a"(a),"m"(b) \
		: "cc"); \
	asm ("addq %2,%0; adcq %3,%1" \
		: "+r"(c0),"+d"(t2) \
		: "a"(t1),"g"(0) \
		: "cc"); \
	asm ("addq %2,%0; adcq %3,%1" \
		: "+r"(c1),"+r"(c2) \
		: "d"(t2),"g"(0) \
		: "cc"); \
	} while (0)

#define sqr_add_c(a,i,c0,c1,c2) do { \
	asm ("mulq %2" \
		: "=a"(t1),"=d"(t2) \
		: "a"(a[i]) \
		: "cc"); \
	asm ("addq %2,%0; adcq %3,%1" \
		: "+r"(c0),"+d"(t2) \
		: "a"(t1),"g"(0) \
		: "cc"); \
	asm ("addq %2,%0; adcq %3,%1" \
		: "+r"(c1),"+r"(c2) \
		: "d"(t2),"g"(0) \
		: "cc"); \
	} while (0)

/* As mul_add_c, but the product is doubled first: the two extra
 * addq %0,%0 steps shift (t2:t1) left by one, carrying into c2/t2. */
#define mul_add_c2(a,b,c0,c1,c2) do { \
	asm ("mulq %3" \
		: "=a"(t1),"=d"(t2) \
		: "a"(a),"m"(b) \
		: "cc"); \
	asm ("addq %0,%0; adcq %2,%1" \
		: "+d"(t2),"+r"(c2) \
		: "g"(0) \
		: "cc"); \
	asm ("addq %0,%0; adcq %2,%1" \
		: "+a"(t1),"+d"(t2) \
		: "g"(0) \
		: "cc"); \
	asm ("addq %2,%0; adcq %3,%1" \
		: "+r"(c0),"+d"(t2) \
		: "a"(t1),"g"(0) \
		: "cc"); \
	asm ("addq %2,%0; adcq %3,%1" \
		: "+r"(c1),"+r"(c2) \
		: "d"(t2),"g"(0) \
		: "cc"); \
	} while (0)
#endif

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
340 
342  {
343  BN_ULONG t1,t2;
344  BN_ULONG c1,c2,c3;
345 
346  c1=0;
347  c2=0;
348  c3=0;
349  mul_add_c(a[0],b[0],c1,c2,c3);
350  r[0]=c1;
351  c1=0;
352  mul_add_c(a[0],b[1],c2,c3,c1);
353  mul_add_c(a[1],b[0],c2,c3,c1);
354  r[1]=c2;
355  c2=0;
356  mul_add_c(a[2],b[0],c3,c1,c2);
357  mul_add_c(a[1],b[1],c3,c1,c2);
358  mul_add_c(a[0],b[2],c3,c1,c2);
359  r[2]=c3;
360  c3=0;
361  mul_add_c(a[0],b[3],c1,c2,c3);
362  mul_add_c(a[1],b[2],c1,c2,c3);
363  mul_add_c(a[2],b[1],c1,c2,c3);
364  mul_add_c(a[3],b[0],c1,c2,c3);
365  r[3]=c1;
366  c1=0;
367  mul_add_c(a[4],b[0],c2,c3,c1);
368  mul_add_c(a[3],b[1],c2,c3,c1);
369  mul_add_c(a[2],b[2],c2,c3,c1);
370  mul_add_c(a[1],b[3],c2,c3,c1);
371  mul_add_c(a[0],b[4],c2,c3,c1);
372  r[4]=c2;
373  c2=0;
374  mul_add_c(a[0],b[5],c3,c1,c2);
375  mul_add_c(a[1],b[4],c3,c1,c2);
376  mul_add_c(a[2],b[3],c3,c1,c2);
377  mul_add_c(a[3],b[2],c3,c1,c2);
378  mul_add_c(a[4],b[1],c3,c1,c2);
379  mul_add_c(a[5],b[0],c3,c1,c2);
380  r[5]=c3;
381  c3=0;
382  mul_add_c(a[6],b[0],c1,c2,c3);
383  mul_add_c(a[5],b[1],c1,c2,c3);
384  mul_add_c(a[4],b[2],c1,c2,c3);
385  mul_add_c(a[3],b[3],c1,c2,c3);
386  mul_add_c(a[2],b[4],c1,c2,c3);
387  mul_add_c(a[1],b[5],c1,c2,c3);
388  mul_add_c(a[0],b[6],c1,c2,c3);
389  r[6]=c1;
390  c1=0;
391  mul_add_c(a[0],b[7],c2,c3,c1);
392  mul_add_c(a[1],b[6],c2,c3,c1);
393  mul_add_c(a[2],b[5],c2,c3,c1);
394  mul_add_c(a[3],b[4],c2,c3,c1);
395  mul_add_c(a[4],b[3],c2,c3,c1);
396  mul_add_c(a[5],b[2],c2,c3,c1);
397  mul_add_c(a[6],b[1],c2,c3,c1);
398  mul_add_c(a[7],b[0],c2,c3,c1);
399  r[7]=c2;
400  c2=0;
401  mul_add_c(a[7],b[1],c3,c1,c2);
402  mul_add_c(a[6],b[2],c3,c1,c2);
403  mul_add_c(a[5],b[3],c3,c1,c2);
404  mul_add_c(a[4],b[4],c3,c1,c2);
405  mul_add_c(a[3],b[5],c3,c1,c2);
406  mul_add_c(a[2],b[6],c3,c1,c2);
407  mul_add_c(a[1],b[7],c3,c1,c2);
408  r[8]=c3;
409  c3=0;
410  mul_add_c(a[2],b[7],c1,c2,c3);
411  mul_add_c(a[3],b[6],c1,c2,c3);
412  mul_add_c(a[4],b[5],c1,c2,c3);
413  mul_add_c(a[5],b[4],c1,c2,c3);
414  mul_add_c(a[6],b[3],c1,c2,c3);
415  mul_add_c(a[7],b[2],c1,c2,c3);
416  r[9]=c1;
417  c1=0;
418  mul_add_c(a[7],b[3],c2,c3,c1);
419  mul_add_c(a[6],b[4],c2,c3,c1);
420  mul_add_c(a[5],b[5],c2,c3,c1);
421  mul_add_c(a[4],b[6],c2,c3,c1);
422  mul_add_c(a[3],b[7],c2,c3,c1);
423  r[10]=c2;
424  c2=0;
425  mul_add_c(a[4],b[7],c3,c1,c2);
426  mul_add_c(a[5],b[6],c3,c1,c2);
427  mul_add_c(a[6],b[5],c3,c1,c2);
428  mul_add_c(a[7],b[4],c3,c1,c2);
429  r[11]=c3;
430  c3=0;
431  mul_add_c(a[7],b[5],c1,c2,c3);
432  mul_add_c(a[6],b[6],c1,c2,c3);
433  mul_add_c(a[5],b[7],c1,c2,c3);
434  r[12]=c1;
435  c1=0;
436  mul_add_c(a[6],b[7],c2,c3,c1);
437  mul_add_c(a[7],b[6],c2,c3,c1);
438  r[13]=c2;
439  c2=0;
440  mul_add_c(a[7],b[7],c3,c1,c2);
441  r[14]=c3;
442  r[15]=c1;
443  }
444 
446  {
447  BN_ULONG t1,t2;
448  BN_ULONG c1,c2,c3;
449 
450  c1=0;
451  c2=0;
452  c3=0;
453  mul_add_c(a[0],b[0],c1,c2,c3);
454  r[0]=c1;
455  c1=0;
456  mul_add_c(a[0],b[1],c2,c3,c1);
457  mul_add_c(a[1],b[0],c2,c3,c1);
458  r[1]=c2;
459  c2=0;
460  mul_add_c(a[2],b[0],c3,c1,c2);
461  mul_add_c(a[1],b[1],c3,c1,c2);
462  mul_add_c(a[0],b[2],c3,c1,c2);
463  r[2]=c3;
464  c3=0;
465  mul_add_c(a[0],b[3],c1,c2,c3);
466  mul_add_c(a[1],b[2],c1,c2,c3);
467  mul_add_c(a[2],b[1],c1,c2,c3);
468  mul_add_c(a[3],b[0],c1,c2,c3);
469  r[3]=c1;
470  c1=0;
471  mul_add_c(a[3],b[1],c2,c3,c1);
472  mul_add_c(a[2],b[2],c2,c3,c1);
473  mul_add_c(a[1],b[3],c2,c3,c1);
474  r[4]=c2;
475  c2=0;
476  mul_add_c(a[2],b[3],c3,c1,c2);
477  mul_add_c(a[3],b[2],c3,c1,c2);
478  r[5]=c3;
479  c3=0;
480  mul_add_c(a[3],b[3],c1,c2,c3);
481  r[6]=c1;
482  r[7]=c2;
483  }
484 
486  {
487  BN_ULONG t1,t2;
488  BN_ULONG c1,c2,c3;
489 
490  c1=0;
491  c2=0;
492  c3=0;
493  sqr_add_c(a,0,c1,c2,c3);
494  r[0]=c1;
495  c1=0;
496  sqr_add_c2(a,1,0,c2,c3,c1);
497  r[1]=c2;
498  c2=0;
499  sqr_add_c(a,1,c3,c1,c2);
500  sqr_add_c2(a,2,0,c3,c1,c2);
501  r[2]=c3;
502  c3=0;
503  sqr_add_c2(a,3,0,c1,c2,c3);
504  sqr_add_c2(a,2,1,c1,c2,c3);
505  r[3]=c1;
506  c1=0;
507  sqr_add_c(a,2,c2,c3,c1);
508  sqr_add_c2(a,3,1,c2,c3,c1);
509  sqr_add_c2(a,4,0,c2,c3,c1);
510  r[4]=c2;
511  c2=0;
512  sqr_add_c2(a,5,0,c3,c1,c2);
513  sqr_add_c2(a,4,1,c3,c1,c2);
514  sqr_add_c2(a,3,2,c3,c1,c2);
515  r[5]=c3;
516  c3=0;
517  sqr_add_c(a,3,c1,c2,c3);
518  sqr_add_c2(a,4,2,c1,c2,c3);
519  sqr_add_c2(a,5,1,c1,c2,c3);
520  sqr_add_c2(a,6,0,c1,c2,c3);
521  r[6]=c1;
522  c1=0;
523  sqr_add_c2(a,7,0,c2,c3,c1);
524  sqr_add_c2(a,6,1,c2,c3,c1);
525  sqr_add_c2(a,5,2,c2,c3,c1);
526  sqr_add_c2(a,4,3,c2,c3,c1);
527  r[7]=c2;
528  c2=0;
529  sqr_add_c(a,4,c3,c1,c2);
530  sqr_add_c2(a,5,3,c3,c1,c2);
531  sqr_add_c2(a,6,2,c3,c1,c2);
532  sqr_add_c2(a,7,1,c3,c1,c2);
533  r[8]=c3;
534  c3=0;
535  sqr_add_c2(a,7,2,c1,c2,c3);
536  sqr_add_c2(a,6,3,c1,c2,c3);
537  sqr_add_c2(a,5,4,c1,c2,c3);
538  r[9]=c1;
539  c1=0;
540  sqr_add_c(a,5,c2,c3,c1);
541  sqr_add_c2(a,6,4,c2,c3,c1);
542  sqr_add_c2(a,7,3,c2,c3,c1);
543  r[10]=c2;
544  c2=0;
545  sqr_add_c2(a,7,4,c3,c1,c2);
546  sqr_add_c2(a,6,5,c3,c1,c2);
547  r[11]=c3;
548  c3=0;
549  sqr_add_c(a,6,c1,c2,c3);
550  sqr_add_c2(a,7,5,c1,c2,c3);
551  r[12]=c1;
552  c1=0;
553  sqr_add_c2(a,7,6,c2,c3,c1);
554  r[13]=c2;
555  c2=0;
556  sqr_add_c(a,7,c3,c1,c2);
557  r[14]=c3;
558  r[15]=c1;
559  }
560 
562  {
563  BN_ULONG t1,t2;
564  BN_ULONG c1,c2,c3;
565 
566  c1=0;
567  c2=0;
568  c3=0;
569  sqr_add_c(a,0,c1,c2,c3);
570  r[0]=c1;
571  c1=0;
572  sqr_add_c2(a,1,0,c2,c3,c1);
573  r[1]=c2;
574  c2=0;
575  sqr_add_c(a,1,c3,c1,c2);
576  sqr_add_c2(a,2,0,c3,c1,c2);
577  r[2]=c3;
578  c3=0;
579  sqr_add_c2(a,3,0,c1,c2,c3);
580  sqr_add_c2(a,2,1,c1,c2,c3);
581  r[3]=c1;
582  c1=0;
583  sqr_add_c(a,2,c2,c3,c1);
584  sqr_add_c2(a,3,1,c2,c3,c1);
585  r[4]=c2;
586  c2=0;
587  sqr_add_c2(a,3,2,c3,c1,c2);
588  r[5]=c3;
589  c3=0;
590  sqr_add_c(a,3,c1,c2,c3);
591  r[6]=c1;
592  r[7]=c2;
593  }