OpenSSL  1.0.1c
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Macros
ecp_nistp521.c
Go to the documentation of this file.
1 /* crypto/ec/ecp_nistp521.c */
2 /*
3  * Written by Adam Langley (Google) for the OpenSSL project
4  */
5 /* Copyright 2011 Google Inc.
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  *
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  * http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20 
21 /*
22  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23  *
24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26  * work which got its smarts from Daniel J. Bernstein's work on the same.
27  */
28 
29 #include <openssl/opensslconf.h>
30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31 
32 #ifndef OPENSSL_SYS_VMS
33 #include <stdint.h>
34 #else
35 #include <inttypes.h>
36 #endif
37 
38 #include <string.h>
39 #include <openssl/err.h>
40 #include "ec_lcl.h"
41 
42 #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43  /* even with gcc, the typedef won't work for 32-bit platforms */
44  typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
45 #else
46  #error "Need GCC 3.1 or later to define type uint128_t"
47 #endif
48 
49 typedef uint8_t u8;
50 typedef uint64_t u64;
51 typedef int64_t s64;
52 
53 /* The underlying field.
54  *
55  * P521 operates over GF(2^521-1). We can serialise an element of this field
56  * into 66 bytes where the most significant byte contains only a single bit. We
57  * call this an felem_bytearray. */
58 
59 typedef u8 felem_bytearray[66];
60 
/* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
 * These values are big-endian. The five entries are, in order:
 *   p (the field prime, 2^521 - 1), a (= -3 mod p), b, and the base-point
 *   affine coordinates x and y. Each is a 66-byte big-endian value; only a
 *   single bit of the most significant byte may be set (see felem_bytearray). */
static const felem_bytearray nistp521_curve_params[5] =
    {
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* p */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff},
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* a = -3 */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xfc},
    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c,  /* b */
     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
     0x3f, 0x00},
    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04,  /* x */
     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
     0xbd, 0x66},
    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b,  /* y */
     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
     0x66, 0x50}
    };
111 
112 /* The representation of field elements.
113  * ------------------------------------
114  *
115  * We represent field elements with nine values. These values are either 64 or
116  * 128 bits and the field element represented is:
117  * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p)
118  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
119  * 58 bits apart, but are greater than 58 bits in length, the most significant
120  * bits of each limb overlap with the least significant bits of the next.
121  *
122  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
123  * 'largefelem' */
124 
125 #define NLIMBS 9
126 
127 typedef uint64_t limb;
128 typedef limb felem[NLIMBS];
129 typedef uint128_t largefelem[NLIMBS];
130 
131 static const limb bottom57bits = 0x1ffffffffffffff;
132 static const limb bottom58bits = 0x3ffffffffffffff;
133 
134 /* bin66_to_felem takes a little-endian byte array and converts it into felem
135  * form. This assumes that the CPU is little-endian. */
136 static void bin66_to_felem(felem out, const u8 in[66])
137  {
138  out[0] = (*((limb*) &in[0])) & bottom58bits;
139  out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits;
140  out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits;
141  out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits;
142  out[4] = (*((limb*) &in[29])) & bottom58bits;
143  out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits;
144  out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits;
145  out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits;
146  out[8] = (*((limb*) &in[58])) & bottom57bits;
147  }
148 
149 /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
150  * array. This assumes that the CPU is little-endian. */
151 static void felem_to_bin66(u8 out[66], const felem in)
152  {
153  memset(out, 0, 66);
154  (*((limb*) &out[0])) = in[0];
155  (*((limb*) &out[7])) |= in[1] << 2;
156  (*((limb*) &out[14])) |= in[2] << 4;
157  (*((limb*) &out[21])) |= in[3] << 6;
158  (*((limb*) &out[29])) = in[4];
159  (*((limb*) &out[36])) |= in[5] << 2;
160  (*((limb*) &out[43])) |= in[6] << 4;
161  (*((limb*) &out[50])) |= in[7] << 6;
162  (*((limb*) &out[58])) = in[8];
163  }
164 
165 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
166 static void flip_endian(u8 *out, const u8 *in, unsigned len)
167  {
168  unsigned i;
169  for (i = 0; i < len; ++i)
170  out[i] = in[len-1-i];
171  }
172 
/* BN_to_felem converts an OpenSSL BIGNUM into an felem.
 * Returns 1 on success; 0 if |bn| is negative or does not fit in 66 bytes. */
static int BN_to_felem(felem out, const BIGNUM *bn)
    {
    felem_bytearray b_in;
    felem_bytearray b_out;
    unsigned num_bytes;

    /* BN_bn2bin eats leading zeroes, so zero the buffer first and then
     * fill only the low-order bytes. */
    memset(b_out, 0, sizeof b_out);
    num_bytes = BN_num_bytes(bn);
    if (num_bytes > sizeof b_out)
    {
    /* NOTE(review): upstream OpenSSL raises EC_R_BIGNUM_OUT_OF_RANGE via
     * ECerr() before this return; the call appears to have been lost in
     * this copy — confirm against the original file. */
    return 0;
    }
    if (BN_is_negative(bn))
    {
    /* NOTE(review): upstream likewise reports an error via ECerr() here;
     * confirm against the original file. */
    return 0;
    }
    num_bytes = BN_bn2bin(bn, b_in);        /* big-endian, no leading zeros */
    flip_endian(b_out, b_in, num_bytes);    /* convert to little-endian */
    bin66_to_felem(out, b_out);
    return 1;
    }
198 
/* felem_to_BN converts an felem into an OpenSSL BIGNUM.
 * Returns the result of BN_bin2bn: |out| on success, NULL on allocation
 * failure. The felem is first serialised little-endian, then byte-reversed
 * because BN_bin2bn expects big-endian input. */
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
    {
    felem_bytearray b_in, b_out;
    felem_to_bin66(b_in, in);
    flip_endian(b_out, b_in, sizeof b_out);
    return BN_bin2bn(b_out, sizeof b_out, out);
    }
207 
208 
209 /* Field operations
210  * ---------------- */
211 
212 static void felem_one(felem out)
213  {
214  out[0] = 1;
215  out[1] = 0;
216  out[2] = 0;
217  out[3] = 0;
218  out[4] = 0;
219  out[5] = 0;
220  out[6] = 0;
221  out[7] = 0;
222  out[8] = 0;
223  }
224 
225 static void felem_assign(felem out, const felem in)
226  {
227  out[0] = in[0];
228  out[1] = in[1];
229  out[2] = in[2];
230  out[3] = in[3];
231  out[4] = in[4];
232  out[5] = in[5];
233  out[6] = in[6];
234  out[7] = in[7];
235  out[8] = in[8];
236  }
237 
238 /* felem_sum64 sets out = out + in. */
239 static void felem_sum64(felem out, const felem in)
240  {
241  out[0] += in[0];
242  out[1] += in[1];
243  out[2] += in[2];
244  out[3] += in[3];
245  out[4] += in[4];
246  out[5] += in[5];
247  out[6] += in[6];
248  out[7] += in[7];
249  out[8] += in[8];
250  }
251 
252 /* felem_scalar sets out = in * scalar */
253 static void felem_scalar(felem out, const felem in, limb scalar)
254  {
255  out[0] = in[0] * scalar;
256  out[1] = in[1] * scalar;
257  out[2] = in[2] * scalar;
258  out[3] = in[3] * scalar;
259  out[4] = in[4] * scalar;
260  out[5] = in[5] * scalar;
261  out[6] = in[6] * scalar;
262  out[7] = in[7] * scalar;
263  out[8] = in[8] * scalar;
264  }
265 
266 /* felem_scalar64 sets out = out * scalar */
267 static void felem_scalar64(felem out, limb scalar)
268  {
269  out[0] *= scalar;
270  out[1] *= scalar;
271  out[2] *= scalar;
272  out[3] *= scalar;
273  out[4] *= scalar;
274  out[5] *= scalar;
275  out[6] *= scalar;
276  out[7] *= scalar;
277  out[8] *= scalar;
278  }
279 
280 /* felem_scalar128 sets out = out * scalar */
281 static void felem_scalar128(largefelem out, limb scalar)
282  {
283  out[0] *= scalar;
284  out[1] *= scalar;
285  out[2] *= scalar;
286  out[3] *= scalar;
287  out[4] *= scalar;
288  out[5] *= scalar;
289  out[6] *= scalar;
290  out[7] *= scalar;
291  out[8] *= scalar;
292  }
293 
/* felem_neg sets |out| to |-in|
 * On entry:
 *   in[i] < 2^59 + 2^14
 * On exit:
 *   out[i] < 2^62
 */
static void felem_neg(felem out, const felem in)
    {
    /* In order to prevent underflow, we subtract from 0 mod p.  The limbs
     * below, taken together, are congruent to 0 mod p (a multiple of
     * p = 2^521 - 1 with a carry moved between the top and bottom limbs),
     * and each is larger than the entry bound on in[i], so no per-limb
     * subtraction can wrap. */
    static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
    static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);

    out[0] = two62m3 - in[0];
    out[1] = two62m2 - in[1];
    out[2] = two62m2 - in[2];
    out[3] = two62m2 - in[3];
    out[4] = two62m2 - in[4];
    out[5] = two62m2 - in[5];
    out[6] = two62m2 - in[6];
    out[7] = two62m2 - in[7];
    out[8] = two62m2 - in[8];
    }
316 
/* felem_diff64 subtracts |in| from |out|
 * On entry:
 *   in[i] < 2^59 + 2^14
 * On exit:
 *   out[i] < out[i] + 2^62
 */
static void felem_diff64(felem out, const felem in)
    {
    /* In order to prevent underflow, we add 0 mod p before subtracting:
     * the constants are the same multiple-of-p limb encoding used in
     * felem_neg, and each exceeds the entry bound on in[i]. */
    static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
    static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);

    out[0] += two62m3 - in[0];
    out[1] += two62m2 - in[1];
    out[2] += two62m2 - in[2];
    out[3] += two62m2 - in[3];
    out[4] += two62m2 - in[4];
    out[5] += two62m2 - in[5];
    out[6] += two62m2 - in[6];
    out[7] += two62m2 - in[7];
    out[8] += two62m2 - in[8];
    }
339 
340 /* felem_diff_128_64 subtracts |in| from |out|
341  * On entry:
342  * in[i] < 2^62 + 2^17
343  * On exit:
344  * out[i] < out[i] + 2^63
345  */
346 static void felem_diff_128_64(largefelem out, const felem in)
347  {
348  /* In order to prevent underflow, we add 0 mod p before subtracting. */
349  static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5);
350  static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4);
351 
352  out[0] += two63m6 - in[0];
353  out[1] += two63m5 - in[1];
354  out[2] += two63m5 - in[2];
355  out[3] += two63m5 - in[3];
356  out[4] += two63m5 - in[4];
357  out[5] += two63m5 - in[5];
358  out[6] += two63m5 - in[6];
359  out[7] += two63m5 - in[7];
360  out[8] += two63m5 - in[8];
361  }
362 
/* felem_diff128 subtracts |in| from |out| (both with 128-bit limbs)
 * On entry:
 *   in[i] < 2^126
 * On exit:
 *   out[i] < out[i] + 2^127 - 2^69
 */
static void felem_diff128(largefelem out, const largefelem in)
    {
    /* In order to prevent underflow, we add 0 mod p before subtracting;
     * each constant exceeds the entry bound on in[i]. */
    static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
    static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);

    out[0] += (two127m70 - in[0]);
    out[1] += (two127m69 - in[1]);
    out[2] += (two127m69 - in[2]);
    out[3] += (two127m69 - in[3]);
    out[4] += (two127m69 - in[4]);
    out[5] += (two127m69 - in[5]);
    out[6] += (two127m69 - in[6]);
    out[7] += (two127m69 - in[7]);
    out[8] += (two127m69 - in[8]);
    }
385 
/* felem_square sets |out| = |in|^2
 * On entry:
 *   in[i] < 2^62
 * On exit:
 *   out[i] < 17 * max(in[i]) * max(in[i])
 */
static void felem_square(largefelem out, const felem in)
    {
    felem inx2, inx4;
    felem_scalar(inx2, in, 2);
    felem_scalar(inx4, in, 4);

    /* We have many cases were we want to do
     *   in[x] * in[y] +
     *   in[y] * in[x]
     * This is obviously just
     *   2 * in[x] * in[y]
     * However, rather than do the doubling on the 128 bit result, we
     * double one of the inputs to the multiplication by reading from
     * |inx2| */

    out[0] = ((uint128_t) in[0]) * in[0];
    out[1] = ((uint128_t) in[0]) * inx2[1];
    out[2] = ((uint128_t) in[0]) * inx2[2] +
             ((uint128_t) in[1]) * in[1];
    out[3] = ((uint128_t) in[0]) * inx2[3] +
             ((uint128_t) in[1]) * inx2[2];
    out[4] = ((uint128_t) in[0]) * inx2[4] +
             ((uint128_t) in[1]) * inx2[3] +
             ((uint128_t) in[2]) * in[2];
    out[5] = ((uint128_t) in[0]) * inx2[5] +
             ((uint128_t) in[1]) * inx2[4] +
             ((uint128_t) in[2]) * inx2[3];
    out[6] = ((uint128_t) in[0]) * inx2[6] +
             ((uint128_t) in[1]) * inx2[5] +
             ((uint128_t) in[2]) * inx2[4] +
             ((uint128_t) in[3]) * in[3];
    out[7] = ((uint128_t) in[0]) * inx2[7] +
             ((uint128_t) in[1]) * inx2[6] +
             ((uint128_t) in[2]) * inx2[5] +
             ((uint128_t) in[3]) * inx2[4];
    out[8] = ((uint128_t) in[0]) * inx2[8] +
             ((uint128_t) in[1]) * inx2[7] +
             ((uint128_t) in[2]) * inx2[6] +
             ((uint128_t) in[3]) * inx2[5] +
             ((uint128_t) in[4]) * in[4];

    /* The remaining limbs fall above 2^521, with the first falling at
     * 2^522. They correspond to locations one bit up from the limbs
     * produced above so we would have to multiply by two to align them.
     * Again, rather than operate on the 128-bit result, we double one of
     * the inputs to the multiplication. If we want to double for both this
     * reason, and the reason above, then we end up multiplying by four.
     *
     * The comment before each statement below is the index of the
     * pre-reduction limb (coefficient of 2^(58*N)) being folded back in. */

    /* 9 */
    out[0] += ((uint128_t) in[1]) * inx4[8] +
              ((uint128_t) in[2]) * inx4[7] +
              ((uint128_t) in[3]) * inx4[6] +
              ((uint128_t) in[4]) * inx4[5];

    /* 10 */
    out[1] += ((uint128_t) in[2]) * inx4[8] +
              ((uint128_t) in[3]) * inx4[7] +
              ((uint128_t) in[4]) * inx4[6] +
              ((uint128_t) in[5]) * inx2[5];

    /* 11 */
    out[2] += ((uint128_t) in[3]) * inx4[8] +
              ((uint128_t) in[4]) * inx4[7] +
              ((uint128_t) in[5]) * inx4[6];

    /* 12 */
    out[3] += ((uint128_t) in[4]) * inx4[8] +
              ((uint128_t) in[5]) * inx4[7] +
              ((uint128_t) in[6]) * inx2[6];

    /* 13 */
    out[4] += ((uint128_t) in[5]) * inx4[8] +
              ((uint128_t) in[6]) * inx4[7];

    /* 14 */
    out[5] += ((uint128_t) in[6]) * inx4[8] +
              ((uint128_t) in[7]) * inx2[7];

    /* 15 */
    out[6] += ((uint128_t) in[7]) * inx4[8];

    /* 16 */
    out[7] += ((uint128_t) in[8]) * inx2[8];
    }
476 
/* felem_mul sets |out| = |in1| * |in2|
 * On entry:
 *   in1[i] < 2^64
 *   in2[i] < 2^63
 * On exit:
 *   out[i] < 17 * max(in1[i]) * max(in2[i])
 *
 * The first group of statements is the schoolbook product for limbs 0..8;
 * the second group folds the pre-reduction limbs 9..16 (which fall above
 * 2^521) back into limbs 0..7. */
static void felem_mul(largefelem out, const felem in1, const felem in2)
    {
    felem in2x2;
    felem_scalar(in2x2, in2, 2);

    out[0] = ((uint128_t) in1[0]) * in2[0];

    out[1] = ((uint128_t) in1[0]) * in2[1] +
             ((uint128_t) in1[1]) * in2[0];

    out[2] = ((uint128_t) in1[0]) * in2[2] +
             ((uint128_t) in1[1]) * in2[1] +
             ((uint128_t) in1[2]) * in2[0];

    out[3] = ((uint128_t) in1[0]) * in2[3] +
             ((uint128_t) in1[1]) * in2[2] +
             ((uint128_t) in1[2]) * in2[1] +
             ((uint128_t) in1[3]) * in2[0];

    out[4] = ((uint128_t) in1[0]) * in2[4] +
             ((uint128_t) in1[1]) * in2[3] +
             ((uint128_t) in1[2]) * in2[2] +
             ((uint128_t) in1[3]) * in2[1] +
             ((uint128_t) in1[4]) * in2[0];

    out[5] = ((uint128_t) in1[0]) * in2[5] +
             ((uint128_t) in1[1]) * in2[4] +
             ((uint128_t) in1[2]) * in2[3] +
             ((uint128_t) in1[3]) * in2[2] +
             ((uint128_t) in1[4]) * in2[1] +
             ((uint128_t) in1[5]) * in2[0];

    out[6] = ((uint128_t) in1[0]) * in2[6] +
             ((uint128_t) in1[1]) * in2[5] +
             ((uint128_t) in1[2]) * in2[4] +
             ((uint128_t) in1[3]) * in2[3] +
             ((uint128_t) in1[4]) * in2[2] +
             ((uint128_t) in1[5]) * in2[1] +
             ((uint128_t) in1[6]) * in2[0];

    out[7] = ((uint128_t) in1[0]) * in2[7] +
             ((uint128_t) in1[1]) * in2[6] +
             ((uint128_t) in1[2]) * in2[5] +
             ((uint128_t) in1[3]) * in2[4] +
             ((uint128_t) in1[4]) * in2[3] +
             ((uint128_t) in1[5]) * in2[2] +
             ((uint128_t) in1[6]) * in2[1] +
             ((uint128_t) in1[7]) * in2[0];

    out[8] = ((uint128_t) in1[0]) * in2[8] +
             ((uint128_t) in1[1]) * in2[7] +
             ((uint128_t) in1[2]) * in2[6] +
             ((uint128_t) in1[3]) * in2[5] +
             ((uint128_t) in1[4]) * in2[4] +
             ((uint128_t) in1[5]) * in2[3] +
             ((uint128_t) in1[6]) * in2[2] +
             ((uint128_t) in1[7]) * in2[1] +
             ((uint128_t) in1[8]) * in2[0];

    /* See comment in felem_square about the use of in2x2 here */

    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
              ((uint128_t) in1[2]) * in2x2[7] +
              ((uint128_t) in1[3]) * in2x2[6] +
              ((uint128_t) in1[4]) * in2x2[5] +
              ((uint128_t) in1[5]) * in2x2[4] +
              ((uint128_t) in1[6]) * in2x2[3] +
              ((uint128_t) in1[7]) * in2x2[2] +
              ((uint128_t) in1[8]) * in2x2[1];

    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
              ((uint128_t) in1[3]) * in2x2[7] +
              ((uint128_t) in1[4]) * in2x2[6] +
              ((uint128_t) in1[5]) * in2x2[5] +
              ((uint128_t) in1[6]) * in2x2[4] +
              ((uint128_t) in1[7]) * in2x2[3] +
              ((uint128_t) in1[8]) * in2x2[2];

    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
              ((uint128_t) in1[4]) * in2x2[7] +
              ((uint128_t) in1[5]) * in2x2[6] +
              ((uint128_t) in1[6]) * in2x2[5] +
              ((uint128_t) in1[7]) * in2x2[4] +
              ((uint128_t) in1[8]) * in2x2[3];

    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
              ((uint128_t) in1[5]) * in2x2[7] +
              ((uint128_t) in1[6]) * in2x2[6] +
              ((uint128_t) in1[7]) * in2x2[5] +
              ((uint128_t) in1[8]) * in2x2[4];

    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
              ((uint128_t) in1[6]) * in2x2[7] +
              ((uint128_t) in1[7]) * in2x2[6] +
              ((uint128_t) in1[8]) * in2x2[5];

    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
              ((uint128_t) in1[7]) * in2x2[7] +
              ((uint128_t) in1[8]) * in2x2[6];

    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
              ((uint128_t) in1[8]) * in2x2[7];

    out[7] += ((uint128_t) in1[8]) * in2x2[8];
    }
589 
/* Mask for the low 52 bits of a limb; a 128-bit product limb carries
 * 58 (payload) + 64 + 6 significant bits, so the upper word splits at 52. */
static const limb bottom52bits = 0xfffffffffffff;

/* felem_reduce converts a largefelem to an felem.
 * On entry:
 *   in[i] < 2^128
 * On exit:
 *   out[i] < 2^59 + 2^14
 *
 * Each 128-bit input limb is split into its bottom 58 bits (kept in place)
 * and two carry pieces that are propagated into the next one or two limbs.
 * Carries out of the top limb wrap around to limbs 0 and 1, doubled because
 * 2^521 ≡ 1 (mod p) puts the wrapped bits one position higher. */
static void felem_reduce(felem out, const largefelem in)
    {
    u64 overflow1, overflow2;

    out[0] = ((limb) in[0]) & bottom58bits;
    out[1] = ((limb) in[1]) & bottom58bits;
    out[2] = ((limb) in[2]) & bottom58bits;
    out[3] = ((limb) in[3]) & bottom58bits;
    out[4] = ((limb) in[4]) & bottom58bits;
    out[5] = ((limb) in[5]) & bottom58bits;
    out[6] = ((limb) in[6]) & bottom58bits;
    out[7] = ((limb) in[7]) & bottom58bits;
    out[8] = ((limb) in[8]) & bottom58bits;

    /* out[i] < 2^58 */

    out[1] += ((limb) in[0]) >> 58;
    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
    /* out[1] < 2^58 + 2^6 + 2^58
     *        = 2^59 + 2^6 */
    out[2] += ((limb) (in[0] >> 64)) >> 52;

    out[2] += ((limb) in[1]) >> 58;
    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
    out[3] += ((limb) (in[1] >> 64)) >> 52;

    out[3] += ((limb) in[2]) >> 58;
    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
    out[4] += ((limb) (in[2] >> 64)) >> 52;

    out[4] += ((limb) in[3]) >> 58;
    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
    out[5] += ((limb) (in[3] >> 64)) >> 52;

    out[5] += ((limb) in[4]) >> 58;
    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
    out[6] += ((limb) (in[4] >> 64)) >> 52;

    out[6] += ((limb) in[5]) >> 58;
    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
    out[7] += ((limb) (in[5] >> 64)) >> 52;

    out[7] += ((limb) in[6]) >> 58;
    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
    out[8] += ((limb) (in[6] >> 64)) >> 52;

    out[8] += ((limb) in[7]) >> 58;
    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
    /* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
     *            < 2^59 + 2^13 */
    overflow1 = ((limb) (in[7] >> 64)) >> 52;

    overflow1 += ((limb) in[8]) >> 58;
    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
    overflow2 = ((limb) (in[8] >> 64)) >> 52;

    /* The overflows sit at 2^522 and up; 2^522 ≡ 2 (mod p), hence the
     * doubling before folding them into limbs 0 and 1. */
    overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */
    overflow2 <<= 1; /* overflow2 < 2^13 */

    out[0] += overflow1; /* out[0] < 2^60 */
    out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */

    out[1] += out[0] >> 58; out[0] &= bottom58bits;
    /* out[0] < 2^58
     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
     *        < 2^59 + 2^14 */
    }
665 
666 static void felem_square_reduce(felem out, const felem in)
667  {
668  largefelem tmp;
669  felem_square(tmp, in);
670  felem_reduce(out, tmp);
671  }
672 
673 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
674  {
675  largefelem tmp;
676  felem_mul(tmp, in1, in2);
677  felem_reduce(out, tmp);
678  }
679 
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The addition chain below builds the exponent p - 2 = 2^521 - 3.
 * Each trailing comment gives the exponent held so far, e.g.
 * "2^4 - 2^0" means the value is in^(2^4 - 1). */
static void felem_inv(felem out, const felem in)
    {
    felem ftmp, ftmp2, ftmp3, ftmp4;
    largefelem tmp;
    unsigned i;

    felem_square(tmp, in); felem_reduce(ftmp, tmp);           /* 2^1 */
    felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);        /* 2^2 - 2^0 */
    felem_assign(ftmp2, ftmp);
    felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);         /* 2^3 - 2^1 */
    felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);        /* 2^3 - 2^0 */
    felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);         /* 2^4 - 2^1 */

    felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);       /* 2^3 - 2^1 */
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^4 - 2^2 */
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */

    felem_assign(ftmp2, ftmp3);
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^5 - 2^1 */
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^6 - 2^2 */
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^7 - 2^3 */
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^8 - 2^4 */
    felem_assign(ftmp4, ftmp3);
    felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp);    /* 2^8 - 2^1 */
    felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);       /* 2^9 - 2^2 */
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    /* Repeated square-and-multiply steps: square k times to shift the
     * exponent up by k bits, then multiply by the saved value to fill in
     * the low bits, doubling the length of the all-ones run each time. */
    for (i = 0; i < 8; i++)
    {
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^16 - 2^8 */
    }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 16; i++)
    {
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^32 - 2^16 */
    }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 32; i++)
    {
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^64 - 2^32 */
    }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 64; i++)
    {
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^128 - 2^64 */
    }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 128; i++)
    {
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^256 - 2^128 */
    }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 256; i++)
    {
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^512 - 2^256 */
    }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */

    for (i = 0; i < 9; i++)
    {
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);       /* 2^521 - 2^9 */
    }
    /* ftmp4 holds 2^9 - 2^2, completing the exponent (comments here
     * previously read "2^512" — a typo for 2^521). */
    felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp);   /* 2^521 - 2^2 */
    felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp);        /* 2^521 - 3 = p - 2 */
    }
763 
/* This is 2^521-1, expressed as an felem: eight limbs of 58 one-bits
 * followed by a top limb of 57 one-bits. */
static const felem kPrime =
    {
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
    };
771 
/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 * otherwise. Constant-time: no branches on the value of |in|.
 * On entry:
 *   in[i] < 2^59 + 2^14
 */
static limb felem_is_zero(const felem in)
    {
    felem ftmp;
    limb is_zero, is_p;
    felem_assign(ftmp, in);

    /* Carry-propagate so each limb fits its 58-bit (57 for the top) slot. */
    ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
    /* ftmp[8] < 2^57 */
    ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
    ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
    ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
    ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
    ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
    ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
    ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
    ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
    /* ftmp[8] < 2^57 + 4 */

    /* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
     * greater than our bound for ftmp[8]. Therefore we only have to check
     * whether the value is zero or 2^521-1. */

    is_zero = 0;
    is_zero |= ftmp[0];
    is_zero |= ftmp[1];
    is_zero |= ftmp[2];
    is_zero |= ftmp[3];
    is_zero |= ftmp[4];
    is_zero |= ftmp[5];
    is_zero |= ftmp[6];
    is_zero |= ftmp[7];
    is_zero |= ftmp[8];

    is_zero--;
    /* We know that ftmp[i] < 2^63, therefore the only way that the top bit
     * can be set is if is_zero was 0 before the decrement. */
    is_zero = ((s64) is_zero) >> 63;     /* all-ones iff every limb was 0 */

    /* Same trick for equality with kPrime: XOR is zero iff all limbs match. */
    is_p = ftmp[0] ^ kPrime[0];
    is_p |= ftmp[1] ^ kPrime[1];
    is_p |= ftmp[2] ^ kPrime[2];
    is_p |= ftmp[3] ^ kPrime[3];
    is_p |= ftmp[4] ^ kPrime[4];
    is_p |= ftmp[5] ^ kPrime[5];
    is_p |= ftmp[6] ^ kPrime[6];
    is_p |= ftmp[7] ^ kPrime[7];
    is_p |= ftmp[8] ^ kPrime[8];

    is_p--;
    is_p = ((s64) is_p) >> 63;

    is_zero |= is_p;
    return is_zero;
    }
831 
832 static int felem_is_zero_int(const felem in)
833  {
834  return (int) (felem_is_zero(in) & ((limb)1));
835  }
836 
837 /* felem_contract converts |in| to its unique, minimal representation.
838  * On entry:
839  * in[i] < 2^59 + 2^14
840  */
841 static void felem_contract(felem out, const felem in)
842  {
843  limb is_p, is_greater, sign;
844  static const limb two58 = ((limb)1) << 58;
845 
846  felem_assign(out, in);
847 
848  out[0] += out[8] >> 57; out[8] &= bottom57bits;
849  /* out[8] < 2^57 */
850  out[1] += out[0] >> 58; out[0] &= bottom58bits;
851  out[2] += out[1] >> 58; out[1] &= bottom58bits;
852  out[3] += out[2] >> 58; out[2] &= bottom58bits;
853  out[4] += out[3] >> 58; out[3] &= bottom58bits;
854  out[5] += out[4] >> 58; out[4] &= bottom58bits;
855  out[6] += out[5] >> 58; out[5] &= bottom58bits;
856  out[7] += out[6] >> 58; out[6] &= bottom58bits;
857  out[8] += out[7] >> 58; out[7] &= bottom58bits;
858  /* out[8] < 2^57 + 4 */
859 
860  /* If the value is greater than 2^521-1 then we have to subtract
861  * 2^521-1 out. See the comments in felem_is_zero regarding why we
862  * don't test for other multiples of the prime. */
863 
864  /* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */
865 
866  is_p = out[0] ^ kPrime[0];
867  is_p |= out[1] ^ kPrime[1];
868  is_p |= out[2] ^ kPrime[2];
869  is_p |= out[3] ^ kPrime[3];
870  is_p |= out[4] ^ kPrime[4];
871  is_p |= out[5] ^ kPrime[5];
872  is_p |= out[6] ^ kPrime[6];
873  is_p |= out[7] ^ kPrime[7];
874  is_p |= out[8] ^ kPrime[8];
875 
876  is_p--;
877  is_p &= is_p << 32;
878  is_p &= is_p << 16;
879  is_p &= is_p << 8;
880  is_p &= is_p << 4;
881  is_p &= is_p << 2;
882  is_p &= is_p << 1;
883  is_p = ((s64) is_p) >> 63;
884  is_p = ~is_p;
885 
886  /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
887 
888  out[0] &= is_p;
889  out[1] &= is_p;
890  out[2] &= is_p;
891  out[3] &= is_p;
892  out[4] &= is_p;
893  out[5] &= is_p;
894  out[6] &= is_p;
895  out[7] &= is_p;
896  out[8] &= is_p;
897 
898  /* In order to test that |out| >= 2^521-1 we need only test if out[8]
899  * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
900  is_greater = out[8] >> 57;
901  is_greater |= is_greater << 32;
902  is_greater |= is_greater << 16;
903  is_greater |= is_greater << 8;
904  is_greater |= is_greater << 4;
905  is_greater |= is_greater << 2;
906  is_greater |= is_greater << 1;
907  is_greater = ((s64) is_greater) >> 63;
908 
909  out[0] -= kPrime[0] & is_greater;
910  out[1] -= kPrime[1] & is_greater;
911  out[2] -= kPrime[2] & is_greater;
912  out[3] -= kPrime[3] & is_greater;
913  out[4] -= kPrime[4] & is_greater;
914  out[5] -= kPrime[5] & is_greater;
915  out[6] -= kPrime[6] & is_greater;
916  out[7] -= kPrime[7] & is_greater;
917  out[8] -= kPrime[8] & is_greater;
918 
919  /* Eliminate negative coefficients */
920  sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
921  sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
922  sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
923  sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
924  sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
925  sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
926  sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
927  sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
928  sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
929  sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
930  sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
931  }
932 
933 /* Group operations
934  * ----------------
935  *
936  * Building on top of the field operations we have the operations on the
937  * elliptic curve group itself. Points on the curve are represented in Jacobian
938  * coordinates */
939 
940 /* point_double calcuates 2*(x_in, y_in, z_in)
941  *
942  * The method is taken from:
943  * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
944  *
945  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
946  * while x_out == y_in is not (maybe this works, but it's not tested). */
static void
point_double(felem x_out, felem y_out, felem z_out,
	const felem x_in, const felem y_in, const felem z_in)
	{
	largefelem tmp, tmp2;
	felem delta, gamma, beta, alpha, ftmp, ftmp2;

	/* The bracketed bounds below track the maximum limb magnitudes to
	 * show that no intermediate overflows 2^128 before felem_reduce. */

	/* Working copies of x: ftmp becomes x - delta, ftmp2 becomes x + delta */
	felem_assign(ftmp, x_in);
	felem_assign(ftmp2, x_in);

	/* delta = z^2 */
	felem_square(tmp, z_in);
	felem_reduce(delta, tmp);  /* delta[i] < 2^59 + 2^14 */

	/* gamma = y^2 */
	felem_square(tmp, y_in);
	felem_reduce(gamma, tmp);  /* gamma[i] < 2^59 + 2^14 */

	/* beta = x*gamma */
	felem_mul(tmp, x_in, gamma);
	felem_reduce(beta, tmp);  /* beta[i] < 2^59 + 2^14 */

	/* alpha = 3*(x-delta)*(x+delta) */
	felem_diff64(ftmp, delta);
	/* ftmp[i] < 2^61 */
	felem_sum64(ftmp2, delta);
	/* ftmp2[i] < 2^60 + 2^15 */
	felem_scalar64(ftmp2, 3);
	/* ftmp2[i] < 3*2^60 + 3*2^15 */
	felem_mul(tmp, ftmp, ftmp2);
	/* tmp[i] < 17(3*2^121 + 3*2^76)
	 *        = 61*2^121 + 61*2^76
	 *        < 64*2^121 + 64*2^76
	 *        = 2^127 + 2^82
	 *        < 2^128 */
	felem_reduce(alpha, tmp);

	/* x' = alpha^2 - 8*beta */
	felem_square(tmp, alpha);
	/* tmp[i] < 17*2^120
	 *        < 2^125 */
	felem_assign(ftmp, beta);
	felem_scalar64(ftmp, 8);
	/* ftmp[i] < 2^62 + 2^17 */
	felem_diff_128_64(tmp, ftmp);
	/* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
	felem_reduce(x_out, tmp);

	/* z' = (y + z)^2 - gamma - delta */
	felem_sum64(delta, gamma);
	/* delta[i] < 2^60 + 2^15 */
	felem_assign(ftmp, y_in);
	felem_sum64(ftmp, z_in);
	/* ftmp[i] < 2^60 + 2^15 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 17(2^122)
	 *        < 2^127 */
	felem_diff_128_64(tmp, delta);
	/* tmp[i] < 2^127 + 2^63 */
	felem_reduce(z_out, tmp);

	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
	felem_scalar64(beta, 4);
	/* beta[i] < 2^61 + 2^16 */
	felem_diff64(beta, x_out);
	/* beta[i] < 2^61 + 2^60 + 2^16 */
	felem_mul(tmp, alpha, beta);
	/* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
	 *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
	 *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	 *        < 2^128 */
	felem_square(tmp2, gamma);
	/* tmp2[i] < 17*(2^59 + 2^14)^2
	 *         = 17*(2^118 + 2^74 + 2^28) */
	felem_scalar128(tmp2, 8);
	/* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
	 *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
	 *         < 2^126 */
	felem_diff128(tmp, tmp2);
	/* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	 *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
	 *          2^74 + 2^69 + 2^34 + 2^30
	 *        < 2^128 */
	felem_reduce(y_out, tmp);
	}
1032 
1033 /* copy_conditional copies in to out iff mask is all ones. */
1034 static void
1035 copy_conditional(felem out, const felem in, limb mask)
1036  {
1037  unsigned i;
1038  for (i = 0; i < NLIMBS; ++i)
1039  {
1040  const limb tmp = mask & (in[i] ^ out[i]);
1041  out[i] ^= tmp;
1042  }
1043  }
1044 
/* point_add calculates (x1, y1, z1) + (x2, y2, z2)
1046  *
1047  * The method is taken from
1048  * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1049  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1050  *
1051  * This function includes a branch for checking whether the two input points
1052  * are equal (while not equal to the point at infinity). This case never
1053  * happens during single point multiplication, so there is no timing leak for
1054  * ECDH or ECDSA signing. */
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const felem x2, const felem y2, const felem z2)
	{
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
	largefelem tmp, tmp2;
	limb x_equal, y_equal, z1_is_zero, z2_is_zero;

	z1_is_zero = felem_is_zero(z1);
	z2_is_zero = felem_is_zero(z2);

	/* ftmp = z1z1 = z1**2 */
	felem_square(tmp, z1);
	felem_reduce(ftmp, tmp);

	if (!mixed)
		{
		/* General (non-mixed) addition: z2 is arbitrary */

		/* ftmp2 = z2z2 = z2**2 */
		felem_square(tmp, z2);
		felem_reduce(ftmp2, tmp);

		/* u1 = ftmp3 = x1*z2z2 */
		felem_mul(tmp, x1, ftmp2);
		felem_reduce(ftmp3, tmp);

		/* ftmp5 = z1 + z2 */
		felem_assign(ftmp5, z1);
		felem_sum64(ftmp5, z2);
		/* ftmp5[i] < 2^61 */

		/* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
		felem_square(tmp, ftmp5);
		/* tmp[i] < 17*2^122 */
		felem_diff_128_64(tmp, ftmp);
		/* tmp[i] < 17*2^122 + 2^63 */
		felem_diff_128_64(tmp, ftmp2);
		/* tmp[i] < 17*2^122 + 2^64 */
		felem_reduce(ftmp5, tmp);

		/* ftmp2 = z2 * z2z2 */
		felem_mul(tmp, ftmp2, z2);
		felem_reduce(ftmp2, tmp);

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_mul(tmp, y1, ftmp2);
		felem_reduce(ftmp6, tmp);
		}
	else
		{
		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */

		/* u1 = ftmp3 = x1*z2z2 */
		felem_assign(ftmp3, x1);

		/* ftmp5 = 2*z1z2 */
		felem_scalar(ftmp5, z1, 2);

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_assign(ftmp6, y1);
		}

	/* u2 = x2*z1z1 */
	felem_mul(tmp, x2, ftmp);
	/* tmp[i] < 17*2^120 */

	/* h = ftmp4 = u2 - u1 */
	felem_diff_128_64(tmp, ftmp3);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp4, tmp);

	/* h == 0 iff the affine x coordinates coincide */
	x_equal = felem_is_zero(ftmp4);

	/* z_out = ftmp5 * h */
	felem_mul(tmp, ftmp5, ftmp4);
	felem_reduce(z_out, tmp);

	/* ftmp = z1 * z1z1 */
	felem_mul(tmp, ftmp, z1);
	felem_reduce(ftmp, tmp);

	/* s2 = tmp = y2 * z1**3 */
	felem_mul(tmp, y2, ftmp);
	/* tmp[i] < 17*2^120 */

	/* r = ftmp5 = (s2 - s1)*2 */
	felem_diff_128_64(tmp, ftmp6);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp5, tmp);
	y_equal = felem_is_zero(ftmp5);
	felem_scalar64(ftmp5, 2);
	/* ftmp5[i] < 2^61 */

	/* Same point (and neither is infinity): the addition formula
	 * degenerates, so fall back to doubling. This branch is not taken
	 * during single-point multiplication (see comment above). */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
		{
		point_double(x3, y3, z3, x1, y1, z1);
		return;
		}

	/* I = ftmp = (2h)**2 */
	felem_assign(ftmp, ftmp4);
	felem_scalar64(ftmp, 2);
	/* ftmp[i] < 2^61 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 17*2^122 */
	felem_reduce(ftmp, tmp);

	/* J = ftmp2 = h * I */
	felem_mul(tmp, ftmp4, ftmp);
	felem_reduce(ftmp2, tmp);

	/* V = ftmp4 = U1 * I */
	felem_mul(tmp, ftmp3, ftmp);
	felem_reduce(ftmp4, tmp);

	/* x_out = r**2 - J - 2V */
	felem_square(tmp, ftmp5);
	/* tmp[i] < 17*2^122 */
	felem_diff_128_64(tmp, ftmp2);
	/* tmp[i] < 17*2^122 + 2^63 */
	felem_assign(ftmp3, ftmp4);
	felem_scalar64(ftmp4, 2);
	/* ftmp4[i] < 2^61 */
	felem_diff_128_64(tmp, ftmp4);
	/* tmp[i] < 17*2^122 + 2^64 */
	felem_reduce(x_out, tmp);

	/* y_out = r(V-x_out) - 2 * s1 * J */
	felem_diff64(ftmp3, x_out);
	/* ftmp3[i] < 2^60 + 2^60
	 *          = 2^61 */
	felem_mul(tmp, ftmp5, ftmp3);
	/* tmp[i] < 17*2^122 */
	felem_mul(tmp2, ftmp6, ftmp2);
	/* tmp2[i] < 17*2^120 */
	felem_scalar128(tmp2, 2);
	/* tmp2[i] < 17*2^121 */
	felem_diff128(tmp, tmp2);
	/* tmp[i] < 2^127 - 2^69 + 17*2^122
	 *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
	 *        < 2^127 */
	felem_reduce(y_out, tmp);

	/* Point-at-infinity handling: if either input had z == 0, the result
	 * is the other input, selected in constant time. */
	copy_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
	}
1207 
1208 /* Base point pre computation
1209  * --------------------------
1210  *
1211  * Two different sorts of precomputed tables are used in the following code.
1212  * Each contain various points on the curve, where each point is three field
1213  * elements (x, y, z).
1214  *
1215  * For the base point table, z is usually 1 (0 for the point at infinity).
1216  * This table has 16 elements:
1217  * index | bits | point
1218  * ------+---------+------------------------------
1219  * 0 | 0 0 0 0 | 0G
1220  * 1 | 0 0 0 1 | 1G
1221  * 2 | 0 0 1 0 | 2^130G
1222  * 3 | 0 0 1 1 | (2^130 + 1)G
1223  * 4 | 0 1 0 0 | 2^260G
1224  * 5 | 0 1 0 1 | (2^260 + 1)G
1225  * 6 | 0 1 1 0 | (2^260 + 2^130)G
1226  * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1227  * 8 | 1 0 0 0 | 2^390G
1228  * 9 | 1 0 0 1 | (2^390 + 1)G
1229  * 10 | 1 0 1 0 | (2^390 + 2^130)G
1230  * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1231  * 12 | 1 1 0 0 | (2^390 + 2^260)G
1232  * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1233  * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1234  * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1235  *
1236  * The reason for this is so that we can clock bits into four different
1237  * locations when doing simple scalar multiplies against the base point.
1238  *
1239  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1240 
1241 /* gmul is the table of precomputed base points */
static const felem gmul[16][3] =
	/* Each entry is a Jacobian triple {x, y, z}. Index 0 is the point at
	 * infinity (all-zero); every other entry has z = 1. See the table
	 * layout comment above for which multiple of G each index encodes. */
	{{{0, 0, 0, 0, 0, 0, 0, 0, 0},
	  {0, 0, 0, 0, 0, 0, 0, 0, 0},
	  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
	   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
	   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
	  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
	   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
	   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
	   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
	   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
	  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
	   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
	   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
	   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
	   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
	  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
	   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
	   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
	   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
	   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
	  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
	   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
	   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
	   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
	   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
	  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
	   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
	   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
	   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
	   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
	  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
	   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
	   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
	   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
	   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
	  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
	   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
	   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
	   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
	   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
	  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
	   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
	   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
	   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
	   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
	  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
	   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
	   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
	   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
	   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
	  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
	   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
	   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
	   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
	   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
	  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
	   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
	   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
	   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
	   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
	  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
	   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
	   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
	   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
	   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
	  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
	   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
	   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
	   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
	   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
	  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
	   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
	   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
	   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
	   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
	  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
	   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
	   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
1351 
1352 /* select_point selects the |idx|th point from a precomputation table and
1353  * copies it to out. */
1354 static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3],
1355  felem out[3])
1356  {
1357  unsigned i, j;
1358  limb *outlimbs = &out[0][0];
1359  memset(outlimbs, 0, 3 * sizeof(felem));
1360 
1361  for (i = 0; i < size; i++)
1362  {
1363  const limb *inlimbs = &pre_comp[i][0][0];
1364  limb mask = i ^ idx;
1365  mask |= mask >> 4;
1366  mask |= mask >> 2;
1367  mask |= mask >> 1;
1368  mask &= 1;
1369  mask--;
1370  for (j = 0; j < NLIMBS * 3; j++)
1371  outlimbs[j] |= inlimbs[j] & mask;
1372  }
1373  }
1374 
1375 /* get_bit returns the |i|th bit in |in| */
1376 static char get_bit(const felem_bytearray in, int i)
1377  {
1378  if (i < 0)
1379  return 0;
1380  return (in[i >> 3] >> (i & 7)) & 1;
1381  }
1382 
1383 /* Interleaved point multiplication using precomputed point multiples:
1384  * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
1385  * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
1386  * of the generator, using certain (large) precomputed multiples in g_pre_comp.
1387  * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
	{
	int i, skip;
	unsigned num, gen_mul = (g_scalar != NULL);
	felem nq[3], tmp[4];
	limb bits;
	u8 sign, digit;

	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));

	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (last quarter of rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1; /* save two point operations in the first round */
	for (i = (num_points ? 520 : 130); i >= 0; --i)
		{
		/* double */
		if (!skip)
			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

		/* add multiples of the generator */
		if (gen_mul && (i <= 130))
			{
			/* The 4-bit table index takes one bit from each 130-bit
			 * quarter of the generator scalar (see the gmul layout
			 * comment above). */
			bits = get_bit(g_scalar, i + 390) << 3;
			if (i < 130)
				{
				bits |= get_bit(g_scalar, i + 260) << 2;
				bits |= get_bit(g_scalar, i + 130) << 1;
				bits |= get_bit(g_scalar, i);
				}
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp, tmp);
			if (!skip)
				{
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
				}
			else
				{
				/* First addition: nq is still infinity, so just copy */
				memcpy(nq, tmp, 3 * sizeof(felem));
				skip = 0;
				}
			}

		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
			{
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
				{
				/* Gather a 6-bit window and recode it into a signed
				 * digit plus sign via the shared nistp helper. */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

				/* select the point to add or subtract, in constant time */
				select_point(digit, 17, pre_comp[num], tmp);
				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
				copy_conditional(tmp[1], tmp[3], (-(limb) sign));

				if (!skip)
					{
					point_add(nq[0], nq[1], nq[2],
						nq[0], nq[1], nq[2],
						mixed, tmp[0], tmp[1], tmp[2]);
					}
				else
					{
					/* First addition: nq is still infinity, so just copy */
					memcpy(nq, tmp, 3 * sizeof(felem));
					skip = 0;
					}
				}
			}
		}
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
	}
1474 
1475 
1476 /* Precomputation for the group generator. */
1477 typedef struct {
1478  felem g_pre_comp[16][3];
1481 
1483  {
1484  static const EC_METHOD ret = {
1504  0 /* point_set_compressed_coordinates */,
1505  0 /* point2oct */,
1506  0 /* oct2point */,
1520  0 /* field_div */,
1521  0 /* field_encode */,
1522  0 /* field_decode */,
1523  0 /* field_set_to_one */ };
1524 
1525  return &ret;
1526  }
1527 
1528 
1529 /******************************************************************************/
1530 /* FUNCTIONS TO MANAGE PRECOMPUTATION
1531  */
1532 
1533 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1534  {
1535  NISTP521_PRE_COMP *ret = NULL;
1537  if (!ret)
1538  {
1540  return ret;
1541  }
1542  memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1543  ret->references = 1;
1544  return ret;
1545  }
1546 
1547 static void *nistp521_pre_comp_dup(void *src_)
1548  {
1549  NISTP521_PRE_COMP *src = src_;
1550 
1551  /* no need to actually copy, these objects never change! */
1553 
1554  return src_;
1555  }
1556 
1557 static void nistp521_pre_comp_free(void *pre_)
1558  {
1559  int i;
1560  NISTP521_PRE_COMP *pre = pre_;
1561 
1562  if (!pre)
1563  return;
1564 
1566  if (i > 0)
1567  return;
1568 
1569  OPENSSL_free(pre);
1570  }
1571 
1572 static void nistp521_pre_comp_clear_free(void *pre_)
1573  {
1574  int i;
1575  NISTP521_PRE_COMP *pre = pre_;
1576 
1577  if (!pre)
1578  return;
1579 
1581  if (i > 0)
1582  return;
1583 
1584  OPENSSL_cleanse(pre, sizeof(*pre));
1585  OPENSSL_free(pre);
1586  }
1587 
1588 /******************************************************************************/
1589 /* OPENSSL EC_METHOD FUNCTIONS
1590  */
1591 
1593  {
1594  int ret;
1595  ret = ec_GFp_simple_group_init(group);
1596  group->a_is_minus3 = 1;
1597  return ret;
1598  }
1599 
1601  const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
1602  {
1603  int ret = 0;
1604  BN_CTX *new_ctx = NULL;
1605  BIGNUM *curve_p, *curve_a, *curve_b;
1606 
1607  if (ctx == NULL)
1608  if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1609  BN_CTX_start(ctx);
1610  if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1611  ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1612  ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
1613  BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1614  BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1615  BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1616  if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1617  (BN_cmp(curve_b, b)))
1618  {
1621  goto err;
1622  }
1624  ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1625 err:
1626  BN_CTX_end(ctx);
1627  if (new_ctx != NULL)
1628  BN_CTX_free(new_ctx);
1629  return ret;
1630  }
1631 
1632 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1633  * (X', Y') = (X/Z^2, Y/Z^3) */
1635  const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
1636  {
1637  felem z1, z2, x_in, y_in, x_out, y_out;
1638  largefelem tmp;
1639 
1640  if (EC_POINT_is_at_infinity(group, point))
1641  {
1644  return 0;
1645  }
1646  if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1647  (!BN_to_felem(z1, &point->Z))) return 0;
1648  felem_inv(z2, z1);
1649  felem_square(tmp, z2); felem_reduce(z1, tmp);
1650  felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
1651  felem_contract(x_out, x_in);
1652  if (x != NULL)
1653  {
1654  if (!felem_to_BN(x, x_out))
1655  {
1657  return 0;
1658  }
1659  }
1660  felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
1661  felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
1662  felem_contract(y_out, y_in);
1663  if (y != NULL)
1664  {
1665  if (!felem_to_BN(y, y_out))
1666  {
1668  return 0;
1669  }
1670  }
1671  return 1;
1672  }
1673 
1674 static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */])
1675  {
1676  /* Runs in constant time, unless an input is the point at infinity
1677  * (which normally shouldn't happen). */
1679  num,
1680  points,
1681  sizeof(felem),
1682  tmp_felems,
1683  (void (*)(void *)) felem_one,
1684  (int (*)(const void *)) felem_is_zero_int,
1685  (void (*)(void *, const void *)) felem_assign,
1686  (void (*)(void *, const void *)) felem_square_reduce,
1687  (void (*)(void *, const void *, const void *)) felem_mul_reduce,
1688  (void (*)(void *, const void *)) felem_inv,
1689  (void (*)(void *, const void *)) felem_contract);
1690  }
1691 
1692 /* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
1693  * Result is stored in r (r can equal one of the inputs). */
1695  const BIGNUM *scalar, size_t num, const EC_POINT *points[],
1696  const BIGNUM *scalars[], BN_CTX *ctx)
1697  {
1698  int ret = 0;
1699  int j;
1700  int mixed = 0;
1701  BN_CTX *new_ctx = NULL;
1702  BIGNUM *x, *y, *z, *tmp_scalar;
1703  felem_bytearray g_secret;
1704  felem_bytearray *secrets = NULL;
1705  felem (*pre_comp)[17][3] = NULL;
1706  felem *tmp_felems = NULL;
1707  felem_bytearray tmp;
1708  unsigned i, num_bytes;
1709  int have_pre_comp = 0;
1710  size_t num_points = num;
1711  felem x_in, y_in, z_in, x_out, y_out, z_out;
1712  NISTP521_PRE_COMP *pre = NULL;
1713  felem (*g_pre_comp)[3] = NULL;
1714  EC_POINT *generator = NULL;
1715  const EC_POINT *p = NULL;
1716  const BIGNUM *p_scalar = NULL;
1717 
1718  if (ctx == NULL)
1719  if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1720  BN_CTX_start(ctx);
1721  if (((x = BN_CTX_get(ctx)) == NULL) ||
1722  ((y = BN_CTX_get(ctx)) == NULL) ||
1723  ((z = BN_CTX_get(ctx)) == NULL) ||
1724  ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1725  goto err;
1726 
1727  if (scalar != NULL)
1728  {
1729  pre = EC_EX_DATA_get_data(group->extra_data,
1730  nistp521_pre_comp_dup, nistp521_pre_comp_free,
1731  nistp521_pre_comp_clear_free);
1732  if (pre)
1733  /* we have precomputation, try to use it */
1734  g_pre_comp = &pre->g_pre_comp[0];
1735  else
1736  /* try to use the standard precomputation */
1737  g_pre_comp = (felem (*)[3]) gmul;
1738  generator = EC_POINT_new(group);
1739  if (generator == NULL)
1740  goto err;
1741  /* get the generator from precomputation */
1742  if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1743  !felem_to_BN(y, g_pre_comp[1][1]) ||
1744  !felem_to_BN(z, g_pre_comp[1][2]))
1745  {
1747  goto err;
1748  }
1750  generator, x, y, z, ctx))
1751  goto err;
1752  if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1753  /* precomputation matches generator */
1754  have_pre_comp = 1;
1755  else
1756  /* we don't have valid precomputation:
1757  * treat the generator as a random point */
1758  num_points++;
1759  }
1760 
1761  if (num_points > 0)
1762  {
1763  if (num_points >= 2)
1764  {
1765  /* unless we precompute multiples for just one point,
1766  * converting those into affine form is time well spent */
1767  mixed = 1;
1768  }
1769  secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1770  pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1771  if (mixed)
1772  tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1773  if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
1774  {
1776  goto err;
1777  }
1778 
1779  /* we treat NULL scalars as 0, and NULL points as points at infinity,
1780  * i.e., they contribute nothing to the linear combination */
1781  memset(secrets, 0, num_points * sizeof(felem_bytearray));
1782  memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1783  for (i = 0; i < num_points; ++i)
1784  {
1785  if (i == num)
1786  /* we didn't have a valid precomputation, so we pick
1787  * the generator */
1788  {
1789  p = EC_GROUP_get0_generator(group);
1790  p_scalar = scalar;
1791  }
1792  else
1793  /* the i^th point */
1794  {
1795  p = points[i];
1796  p_scalar = scalars[i];
1797  }
1798  if ((p_scalar != NULL) && (p != NULL))
1799  {
1800  /* reduce scalar to 0 <= scalar < 2^521 */
1801  if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar)))
1802  {
1803  /* this is an unusual input, and we don't guarantee
1804  * constant-timeness */
1805  if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
1806  {
1808  goto err;
1809  }
1810  num_bytes = BN_bn2bin(tmp_scalar, tmp);
1811  }
1812  else
1813  num_bytes = BN_bn2bin(p_scalar, tmp);
1814  flip_endian(secrets[i], tmp, num_bytes);
1815  /* precompute multiples */
1816  if ((!BN_to_felem(x_out, &p->X)) ||
1817  (!BN_to_felem(y_out, &p->Y)) ||
1818  (!BN_to_felem(z_out, &p->Z))) goto err;
1819  memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1820  memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1821  memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1822  for (j = 2; j <= 16; ++j)
1823  {
1824  if (j & 1)
1825  {
1826  point_add(
1827  pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1828  pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
1829  0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
1830  }
1831  else
1832  {
1833  point_double(
1834  pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1835  pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
1836  }
1837  }
1838  }
1839  }
1840  if (mixed)
1841  make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1842  }
1843 
1844  /* the scalar for the generator */
1845  if ((scalar != NULL) && (have_pre_comp))
1846  {
1847  memset(g_secret, 0, sizeof(g_secret));
1848  /* reduce scalar to 0 <= scalar < 2^521 */
1849  if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar)))
1850  {
1851  /* this is an unusual input, and we don't guarantee
1852  * constant-timeness */
1853  if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
1854  {
1856  goto err;
1857  }
1858  num_bytes = BN_bn2bin(tmp_scalar, tmp);
1859  }
1860  else
1861  num_bytes = BN_bn2bin(scalar, tmp);
1862  flip_endian(g_secret, tmp, num_bytes);
1863  /* do the multiplication with generator precomputation*/
1864  batch_mul(x_out, y_out, z_out,
1865  (const felem_bytearray (*)) secrets, num_points,
1866  g_secret,
1867  mixed, (const felem (*)[17][3]) pre_comp,
1868  (const felem (*)[3]) g_pre_comp);
1869  }
1870  else
1871  /* do the multiplication without generator precomputation */
1872  batch_mul(x_out, y_out, z_out,
1873  (const felem_bytearray (*)) secrets, num_points,
1874  NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
1875  /* reduce the output to its unique minimal representation */
1876  felem_contract(x_in, x_out);
1877  felem_contract(y_in, y_out);
1878  felem_contract(z_in, z_out);
1879  if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1880  (!felem_to_BN(z, z_in)))
1881  {
1883  goto err;
1884  }
1885  ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1886 
1887 err:
1888  BN_CTX_end(ctx);
1889  if (generator != NULL)
1890  EC_POINT_free(generator);
1891  if (new_ctx != NULL)
1892  BN_CTX_free(new_ctx);
1893  if (secrets != NULL)
1894  OPENSSL_free(secrets);
1895  if (pre_comp != NULL)
1896  OPENSSL_free(pre_comp);
1897  if (tmp_felems != NULL)
1898  OPENSSL_free(tmp_felems);
1899  return ret;
1900  }
1901 
1903  {
1904  int ret = 0;
1905  NISTP521_PRE_COMP *pre = NULL;
1906  int i, j;
1907  BN_CTX *new_ctx = NULL;
1908  BIGNUM *x, *y;
1909  EC_POINT *generator = NULL;
1910  felem tmp_felems[16];
1911 
1912  /* throw away old precomputation */
1913  EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
1914  nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
1915  if (ctx == NULL)
1916  if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1917  BN_CTX_start(ctx);
1918  if (((x = BN_CTX_get(ctx)) == NULL) ||
1919  ((y = BN_CTX_get(ctx)) == NULL))
1920  goto err;
1921  /* get the generator */
1922  if (group->generator == NULL) goto err;
1923  generator = EC_POINT_new(group);
1924  if (generator == NULL)
1925  goto err;
1926  BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x);
1927  BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y);
1928  if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1929  goto err;
1930  if ((pre = nistp521_pre_comp_new()) == NULL)
1931  goto err;
1932  /* if the generator is the standard one, use built-in precomputation */
1933  if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1934  {
1935  memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1936  ret = 1;
1937  goto err;
1938  }
1939  if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
1940  (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
1941  (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
1942  goto err;
1943  /* compute 2^130*G, 2^260*G, 2^390*G */
1944  for (i = 1; i <= 4; i <<= 1)
1945  {
1946  point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
1947  pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
1948  pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1949  for (j = 0; j < 129; ++j)
1950  {
1951  point_double(pre->g_pre_comp[2*i][0],
1952  pre->g_pre_comp[2*i][1],
1953  pre->g_pre_comp[2*i][2],
1954  pre->g_pre_comp[2*i][0],
1955  pre->g_pre_comp[2*i][1],
1956  pre->g_pre_comp[2*i][2]);
1957  }
1958  }
1959  /* g_pre_comp[0] is the point at infinity */
1960  memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1961  /* the remaining multiples */
1962  /* 2^130*G + 2^260*G */
1963  point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
1964  pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
1965  pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
1966  0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1967  pre->g_pre_comp[2][2]);
1968  /* 2^130*G + 2^390*G */
1969  point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
1970  pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
1971  pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1972  0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1973  pre->g_pre_comp[2][2]);
1974  /* 2^260*G + 2^390*G */
1975  point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
1976  pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
1977  pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1978  0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
1979  pre->g_pre_comp[4][2]);
1980  /* 2^130*G + 2^260*G + 2^390*G */
1981  point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
1982  pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
1983  pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1984  0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1985  pre->g_pre_comp[2][2]);
1986  for (i = 1; i < 8; ++i)
1987  {
1988  /* odd multiples: add G */
1989  point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
1990  pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
1991  pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
1992  0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
1993  pre->g_pre_comp[1][2]);
1994  }
1995  make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1996 
1997  if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
1998  nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
1999  goto err;
2000  ret = 1;
2001  pre = NULL;
2002  err:
2003  BN_CTX_end(ctx);
2004  if (generator != NULL)
2005  EC_POINT_free(generator);
2006  if (new_ctx != NULL)
2007  BN_CTX_free(new_ctx);
2008  if (pre)
2009  nistp521_pre_comp_free(pre);
2010  return ret;
2011  }
2012 
2014  {
2015  if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2016  nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
2017  != NULL)
2018  return 1;
2019  else
2020  return 0;
2021  }
2022 
#else
/* When the 64-bit gcc __uint128_t implementation is compiled out, keep a
 * harmless file-scope declaration so the translation unit is not empty
 * (ISO C requires at least one declaration per translation unit). */
static void *dummy=&dummy;
#endif