cryptlib  3.4.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Macros
gf128mul.c
Go to the documentation of this file.
1 /*
2  ---------------------------------------------------------------------------
3  Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
4 
5  LICENSE TERMS
6 
7  The redistribution and use of this software (with or without changes)
8  is allowed without the payment of fees or royalties provided that:
9 
10  1. source code distributions include the above copyright notice, this
11  list of conditions and the following disclaimer;
12 
13  2. binary distributions include the above copyright notice, this list
14  of conditions and the following disclaimer in their documentation;
15 
16  3. the name of the copyright holder is not used to endorse products
17  built using this software without specific written permission.
18 
19  DISCLAIMER
20 
21  This software is provided 'as is' with no explicit or implied warranties
22  in respect of its properties, including, but not limited to, correctness
23  and/or fitness for purpose.
24  ---------------------------------------------------------------------------
25  Issue Date: 20/12/2007
26 
27  This file provides fast multiplication in GF(128) as required by several
28  cryptographic authentication modes (see gfmul128.h).
29 */
30 
31 /* Speed critical loops can be unrolled to gain speed but consume more memory */
32 #if 1
33 # define UNROLL_LOOPS
34 #endif
35 
36 #if defined( INC_ALL ) /* pcg */
37  #include "gf128mul.h"
38  #include "mode_hdr.h"
39  #include "gf_mul_lo.h"
40 #else
41  #include "crypt/gf128mul.h"
42  #include "crypt/mode_hdr.h"
43  #include "crypt/gf_mul_lo.h"
44 #endif /* Compiler-specific includes */
45 
46 #if defined( GF_MODE_LL )
47 # define mode _ll
48 #elif defined( GF_MODE_BL )
49 # define mode _bl
50 #elif defined( GF_MODE_LB )
51 # define mode _lb
52 #elif defined( GF_MODE_BB )
53 # define mode _bb
54 #else
55 # error mode is not defined
56 #endif
57 
58 #if defined( GF_MODE_LL) || defined( GF_MODE_LB )
59 # define GF_INDEX(i) (i)
60 #else
61 # define GF_INDEX(i) (15 - (i))
62 #endif
63 
64 /* A slow field multiplier */
65 
66 void gf_mul(gf_t a, const gf_t b)
67 { gf_t p[8];
68  uint_8t *q, ch;
69  int i;
70 
71  copy_block_aligned(p[0], a);
72  for(i = 0; i < 7; ++i)
73  gf_mulx1(mode)(p[i + 1], p[i]);
74 
75  q = (uint_8t*)(a == b ? p[0] : b);
76  memset(a, 0, GF_BYTE_LEN);
77  for(i = 15 ; ; )
78  {
79  ch = q[GF_INDEX(i)];
80  if(ch & X_0)
81  xor_block_aligned(a, a, p[0]);
82  if(ch & X_1)
83  xor_block_aligned(a, a, p[1]);
84  if(ch & X_2)
85  xor_block_aligned(a, a, p[2]);
86  if(ch & X_3)
87  xor_block_aligned(a, a, p[3]);
88  if(ch & X_4)
89  xor_block_aligned(a, a, p[4]);
90  if(ch & X_5)
91  xor_block_aligned(a, a, p[5]);
92  if(ch & X_6)
93  xor_block_aligned(a, a, p[6]);
94  if(ch & X_7)
95  xor_block_aligned(a, a, p[7]);
96  if(!i--)
97  break;
98  gf_mulx8(mode)(a);
99  }
100 }
101 
102 #if defined( TABLES_64K )
103 
104 /* This version uses 64k bytes of table space on the stack.
105  An input variable field value in a[] has to be multiplied
106  by a key value in g[] that changes far less frequently.
107 
108  To do this a[] is split up into 16 smaller field values,
109  each one byte in length. For the 256 values of each of
110  these smaller values, we can precompute the result of
111  mulltiplying g by this field value. We can then combine
112  these values to provide the full multiply. So for each
113  of 16 bytes we have a table of 256 field values each of
114  16 bytes - 64k bytes in total.
115 */
116 
117 void init_64k_table(const gf_t g, gf_t64k_t t)
118 { int i = 0, j, k;
119 
120  /*
121  depending on the representation we have to process bits
122  within bytes high to low (0xe1 style ) or low to high
123  (0x87 style). We start by producing the powers x ,x^2
124  .. x^7 and put them in t[0][1], t[0][2] .. t[128] or in
125  t[128], t[64] .. t[1] depending on the bit order in use.
126  */
127 
128  /* clear the element for the zero field element */
129  memset(t[0][0], 0, GF_BYTE_LEN);
130 
131 #if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
132 
133  /* g -> t[0][1], generate t[0][2] ... */
134  memcpy(t[0][1], g, GF_BYTE_LEN);
135  for(j = 1; j <= 64; j <<= 1)
136  gf_mulx1(mode)(t[0][j + j], t[0][j]);
137 #else
138 
139  /* g -> t[0][128], generate t[0][64] ... */
140  memcpy(t[0][128], g, GF_BYTE_LEN);
141  for(j = 64; j >= 1; j >>= 1)
142  gf_mulx1(mode)(t[0][j], t[0][j + j]);
143 #endif
144 
145  for( ; ; )
146  {
147  /* if { n } stands for the field value represented by
148  the integer n, we can express higher multiplies in
149  the table as follows:
150 
151  1. g * { 3} = g * {2} ^ g * {1}
152 
153  2. g * { 5} = g * {4} ^ g * {1}
154  g * { 6} = g * {4} ^ g * {2}
155  g * { 7} = g * {4} ^ g * {3}
156 
157  3. g * { 9} = g * {8} ^ g * {1}
158  g * {10} = g * {8} ^ g * {2}
159  ....
160 
161  and so on. This is what the following loops do.
162  */
163  for(j = 2; j < 256; j += j)
164  for(k = 1; k < j; ++k)
165  xor_block_aligned(t[i][j + k], t[i][j], t[i][k]);
166 
167  if(++i == GF_BYTE_LEN) /* all 16 byte positions done */
168  return;
169 
170  /* We now move to the next byte up and set up its eight
171  starting values by multiplying the values in the
172  lower table by x^8
173  */
174  memset(t[i][0], 0, GF_BYTE_LEN);
175  for(j = 128; j > 0; j >>= 1)
176  {
177  memcpy(t[i][j], t[i - 1][j], GF_BYTE_LEN);
178  gf_mulx8(mode)(t[i][j]);
179  }
180  }
181 }
182 
183 #define xor_64k(i,ap,t,r) xor_block_aligned(r, r, t[i][ap[GF_INDEX(i)]])
184 
185 #if defined( UNROLL_LOOPS )
186 
187 void gf_mul_64k(gf_t a, const gf_t64k_t t, gf_t r)
188 { uint_8t *ap = (uint_8t*)a;
189  memset(r, 0, GF_BYTE_LEN);
190  xor_64k(15, ap, t, r); xor_64k(14, ap, t, r);
191  xor_64k(13, ap, t, r); xor_64k(12, ap, t, r);
192  xor_64k(11, ap, t, r); xor_64k(10, ap, t, r);
193  xor_64k( 9, ap, t, r); xor_64k( 8, ap, t, r);
194  xor_64k( 7, ap, t, r); xor_64k( 6, ap, t, r);
195  xor_64k( 5, ap, t, r); xor_64k( 4, ap, t, r);
196  xor_64k( 3, ap, t, r); xor_64k( 2, ap, t, r);
197  xor_64k( 1, ap, t, r); xor_64k( 0, ap, t, r);
198  copy_block_aligned(a, r);
199 }
200 
201 #else
202 
203 void gf_mul_64k(gf_t a, const gf_t64k_t t, gf_t r)
204 { int i;
205  uint_8t *ap = (uint_8t*)a;
206  memset(r, 0, GF_BYTE_LEN);
207  for(i = 15; i >= 0; --i)
208  {
209  xor_64k(i,ap,t,r);
210  }
211  copy_block_aligned(a, r);
212 }
213 
214 #endif
215 
216 #endif
217 
218 #if defined( TABLES_8K )
219 
220 /* This version uses 8k bytes of table space on the stack.
221  An input field value in a[] has to be multiplied by a
222  key value in g[]. To do this a[] is split up into 32
223  smaller field values each 4-bits in length. For the
224  16 values of each of these smaller field values we can
225  precompute the result of mulltiplying g[] by the field
226  value in question. So for each of 32 nibbles we have a
227  table of 16 field values, each of 16 bytes - 8k bytes
228  in total.
229 */
230 void init_8k_table(const gf_t g, gf_t8k_t t)
231 { int i = 0, j, k;
232 
233  /* do the low 4-bit nibble first - t[0][16] - and note
234  that the unit multiplier sits at 0x01 - t[0][1] in
235  the table. Then multiplies by x go at 2, 4, 8
236  */
237  /* set the table elements for a zero multiplier */
238  memset(t[0][0], 0, GF_BYTE_LEN);
239  memset(t[1][0], 0, GF_BYTE_LEN);
240 
241 #if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
242 
243  /* t[0][1] = g, compute t[0][2], t[0][4], t[0][8] */
244  memcpy(t[0][1], g, GF_BYTE_LEN);
245  for(j = 1; j <= 4; j <<= 1)
246  gf_mulx1(mode)(t[0][j + j], t[0][j]);
247  /* t[1][1] = t[0][1] * x^4 = t[0][8] * x */
248  gf_mulx1(mode)(t[1][1], t[0][8]);
249  for(j = 1; j <= 4; j <<= 1)
250  gf_mulx1(mode)(t[1][j + j], t[1][j]);
251 #else
252 
253  /* g -> t[0][8], compute t[0][4], t[0][2], t[0][1] */
254  memcpy(t[1][8], g, GF_BYTE_LEN);
255  for(j = 4; j >= 1; j >>= 1)
256  gf_mulx1(mode)(t[1][j], t[1][j + j]);
257  /* t[1][1] = t[0][1] * x^4 = t[0][8] * x */
258  gf_mulx1(mode)(t[0][8], t[1][1]);
259  for(j = 4; j >= 1; j >>= 1)
260  gf_mulx1(mode)(t[0][j], t[0][j + j]);
261 #endif
262 
263  for( ; ; )
264  {
265  for(j = 2; j < 16; j += j)
266  for(k = 1; k < j; ++k)
267  xor_block_aligned(t[i][j + k], t[i][j], t[i][k]);
268 
269  if(++i == 2 * GF_BYTE_LEN)
270  return;
271 
272  if(i > 1)
273  {
274  memset(t[i][0], 0, GF_BYTE_LEN);
275  for(j = 8; j > 0; j >>= 1)
276  {
277  memcpy(t[i][j], t[i - 2][j], GF_BYTE_LEN);
278  gf_mulx8(mode)(t[i][j]);
279  }
280  }
281 
282  }
283 }
284 
285 #define xor_8k(i,ap,t,r) \
286  xor_block_aligned(r, r, t[i + i][ap[GF_INDEX(i)] & 15]); \
287  xor_block_aligned(r, r, t[i + i + 1][ap[GF_INDEX(i)] >> 4])
288 
289 #if defined( UNROLL_LOOPS )
290 
291 void gf_mul_8k(gf_t a, const gf_t8k_t t, gf_t r)
292 { uint_8t *ap = (uint_8t*)a;
293  memset(r, 0, GF_BYTE_LEN);
294  xor_8k(15, ap, t, r); xor_8k(14, ap, t, r);
295  xor_8k(13, ap, t, r); xor_8k(12, ap, t, r);
296  xor_8k(11, ap, t, r); xor_8k(10, ap, t, r);
297  xor_8k( 9, ap, t, r); xor_8k( 8, ap, t, r);
298  xor_8k( 7, ap, t, r); xor_8k( 6, ap, t, r);
299  xor_8k( 5, ap, t, r); xor_8k( 4, ap, t, r);
300  xor_8k( 3, ap, t, r); xor_8k( 2, ap, t, r);
301  xor_8k( 1, ap, t, r); xor_8k( 0, ap, t, r);
302  copy_block_aligned(a, r);
303 }
304 
305 #else
306 
307 void gf_mul_8k(gf_t a, const gf_t8k_t t, gf_t r)
308 { int i;
309  uint_8t *ap = (uint_8t*)a;
310  memset(r, 0, GF_BYTE_LEN);
311  for(i = 15; i >= 0; --i)
312  {
313  xor_8k(i,ap,t,r);
314  }
315  memcpy(a, r, GF_BYTE_LEN);
316 }
317 
318 #endif
319 
320 #endif
321 
322 #if defined( TABLES_4K )
323 
324 /* This version uses 4k bytes of table space on the stack.
325  A 16 byte buffer has to be multiplied by a 16 byte key
326  value in GF(128). If we consider a GF(128) value in a
327  single byte, we can construct a table of the 256 16
328  byte values that result from multiplying g by the 256
329  values of this byte. This requires 4096 bytes.
330 
331  If we take the highest byte in the buffer and use this
332  table to multiply it by g, we then have to multiply it
333  by x^120 to get the final value. For the next highest
334  byte the result has to be multiplied by x^112 and so on.
335 
336  But we can do this by accumulating the result in an
337  accumulator starting with the result for the top byte.
338  We repeatedly multiply the accumulator value by x^8 and
339  then add in (i.e. xor) the 16 bytes of the next lower
340  byte in the buffer, stopping when we reach the lowest
341  byte. This requires a 4096 byte table.
342 */
343 
344 void init_4k_table(const gf_t g, gf_t4k_t t)
345 { int j, k;
346 
347  memset(t[0], 0, GF_BYTE_LEN);
348 
349 #if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
350 
351  memcpy(t[1], g, GF_BYTE_LEN);
352  for(j = 1; j <= 64; j <<= 1)
353  gf_mulx1(mode)(t[j + j], t[j]);
354 #else
355 
356  memcpy(t[128], g, GF_BYTE_LEN);
357  for(j = 64; j >= 1; j >>= 1)
358  gf_mulx1(mode)(t[j], t[j + j]);
359 #endif
360 
361  for(j = 2; j < 256; j += j)
362  for(k = 1; k < j; ++k)
363  xor_block_aligned(t[j + k], t[j], t[k]);
364 }
365 
366 #define xor_4k(i,ap,t,r) gf_mulx8(mode)(r); xor_block_aligned(r, r, t[ap[GF_INDEX(i)]])
367 
368 #if defined( UNROLL_LOOPS )
369 
370 void gf_mul_4k(gf_t a, const gf_t4k_t t, gf_t r)
371 { uint_8t *ap = (uint_8t*)a;
372  memset(r, 0, GF_BYTE_LEN);
373  xor_4k(15, ap, t, r); xor_4k(14, ap, t, r);
374  xor_4k(13, ap, t, r); xor_4k(12, ap, t, r);
375  xor_4k(11, ap, t, r); xor_4k(10, ap, t, r);
376  xor_4k( 9, ap, t, r); xor_4k( 8, ap, t, r);
377  xor_4k( 7, ap, t, r); xor_4k( 6, ap, t, r);
378  xor_4k( 5, ap, t, r); xor_4k( 4, ap, t, r);
379  xor_4k( 3, ap, t, r); xor_4k( 2, ap, t, r);
380  xor_4k( 1, ap, t, r); xor_4k( 0, ap, t, r);
381  copy_block_aligned(a, r);
382 }
383 
384 #else
385 
386 void gf_mul_4k(gf_t a, const gf_t4k_t t, gf_t r)
387 { int i = 15;
388  uint_8t *ap = (uint_8t*)a;
389  memset(r, 0, GF_BYTE_LEN);
390  for(i = 15; i >=0; --i)
391  {
392  xor_4k(i, ap, t, r);
393  }
394  copy_block_aligned(a, r);
395 }
396 
397 #endif
398 
399 #endif
400 
401 #if defined( TABLES_256 )
402 
403 /* This version uses 256 bytes of table space on the stack.
404  A 16 byte buffer has to be multiplied by a 16 byte key
405  value in GF(128). If we consider a GF(128) value in a
406  single 4-bit nibble, we can construct a table of the 16
407  16 byte values that result from the 16 values of this
408  byte. This requires 256 bytes. If we take the highest
409  4-bit nibble in the buffer and use this table to get the
410  result, we then have to multiply by x^124 to get the
411  final value. For the next highest byte the result has to
412  be multiplied by x^120 and so on. But we can do this by
413  accumulating the result in an accumulator starting with
414  the result for the top nibble. We repeatedly multiply
415  the accumulator value by x^4 and then add in (i.e. xor)
416  the 16 bytes of the next lower nibble in the buffer,
417  stopping when we reach the lowest nibble. This uses a
418  256 byte table.
419 */
420 
421 void init_256_table(const gf_t g, gf_t256_t t)
422 { int j, k;
423 
424  memset(t[0], 0, GF_BYTE_LEN);
425 
426 #if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
427 
428  memcpy(t[1], g, GF_BYTE_LEN);
429  for(j = 1; j <= 4; j <<= 1)
430  gf_mulx1(mode)(t[j + j], t[j]);
431 #else
432 
433  memcpy(t[8], g, GF_BYTE_LEN);
434  for(j = 4; j >= 1; j >>= 1)
435  gf_mulx1(mode)(t[j], t[j + j]);
436 #endif
437 
438  for(j = 2; j < 16; j += j)
439  for(k = 1; k < j; ++k)
440  xor_block_aligned(t[j + k], t[j], t[k]);
441 }
442 
443 #define x_lo(i,ap,t,r) gf_mulx4(mode)(r); xor_block_aligned(r, r, t[ap[GF_INDEX(i)] & 0x0f])
444 #define x_hi(i,ap,t,r) gf_mulx4(mode)(r); xor_block_aligned(r, r, t[ap[GF_INDEX(i)] >> 4])
445 
446 #if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
447 #define xor_256(a,b,c,d) x_hi(a,b,c,d); x_lo(a,b,c,d)
448 #else
449 #define xor_256(a,b,c,d) x_lo(a,b,c,d); x_hi(a,b,c,d)
450 #endif
451 
452 #if defined( UNROLL_LOOPS )
453 
454 void gf_mul_256(gf_t a, const gf_t256_t t, gf_t r)
455 { uint_8t *ap = (uint_8t*)a;
456  memset(r, 0, GF_BYTE_LEN);
457  xor_256(15, ap, t, r); xor_256(14, ap, t, r);
458  xor_256(13, ap, t, r); xor_256(12, ap, t, r);
459  xor_256(11, ap, t, r); xor_256(10, ap, t, r);
460  xor_256( 9, ap, t, r); xor_256( 8, ap, t, r);
461  xor_256( 7, ap, t, r); xor_256( 6, ap, t, r);
462  xor_256( 5, ap, t, r); xor_256( 4, ap, t, r);
463  xor_256( 3, ap, t, r); xor_256( 2, ap, t, r);
464  xor_256( 1, ap, t, r); xor_256( 0, ap, t, r);
465  copy_block_aligned(a, r);
466 }
467 
468 #else
469 
470 void gf_mul_256(gf_t a, const gf_t256_t t, gf_t r)
471 { int i;
472  uint_8t *ap = (uint_8t*)a;
473  memset(r, 0, GF_BYTE_LEN);
474  for(i = 15; i >= 0; --i)
475  {
476  xor_256(i, ap, t, r);
477  }
478  copy_block_aligned(a, r);
479 }
480 
481 #endif
482 
483 #endif