OpenSSL  1.0.1c
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Macros
aes_x86core.c
Go to the documentation of this file.
1 /* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
28 /*
29  * This is experimental x86[_64] derivative. It assumes little-endian
30  * byte order and expects CPU to sustain unaligned memory references.
31  * It is used as playground for cache-time attack mitigations and
32  * serves as reference C implementation for x86[_64] assembler.
33  *
35  */
36 
37 
/* Compile assert() out unless the build explicitly asks for AES_DEBUG. */
#if !defined(AES_DEBUG) && !defined(NDEBUG)
# define NDEBUG
#endif
43 #include <assert.h>
44 
45 #include <stdlib.h>
46 #include <openssl/aes.h>
47 #include "aes_locl.h"
48 
/*
 * Table-selection switches.  AES_COMPACT_IN_OUTER_ROUNDS makes the outer
 * rounds use the 256-byte table; AES_COMPACT_IN_INNER_ROUNDS does the same
 * for the inner rounds.  Rounds whose switch is not defined reference the
 * 2KB table instead.
 */
#define AES_COMPACT_IN_OUTER_ROUNDS
#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
/*
 * AES_COMPACT_IN_OUTER_ROUNDS alone costs ~30% in performance; adding
 * AES_COMPACT_IN_INNER_ROUNDS reduces the benchmark *further* by a factor
 * of ~2, so the inner-round switch is forced off here.
 */
# undef AES_COMPACT_IN_INNER_ROUNDS
#endif
60 
#if 1
/*
 * Read one word out of every 32-byte stretch of a 256-byte table and fold
 * the values together.  The pointer and the sink are volatile so that the
 * compiler has to perform every load; the intent (per the function name) is
 * to have the whole table touched before data-dependent lookups start.
 *
 * table: start of a 256-byte table.  It is read as unsigned long words, so
 *        it is assumed to be suitably accessible that way on the target
 *        (this file expects the CPU to tolerate unaligned references).
 */
static void prefetch256(const void *table)
{
    const volatile unsigned long *t = table;
    volatile unsigned long ret;
    unsigned long sum = 0;
    size_t i;

    /* 32 is common least cache-line size */
    for (i = 0; i < 256 / sizeof(t[0]); i += 32 / sizeof(t[0]))
        sum ^= t[i];

    /* the volatile store keeps the accumulated loads from being discarded */
    ret = sum;
}
#else
# define prefetch256(t)
#endif
76 
/*
 * Load a 32-bit word directly from the bytes at p, in host byte order and
 * without any alignment handling (this file assumes a little-endian CPU
 * that tolerates unaligned references).  The load goes through a
 * const-qualified pointer so that read-only input buffers keep their
 * qualifier.
 */
#undef GETU32
#define GETU32(p) (*((const u32*)(p)))
79 
/*
 * u64: 64-bit unsigned integer type for this toolchain.
 * U64(C): attaches the matching 64-bit literal suffix to constant C.
 */
#if defined(__MINGW32__) || !(defined(_WIN32) || defined(_WIN64))
# if defined(__arch64__)
typedef unsigned long u64;
#  define U64(C) C##UL
# else
typedef unsigned long long u64;
#  define U64(C) C##ULL
# endif
#else
typedef unsigned __int64 u64;
# define U64(C) C##UI64
#endif
90 
/*
 * ROTATE(a,n): rotate the 32-bit value a left by n bits.  In the inline-asm
 * form n must be a compile-time constant (the "I" constraint).  The macro is
 * left undefined when no intrinsic or asm form applies; the users visible in
 * this file test defined(ROTATE) and fall back to explicit shift pairs.
 */
#undef ROTATE
#if defined(_MSC_VER)
# define ROTATE(a,n) _lrotl(a,n)
#elif defined(__ICC)
/*
 * _lrotl() operates on unsigned long, which is 64 bits wide on LP64 targets,
 * so it would not wrap at bit 31 there; _rotl() is the 32-bit rotate.
 */
# define ROTATE(a,n) _rotl(a,n)
#elif defined(__GNUC__) && __GNUC__>=2
# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
/* __asm__ instead of asm so the macro also builds in strict -std=c99/c11 mode */
#  define ROTATE(a,n) ({ register unsigned int ret; \
                __asm__ ( \
                "roll %1,%0" \
                : "=r"(ret) \
                : "I"(n), "0"(a) \
                : "cc"); \
                ret; \
        })
# endif
#endif
/*
 * Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
 * Te0[x] = S [x].[02, 01, 01, 03];
 * Te1[x] = S [x].[03, 02, 01, 01];
 * Te2[x] = S [x].[01, 03, 02, 01];
 * Te3[x] = S [x].[01, 01, 03, 02];
 *
 * Every 64-bit Te entry holds the same 4-byte column twice.  Reading a
 * 64-bit word that starts k bytes into an entry and truncating it to 32
 * bits therefore yields the column rotated by k bytes on a little-endian
 * CPU.  The macros are meant to be subscripted, e.g. Te1[i]: the subscript
 * is applied to the u64 pointer first and the (u32) cast truncates the
 * loaded value.  The casts keep the table's const qualifier.
 *
 * NOTE(review): with a non-zero offset the 64-bit load for index 255
 * nominally extends past the end of Te; only its low 32 bits are used.
 */
#define Te0 (u32)((const u64*)((const u8*)Te+0))
#define Te1 (u32)((const u64*)((const u8*)Te+3))
#define Te2 (u32)((const u64*)((const u8*)Te+2))
#define Te3 (u32)((const u64*)((const u8*)Te+1))
/*
 * Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
 * Td0[x] = Si[x].[0e, 09, 0d, 0b];
 * Td1[x] = Si[x].[0b, 0e, 09, 0d];
 * Td2[x] = Si[x].[0d, 0b, 0e, 09];
 * Td3[x] = Si[x].[09, 0d, 0b, 0e];
 * Td4[x] = Si[x].[01];
 *
 * Every 64-bit Td entry holds the same 4-byte column twice.  Reading a
 * 64-bit word that starts k bytes into an entry and truncating it to 32
 * bits therefore yields the column rotated by k bytes on a little-endian
 * CPU.  The macros are meant to be subscripted, e.g. Td1[i]: the subscript
 * is applied to the u64 pointer first and the (u32) cast truncates the
 * loaded value.  The casts keep the table's const qualifier.
 *
 * NOTE(review): with a non-zero offset the 64-bit load for index 255
 * nominally extends past the end of Td; only its low 32 bits are used.
 */
#define Td0 (u32)((const u64*)((const u8*)Td+0))
#define Td1 (u32)((const u64*)((const u8*)Td+3))
#define Td2 (u32)((const u64*)((const u8*)Td+2))
#define Td3 (u32)((const u64*)((const u8*)Td+1))
129 
/*
 * Encryption lookup table, 256 entries of 8 bytes (2KB).  Entry x holds the
 * column S[x].[02, 01, 01, 03] twice in one 64-bit word; the Te0..Te3
 * macros read it at byte offsets 0..3 to obtain the four rotations.
 */
130 static const u64 Te[256] = {
131  U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
132  U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
133  U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
134  U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
135  U64(0x5030306050303060), U64(0x0301010203010102),
136  U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
137  U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
138  U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
139  U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
140  U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
141  U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
142  U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
143  U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
144  U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
145  U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
146  U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
147  U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
148  U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
149  U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
150  U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
151  U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
152  U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
153  U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
154  U64(0x5331316253313162), U64(0x3f15152a3f15152a),
155  U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
156  U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
157  U64(0x2818183028181830), U64(0xa1969637a1969637),
158  U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
159  U64(0x0907070e0907070e), U64(0x3612122436121224),
160  U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
161  U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
162  U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
163  U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
164  U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
165  U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
166  U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
167  U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
168  U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
169  U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
170  U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
171  U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
172  U64(0x0000000000000000), U64(0x2cededc12cededc1),
173  U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
174  U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
175  U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
176  U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
177  U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
178  U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
179  U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
180  U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
181  U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
182  U64(0x5533336655333366), U64(0x9485851194858511),
183  U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
184  U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
185  U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
186  U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
187  U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
188  U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
189  U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
190  U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
191  U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
192  U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
193  U64(0x3010102030101020), U64(0x1affffe51affffe5),
194  U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
195  U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
196  U64(0x3513132635131326), U64(0x2fececc32fececc3),
197  U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
198  U64(0xcc444488cc444488), U64(0x3917172e3917172e),
199  U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
200  U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
201  U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
202  U64(0x2b1919322b191932), U64(0x957373e6957373e6),
203  U64(0xa06060c0a06060c0), U64(0x9881811998818119),
204  U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
205  U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
206  U64(0xab90903bab90903b), U64(0x8388880b8388880b),
207  U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
208  U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
209  U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
210  U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
211  U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
212  U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
213  U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
214  U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
215  U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
216  U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
217  U64(0xa8919139a8919139), U64(0xa4959531a4959531),
218  U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
219  U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
220  U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
221  U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
222  U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
223  U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
224  U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
225  U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
226  U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
227  U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
228  U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
229  U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
230  U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
231  U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
232  U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
233  U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
234  U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
235  U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
236  U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
237  U64(0xd8484890d8484890), U64(0x0503030605030306),
238  U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
239  U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
240  U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
241  U64(0x9186861791868617), U64(0x58c1c19958c1c199),
242  U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
243  U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
244  U64(0xb398982bb398982b), U64(0x3311112233111122),
245  U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
246  U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
247  U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
248  U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
249  U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
250  U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
251  U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
252  U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
253  U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
254  U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
255  U64(0xc3414182c3414182), U64(0xb0999929b0999929),
256  U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
257  U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
258  U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
259 };
260 
/*
 * Te4[x] = S[x]: the forward S-box as 256 single bytes.  Used by the key
 * schedule and by the rounds built with the AES_COMPACT_* switches.
 */
261 static const u8 Te4[256] = {
262  0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
263  0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
264  0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
265  0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
266  0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
267  0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
268  0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
269  0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
270  0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
271  0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
272  0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
273  0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
274  0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
275  0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
276  0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
277  0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
278  0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
279  0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
280  0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
281  0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
282  0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
283  0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
284  0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
285  0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
286  0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
287  0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
288  0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
289  0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
290  0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
291  0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
292  0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
293  0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
294 };
295 
/*
 * Decryption lookup table, 256 entries of 8 bytes (2KB).  Entry x holds the
 * column Si[x].[0e, 09, 0d, 0b] twice in one 64-bit word; the Td0..Td3
 * macros read it at byte offsets 0..3 to obtain the four rotations.
 */
296 static const u64 Td[256] = {
297  U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
298  U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
299  U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
300  U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
301  U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
302  U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
303  U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
304  U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
305  U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
306  U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
307  U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
308  U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
309  U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
310  U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
311  U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
312  U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
313  U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
314  U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
315  U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
316  U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
317  U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
318  U64(0x6033519760335197), U64(0x457f5362457f5362),
319  U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
320  U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
321  U64(0x5868487058684870), U64(0x19fd458f19fd458f),
322  U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
323  U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
324  U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
325  U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
326  U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
327  U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
328  U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
329  U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
330  U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
331  U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
332  U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
333  U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
334  U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
335  U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
336  U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
337  U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
338  U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
339  U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
340  U64(0x6fd406046fd40604), U64(0xff155060ff155060),
341  U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
342  U64(0xcc434089cc434089), U64(0x779ed967779ed967),
343  U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
344  U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
345  U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
346  U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
347  U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
348  U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
349  U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
350  U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
351  U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
352  U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
353  U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
354  U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
355  U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
356  U64(0x694b775a694b775a), U64(0x161a121c161a121c),
357  U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
358  U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
359  U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
360  U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
361  U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
362  U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
363  U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
364  U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
365  U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
366  U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
367  U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
368  U64(0x4022971340229713), U64(0x2011c6842011c684),
369  U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
370  U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
371  U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
372  U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
373  U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
374  U64(0xfa489411fa489411), U64(0x2264e9472264e947),
375  U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
376  U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
377  U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
378  U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
379  U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
380  U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
381  U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
382  U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
383  U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
384  U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
385  U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
386  U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
387  U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
388  U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
389  U64(0x097826cd097826cd), U64(0xf418596ef418596e),
390  U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
391  U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
392  U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
393  U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
394  U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
395  U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
396  U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
397  U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
398  U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
399  U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
400  U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
401  U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
402  U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
403  U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
404  U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
405  U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
406  U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
407  U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
408  U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
409  U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
410  U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
411  U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
412  U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
413  U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
414  U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
415  U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
416  U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
417  U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
418  U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
419  U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
420  U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
421  U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
422  U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
423  U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
424  U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
425 };
/*
 * Td4[x] = Si[x]: the inverse S-box as 256 single bytes.  Used by the
 * decryption rounds built with the AES_COMPACT_* switches.
 */
426 static const u8 Td4[256] = {
427  0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
428  0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
429  0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
430  0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
431  0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
432  0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
433  0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
434  0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
435  0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
436  0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
437  0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
438  0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
439  0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
440  0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
441  0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
442  0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
443  0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
444  0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
445  0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
446  0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
447  0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
448  0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
449  0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
450  0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
451  0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
452  0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
453  0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
454  0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
455  0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
456  0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
457  0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
458  0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
459 };
460 
461 static const u32 rcon[] = {
462  0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
463  0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
464  0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
465 };
466 
470 int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
471  AES_KEY *key) {
472 
473  u32 *rk;
474  int i = 0;
475  u32 temp;
476 
477  if (!userKey || !key)
478  return -1;
479  if (bits != 128 && bits != 192 && bits != 256)
480  return -2;
481 
482  rk = key->rd_key;
483 
484  if (bits==128)
485  key->rounds = 10;
486  else if (bits==192)
487  key->rounds = 12;
488  else
489  key->rounds = 14;
490 
491  rk[0] = GETU32(userKey );
492  rk[1] = GETU32(userKey + 4);
493  rk[2] = GETU32(userKey + 8);
494  rk[3] = GETU32(userKey + 12);
495  if (bits == 128) {
496  while (1) {
497  temp = rk[3];
498  rk[4] = rk[0] ^
499  (Te4[(temp >> 8) & 0xff] ) ^
500  (Te4[(temp >> 16) & 0xff] << 8) ^
501  (Te4[(temp >> 24) ] << 16) ^
502  (Te4[(temp ) & 0xff] << 24) ^
503  rcon[i];
504  rk[5] = rk[1] ^ rk[4];
505  rk[6] = rk[2] ^ rk[5];
506  rk[7] = rk[3] ^ rk[6];
507  if (++i == 10) {
508  return 0;
509  }
510  rk += 4;
511  }
512  }
513  rk[4] = GETU32(userKey + 16);
514  rk[5] = GETU32(userKey + 20);
515  if (bits == 192) {
516  while (1) {
517  temp = rk[ 5];
518  rk[ 6] = rk[ 0] ^
519  (Te4[(temp >> 8) & 0xff] ) ^
520  (Te4[(temp >> 16) & 0xff] << 8) ^
521  (Te4[(temp >> 24) ] << 16) ^
522  (Te4[(temp ) & 0xff] << 24) ^
523  rcon[i];
524  rk[ 7] = rk[ 1] ^ rk[ 6];
525  rk[ 8] = rk[ 2] ^ rk[ 7];
526  rk[ 9] = rk[ 3] ^ rk[ 8];
527  if (++i == 8) {
528  return 0;
529  }
530  rk[10] = rk[ 4] ^ rk[ 9];
531  rk[11] = rk[ 5] ^ rk[10];
532  rk += 6;
533  }
534  }
535  rk[6] = GETU32(userKey + 24);
536  rk[7] = GETU32(userKey + 28);
537  if (bits == 256) {
538  while (1) {
539  temp = rk[ 7];
540  rk[ 8] = rk[ 0] ^
541  (Te4[(temp >> 8) & 0xff] ) ^
542  (Te4[(temp >> 16) & 0xff] << 8) ^
543  (Te4[(temp >> 24) ] << 16) ^
544  (Te4[(temp ) & 0xff] << 24) ^
545  rcon[i];
546  rk[ 9] = rk[ 1] ^ rk[ 8];
547  rk[10] = rk[ 2] ^ rk[ 9];
548  rk[11] = rk[ 3] ^ rk[10];
549  if (++i == 7) {
550  return 0;
551  }
552  temp = rk[11];
553  rk[12] = rk[ 4] ^
554  (Te4[(temp ) & 0xff] ) ^
555  (Te4[(temp >> 8) & 0xff] << 8) ^
556  (Te4[(temp >> 16) & 0xff] << 16) ^
557  (Te4[(temp >> 24) ] << 24);
558  rk[13] = rk[ 5] ^ rk[12];
559  rk[14] = rk[ 6] ^ rk[13];
560  rk[15] = rk[ 7] ^ rk[14];
561 
562  rk += 8;
563  }
564  }
565  return 0;
566 }
567 
571 int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
572  AES_KEY *key) {
573 
574  u32 *rk;
575  int i, j, status;
576  u32 temp;
577 
578  /* first, start with an encryption schedule */
579  status = AES_set_encrypt_key(userKey, bits, key);
580  if (status < 0)
581  return status;
582 
583  rk = key->rd_key;
584 
585  /* invert the order of the round keys: */
586  for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
587  temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
588  temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
589  temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
590  temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
591  }
592  /* apply the inverse MixColumn transform to all round keys but the first and the last: */
593  for (i = 1; i < (key->rounds); i++) {
594  rk += 4;
595 #if 1
596  for (j = 0; j < 4; j++) {
597  u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
598 
599  tp1 = rk[j];
600  m = tp1 & 0x80808080;
601  tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
602  ((m - (m >> 7)) & 0x1b1b1b1b);
603  m = tp2 & 0x80808080;
604  tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
605  ((m - (m >> 7)) & 0x1b1b1b1b);
606  m = tp4 & 0x80808080;
607  tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
608  ((m - (m >> 7)) & 0x1b1b1b1b);
609  tp9 = tp8 ^ tp1;
610  tpb = tp9 ^ tp2;
611  tpd = tp9 ^ tp4;
612  tpe = tp8 ^ tp4 ^ tp2;
613 #if defined(ROTATE)
614  rk[j] = tpe ^ ROTATE(tpd,16) ^
615  ROTATE(tp9,8) ^ ROTATE(tpb,24);
616 #else
617  rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
618  (tp9 >> 24) ^ (tp9 << 8) ^
619  (tpb >> 8) ^ (tpb << 24);
620 #endif
621  }
622 #else
623  rk[0] =
624  Td0[Te2[(rk[0] ) & 0xff] & 0xff] ^
625  Td1[Te2[(rk[0] >> 8) & 0xff] & 0xff] ^
626  Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
627  Td3[Te2[(rk[0] >> 24) ] & 0xff];
628  rk[1] =
629  Td0[Te2[(rk[1] ) & 0xff] & 0xff] ^
630  Td1[Te2[(rk[1] >> 8) & 0xff] & 0xff] ^
631  Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
632  Td3[Te2[(rk[1] >> 24) ] & 0xff];
633  rk[2] =
634  Td0[Te2[(rk[2] ) & 0xff] & 0xff] ^
635  Td1[Te2[(rk[2] >> 8) & 0xff] & 0xff] ^
636  Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
637  Td3[Te2[(rk[2] >> 24) ] & 0xff];
638  rk[3] =
639  Td0[Te2[(rk[3] ) & 0xff] & 0xff] ^
640  Td1[Te2[(rk[3] >> 8) & 0xff] & 0xff] ^
641  Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
642  Td3[Te2[(rk[3] >> 24) ] & 0xff];
643 #endif
644  }
645  return 0;
646 }
647 
648 /*
649  * Encrypt a single block
650  * in and out can overlap
651  */
652 void AES_encrypt(const unsigned char *in, unsigned char *out,
653  const AES_KEY *key) {
654 
655  const u32 *rk;
656  u32 s0, s1, s2, s3, t[4];
657  int r;
658 
659  assert(in && out && key);
660  rk = key->rd_key;
661 
662  /*
663  * map byte array block to cipher state
664  * and add initial round key:
665  */
666  s0 = GETU32(in ) ^ rk[0];
667  s1 = GETU32(in + 4) ^ rk[1];
668  s2 = GETU32(in + 8) ^ rk[2];
669  s3 = GETU32(in + 12) ^ rk[3];
670 
671 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
672  prefetch256(Te4);
673 
674  t[0] = Te4[(s0 ) & 0xff] ^
675  Te4[(s1 >> 8) & 0xff] << 8 ^
676  Te4[(s2 >> 16) & 0xff] << 16 ^
677  Te4[(s3 >> 24) ] << 24;
678  t[1] = Te4[(s1 ) & 0xff] ^
679  Te4[(s2 >> 8) & 0xff] << 8 ^
680  Te4[(s3 >> 16) & 0xff] << 16 ^
681  Te4[(s0 >> 24) ] << 24;
682  t[2] = Te4[(s2 ) & 0xff] ^
683  Te4[(s3 >> 8) & 0xff] << 8 ^
684  Te4[(s0 >> 16) & 0xff] << 16 ^
685  Te4[(s1 >> 24) ] << 24;
686  t[3] = Te4[(s3 ) & 0xff] ^
687  Te4[(s0 >> 8) & 0xff] << 8 ^
688  Te4[(s1 >> 16) & 0xff] << 16 ^
689  Te4[(s2 >> 24) ] << 24;
690 
691  /* now do the linear transform using words */
692  { int i;
693  u32 r0, r1, r2;
694 
695  for (i = 0; i < 4; i++) {
696  r0 = t[i];
697  r1 = r0 & 0x80808080;
698  r2 = ((r0 & 0x7f7f7f7f) << 1) ^
699  ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
700 #if defined(ROTATE)
701  t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
702  ROTATE(r0,16) ^ ROTATE(r0,8);
703 #else
704  t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
705  (r0 << 16) ^ (r0 >> 16) ^
706  (r0 << 8) ^ (r0 >> 24);
707 #endif
708  t[i] ^= rk[4+i];
709  }
710  }
711 #else
712  t[0] = Te0[(s0 ) & 0xff] ^
713  Te1[(s1 >> 8) & 0xff] ^
714  Te2[(s2 >> 16) & 0xff] ^
715  Te3[(s3 >> 24) ] ^
716  rk[4];
717  t[1] = Te0[(s1 ) & 0xff] ^
718  Te1[(s2 >> 8) & 0xff] ^
719  Te2[(s3 >> 16) & 0xff] ^
720  Te3[(s0 >> 24) ] ^
721  rk[5];
722  t[2] = Te0[(s2 ) & 0xff] ^
723  Te1[(s3 >> 8) & 0xff] ^
724  Te2[(s0 >> 16) & 0xff] ^
725  Te3[(s1 >> 24) ] ^
726  rk[6];
727  t[3] = Te0[(s3 ) & 0xff] ^
728  Te1[(s0 >> 8) & 0xff] ^
729  Te2[(s1 >> 16) & 0xff] ^
730  Te3[(s2 >> 24) ] ^
731  rk[7];
732 #endif
733  s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
734 
735  /*
736  * Nr - 2 full rounds:
737  */
738  for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
739 #if defined(AES_COMPACT_IN_INNER_ROUNDS)
740  t[0] = Te4[(s0 ) & 0xff] ^
741  Te4[(s1 >> 8) & 0xff] << 8 ^
742  Te4[(s2 >> 16) & 0xff] << 16 ^
743  Te4[(s3 >> 24) ] << 24;
744  t[1] = Te4[(s1 ) & 0xff] ^
745  Te4[(s2 >> 8) & 0xff] << 8 ^
746  Te4[(s3 >> 16) & 0xff] << 16 ^
747  Te4[(s0 >> 24) ] << 24;
748  t[2] = Te4[(s2 ) & 0xff] ^
749  Te4[(s3 >> 8) & 0xff] << 8 ^
750  Te4[(s0 >> 16) & 0xff] << 16 ^
751  Te4[(s1 >> 24) ] << 24;
752  t[3] = Te4[(s3 ) & 0xff] ^
753  Te4[(s0 >> 8) & 0xff] << 8 ^
754  Te4[(s1 >> 16) & 0xff] << 16 ^
755  Te4[(s2 >> 24) ] << 24;
756 
757  /* now do the linear transform using words */
758  { int i;
759  u32 r0, r1, r2;
760 
761  for (i = 0; i < 4; i++) {
762  r0 = t[i];
763  r1 = r0 & 0x80808080;
764  r2 = ((r0 & 0x7f7f7f7f) << 1) ^
765  ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
766 #if defined(ROTATE)
767  t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
768  ROTATE(r0,16) ^ ROTATE(r0,8);
769 #else
770  t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
771  (r0 << 16) ^ (r0 >> 16) ^
772  (r0 << 8) ^ (r0 >> 24);
773 #endif
774  t[i] ^= rk[i];
775  }
776  }
777 #else
778  t[0] = Te0[(s0 ) & 0xff] ^
779  Te1[(s1 >> 8) & 0xff] ^
780  Te2[(s2 >> 16) & 0xff] ^
781  Te3[(s3 >> 24) ] ^
782  rk[0];
783  t[1] = Te0[(s1 ) & 0xff] ^
784  Te1[(s2 >> 8) & 0xff] ^
785  Te2[(s3 >> 16) & 0xff] ^
786  Te3[(s0 >> 24) ] ^
787  rk[1];
788  t[2] = Te0[(s2 ) & 0xff] ^
789  Te1[(s3 >> 8) & 0xff] ^
790  Te2[(s0 >> 16) & 0xff] ^
791  Te3[(s1 >> 24) ] ^
792  rk[2];
793  t[3] = Te0[(s3 ) & 0xff] ^
794  Te1[(s0 >> 8) & 0xff] ^
795  Te2[(s1 >> 16) & 0xff] ^
796  Te3[(s2 >> 24) ] ^
797  rk[3];
798 #endif
799  s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
800  }
801  /*
802  * apply last round and
803  * map cipher state to byte array block:
804  */
805 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
806  prefetch256(Te4);
807 
808  *(u32*)(out+0) =
809  Te4[(s0 ) & 0xff] ^
810  Te4[(s1 >> 8) & 0xff] << 8 ^
811  Te4[(s2 >> 16) & 0xff] << 16 ^
812  Te4[(s3 >> 24) ] << 24 ^
813  rk[0];
814  *(u32*)(out+4) =
815  Te4[(s1 ) & 0xff] ^
816  Te4[(s2 >> 8) & 0xff] << 8 ^
817  Te4[(s3 >> 16) & 0xff] << 16 ^
818  Te4[(s0 >> 24) ] << 24 ^
819  rk[1];
820  *(u32*)(out+8) =
821  Te4[(s2 ) & 0xff] ^
822  Te4[(s3 >> 8) & 0xff] << 8 ^
823  Te4[(s0 >> 16) & 0xff] << 16 ^
824  Te4[(s1 >> 24) ] << 24 ^
825  rk[2];
826  *(u32*)(out+12) =
827  Te4[(s3 ) & 0xff] ^
828  Te4[(s0 >> 8) & 0xff] << 8 ^
829  Te4[(s1 >> 16) & 0xff] << 16 ^
830  Te4[(s2 >> 24) ] << 24 ^
831  rk[3];
832 #else
833  *(u32*)(out+0) =
834  (Te2[(s0 ) & 0xff] & 0x000000ffU) ^
835  (Te3[(s1 >> 8) & 0xff] & 0x0000ff00U) ^
836  (Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
837  (Te1[(s3 >> 24) ] & 0xff000000U) ^
838  rk[0];
839  *(u32*)(out+4) =
840  (Te2[(s1 ) & 0xff] & 0x000000ffU) ^
841  (Te3[(s2 >> 8) & 0xff] & 0x0000ff00U) ^
842  (Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
843  (Te1[(s0 >> 24) ] & 0xff000000U) ^
844  rk[1];
845  *(u32*)(out+8) =
846  (Te2[(s2 ) & 0xff] & 0x000000ffU) ^
847  (Te3[(s3 >> 8) & 0xff] & 0x0000ff00U) ^
848  (Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
849  (Te1[(s1 >> 24) ] & 0xff000000U) ^
850  rk[2];
851  *(u32*)(out+12) =
852  (Te2[(s3 ) & 0xff] & 0x000000ffU) ^
853  (Te3[(s0 >> 8) & 0xff] & 0x0000ff00U) ^
854  (Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
855  (Te1[(s2 >> 24) ] & 0xff000000U) ^
856  rk[3];
857 #endif
858 }
859 
860 /*
861  * Decrypt a single block
862  * in and out can overlap
863  */
864 void AES_decrypt(const unsigned char *in, unsigned char *out,
865  const AES_KEY *key) {
866 
867  const u32 *rk;
868  u32 s0, s1, s2, s3, t[4];
869  int r;
870 
871  assert(in && out && key);
872  rk = key->rd_key;
873 
874  /*
875  * map byte array block to cipher state
876  * and add initial round key:
877  */
878  s0 = GETU32(in ) ^ rk[0];
879  s1 = GETU32(in + 4) ^ rk[1];
880  s2 = GETU32(in + 8) ^ rk[2];
881  s3 = GETU32(in + 12) ^ rk[3];
882 
883 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
884  prefetch256(Td4);
885 
886  t[0] = Td4[(s0 ) & 0xff] ^
887  Td4[(s3 >> 8) & 0xff] << 8 ^
888  Td4[(s2 >> 16) & 0xff] << 16 ^
889  Td4[(s1 >> 24) ] << 24;
890  t[1] = Td4[(s1 ) & 0xff] ^
891  Td4[(s0 >> 8) & 0xff] << 8 ^
892  Td4[(s3 >> 16) & 0xff] << 16 ^
893  Td4[(s2 >> 24) ] << 24;
894  t[2] = Td4[(s2 ) & 0xff] ^
895  Td4[(s1 >> 8) & 0xff] << 8 ^
896  Td4[(s0 >> 16) & 0xff] << 16 ^
897  Td4[(s3 >> 24) ] << 24;
898  t[3] = Td4[(s3 ) & 0xff] ^
899  Td4[(s2 >> 8) & 0xff] << 8 ^
900  Td4[(s1 >> 16) & 0xff] << 16 ^
901  Td4[(s0 >> 24) ] << 24;
902 
903  /* now do the linear transform using words */
904  { int i;
905  u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
906 
907  for (i = 0; i < 4; i++) {
908  tp1 = t[i];
909  m = tp1 & 0x80808080;
910  tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
911  ((m - (m >> 7)) & 0x1b1b1b1b);
912  m = tp2 & 0x80808080;
913  tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
914  ((m - (m >> 7)) & 0x1b1b1b1b);
915  m = tp4 & 0x80808080;
916  tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
917  ((m - (m >> 7)) & 0x1b1b1b1b);
918  tp9 = tp8 ^ tp1;
919  tpb = tp9 ^ tp2;
920  tpd = tp9 ^ tp4;
921  tpe = tp8 ^ tp4 ^ tp2;
922 #if defined(ROTATE)
923  t[i] = tpe ^ ROTATE(tpd,16) ^
924  ROTATE(tp9,8) ^ ROTATE(tpb,24);
925 #else
926  t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
927  (tp9 >> 24) ^ (tp9 << 8) ^
928  (tpb >> 8) ^ (tpb << 24);
929 #endif
930  t[i] ^= rk[4+i];
931  }
932  }
933 #else
934  t[0] = Td0[(s0 ) & 0xff] ^
935  Td1[(s3 >> 8) & 0xff] ^
936  Td2[(s2 >> 16) & 0xff] ^
937  Td3[(s1 >> 24) ] ^
938  rk[4];
939  t[1] = Td0[(s1 ) & 0xff] ^
940  Td1[(s0 >> 8) & 0xff] ^
941  Td2[(s3 >> 16) & 0xff] ^
942  Td3[(s2 >> 24) ] ^
943  rk[5];
944  t[2] = Td0[(s2 ) & 0xff] ^
945  Td1[(s1 >> 8) & 0xff] ^
946  Td2[(s0 >> 16) & 0xff] ^
947  Td3[(s3 >> 24) ] ^
948  rk[6];
949  t[3] = Td0[(s3 ) & 0xff] ^
950  Td1[(s2 >> 8) & 0xff] ^
951  Td2[(s1 >> 16) & 0xff] ^
952  Td3[(s0 >> 24) ] ^
953  rk[7];
954 #endif
955  s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
956 
957  /*
958  * Nr - 2 full rounds:
959  */
960  for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
961 #if defined(AES_COMPACT_IN_INNER_ROUNDS)
962  t[0] = Td4[(s0 ) & 0xff] ^
963  Td4[(s3 >> 8) & 0xff] << 8 ^
964  Td4[(s2 >> 16) & 0xff] << 16 ^
965  Td4[(s1 >> 24) ] << 24;
966  t[1] = Td4[(s1 ) & 0xff] ^
967  Td4[(s0 >> 8) & 0xff] << 8 ^
968  Td4[(s3 >> 16) & 0xff] << 16 ^
969  Td4[(s2 >> 24) ] << 24;
970  t[2] = Td4[(s2 ) & 0xff] ^
971  Td4[(s1 >> 8) & 0xff] << 8 ^
972  Td4[(s0 >> 16) & 0xff] << 16 ^
973  Td4[(s3 >> 24) ] << 24;
974  t[3] = Td4[(s3 ) & 0xff] ^
975  Td4[(s2 >> 8) & 0xff] << 8 ^
976  Td4[(s1 >> 16) & 0xff] << 16 ^
977  Td4[(s0 >> 24) ] << 24;
978 
979  /* now do the linear transform using words */
980  { int i;
981  u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
982 
983  for (i = 0; i < 4; i++) {
984  tp1 = t[i];
985  m = tp1 & 0x80808080;
986  tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
987  ((m - (m >> 7)) & 0x1b1b1b1b);
988  m = tp2 & 0x80808080;
989  tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
990  ((m - (m >> 7)) & 0x1b1b1b1b);
991  m = tp4 & 0x80808080;
992  tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
993  ((m - (m >> 7)) & 0x1b1b1b1b);
994  tp9 = tp8 ^ tp1;
995  tpb = tp9 ^ tp2;
996  tpd = tp9 ^ tp4;
997  tpe = tp8 ^ tp4 ^ tp2;
998 #if defined(ROTATE)
999  t[i] = tpe ^ ROTATE(tpd,16) ^
1000  ROTATE(tp9,8) ^ ROTATE(tpb,24);
1001 #else
1002  t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
1003  (tp9 >> 24) ^ (tp9 << 8) ^
1004  (tpb >> 8) ^ (tpb << 24);
1005 #endif
1006  t[i] ^= rk[i];
1007  }
1008  }
1009 #else
1010  t[0] = Td0[(s0 ) & 0xff] ^
1011  Td1[(s3 >> 8) & 0xff] ^
1012  Td2[(s2 >> 16) & 0xff] ^
1013  Td3[(s1 >> 24) ] ^
1014  rk[0];
1015  t[1] = Td0[(s1 ) & 0xff] ^
1016  Td1[(s0 >> 8) & 0xff] ^
1017  Td2[(s3 >> 16) & 0xff] ^
1018  Td3[(s2 >> 24) ] ^
1019  rk[1];
1020  t[2] = Td0[(s2 ) & 0xff] ^
1021  Td1[(s1 >> 8) & 0xff] ^
1022  Td2[(s0 >> 16) & 0xff] ^
1023  Td3[(s3 >> 24) ] ^
1024  rk[2];
1025  t[3] = Td0[(s3 ) & 0xff] ^
1026  Td1[(s2 >> 8) & 0xff] ^
1027  Td2[(s1 >> 16) & 0xff] ^
1028  Td3[(s0 >> 24) ] ^
1029  rk[3];
1030 #endif
1031  s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
1032  }
1033  /*
1034  * apply last round and
1035  * map cipher state to byte array block:
1036  */
1037  prefetch256(Td4);
1038 
1039  *(u32*)(out+0) =
1040  (Td4[(s0 ) & 0xff]) ^
1041  (Td4[(s3 >> 8) & 0xff] << 8) ^
1042  (Td4[(s2 >> 16) & 0xff] << 16) ^
1043  (Td4[(s1 >> 24) ] << 24) ^
1044  rk[0];
1045  *(u32*)(out+4) =
1046  (Td4[(s1 ) & 0xff]) ^
1047  (Td4[(s0 >> 8) & 0xff] << 8) ^
1048  (Td4[(s3 >> 16) & 0xff] << 16) ^
1049  (Td4[(s2 >> 24) ] << 24) ^
1050  rk[1];
1051  *(u32*)(out+8) =
1052  (Td4[(s2 ) & 0xff]) ^
1053  (Td4[(s1 >> 8) & 0xff] << 8) ^
1054  (Td4[(s0 >> 16) & 0xff] << 16) ^
1055  (Td4[(s3 >> 24) ] << 24) ^
1056  rk[2];
1057  *(u32*)(out+12) =
1058  (Td4[(s3 ) & 0xff]) ^
1059  (Td4[(s2 >> 8) & 0xff] << 8) ^
1060  (Td4[(s1 >> 16) & 0xff] << 16) ^
1061  (Td4[(s0 >> 24) ] << 24) ^
1062  rk[3];
1063 }