// rijndael.cpp - Rijndael (AES) block cipher: table-driven C++ code with
// SSE2 assembly and AES-NI fast paths, plus cache-timing-attack countermeasures.
#include "pch.h"

#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#include "rijndael.h"
#include "misc.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
#else
static word64 Te[256];
#endif
static word64 Td[256];
#else
static word32 Te[256*4], Td[256*4];
#endif
static volatile bool s_TeFilled = false, s_TdFilled = false;

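// One column of an AES round as four table lookups: each byte of t selects a
// table entry that is XORed into one of the output words a..d.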
#define QUARTER_ROUND(L, T, t, a, b, c, d) \
    a ^= L(T, 3, byte(t)); t >>= 8;\
    b ^= L(T, 2, byte(t)); t >>= 8;\
    c ^= L(T, 1, byte(t)); t >>= 8;\
    d ^= L(T, 0, t);

#define QUARTER_ROUND_LE(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[d] = ((byte *)(Te+t))[1];

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    #define QUARTER_ROUND_LD(t, a, b, c, d) \
        tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
    #define QUARTER_ROUND_LD(t, a, b, c, d) \
        tempBlock[a] = Sd[byte(t)]; t >>= 8;\
        tempBlock[b] = Sd[byte(t)]; t >>= 8;\
        tempBlock[c] = Sd[byte(t)]; t >>= 8;\
        tempBlock[d] = Sd[t];
#endif

#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

#ifdef IS_LITTLE_ENDIAN
    #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
    #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
    #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
        #define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
    #else
        #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
        #define TL_M(T, i, x) T[i*256 + x]
    #endif
#else
    #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
    #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
    #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
        #define TL_M TL_F
    #else
        #define TL_F(T, i, x) rotrFixed(T[x], i*8)
        #define TL_M(T, i, x) T[i*256 + x]
    #endif
#endif

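// Multiplication by 2, 4 and 8 in GF(2^8) modulo the AES polynomial 0x11b
// (x^8 + x^4 + x^3 + x + 1); f3..fe combine them into the MixColumns and
// InvMixColumns coefficients.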
#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))

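// Fill the encryption table from the S-box Se: each entry packs the SubBytes
// output combined with the MixColumns coefficients 1, 2 and 3.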
void Rijndael::Base::FillEncTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Se[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        Te[i] = word64(y | f3(x))<<32 | y;
#else
        word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        for (int j=0; j<4; j++)
        {
            Te[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    Te[256] = Te[257] = 0;
#endif
    s_TeFilled = true;
}

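// Fill the decryption table from the inverse S-box Sd, folding in the
// InvMixColumns coefficients 9, 11, 13 and 14.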
void Rijndael::Base::FillDecTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Sd[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        Td[i] = word64(y | fb(x))<<32 | y | x;
#else
        word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        for (int j=0; j<4; j++)
        {
            Td[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
    s_TdFilled = true;
}

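// Key schedule: expand the user key into 4*(m_rounds+1) subkey words.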
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
    AssertValidKeyLength(keylen);

    m_rounds = keylen/4 + 6;
    m_key.New(4*(m_rounds+1));

    word32 *rk = m_key;

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)

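    // AES-NI path: expand the key with AESKEYGENASSIST; for decryption,
    // reverse the subkey order and run AESIMC over the inner round keys.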
    if (HasAESNI())
    {
        static const word32 rcLE[] = {
            0x01, 0x02, 0x04, 0x08,
            0x10, 0x20, 0x40, 0x80,
            0x1B, 0x36,
        };
        const word32 *rc = rcLE;

        __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
        memcpy(rk, userKey, keylen);

        while (true)
        {
            rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
            rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
            rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
            rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

            if (rk + keylen/4 + 4 == m_key.end())
                break;

            if (keylen == 24)
            {
                rk[10] = rk[ 4] ^ rk[ 9];
                rk[11] = rk[ 5] ^ rk[10];
                temp = _mm_insert_epi32(temp, rk[11], 3);
            }
            else if (keylen == 32)
            {
                temp = _mm_insert_epi32(temp, rk[11], 3);
                rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
                rk[13] = rk[ 5] ^ rk[12];
                rk[14] = rk[ 6] ^ rk[13];
                rk[15] = rk[ 7] ^ rk[14];
                temp = _mm_insert_epi32(temp, rk[15], 3);
            }
            else
                temp = _mm_insert_epi32(temp, rk[7], 3);

            rk += keylen/4;
        }

        if (!IsForwardTransformation())
        {
            rk = m_key;
            unsigned int i, j;

            std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));

            for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
            {
                temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
                *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
                *(__m128i *)(rk+j) = temp;
            }

            *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
        }

        return;
    }
#endif

00272
00273 GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
00274 const word32 *rc = rcon;
00275 word32 temp;
00276
00277 while (true)
00278 {
00279 temp = rk[keylen/4-1];
00280 word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
00281 rk[keylen/4] = rk[0] ^ x ^ *(rc++);
00282 rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
00283 rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
00284 rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
00285
00286 if (rk + keylen/4 + 4 == m_key.end())
00287 break;
00288
00289 if (keylen == 24)
00290 {
00291 rk[10] = rk[ 4] ^ rk[ 9];
00292 rk[11] = rk[ 5] ^ rk[10];
00293 }
00294 else if (keylen == 32)
00295 {
00296 temp = rk[11];
00297 rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
00298 rk[13] = rk[ 5] ^ rk[12];
00299 rk[14] = rk[ 6] ^ rk[13];
00300 rk[15] = rk[ 7] ^ rk[14];
00301 }
00302 rk += keylen/4;
00303 }
00304
00305 rk = m_key;
00306
00307 if (IsForwardTransformation())
00308 {
00309 if (!s_TeFilled)
00310 FillEncTable();
00311
00312 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
00313 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
00314 }
00315 else
00316 {
00317 if (!s_TdFilled)
00318 FillDecTable();
00319
00320 unsigned int i, j;
00321
00322 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
00323
00324 for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
00325 {
00326 temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
00327 temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
00328 temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
00329 temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
00330 }
00331
00332 rk[i+0] = InverseMixColumn(rk[i+0]);
00333 rk[i+1] = InverseMixColumn(rk[i+1]);
00334 rk[i+2] = InverseMixColumn(rk[i+2]);
00335 rk[i+3] = InverseMixColumn(rk[i+3]);
00336
00337 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
00338 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
00339 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
00340 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
00341 }
00342
00343 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
00344 if (HasAESNI())
00345 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
00346 #endif
00347 }
00348
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasSSE2())
    {
        Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

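    // timing-attack countermeasure: touch every cache line of Te so that
    // subsequent data-dependent lookups all hit the cache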
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Te)+i);
    u &= Te[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
    QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
    QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
    QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

    // m_rounds - 2 full rounds, two per iteration
    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_E(t3, s0, s1, s2, s3)
        QUARTER_ROUND_E(t2, s3, s0, s1, s2)
        QUARTER_ROUND_E(t1, s2, s3, s0, s1)
        QUARTER_ROUND_E(t0, s1, s2, s3, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_E(s3, t0, t1, t2, t3)
        QUARTER_ROUND_E(s2, t3, t0, t1, t2)
        QUARTER_ROUND_E(s1, t2, t3, t0, t1)
        QUARTER_ROUND_E(s0, t1, t2, t3, t0)

        rk += 8;
    } while (--r);

    // last round
    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
    QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
    QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
    QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
    {
        Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

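    // timing-attack countermeasure: touch every cache line of Td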
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Td)+i);
    u &= Td[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
    QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
    QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
    QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

    // m_rounds - 2 full rounds, two per iteration
    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_D(t3, s2, s1, s0, s3)
        QUARTER_ROUND_D(t2, s1, s0, s3, s2)
        QUARTER_ROUND_D(t1, s0, s3, s2, s1)
        QUARTER_ROUND_D(t0, s3, s2, s1, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_D(s3, t2, t1, t0, t3)
        QUARTER_ROUND_D(s2, t1, t0, t3, t2)
        QUARTER_ROUND_D(s1, t0, t3, t2, t1)
        QUARTER_ROUND_D(s0, t3, t2, t1, t0)

        rk += 8;
    } while (--r);

#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    // timing-attack countermeasure: preload the inverse S-box Sd, which the
    // last round reads directly; when CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is
    // defined, QUARTER_ROUND_LD uses Td instead, which is already in the cache
    u = 0;
    for (i=0; i<256; i+=cacheLineSize)
        u &= *(const word32 *)(Sd+i);
    u &= *(const word32 *)(Sd+252);
    t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

    // last round
    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
    QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
    QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
    QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}


// ************************* Assembly Code ************************************

#ifdef _MSC_VER
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
#endif

#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE

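// Assembly implementation of the encryption workhorse. The caller passes a
// 256-byte-aligned scratch area ("locals") holding stack copies of the
// subkeys and the block pointers; see the Locals struct in
// Rijndael::Enc::AdvancedProcessBlocks below for its layout.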
CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
{
#if CRYPTOPP_BOOL_X86

#define L_REG           esp
#define L_INDEX(i)      (L_REG+512+i)
#define L_INXORBLOCKS   L_INBLOCKS+4
#define L_OUTXORBLOCKS  L_INBLOCKS+8
#define L_OUTBLOCKS     L_INBLOCKS+12
#define L_INCREMENTS    L_INDEX(16*15)
#define L_SP            L_INDEX(16*16)
#define L_LENGTH        L_INDEX(16*16+4)
#define L_KEYS_BEGIN    L_INDEX(16*16+8)

#define MOVD            movd
#define MM(i)           mm##i

#define MXOR(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    movd    mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
    AS2(    pxor    MM(a), mm7)\

#define MMOV(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    movd    MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

#define L_REG           r8
#define L_INDEX(i)      (L_REG+i)
#define L_INXORBLOCKS   L_INBLOCKS+8
#define L_OUTXORBLOCKS  L_INBLOCKS+16
#define L_OUTBLOCKS     L_INBLOCKS+24
#define L_INCREMENTS    L_INDEX(16*16)
#define L_LENGTH        L_INDEX(16*18+8)
#define L_KEYS_BEGIN    L_INDEX(16*19)

#define MOVD            mov
#define MM_0            r9d
#define MM_1            r12d
#ifdef __GNUC__
#define MM_2            r11d
#else
#define MM_2            r10d
#endif
#define MM(i)           MM_##i

#define MXOR(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    xor     MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    mov     MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#endif

#define L_SUBKEYS       L_INDEX(0)
#define L_SAVED_X       L_SUBKEYS
#define L_KEY12         L_INDEX(16*12)
#define L_LASTROUND     L_INDEX(16*13)
#define L_INBLOCKS      L_INDEX(16*14)
#define MAP0TO4(i)      (ASM_MOD(i+3,4)+1)

#define XOR(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    xor     a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MOV(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    mov     a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#ifdef CRYPTOPP_GENERATE_X64_MASM
    ALIGN 8
    Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
    rex_push_reg rsi
    push_reg rdi
    push_reg rbx
    push_reg r12
    .endprolog
    mov L_REG, rcx
    mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
    mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
    __asm__ __volatile__
    (
    ".intel_syntax noprefix;"
    #if CRYPTOPP_BOOL_X64
    AS2(    mov     L_REG, rcx)
    #endif
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2(    mov     AS_REG_7, WORD_REG(si))
#else
    AS_PUSH_IF86(si)
    AS_PUSH_IF86(di)
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2(    lea     AS_REG_7, [Te])
    AS2(    mov     edi, [g_cacheLineSize])
#endif

#if CRYPTOPP_BOOL_X86
    AS2(    mov     [ecx+16*12+16*4], esp)  // save esp to L_SP
    AS2(    lea     esp, [ecx-512])
#endif

    // copy subkeys to the stack workspace
    AS2(    mov     WORD_REG(si), [L_KEYS_BEGIN])
    AS2(    mov     WORD_REG(ax), 16)
    AS2(    and     WORD_REG(ax), WORD_REG(si))
    AS2(    movdqa  xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])  // subkey 1 (or 2 in counter mode)
    AS2(    movdqa  [L_KEY12], xmm3)
    AS2(    lea     WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
    AS2(    sub     WORD_REG(ax), WORD_REG(si))
    ASL(0)
    AS2(    movdqa  xmm0, [WORD_REG(ax)+WORD_REG(si)])
    AS2(    movdqa  XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
    AS2(    add     WORD_REG(si), 16)
    AS2(    cmp     WORD_REG(si), 16*12)
    ASJ(    jl,     0, b)

    AS2(    movdqa  xmm4, [WORD_REG(ax)+WORD_REG(si)])  // last subkey
    AS2(    movdqa  xmm1, [WORD_REG(dx)])               // subkey 0
    AS2(    MOVD    MM(1), [WORD_REG(dx)+4*4])          // subkey 1
    AS2(    mov     ebx, [WORD_REG(dx)+5*4])
    AS2(    mov     ecx, [WORD_REG(dx)+6*4])
    AS2(    mov     edx, [WORD_REG(dx)+7*4])

    // load table into cache (timing-attack countermeasure)
    AS2(    xor     WORD_REG(ax), WORD_REG(ax))
    ASL(9)
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    cmp     WORD_REG(ax), 2048)
    ASJ(    jl,     9, b)
    AS1(    lfence)

    AS2(    test    DWORD PTR [L_LENGTH], 1)
    ASJ(    jz,     8, f)

    // counter mode one-time setup
    AS2(    mov     WORD_REG(si), [L_INBLOCKS])
    AS2(    movdqu  xmm2, [WORD_REG(si)])
    AS2(    pxor    xmm2, xmm1)
    AS2(    psrldq  xmm1, 14)
    AS2(    movd    eax, xmm1)
    AS2(    mov     al, BYTE PTR [WORD_REG(si)+15])
    AS2(    MOVD    MM(2), eax)
#if CRYPTOPP_BOOL_X86
    AS2(    mov     eax, 1)
    AS2(    movd    mm3, eax)
#endif

    // partial first round, in: xmm2, out: mm1, ebx, ecx, edx
    AS2(    movd    eax, xmm2)
    AS2(    psrldq  xmm2, 4)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    MXOR(   1, al, 0)
    XOR(    edx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ecx, al, 2)
    XOR(    ebx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    XOR(    ebx, al, 0)
    MXOR(   1, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    edx, al, 2)
    XOR(    ecx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    XOR(    ecx, al, 0)
    XOR(    ebx, ah, 1)
    AS2(    shr     eax, 16)
    MXOR(   1, al, 2)
    XOR(    edx, ah, 3)
    AS2(    mov     eax, edi)
    XOR(    edx, al, 0)
    XOR(    ecx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ebx, al, 2)
    AS2(    psrldq  xmm2, 3)

    // partial second round, out: eax, ebx, edi, mm0
    AS2(    mov     eax, [L_KEY12+0*4])
    AS2(    mov     edi, [L_KEY12+2*4])
    AS2(    MOVD    MM(0), [L_KEY12+3*4])
    MXOR(   0, cl, 3)
    XOR(    edi, bl, 3)
    MXOR(   0, bh, 2)
    AS2(    shr     ebx, 16)
    XOR(    eax, bl, 1)
    MOV(    ebx, bh, 0)
    AS2(    xor     ebx, [L_KEY12+1*4])
    XOR(    eax, ch, 2)
    AS2(    shr     ecx, 16)
    XOR(    eax, dl, 3)
    XOR(    ebx, dh, 2)
    AS2(    shr     edx, 16)
    XOR(    edi, ch, 0)
    XOR(    ebx, cl, 1)
    XOR(    edi, dl, 1)
    MXOR(   0, dh, 0)

    AS2(    movd    ecx, xmm2)
    AS2(    MOVD    edx, MM(1))
    AS2(    MOVD    [L_SAVED_X+3*4], MM(0))
    AS2(    mov     [L_SAVED_X+0*4], eax)
    AS2(    mov     [L_SAVED_X+1*4], ebx)
    AS2(    mov     [L_SAVED_X+2*4], edi)
    ASJ(    jmp,    5, f)

    ASL(3)
    // non-counter mode per-block setup
    AS2(    MOVD    MM(1), [L_KEY12+0*4])
    AS2(    mov     ebx, [L_KEY12+1*4])
    AS2(    mov     ecx, [L_KEY12+2*4])
    AS2(    mov     edx, [L_KEY12+3*4])
    ASL(8)
    AS2(    mov     WORD_REG(ax), [L_INBLOCKS])
    AS2(    movdqu  xmm2, [WORD_REG(ax)])
    AS2(    mov     WORD_REG(si), [L_INXORBLOCKS])
    AS2(    movdqu  xmm5, [WORD_REG(si)])
    AS2(    pxor    xmm2, xmm1)
    AS2(    pxor    xmm2, xmm5)

    // first round, in: xmm2, out: eax, ebx, ecx, edx
    AS2(    movd    eax, xmm2)
    AS2(    psrldq  xmm2, 4)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    MXOR(   1, al, 0)
    XOR(    edx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ecx, al, 2)
    XOR(    ebx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    XOR(    ebx, al, 0)
    MXOR(   1, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    edx, al, 2)
    XOR(    ecx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    XOR(    ecx, al, 0)
    XOR(    ebx, ah, 1)
    AS2(    shr     eax, 16)
    MXOR(   1, al, 2)
    XOR(    edx, ah, 3)
    AS2(    mov     eax, edi)
    XOR(    edx, al, 0)
    XOR(    ecx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ebx, al, 2)
    MXOR(   1, ah, 3)
    AS2(    MOVD    eax, MM(1))

    AS2(    add     L_REG, [L_KEYS_BEGIN])
    AS2(    add     L_REG, 4*16)
    ASJ(    jmp,    2, f)

    ASL(1)
    // counter-mode per-block setup
    AS2(    MOVD    ecx, MM(2))
    AS2(    MOVD    edx, MM(1))
    AS2(    mov     eax, [L_SAVED_X+0*4])
    AS2(    mov     ebx, [L_SAVED_X+1*4])
    AS2(    xor     cl, ch)
    AS2(    and     WORD_REG(cx), 255)
    ASL(5)
#if CRYPTOPP_BOOL_X86
    AS2(    paddb   MM(2), mm3)
#else
    AS2(    add     MM(2), 1)
#endif

    // fold in the part of the state that depends on the incremented counter byte
    AS2(    xor     edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
    XOR(    ebx, dl, 3)
    MOV(    ecx, dh, 2)
    AS2(    shr     edx, 16)
    AS2(    xor     ecx, [L_SAVED_X+2*4])
    XOR(    eax, dh, 0)
    MOV(    edx, dl, 1)
    AS2(    xor     edx, [L_SAVED_X+3*4])

    AS2(    add     L_REG, [L_KEYS_BEGIN])
    AS2(    add     L_REG, 3*16)
    ASJ(    jmp,    4, f)

    // one full round; the loop below applies it twice per iteration
#define ROUND() \
    MXOR(   0, cl, 3)   \
    AS2(    mov     cl, al) \
    XOR(    edi, ah, 2) \
    AS2(    shr     eax, 16)    \
    XOR(    edi, bl, 3) \
    MXOR(   0, bh, 2)   \
    AS2(    shr     ebx, 16)    \
    MXOR(   0, al, 1)   \
    MOV(    eax, ah, 0) \
    XOR(    eax, bl, 1) \
    MOV(    ebx, bh, 0) \
    XOR(    eax, ch, 2) \
    XOR(    ebx, cl, 3) \
    AS2(    shr     ecx, 16)    \
    XOR(    eax, dl, 3) \
    XOR(    ebx, dh, 2) \
    AS2(    shr     edx, 16)    \
    XOR(    edi, ch, 0) \
    XOR(    ebx, cl, 1) \
    XOR(    edi, dl, 1) \
    MXOR(   0, dh, 0)   \

    ASL(2)  // 2-round loop
    AS2(    MOVD    MM(0), [L_SUBKEYS-4*16+3*4])
    AS2(    mov     edi, [L_SUBKEYS-4*16+2*4])
    ROUND()
    AS2(    mov     ecx, edi)
    AS2(    xor     eax, [L_SUBKEYS-4*16+0*4])
    AS2(    xor     ebx, [L_SUBKEYS-4*16+1*4])
    AS2(    MOVD    edx, MM(0))

    ASL(4)
    AS2(    MOVD    MM(0), [L_SUBKEYS-4*16+7*4])
    AS2(    mov     edi, [L_SUBKEYS-4*16+6*4])
    ROUND()
    AS2(    mov     ecx, edi)
    AS2(    xor     eax, [L_SUBKEYS-4*16+4*4])
    AS2(    xor     ebx, [L_SUBKEYS-4*16+5*4])
    AS2(    MOVD    edx, MM(0))

    AS2(    add     L_REG, 32)
    AS2(    test    L_REG, 255)
    ASJ(    jnz,    2, b)
    AS2(    sub     L_REG, 16*16)

    // last round: apply the S-box and store two output bytes at a time
#define LAST(a, b, c) \
    AS2(    movzx   esi, a )\
    AS2(    movzx   edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
    AS2(    movzx   esi, b )\
    AS2(    xor     edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
    AS2(    mov     WORD PTR [L_LASTROUND+c], di )\


    LAST(ch, dl, 2)
    LAST(dh, al, 6)
    AS2(    shr     edx, 16)
    LAST(ah, bl, 10)
    AS2(    shr     eax, 16)
    LAST(bh, cl, 14)
    AS2(    shr     ebx, 16)
    LAST(dh, al, 12)
    AS2(    shr     ecx, 16)
    LAST(ah, bl, 0)
    LAST(bh, cl, 4)
    LAST(ch, dl, 8)

    AS2(    mov     WORD_REG(ax), [L_OUTXORBLOCKS])
    AS2(    mov     WORD_REG(bx), [L_OUTBLOCKS])

    AS2(    mov     WORD_REG(cx), [L_LENGTH])
    AS2(    sub     WORD_REG(cx), 16)

    AS2(    movdqu  xmm2, [WORD_REG(ax)])
    AS2(    pxor    xmm2, xmm4)

#if CRYPTOPP_BOOL_X86
    AS2(    movdqa  xmm0, [L_INCREMENTS])
    AS2(    paddd   xmm0, [L_INBLOCKS])
    AS2(    movdqa  [L_INBLOCKS], xmm0)
#else
    AS2(    movdqa  xmm0, [L_INCREMENTS+16])
    AS2(    paddq   xmm0, [L_INBLOCKS+16])
    AS2(    movdqa  [L_INBLOCKS+16], xmm0)
#endif

    AS2(    pxor    xmm2, [L_LASTROUND])
    AS2(    movdqu  [WORD_REG(bx)], xmm2)

    ASJ(    jle,    7, f)
    AS2(    mov     [L_LENGTH], WORD_REG(cx))
    AS2(    test    WORD_REG(cx), 1)
    ASJ(    jnz,    1, b)
#if CRYPTOPP_BOOL_X64
    AS2(    movdqa  xmm0, [L_INCREMENTS])
    AS2(    paddq   xmm0, [L_INBLOCKS])
    AS2(    movdqa  [L_INBLOCKS], xmm0)
#endif
    ASJ(    jmp,    3, b)

    ASL(7)
    // erase the subkeys copied onto the stack
    AS2(    xorps   xmm0, xmm0)
    AS2(    lea     WORD_REG(ax), [L_SUBKEYS+7*16])
    AS2(    movaps  [WORD_REG(ax)-7*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-6*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-5*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-4*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-3*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-2*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-1*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+0*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+1*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+2*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+3*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+4*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+5*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+6*16], xmm0)
#if CRYPTOPP_BOOL_X86
    AS2(    mov     esp, [L_SP])
    AS1(    emms)
#endif
    AS_POP_IF86(bp)
    AS_POP_IF86(bx)
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
    AS_POP_IF86(di)
    AS_POP_IF86(si)
    AS1(    ret)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
    pop r12
    pop rbx
    pop rdi
    pop rsi
    ret
    Rijndael_Enc_AdvancedProcessBlocks ENDP
#endif
#ifdef __GNUC__
    ".att_syntax prefix;"
    :
    : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
    : "memory", "cc", "%eax"
    #if CRYPTOPP_BOOL_X64
    , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
    #endif
    );
#endif
}


#endif

#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
}
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86

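// Returns true if [begin, end) shares 4096-byte page offsets with the Te
// table, in which case accesses to the scratch area could evict the preloaded
// table cache lines and reopen the timing side channel.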
static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
    size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
    size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
    if (t1 > t0)
        return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
    else
        return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

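// AES-NI primitives: one AESENC/AESDEC per round and AESENCLAST/AESDECLAST
// for the final round. The four-block variants interleave independent blocks
// to hide the latency of the AES instructions.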
inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesenc_si128(block, subkeys[i]);
        block = _mm_aesenc_si128(block, subkeys[i+1]);
    }
    block = _mm_aesenc_si128(block, subkeys[rounds-1]);
    block = _mm_aesenclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesenc_si128(block0, rk);
        block1 = _mm_aesenc_si128(block1, rk);
        block2 = _mm_aesenc_si128(block2, rk);
        block3 = _mm_aesenc_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesenclast_si128(block0, rk);
    block1 = _mm_aesenclast_si128(block1, rk);
    block2 = _mm_aesenclast_si128(block2, rk);
    block3 = _mm_aesenclast_si128(block3, rk);
}

inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesdec_si128(block, subkeys[i]);
        block = _mm_aesdec_si128(block, subkeys[i+1]);
    }
    block = _mm_aesdec_si128(block, subkeys[rounds-1]);
    block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesdec_si128(block0, rk);
        block1 = _mm_aesdec_si128(block1, rk);
        block2 = _mm_aesdec_si128(block2, rk);
        block3 = _mm_aesdec_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesdeclast_si128(block0, rk);
    block1 = _mm_aesdeclast_si128(block1, rk);
    block2 = _mm_aesdeclast_si128(block2, rk);
    block3 = _mm_aesdeclast_si128(block3, rk);
}

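// The value 1 in the last (big-endian) word of a block; adding it with
// _mm_add_epi32 increments a CTR-mode counter block, ignoring carries out of
// the low 32 bits.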
static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};

template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    size_t blockSize = 16;
    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = xorBlocks ? blockSize : 0;
    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;

    if (flags & BlockTransformation::BT_ReverseDirection)
    {
        assert(length % blockSize == 0);
        inBlocks += length - blockSize;
        xorBlocks += length - blockSize;
        outBlocks += length - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BlockTransformation::BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
            if (flags & BlockTransformation::BT_InBlockIsCounter)
            {
                const __m128i be1 = *(const __m128i *)s_one;
                block1 = _mm_add_epi32(block0, be1);
                block2 = _mm_add_epi32(block1, be1);
                block3 = _mm_add_epi32(block2, be1);
                _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
            }
            else
            {
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
            }

            if (flags & BlockTransformation::BT_XorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            func4(block0, block1, block2, block3, subkeys, rounds);

            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128((__m128i *)outBlocks, block0);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block1);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block2);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block3);
            outBlocks += outIncrement;

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128((const __m128i *)inBlocks);

        if (flags & BlockTransformation::BT_XorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

        if (flags & BlockTransformation::BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subkeys, rounds);

        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

        _mm_storeu_si128((__m128i *)outBlocks, block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
#endif

size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
        return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    if (HasSSE2())
    {
        if (length < BLOCKSIZE)
            return length;

        struct Locals
        {
            word32 subkeys[4*12], workspace[8];
            const byte *inBlocks, *inXorBlocks, *outXorBlocks;
            byte *outBlocks;
            size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
            size_t regSpill, lengthAndCounterFlag, keysBegin;
        };

        size_t increment = BLOCKSIZE;
        const byte* zeros = (byte *)(Te+256);
        byte *space;

        // allocate the locals on the stack, 256-byte aligned and not aliased
        // with the Te table (see AliasedWithTable above)
        do {
            space = (byte *)alloca(255+sizeof(Locals));
            space += (256-(size_t)space%256)%256;
        }
        while (AliasedWithTable(space, space+sizeof(Locals)));

        if (flags & BT_ReverseDirection)
        {
            assert(length % BLOCKSIZE == 0);
            inBlocks += length - BLOCKSIZE;
            xorBlocks += length - BLOCKSIZE;
            outBlocks += length - BLOCKSIZE;
            increment = 0-increment;
        }

        Locals &locals = *(Locals *)space;

        locals.inBlocks = inBlocks;
        locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
        locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
        locals.outBlocks = outBlocks;

        locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
        locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
        locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
        locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

        locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
        int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
        locals.keysBegin = (12-keysToCopy)*16;

        Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
        return length % BLOCKSIZE;
    }
#endif

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
    if (HasAESNI())
        return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

NAMESPACE_END

#endif
#endif