// rijndael.cpp - Rijndael (AES) block cipher: table-driven C++ code with
// SSE2 assembly and AES-NI fast paths, plus cache-timing-attack countermeasures.
#include "pch.h"

#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#include "rijndael.h"
#include "misc.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
#else
static word64 Te[256];
#endif
static word64 Td[256];
#else
static word32 Te[256*4], Td[256*4];
#endif
static volatile bool s_TeFilled = false, s_TdFilled = false;

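// One column of an AES round as four table lookups: each byte of t selects a
// table entry that is XORed into one of the output words a..d.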
#define QUARTER_ROUND(L, T, t, a, b, c, d) \
    a ^= L(T, 3, byte(t)); t >>= 8;\
    b ^= L(T, 2, byte(t)); t >>= 8;\
    c ^= L(T, 1, byte(t)); t >>= 8;\
    d ^= L(T, 0, t);

#define QUARTER_ROUND_LE(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[d] = ((byte *)(Te+t))[1];

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    #define QUARTER_ROUND_LD(t, a, b, c, d) \
        tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
    #define QUARTER_ROUND_LD(t, a, b, c, d) \
        tempBlock[a] = Sd[byte(t)]; t >>= 8;\
        tempBlock[b] = Sd[byte(t)]; t >>= 8;\
        tempBlock[c] = Sd[byte(t)]; t >>= 8;\
        tempBlock[d] = Sd[t];
#endif

#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

#ifdef IS_LITTLE_ENDIAN
    #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
    #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
    #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
        #define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
    #else
        #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
        #define TL_M(T, i, x) T[i*256 + x]
    #endif
#else
    #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
    #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
    #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
        #define TL_M TL_F
    #else
        #define TL_F(T, i, x) rotrFixed(T[x], i*8)
        #define TL_M(T, i, x) T[i*256 + x]
    #endif
#endif

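// Multiplication by 2, 4 and 8 in GF(2^8) modulo the AES polynomial 0x11b
// (x^8 + x^4 + x^3 + x + 1); f3..fe combine them into the MixColumns and
// InvMixColumns coefficients.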
#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))

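// Fill the encryption table from the S-box Se: each entry packs the SubBytes
// output combined with the MixColumns coefficients 1, 2 and 3.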
void Rijndael::Base::FillEncTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Se[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        Te[i] = word64(y | f3(x))<<32 | y;
#else
        word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        for (int j=0; j<4; j++)
        {
            Te[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    Te[256] = Te[257] = 0;
#endif
    s_TeFilled = true;
}

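// Fill the decryption table from the inverse S-box Sd, folding in the
// InvMixColumns coefficients 9, 11, 13 and 14.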
void Rijndael::Base::FillDecTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Sd[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        Td[i] = word64(y | fb(x))<<32 | y | x;
#else
        word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        for (int j=0; j<4; j++)
        {
            Td[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
    s_TdFilled = true;
}

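// Key schedule: expand the user key into 4*(m_rounds+1) subkey words.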
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
    AssertValidKeyLength(keylen);

    m_rounds = keylen/4 + 6;
    m_key.New(4*(m_rounds+1));

    word32 *rk = m_key;

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)

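    // AES-NI path: expand the key with AESKEYGENASSIST; for decryption,
    // reverse the subkey order and run AESIMC over the inner round keys.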
    if (HasAESNI())
    {
        static const word32 rcLE[] = {
            0x01, 0x02, 0x04, 0x08,
            0x10, 0x20, 0x40, 0x80,
            0x1B, 0x36,
        };
        const word32 *rc = rcLE;

        __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
        memcpy(rk, userKey, keylen);

        while (true)
        {
            rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
            rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
            rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
            rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

            if (rk + keylen/4 + 4 == m_key.end())
                break;

            if (keylen == 24)
            {
                rk[10] = rk[ 4] ^ rk[ 9];
                rk[11] = rk[ 5] ^ rk[10];
                temp = _mm_insert_epi32(temp, rk[11], 3);
            }
            else if (keylen == 32)
            {
                temp = _mm_insert_epi32(temp, rk[11], 3);
                rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
                rk[13] = rk[ 5] ^ rk[12];
                rk[14] = rk[ 6] ^ rk[13];
                rk[15] = rk[ 7] ^ rk[14];
                temp = _mm_insert_epi32(temp, rk[15], 3);
            }
            else
                temp = _mm_insert_epi32(temp, rk[7], 3);

            rk += keylen/4;
        }

        if (!IsForwardTransformation())
        {
            rk = m_key;
            unsigned int i, j;

            std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));

            for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
            {
                temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
                *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
                *(__m128i *)(rk+j) = temp;
            }

            *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
        }

        return;
    }
#endif

00272
00273 GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
00274 const word32 *rc = rcon;
00275 word32 temp;
00276
00277 while (true)
00278 {
00279 temp = rk[keylen/4-1];
00280 word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
00281 rk[keylen/4] = rk[0] ^ x ^ *(rc++);
00282 rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
00283 rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
00284 rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
00285
00286 if (rk + keylen/4 + 4 == m_key.end())
00287 break;
00288
00289 if (keylen == 24)
00290 {
00291 rk[10] = rk[ 4] ^ rk[ 9];
00292 rk[11] = rk[ 5] ^ rk[10];
00293 }
00294 else if (keylen == 32)
00295 {
00296 temp = rk[11];
00297 rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
00298 rk[13] = rk[ 5] ^ rk[12];
00299 rk[14] = rk[ 6] ^ rk[13];
00300 rk[15] = rk[ 7] ^ rk[14];
00301 }
00302 rk += keylen/4;
00303 }
00304
00305 rk = m_key;
00306
00307 if (IsForwardTransformation())
00308 {
00309 if (!s_TeFilled)
00310 FillEncTable();
00311
00312 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
00313 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
00314 }
00315 else
00316 {
00317 if (!s_TdFilled)
00318 FillDecTable();
00319
00320 unsigned int i, j;
00321
00322 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
00323
00324 for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
00325 {
00326 temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
00327 temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
00328 temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
00329 temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
00330 }
00331
00332 rk[i+0] = InverseMixColumn(rk[i+0]);
00333 rk[i+1] = InverseMixColumn(rk[i+1]);
00334 rk[i+2] = InverseMixColumn(rk[i+2]);
00335 rk[i+3] = InverseMixColumn(rk[i+3]);
00336
00337 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
00338 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
00339 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
00340 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
00341 }
00342
00343 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
00344 if (HasAESNI())
00345 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
00346 #endif
00347 }
00348
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasSSE2())
    {
        Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

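    // timing-attack countermeasure: touch every cache line of Te so that
    // subsequent data-dependent lookups all hit the cache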
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Te)+i);
    u &= Te[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
    QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
    QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
    QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

    // m_rounds - 2 full rounds, two per iteration
    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_E(t3, s0, s1, s2, s3)
        QUARTER_ROUND_E(t2, s3, s0, s1, s2)
        QUARTER_ROUND_E(t1, s2, s3, s0, s1)
        QUARTER_ROUND_E(t0, s1, s2, s3, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_E(s3, t0, t1, t2, t3)
        QUARTER_ROUND_E(s2, t3, t0, t1, t2)
        QUARTER_ROUND_E(s1, t2, t3, t0, t1)
        QUARTER_ROUND_E(s0, t1, t2, t3, t0)

        rk += 8;
    } while (--r);

    // last round
    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
    QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
    QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
    QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
    {
        Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

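    // timing-attack countermeasure: touch every cache line of Td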
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Td)+i);
    u &= Td[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
    QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
    QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
    QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

    // m_rounds - 2 full rounds, two per iteration
    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_D(t3, s2, s1, s0, s3)
        QUARTER_ROUND_D(t2, s1, s0, s3, s2)
        QUARTER_ROUND_D(t1, s0, s3, s2, s1)
        QUARTER_ROUND_D(t0, s3, s2, s1, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_D(s3, t2, t1, t0, t3)
        QUARTER_ROUND_D(s2, t1, t0, t3, t2)
        QUARTER_ROUND_D(s1, t0, t3, t2, t1)
        QUARTER_ROUND_D(s0, t3, t2, t1, t0)

        rk += 8;
    } while (--r);

#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    // timing-attack countermeasure: preload the inverse S-box Sd, which the
    // last round reads directly; when CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is
    // defined, QUARTER_ROUND_LD uses Td instead, which is already in the cache
    u = 0;
    for (i=0; i<256; i+=cacheLineSize)
        u &= *(const word32 *)(Sd+i);
    u &= *(const word32 *)(Sd+252);
    t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

    // last round
    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
    QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
    QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
    QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}


// ************************* Assembly Code ************************************

#ifdef _MSC_VER
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
#endif

#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE

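// Assembly implementation of the encryption workhorse. The caller passes a
// 256-byte-aligned scratch area ("locals") holding stack copies of the
// subkeys and the block pointers; see the Locals struct in
// Rijndael::Enc::AdvancedProcessBlocks below for its layout.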
CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
{
#if CRYPTOPP_BOOL_X86

#define L_REG           esp
#define L_INDEX(i)      (L_REG+512+i)
#define L_INXORBLOCKS   L_INBLOCKS+4
#define L_OUTXORBLOCKS  L_INBLOCKS+8
#define L_OUTBLOCKS     L_INBLOCKS+12
#define L_INCREMENTS    L_INDEX(16*15)
#define L_SP            L_INDEX(16*16)
#define L_LENGTH        L_INDEX(16*16+4)
#define L_KEYS_BEGIN    L_INDEX(16*16+8)

#define MOVD            movd
#define MM(i)           mm##i

#define MXOR(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    movd    mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
    AS2(    pxor    MM(a), mm7)\

#define MMOV(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    movd    MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

#define L_REG           r8
#define L_INDEX(i)      (L_REG+i)
#define L_INXORBLOCKS   L_INBLOCKS+8
#define L_OUTXORBLOCKS  L_INBLOCKS+16
#define L_OUTBLOCKS     L_INBLOCKS+24
#define L_INCREMENTS    L_INDEX(16*16)
#define L_LENGTH        L_INDEX(16*18+8)
#define L_KEYS_BEGIN    L_INDEX(16*19)

#define MOVD            mov
#define MM_0            r9d
#define MM_1            r12d
#ifdef __GNUC__
#define MM_2            r11d
#else
#define MM_2            r10d
#endif
#define MM(i)           MM_##i

#define MXOR(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    xor     MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    mov     MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#endif

#define L_SUBKEYS       L_INDEX(0)
#define L_SAVED_X       L_SUBKEYS
#define L_KEY12         L_INDEX(16*12)
#define L_LASTROUND     L_INDEX(16*13)
#define L_INBLOCKS      L_INDEX(16*14)
#define MAP0TO4(i)      (ASM_MOD(i+3,4)+1)

#define XOR(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    xor     a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MOV(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    mov     a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#ifdef CRYPTOPP_GENERATE_X64_MASM
    ALIGN 8
    Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
    rex_push_reg rsi
    push_reg rdi
    push_reg rbx
    push_reg r12
    .endprolog
    mov L_REG, rcx
    mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
    mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
    __asm__ __volatile__
    (
    ".intel_syntax noprefix;"
    #if CRYPTOPP_BOOL_X64
    AS2(    mov     L_REG, rcx)
    #endif
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2(    mov     AS_REG_7, WORD_REG(si))
#else
    AS_PUSH_IF86(si)
    AS_PUSH_IF86(di)
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2(    lea     AS_REG_7, [Te])
    AS2(    mov     edi, [g_cacheLineSize])
#endif

#if CRYPTOPP_BOOL_X86
    AS2(    mov     [ecx+16*12+16*4], esp)  // save esp to L_SP
    AS2(    lea     esp, [ecx-512])
#endif

    // copy subkeys to the stack workspace
    AS2(    mov     WORD_REG(si), [L_KEYS_BEGIN])
    AS2(    mov     WORD_REG(ax), 16)
    AS2(    and     WORD_REG(ax), WORD_REG(si))
    AS2(    movdqa  xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])  // subkey 1 (or 2 in counter mode)
    AS2(    movdqa  [L_KEY12], xmm3)
    AS2(    lea     WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
    AS2(    sub     WORD_REG(ax), WORD_REG(si))
    ASL(0)
    AS2(    movdqa  xmm0, [WORD_REG(ax)+WORD_REG(si)])
    AS2(    movdqa  XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
    AS2(    add     WORD_REG(si), 16)
    AS2(    cmp     WORD_REG(si), 16*12)
    ASJ(    jl,     0, b)

    AS2(    movdqa  xmm4, [WORD_REG(ax)+WORD_REG(si)])  // last subkey
    AS2(    movdqa  xmm1, [WORD_REG(dx)])               // subkey 0
    AS2(    MOVD    MM(1), [WORD_REG(dx)+4*4])          // subkey 1
    AS2(    mov     ebx, [WORD_REG(dx)+5*4])
    AS2(    mov     ecx, [WORD_REG(dx)+6*4])
    AS2(    mov     edx, [WORD_REG(dx)+7*4])

    // load table into cache (timing-attack countermeasure)
    AS2(    xor     WORD_REG(ax), WORD_REG(ax))
    ASL(9)
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    cmp     WORD_REG(ax), 2048)
    ASJ(    jl,     9, b)
    AS1(    lfence)

    AS2(    test    DWORD PTR [L_LENGTH], 1)
    ASJ(    jz,     8, f)

    // counter mode one-time setup
    AS2(    mov     WORD_REG(si), [L_INBLOCKS])
    AS2(    movdqu  xmm2, [WORD_REG(si)])
    AS2(    pxor    xmm2, xmm1)
    AS2(    psrldq  xmm1, 14)
    AS2(    movd    eax, xmm1)
    AS2(    mov     al, BYTE PTR [WORD_REG(si)+15])
    AS2(    MOVD    MM(2), eax)
#if CRYPTOPP_BOOL_X86
    AS2(    mov     eax, 1)
    AS2(    movd    mm3, eax)
#endif

    // partial first round, in: xmm2, out: mm1, ebx, ecx, edx
    AS2(    movd    eax, xmm2)
    AS2(    psrldq  xmm2, 4)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    MXOR(   1, al, 0)
    XOR(    edx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ecx, al, 2)
    XOR(    ebx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    XOR(    ebx, al, 0)
    MXOR(   1, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    edx, al, 2)
    XOR(    ecx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    XOR(    ecx, al, 0)
    XOR(    ebx, ah, 1)
    AS2(    shr     eax, 16)
    MXOR(   1, al, 2)
    XOR(    edx, ah, 3)
    AS2(    mov     eax, edi)
    XOR(    edx, al, 0)
    XOR(    ecx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ebx, al, 2)
    AS2(    psrldq  xmm2, 3)

    // partial second round, out: eax, ebx, edi, mm0
    AS2(    mov     eax, [L_KEY12+0*4])
    AS2(    mov     edi, [L_KEY12+2*4])
    AS2(    MOVD    MM(0), [L_KEY12+3*4])
    MXOR(   0, cl, 3)
    XOR(    edi, bl, 3)
    MXOR(   0, bh, 2)
    AS2(    shr     ebx, 16)
    XOR(    eax, bl, 1)
    MOV(    ebx, bh, 0)
    AS2(    xor     ebx, [L_KEY12+1*4])
    XOR(    eax, ch, 2)
    AS2(    shr     ecx, 16)
    XOR(    eax, dl, 3)
    XOR(    ebx, dh, 2)
    AS2(    shr     edx, 16)
    XOR(    edi, ch, 0)
    XOR(    ebx, cl, 1)
    XOR(    edi, dl, 1)
    MXOR(   0, dh, 0)

    AS2(    movd    ecx, xmm2)
    AS2(    MOVD    edx, MM(1))
    AS2(    MOVD    [L_SAVED_X+3*4], MM(0))
    AS2(    mov     [L_SAVED_X+0*4], eax)
    AS2(    mov     [L_SAVED_X+1*4], ebx)
    AS2(    mov     [L_SAVED_X+2*4], edi)
    ASJ(    jmp,    5, f)

    ASL(3)
    // non-counter mode per-block setup
    AS2(    MOVD    MM(1), [L_KEY12+0*4])
    AS2(    mov     ebx, [L_KEY12+1*4])
    AS2(    mov     ecx, [L_KEY12+2*4])
    AS2(    mov     edx, [L_KEY12+3*4])
    ASL(8)
    AS2(    mov     WORD_REG(ax), [L_INBLOCKS])
    AS2(    movdqu  xmm2, [WORD_REG(ax)])
    AS2(    mov     WORD_REG(si), [L_INXORBLOCKS])
    AS2(    movdqu  xmm5, [WORD_REG(si)])
    AS2(    pxor    xmm2, xmm1)
    AS2(    pxor    xmm2, xmm5)

    // first round, in: xmm2, out: eax, ebx, ecx, edx
    AS2(    movd    eax, xmm2)
    AS2(    psrldq  xmm2, 4)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    MXOR(   1, al, 0)
    XOR(    edx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ecx, al, 2)
    XOR(    ebx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    XOR(    ebx, al, 0)
    MXOR(   1, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    edx, al, 2)
    XOR(    ecx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    XOR(    ecx, al, 0)
    XOR(    ebx, ah, 1)
    AS2(    shr     eax, 16)
    MXOR(   1, al, 2)
    XOR(    edx, ah, 3)
    AS2(    mov     eax, edi)
    XOR(    edx, al, 0)
    XOR(    ecx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ebx, al, 2)
    MXOR(   1, ah, 3)
    AS2(    MOVD    eax, MM(1))

    AS2(    add     L_REG, [L_KEYS_BEGIN])
    AS2(    add     L_REG, 4*16)
    ASJ(    jmp,    2, f)

    ASL(1)
    // counter-mode per-block setup
    AS2(    MOVD    ecx, MM(2))
    AS2(    MOVD    edx, MM(1))
    AS2(    mov     eax, [L_SAVED_X+0*4])
    AS2(    mov     ebx, [L_SAVED_X+1*4])
    AS2(    xor     cl, ch)
    AS2(    and     WORD_REG(cx), 255)
    ASL(5)
#if CRYPTOPP_BOOL_X86
    AS2(    paddb   MM(2), mm3)
#else
    AS2(    add     MM(2), 1)
#endif

    // fold in the part of the state that depends on the incremented counter byte
    AS2(    xor     edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
    XOR(    ebx, dl, 3)
    MOV(    ecx, dh, 2)
    AS2(    shr     edx, 16)
    AS2(    xor     ecx, [L_SAVED_X+2*4])
    XOR(    eax, dh, 0)
    MOV(    edx, dl, 1)
    AS2(    xor     edx, [L_SAVED_X+3*4])

    AS2(    add     L_REG, [L_KEYS_BEGIN])
    AS2(    add     L_REG, 3*16)
    ASJ(    jmp,    4, f)

    // one full round; the loop below applies it twice per iteration
#define ROUND() \
    MXOR(   0, cl, 3)   \
    AS2(    mov     cl, al) \
    XOR(    edi, ah, 2) \
    AS2(    shr     eax, 16)    \
    XOR(    edi, bl, 3) \
    MXOR(   0, bh, 2)   \
    AS2(    shr     ebx, 16)    \
    MXOR(   0, al, 1)   \
    MOV(    eax, ah, 0) \
    XOR(    eax, bl, 1) \
    MOV(    ebx, bh, 0) \
    XOR(    eax, ch, 2) \
    XOR(    ebx, cl, 3) \
    AS2(    shr     ecx, 16)    \
    XOR(    eax, dl, 3) \
    XOR(    ebx, dh, 2) \
    AS2(    shr     edx, 16)    \
    XOR(    edi, ch, 0) \
    XOR(    ebx, cl, 1) \
    XOR(    edi, dl, 1) \
    MXOR(   0, dh, 0)   \

    ASL(2)  // 2-round loop
    AS2(    MOVD    MM(0), [L_SUBKEYS-4*16+3*4])
    AS2(    mov     edi, [L_SUBKEYS-4*16+2*4])
    ROUND()
    AS2(    mov     ecx, edi)
    AS2(    xor     eax, [L_SUBKEYS-4*16+0*4])
    AS2(    xor     ebx, [L_SUBKEYS-4*16+1*4])
    AS2(    MOVD    edx, MM(0))

    ASL(4)
    AS2(    MOVD    MM(0), [L_SUBKEYS-4*16+7*4])
    AS2(    mov     edi, [L_SUBKEYS-4*16+6*4])
    ROUND()
    AS2(    mov     ecx, edi)
    AS2(    xor     eax, [L_SUBKEYS-4*16+4*4])
    AS2(    xor     ebx, [L_SUBKEYS-4*16+5*4])
    AS2(    MOVD    edx, MM(0))

    AS2(    add     L_REG, 32)
    AS2(    test    L_REG, 255)
    ASJ(    jnz,    2, b)
    AS2(    sub     L_REG, 16*16)

    // last round: apply the S-box and store two output bytes at a time
#define LAST(a, b, c) \
    AS2(    movzx   esi, a )\
    AS2(    movzx   edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
    AS2(    movzx   esi, b )\
    AS2(    xor     edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
    AS2(    mov     WORD PTR [L_LASTROUND+c], di )\


    LAST(ch, dl, 2)
    LAST(dh, al, 6)
    AS2(    shr     edx, 16)
    LAST(ah, bl, 10)
    AS2(    shr     eax, 16)
    LAST(bh, cl, 14)
    AS2(    shr     ebx, 16)
    LAST(dh, al, 12)
    AS2(    shr     ecx, 16)
    LAST(ah, bl, 0)
    LAST(bh, cl, 4)
    LAST(ch, dl, 8)

    AS2(    mov     WORD_REG(ax), [L_OUTXORBLOCKS])
    AS2(    mov     WORD_REG(bx), [L_OUTBLOCKS])

    AS2(    mov     WORD_REG(cx), [L_LENGTH])
    AS2(    sub     WORD_REG(cx), 16)

    AS2(    movdqu  xmm2, [WORD_REG(ax)])
    AS2(    pxor    xmm2, xmm4)

#if CRYPTOPP_BOOL_X86
    AS2(    movdqa  xmm0, [L_INCREMENTS])
    AS2(    paddd   xmm0, [L_INBLOCKS])
    AS2(    movdqa  [L_INBLOCKS], xmm0)
#else
    AS2(    movdqa  xmm0, [L_INCREMENTS+16])
    AS2(    paddq   xmm0, [L_INBLOCKS+16])
    AS2(    movdqa  [L_INBLOCKS+16], xmm0)
#endif

    AS2(    pxor    xmm2, [L_LASTROUND])
    AS2(    movdqu  [WORD_REG(bx)], xmm2)

    ASJ(    jle,    7, f)
    AS2(    mov     [L_LENGTH], WORD_REG(cx))
    AS2(    test    WORD_REG(cx), 1)
    ASJ(    jnz,    1, b)
#if CRYPTOPP_BOOL_X64
    AS2(    movdqa  xmm0, [L_INCREMENTS])
    AS2(    paddq   xmm0, [L_INBLOCKS])
    AS2(    movdqa  [L_INBLOCKS], xmm0)
#endif
    ASJ(    jmp,    3, b)

    ASL(7)
    // erase the subkeys copied onto the stack
    AS2(    xorps   xmm0, xmm0)
    AS2(    lea     WORD_REG(ax), [L_SUBKEYS+7*16])
    AS2(    movaps  [WORD_REG(ax)-7*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-6*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-5*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-4*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-3*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-2*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-1*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+0*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+1*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+2*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+3*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+4*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+5*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+6*16], xmm0)
#if CRYPTOPP_BOOL_X86
    AS2(    mov     esp, [L_SP])
    AS1(    emms)
#endif
    AS_POP_IF86(bp)
    AS_POP_IF86(bx)
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
    AS_POP_IF86(di)
    AS_POP_IF86(si)
    AS1(    ret)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
    pop r12
    pop rbx
    pop rdi
    pop rsi
    ret
    Rijndael_Enc_AdvancedProcessBlocks ENDP
#endif
#ifdef __GNUC__
    ".att_syntax prefix;"
    :
    : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
    : "memory", "cc", "%eax"
    #if CRYPTOPP_BOOL_X64
    , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
    #endif
    );
#endif
}


#endif

#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
}
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86

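// Returns true if [begin, end) shares 4096-byte page offsets with the Te
// table, in which case accesses to the scratch area could evict the preloaded
// table cache lines and reopen the timing side channel.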
static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
    size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
    size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
    if (t1 > t0)
        return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
    else
        return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

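// AES-NI primitives: one AESENC/AESDEC per round and AESENCLAST/AESDECLAST
// for the final round. The four-block variants interleave independent blocks
// to hide the latency of the AES instructions.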
inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesenc_si128(block, subkeys[i]);
        block = _mm_aesenc_si128(block, subkeys[i+1]);
    }
    block = _mm_aesenc_si128(block, subkeys[rounds-1]);
    block = _mm_aesenclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesenc_si128(block0, rk);
        block1 = _mm_aesenc_si128(block1, rk);
        block2 = _mm_aesenc_si128(block2, rk);
        block3 = _mm_aesenc_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesenclast_si128(block0, rk);
    block1 = _mm_aesenclast_si128(block1, rk);
    block2 = _mm_aesenclast_si128(block2, rk);
    block3 = _mm_aesenclast_si128(block3, rk);
}

inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesdec_si128(block, subkeys[i]);
        block = _mm_aesdec_si128(block, subkeys[i+1]);
    }
    block = _mm_aesdec_si128(block, subkeys[rounds-1]);
    block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesdec_si128(block0, rk);
        block1 = _mm_aesdec_si128(block1, rk);
        block2 = _mm_aesdec_si128(block2, rk);
        block3 = _mm_aesdec_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesdeclast_si128(block0, rk);
    block1 = _mm_aesdeclast_si128(block1, rk);
    block2 = _mm_aesdeclast_si128(block2, rk);
    block3 = _mm_aesdeclast_si128(block3, rk);
}

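// The value 1 in the last (big-endian) word of a block; adding it with
// _mm_add_epi32 increments a CTR-mode counter block, ignoring carries out of
// the low 32 bits.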
static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};

template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    size_t blockSize = 16;
    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = xorBlocks ? blockSize : 0;
    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;

    if (flags & BlockTransformation::BT_ReverseDirection)
    {
        assert(length % blockSize == 0);
        inBlocks += length - blockSize;
        xorBlocks += length - blockSize;
        outBlocks += length - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BlockTransformation::BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
            if (flags & BlockTransformation::BT_InBlockIsCounter)
            {
                const __m128i be1 = *(const __m128i *)s_one;
                block1 = _mm_add_epi32(block0, be1);
                block2 = _mm_add_epi32(block1, be1);
                block3 = _mm_add_epi32(block2, be1);
                _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
            }
            else
            {
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
            }

            if (flags & BlockTransformation::BT_XorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            func4(block0, block1, block2, block3, subkeys, rounds);

            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128((__m128i *)outBlocks, block0);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block1);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block2);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block3);
            outBlocks += outIncrement;

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128((const __m128i *)inBlocks);

        if (flags & BlockTransformation::BT_XorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

        if (flags & BlockTransformation::BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subkeys, rounds);

        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

        _mm_storeu_si128((__m128i *)outBlocks, block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
#endif

size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
        return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    if (HasSSE2())
    {
        if (length < BLOCKSIZE)
            return length;

        struct Locals
        {
            word32 subkeys[4*12], workspace[8];
            const byte *inBlocks, *inXorBlocks, *outXorBlocks;
            byte *outBlocks;
            size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
            size_t regSpill, lengthAndCounterFlag, keysBegin;
        };

        size_t increment = BLOCKSIZE;
        const byte* zeros = (byte *)(Te+256);
        byte *space;

        // allocate the locals on the stack, 256-byte aligned and not aliased
        // with the Te table (see AliasedWithTable above)
        do {
            space = (byte *)alloca(255+sizeof(Locals));
            space += (256-(size_t)space%256)%256;
        }
        while (AliasedWithTable(space, space+sizeof(Locals)));

        if (flags & BT_ReverseDirection)
        {
            assert(length % BLOCKSIZE == 0);
            inBlocks += length - BLOCKSIZE;
            xorBlocks += length - BLOCKSIZE;
            outBlocks += length - BLOCKSIZE;
            increment = 0-increment;
        }

        Locals &locals = *(Locals *)space;

        locals.inBlocks = inBlocks;
        locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
        locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
        locals.outBlocks = outBlocks;

        locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
        locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
        locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
        locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

        locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
        int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
        locals.keysBegin = (12-keysToCopy)*16;

        Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
        return length % BLOCKSIZE;
    }
#endif

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
    if (HasAESNI())
        return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

NAMESPACE_END

#endif
#endif