00001
00002
00003
00004
00005 #include "pch.h"
00006
00007 #ifndef CRYPTOPP_GENERATE_X64_MASM
00008
00009 #include "salsa.h"
00010 #include "misc.h"
00011 #include "argnames.h"
00012 #include "cpu.h"
00013
00014 NAMESPACE_BEGIN(CryptoPP)
00015
00016 void Salsa20_TestInstantiations()
00017 {
00018 Salsa20::Encryption x;
00019 }
00020
00021 void Salsa20_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length)
00022 {
00023 m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
00024
00025 if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
00026 throw InvalidRounds(Salsa20::StaticAlgorithmName(), m_rounds);
00027
00028
00029 GetBlock<word32, LittleEndian> get1(key);
00030 get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
00031 GetBlock<word32, LittleEndian> get2(key + length - 16);
00032 get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
00033
00034
00035 m_state[0] = 0x61707865;
00036 m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
00037 m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
00038 m_state[3] = 0x6b206574;
00039 }
00040
00041 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
00042 {
00043 assert(length==8);
00044 GetBlock<word32, LittleEndian> get(IV);
00045 get(m_state[14])(m_state[11]);
00046 m_state[8] = m_state[5] = 0;
00047 }
00048
00049 void Salsa20_Policy::SeekToIteration(lword iterationCount)
00050 {
00051 m_state[8] = (word32)iterationCount;
00052 m_state[5] = (word32)SafeRightShift<32>(iterationCount);
00053 }
00054
00055 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
00056 unsigned int Salsa20_Policy::GetAlignment() const
00057 {
00058 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00059 if (HasSSE2())
00060 return 16;
00061 else
00062 #endif
00063 return GetAlignmentOf<word32>();
00064 }
00065
00066 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
00067 {
00068 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00069 if (HasSSE2())
00070 return 4*BYTES_PER_ITERATION;
00071 else
00072 #endif
00073 return BYTES_PER_ITERATION;
00074 }
00075 #endif
00076
00077 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
00078 extern "C" {
00079 void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
00080 }
00081 #endif
00082
00083 #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
00084
// Produce iterationCount 64-byte keystream blocks (XORed with input when
// input is non-NULL, per the operation flags).  This one source text
// serves three builds: (1) calling a separately assembled MASM routine,
// (2) SSE2 inline assembly (GCC or MSVC), and (3) a portable C++
// fallback.  When CRYPTOPP_GENERATE_X64_MASM is defined, this file is run
// through the preprocessor to emit a stand-alone .asm file; the #endif
// just below closes the #ifndef at the top of the file so that only the
// assembly text that follows is emitted in that mode.
void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{
#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
	// Defer to the pre-assembled MASM routine declared above.
	Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
	return;
#endif

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#ifdef CRYPTOPP_GENERATE_X64_MASM
	ALIGN   8
	Salsa20_OperateKeystream	PROC FRAME
	mov		r10, [rsp + 5*8]			; state
	alloc_stack(10*16 + 32*16 + 8)
	save_xmm128 xmm6, 0200h
	save_xmm128 xmm7, 0210h
	save_xmm128 xmm8, 0220h
	save_xmm128 xmm9, 0230h
	save_xmm128 xmm10, 0240h
	save_xmm128 xmm11, 0250h
	save_xmm128 xmm12, 0260h
	save_xmm128 xmm13, 0270h
	save_xmm128 xmm14, 0280h
	save_xmm128 xmm15, 0290h
	.endprolog

// Register roles for the MASM build (Win64 ABI: args in rcx/rdx/r8/r9).
#define REG_output			rcx
#define REG_input			rdx
#define REG_iterationCount	r8
#define REG_state			r10
// NOTE(review): "e9d" is not an x64 register name; the 4th Win64 integer
// argument (rounds) arrives in r9, so this looks like it should be r9d —
// confirm against an actual MASM generation/assembly run.
#define REG_rounds			e9d
#define REG_roundsLeft		eax
#define REG_temp32			r11d
#define REG_temp			r11
#define SSE2_WORKSPACE		rsp
#else
	if (HasSSE2())
	{
#if CRYPTOPP_BOOL_X64
// GCC x64: operands are bound via the asm constraint list at the end.
#define REG_output			%4
#define REG_input			%1
#define REG_iterationCount	%2
#define REG_state			%3
#define REG_rounds			%0
#define REG_roundsLeft		eax
#define REG_temp32			edx
#define REG_temp			rdx
#define SSE2_WORKSPACE		%5

		// 32 16-byte slots of scratch for the 4-blocks-at-a-time path.
		FixedSizeAlignedSecBlock<byte, 32*16> workspace;
#else
// x86: fixed register assignments for the inline asm below.
#define REG_output			edi
#define REG_input			eax
#define REG_iterationCount	ecx
#define REG_state			esi
#define REG_rounds			edx
#define REG_roundsLeft		ebx
#define REG_temp32			ebp
#define REG_temp			ebp
#define SSE2_WORKSPACE		esp + WORD_SZ
#endif

#ifdef __GNUC__
		__asm__ __volatile__
		(
			".intel_syntax noprefix;"
			AS_PUSH_IF86(	bx)
#else
		// MSVC inline asm: load the arguments into registers ourselves.
		void *s = m_state.data();
		word32 r = m_rounds;

		AS2(	mov		REG_iterationCount, iterationCount)
		AS2(	mov		REG_input, input)
		AS2(	mov		REG_output, output)
		AS2(	mov		REG_state, s)
		AS2(	mov		REG_rounds, r)
#endif
#endif	// #ifndef CRYPTOPP_GENERATE_X64_MASM

		AS_PUSH_IF86(	bp)
		// Fewer than 4 blocks requested? use the 1-block-at-a-time path.
		AS2(	cmp		REG_iterationCount, 4)
		ASJ(	jl,		5, f)

#if CRYPTOPP_BOOL_X86
		// Align esp to 16 bytes and reserve the 32*16-byte workspace,
		// keeping the old esp on top of the stack for restoration.
		AS2(	mov		ebx, esp)
		AS2(	and		esp, -16)
		AS2(	sub		esp, 32*16)
		AS1(	push	ebx)
#endif

// Broadcast state word i*4+j into all four lanes of an xmm register and
// park it in the second half (offset 256) of the workspace, giving each
// of the four parallel blocks its own copy of the input state.
#define SSE2_EXPAND_S(i, j)		\
	ASS(	pshufd	xmm4, xmm##i, j, j, j, j)	\
	AS2(	movdqa	[SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)

		AS2(	movdqa	xmm0, [REG_state + 0*16])
		AS2(	movdqa	xmm1, [REG_state + 1*16])
		AS2(	movdqa	xmm2, [REG_state + 2*16])
		AS2(	movdqa	xmm3, [REG_state + 3*16])
		SSE2_EXPAND_S(0, 0)
		SSE2_EXPAND_S(0, 1)
		SSE2_EXPAND_S(0, 2)
		SSE2_EXPAND_S(0, 3)
		SSE2_EXPAND_S(1, 0)
		SSE2_EXPAND_S(1, 2)
		SSE2_EXPAND_S(1, 3)
		SSE2_EXPAND_S(2, 1)
		SSE2_EXPAND_S(2, 2)
		SSE2_EXPAND_S(2, 3)
		SSE2_EXPAND_S(3, 0)
		SSE2_EXPAND_S(3, 1)
		SSE2_EXPAND_S(3, 2)
		SSE2_EXPAND_S(3, 3)

// Words 8 and 5 hold the 64-bit block counter; give each of the four
// parallel blocks its own successive counter value (with carry).
#define SSE2_EXPAND_S85(i)		\
	AS2(	mov		dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft)	\
	AS2(	mov		dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32)	\
	AS2(	add		REG_roundsLeft, 1)	\
	AS2(	adc		REG_temp32, 0)

		ASL(1)
		AS2(	mov		REG_roundsLeft, dword ptr [REG_state + 8*4])
		AS2(	mov		REG_temp32, dword ptr [REG_state + 5*4])
		SSE2_EXPAND_S85(0)
		SSE2_EXPAND_S85(1)
		SSE2_EXPAND_S85(2)
		SSE2_EXPAND_S85(3)
		// Write the advanced counter back to the state.
		AS2(	mov		dword ptr [REG_state + 8*4], REG_roundsLeft)
		AS2(	mov		dword ptr [REG_state + 5*4], REG_temp32)

// b ^= rotl(a + d, i) for the single-block path; xmm4/xmm5 are scratch.
#define SSE2_QUARTER_ROUND(a, b, d, i)		\
	AS2(	movdqa	xmm4, xmm##d)	\
	AS2(	paddd	xmm4, xmm##a)	\
	AS2(	movdqa	xmm5, xmm4)	\
	AS2(	pslld	xmm4, i)	\
	AS2(	psrld	xmm5, 32-i)	\
	AS2(	pxor	xmm##b, xmm4)	\
	AS2(	pxor	xmm##b, xmm5)

// One full Salsa20 quarter-round over workspace columns a,b,c,d, broken
// into 32 numbered micro-steps (each rotate done as shift-left, shift-
// right, two xors).  The steps are numbered so that two (X8) or four
// (X16) independent quarter-rounds can be interleaved instruction-by-
// instruction below to hide instruction latencies.  i=1 reads the
// freshly expanded input copy at offset 256; i=0 reads working values.
#define L01(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
#define L02(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	xmm##C, [SSE2_WORKSPACE + a*16 + i*256])
#define L03(A,B,C,D,a,b,c,d,i)		AS2(	paddd	xmm##A, xmm##C)
#define L04(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	xmm##B, xmm##A)
#define L05(A,B,C,D,a,b,c,d,i)		AS2(	pslld	xmm##A, 7)
#define L06(A,B,C,D,a,b,c,d,i)		AS2(	psrld	xmm##B, 32-7)
#define L07(A,B,C,D,a,b,c,d,i)		AS2(	pxor	xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
#define L08(A,B,C,D,a,b,c,d,i)		AS2(	pxor	xmm##A, xmm##B)
#define L09(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	[SSE2_WORKSPACE + b*16], xmm##A)
#define L10(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	xmm##B, xmm##A)
#define L11(A,B,C,D,a,b,c,d,i)		AS2(	paddd	xmm##A, xmm##C)
#define L12(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	xmm##D, xmm##A)
#define L13(A,B,C,D,a,b,c,d,i)		AS2(	pslld	xmm##A, 9)
#define L14(A,B,C,D,a,b,c,d,i)		AS2(	psrld	xmm##D, 32-9)
#define L15(A,B,C,D,a,b,c,d,i)		AS2(	pxor	xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
#define L16(A,B,C,D,a,b,c,d,i)		AS2(	pxor	xmm##A, xmm##D)
#define L17(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	[SSE2_WORKSPACE + c*16], xmm##A)
#define L18(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	xmm##D, xmm##A)
#define L19(A,B,C,D,a,b,c,d,i)		AS2(	paddd	xmm##A, xmm##B)
#define L20(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	xmm##B, xmm##A)
#define L21(A,B,C,D,a,b,c,d,i)		AS2(	pslld	xmm##A, 13)
#define L22(A,B,C,D,a,b,c,d,i)		AS2(	psrld	xmm##B, 32-13)
#define L23(A,B,C,D,a,b,c,d,i)		AS2(	pxor	xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
#define L24(A,B,C,D,a,b,c,d,i)		AS2(	pxor	xmm##A, xmm##B)
#define L25(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	[SSE2_WORKSPACE + d*16], xmm##A)
#define L26(A,B,C,D,a,b,c,d,i)		AS2(	paddd	xmm##A, xmm##D)
#define L27(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	xmm##D, xmm##A)
#define L28(A,B,C,D,a,b,c,d,i)		AS2(	pslld	xmm##A, 18)
#define L29(A,B,C,D,a,b,c,d,i)		AS2(	psrld	xmm##D, 32-18)
#define L30(A,B,C,D,a,b,c,d,i)		AS2(	pxor	xmm##A, xmm##C)
#define L31(A,B,C,D,a,b,c,d,i)		AS2(	pxor	xmm##A, xmm##D)
#define L32(A,B,C,D,a,b,c,d,i)		AS2(	movdqa	[SSE2_WORKSPACE + a*16], xmm##A)

// Two quarter-rounds interleaved (x86: only 8 xmm registers available).
#define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h)	\
	L01(0,1,2,3, a,b,c,d, i)	L01(4,5,6,7, e,f,g,h, i)	\
	L02(0,1,2,3, a,b,c,d, i)	L02(4,5,6,7, e,f,g,h, i)	\
	L03(0,1,2,3, a,b,c,d, i)	L03(4,5,6,7, e,f,g,h, i)	\
	L04(0,1,2,3, a,b,c,d, i)	L04(4,5,6,7, e,f,g,h, i)	\
	L05(0,1,2,3, a,b,c,d, i)	L05(4,5,6,7, e,f,g,h, i)	\
	L06(0,1,2,3, a,b,c,d, i)	L06(4,5,6,7, e,f,g,h, i)	\
	L07(0,1,2,3, a,b,c,d, i)	L07(4,5,6,7, e,f,g,h, i)	\
	L08(0,1,2,3, a,b,c,d, i)	L08(4,5,6,7, e,f,g,h, i)	\
	L09(0,1,2,3, a,b,c,d, i)	L09(4,5,6,7, e,f,g,h, i)	\
	L10(0,1,2,3, a,b,c,d, i)	L10(4,5,6,7, e,f,g,h, i)	\
	L11(0,1,2,3, a,b,c,d, i)	L11(4,5,6,7, e,f,g,h, i)	\
	L12(0,1,2,3, a,b,c,d, i)	L12(4,5,6,7, e,f,g,h, i)	\
	L13(0,1,2,3, a,b,c,d, i)	L13(4,5,6,7, e,f,g,h, i)	\
	L14(0,1,2,3, a,b,c,d, i)	L14(4,5,6,7, e,f,g,h, i)	\
	L15(0,1,2,3, a,b,c,d, i)	L15(4,5,6,7, e,f,g,h, i)	\
	L16(0,1,2,3, a,b,c,d, i)	L16(4,5,6,7, e,f,g,h, i)	\
	L17(0,1,2,3, a,b,c,d, i)	L17(4,5,6,7, e,f,g,h, i)	\
	L18(0,1,2,3, a,b,c,d, i)	L18(4,5,6,7, e,f,g,h, i)	\
	L19(0,1,2,3, a,b,c,d, i)	L19(4,5,6,7, e,f,g,h, i)	\
	L20(0,1,2,3, a,b,c,d, i)	L20(4,5,6,7, e,f,g,h, i)	\
	L21(0,1,2,3, a,b,c,d, i)	L21(4,5,6,7, e,f,g,h, i)	\
	L22(0,1,2,3, a,b,c,d, i)	L22(4,5,6,7, e,f,g,h, i)	\
	L23(0,1,2,3, a,b,c,d, i)	L23(4,5,6,7, e,f,g,h, i)	\
	L24(0,1,2,3, a,b,c,d, i)	L24(4,5,6,7, e,f,g,h, i)	\
	L25(0,1,2,3, a,b,c,d, i)	L25(4,5,6,7, e,f,g,h, i)	\
	L26(0,1,2,3, a,b,c,d, i)	L26(4,5,6,7, e,f,g,h, i)	\
	L27(0,1,2,3, a,b,c,d, i)	L27(4,5,6,7, e,f,g,h, i)	\
	L28(0,1,2,3, a,b,c,d, i)	L28(4,5,6,7, e,f,g,h, i)	\
	L29(0,1,2,3, a,b,c,d, i)	L29(4,5,6,7, e,f,g,h, i)	\
	L30(0,1,2,3, a,b,c,d, i)	L30(4,5,6,7, e,f,g,h, i)	\
	L31(0,1,2,3, a,b,c,d, i)	L31(4,5,6,7, e,f,g,h, i)	\
	L32(0,1,2,3, a,b,c,d, i)	L32(4,5,6,7, e,f,g,h, i)

// Four quarter-rounds interleaved (x64: xmm0-xmm15 available).
#define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H)	\
	L01(0,1,2,3, a,b,c,d, i)	L01(4,5,6,7, e,f,g,h, i)	L01(8,9,10,11, A,B,C,D, i)	L01(12,13,14,15, E,F,G,H, i)	\
	L02(0,1,2,3, a,b,c,d, i)	L02(4,5,6,7, e,f,g,h, i)	L02(8,9,10,11, A,B,C,D, i)	L02(12,13,14,15, E,F,G,H, i)	\
	L03(0,1,2,3, a,b,c,d, i)	L03(4,5,6,7, e,f,g,h, i)	L03(8,9,10,11, A,B,C,D, i)	L03(12,13,14,15, E,F,G,H, i)	\
	L04(0,1,2,3, a,b,c,d, i)	L04(4,5,6,7, e,f,g,h, i)	L04(8,9,10,11, A,B,C,D, i)	L04(12,13,14,15, E,F,G,H, i)	\
	L05(0,1,2,3, a,b,c,d, i)	L05(4,5,6,7, e,f,g,h, i)	L05(8,9,10,11, A,B,C,D, i)	L05(12,13,14,15, E,F,G,H, i)	\
	L06(0,1,2,3, a,b,c,d, i)	L06(4,5,6,7, e,f,g,h, i)	L06(8,9,10,11, A,B,C,D, i)	L06(12,13,14,15, E,F,G,H, i)	\
	L07(0,1,2,3, a,b,c,d, i)	L07(4,5,6,7, e,f,g,h, i)	L07(8,9,10,11, A,B,C,D, i)	L07(12,13,14,15, E,F,G,H, i)	\
	L08(0,1,2,3, a,b,c,d, i)	L08(4,5,6,7, e,f,g,h, i)	L08(8,9,10,11, A,B,C,D, i)	L08(12,13,14,15, E,F,G,H, i)	\
	L09(0,1,2,3, a,b,c,d, i)	L09(4,5,6,7, e,f,g,h, i)	L09(8,9,10,11, A,B,C,D, i)	L09(12,13,14,15, E,F,G,H, i)	\
	L10(0,1,2,3, a,b,c,d, i)	L10(4,5,6,7, e,f,g,h, i)	L10(8,9,10,11, A,B,C,D, i)	L10(12,13,14,15, E,F,G,H, i)	\
	L11(0,1,2,3, a,b,c,d, i)	L11(4,5,6,7, e,f,g,h, i)	L11(8,9,10,11, A,B,C,D, i)	L11(12,13,14,15, E,F,G,H, i)	\
	L12(0,1,2,3, a,b,c,d, i)	L12(4,5,6,7, e,f,g,h, i)	L12(8,9,10,11, A,B,C,D, i)	L12(12,13,14,15, E,F,G,H, i)	\
	L13(0,1,2,3, a,b,c,d, i)	L13(4,5,6,7, e,f,g,h, i)	L13(8,9,10,11, A,B,C,D, i)	L13(12,13,14,15, E,F,G,H, i)	\
	L14(0,1,2,3, a,b,c,d, i)	L14(4,5,6,7, e,f,g,h, i)	L14(8,9,10,11, A,B,C,D, i)	L14(12,13,14,15, E,F,G,H, i)	\
	L15(0,1,2,3, a,b,c,d, i)	L15(4,5,6,7, e,f,g,h, i)	L15(8,9,10,11, A,B,C,D, i)	L15(12,13,14,15, E,F,G,H, i)	\
	L16(0,1,2,3, a,b,c,d, i)	L16(4,5,6,7, e,f,g,h, i)	L16(8,9,10,11, A,B,C,D, i)	L16(12,13,14,15, E,F,G,H, i)	\
	L17(0,1,2,3, a,b,c,d, i)	L17(4,5,6,7, e,f,g,h, i)	L17(8,9,10,11, A,B,C,D, i)	L17(12,13,14,15, E,F,G,H, i)	\
	L18(0,1,2,3, a,b,c,d, i)	L18(4,5,6,7, e,f,g,h, i)	L18(8,9,10,11, A,B,C,D, i)	L18(12,13,14,15, E,F,G,H, i)	\
	L19(0,1,2,3, a,b,c,d, i)	L19(4,5,6,7, e,f,g,h, i)	L19(8,9,10,11, A,B,C,D, i)	L19(12,13,14,15, E,F,G,H, i)	\
	L20(0,1,2,3, a,b,c,d, i)	L20(4,5,6,7, e,f,g,h, i)	L20(8,9,10,11, A,B,C,D, i)	L20(12,13,14,15, E,F,G,H, i)	\
	L21(0,1,2,3, a,b,c,d, i)	L21(4,5,6,7, e,f,g,h, i)	L21(8,9,10,11, A,B,C,D, i)	L21(12,13,14,15, E,F,G,H, i)	\
	L22(0,1,2,3, a,b,c,d, i)	L22(4,5,6,7, e,f,g,h, i)	L22(8,9,10,11, A,B,C,D, i)	L22(12,13,14,15, E,F,G,H, i)	\
	L23(0,1,2,3, a,b,c,d, i)	L23(4,5,6,7, e,f,g,h, i)	L23(8,9,10,11, A,B,C,D, i)	L23(12,13,14,15, E,F,G,H, i)	\
	L24(0,1,2,3, a,b,c,d, i)	L24(4,5,6,7, e,f,g,h, i)	L24(8,9,10,11, A,B,C,D, i)	L24(12,13,14,15, E,F,G,H, i)	\
	L25(0,1,2,3, a,b,c,d, i)	L25(4,5,6,7, e,f,g,h, i)	L25(8,9,10,11, A,B,C,D, i)	L25(12,13,14,15, E,F,G,H, i)	\
	L26(0,1,2,3, a,b,c,d, i)	L26(4,5,6,7, e,f,g,h, i)	L26(8,9,10,11, A,B,C,D, i)	L26(12,13,14,15, E,F,G,H, i)	\
	L27(0,1,2,3, a,b,c,d, i)	L27(4,5,6,7, e,f,g,h, i)	L27(8,9,10,11, A,B,C,D, i)	L27(12,13,14,15, E,F,G,H, i)	\
	L28(0,1,2,3, a,b,c,d, i)	L28(4,5,6,7, e,f,g,h, i)	L28(8,9,10,11, A,B,C,D, i)	L28(12,13,14,15, E,F,G,H, i)	\
	L29(0,1,2,3, a,b,c,d, i)	L29(4,5,6,7, e,f,g,h, i)	L29(8,9,10,11, A,B,C,D, i)	L29(12,13,14,15, E,F,G,H, i)	\
	L30(0,1,2,3, a,b,c,d, i)	L30(4,5,6,7, e,f,g,h, i)	L30(8,9,10,11, A,B,C,D, i)	L30(12,13,14,15, E,F,G,H, i)	\
	L31(0,1,2,3, a,b,c,d, i)	L31(4,5,6,7, e,f,g,h, i)	L31(8,9,10,11, A,B,C,D, i)	L31(12,13,14,15, E,F,G,H, i)	\
	L32(0,1,2,3, a,b,c,d, i)	L32(4,5,6,7, e,f,g,h, i)	L32(8,9,10,11, A,B,C,D, i)	L32(12,13,14,15, E,F,G,H, i)

		// First round reads from the expanded copy (i=1).
#if CRYPTOPP_BOOL_X64
		SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
#else
		SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
		SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
#endif
		AS2(	mov		REG_roundsLeft, REG_rounds)
		ASJ(	jmp,	2, f)

		// Subroutine: transpose the four per-column results held in
		// xmm4..xmm7 back into per-block order and emit them (XORing
		// with input when input is non-NULL).
		ASL(SSE2_Salsa_Output)
		AS2(	movdqa		xmm0, xmm4)
		AS2(	punpckldq	xmm4, xmm5)
		AS2(	movdqa		xmm1, xmm6)
		AS2(	punpckldq	xmm6, xmm7)
		AS2(	movdqa		xmm2, xmm4)
		AS2(	punpcklqdq	xmm4, xmm6)
		AS2(	punpckhqdq	xmm2, xmm6)
		AS2(	punpckhdq	xmm0, xmm5)
		AS2(	punpckhdq	xmm1, xmm7)
		AS2(	movdqa		xmm6, xmm0)
		AS2(	punpcklqdq	xmm0, xmm1)
		AS2(	punpckhqdq	xmm6, xmm1)
		AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
		AS1(	ret)

		// Main round loop for the 4-block path: remaining rounds read
		// from the working copy (i=0); labels 6/2 alternate the column
		// and row (diagonal) double-round halves.
		ASL(6)
#if CRYPTOPP_BOOL_X64
		SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
		ASL(2)
		SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
#else
		SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
		SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
		ASL(2)
		SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
		SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
#endif
		AS2(	sub		REG_roundsLeft, 2)
		ASJ(	jnz,	6, b)

// Final feed-forward: add the saved input state (offset 256) to the
// round output for four state words and hand them to the output
// subroutine.
#define SSE2_OUTPUT_4(a, b, c, d)	\
	AS2(	movdqa		xmm4, [SSE2_WORKSPACE + a*16 + 256])\
	AS2(	paddd		xmm4, [SSE2_WORKSPACE + a*16])\
	AS2(	movdqa		xmm5, [SSE2_WORKSPACE + b*16 + 256])\
	AS2(	paddd		xmm5, [SSE2_WORKSPACE + b*16])\
	AS2(	movdqa		xmm6, [SSE2_WORKSPACE + c*16 + 256])\
	AS2(	paddd		xmm6, [SSE2_WORKSPACE + c*16])\
	AS2(	movdqa		xmm7, [SSE2_WORKSPACE + d*16 + 256])\
	AS2(	paddd		xmm7, [SSE2_WORKSPACE + d*16])\
	ASC(	call,		SSE2_Salsa_Output)

		SSE2_OUTPUT_4(0, 13, 10, 7)
		SSE2_OUTPUT_4(4, 1, 14, 11)
		SSE2_OUTPUT_4(8, 5, 2, 15)
		SSE2_OUTPUT_4(12, 9, 6, 3)
		// Advance input only if it is non-NULL (keystream-only mode
		// passes NULL input).
		AS2(	test	REG_input, REG_input)
		ASJ(	jz,		9, f)
		AS2(	add		REG_input, 12*16)
		ASL(9)
		AS2(	add		REG_output, 12*16)
		AS2(	sub		REG_iterationCount, 4)
		AS2(	cmp		REG_iterationCount, 4)
		ASJ(	jge,	1, b)
		// Restore esp saved before the 16-byte alignment (x86 only).
		AS_POP_IF86(	sp)

		// One-block-at-a-time path for the remaining (<4) iterations.
		ASL(5)
		AS2(	sub		REG_iterationCount, 1)
		ASJ(	jl,		4, f)
		AS2(	movdqa	xmm0, [REG_state + 0*16])
		AS2(	movdqa	xmm1, [REG_state + 1*16])
		AS2(	movdqa	xmm2, [REG_state + 2*16])
		AS2(	movdqa	xmm3, [REG_state + 3*16])
		AS2(	mov		REG_roundsLeft, REG_rounds)

		// Double round: quarter-rounds plus pshufd rotations to bring
		// the diagonal elements into column position and back.
		ASL(0)
		SSE2_QUARTER_ROUND(0, 1, 3, 7)
		SSE2_QUARTER_ROUND(1, 2, 0, 9)
		SSE2_QUARTER_ROUND(2, 3, 1, 13)
		SSE2_QUARTER_ROUND(3, 0, 2, 18)
		ASS(	pshufd	xmm1, xmm1, 2, 1, 0, 3)
		ASS(	pshufd	xmm2, xmm2, 1, 0, 3, 2)
		ASS(	pshufd	xmm3, xmm3, 0, 3, 2, 1)
		SSE2_QUARTER_ROUND(0, 3, 1, 7)
		SSE2_QUARTER_ROUND(3, 2, 0, 9)
		SSE2_QUARTER_ROUND(2, 1, 3, 13)
		SSE2_QUARTER_ROUND(1, 0, 2, 18)
		ASS(	pshufd	xmm1, xmm1, 0, 3, 2, 1)
		ASS(	pshufd	xmm2, xmm2, 1, 0, 3, 2)
		ASS(	pshufd	xmm3, xmm3, 2, 1, 0, 3)
		AS2(	sub		REG_roundsLeft, 2)
		ASJ(	jnz,	0, b)

		// Feed-forward: add the original state.
		AS2(	paddd	xmm0, [REG_state + 0*16])
		AS2(	paddd	xmm1, [REG_state + 1*16])
		AS2(	paddd	xmm2, [REG_state + 2*16])
		AS2(	paddd	xmm3, [REG_state + 3*16])

		// Bump the 64-bit block counter (words 8 low, 5 high).
		AS2(	add		dword ptr [REG_state + 8*4], 1)
		AS2(	adc		dword ptr [REG_state + 5*4], 0)

		// Build word-select masks in xmm6/xmm7 and shuffle the permuted
		// state words back into standard output order.
		AS2(	pcmpeqb	xmm6, xmm6)
		AS2(	psrlq	xmm6, 32)
		ASS(	pshufd	xmm7, xmm6, 0, 1, 2, 3)
		AS2(	movdqa	xmm4, xmm0)
		AS2(	movdqa	xmm5, xmm3)
		AS2(	pand	xmm0, xmm7)
		AS2(	pand	xmm4, xmm6)
		AS2(	pand	xmm3, xmm6)
		AS2(	pand	xmm5, xmm7)
		AS2(	por		xmm4, xmm5)
		AS2(	movdqa	xmm5, xmm1)
		AS2(	pand	xmm1, xmm7)
		AS2(	pand	xmm5, xmm6)
		AS2(	por		xmm0, xmm5)
		AS2(	pand	xmm6, xmm2)
		AS2(	pand	xmm2, xmm7)
		AS2(	por		xmm1, xmm6)
		AS2(	por		xmm2, xmm3)

		AS2(	movdqa	xmm5, xmm4)
		AS2(	movdqa	xmm6, xmm0)
		AS3(	shufpd	xmm4, xmm1, 2)
		AS3(	shufpd	xmm0, xmm2, 2)
		AS3(	shufpd	xmm1, xmm5, 2)
		AS3(	shufpd	xmm2, xmm6, 2)

		// Output the block and loop back for the next iteration.
		AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
		ASJ(	jmp,	5, b)
		ASL(4)

		AS_POP_IF86(	bp)
#ifdef __GNUC__
		AS_POP_IF86(	bx)
		".att_syntax prefix;"
			:
#if CRYPTOPP_BOOL_X64
			: "r" (m_rounds), "r" (input), "r" (iterationCount), "r" (m_state.data()), "r" (output), "r" (workspace.m_ptr)
			: "%eax", "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
#else
			: "d" (m_rounds), "a" (input), "c" (iterationCount), "S" (m_state.data()), "D" (output)
			: "memory", "cc"
#endif
		);
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
	movdqa	xmm6, [rsp + 0200h]
	movdqa	xmm7, [rsp + 0210h]
	movdqa	xmm8, [rsp + 0220h]
	movdqa	xmm9, [rsp + 0230h]
	movdqa	xmm10, [rsp + 0240h]
	movdqa	xmm11, [rsp + 0250h]
	movdqa	xmm12, [rsp + 0260h]
	movdqa	xmm13, [rsp + 0270h]
	movdqa	xmm14, [rsp + 0280h]
	movdqa	xmm15, [rsp + 0290h]
	add		rsp, 10*16 + 32*16 + 8
	ret
Salsa20_OperateKeystream ENDP
#else
	}
	else
#endif
#endif
#ifndef CRYPTOPP_GENERATE_X64_MASM
	{
		// Portable C++ fallback: one 64-byte block per loop iteration.
		word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;

		while (iterationCount--)
		{
			x0 = m_state[0];	x1 = m_state[1];	x2 = m_state[2];	x3 = m_state[3];
			x4 = m_state[4];	x5 = m_state[5];	x6 = m_state[6];	x7 = m_state[7];
			x8 = m_state[8];	x9 = m_state[9];	x10 = m_state[10];	x11 = m_state[11];
			x12 = m_state[12];	x13 = m_state[13];	x14 = m_state[14];	x15 = m_state[15];

			// Two rounds (one column round, one row round) per pass.
			for (int i=m_rounds; i>0; i-=2)
			{
// b ^= rotl(a+d,7); c ^= rotl(b+a,9); d ^= rotl(c+b,13); a ^= rotl(d+c,18)
#define QUARTER_ROUND(a, b, c, d)	\
	b = b ^ rotlFixed(a + d, 7);	\
	c = c ^ rotlFixed(b + a, 9);	\
	d = d ^ rotlFixed(c + b, 13);	\
	a = a ^ rotlFixed(d + c, 18);

				QUARTER_ROUND(x0, x4, x8, x12)
				QUARTER_ROUND(x1, x5, x9, x13)
				QUARTER_ROUND(x2, x6, x10, x14)
				QUARTER_ROUND(x3, x7, x11, x15)

				QUARTER_ROUND(x0, x13, x10, x7)
				QUARTER_ROUND(x1, x14, x11, x4)
				QUARTER_ROUND(x2, x15, x8, x5)
				QUARTER_ROUND(x3, x12, x9, x6)
			}

// Feed-forward and emit the block little-endian; the index pattern undoes
// the permuted layout this implementation keeps m_state in.
#define SALSA_OUTPUT(x)	{\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}

#ifndef CRYPTOPP_DOXYGEN_PROCESSING
			CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
#endif

			// Bump the 64-bit block counter (word 8 low, word 5 high).
			if (++m_state[8] == 0)
				++m_state[5];
		}
	}
}
00546
00547 void XSalsa20_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length)
00548 {
00549 m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
00550
00551 if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
00552 throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds);
00553
00554 GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length);
00555 if (length == 16)
00556 memcpy(m_key.begin()+4, m_key.begin(), 16);
00557
00558
00559 m_state[0] = 0x61707865;
00560 m_state[1] = 0x3320646e;
00561 m_state[2] = 0x79622d32;
00562 m_state[3] = 0x6b206574;
00563 }
00564
00565 void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
00566 {
00567 assert(length==24);
00568
00569 word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
00570
00571 GetBlock<word32, LittleEndian> get(IV);
00572 get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]);
00573
00574 x13 = m_key[0]; x10 = m_key[1]; x7 = m_key[2]; x4 = m_key[3];
00575 x15 = m_key[4]; x12 = m_key[5]; x9 = m_key[6]; x6 = m_key[7];
00576 x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
00577
00578 for (int i=m_rounds; i>0; i-=2)
00579 {
00580 QUARTER_ROUND(x0, x4, x8, x12)
00581 QUARTER_ROUND(x1, x5, x9, x13)
00582 QUARTER_ROUND(x2, x6, x10, x14)
00583 QUARTER_ROUND(x3, x7, x11, x15)
00584
00585 QUARTER_ROUND(x0, x13, x10, x7)
00586 QUARTER_ROUND(x1, x14, x11, x4)
00587 QUARTER_ROUND(x2, x15, x8, x5)
00588 QUARTER_ROUND(x3, x12, x9, x6)
00589 }
00590
00591 m_state[13] = x0; m_state[10] = x1; m_state[7] = x2; m_state[4] = x3;
00592 m_state[15] = x14; m_state[12] = x11; m_state[9] = x8; m_state[6] = x5;
00593 m_state[8] = m_state[5] = 0;
00594 }
00595
00596 NAMESPACE_END
00597
00598 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM