8 #include <botan/idea_sse2.h>    15 inline __m128i mul(__m128i X, 
u16bit K_16)
    17    const __m128i zeros = _mm_set1_epi16(0);
    18    const __m128i ones = _mm_set1_epi16(1);
    20    const __m128i K = _mm_set1_epi16(K_16);
    22    const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros);
    23    const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros);
    25    const __m128i mul_lo = _mm_mullo_epi16(X, K);
    26    const __m128i mul_hi = _mm_mulhi_epu16(X, K);
    28    __m128i T = _mm_sub_epi16(mul_lo, mul_hi);
    31    const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo);
    32    const __m128i cmp = _mm_min_epu8(
    33      _mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones);
    35    T = _mm_add_epi16(T, cmp);
    45       _mm_andnot_si128(X_is_zero, T),
    46       _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));
    49       _mm_andnot_si128(K_is_zero, T),
    50       _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero));
    /**
    * 4x8 transpose of 16-bit words, performed in place.
    *
    * Viewing B0..B3 on entry as 32 consecutive 16-bit words
    * w[0..31] (B0 holds w[0..7], B1 holds w[8..15], ...), on return
    * lane k of Bj holds w[4*k + j]. In other words Bj collects the
    * j-th word of each group of four consecutive words.
    *
    * @param B0 in: words 0..7,   out: words 0, 4, 8, ..., 28
    * @param B1 in: words 8..15,  out: words 1, 5, 9, ..., 29
    * @param B2 in: words 16..23, out: words 2, 6, 10, ..., 30
    * @param B3 in: words 24..31, out: words 3, 7, 11, ..., 31
    */
    void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
       {
       // Interleave 32-bit pairs of neighbouring registers
       __m128i T0 = _mm_unpackhi_epi32(B0, B1);
       __m128i T1 = _mm_unpacklo_epi32(B0, B1);
       __m128i T2 = _mm_unpackhi_epi32(B2, B3);
       __m128i T3 = _mm_unpacklo_epi32(B2, B3);

       // Second interleave: each register now holds the same word
       // pair (0,1 or 2,3) of four different groups
       __m128i T4 = _mm_unpacklo_epi32(T0, T1);
       __m128i T5 = _mm_unpackhi_epi32(T0, T1);
       __m128i T6 = _mm_unpacklo_epi32(T2, T3);
       __m128i T7 = _mm_unpackhi_epi32(T2, T3);

       // Reorder the 16-bit words inside the upper 64 bits ...
       T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2));
       T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2));
       T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2));
       T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2));

       // ... and inside the lower 64 bits
       T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2));
       T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2));
       T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2));
       T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2));

       // Swap the two middle 32-bit lanes so that each 64-bit half
       // carries a single word position
       T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
       T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
       T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
       T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));

       // Join matching 64-bit halves into the final registers
       B0 = _mm_unpacklo_epi64(T0, T2);
       B1 = _mm_unpackhi_epi64(T0, T2);
       B2 = _mm_unpacklo_epi64(T1, T3);
       B3 = _mm_unpackhi_epi64(T1, T3);
       }
    99 void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
   101    __m128i T0 = _mm_unpacklo_epi64(B0, B1);
   102    __m128i T1 = _mm_unpacklo_epi64(B2, B3);
   103    __m128i T2 = _mm_unpackhi_epi64(B0, B1);
   104    __m128i T3 = _mm_unpackhi_epi64(B2, B3);
   106    T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
   107    T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
   108    T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
   109    T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
   111    T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   112    T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   113    T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   114    T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
   116    T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   117    T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   118    T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   119    T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
   121    B0 = _mm_unpacklo_epi32(T0, T1);
   122    B1 = _mm_unpackhi_epi32(T0, T1);
   123    B2 = _mm_unpacklo_epi32(T2, T3);
   124    B3 = _mm_unpackhi_epi32(T2, T3);
   130 void idea_op_8(
const byte in[64], 
byte out[64], 
const u16bit EK[52])
   132    const __m128i* in_mm = 
reinterpret_cast<const __m128i*
>(in);
   134    __m128i B0 = _mm_loadu_si128(in_mm + 0);
   135    __m128i B1 = _mm_loadu_si128(in_mm + 1);
   136    __m128i B2 = _mm_loadu_si128(in_mm + 2);
   137    __m128i B3 = _mm_loadu_si128(in_mm + 3);
   139    transpose_in(B0, B1, B2, B3);
   142    B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
   143    B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
   144    B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
   145    B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
   147    for(
size_t i = 0; i != 8; ++i)
   149       B0 = mul(B0, EK[6*i+0]);
   150       B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1]));
   151       B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
   152       B3 = mul(B3, EK[6*i+3]);
   156       B2 = _mm_xor_si128(B2, B0);
   157       B2 = mul(B2, EK[6*i+4]);
   161       B1 = _mm_xor_si128(B1, B3);
   162       B1 = _mm_add_epi16(B1, B2);
   163       B1 = mul(B1, EK[6*i+5]);
   165       B2 = _mm_add_epi16(B2, B1);
   167       B0 = _mm_xor_si128(B0, B1);
   168       B1 = _mm_xor_si128(B1, T0);
   169       B3 = _mm_xor_si128(B3, B2);
   170       B2 = _mm_xor_si128(B2, T1);
   173    B0 = mul(B0, EK[48]);
   174    B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
   175    B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
   176    B3 = mul(B3, EK[51]);
   179    B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
   180    B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
   181    B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
   182    B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
   184    transpose_out(B0, B2, B1, B3);
   186    __m128i* out_mm = 
reinterpret_cast<__m128i*
>(out);
   188    _mm_storeu_si128(out_mm + 0, B0);
   189    _mm_storeu_si128(out_mm + 1, B2);
   190    _mm_storeu_si128(out_mm + 2, B1);
   191    _mm_storeu_si128(out_mm + 3, B3);
   205       idea_op_8(in, out, KS);
   224       idea_op_8(in, out, KS);
 
const SecureVector< u16bit > & get_EK() const
void encrypt_n(const byte in[], byte out[], size_t blocks) const
const SecureVector< u16bit > & get_DK() const
void decrypt_n(const byte in[], byte out[], size_t blocks) const
void encrypt_n(const byte in[], byte out[], size_t blocks) const
void decrypt_n(const byte in[], byte out[], size_t blocks) const