8 #include <botan/aes_ni.h>     9 #include <botan/loadstor.h>    10 #include <wmmintrin.h>    16 __m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
    18    key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));
    19    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    20    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    21    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    22    return _mm_xor_si128(key, key_with_rcon);
/*
* AES-192 key schedule step: advances the two 128-bit schedule state
* registers (*K1, *K2) and writes 6 32-bit subkey words to out.
* NOTE(review): the initial loads `key1 = *K1; key2 = *K2;`, the
* write-back to *K1/*K2, and the `last` early-return handling are not
* visible in this chunk — lines appear to be missing here; confirm
* against the full file before relying on this view.
*/
    25 void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
   // Broadcast word 1 of the keygenassist result; for a 192-bit key
   // the rotated/substituted word lands in lane 1, not lane 3.
    31    key2_with_rcon  = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
   // Prefix-XOR of key1's words via three shift+xor steps, then fold
   // in the rcon value — standard AES key-schedule sliding XOR.
    32    key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
    33    key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
    34    key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
    35    key1 = _mm_xor_si128(key1, key2_with_rcon);
   // First four subkey words of this step go straight to the output
    38    _mm_storeu_si128((__m128i*)out, key1);
   // Advance key2 by one word: xor with its own shifted copy, then
   // with the broadcast of key1's new top word.
    43    key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
    44    key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));
   // Only the low two 32-bit words of key2 belong to this 6-word step
    47    out[4] = _mm_cvtsi128_si32(key2);
    48    out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
    54 __m128i aes_256_key_expansion(__m128i key, __m128i key2)
    56    __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
    57    key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));
    59    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    60    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    61    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    62    return _mm_xor_si128(key, key_with_rcon);
/*
* Round macros: each applies one AES round (enc/dec, middle/last) to
* the four in-flight blocks B0..B3 with round key K. Four blocks are
* pipelined per iteration to hide the aesenc/aesdec latency.
*
* AES-128 encryption follows: the key schedule EK is loaded into
* registers once, blocks are processed four at a time, then a scalar
* tail loop handles the remainder.
* NOTE(review): this chunk is garbled — the 4-block loop header and
* the AES_ENC_4_ROUNDS(K1..K9)/AES_ENC_4_LAST_ROUNDS(K10) calls
* between the whitening XORs and the stores are missing from view.
*/
   67 #define AES_ENC_4_ROUNDS(K)                     \    70       B0 = _mm_aesenc_si128(B0, K);             \    71       B1 = _mm_aesenc_si128(B1, K);             \    72       B2 = _mm_aesenc_si128(B2, K);             \    73       B3 = _mm_aesenc_si128(B3, K);             \    76 #define AES_ENC_4_LAST_ROUNDS(K)                \    79       B0 = _mm_aesenclast_si128(B0, K);         \    80       B1 = _mm_aesenclast_si128(B1, K);         \    81       B2 = _mm_aesenclast_si128(B2, K);         \    82       B3 = _mm_aesenclast_si128(B3, K);         \    85 #define AES_DEC_4_ROUNDS(K)                     \    88       B0 = _mm_aesdec_si128(B0, K);             \    89       B1 = _mm_aesdec_si128(B1, K);             \    90       B2 = _mm_aesdec_si128(B2, K);             \    91       B3 = _mm_aesdec_si128(B3, K);             \    94 #define AES_DEC_4_LAST_ROUNDS(K)                \    97       B0 = _mm_aesdeclast_si128(B0, K);         \    98       B1 = _mm_aesdeclast_si128(B1, K);         \    99       B2 = _mm_aesdeclast_si128(B2, K);         \   100       B3 = _mm_aesdeclast_si128(B3, K);         \   108    const __m128i* in_mm = (
const __m128i*)in;
// View the byte buffers as arrays of 128-bit blocks; unaligned
// loads/stores are used throughout, so no alignment is required.
   109    __m128i* out_mm = (__m128i*)out;
   111    const __m128i* key_mm = (
const __m128i*)&EK[0];
// Load all 11 round keys of the AES-128 schedule up front
   113    __m128i K0  = _mm_loadu_si128(key_mm);
   114    __m128i K1  = _mm_loadu_si128(key_mm + 1);
   115    __m128i K2  = _mm_loadu_si128(key_mm + 2);
   116    __m128i K3  = _mm_loadu_si128(key_mm + 3);
   117    __m128i K4  = _mm_loadu_si128(key_mm + 4);
   118    __m128i K5  = _mm_loadu_si128(key_mm + 5);
   119    __m128i K6  = _mm_loadu_si128(key_mm + 6);
   120    __m128i K7  = _mm_loadu_si128(key_mm + 7);
   121    __m128i K8  = _mm_loadu_si128(key_mm + 8);
   122    __m128i K9  = _mm_loadu_si128(key_mm + 9);
   123    __m128i K10 = _mm_loadu_si128(key_mm + 10);
// 4-block path: load four plaintext blocks
   127       __m128i B0 = _mm_loadu_si128(in_mm + 0);
   128       __m128i B1 = _mm_loadu_si128(in_mm + 1);
   129       __m128i B2 = _mm_loadu_si128(in_mm + 2);
   130       __m128i B3 = _mm_loadu_si128(in_mm + 3);
// Initial whitening: XOR each block with round key 0
   132       B0 = _mm_xor_si128(B0, K0);
   133       B1 = _mm_xor_si128(B1, K0);
   134       B2 = _mm_xor_si128(B2, K0);
   135       B3 = _mm_xor_si128(B3, K0);
// Store the four ciphertext blocks (rounds applied above, per NOTE)
   148       _mm_storeu_si128(out_mm + 0, B0);
   149       _mm_storeu_si128(out_mm + 1, B1);
   150       _mm_storeu_si128(out_mm + 2, B2);
   151       _mm_storeu_si128(out_mm + 3, B3);
// Scalar tail: encrypt any remaining blocks one at a time
   158    for(
size_t i = 0; i != blocks; ++i)
   160       __m128i B = _mm_loadu_si128(in_mm + i);
   162       B = _mm_xor_si128(B, K0);
// Rounds 1-9, then the final round with K10 (no MixColumns)
   164       B = _mm_aesenc_si128(B, K1);
   165       B = _mm_aesenc_si128(B, K2);
   166       B = _mm_aesenc_si128(B, K3);
   167       B = _mm_aesenc_si128(B, K4);
   168       B = _mm_aesenc_si128(B, K5);
   169       B = _mm_aesenc_si128(B, K6);
   170       B = _mm_aesenc_si128(B, K7);
   171       B = _mm_aesenc_si128(B, K8);
   172       B = _mm_aesenc_si128(B, K9);
   173       B = _mm_aesenclast_si128(B, K10);
   175       _mm_storeu_si128(out_mm + i, B);
/*
* AES-128 decryption with AES-NI. DK holds the decryption schedule:
* the encryption keys in reverse order with InvMixColumns (aesimc)
* applied to the inner keys, as required by the aesdec instruction's
* equivalent-inverse-cipher form.
* NOTE(review): the 4-block loop header and the AES_DEC_4_ROUNDS /
* AES_DEC_4_LAST_ROUNDS applications are missing from this view.
*/
   184    const __m128i* in_mm = (
const __m128i*)in;
   185    __m128i* out_mm = (__m128i*)out;
   187    const __m128i* key_mm = (
const __m128i*)&DK[0];
// Load all 11 decryption round keys up front
   189    __m128i K0  = _mm_loadu_si128(key_mm);
   190    __m128i K1  = _mm_loadu_si128(key_mm + 1);
   191    __m128i K2  = _mm_loadu_si128(key_mm + 2);
   192    __m128i K3  = _mm_loadu_si128(key_mm + 3);
   193    __m128i K4  = _mm_loadu_si128(key_mm + 4);
   194    __m128i K5  = _mm_loadu_si128(key_mm + 5);
   195    __m128i K6  = _mm_loadu_si128(key_mm + 6);
   196    __m128i K7  = _mm_loadu_si128(key_mm + 7);
   197    __m128i K8  = _mm_loadu_si128(key_mm + 8);
   198    __m128i K9  = _mm_loadu_si128(key_mm + 9);
   199    __m128i K10 = _mm_loadu_si128(key_mm + 10);
// 4-block path: load four ciphertext blocks and whiten with K0
   203       __m128i B0 = _mm_loadu_si128(in_mm + 0);
   204       __m128i B1 = _mm_loadu_si128(in_mm + 1);
   205       __m128i B2 = _mm_loadu_si128(in_mm + 2);
   206       __m128i B3 = _mm_loadu_si128(in_mm + 3);
   208       B0 = _mm_xor_si128(B0, K0);
   209       B1 = _mm_xor_si128(B1, K0);
   210       B2 = _mm_xor_si128(B2, K0);
   211       B3 = _mm_xor_si128(B3, K0);
   224       _mm_storeu_si128(out_mm + 0, B0);
   225       _mm_storeu_si128(out_mm + 1, B1);
   226       _mm_storeu_si128(out_mm + 2, B2);
   227       _mm_storeu_si128(out_mm + 3, B3);
// Scalar tail: decrypt any remaining blocks one at a time
   234    for(
size_t i = 0; i != blocks; ++i)
   236       __m128i B = _mm_loadu_si128(in_mm + i);
   238       B = _mm_xor_si128(B, K0);
// Inverse rounds 1-9, then the final inverse round with K10
   240       B = _mm_aesdec_si128(B, K1);
   241       B = _mm_aesdec_si128(B, K2);
   242       B = _mm_aesdec_si128(B, K3);
   243       B = _mm_aesdec_si128(B, K4);
   244       B = _mm_aesdec_si128(B, K5);
   245       B = _mm_aesdec_si128(B, K6);
   246       B = _mm_aesdec_si128(B, K7);
   247       B = _mm_aesdec_si128(B, K8);
   248       B = _mm_aesdec_si128(B, K9);
   249       B = _mm_aesdeclast_si128(B, K10);
   251       _mm_storeu_si128(out_mm + i, B);
/*
* AES-128 key schedule using AES-NI. Expands the 16-byte user key into
* the 11-round-key encryption schedule EK and the matching decryption
* schedule DK (EK reversed, with aesimc applied to the inner keys).
* NOTE(review): the derivations of K1..K10 via AES_128_key_exp with
* rcons 0x01..0x36 are missing from this view — lines were dropped
* between the K0 load and the EK stores.
*/
   258 void AES_128_NI::key_schedule(
const byte key[], 
size_t)
   260    #define AES_128_key_exp(K, RCON) \   261       aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))   263    __m128i K0  = _mm_loadu_si128((
const __m128i*)(key));
// Store the encryption schedule: round keys 0..10 in order
   275    __m128i* EK_mm = (__m128i*)&EK[0];
   276    _mm_storeu_si128(EK_mm     , K0);
   277    _mm_storeu_si128(EK_mm +  1, K1);
   278    _mm_storeu_si128(EK_mm +  2, K2);
   279    _mm_storeu_si128(EK_mm +  3, K3);
   280    _mm_storeu_si128(EK_mm +  4, K4);
   281    _mm_storeu_si128(EK_mm +  5, K5);
   282    _mm_storeu_si128(EK_mm +  6, K6);
   283    _mm_storeu_si128(EK_mm +  7, K7);
   284    _mm_storeu_si128(EK_mm +  8, K8);
   285    _mm_storeu_si128(EK_mm +  9, K9);
   286    _mm_storeu_si128(EK_mm + 10, K10);
// Decryption schedule: keys in reverse order; first and last are used
// as-is, inner keys pass through aesimc (InvMixColumns) for aesdec
   290    __m128i* DK_mm = (__m128i*)&DK[0];
   291    _mm_storeu_si128(DK_mm     , K10);
   292    _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K9));
   293    _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K8));
   294    _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K7));
   295    _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K6));
   296    _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K5));
   297    _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K4));
   298    _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K3));
   299    _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K2));
   300    _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K1));
   301    _mm_storeu_si128(DK_mm + 10, K0);
/*
* AES-192 encryption with AES-NI: 13 round keys, 12 aesenc rounds plus
* a final aesenclast. Same structure as the AES-128 path: four blocks
* pipelined per iteration, then a scalar tail.
* NOTE(review): the 4-block loop header and the AES_ENC_4_ROUNDS /
* AES_ENC_4_LAST_ROUNDS applications are missing from this view.
*/
   318    const __m128i* in_mm = (
const __m128i*)in;
   319    __m128i* out_mm = (__m128i*)out;
   321    const __m128i* key_mm = (
const __m128i*)&EK[0];
// Load all 13 round keys of the AES-192 schedule up front
   323    __m128i K0  = _mm_loadu_si128(key_mm);
   324    __m128i K1  = _mm_loadu_si128(key_mm + 1);
   325    __m128i K2  = _mm_loadu_si128(key_mm + 2);
   326    __m128i K3  = _mm_loadu_si128(key_mm + 3);
   327    __m128i K4  = _mm_loadu_si128(key_mm + 4);
   328    __m128i K5  = _mm_loadu_si128(key_mm + 5);
   329    __m128i K6  = _mm_loadu_si128(key_mm + 6);
   330    __m128i K7  = _mm_loadu_si128(key_mm + 7);
   331    __m128i K8  = _mm_loadu_si128(key_mm + 8);
   332    __m128i K9  = _mm_loadu_si128(key_mm + 9);
   333    __m128i K10 = _mm_loadu_si128(key_mm + 10);
   334    __m128i K11 = _mm_loadu_si128(key_mm + 11);
   335    __m128i K12 = _mm_loadu_si128(key_mm + 12);
// 4-block path: load four blocks and whiten with K0
   339       __m128i B0 = _mm_loadu_si128(in_mm + 0);
   340       __m128i B1 = _mm_loadu_si128(in_mm + 1);
   341       __m128i B2 = _mm_loadu_si128(in_mm + 2);
   342       __m128i B3 = _mm_loadu_si128(in_mm + 3);
   344       B0 = _mm_xor_si128(B0, K0);
   345       B1 = _mm_xor_si128(B1, K0);
   346       B2 = _mm_xor_si128(B2, K0);
   347       B3 = _mm_xor_si128(B3, K0);
   362       _mm_storeu_si128(out_mm + 0, B0);
   363       _mm_storeu_si128(out_mm + 1, B1);
   364       _mm_storeu_si128(out_mm + 2, B2);
   365       _mm_storeu_si128(out_mm + 3, B3);
// Scalar tail: encrypt remaining blocks one at a time
   372    for(
size_t i = 0; i != blocks; ++i)
   374       __m128i B = _mm_loadu_si128(in_mm + i);
   376       B = _mm_xor_si128(B, K0);
// Rounds 1-11, then the final round with K12
   378       B = _mm_aesenc_si128(B, K1);
   379       B = _mm_aesenc_si128(B, K2);
   380       B = _mm_aesenc_si128(B, K3);
   381       B = _mm_aesenc_si128(B, K4);
   382       B = _mm_aesenc_si128(B, K5);
   383       B = _mm_aesenc_si128(B, K6);
   384       B = _mm_aesenc_si128(B, K7);
   385       B = _mm_aesenc_si128(B, K8);
   386       B = _mm_aesenc_si128(B, K9);
   387       B = _mm_aesenc_si128(B, K10);
   388       B = _mm_aesenc_si128(B, K11);
   389       B = _mm_aesenclast_si128(B, K12);
   391       _mm_storeu_si128(out_mm + i, B);
/*
* AES-192 decryption with AES-NI. DK is the reversed schedule with
* aesimc applied to inner keys (see key_schedule).
* NOTE(review): the 4-block loop header and the AES_DEC_4_ROUNDS /
* AES_DEC_4_LAST_ROUNDS applications are missing from this view.
*/
   400    const __m128i* in_mm = (
const __m128i*)in;
   401    __m128i* out_mm = (__m128i*)out;
   403    const __m128i* key_mm = (
const __m128i*)&DK[0];
// Load all 13 decryption round keys up front
   405    __m128i K0  = _mm_loadu_si128(key_mm);
   406    __m128i K1  = _mm_loadu_si128(key_mm + 1);
   407    __m128i K2  = _mm_loadu_si128(key_mm + 2);
   408    __m128i K3  = _mm_loadu_si128(key_mm + 3);
   409    __m128i K4  = _mm_loadu_si128(key_mm + 4);
   410    __m128i K5  = _mm_loadu_si128(key_mm + 5);
   411    __m128i K6  = _mm_loadu_si128(key_mm + 6);
   412    __m128i K7  = _mm_loadu_si128(key_mm + 7);
   413    __m128i K8  = _mm_loadu_si128(key_mm + 8);
   414    __m128i K9  = _mm_loadu_si128(key_mm + 9);
   415    __m128i K10 = _mm_loadu_si128(key_mm + 10);
   416    __m128i K11 = _mm_loadu_si128(key_mm + 11);
   417    __m128i K12 = _mm_loadu_si128(key_mm + 12);
// 4-block path: load four ciphertext blocks and whiten with K0
   421       __m128i B0 = _mm_loadu_si128(in_mm + 0);
   422       __m128i B1 = _mm_loadu_si128(in_mm + 1);
   423       __m128i B2 = _mm_loadu_si128(in_mm + 2);
   424       __m128i B3 = _mm_loadu_si128(in_mm + 3);
   426       B0 = _mm_xor_si128(B0, K0);
   427       B1 = _mm_xor_si128(B1, K0);
   428       B2 = _mm_xor_si128(B2, K0);
   429       B3 = _mm_xor_si128(B3, K0);
   444       _mm_storeu_si128(out_mm + 0, B0);
   445       _mm_storeu_si128(out_mm + 1, B1);
   446       _mm_storeu_si128(out_mm + 2, B2);
   447       _mm_storeu_si128(out_mm + 3, B3);
// Scalar tail: decrypt remaining blocks one at a time
   454    for(
size_t i = 0; i != blocks; ++i)
   456       __m128i B = _mm_loadu_si128(in_mm + i);
   458       B = _mm_xor_si128(B, K0);
// Inverse rounds 1-11, then the final inverse round with K12
   460       B = _mm_aesdec_si128(B, K1);
   461       B = _mm_aesdec_si128(B, K2);
   462       B = _mm_aesdec_si128(B, K3);
   463       B = _mm_aesdec_si128(B, K4);
   464       B = _mm_aesdec_si128(B, K5);
   465       B = _mm_aesdec_si128(B, K6);
   466       B = _mm_aesdec_si128(B, K7);
   467       B = _mm_aesdec_si128(B, K8);
   468       B = _mm_aesdec_si128(B, K9);
   469       B = _mm_aesdec_si128(B, K10);
   470       B = _mm_aesdec_si128(B, K11);
   471       B = _mm_aesdeclast_si128(B, K12);
   473       _mm_storeu_si128(out_mm + i, B);
/*
* AES-192 key schedule using AES-NI. The 24-byte key is loaded as one
* full __m128i (K0) plus the high 8 bytes (K1, shifted down so the
* two key words sit in the low lanes). The AES_192_key_exp macro then
* expands 6 subkey words per step into EK.
* NOTE(review): the actual AES_192_key_exp invocations (rcons
* 0x01..0x80) that fill EK are missing from this view — lines were
* dropped between the macro definition and the DK construction.
*/
   480 void AES_192_NI::key_schedule(
const byte key[], 
size_t)
   482    __m128i K0 = _mm_loadu_si128((
const __m128i*)(key));
   483    __m128i K1 = _mm_loadu_si128((
const __m128i*)(key + 8));
// Discard the low 8 bytes (already in K0); keep key bytes 16..23
   484    K1 = _mm_srli_si128(K1, 8);
   488 #define AES_192_key_exp(RCON, EK_OFF)                         \   489    aes_192_key_expansion(&K0, &K1,                            \   490                          _mm_aeskeygenassist_si128(K1, RCON), \   491                          EK + EK_OFF, EK_OFF == 48)   503    const __m128i* EK_mm = (
const __m128i*)&EK[0];
// Decryption schedule: EK reversed; first/last keys used as-is,
// inner keys passed through aesimc (InvMixColumns) for aesdec
   504    __m128i* DK_mm = (__m128i*)&DK[0];
   505    _mm_storeu_si128(DK_mm     , _mm_loadu_si128(EK_mm + 12));
   506    _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
   507    _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
   508    _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
   509    _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
   510    _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
   511    _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
   512    _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
   513    _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
   514    _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
   515    _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
   516    _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
   517    _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
/*
* AES-256 encryption with AES-NI: 15 round keys, 14 aesenc rounds plus
* a final aesenclast. Same structure as the 128/192 paths.
* NOTE(review): the 4-block loop header and the AES_ENC_4_ROUNDS /
* AES_ENC_4_LAST_ROUNDS applications are missing from this view.
*/
   534    const __m128i* in_mm = (
const __m128i*)in;
   535    __m128i* out_mm = (__m128i*)out;
   537    const __m128i* key_mm = (
const __m128i*)&EK[0];
// Load all 15 round keys of the AES-256 schedule up front
   539    __m128i K0  = _mm_loadu_si128(key_mm);
   540    __m128i K1  = _mm_loadu_si128(key_mm + 1);
   541    __m128i K2  = _mm_loadu_si128(key_mm + 2);
   542    __m128i K3  = _mm_loadu_si128(key_mm + 3);
   543    __m128i K4  = _mm_loadu_si128(key_mm + 4);
   544    __m128i K5  = _mm_loadu_si128(key_mm + 5);
   545    __m128i K6  = _mm_loadu_si128(key_mm + 6);
   546    __m128i K7  = _mm_loadu_si128(key_mm + 7);
   547    __m128i K8  = _mm_loadu_si128(key_mm + 8);
   548    __m128i K9  = _mm_loadu_si128(key_mm + 9);
   549    __m128i K10 = _mm_loadu_si128(key_mm + 10);
   550    __m128i K11 = _mm_loadu_si128(key_mm + 11);
   551    __m128i K12 = _mm_loadu_si128(key_mm + 12);
   552    __m128i K13 = _mm_loadu_si128(key_mm + 13);
   553    __m128i K14 = _mm_loadu_si128(key_mm + 14);
// 4-block path: load four blocks and whiten with K0
   557       __m128i B0 = _mm_loadu_si128(in_mm + 0);
   558       __m128i B1 = _mm_loadu_si128(in_mm + 1);
   559       __m128i B2 = _mm_loadu_si128(in_mm + 2);
   560       __m128i B3 = _mm_loadu_si128(in_mm + 3);
   562       B0 = _mm_xor_si128(B0, K0);
   563       B1 = _mm_xor_si128(B1, K0);
   564       B2 = _mm_xor_si128(B2, K0);
   565       B3 = _mm_xor_si128(B3, K0);
   582       _mm_storeu_si128(out_mm + 0, B0);
   583       _mm_storeu_si128(out_mm + 1, B1);
   584       _mm_storeu_si128(out_mm + 2, B2);
   585       _mm_storeu_si128(out_mm + 3, B3);
// Scalar tail: encrypt remaining blocks one at a time
   592    for(
size_t i = 0; i != blocks; ++i)
   594       __m128i B = _mm_loadu_si128(in_mm + i);
   596       B = _mm_xor_si128(B, K0);
// Rounds 1-13, then the final round with K14
   598       B = _mm_aesenc_si128(B, K1);
   599       B = _mm_aesenc_si128(B, K2);
   600       B = _mm_aesenc_si128(B, K3);
   601       B = _mm_aesenc_si128(B, K4);
   602       B = _mm_aesenc_si128(B, K5);
   603       B = _mm_aesenc_si128(B, K6);
   604       B = _mm_aesenc_si128(B, K7);
   605       B = _mm_aesenc_si128(B, K8);
   606       B = _mm_aesenc_si128(B, K9);
   607       B = _mm_aesenc_si128(B, K10);
   608       B = _mm_aesenc_si128(B, K11);
   609       B = _mm_aesenc_si128(B, K12);
   610       B = _mm_aesenc_si128(B, K13);
   611       B = _mm_aesenclast_si128(B, K14);
   613       _mm_storeu_si128(out_mm + i, B);
/*
* AES-256 decryption with AES-NI. DK is the reversed schedule with
* aesimc applied to inner keys (see key_schedule).
* NOTE(review): the 4-block loop header and the AES_DEC_4_ROUNDS /
* AES_DEC_4_LAST_ROUNDS applications are missing from this view.
*/
   622    const __m128i* in_mm = (
const __m128i*)in;
   623    __m128i* out_mm = (__m128i*)out;
   625    const __m128i* key_mm = (
const __m128i*)&DK[0];
// Load all 15 decryption round keys up front
   627    __m128i K0  = _mm_loadu_si128(key_mm);
   628    __m128i K1  = _mm_loadu_si128(key_mm + 1);
   629    __m128i K2  = _mm_loadu_si128(key_mm + 2);
   630    __m128i K3  = _mm_loadu_si128(key_mm + 3);
   631    __m128i K4  = _mm_loadu_si128(key_mm + 4);
   632    __m128i K5  = _mm_loadu_si128(key_mm + 5);
   633    __m128i K6  = _mm_loadu_si128(key_mm + 6);
   634    __m128i K7  = _mm_loadu_si128(key_mm + 7);
   635    __m128i K8  = _mm_loadu_si128(key_mm + 8);
   636    __m128i K9  = _mm_loadu_si128(key_mm + 9);
   637    __m128i K10 = _mm_loadu_si128(key_mm + 10);
   638    __m128i K11 = _mm_loadu_si128(key_mm + 11);
   639    __m128i K12 = _mm_loadu_si128(key_mm + 12);
   640    __m128i K13 = _mm_loadu_si128(key_mm + 13);
   641    __m128i K14 = _mm_loadu_si128(key_mm + 14);
// 4-block path: load four ciphertext blocks and whiten with K0
   645       __m128i B0 = _mm_loadu_si128(in_mm + 0);
   646       __m128i B1 = _mm_loadu_si128(in_mm + 1);
   647       __m128i B2 = _mm_loadu_si128(in_mm + 2);
   648       __m128i B3 = _mm_loadu_si128(in_mm + 3);
   650       B0 = _mm_xor_si128(B0, K0);
   651       B1 = _mm_xor_si128(B1, K0);
   652       B2 = _mm_xor_si128(B2, K0);
   653       B3 = _mm_xor_si128(B3, K0);
   670       _mm_storeu_si128(out_mm + 0, B0);
   671       _mm_storeu_si128(out_mm + 1, B1);
   672       _mm_storeu_si128(out_mm + 2, B2);
   673       _mm_storeu_si128(out_mm + 3, B3);
// Scalar tail: decrypt remaining blocks one at a time
   680    for(
size_t i = 0; i != blocks; ++i)
   682       __m128i B = _mm_loadu_si128(in_mm + i);
   684       B = _mm_xor_si128(B, K0);
// Inverse rounds 1-13, then the final inverse round with K14
   686       B = _mm_aesdec_si128(B, K1);
   687       B = _mm_aesdec_si128(B, K2);
   688       B = _mm_aesdec_si128(B, K3);
   689       B = _mm_aesdec_si128(B, K4);
   690       B = _mm_aesdec_si128(B, K5);
   691       B = _mm_aesdec_si128(B, K6);
   692       B = _mm_aesdec_si128(B, K7);
   693       B = _mm_aesdec_si128(B, K8);
   694       B = _mm_aesdec_si128(B, K9);
   695       B = _mm_aesdec_si128(B, K10);
   696       B = _mm_aesdec_si128(B, K11);
   697       B = _mm_aesdec_si128(B, K12);
   698       B = _mm_aesdec_si128(B, K13);
   699       B = _mm_aesdeclast_si128(B, K14);
   701       _mm_storeu_si128(out_mm + i, B);
   708 void AES_256_NI::key_schedule(
const byte key[], 
size_t)
   710    __m128i K0 = _mm_loadu_si128((
const __m128i*)(key));
   711    __m128i K1 = _mm_loadu_si128((
const __m128i*)(key + 16));
   713    __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
   714    __m128i K3 = aes_256_key_expansion(K1, K2);
   716    __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
   717    __m128i K5 = aes_256_key_expansion(K3, K4);
   719    __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
   720    __m128i K7 = aes_256_key_expansion(K5, K6);
   722    __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
   723    __m128i K9 = aes_256_key_expansion(K7, K8);
   725    __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
   726    __m128i K11 = aes_256_key_expansion(K9, K10);
   728    __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
   729    __m128i K13 = aes_256_key_expansion(K11, K12);
   731    __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
   733    __m128i* EK_mm = (__m128i*)&EK[0];
   734    _mm_storeu_si128(EK_mm     , K0);
   735    _mm_storeu_si128(EK_mm +  1, K1);
   736    _mm_storeu_si128(EK_mm +  2, K2);
   737    _mm_storeu_si128(EK_mm +  3, K3);
   738    _mm_storeu_si128(EK_mm +  4, K4);
   739    _mm_storeu_si128(EK_mm +  5, K5);
   740    _mm_storeu_si128(EK_mm +  6, K6);
   741    _mm_storeu_si128(EK_mm +  7, K7);
   742    _mm_storeu_si128(EK_mm +  8, K8);
   743    _mm_storeu_si128(EK_mm +  9, K9);
   744    _mm_storeu_si128(EK_mm + 10, K10);
   745    _mm_storeu_si128(EK_mm + 11, K11);
   746    _mm_storeu_si128(EK_mm + 12, K12);
   747    _mm_storeu_si128(EK_mm + 13, K13);
   748    _mm_storeu_si128(EK_mm + 14, K14);
   752    __m128i* DK_mm = (__m128i*)&DK[0];
   753    _mm_storeu_si128(DK_mm     , K14);
   754    _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K13));
   755    _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K12));
   756    _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K11));
   757    _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K10));
   758    _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K9));
   759    _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K8));
   760    _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K7));
   761    _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K6));
   762    _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K5));
   763    _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
   764    _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
   765    _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
   766    _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
   767    _mm_storeu_si128(DK_mm + 14, K0);
 
void encrypt_n(const byte in[], byte out[], size_t blocks) const
#define AES_DEC_4_LAST_ROUNDS(K)
T load_le(const byte in[], size_t off)
#define AES_192_key_exp(RCON, EK_OFF)
void decrypt_n(const byte in[], byte out[], size_t blocks) const
void decrypt_n(const byte in[], byte out[], size_t blocks) const
#define AES_ENC_4_ROUNDS(K)
#define AES_DEC_4_ROUNDS(K)
#define AES_ENC_4_LAST_ROUNDS(K)
void encrypt_n(const byte in[], byte out[], size_t blocks) const
#define AES_128_key_exp(K, RCON)
void encrypt_n(const byte in[], byte out[], size_t blocks) const
void decrypt_n(const byte in[], byte out[], size_t blocks) const
void zeroise(MemoryRegion< T > &vec)