C++ AES-NI decrypting a 256 bit block

664 Views Asked by At

So here's my problem.. I am writing out an AES-NI implementation for a library, and I am stuck on the decryption of a 256 bit block. Here's what I know.. The 128 bit block works perfectly. The encryption of a 256 block aligns with a proven Rijndael implementation. The expanded key also aligns with the other Rijndael implementation (allowing for the little endian byte order). The routine uses a blend and shift mask to compensate for the offset column shuffle of a 256 bit block, it is the inverse of the mask used to encrypt the block, this also tested and appears to be working fine. Here is the encrypt function:

    void Encrypt32(const std::vector<byte> &Input, const size_t InOffset, std::vector<byte> &Output, const size_t OutOffset)
{
    const size_t LRD = m_expKey.size() - 3;
    size_t keyCtr = 0;
    __m128i RIJNDAEL256_MASK = { 0,1,6,7,4,5,10,11,8,9,14,15,12,13,2,3 };
    __m128i BLEND_MASK = _mm_set_epi32(0x80000000, 0x80800000, 0x80800000, 0x80808000);
    __m128i block1 = _mm_loadu_si128((const __m128i*)(const void*)&Input[InOffset]);
    __m128i block2 = _mm_loadu_si128((const __m128i*)(const void*)&Input[InOffset + 16]);
    __m128i temp1, temp2;

    block1 = _mm_xor_si128(block1, m_expKey[keyCtr]);
    block2 = _mm_xor_si128(block2, m_expKey[++keyCtr]);

    while (keyCtr != LRD)
    {
        temp1 = _mm_blendv_epi8(block1, block2, BLEND_MASK);    // combine 2 blocks
        temp2 = _mm_blendv_epi8(block2, block1, BLEND_MASK);
        temp1 = _mm_shuffle_epi8(temp1, RIJNDAEL256_MASK);      // shuffle
        temp2 = _mm_shuffle_epi8(temp2, RIJNDAEL256_MASK);
        block1 = _mm_aesenc_si128(temp1, m_expKey[++keyCtr]);   // encrypt
        block2 = _mm_aesenc_si128(temp2, m_expKey[++keyCtr]);
    }

    temp1 = _mm_blendv_epi8(block1, block2, BLEND_MASK);
    temp2 = _mm_blendv_epi8(block2, block1, BLEND_MASK);
    temp1 = _mm_shuffle_epi8(temp1, RIJNDAEL256_MASK);
    temp2 = _mm_shuffle_epi8(temp2, RIJNDAEL256_MASK);
    block1 = _mm_aesenclast_si128(temp1, m_expKey[++keyCtr]);
    block2 = _mm_aesenclast_si128(temp2, m_expKey[++keyCtr]);

    _mm_storeu_si128((__m128i*)(void*)&Output[OutOffset], block1);
    _mm_storeu_si128((__m128i*)(void*)&Output[OutOffset + 16], block2);
}

This is the inverse transform:

    void Decrypt32(const std::vector<byte> &Input, const size_t InOffset, std::vector<byte> &Output, const size_t OutOffset)
{
    const size_t LRD = m_expKey.size() - 3;
    __m128i RIJNDAELINV_MASK = { 0,1,14,15,4,5,2,3,8,9,6,7,12,13,10,11 };
    __m128i BLEND_MASK = _mm_set_epi32(0x80000000, 0x80800000, 0x80800000, 0x80808000);
    __m128i block1 = _mm_loadu_si128((const __m128i*)(const void*)&Input[InOffset]);
    __m128i block2 = _mm_loadu_si128((const __m128i*)(const void*)&Input[InOffset + 16]);
    __m128i temp1, temp2;
    size_t keyCtr = 0;

    block1 = _mm_xor_si128(block1, m_expKey[keyCtr]);
    block2 = _mm_xor_si128(block2, m_expKey[++keyCtr]);

    while (keyCtr != LRD)
    {
        temp1 = _mm_aesdec_si128(block1, m_expKey[++keyCtr]);   // decrypt
        temp2 = _mm_aesdec_si128(block2, m_expKey[++keyCtr]);
        temp1 = _mm_shuffle_epi8(temp1, RIJNDAELINV_MASK);      // shuffle
        temp2 = _mm_shuffle_epi8(temp2, RIJNDAELINV_MASK);
        block1 = _mm_blendv_epi8(temp1, temp2, BLEND_MASK);     // combine
        block2 = _mm_blendv_epi8(temp2, temp1, BLEND_MASK);
    }

    temp1 = _mm_aesdeclast_si128(block1, m_expKey[++keyCtr]);
    temp2 = _mm_aesdeclast_si128(block2, m_expKey[++keyCtr]);
    temp1 = _mm_shuffle_epi8(temp1, RIJNDAELINV_MASK);
    temp2 = _mm_shuffle_epi8(temp2, RIJNDAELINV_MASK);
    block1 = _mm_blendv_epi8(temp1, temp2, BLEND_MASK);
    block2 = _mm_blendv_epi8(temp2, temp1, BLEND_MASK);

    _mm_storeu_si128((__m128i*)(void*)&Output[OutOffset], block1);
    _mm_storeu_si128((__m128i*)(void*)&Output[OutOffset + 16], block2);
}

I've been debugging this for hours, and just can't spot the problem, can anyone see why this wouldn't work? I can post the code to git if that would help.

0

There are 0 best solutions below