Merge pull request #7784 from daverodgman/aesce-unroll
commit c8d81ad54d
7 changed files with 156 additions and 58 deletions
@@ -1,4 +1,7 @@
 Features
-   * AES performance improvements on 64-bit architectures. Uplift
-     varies by platform, toolchain, optimisation flags and mode,
-     in the 0 - 84% range. Aarch64, gcc and GCM/XTS benefit the most.
+   * AES performance improvements. Uplift varies by platform,
+     toolchain, optimisation flags and mode.
+     Aarch64, gcc -Os and CCM, GCM and XTS benefit the most.
+     On Aarch64, uplift is typically around 20 - 110%.
+     When compiling with gcc -Os on Aarch64, AES-XTS improves
+     by 4.5x.
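A quick way to sanity-check these numbers on a given platform is to time mbedtls_aes_crypt_xts() over a fixed buffer. The sketch below is illustrative only (buffer size, key material and iteration count are arbitrary choices, not from this changeset); the project's programs/test/benchmark tool gives more careful figures.

/* Rough AES-256-XTS throughput check (illustrative sketch, not part of this PR).
 * Requires a build with MBEDTLS_CIPHER_MODE_XTS enabled. */
#include <stdio.h>
#include <time.h>
#include "mbedtls/aes.h"

int main(void)
{
    unsigned char key[64] = { 0 };        /* two 256-bit keys for AES-256-XTS */
    unsigned char data_unit[16] = { 0 };  /* XTS data-unit (tweak) value */
    static unsigned char buf[4096];
    mbedtls_aes_xts_context ctx;

    mbedtls_aes_xts_init(&ctx);
    if (mbedtls_aes_xts_setkey_enc(&ctx, key, 512) != 0) {
        return 1;
    }

    clock_t start = clock();
    for (int i = 0; i < 100000; i++) {
        mbedtls_aes_crypt_xts(&ctx, MBEDTLS_AES_ENCRYPT, sizeof(buf),
                              data_unit, buf, buf);
    }
    double s = (double) (clock() - start) / CLOCKS_PER_SEC;
    printf("AES-256-XTS: %.1f MB/s\n", (100000.0 * sizeof(buf)) / (s * 1e6));

    mbedtls_aes_xts_free(&ctx);
    return 0;
}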
library/aes.c
@@ -1077,23 +1077,6 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx,
 
 #if defined(MBEDTLS_CIPHER_MODE_CBC)
 
-#if defined(__ARM_NEON) && defined(__aarch64__)
-/* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
- * the result for the next block in CBC, and the cost of transferring that data from
- * NEON registers, it is faster to use the following on aarch64.
- * For 32-bit arm, NEON should be faster. */
-#define CBC_XOR_16(r, a, b) do {                                           \
-        mbedtls_put_unaligned_uint64(r,                                    \
-                                     mbedtls_get_unaligned_uint64(a) ^     \
-                                     mbedtls_get_unaligned_uint64(b));     \
-        mbedtls_put_unaligned_uint64(r + 8,                                \
-                                     mbedtls_get_unaligned_uint64(a + 8) ^ \
-                                     mbedtls_get_unaligned_uint64(b + 8)); \
-    } while (0)
-#else
-#define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16)
-#endif
-
 /*
  * AES-CBC buffer encryption/decryption
  */
@@ -1136,7 +1119,10 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
             if (ret != 0) {
                 goto exit;
             }
-            CBC_XOR_16(output, output, iv);
+            /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
+             * the result for the next block in CBC, and the cost of transferring that data from
+             * NEON registers, NEON is slower on aarch64. */
+            mbedtls_xor_no_simd(output, output, iv, 16);
 
             memcpy(iv, temp, 16);
 
@@ -1146,7 +1132,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
             }
         } else {
             while (length > 0) {
-                CBC_XOR_16(output, input, ivp);
+                mbedtls_xor_no_simd(output, input, ivp, 16);
 
                 ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output);
                 if (ret != 0) {
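The dependency referred to in the comments above: in CBC encryption, the XOR for block n consumes the ciphertext of block n-1, so it sits on the critical path, and a round trip through NEON registers only adds latency. A simplified sketch of that structure (illustrative, modelled on the encrypt branch of mbedtls_aes_crypt_cbc; no error handling, length assumed to be a multiple of 16):

/* Sketch of the CBC-encrypt dependency chain (illustrative, not the library's
 * exact code). Each XOR depends on the previous block's ECB output, so doing it
 * in general-purpose registers avoids a NEON round trip on the critical path. */
#include <string.h>
#include "mbedtls/aes.h"
#include "common.h"   /* library-internal header providing mbedtls_xor_no_simd() */

static void cbc_encrypt_sketch(mbedtls_aes_context *ctx, unsigned char iv[16],
                               size_t length, const unsigned char *input,
                               unsigned char *output)
{
    while (length > 0) {
        mbedtls_xor_no_simd(output, input, iv, 16);               /* needs previous block */
        (void) mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, output, output);
        memcpy(iv, output, 16);                                   /* feeds next iteration */
        input  += 16;
        output += 16;
        length -= 16;
    }
}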
@@ -1179,8 +1165,11 @@ typedef unsigned char mbedtls_be128[16];
  * for machine endianness and hence works correctly on both big and little
  * endian machines.
  */
-static void mbedtls_gf128mul_x_ble(unsigned char r[16],
-                                   const unsigned char x[16])
+#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C)
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
+static inline void mbedtls_gf128mul_x_ble(unsigned char r[16],
+                                          const unsigned char x[16])
 {
     uint64_t a, b, ra, rb;
 
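For reference, mbedtls_gf128mul_x_ble() is the per-block XTS tweak update: multiply the 128-bit tweak, read as a little-endian polynomial over GF(2), by x and reduce modulo x^128 + x^7 + x^2 + x + 1. A byte-wise sketch of the same operation (illustrative only; the library's version works on two 64-bit halves for speed):

/* Illustrative byte-wise multiply-by-x in GF(2^128), little-endian convention,
 * as used for the XTS tweak. Not the library's implementation. */
static void gf128mul_x_ble_sketch(unsigned char r[16], const unsigned char x[16])
{
    unsigned char carry = 0;
    for (int i = 0; i < 16; i++) {
        unsigned char next_carry = (unsigned char) (x[i] >> 7);
        r[i] = (unsigned char) ((x[i] << 1) | carry);
        carry = next_carry;
    }
    /* If the result overflowed x^128, reduce by x^7 + x^2 + x + 1 (0x87). */
    if (carry) {
        r[0] ^= 0x87;
    }
}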
@@ -1196,7 +1185,13 @@ static void mbedtls_gf128mul_x_ble(unsigned char r[16],
 
 /*
  * AES-XTS buffer encryption/decryption
+ *
+ * Use of MBEDTLS_OPTIMIZE_FOR_PERFORMANCE here and for mbedtls_gf128mul_x_ble()
+ * is a 3x performance improvement for gcc -Os, if we have hardware AES support.
  */
+#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C)
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
 int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx,
                           int mode,
                           size_t length,
library/aesce.c
@@ -101,59 +101,105 @@ int mbedtls_aesce_has_support(void)
 #endif
 }
 
+/* Single round of AESCE encryption */
+#define AESCE_ENCRYPT_ROUND                   \
+    block = vaeseq_u8(block, vld1q_u8(keys)); \
+    block = vaesmcq_u8(block);                \
+    keys += 16
+/* Two rounds of AESCE encryption */
+#define AESCE_ENCRYPT_ROUND_X2        AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
+
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
 static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                       unsigned char *keys,
                                       int rounds)
 {
-    for (int i = 0; i < rounds - 1; i++) {
-        /* AES AddRoundKey, SubBytes, ShiftRows (in this order).
-         * AddRoundKey adds the round key for the previous round. */
-        block = vaeseq_u8(block, vld1q_u8(keys + i * 16));
-        /* AES mix columns */
-        block = vaesmcq_u8(block);
-    }
+    /* 10, 12 or 14 rounds. Unroll loop. */
+    if (rounds == 10) {
+        goto rounds_10;
+    }
+    if (rounds == 12) {
+        goto rounds_12;
+    }
+    AESCE_ENCRYPT_ROUND_X2;
+rounds_12:
+    AESCE_ENCRYPT_ROUND_X2;
+rounds_10:
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND;
 
     /* AES AddRoundKey for the previous round.
      * SubBytes, ShiftRows for the final round. */
-    block = vaeseq_u8(block, vld1q_u8(keys + (rounds -1) * 16));
+    block = vaeseq_u8(block, vld1q_u8(keys));
+    keys += 16;
 
     /* Final round: no MixColumns */
 
     /* Final AddRoundKey */
-    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));
+    block = veorq_u8(block, vld1q_u8(keys));
 
     return block;
 }
 
+/* Single round of AESCE decryption
+ *
+ * AES AddRoundKey, SubBytes, ShiftRows
+ *
+ *      block = vaesdq_u8(block, vld1q_u8(keys));
+ *
+ * AES inverse MixColumns for the next round.
+ *
+ * This means that we switch the order of the inverse AddRoundKey and
+ * inverse MixColumns operations. We have to do this as AddRoundKey is
+ * done in an atomic instruction together with the inverses of SubBytes
+ * and ShiftRows.
+ *
+ * It works because MixColumns is a linear operation over GF(2^8) and
+ * AddRoundKey is an exclusive or, which is equivalent to addition over
+ * GF(2^8). (The inverse of MixColumns needs to be applied to the
+ * affected round keys separately which has been done when the
+ * decryption round keys were calculated.)
+ *
+ *      block = vaesimcq_u8(block);
+ */
+#define AESCE_DECRYPT_ROUND                   \
+    block = vaesdq_u8(block, vld1q_u8(keys)); \
+    block = vaesimcq_u8(block);               \
+    keys += 16
+/* Two rounds of AESCE decryption */
+#define AESCE_DECRYPT_ROUND_X2        AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
+
 static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                       unsigned char *keys,
                                       int rounds)
 {
-
-    for (int i = 0; i < rounds - 1; i++) {
-        /* AES AddRoundKey, SubBytes, ShiftRows */
-        block = vaesdq_u8(block, vld1q_u8(keys + i * 16));
-        /* AES inverse MixColumns for the next round.
-         *
-         * This means that we switch the order of the inverse AddRoundKey and
-         * inverse MixColumns operations. We have to do this as AddRoundKey is
-         * done in an atomic instruction together with the inverses of SubBytes
-         * and ShiftRows.
-         *
-         * It works because MixColumns is a linear operation over GF(2^8) and
-         * AddRoundKey is an exclusive or, which is equivalent to addition over
-         * GF(2^8). (The inverse of MixColumns needs to be applied to the
-         * affected round keys separately which has been done when the
-         * decryption round keys were calculated.) */
-        block = vaesimcq_u8(block);
-    }
+    /* 10, 12 or 14 rounds. Unroll loop. */
+    if (rounds == 10) {
+        goto rounds_10;
+    }
+    if (rounds == 12) {
+        goto rounds_12;
+    }
+    AESCE_DECRYPT_ROUND_X2;
+rounds_12:
+    AESCE_DECRYPT_ROUND_X2;
+rounds_10:
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND;
 
     /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
      * last full round. */
-    block = vaesdq_u8(block, vld1q_u8(keys + (rounds - 1) * 16));
+    block = vaesdq_u8(block, vld1q_u8(keys));
+    keys += 16;
 
     /* Inverse AddRoundKey for inverting the initial round key addition. */
-    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));
+    block = veorq_u8(block, vld1q_u8(keys));
 
     return block;
 }
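The control flow of the unrolled functions is easier to see in isolation: the code jumps into the middle of one fully unrolled sequence, so 10-, 12- and 14-round key schedules share a single straight-line body with no loop counter or per-iteration index arithmetic. A standalone sketch of the same dispatch pattern, with a trivial counter standing in for the AES intrinsics (illustrative only):

/* Standalone illustration of the "goto into an unrolled tail" pattern used by
 * aesce_encrypt_block()/aesce_decrypt_block(); STEP stands in for one AES round. */
#include <assert.h>

#define STEP       count++
#define STEP_X2    STEP; STEP

static int unrolled_steps(int rounds)   /* rounds must be 10, 12 or 14 */
{
    int count = 0;

    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    STEP_X2;           /* only reached for 14 rounds */
rounds_12:
    STEP_X2;           /* reached for 12 and 14 rounds */
rounds_10:
    STEP_X2;
    STEP_X2;
    STEP_X2;
    STEP_X2;
    STEP;              /* 9, 11 or 13 "full" steps... */
    STEP;              /* ...plus the final round handled separately = 10/12/14 */
    return count;
}

int main(void)
{
    assert(unrolled_steps(10) == 10);
    assert(unrolled_steps(12) == 12);
    assert(unrolled_steps(14) == 14);
    return 0;
}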
library/aesce.h
@@ -52,6 +52,9 @@ int mbedtls_aesce_has_support(void);
 /**
  * \brief          Internal AES-ECB block encryption and decryption
  *
+ * \warning        This assumes that the context specifies either 10, 12 or 14
+ *                 rounds and will behave incorrectly if this is not the case.
+ *
  * \param ctx      AES context
  * \param mode     MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT
  * \param input    16-byte input block
library/alignment.h
@@ -673,14 +673,10 @@
 #if defined(__arm__)
 
 #if defined(__thumb__) && !defined(__thumb2__)
-#if !defined(__ARMCC_VERSION) && !defined(__clang__) \
-    && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+#if defined(MBEDTLS_COMPILER_IS_GCC)
 /*
  * Thumb 1 ISA. This code path has only been tested successfully on gcc;
  * it does not compile on clang or armclang.
- *
- * Other compilers which define __GNUC__ may not work. The above macro
- * attempts to exclude these untested compilers.
  */
 
 #if !defined(__OPTIMIZE__) && defined(__GNUC__)
library/cmac.c
@@ -237,7 +237,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx,
                input,
                block_size - cmac_ctx->unprocessed_len);
 
-        mbedtls_xor(state, cmac_ctx->unprocessed_block, state, block_size);
+        mbedtls_xor_no_simd(state, cmac_ctx->unprocessed_block, state, block_size);
 
         if ((ret = mbedtls_cipher_update(ctx, state, block_size, state,
                                          &olen)) != 0) {
@@ -255,7 +255,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx,
     /* Iterate across the input data in block sized chunks, excluding any
      * final partial or complete block */
     for (j = 1; j < n; j++) {
-        mbedtls_xor(state, input, state, block_size);
+        mbedtls_xor_no_simd(state, input, state, block_size);
 
         if ((ret = mbedtls_cipher_update(ctx, state, block_size, state,
                                          &olen)) != 0) {
library/common.h
@@ -192,6 +192,45 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
     }
 }
 
+/**
+ * Perform a fast block XOR operation, such that
+ * r[i] = a[i] ^ b[i] where 0 <= i < n
+ *
+ * In some situations, this can perform better than mbedtls_xor (e.g., it's about 5%
+ * better in AES-CBC).
+ *
+ * \param   r Pointer to result (buffer of at least \p n bytes). \p r
+ *            may be equal to either \p a or \p b, but behaviour when
+ *            it overlaps in other ways is undefined.
+ * \param   a Pointer to input (buffer of at least \p n bytes)
+ * \param   b Pointer to input (buffer of at least \p n bytes)
+ * \param   n Number of bytes to process.
+ */
+static inline void mbedtls_xor_no_simd(unsigned char *r,
+                                       const unsigned char *a,
+                                       const unsigned char *b,
+                                       size_t n)
+{
+    size_t i = 0;
+#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
+#if defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__)
+    /* This codepath probably only makes sense on architectures with 64-bit registers */
+    for (; (i + 8) <= n; i += 8) {
+        uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i);
+        mbedtls_put_unaligned_uint64(r + i, x);
+    }
+#else
+    for (; (i + 4) <= n; i += 4) {
+        uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
+        mbedtls_put_unaligned_uint32(r + i, x);
+    }
+#endif
+#endif
+    for (; i < n; i++) {
+        r[i] = a[i] ^ b[i];
+    }
+}
+
 /* Fix MSVC C99 compatible issue
  * MSVC support __func__ from visual studio 2015( 1900 )
  * Use MSVC predefine macro to avoid name check fail.
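A small usage sketch for the new helper (illustrative, not part of this changeset): a buffer length of 20 exercises both the word-wide main loop and the byte-wise tail.

/* Quick self-check for mbedtls_xor_no_simd() (illustrative only). */
#include <assert.h>
#include <stddef.h>
#include "common.h"   /* library-internal header that defines mbedtls_xor_no_simd() */

int main(void)
{
    unsigned char a[20], b[20], r[20];
    for (size_t i = 0; i < sizeof(a); i++) {
        a[i] = (unsigned char) i;
        b[i] = (unsigned char) (0xA5 + 3 * i);
    }
    mbedtls_xor_no_simd(r, a, b, sizeof(r));
    for (size_t i = 0; i < sizeof(r); i++) {
        assert(r[i] == (unsigned char) (a[i] ^ b[i]));
    }
    return 0;
}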
@@ -261,4 +300,20 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
 #define MBEDTLS_UNLIKELY(x)       x
 #endif
 
+#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \
+    && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+/* Defined if the compiler really is gcc and not clang, etc */
+#define MBEDTLS_COMPILER_IS_GCC
+#endif
+
+/* For gcc -Os, override with -O2 for a given function.
+ *
+ * This will not affect behaviour for other optimisation settings, e.g. -O0.
+ */
+#if defined(MBEDTLS_COMPILER_IS_GCC) && defined(__OPTIMIZE_SIZE__)
+#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE __attribute__((optimize("-O2")))
+#else
+#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
+
 #endif /* MBEDTLS_LIBRARY_COMMON_H */
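Net effect of the two additions above: when building with genuine gcc at -Os, a function marked with MBEDTLS_OPTIMIZE_FOR_PERFORMANCE is compiled at -O2 while the rest of the translation unit stays optimised for size; with any other compiler or optimisation level the macro expands to nothing. A usage sketch with a hypothetical function (mirroring how aes.c applies the macro, not code from this changeset):

/* Usage sketch for MBEDTLS_OPTIMIZE_FOR_PERFORMANCE (hypothetical function).
 * Under real gcc with -Os this expands to __attribute__((optimize("-O2")));
 * otherwise it expands to nothing, so behaviour is unchanged elsewhere. */
#include <stddef.h>
#include "common.h"   /* library-internal header that defines the macro */

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static void hot_loop_example(unsigned char *buf, size_t n)
{
    /* Speed-critical inner loop that benefits from -O2 even in a size-optimised build. */
    for (size_t i = 0; i < n; i++) {
        buf[i] = (unsigned char) (buf[i] ^ 0x5A);
    }
}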