diff --git a/ChangeLog.d/aes-perf.txt b/ChangeLog.d/aes-perf.txt
index ca2ced92e..ab716bce8 100644
--- a/ChangeLog.d/aes-perf.txt
+++ b/ChangeLog.d/aes-perf.txt
@@ -1,4 +1,7 @@
 Features
-   * AES performance improvements on 64-bit architectures. Uplift
-     varies by platform, toolchain, optimisation flags and mode,
-     in the 0 - 84% range. Aarch64, gcc and GCM/XTS benefit the most.
+   * AES performance improvements. Uplift varies by platform,
+     toolchain, optimisation flags and mode.
+     Aarch64, gcc -Os and CCM, GCM and XTS benefit the most.
+     On Aarch64, uplift is typically around 20 - 110%.
+     When compiling with gcc -Os on Aarch64, AES-XTS improves
+     by 4.5x.
diff --git a/library/aes.c b/library/aes.c
index a7bc8d535..4397deae2 100644
--- a/library/aes.c
+++ b/library/aes.c
@@ -1077,23 +1077,6 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx,
 
 #if defined(MBEDTLS_CIPHER_MODE_CBC)
 
-#if defined(__ARM_NEON) && defined(__aarch64__)
-/* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
- * the result for the next block in CBC, and the cost of transferring that data from
- * NEON registers, it is faster to use the following on aarch64.
- * For 32-bit arm, NEON should be faster. */
-#define CBC_XOR_16(r, a, b) do { \
-    mbedtls_put_unaligned_uint64(r, \
-                                 mbedtls_get_unaligned_uint64(a) ^ \
-                                 mbedtls_get_unaligned_uint64(b)); \
-    mbedtls_put_unaligned_uint64(r + 8, \
-                                 mbedtls_get_unaligned_uint64(a + 8) ^ \
-                                 mbedtls_get_unaligned_uint64(b + 8)); \
-} while (0)
-#else
-#define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16)
-#endif
-
 /*
  * AES-CBC buffer encryption/decryption
  */
@@ -1136,7 +1119,10 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
             if (ret != 0) {
                 goto exit;
             }
-            CBC_XOR_16(output, output, iv);
+            /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
+             * the result for the next block in CBC, and the cost of transferring that data from
+             * NEON registers, NEON is slower on aarch64. */
+            mbedtls_xor_no_simd(output, output, iv, 16);
 
             memcpy(iv, temp, 16);
 
@@ -1146,7 +1132,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
         }
     } else {
         while (length > 0) {
-            CBC_XOR_16(output, input, ivp);
+            mbedtls_xor_no_simd(output, input, ivp, 16);
 
             ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output);
             if (ret != 0) {
@@ -1179,8 +1165,11 @@ typedef unsigned char mbedtls_be128[16];
  * for machine endianness and hence works correctly on both big and little
  * endian machines.
  */
-static void mbedtls_gf128mul_x_ble(unsigned char r[16],
-                                   const unsigned char x[16])
+#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C)
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
+static inline void mbedtls_gf128mul_x_ble(unsigned char r[16],
+                                          const unsigned char x[16])
 {
     uint64_t a, b, ra, rb;
 
@@ -1196,7 +1185,13 @@ static void mbedtls_gf128mul_x_ble(unsigned char r[16],
 
 /*
  * AES-XTS buffer encryption/decryption
+ *
+ * Use of MBEDTLS_OPTIMIZE_FOR_PERFORMANCE here and for mbedtls_gf128mul_x_ble()
+ * is a 3x performance improvement for gcc -Os, if we have hardware AES support.
  */
+#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C)
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
 int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx,
                           int mode,
                           size_t length,
diff --git a/library/aesce.c b/library/aesce.c
index 42662bbe0..f37a11bbc 100644
--- a/library/aesce.c
+++ b/library/aesce.c
@@ -101,59 +101,105 @@ int mbedtls_aesce_has_support(void)
 #endif
 }
 
+/* Single round of AESCE encryption */
+#define AESCE_ENCRYPT_ROUND                   \
+    block = vaeseq_u8(block, vld1q_u8(keys)); \
+    block = vaesmcq_u8(block);                \
+    keys += 16
+/* Two rounds of AESCE encryption */
+#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
+
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
 static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                       unsigned char *keys,
                                       int rounds)
 {
-    for (int i = 0; i < rounds - 1; i++) {
-        /* AES AddRoundKey, SubBytes, ShiftRows (in this order).
-         * AddRoundKey adds the round key for the previous round. */
-        block = vaeseq_u8(block, vld1q_u8(keys + i * 16));
-        /* AES mix columns */
-        block = vaesmcq_u8(block);
+    /* 10, 12 or 14 rounds. Unroll loop. */
+    if (rounds == 10) {
+        goto rounds_10;
     }
+    if (rounds == 12) {
+        goto rounds_12;
+    }
+    AESCE_ENCRYPT_ROUND_X2;
+rounds_12:
+    AESCE_ENCRYPT_ROUND_X2;
+rounds_10:
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND;
 
     /* AES AddRoundKey for the previous round.
      * SubBytes, ShiftRows for the final round. */
-    block = vaeseq_u8(block, vld1q_u8(keys + (rounds -1) * 16));
+    block = vaeseq_u8(block, vld1q_u8(keys));
+    keys += 16;
 
     /* Final round: no MixColumns */
 
     /* Final AddRoundKey */
-    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));
+    block = veorq_u8(block, vld1q_u8(keys));
 
     return block;
 }
 
+/* Single round of AESCE decryption
+ *
+ * AES AddRoundKey, SubBytes, ShiftRows
+ *
+ *      block = vaesdq_u8(block, vld1q_u8(keys));
+ *
+ * AES inverse MixColumns for the next round.
+ *
+ * This means that we switch the order of the inverse AddRoundKey and
+ * inverse MixColumns operations. We have to do this as AddRoundKey is
+ * done in an atomic instruction together with the inverses of SubBytes
+ * and ShiftRows.
+ *
+ * It works because MixColumns is a linear operation over GF(2^8) and
+ * AddRoundKey is an exclusive or, which is equivalent to addition over
+ * GF(2^8). (The inverse of MixColumns needs to be applied to the
+ * affected round keys separately which has been done when the
+ * decryption round keys were calculated.)
+ *
+ *      block = vaesimcq_u8(block);
+ */
+#define AESCE_DECRYPT_ROUND                   \
+    block = vaesdq_u8(block, vld1q_u8(keys)); \
+    block = vaesimcq_u8(block);               \
+    keys += 16
+/* Two rounds of AESCE decryption */
+#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
+
 static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                       unsigned char *keys,
                                       int rounds)
 {
-
-    for (int i = 0; i < rounds - 1; i++) {
-        /* AES AddRoundKey, SubBytes, ShiftRows */
-        block = vaesdq_u8(block, vld1q_u8(keys + i * 16));
-        /* AES inverse MixColumns for the next round.
-         *
-         * This means that we switch the order of the inverse AddRoundKey and
-         * inverse MixColumns operations. We have to do this as AddRoundKey is
-         * done in an atomic instruction together with the inverses of SubBytes
-         * and ShiftRows.
-         *
-         * It works because MixColumns is a linear operation over GF(2^8) and
-         * AddRoundKey is an exclusive or, which is equivalent to addition over
-         * GF(2^8). (The inverse of MixColumns needs to be applied to the
-         * affected round keys separately which has been done when the
-         * decryption round keys were calculated.) */
-        block = vaesimcq_u8(block);
+    /* 10, 12 or 14 rounds. Unroll loop. */
+    if (rounds == 10) {
+        goto rounds_10;
     }
+    if (rounds == 12) {
+        goto rounds_12;
+    }
+    AESCE_DECRYPT_ROUND_X2;
+rounds_12:
+    AESCE_DECRYPT_ROUND_X2;
+rounds_10:
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND;
 
     /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
      * last full round. */
-    block = vaesdq_u8(block, vld1q_u8(keys + (rounds - 1) * 16));
+    block = vaesdq_u8(block, vld1q_u8(keys));
+    keys += 16;
 
     /* Inverse AddRoundKey for inverting the initial round key addition. */
-    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));
+    block = veorq_u8(block, vld1q_u8(keys));
 
     return block;
 }
diff --git a/library/aesce.h b/library/aesce.h
index 7048d77c5..b12bf76ba 100644
--- a/library/aesce.h
+++ b/library/aesce.h
@@ -52,6 +52,9 @@ int mbedtls_aesce_has_support(void);
 /**
  * \brief          Internal AES-ECB block encryption and decryption
  *
+ * \warning        This assumes that the context specifies either 10, 12 or 14
+ *                 rounds and will behave incorrectly if this is not the case.
+ *
  * \param ctx      AES context
  * \param mode     MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT
  * \param input    16-byte input block
diff --git a/library/bn_mul.h b/library/bn_mul.h
index 43dd5c298..ab1a66ae5 100644
--- a/library/bn_mul.h
+++ b/library/bn_mul.h
@@ -673,14 +673,10 @@
 #if defined(__arm__)
 
 #if defined(__thumb__) && !defined(__thumb2__)
-#if !defined(__ARMCC_VERSION) && !defined(__clang__) \
-    && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+#if defined(MBEDTLS_COMPILER_IS_GCC)
 /*
  * Thumb 1 ISA. This code path has only been tested successfully on gcc;
  * it does not compile on clang or armclang.
- *
- * Other compilers which define __GNUC__ may not work. The above macro
- * attempts to exclude these untested compilers.
  */
 
 #if !defined(__OPTIMIZE__) && defined(__GNUC__)
diff --git a/library/cmac.c b/library/cmac.c
index 48f51df41..2f19d1129 100644
--- a/library/cmac.c
+++ b/library/cmac.c
@@ -237,7 +237,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx,
                input,
                block_size - cmac_ctx->unprocessed_len);
 
-        mbedtls_xor(state, cmac_ctx->unprocessed_block, state, block_size);
+        mbedtls_xor_no_simd(state, cmac_ctx->unprocessed_block, state, block_size);
 
         if ((ret = mbedtls_cipher_update(ctx, state, block_size, state,
                                          &olen)) != 0) {
@@ -255,7 +255,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx,
     /* Iterate across the input data in block sized chunks, excluding any
      * final partial or complete block */
     for (j = 1; j < n; j++) {
-        mbedtls_xor(state, input, state, block_size);
+        mbedtls_xor_no_simd(state, input, state, block_size);
 
         if ((ret = mbedtls_cipher_update(ctx, state, block_size, state,
                                          &olen)) != 0) {
diff --git a/library/common.h b/library/common.h
index 97846c462..839b7d119 100644
--- a/library/common.h
+++ b/library/common.h
@@ -192,6 +192,45 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
     }
 }
 
+/**
+ * Perform a fast block XOR operation, such that
+ * r[i] = a[i] ^ b[i] where 0 <= i < n
+ *
+ * In some situations, this can perform better than mbedtls_xor (e.g., it's about 5%
+ * better in AES-CBC).
+ *
+ * \param   r Pointer to result (buffer of at least \p n bytes). \p r
+ *            may be equal to either \p a or \p b, but behaviour when
+ *            it overlaps in other ways is undefined.
+ * \param   a Pointer to input (buffer of at least \p n bytes)
+ * \param   b Pointer to input (buffer of at least \p n bytes)
+ * \param   n Number of bytes to process.
+ */
+static inline void mbedtls_xor_no_simd(unsigned char *r,
+                                       const unsigned char *a,
+                                       const unsigned char *b,
+                                       size_t n)
+{
+    size_t i = 0;
+#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
+#if defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__)
+    /* This codepath probably only makes sense on architectures with 64-bit registers */
+    for (; (i + 8) <= n; i += 8) {
+        uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i);
+        mbedtls_put_unaligned_uint64(r + i, x);
+    }
+#else
+    for (; (i + 4) <= n; i += 4) {
+        uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
+        mbedtls_put_unaligned_uint32(r + i, x);
+    }
+#endif
+#endif
+    for (; i < n; i++) {
+        r[i] = a[i] ^ b[i];
+    }
+}
+
 /* Fix MSVC C99 compatible issue
  * MSVC support __func__ from visual studio 2015( 1900 )
  * Use MSVC predefine macro to avoid name check fail.
@@ -261,4 +300,20 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
 #define MBEDTLS_UNLIKELY(x) x
 #endif
 
+#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \
+    && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+/* Defined if the compiler really is gcc and not clang, etc */
+#define MBEDTLS_COMPILER_IS_GCC
+#endif
+
+/* For gcc -Os, override with -O2 for a given function.
+ *
+ * This will not affect behaviour for other optimisation settings, e.g. -O0.
+ */
+#if defined(MBEDTLS_COMPILER_IS_GCC) && defined(__OPTIMIZE_SIZE__)
+#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE __attribute__((optimize("-O2")))
+#else
+#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
+
 #endif /* MBEDTLS_LIBRARY_COMMON_H */
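
The new comment above AESCE_DECRYPT_ROUND leans on the linearity of InvMixColumns over GF(2^8). If you want to convince yourself of that identity on real hardware, the following standalone program (not part of the patch; the file name and test values are made up) checks InvMixColumns(state ^ rk) == InvMixColumns(state) ^ InvMixColumns(rk) using the same intrinsics. Build on AArch64 with something like: gcc -march=armv8-a+crypto -O2 imc_linearity.c

    /* imc_linearity.c -- standalone sketch, not library code. */
    #include <arm_neon.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        uint8_t s[16], k[16], l[16], r[16];
        for (int i = 0; i < 16; i++) {      /* arbitrary test data */
            s[i] = (uint8_t) (i * 17 + 3);
            k[i] = (uint8_t) (200 - i * 5);
        }
        uint8x16_t state = vld1q_u8(s);
        uint8x16_t rk = vld1q_u8(k);

        /* InvMixColumns(state ^ rk) vs InvMixColumns applied to each operand */
        uint8x16_t lhs = vaesimcq_u8(veorq_u8(state, rk));
        uint8x16_t rhs = veorq_u8(vaesimcq_u8(state), vaesimcq_u8(rk));

        vst1q_u8(l, lhs);
        vst1q_u8(r, rhs);
        printf("linearity holds: %s\n", memcmp(l, r, 16) == 0 ? "yes" : "no");
        return 0;
    }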
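
For context on the new \warning in aesce.h: the round count stored in the context by mbedtls_aes_setkey_enc()/mbedtls_aes_setkey_dec() follows the AES standard, so for valid key sizes it is always one of the three values the unrolled code handles. A sketch of the mapping (illustrative helper, not library code):

    static int aes_rounds_for_keybits(unsigned int keybits)
    {
        switch (keybits) {
            case 128: return 10;
            case 192: return 12;
            case 256: return 14;
            default:  return -1;   /* not a standard AES key size */
        }
    }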
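
The documentation for mbedtls_xor_no_simd() explicitly allows the result pointer to equal either input, which is exactly how the new CBC call sites use it. A minimal sketch of a caller, assuming it is compiled inside the library tree so that the internal header "common.h" is on the include path (the wrapper function is hypothetical):

    #include "common.h"   /* provides mbedtls_xor_no_simd() */

    /* XOR the IV into a block in place, as the CBC decrypt path does above:
     * the result buffer aliases the first input, which the contract permits. */
    static void xor_iv_in_place(unsigned char block[16], const unsigned char iv[16])
    {
        mbedtls_xor_no_simd(block, block, iv, 16);
    }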
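
MBEDTLS_OPTIMIZE_FOR_PERFORMANCE only takes effect for genuine gcc building with -Os (where __OPTIMIZE_SIZE__ is defined); everywhere else it expands to nothing. A self-contained sketch of the same mechanism outside the library (file and function names are illustrative only): a single hot function is rebuilt at -O2 while the rest of the translation unit stays size-optimised.

    /* opt_override.c -- standalone sketch, not part of the patch. */
    #include <stddef.h>
    #include <stdint.h>

    #if defined(__GNUC__) && !defined(__clang__) && defined(__OPTIMIZE_SIZE__)
    #define OPTIMIZE_FOR_PERFORMANCE __attribute__((optimize("-O2")))
    #else
    #define OPTIMIZE_FOR_PERFORMANCE
    #endif

    /* Hot loop: worth the extra code size even in an -Os build. */
    OPTIMIZE_FOR_PERFORMANCE
    uint32_t sum_words(const uint32_t *p, size_t n)
    {
        uint32_t acc = 0;
        for (size_t i = 0; i < n; i++) {
            acc += p[i];
        }
        return acc;
    }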