diff --git a/library/aes.c b/library/aes.c index 6d8cf2e3c..6a7e6102f 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1035,6 +1035,24 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx, } #if defined(MBEDTLS_CIPHER_MODE_CBC) + +#if defined(__ARM_NEON) && defined(__aarch64__) + /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on + * the result for the next block in CBC, and the cost of transferring that data from + * NEON registers, it is faster to use the following on aarch64. + * For 32-bit arm, NEON should be faster. */ +#define CBC_XOR_16(r, a, b) do { \ + mbedtls_put_unaligned_uint64(r, \ + mbedtls_get_unaligned_uint64(a) ^ \ + mbedtls_get_unaligned_uint64(b)); \ + mbedtls_put_unaligned_uint64(r + 8, \ + mbedtls_get_unaligned_uint64(a + 8) ^ \ + mbedtls_get_unaligned_uint64(b + 8)); \ +} while (0) +#else +#define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16) +#endif + /* * AES-CBC buffer encryption/decryption */ @@ -1077,8 +1095,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, if (ret != 0) { goto exit; } - - mbedtls_xor(output, output, iv, 16); + CBC_XOR_16(output, output, iv); memcpy(iv, temp, 16); @@ -1088,7 +1105,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, } } else { while (length > 0) { - mbedtls_xor(output, input, ivp, 16); + CBC_XOR_16(output, input, ivp); ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output); if (ret != 0) {