From 03d2daf55c97b833d675e9a1e1c0f6c9dfb240b4 Mon Sep 17 00:00:00 2001 From: Ko- Date: Thu, 16 Aug 2018 01:59:49 -0700 Subject: [PATCH 1/4] Enable 64-bit limbs for all Aarch64 builds. GCC and Clang do not define __ARMCC_VERSION when building for Aarch64. Yet they should also use 64-bit limbs for Aarch64 builds. --- include/mbedtls/bignum.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/mbedtls/bignum.h b/include/mbedtls/bignum.h index 0b26727f3..f854ca8d2 100644 --- a/include/mbedtls/bignum.h +++ b/include/mbedtls/bignum.h @@ -140,9 +140,8 @@ typedef unsigned int mbedtls_t_udbl __attribute__((mode(TI))); #define MBEDTLS_HAVE_UDBL #endif /* !MBEDTLS_NO_UDBL_DIVISION */ - #elif defined(__ARMCC_VERSION) && defined(__aarch64__) + #elif defined(__aarch64__) /* - * __ARMCC_VERSION is defined for both armcc and armclang and * __aarch64__ is only defined by armclang when compiling 64-bit code */ #if !defined(MBEDTLS_HAVE_INT64) From cc1871e674c2508f88dd77106c9f0ba0dbee2120 Mon Sep 17 00:00:00 2001 From: Ko- Date: Thu, 16 Aug 2018 02:01:57 -0700 Subject: [PATCH 2/4] Add optimized bignum multiplication for Aarch64. x0-x3 are skipped so that function parameters do not have to be moved. MULADDC_INIT and MULADDC_STOP are mostly empty because it is more efficient to keep everything in registers (and that should easily be possible). I considered a MULADDC_HUIT implementation, but could not think of something that would be more efficient than basically 8 consecutive MULADDC_CORE. You could combine the loads and stores, but it's probably more efficient to interleave them with arithmetic, depending on the specific microarchitecture. NEON can do a 64x64->128 bit multiplication (and optional accumulation) in one instruction, but is not great at handling carries. 
--- include/mbedtls/bn_mul.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/include/mbedtls/bn_mul.h b/include/mbedtls/bn_mul.h index f7cb07252..4200ad43a 100644 --- a/include/mbedtls/bn_mul.h +++ b/include/mbedtls/bn_mul.h @@ -198,6 +198,30 @@ #endif /* AMD64 */ +#if defined(__aarch64__) + +#define MULADDC_INIT \ + asm( + +#define MULADDC_CORE \ + "ldr x4, [%3], #8 \n\t" \ + "ldr x5, [%4] \n\t" \ + "mul x6, x4, %6 \n\t" \ + "umulh x7, x4, %6 \n\t" \ + "adds x5, x5, x6 \n\t" \ + "adc x7, x7, xzr \n\t" \ + "adds x5, x5, %5 \n\t" \ + "adc %0, x7, xzr \n\t" \ + "str x5, [%1], #8 \n\t" + +#define MULADDC_STOP \ + : "+r" (c), "=r" (d), "=r" (s) \ + : "r" (s), "r" (d), "r" (c), "r" (b) \ + : "x4", "x5", "x6", "x7", "cc" \ + ); + +#endif /* Aarch64 */ + #if defined(__mc68020__) || defined(__mcpu32__) #define MULADDC_INIT \ From 05cff953c99d12cf6d67c86a84a1b94367763fbb Mon Sep 17 00:00:00 2001 From: Ko- Date: Mon, 20 Aug 2018 12:59:57 +0100 Subject: [PATCH 3/4] Make GNUC-compatible compilers use the right mbedtls_t_udbl again on Aarch64 builds. 
--- include/mbedtls/bignum.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/mbedtls/bignum.h b/include/mbedtls/bignum.h index f854ca8d2..2c5ace690 100644 --- a/include/mbedtls/bignum.h +++ b/include/mbedtls/bignum.h @@ -129,7 +129,8 @@ defined(__ppc64__) || defined(__powerpc64__) || \ defined(__ia64__) || defined(__alpha__) || \ ( defined(__sparc__) && defined(__arch64__) ) || \ - defined(__s390x__) || defined(__mips64) ) + defined(__s390x__) || defined(__mips64) || \ + defined(__aarch64__) ) #if !defined(MBEDTLS_HAVE_INT64) #define MBEDTLS_HAVE_INT64 #endif /* MBEDTLS_HAVE_INT64 */ @@ -140,8 +141,9 @@ typedef unsigned int mbedtls_t_udbl __attribute__((mode(TI))); #define MBEDTLS_HAVE_UDBL #endif /* !MBEDTLS_NO_UDBL_DIVISION */ - #elif defined(__aarch64__) + #elif defined(__ARMCC_VERSION) && defined(__aarch64__) /* + * __ARMCC_VERSION is defined for both armcc and armclang and * __aarch64__ is only defined by armclang when compiling 64-bit code */ #if !defined(MBEDTLS_HAVE_INT64) From cb260bb30d5d3e17ab7ca945f03b0808a67b1812 Mon Sep 17 00:00:00 2001 From: Ko- Date: Mon, 20 Aug 2018 13:59:53 +0100 Subject: [PATCH 4/4] Fix -O0 build for Aarch64 bignum multiplication. 
--- include/mbedtls/bn_mul.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/mbedtls/bn_mul.h b/include/mbedtls/bn_mul.h index 4200ad43a..163869ae7 100644 --- a/include/mbedtls/bn_mul.h +++ b/include/mbedtls/bn_mul.h @@ -204,20 +204,20 @@ asm( #define MULADDC_CORE \ - "ldr x4, [%3], #8 \n\t" \ - "ldr x5, [%4] \n\t" \ - "mul x6, x4, %6 \n\t" \ - "umulh x7, x4, %6 \n\t" \ + "ldr x4, [%2], #8 \n\t" \ + "ldr x5, [%1] \n\t" \ + "mul x6, x4, %3 \n\t" \ + "umulh x7, x4, %3 \n\t" \ "adds x5, x5, x6 \n\t" \ "adc x7, x7, xzr \n\t" \ - "adds x5, x5, %5 \n\t" \ + "adds x5, x5, %0 \n\t" \ "adc %0, x7, xzr \n\t" \ "str x5, [%1], #8 \n\t" -#define MULADDC_STOP \ - : "+r" (c), "=r" (d), "=r" (s) \ - : "r" (s), "r" (d), "r" (c), "r" (b) \ - : "x4", "x5", "x6", "x7", "cc" \ +#define MULADDC_STOP \ + : "+r" (c), "+r" (d), "+r" (s) \ + : "r" (b) \ + : "x4", "x5", "x6", "x7", "cc" \ ); #endif /* Aarch64 */