From 03d2daf55c97b833d675e9a1e1c0f6c9dfb240b4 Mon Sep 17 00:00:00 2001
From: Ko- <k.stoffelen@cs.ru.nl>
Date: Thu, 16 Aug 2018 01:59:49 -0700
Subject: [PATCH 1/4] Enable 64-bit limbs for all Aarch64 builds.

GCC and Clang do not define __ARMCC_VERSION when building for Aarch64.
Yet they should also use 64-bit limbs for Aarch64 builds.
---
 include/mbedtls/bignum.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/mbedtls/bignum.h b/include/mbedtls/bignum.h
index 0b26727f3..f854ca8d2 100644
--- a/include/mbedtls/bignum.h
+++ b/include/mbedtls/bignum.h
@@ -140,9 +140,8 @@
             typedef unsigned int mbedtls_t_udbl __attribute__((mode(TI)));
             #define MBEDTLS_HAVE_UDBL
         #endif /* !MBEDTLS_NO_UDBL_DIVISION */
-    #elif defined(__ARMCC_VERSION) && defined(__aarch64__)
+    #elif defined(__aarch64__)
         /*
-         * __ARMCC_VERSION is defined for both armcc and armclang and
          * __aarch64__ is only defined by armclang when compiling 64-bit code
          */
         #if !defined(MBEDTLS_HAVE_INT64)

From cc1871e674c2508f88dd77106c9f0ba0dbee2120 Mon Sep 17 00:00:00 2001
From: Ko- <k.stoffelen@cs.ru.nl>
Date: Thu, 16 Aug 2018 02:01:57 -0700
Subject: [PATCH 2/4] Add optimized bignum multiplication for Aarch64.

x0-x3 are skipped so that function parameters do not have to be moved.
MULADDC_INIT and MULADDC_STOP are mostly empty because it is more
efficient to keep everything in registers (and that should easily be
possible). I considered a MULADDC_HUIT implementation, but could not
think of something that would be more efficient than basically 8
consecutive MULADDC_CORE. You could combine the loads and stores, but
it's probably more efficient to interleave them with arithmetic,
depending on the specific microarchitecture. NEON can do a
64x64->128 bit multiplication (and optional accumulation) in one
instruction, but is not great at handling carries.
---
 include/mbedtls/bn_mul.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/include/mbedtls/bn_mul.h b/include/mbedtls/bn_mul.h
index f7cb07252..4200ad43a 100644
--- a/include/mbedtls/bn_mul.h
+++ b/include/mbedtls/bn_mul.h
@@ -198,6 +198,30 @@
 
 #endif /* AMD64 */
 
+#if defined(__aarch64__)
+
+#define MULADDC_INIT                \
+    asm(
+
+#define MULADDC_CORE                \
+        "ldr x4, [%3], #8   \n\t"   \
+        "ldr x5, [%4]       \n\t"   \
+        "mul x6, x4, %6     \n\t"   \
+        "umulh x7, x4, %6   \n\t"   \
+        "adds x5, x5, x6    \n\t"   \
+        "adc x7, x7, xzr    \n\t"   \
+        "adds x5, x5, %5    \n\t"   \
+        "adc %0, x7, xzr    \n\t"   \
+        "str x5, [%1], #8   \n\t"
+
+#define MULADDC_STOP                            \
+         : "+r" (c),  "=r" (d), "=r" (s)        \
+         : "r" (s), "r" (d), "r" (c), "r" (b)   \
+         : "x4", "x5", "x6", "x7", "cc"         \
+    );
+
+#endif /* Aarch64 */
+
 #if defined(__mc68020__) || defined(__mcpu32__)
 
 #define MULADDC_INIT                    \

From 05cff953c99d12cf6d67c86a84a1b94367763fbb Mon Sep 17 00:00:00 2001
From: Ko- <k.stoffelen@cs.ru.nl>
Date: Mon, 20 Aug 2018 12:59:57 +0100
Subject: [PATCH 3/4] Make GNUC-compatible compilers use the right
 mbedtls_t_udbl again on Aarch64 builds.

---
 include/mbedtls/bignum.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/mbedtls/bignum.h b/include/mbedtls/bignum.h
index f854ca8d2..2c5ace690 100644
--- a/include/mbedtls/bignum.h
+++ b/include/mbedtls/bignum.h
@@ -129,7 +129,8 @@
         defined(__ppc64__) || defined(__powerpc64__)  || \
         defined(__ia64__)  || defined(__alpha__)      || \
         ( defined(__sparc__) && defined(__arch64__) ) || \
-        defined(__s390x__) || defined(__mips64) )
+        defined(__s390x__) || defined(__mips64)       || \
+        defined(__aarch64__) )
         #if !defined(MBEDTLS_HAVE_INT64)
             #define MBEDTLS_HAVE_INT64
         #endif /* MBEDTLS_HAVE_INT64 */
@@ -140,8 +141,9 @@
             typedef unsigned int mbedtls_t_udbl __attribute__((mode(TI)));
             #define MBEDTLS_HAVE_UDBL
         #endif /* !MBEDTLS_NO_UDBL_DIVISION */
-    #elif defined(__aarch64__)
+    #elif defined(__ARMCC_VERSION) && defined(__aarch64__)
         /*
+         * __ARMCC_VERSION is defined for both armcc and armclang and
          * __aarch64__ is only defined by armclang when compiling 64-bit code
          */
         #if !defined(MBEDTLS_HAVE_INT64)

From cb260bb30d5d3e17ab7ca945f03b0808a67b1812 Mon Sep 17 00:00:00 2001
From: Ko- <k.stoffelen@cs.ru.nl>
Date: Mon, 20 Aug 2018 13:59:53 +0100
Subject: [PATCH 4/4] Fix -O0 build for Aarch64 bignum multiplication.

---
 include/mbedtls/bn_mul.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/mbedtls/bn_mul.h b/include/mbedtls/bn_mul.h
index 4200ad43a..163869ae7 100644
--- a/include/mbedtls/bn_mul.h
+++ b/include/mbedtls/bn_mul.h
@@ -204,20 +204,20 @@
     asm(
 
 #define MULADDC_CORE                \
-        "ldr x4, [%3], #8   \n\t"   \
-        "ldr x5, [%4]       \n\t"   \
-        "mul x6, x4, %6     \n\t"   \
-        "umulh x7, x4, %6   \n\t"   \
+        "ldr x4, [%2], #8   \n\t"   \
+        "ldr x5, [%1]       \n\t"   \
+        "mul x6, x4, %3     \n\t"   \
+        "umulh x7, x4, %3   \n\t"   \
         "adds x5, x5, x6    \n\t"   \
         "adc x7, x7, xzr    \n\t"   \
-        "adds x5, x5, %5    \n\t"   \
+        "adds x5, x5, %0    \n\t"   \
         "adc %0, x7, xzr    \n\t"   \
         "str x5, [%1], #8   \n\t"
 
-#define MULADDC_STOP                            \
-         : "+r" (c),  "=r" (d), "=r" (s)        \
-         : "r" (s), "r" (d), "r" (c), "r" (b)   \
-         : "x4", "x5", "x6", "x7", "cc"         \
+#define MULADDC_STOP /* "memory": the asm reads [s] and reads/writes [d] */ \
+         : "+r" (c),  "+r" (d), "+r" (s)                                    \
+         : "r" (b)                                                          \
+         : "x4", "x5", "x6", "x7", "cc", "memory"                           \
     );
 
 #endif /* Aarch64 */