From 377b2b624d5a5d894965236f0ae7fe8d09813a44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Tue, 27 Feb 2018 10:22:26 +0100 Subject: [PATCH] aria: optimize byte perms on Arm Use specific instructions for moving bytes around in a word. This speeds things up, and as a side effect, slightly lowers code size. ARIA_P3 and ARIA_P1 are now 1 single-cycle instruction each (those instructions are available in all architecture versions starting from v6-M). Note: ARIA_P3 was already translated to a single instruction by Clang 3.8 and armclang 6.5, but not arm-gcc 5.4 nor armcc 5.06. ARIA_P2 is already efficiently translated to the minimal number of instructions (1 in ARM mode, 2 in Thumb mode) by all tested compilers. Manually compiled and inspected generated code with the following compilers: arm-gcc 5.4, clang 3.8, armcc 5.06 (with and without --gnu), armclang 6.5. Size reduction (arm-none-eabi-gcc -march=armv6-m -mthumb -Os): 5288 -> 5044 B Effect on execution time of self-tests on a few boards: FRDM-K64F (Cortex-M4): 444 -> 385 us (-13%) LPC1768 (Cortex-M3): 488 -> 432 us (-11%) FRDM-KL64Z (Cortex-M0): 1429 -> 1134 us (-20%) Measured using a config.h with no cipher mode and the following program with aria.c and aria.h copy-pasted to the online compiler: #include "mbed.h" #include "aria.h" int main() { Timer t; t.start(); int ret = mbedtls_aria_self_test(0); t.stop(); printf("ret = %d; time = %d us\n", ret, t.read_us()); } --- library/aria.c | 56 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/library/aria.c b/library/aria.c index f6ad7f126..1d1daa18a 100644 --- a/library/aria.c +++ b/library/aria.c @@ -85,11 +85,33 @@ static void mbedtls_zeroize( void *v, size_t n ) { * Common compilers fail to translate this to minimal number of instructions, * so let's provide asm versions for common platforms with C fallback. 
*/ -#if defined(MBEDTLS_HAVE_ASM) && defined(__GNUC__) -#if defined(__i386__) || defined(__amd64__) || defined( __x86_64__) +#if defined(MBEDTLS_HAVE_ASM) +#if defined(__arm__) /* rev16 only exists from ARMv6 (incl. v6-M) up */ +/* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm, so it gets its own branch using the armcc inline assembler. Both branches also check the architecture version: on older cores neither is taken and the portable C definition of ARIA_P1 below is used instead. */ +#if defined(__GNUC__) && \ + ( !defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 ) && defined(__ARM_ARCH) && __ARM_ARCH >= 6 +static inline uint32_t aria_p1( uint32_t x ) +{ + uint32_t r; + asm( "rev16 %0, %1" : "=l" (r) : "l" (x) ); + return( r ); +} +#define ARIA_P1 aria_p1 +#elif defined(__ARMCC_VERSION) && __ARMCC_VERSION < 6000000 && ( __TARGET_ARCH_ARM >= 6 || __TARGET_ARCH_THUMB >= 3 ) +static __inline uint32_t aria_p1( uint32_t x ) +{ + uint32_t r; + __asm( "rev16 r, x" ); + return( r ); +} +#define ARIA_P1 aria_p1 +#endif +#endif /* arm */ +#if defined(__GNUC__) && \ + ( defined(__i386__) || defined(__amd64__) || defined( __x86_64__) ) /* parenthesized: && binds tighter than ||, and GNUC must be required on every x86 variant */ /* I couldn't find an Intel equivalent of ret16, so two instructions */ #define ARIA_P1(x) ARIA_P2( ARIA_P3( x ) ) -#endif +#endif /* x86 gnuc */ #endif /* MBEDTLS_HAVE_ASM && GNUC */ #if !defined(ARIA_P1) #define ARIA_P1(x) ((((x) >> 8) & 0x00FF00FF) ^ (((x) & 0x00FF00FF) << 8)) @@ -112,15 +134,37 @@ static void mbedtls_zeroize( void *v, size_t n ) { * Some compilers fail to translate this to a single instruction, * so let's provide asm versions for common platforms with C fallback. 
*/ -#if defined(MBEDTLS_HAVE_ASM) && defined(__GNUC__) -#if defined(__i386__) || defined(__amd64__) || defined( __x86_64__) +#if defined(MBEDTLS_HAVE_ASM) +#if defined(__arm__) /* rev only exists from ARMv6 (incl. v6-M) up */ +/* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm, so it gets its own branch using the armcc inline assembler. Both branches also check the architecture version: on older cores neither is taken and the portable C definition of ARIA_P3 below is used instead. */ +#if defined(__GNUC__) && \ + ( !defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 ) && defined(__ARM_ARCH) && __ARM_ARCH >= 6 +static inline uint32_t aria_p3( uint32_t x ) +{ + uint32_t r; + asm( "rev %0, %1" : "=l" (r) : "l" (x) ); + return( r ); +} +#define ARIA_P3 aria_p3 +#elif defined(__ARMCC_VERSION) && __ARMCC_VERSION < 6000000 && ( __TARGET_ARCH_ARM >= 6 || __TARGET_ARCH_THUMB >= 3 ) +static __inline uint32_t aria_p3( uint32_t x ) +{ + uint32_t r; + __asm( "rev r, x" ); + return( r ); +} +#define ARIA_P3 aria_p3 +#endif +#endif /* arm */ +#if defined(__GNUC__) && \ + ( defined(__i386__) || defined(__amd64__) || defined( __x86_64__) ) /* parenthesized: && binds tighter than ||, and the bswap helper below needs GNU extended asm on every x86 variant */ static inline uint32_t aria_p3( uint32_t x ) { asm( "bswap %0" : "=r" (x) : "0" (x) ); return( x ); } #define ARIA_P3 aria_p3 -#endif +#endif /* x86 gnuc */ #endif /* MBEDTLS_HAVE_ASM && GNUC */ #if !defined(ARIA_P3) #define ARIA_P3(x) ARIA_P2( ARIA_P1 ( x ) )