aria: optimize byte perms on Arm

Use specific instructions for moving bytes around in a word. This speeds
things up, and as a side-effect, slightly lowers code size.

ARIA_P3 and ARIA_P1 are now 1 single-cycle instruction each (those
instructions are available in all architecture versions starting from v6-M).
Note: ARIA_P3 was already translated to a single instruction by Clang 3.8 and
armclang 6.5, but not arm-gcc 5.4 nor armcc 5.06.

ARIA_P2 is already efficiently translated to the minimal number of
instruction (1 in ARM mode, 2 in thumb mode) by all tested compilers

Manually compiled and inspected generated code with the following compilers:
arm-gcc 5.4, clang 3.8, armcc 5.06 (with and without --gnu), armclang 6.5.

Size reduction (arm-none-eabi-gcc -march=armv6-m -mthumb -Os): 5288 -> 5044 B

Effect on executing time of self-tests on a few boards:
FRDM-K64F   (Cortex-M4):    444 ->  385 us (-13%)
LPC1768     (Cortex-M3):    488 ->  432 us (-11%)
FRDM-KL64Z  (Cortex-M0):   1429 -> 1134 us (-20%)

Measured using a config.h with no cipher mode and the following program with
aria.c and aria.h copy-pasted to the online compiler:

 #include "mbed.h"
 #include "aria.h"

int main() {
    Timer t;
    t.start();
    int ret = mbedtls_aria_self_test(0);
    t.stop();
    printf("ret = %d; time = %d us\n", ret, t.read_us());
}
This commit is contained in:
Manuel Pégourié-Gonnard 2018-02-27 10:22:26 +01:00
parent fb0e4f0d1a
commit 377b2b624d

View file

@ -85,11 +85,33 @@ static void mbedtls_zeroize( void *v, size_t n ) {
* Common compilers fail to translate this to minimal number of instructions,
* so let's provide asm versions for common platforms with C fallback.
*/
#if defined(MBEDTLS_HAVE_ASM) && defined(__GNUC__)
#if defined(__i386__) || defined(__amd64__) || defined( __x86_64__)
#if defined(MBEDTLS_HAVE_ASM)
#if defined(__arm__)
/* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm */
#if defined(__GNUC__) && \
( !defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 )
static inline uint32_t aria_p1( uint32_t x )
{
uint32_t r;
asm( "rev16 %0, %1" : "=l" (r) : "l" (x) );
return( r );
}
#define ARIA_P1 aria_p1
#elif defined(__ARMCC_VERSION) && __ARMCC_VERSION < 6000000
static __inline uint32_t aria_p1( uint32_t x )
{
uint32_t r;
__asm( "rev16 r, x" );
return( r );
}
#define ARIA_P1 aria_p1
#endif
#endif /* arm */
#if defined(__GNUC__) && \
defined(__i386__) || defined(__amd64__) || defined( __x86_64__)
/* I couldn't find an Intel equivalent of ret16, so two instructions */
#define ARIA_P1(x) ARIA_P2( ARIA_P3( x ) )
#endif
#endif /* x86 gnuc */
#endif /* MBEDTLS_HAVE_ASM && GNUC */
#if !defined(ARIA_P1)
#define ARIA_P1(x) ((((x) >> 8) & 0x00FF00FF) ^ (((x) & 0x00FF00FF) << 8))
@ -112,15 +134,37 @@ static void mbedtls_zeroize( void *v, size_t n ) {
* Some compilers fail to translate this to a single instruction,
* so let's provide asm versions for common platforms with C fallback.
*/
#if defined(MBEDTLS_HAVE_ASM) && defined(__GNUC__)
#if defined(__i386__) || defined(__amd64__) || defined( __x86_64__)
#if defined(MBEDTLS_HAVE_ASM)
#if defined(__arm__)
/* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm */
#if defined(__GNUC__) && \
( !defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 )
static inline uint32_t aria_p3( uint32_t x )
{
uint32_t r;
asm( "rev %0, %1" : "=l" (r) : "l" (x) );
return( r );
}
#define ARIA_P3 aria_p3
#elif defined(__ARMCC_VERSION) && __ARMCC_VERSION < 6000000
static __inline uint32_t aria_p3( uint32_t x )
{
uint32_t r;
__asm( "rev r, x" );
return( r );
}
#define ARIA_P3 aria_p3
#endif
#endif /* arm */
#if defined(__GNUC__) && \
defined(__i386__) || defined(__amd64__) || defined( __x86_64__)
static inline uint32_t aria_p3( uint32_t x )
{
asm( "bswap %0" : "=r" (x) : "0" (x) );
return( x );
}
#define ARIA_P3 aria_p3
#endif
#endif /* x86 gnuc */
#endif /* MBEDTLS_HAVE_ASM && GNUC */
#if !defined(ARIA_P3)
#define ARIA_P3(x) ARIA_P2( ARIA_P1 ( x ) )