Merge pull request #8719 from daverodgman/iar-codegen
Improve codegen of unaligned access for IAR and gcc
This commit is contained in:
commit
f5e231ca84
5 changed files with 193 additions and 38 deletions
2
ChangeLog.d/iar-gcc-perf.txt
Normal file
2
ChangeLog.d/iar-gcc-perf.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
Features
|
||||
* Improve performance (better code generation for unaligned access) for gcc (versions older than 9.3.0) and IAR.
|
|
@ -83,6 +83,14 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \
|
||||
&& !defined(__llvm__) && !defined(__INTEL_COMPILER)
|
||||
/* Defined if the compiler really is gcc and not clang, etc */
|
||||
#define MBEDTLS_COMPILER_IS_GCC
|
||||
#define MBEDTLS_GCC_VERSION \
|
||||
(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_DEPRECATE)
|
||||
#define _CRT_SECURE_NO_DEPRECATE 1
|
||||
#endif
|
||||
|
|
|
@ -37,6 +37,52 @@
|
|||
#define MBEDTLS_EFFICIENT_UNALIGNED_ACCESS
|
||||
#endif
|
||||
|
||||
#if defined(__IAR_SYSTEMS_ICC__) && \
|
||||
(defined(MBEDTLS_ARCH_IS_ARM64) || defined(MBEDTLS_ARCH_IS_ARM32) \
|
||||
|| defined(__ICCRX__) || defined(__ICCRL78__) || defined(__ICCRISCV__))
|
||||
#pragma language=save
|
||||
#pragma language=extended
|
||||
#define MBEDTLS_POP_IAR_LANGUAGE_PRAGMA
|
||||
/* IAR recommend this technique for accessing unaligned data in
|
||||
* https://www.iar.com/knowledge/support/technical-notes/compiler/accessing-unaligned-data
|
||||
* This results in a single load / store instruction (if unaligned access is supported).
|
||||
* According to that document, this is only supported on certain architectures.
|
||||
*/
|
||||
#define UINT_UNALIGNED
|
||||
typedef uint16_t __packed mbedtls_uint16_unaligned_t;
|
||||
typedef uint32_t __packed mbedtls_uint32_unaligned_t;
|
||||
typedef uint64_t __packed mbedtls_uint64_unaligned_t;
|
||||
#elif defined(MBEDTLS_COMPILER_IS_GCC) && (MBEDTLS_GCC_VERSION >= 40504) && \
|
||||
((MBEDTLS_GCC_VERSION < 90300) || (!defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)))
|
||||
/*
|
||||
* Old versions of gcc, depending on how the target is specified, may generate a branch to memcpy
|
||||
* for calls like `memcpy(dest, src, 4)` rather than generating some LDR or LDRB instructions
|
||||
* (similar for stores).
|
||||
* Recent versions where unaligned access is not enabled also do this.
|
||||
*
|
||||
* For performance (and code size, in some cases), we want to avoid the branch and just generate
|
||||
* some inline load/store instructions since the access is small and constant-size.
|
||||
*
|
||||
* The manual states:
|
||||
* "The aligned attribute specifies a minimum alignment for the variable or structure field,
|
||||
* measured in bytes."
|
||||
* https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html
|
||||
*
|
||||
* Tested with several versions of GCC from 4.5.0 up to 9.3.0
|
||||
* We don't enable this for versions older than 4.5.0, as they have not been tested.
|
||||
*/
|
||||
#define UINT_UNALIGNED
|
||||
typedef uint16_t __attribute__((__aligned__(1))) mbedtls_uint16_unaligned_t;
|
||||
typedef uint32_t __attribute__((__aligned__(1))) mbedtls_uint32_unaligned_t;
|
||||
typedef uint64_t __attribute__((__aligned__(1))) mbedtls_uint64_unaligned_t;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We try to force mbedtls_(get|put)_unaligned_uintXX to be always inline, because this results
|
||||
* in code that is both smaller and faster. IAR and gcc both benefit from this when optimising
|
||||
* for size.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Read the unsigned 16-bit integer from the given address, which need not
|
||||
* be aligned.
|
||||
|
@ -44,10 +90,20 @@
|
|||
* \param p pointer to 2 bytes of data
|
||||
* \return Data at the given address
|
||||
*/
|
||||
inline uint16_t mbedtls_get_unaligned_uint16(const void *p)
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
#pragma inline = forced
|
||||
#elif defined(__GNUC__)
|
||||
__attribute__((always_inline))
|
||||
#endif
|
||||
static inline uint16_t mbedtls_get_unaligned_uint16(const void *p)
|
||||
{
|
||||
uint16_t r;
|
||||
#if defined(UINT_UNALIGNED)
|
||||
mbedtls_uint16_unaligned_t *p16 = (mbedtls_uint16_unaligned_t *) p;
|
||||
r = *p16;
|
||||
#else
|
||||
memcpy(&r, p, sizeof(r));
|
||||
#endif
|
||||
return r;
|
||||
}
|
||||
|
||||
|
@ -58,9 +114,19 @@ inline uint16_t mbedtls_get_unaligned_uint16(const void *p)
|
|||
* \param p pointer to 2 bytes of data
|
||||
* \param x data to write
|
||||
*/
|
||||
inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x)
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
#pragma inline = forced
|
||||
#elif defined(__GNUC__)
|
||||
__attribute__((always_inline))
|
||||
#endif
|
||||
static inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x)
|
||||
{
|
||||
#if defined(UINT_UNALIGNED)
|
||||
mbedtls_uint16_unaligned_t *p16 = (mbedtls_uint16_unaligned_t *) p;
|
||||
*p16 = x;
|
||||
#else
|
||||
memcpy(p, &x, sizeof(x));
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -70,10 +136,20 @@ inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x)
|
|||
* \param p pointer to 4 bytes of data
|
||||
* \return Data at the given address
|
||||
*/
|
||||
inline uint32_t mbedtls_get_unaligned_uint32(const void *p)
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
#pragma inline = forced
|
||||
#elif defined(__GNUC__)
|
||||
__attribute__((always_inline))
|
||||
#endif
|
||||
static inline uint32_t mbedtls_get_unaligned_uint32(const void *p)
|
||||
{
|
||||
uint32_t r;
|
||||
#if defined(UINT_UNALIGNED)
|
||||
mbedtls_uint32_unaligned_t *p32 = (mbedtls_uint32_unaligned_t *) p;
|
||||
r = *p32;
|
||||
#else
|
||||
memcpy(&r, p, sizeof(r));
|
||||
#endif
|
||||
return r;
|
||||
}
|
||||
|
||||
|
@ -84,9 +160,19 @@ inline uint32_t mbedtls_get_unaligned_uint32(const void *p)
|
|||
* \param p pointer to 4 bytes of data
|
||||
* \param x data to write
|
||||
*/
|
||||
inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x)
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
#pragma inline = forced
|
||||
#elif defined(__GNUC__)
|
||||
__attribute__((always_inline))
|
||||
#endif
|
||||
static inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x)
|
||||
{
|
||||
#if defined(UINT_UNALIGNED)
|
||||
mbedtls_uint32_unaligned_t *p32 = (mbedtls_uint32_unaligned_t *) p;
|
||||
*p32 = x;
|
||||
#else
|
||||
memcpy(p, &x, sizeof(x));
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -96,10 +182,20 @@ inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x)
|
|||
* \param p pointer to 8 bytes of data
|
||||
* \return Data at the given address
|
||||
*/
|
||||
inline uint64_t mbedtls_get_unaligned_uint64(const void *p)
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
#pragma inline = forced
|
||||
#elif defined(__GNUC__)
|
||||
__attribute__((always_inline))
|
||||
#endif
|
||||
static inline uint64_t mbedtls_get_unaligned_uint64(const void *p)
|
||||
{
|
||||
uint64_t r;
|
||||
#if defined(UINT_UNALIGNED)
|
||||
mbedtls_uint64_unaligned_t *p64 = (mbedtls_uint64_unaligned_t *) p;
|
||||
r = *p64;
|
||||
#else
|
||||
memcpy(&r, p, sizeof(r));
|
||||
#endif
|
||||
return r;
|
||||
}
|
||||
|
||||
|
@ -110,11 +206,25 @@ inline uint64_t mbedtls_get_unaligned_uint64(const void *p)
|
|||
* \param p pointer to 8 bytes of data
|
||||
* \param x data to write
|
||||
*/
|
||||
inline void mbedtls_put_unaligned_uint64(void *p, uint64_t x)
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
#pragma inline = forced
|
||||
#elif defined(__GNUC__)
|
||||
__attribute__((always_inline))
|
||||
#endif
|
||||
static inline void mbedtls_put_unaligned_uint64(void *p, uint64_t x)
|
||||
{
|
||||
#if defined(UINT_UNALIGNED)
|
||||
mbedtls_uint64_unaligned_t *p64 = (mbedtls_uint64_unaligned_t *) p;
|
||||
*p64 = x;
|
||||
#else
|
||||
memcpy(p, &x, sizeof(x));
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(MBEDTLS_POP_IAR_LANGUAGE_PRAGMA)
|
||||
#pragma language=restore
|
||||
#endif
|
||||
|
||||
/** Byte Reading Macros
|
||||
*
|
||||
* Given a multi-byte integer \p x, MBEDTLS_BYTE_n retrieves the n-th
|
||||
|
|
|
@ -27,15 +27,6 @@
|
|||
#define MBEDTLS_HAVE_NEON_INTRINSICS
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \
|
||||
&& !defined(__llvm__) && !defined(__INTEL_COMPILER)
|
||||
/* Defined if the compiler really is gcc and not clang, etc */
|
||||
#define MBEDTLS_COMPILER_IS_GCC
|
||||
#define MBEDTLS_GCC_VERSION \
|
||||
(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
|
||||
#endif
|
||||
|
||||
/** Helper to define a function as static except when building invasive tests.
|
||||
*
|
||||
* If a function is only used inside its own source file and should be
|
||||
|
@ -167,6 +158,12 @@ static inline const unsigned char *mbedtls_buffer_offset_const(
|
|||
return p == NULL ? NULL : p + n;
|
||||
}
|
||||
|
||||
/* Always inline mbedtls_xor() for similar reasons as mbedtls_xor_no_simd(). */
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
#pragma inline = forced
|
||||
#elif defined(__GNUC__)
|
||||
__attribute__((always_inline))
|
||||
#endif
|
||||
/**
|
||||
* Perform a fast block XOR operation, such that
|
||||
* r[i] = a[i] ^ b[i] where 0 <= i < n
|
||||
|
@ -177,8 +174,19 @@ static inline const unsigned char *mbedtls_buffer_offset_const(
|
|||
* \param a Pointer to input (buffer of at least \p n bytes)
|
||||
* \param b Pointer to input (buffer of at least \p n bytes)
|
||||
* \param n Number of bytes to process.
|
||||
*
|
||||
* \note Depending on the situation, it may be faster to use either mbedtls_xor() or
|
||||
* mbedtls_xor_no_simd() (these are functionally equivalent).
|
||||
* If the result is used immediately after the xor operation in non-SIMD code (e.g., in
|
||||
* AES-CBC), there may be additional latency to transfer the data from SIMD to scalar
|
||||
* registers, and in this case, mbedtls_xor_no_simd() may be faster. In other cases where
|
||||
* the result is not used immediately (e.g., in AES-CTR), mbedtls_xor() may be faster.
|
||||
* For targets without SIMD support, they will behave the same.
|
||||
*/
|
||||
inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n)
|
||||
static inline void mbedtls_xor(unsigned char *r,
|
||||
const unsigned char *a,
|
||||
const unsigned char *b,
|
||||
size_t n)
|
||||
{
|
||||
size_t i = 0;
|
||||
#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
|
||||
|
@ -191,17 +199,36 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
|
|||
uint8x16_t x = veorq_u8(v1, v2);
|
||||
vst1q_u8(r + i, x);
|
||||
}
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
/* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case
|
||||
* where n is a constant multiple of 16.
|
||||
* For other compilers (e.g. recent gcc and clang) it makes no difference if n is a compile-time
|
||||
* constant, and is a very small perf regression if n is not a compile-time constant. */
|
||||
if (n % 16 == 0) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#elif defined(MBEDTLS_ARCH_IS_X64) || defined(MBEDTLS_ARCH_IS_ARM64)
|
||||
/* This codepath probably only makes sense on architectures with 64-bit registers */
|
||||
for (; (i + 8) <= n; i += 8) {
|
||||
uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i);
|
||||
mbedtls_put_unaligned_uint64(r + i, x);
|
||||
}
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
if (n % 8 == 0) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
for (; (i + 4) <= n; i += 4) {
|
||||
uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
|
||||
mbedtls_put_unaligned_uint32(r + i, x);
|
||||
}
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
if (n % 4 == 0) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
|
@ -209,11 +236,18 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
|
|||
}
|
||||
}
|
||||
|
||||
/* Always inline mbedtls_xor_no_simd() as we see significant perf regressions when it does not get
|
||||
* inlined (e.g., observed about 3x perf difference in gcm_mult_largetable with gcc 7 - 12) */
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
#pragma inline = forced
|
||||
#elif defined(__GNUC__)
|
||||
__attribute__((always_inline))
|
||||
#endif
|
||||
/**
|
||||
* Perform a fast block XOR operation, such that
|
||||
* r[i] = a[i] ^ b[i] where 0 <= i < n
|
||||
*
|
||||
* In some situations, this can perform better than mbedtls_xor (e.g., it's about 5%
|
||||
* In some situations, this can perform better than mbedtls_xor() (e.g., it's about 5%
|
||||
* better in AES-CBC).
|
||||
*
|
||||
* \param r Pointer to result (buffer of at least \p n bytes). \p r
|
||||
|
@ -222,6 +256,14 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
|
|||
* \param a Pointer to input (buffer of at least \p n bytes)
|
||||
* \param b Pointer to input (buffer of at least \p n bytes)
|
||||
* \param n Number of bytes to process.
|
||||
*
|
||||
* \note Depending on the situation, it may be faster to use either mbedtls_xor() or
|
||||
* mbedtls_xor_no_simd() (these are functionally equivalent).
|
||||
* If the result is used immediately after the xor operation in non-SIMD code (e.g., in
|
||||
* AES-CBC), there may be additional latency to transfer the data from SIMD to scalar
|
||||
* registers, and in this case, mbedtls_xor_no_simd() may be faster. In other cases where
|
||||
* the result is not used immediately (e.g., in AES-CTR), mbedtls_xor() may be faster.
|
||||
* For targets without SIMD support, they will behave the same.
|
||||
*/
|
||||
static inline void mbedtls_xor_no_simd(unsigned char *r,
|
||||
const unsigned char *a,
|
||||
|
@ -236,11 +278,25 @@ static inline void mbedtls_xor_no_simd(unsigned char *r,
|
|||
uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i);
|
||||
mbedtls_put_unaligned_uint64(r + i, x);
|
||||
}
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
/* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case
|
||||
* where n is a constant multiple of 8.
|
||||
* For other compilers (e.g. recent gcc and clang) it makes no difference if n is a compile-time
|
||||
* constant, and is a very small perf regression if n is not a compile-time constant. */
|
||||
if (n % 8 == 0) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
for (; (i + 4) <= n; i += 4) {
|
||||
uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
|
||||
mbedtls_put_unaligned_uint32(r + i, x);
|
||||
}
|
||||
#if defined(__IAR_SYSTEMS_ICC__)
|
||||
if (n % 4 == 0) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
|
|
|
@ -217,27 +217,6 @@ struct tm *mbedtls_platform_gmtime_r(const mbedtls_time_t *tt,
|
|||
void (*mbedtls_test_hook_test_fail)(const char *, int, const char *);
|
||||
#endif /* MBEDTLS_TEST_HOOKS */
|
||||
|
||||
/*
|
||||
* Provide external definitions of some inline functions so that the compiler
|
||||
* has the option to not inline them
|
||||
*/
|
||||
extern inline void mbedtls_xor(unsigned char *r,
|
||||
const unsigned char *a,
|
||||
const unsigned char *b,
|
||||
size_t n);
|
||||
|
||||
extern inline uint16_t mbedtls_get_unaligned_uint16(const void *p);
|
||||
|
||||
extern inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x);
|
||||
|
||||
extern inline uint32_t mbedtls_get_unaligned_uint32(const void *p);
|
||||
|
||||
extern inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x);
|
||||
|
||||
extern inline uint64_t mbedtls_get_unaligned_uint64(const void *p);
|
||||
|
||||
extern inline void mbedtls_put_unaligned_uint64(void *p, uint64_t x);
|
||||
|
||||
#if defined(MBEDTLS_HAVE_TIME) && !defined(MBEDTLS_PLATFORM_MS_TIME_ALT)
|
||||
|
||||
#include <time.h>
|
||||
|
|
Loading…
Reference in a new issue