From 87fbfb5d82ec591e77d765fd05c401a64be73f32 Mon Sep 17 00:00:00 2001 From: Tom Cosgrove Date: Tue, 15 Mar 2022 10:51:52 +0000 Subject: [PATCH 1/3] SECLIB-667: Accelerate SHA-512 with A64 crypto extensions Provide an additional pair of #defines, MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT and MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY. At most one of them may be specified. If used, it is necessary to compile with -march=armv8.2-a+sha3. The MBEDTLS_SHA512_PROCESS_ALT and MBEDTLS_SHA512_ALT mechanisms continue to work, and are mutually exclusive with SHA512_USE_A64_CRYPTO. There should be minimal code size impact if no A64_CRYPTO option is set. The SHA-512 implementation was originally written by Simon Tatham for PuTTY, under the MIT licence; dual-licensed as Apache 2 with his kind permission. Signed-off-by: Tom Cosgrove --- ...mbedtls_sha512_a64_crypto_acceleration.txt | 2 + include/mbedtls/check_config.h | 55 +++ include/mbedtls/mbedtls_config.h | 54 +++ library/sha512.c | 439 +++++++++++++++++- scripts/config.py | 3 +- tests/scripts/all.sh | 8 +- 6 files changed, 549 insertions(+), 12 deletions(-) create mode 100644 ChangeLog.d/mbedtls_sha512_a64_crypto_acceleration.txt diff --git a/ChangeLog.d/mbedtls_sha512_a64_crypto_acceleration.txt b/ChangeLog.d/mbedtls_sha512_a64_crypto_acceleration.txt new file mode 100644 index 000000000..01be0b345 --- /dev/null +++ b/ChangeLog.d/mbedtls_sha512_a64_crypto_acceleration.txt @@ -0,0 +1,2 @@ +Features + * A64 crypto extension support for SHA-512 diff --git a/include/mbedtls/check_config.h b/include/mbedtls/check_config.h index 06ba6b7d4..45f4cc5e2 100644 --- a/include/mbedtls/check_config.h +++ b/include/mbedtls/check_config.h @@ -605,6 +605,61 @@ #error "MBEDTLS_SHA384_C defined without MBEDTLS_SHA512_C" #endif +#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) && \ + defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY) +#error "Must only define one of MBEDTLS_SHA512_USE_A64_CRYPTO_*" +#endif + +#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) || \ + defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY) +#if !defined(MBEDTLS_SHA512_C) +#error "MBEDTLS_SHA512_USE_A64_CRYPTO_* defined without MBEDTLS_SHA512_C" +#endif +#if defined(MBEDTLS_SHA512_ALT) || defined(MBEDTLS_SHA512_PROCESS_ALT) +#error "MBEDTLS_SHA512_*ALT can't be used with MBEDTLS_SHA512_USE_A64_CRYPTO_*" +#endif +/* + * Best performance comes from most recent compilers, with intrinsics and -O3. + * Must compile with -march=armv8.2-a+sha3, but we can't detect armv8.2-a, and + * can't always detect __ARM_FEATURE_SHA512 (notably clang 7-12). 
+ *
+ * GCC < 8 won't work at all (lacks the sha512 instructions)
+ * GCC >= 8 uses intrinsics, sets __ARM_FEATURE_SHA512
+ *
+ * Clang < 7 won't work at all (lacks the sha512 instructions)
+ * Clang 7-12 don't have intrinsics (but we work around that with inline
+ *            assembler) or __ARM_FEATURE_SHA512
+ * Clang == 13.0.0 same as clang 12 (only seen on macOS)
+ * Clang >= 13.0.1 has __ARM_FEATURE_SHA512 and intrinsics
+ */
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SHA512)
+   /* Test Clang first, as it defines __GNUC__ */
+#  if defined(__clang__)
+#    if __clang_major__ < 7
+#      error "A more recent Clang is required for MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+#    elif __clang_major__ < 13 || \
+         (__clang_major__ == 13 && __clang_minor__ == 0 && __clang_patchlevel__ == 0)
+       /* We implement the intrinsics with inline assembler, so don't error */
+#    else
+#      error "Must use minimum -march=armv8.2-a+sha3 for MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+#    endif
+#  elif defined(__GNUC__)
+#    if __GNUC__ < 8
+#      error "A more recent GCC is required for MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+#    else
+#      error "Must use minimum -march=armv8.2-a+sha3 for MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+#    endif
+#  else
+#    error "Only GCC and Clang supported for MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+#  endif
+#endif
+
+#endif /* MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT || MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY */
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY) && !defined(__aarch64__)
+#error "MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY defined on non-Aarch64 system"
+#endif
+
 #if defined(MBEDTLS_SHA224_C) && !defined(MBEDTLS_SHA256_C)
 #error "MBEDTLS_SHA224_C defined without MBEDTLS_SHA256_C"
 #endif
diff --git a/include/mbedtls/mbedtls_config.h b/include/mbedtls/mbedtls_config.h
index 1c631b526..fe1b7d4cb 100644
--- a/include/mbedtls/mbedtls_config.h
+++ b/include/mbedtls/mbedtls_config.h
@@ -2840,6 +2840,60 @@
  */
 #define MBEDTLS_SHA512_C
 
+/**
+ * \def MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+ *
+ * Enable acceleration of the SHA-512 cryptographic hash algorithm with the
+ * Arm A64 cryptographic extensions if they are available at runtime. If not,
+ * the library will fall back to the C implementation.
+ *
+ * \note If MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT is defined when building
+ * for non-Aarch64 targets, it will be silently ignored.
+ *
+ * \note The code uses the SHA-512 Neon intrinsics, so requires GCC >= 8 or
+ * Clang >= 7, and \c CFLAGS must be set to a minimum of
+ * \c -march=armv8.2-a+sha3. An optimisation level of \c -O3 generates the
+ * fastest code.
+ *
+ * \warning MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT cannot be defined at the
+ * same time as MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY.
+ *
+ * Requires: MBEDTLS_SHA512_C.
+ *
+ * Module:  library/sha512.c
+ *
+ * Uncomment to have the library check for the A64 SHA-512 crypto extensions
+ * and use them if available.
+ */
+//#define MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+
+/**
+ * \def MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY
+ *
+ * Enable acceleration of the SHA-512 cryptographic hash algorithm with the
+ * Arm A64 cryptographic extensions, which must be available at runtime (or
+ * an illegal instruction fault will occur).
+ *
+ * \note This allows builds with a smaller code size than with
+ * MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT.
+ *
+ * \note The code uses the SHA-512 Neon intrinsics, so requires GCC >= 8 or
+ * Clang >= 7, and \c CFLAGS must be set to a minimum of
+ * \c -march=armv8.2-a+sha3. An optimisation level of \c -O3 generates the
+ * fastest code.
+ *
+ * \warning MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY cannot be defined at the same
+ * time as MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT.
+ *
+ * Requires: MBEDTLS_SHA512_C.
+ *
+ * Module:  library/sha512.c
+ *
+ * Uncomment to have the library use the A64 SHA-512 crypto extensions
+ * unconditionally.
+ */
+//#define MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY
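Both options are transparent to callers: they change only which implementation backs the existing SHA-512 API, so applications need no source changes. A minimal sketch of the caller's view (assuming the standard mbed TLS 3.x streaming API from sha512.h; error handling abbreviated):

    /* A minimal sketch: the same caller code runs over the C implementation,
     * the run-time-dispatched A64 path, or an A64_CRYPTO_ONLY build. */
    #include "mbedtls/sha512.h"

    int hash_message( const unsigned char *msg, size_t len,
                      unsigned char digest[64] )
    {
        mbedtls_sha512_context ctx;
        int ret;

        mbedtls_sha512_init( &ctx );

        /* Final argument 0 selects SHA-512; 1 would select SHA-384 */
        if( ( ret = mbedtls_sha512_starts( &ctx, 0 ) ) != 0 )
            goto exit;
        if( ( ret = mbedtls_sha512_update( &ctx, msg, len ) ) != 0 )
            goto exit;
        ret = mbedtls_sha512_finish( &ctx, digest );

    exit:
        mbedtls_sha512_free( &ctx );
        return( ret );
    }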
 
 /**
  * \def MBEDTLS_SSL_CACHE_C
  *
diff --git a/library/sha512.c b/library/sha512.c
index 2b4cc547e..71fbff06e 100644
--- a/library/sha512.c
+++ b/library/sha512.c
@@ -50,12 +50,128 @@
 #endif /* MBEDTLS_PLATFORM_C */
 #endif /* MBEDTLS_SELF_TEST */
 
+#if defined(__aarch64__)
+#  if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) || \
+      defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+#    include <arm_neon.h>
+#  endif
+#  if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+#    if defined(__unix__)
+#      if defined(__linux__)
+/* Our preferred method of detection is getauxval() */
+#        include <sys/auxv.h>
+#      endif
+/* Use SIGILL on Unix, and fall back to it on Linux */
+#      include <signal.h>
+#    endif
+#  endif
+#elif defined(_M_ARM64)
+#  if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) || \
+      defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+#    include <arm64_neon.h>
+#  endif
+#else
+#  undef MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY
+#  undef MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+#endif
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+/*
+ * Capability detection code comes early, so we can disable
+ * MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT if no detection mechanism found
+ */
+#if defined(HWCAP_SHA512)
+static int mbedtls_a64_crypto_sha512_determine_support( void )
+{
+    return( ( getauxval( AT_HWCAP ) & HWCAP_SHA512 ) ? 1 : 0 );
+}
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+static int mbedtls_a64_crypto_sha512_determine_support( void )
+{
+    int value = 0;
+    size_t value_len = sizeof(value);
+
+    int ret = sysctlbyname( "hw.optional.armv8_2_sha512", &value, &value_len,
+                            NULL, 0 );
+    return( ret == 0 && value != 0 );
+}
+#elif defined(_M_ARM64)
+/*
+ * As of March 2022, there don't appear to be any PF_ARM_V8_* flags
+ * available to pass to IsProcessorFeaturePresent() to check for
+ * SHA-512 support. So we fall back to the C code only.
+ */
+#if defined(_MSC_VER)
+#pragma message "No mechanism to detect A64_CRYPTO found, using C code only"
+#else
+#warning "No mechanism to detect A64_CRYPTO found, using C code only"
+#endif
+#elif defined(__unix__) && defined(SIG_SETMASK)
+/* Detection with SIGILL, setjmp() and longjmp() */
+#include <signal.h>
+#include <setjmp.h>
+
+#ifndef asm
+#define asm __asm__
+#endif
+
+static jmp_buf return_from_sigill;
+
+/*
+ * A64 SHA512 support detection via SIGILL
+ */
+static void sigill_handler( int signal )
+{
+    (void) signal;
+    longjmp( return_from_sigill, 1 );
+}
+
+static int mbedtls_a64_crypto_sha512_determine_support( void )
+{
+    struct sigaction old_action, new_action;
+
+    sigset_t old_mask;
+    if( sigprocmask( 0, NULL, &old_mask ) )
+        return( 0 );
+
+    sigemptyset( &new_action.sa_mask );
+    new_action.sa_flags = 0;
+    new_action.sa_handler = sigill_handler;
+
+    sigaction( SIGILL, &new_action, &old_action );
+
+    static int ret = 0;
+
+    if( setjmp( return_from_sigill ) == 0 )         /* First return only */
+    {
+        /* If this traps, we will return a second time from setjmp() with 1 */
+        asm( "sha512h q0, q0, v0.2d" : : : "v0" );
+        ret = 1;
+    }
+
+    sigaction( SIGILL, &old_action, NULL );
+    sigprocmask( SIG_SETMASK, &old_mask, NULL );
+
+    return( ret );
+}
+#else
+#warning "No mechanism to detect A64_CRYPTO found, using C code only"
+#undef MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+#endif  /* HWCAP_SHA512, __APPLE__, __unix__ && SIG_SETMASK */
+
+#endif  /* MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT */
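On Linux, the getauxval() path above is the preferred detection mechanism. For illustration only, the same check as a stand-alone probe program; this probe is not part of the patch, and the HWCAP_SHA512 fallback value is an assumption taken from the Linux arm64 hwcap definitions:

    /* Illustrative stand-alone probe (not part of this patch): reports
     * whether the CPU advertises the SHA-512 instructions via AT_HWCAP.
     * Linux/AArch64 only; build with e.g. cc -o probe probe.c */
    #include <stdio.h>
    #include <sys/auxv.h>

    #ifndef HWCAP_SHA512
    #define HWCAP_SHA512 ( 1 << 21 )    /* bit from the arm64 UAPI hwcap list */
    #endif

    int main( void )
    {
        unsigned long hwcaps = getauxval( AT_HWCAP );

        printf( "SHA-512 instructions %savailable\n",
                ( hwcaps & HWCAP_SHA512 ) ? "" : "not " );
        return( 0 );
    }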
+
 #define SHA512_VALIDATE_RET(cond)                           \
     MBEDTLS_INTERNAL_VALIDATE_RET( cond, MBEDTLS_ERR_SHA512_BAD_INPUT_DATA )
 #define SHA512_VALIDATE(cond)  MBEDTLS_INTERNAL_VALIDATE( cond )
 
 #if !defined(MBEDTLS_SHA512_ALT)
 
+#define SHA512_BLOCK_SIZE 128
+
 #if defined(MBEDTLS_SHA512_SMALLER)
 static void sha512_put_uint64_be( uint64_t n, unsigned char *b, uint8_t i )
 {
@@ -188,9 +304,249 @@ static const uint64_t K[80] =
     UL64(0x4CC5D4BECB3E42B6),  UL64(0x597F299CFC657E2A),
     UL64(0x5FCB6FAB3AD6FAEC),  UL64(0x6C44198C4A475817)
 };
+#endif
 
-int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
-                                     const unsigned char data[128] )
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) || \
+    defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+#  define mbedtls_internal_sha512_process_many_a64_crypto mbedtls_internal_sha512_process_many
+#  define mbedtls_internal_sha512_process_a64_crypto      mbedtls_internal_sha512_process
+#endif
+
+#ifndef asm
+#define asm __asm__
+#endif
+
+/* Accelerated SHA-512 implementation originally written by Simon Tatham for PuTTY,
+ * under the MIT licence; dual-licensed as Apache 2 with his kind permission.
+ */ + +#if defined(__clang__) && \ + (__clang_major__ < 13 || \ + (__clang_major__ == 13 && __clang_minor__ == 0 && __clang_patchlevel__ == 0)) +static inline uint64x2_t vsha512su0q_u64(uint64x2_t x, uint64x2_t y) +{ + asm( "sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y) ); + return( x ); +} +static inline uint64x2_t vsha512su1q_u64(uint64x2_t x, uint64x2_t y, uint64x2_t z) +{ + asm( "sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z) ); + return( x ); +} +static inline uint64x2_t vsha512hq_u64(uint64x2_t x, uint64x2_t y, uint64x2_t z) +{ + asm( "sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z) ); + return( x ); +} +static inline uint64x2_t vsha512h2q_u64(uint64x2_t x, uint64x2_t y, uint64x2_t z) +{ + asm( "sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z) ); + return( x ); +} +#endif /* __clang__ etc */ + +static size_t mbedtls_internal_sha512_process_many_a64_crypto( + mbedtls_sha512_context *ctx, const uint8_t *msg, size_t len ) +{ + uint64x2_t ab = vld1q_u64( &ctx->state[0] ); + uint64x2_t cd = vld1q_u64( &ctx->state[2] ); + uint64x2_t ef = vld1q_u64( &ctx->state[4] ); + uint64x2_t gh = vld1q_u64( &ctx->state[6] ); + + size_t processed = 0; + + for ( ; + len >= SHA512_BLOCK_SIZE; + processed += SHA512_BLOCK_SIZE, + msg += SHA512_BLOCK_SIZE, + len -= SHA512_BLOCK_SIZE ) + { + uint64x2_t initial_sum, sum, intermed; + + uint64x2_t ab_orig = ab; + uint64x2_t cd_orig = cd; + uint64x2_t ef_orig = ef; + uint64x2_t gh_orig = gh; + + uint64x2_t s0 = (uint64x2_t) vld1q_u8( msg + 16 * 0 ); + uint64x2_t s1 = (uint64x2_t) vld1q_u8( msg + 16 * 1 ); + uint64x2_t s2 = (uint64x2_t) vld1q_u8( msg + 16 * 2 ); + uint64x2_t s3 = (uint64x2_t) vld1q_u8( msg + 16 * 3 ); + uint64x2_t s4 = (uint64x2_t) vld1q_u8( msg + 16 * 4 ); + uint64x2_t s5 = (uint64x2_t) vld1q_u8( msg + 16 * 5 ); + uint64x2_t s6 = (uint64x2_t) vld1q_u8( msg + 16 * 6 ); + uint64x2_t s7 = (uint64x2_t) vld1q_u8( msg + 16 * 7 ); + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* assume LE if these not defined; untested on BE */ + s0 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s0 ) ) ); + s1 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s1 ) ) ); + s2 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s2 ) ) ); + s3 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s3 ) ) ); + s4 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s4 ) ) ); + s5 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s5 ) ) ); + s6 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s6 ) ) ); + s7 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s7 ) ) ); +#endif + + /* Rounds 0 and 1 */ + initial_sum = vaddq_u64( s0, vld1q_u64( &K[0] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh ); + intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) ); + gh = vsha512h2q_u64( intermed, cd, ab ); + cd = vaddq_u64( cd, intermed ); + + /* Rounds 2 and 3 */ + initial_sum = vaddq_u64( s1, vld1q_u64( &K[2] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef ); + intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) ); + ef = vsha512h2q_u64( intermed, ab, gh ); + ab = vaddq_u64( ab, intermed ); + + /* Rounds 4 and 5 */ + initial_sum = vaddq_u64( s2, vld1q_u64( &K[4] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd ); + intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) ); + cd = vsha512h2q_u64( intermed, gh, ef ); + gh = vaddq_u64( gh, intermed ); + + /* Rounds 6 and 7 */ + 
initial_sum = vaddq_u64( s3, vld1q_u64( &K[6] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab ); + intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) ); + ab = vsha512h2q_u64( intermed, ef, cd ); + ef = vaddq_u64( ef, intermed ); + + /* Rounds 8 and 9 */ + initial_sum = vaddq_u64( s4, vld1q_u64( &K[8] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh ); + intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) ); + gh = vsha512h2q_u64( intermed, cd, ab ); + cd = vaddq_u64( cd, intermed ); + + /* Rounds 10 and 11 */ + initial_sum = vaddq_u64( s5, vld1q_u64( &K[10] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef ); + intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) ); + ef = vsha512h2q_u64( intermed, ab, gh ); + ab = vaddq_u64( ab, intermed ); + + /* Rounds 12 and 13 */ + initial_sum = vaddq_u64( s6, vld1q_u64( &K[12] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd ); + intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) ); + cd = vsha512h2q_u64( intermed, gh, ef ); + gh = vaddq_u64( gh, intermed ); + + /* Rounds 14 and 15 */ + initial_sum = vaddq_u64( s7, vld1q_u64( &K[14] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab ); + intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) ); + ab = vsha512h2q_u64( intermed, ef, cd ); + ef = vaddq_u64( ef, intermed ); + + for ( unsigned int t = 16; t < 80; t += 16 ) + { + /* Rounds t and t + 1 */ + s0 = vsha512su1q_u64( vsha512su0q_u64( s0, s1 ), s7, vextq_u64( s4, s5, 1 ) ); + initial_sum = vaddq_u64( s0, vld1q_u64( &K[t] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh ); + intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) ); + gh = vsha512h2q_u64( intermed, cd, ab ); + cd = vaddq_u64( cd, intermed ); + + /* Rounds t + 2 and t + 3 */ + s1 = vsha512su1q_u64( vsha512su0q_u64( s1, s2 ), s0, vextq_u64( s5, s6, 1 ) ); + initial_sum = vaddq_u64( s1, vld1q_u64( &K[t + 2] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef ); + intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) ); + ef = vsha512h2q_u64( intermed, ab, gh ); + ab = vaddq_u64( ab, intermed ); + + /* Rounds t + 4 and t + 5 */ + s2 = vsha512su1q_u64( vsha512su0q_u64( s2, s3 ), s1, vextq_u64( s6, s7, 1 ) ); + initial_sum = vaddq_u64( s2, vld1q_u64( &K[t + 4] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd ); + intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) ); + cd = vsha512h2q_u64( intermed, gh, ef ); + gh = vaddq_u64( gh, intermed ); + + /* Rounds t + 6 and t + 7 */ + s3 = vsha512su1q_u64( vsha512su0q_u64( s3, s4 ), s2, vextq_u64( s7, s0, 1 ) ); + initial_sum = vaddq_u64( s3, vld1q_u64( &K[t + 6] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab ); + intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) ); + ab = vsha512h2q_u64( intermed, ef, cd ); + ef = vaddq_u64( ef, intermed ); + + /* Rounds t + 8 and t + 9 */ + s4 = vsha512su1q_u64( vsha512su0q_u64( s4, s5 ), s3, vextq_u64( s0, s1, 1 ) ); + initial_sum = vaddq_u64( s4, vld1q_u64( &K[t + 8] ) ); + sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh ); + intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) ); + gh = vsha512h2q_u64( intermed, cd, ab ); + cd = vaddq_u64( cd, intermed ); + + /* Rounds 
t + 10 and t + 11 */
+            s5 = vsha512su1q_u64( vsha512su0q_u64( s5, s6 ), s4, vextq_u64( s1, s2, 1 ) );
+            initial_sum = vaddq_u64( s5, vld1q_u64( &K[t + 10] ) );
+            sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef );
+            intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) );
+            ef = vsha512h2q_u64( intermed, ab, gh );
+            ab = vaddq_u64( ab, intermed );
+
+            /* Rounds t + 12 and t + 13 */
+            s6 = vsha512su1q_u64( vsha512su0q_u64( s6, s7 ), s5, vextq_u64( s2, s3, 1 ) );
+            initial_sum = vaddq_u64( s6, vld1q_u64( &K[t + 12] ) );
+            sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd );
+            intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) );
+            cd = vsha512h2q_u64( intermed, gh, ef );
+            gh = vaddq_u64( gh, intermed );
+
+            /* Rounds t + 14 and t + 15 */
+            s7 = vsha512su1q_u64( vsha512su0q_u64( s7, s0 ), s6, vextq_u64( s3, s4, 1 ) );
+            initial_sum = vaddq_u64( s7, vld1q_u64( &K[t + 14] ) );
+            sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab );
+            intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) );
+            ab = vsha512h2q_u64( intermed, ef, cd );
+            ef = vaddq_u64( ef, intermed );
+        }
+
+        ab = vaddq_u64( ab, ab_orig );
+        cd = vaddq_u64( cd, cd_orig );
+        ef = vaddq_u64( ef, ef_orig );
+        gh = vaddq_u64( gh, gh_orig );
+    }
+
+    vst1q_u64( &ctx->state[0], ab );
+    vst1q_u64( &ctx->state[2], cd );
+    vst1q_u64( &ctx->state[4], ef );
+    vst1q_u64( &ctx->state[6], gh );
+
+    return( processed );
+}
+
+int mbedtls_internal_sha512_process_a64_crypto( mbedtls_sha512_context *ctx,
+        const unsigned char data[SHA512_BLOCK_SIZE] )
+{
+    return( ( mbedtls_internal_sha512_process_many_a64_crypto( ctx, data,
+                  SHA512_BLOCK_SIZE ) == SHA512_BLOCK_SIZE ) ? 0 : -1 );
+}
+
+#endif /* MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT || MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY */
+
+
+#if !defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+#define mbedtls_internal_sha512_process_many_c mbedtls_internal_sha512_process_many
+#define mbedtls_internal_sha512_process_c      mbedtls_internal_sha512_process
+#endif
+
+
+#if !defined(MBEDTLS_SHA512_PROCESS_ALT) && !defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+
+int mbedtls_internal_sha512_process_c( mbedtls_sha512_context *ctx,
+                                       const unsigned char data[SHA512_BLOCK_SIZE] )
 {
     int i;
     struct
@@ -291,7 +647,68 @@ int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
     return( 0 );
 }
 
-#endif /* !MBEDTLS_SHA512_PROCESS_ALT */
+#endif /* !MBEDTLS_SHA512_PROCESS_ALT && !MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY */
+
+
+#if !defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+
+static size_t mbedtls_internal_sha512_process_many_c(
+        mbedtls_sha512_context *ctx, const uint8_t *data, size_t len )
+{
+    size_t processed = 0;
+
+    while( len >= SHA512_BLOCK_SIZE )
+    {
+        if( mbedtls_internal_sha512_process_c( ctx, data ) != 0 )
+            return( 0 );
+
+        data += SHA512_BLOCK_SIZE;
+        len  -= SHA512_BLOCK_SIZE;
+
+        processed += SHA512_BLOCK_SIZE;
+    }
+
+    return( processed );
+}
+
+#endif /* !MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY */
+
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+
+int mbedtls_a64_crypto_sha512_has_support( void )
+{
+    static int done = 0;
+    static int supported = 0;
+
+    if( !done )
+    {
+        supported = mbedtls_a64_crypto_sha512_determine_support();
+        done = 1;
+    }
+
+    return( supported );
+}
+
+static size_t mbedtls_internal_sha512_process_many( mbedtls_sha512_context *ctx,
+        const uint8_t *msg, size_t len )
+{
+    if( mbedtls_a64_crypto_sha512_has_support() )
+        return( mbedtls_internal_sha512_process_many_a64_crypto( 
ctx, msg, len ) ); + else + return( mbedtls_internal_sha512_process_many_c( ctx, msg, len ) ); +} + +int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx, + const unsigned char data[SHA512_BLOCK_SIZE] ) +{ + if( mbedtls_a64_crypto_sha512_has_support() ) + return( mbedtls_internal_sha512_process_a64_crypto( ctx, data ) ); + else + return( mbedtls_internal_sha512_process_c( ctx, data ) ); +} + +#endif /* MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT */ /* * SHA-512 process buffer @@ -311,7 +728,7 @@ int mbedtls_sha512_update( mbedtls_sha512_context *ctx, return( 0 ); left = (unsigned int) (ctx->total[0] & 0x7F); - fill = 128 - left; + fill = SHA512_BLOCK_SIZE - left; ctx->total[0] += (uint64_t) ilen; @@ -330,13 +747,15 @@ int mbedtls_sha512_update( mbedtls_sha512_context *ctx, left = 0; } - while( ilen >= 128 ) + while( ilen >= SHA512_BLOCK_SIZE ) { - if( ( ret = mbedtls_internal_sha512_process( ctx, input ) ) != 0 ) - return( ret ); + size_t processed = + mbedtls_internal_sha512_process_many( ctx, input, ilen ); + if( processed < SHA512_BLOCK_SIZE ) + return( MBEDTLS_ERR_ERROR_GENERIC_ERROR ); - input += 128; - ilen -= 128; + input += processed; + ilen -= processed; } if( ilen > 0 ) @@ -373,7 +792,7 @@ int mbedtls_sha512_finish( mbedtls_sha512_context *ctx, else { /* We'll need an extra block */ - memset( ctx->buffer + used, 0, 128 - used ); + memset( ctx->buffer + used, 0, SHA512_BLOCK_SIZE - used ); if( ( ret = mbedtls_internal_sha512_process( ctx, ctx->buffer ) ) != 0 ) return( ret ); diff --git a/scripts/config.py b/scripts/config.py index 0ab1e394f..7395656fb 100755 --- a/scripts/config.py +++ b/scripts/config.py @@ -198,7 +198,8 @@ EXCLUDE_FROM_FULL = frozenset([ 'MBEDTLS_PSA_CRYPTO_SPM', # platform dependency (PSA SPM) 'MBEDTLS_PSA_INJECT_ENTROPY', # build dependency (hook functions) 'MBEDTLS_RSA_NO_CRT', # influences the use of RSA in X.509 and TLS - 'MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY', # interacts with *_USE_A64_CRYPTO_ONLY + 'MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY', # interacts with *_USE_A64_CRYPTO_IF_PRESENT + 'MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY', # interacts with *_USE_A64_CRYPTO_IF_PRESENT 'MBEDTLS_TEST_CONSTANT_FLOW_MEMSAN', # build dependency (clang+memsan) 'MBEDTLS_TEST_CONSTANT_FLOW_VALGRIND', # build dependency (valgrind headers) 'MBEDTLS_X509_REMOVE_INFO', # removes a feature diff --git a/tests/scripts/all.sh b/tests/scripts/all.sh index 69b1fc83e..cc11dcf8f 100755 --- a/tests/scripts/all.sh +++ b/tests/scripts/all.sh @@ -1558,6 +1558,9 @@ component_build_module_alt () { # MBEDTLS_SHA256_*ALT can't be used with MBEDTLS_SHA256_USE_A64_CRYPTO_* scripts/config.py unset MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT scripts/config.py unset MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY + # MBEDTLS_SHA512_*ALT can't be used with MBEDTLS_SHA512_USE_A64_CRYPTO_* + scripts/config.py unset MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT + scripts/config.py unset MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY # Enable all MBEDTLS_XXX_ALT for whole modules. Do not enable # MBEDTLS_XXX_YYY_ALT which are for single functions. 
 scripts/config.py set-all 'MBEDTLS_([A-Z0-9]*|NIST_KW)_ALT'
@@ -2742,6 +2745,9 @@ component_build_arm_none_eabi_gcc_no_64bit_multiplication () {
 component_build_armcc () {
     msg "build: ARM Compiler 5"
     scripts/config.py baremetal
+    # armc[56] don't support SHA-512 intrinsics
+    scripts/config.py unset MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+
     make CC="$ARMC5_CC" AR="$ARMC5_AR" WARNING_CFLAGS='--strict --c99' lib
 
     msg "size: ARM Compiler 5"
@@ -2761,7 +2767,7 @@ component_build_armcc () {
     # ARM Compiler 6 - Target ARMv8-M
     armc6_build_test "--target=arm-arm-none-eabi -march=armv8-m.main"
 
-    # ARM Compiler 6 - Target ARMv8-A - AArch64
+    # ARM Compiler 6 - Target ARMv8.2-A - AArch64
     armc6_build_test "--target=aarch64-arm-none-eabi -march=armv8.2-a+crypto"
 }
 

From b7f5b97650e29d8df172fed10d2ccb9630d82002 Mon Sep 17 00:00:00 2001
From: Tom Cosgrove
Date: Tue, 15 Mar 2022 11:26:55 +0000
Subject: [PATCH 2/3] Minor changes to sha256.c to bring it in line with
 sha512.c

Signed-off-by: Tom Cosgrove
---
 library/sha256.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/library/sha256.c b/library/sha256.c
index ff6280270..bdc396a18 100644
--- a/library/sha256.c
+++ b/library/sha256.c
@@ -49,8 +49,15 @@
       defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
 #    include <arm_neon.h>
 #  endif
-#  if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) && defined(__linux__)
-#    include <sys/auxv.h>
+#  if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT)
+#    if defined(__unix__)
+#      if defined(__linux__)
+/* Our preferred method of detection is getauxval() */
+#        include <sys/auxv.h>
+#      endif
+/* Use SIGILL on Unix, and fall back to it on Linux */
+#      include <signal.h>
+#    endif
 #  endif
 #elif defined(_M_ARM64)
 #  if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) || \
@@ -272,10 +279,10 @@ static size_t mbedtls_internal_sha256_process_many_a64_crypto(
         uint32x4_t abcd_orig = abcd;
         uint32x4_t efgh_orig = efgh;
 
-        uint32x4_t sched0 = vld1q_u32( (const uint32_t *)( msg + 16 * 0 ) );
-        uint32x4_t sched1 = vld1q_u32( (const uint32_t *)( msg + 16 * 1 ) );
-        uint32x4_t sched2 = vld1q_u32( (const uint32_t *)( msg + 16 * 2 ) );
-        uint32x4_t sched3 = vld1q_u32( (const uint32_t *)( msg + 16 * 3 ) );
+        uint32x4_t sched0 = (uint32x4_t) vld1q_u8( msg + 16 * 0 );
+        uint32x4_t sched1 = (uint32x4_t) vld1q_u8( msg + 16 * 1 );
+        uint32x4_t sched2 = (uint32x4_t) vld1q_u8( msg + 16 * 2 );
+        uint32x4_t sched3 = (uint32x4_t) vld1q_u8( msg + 16 * 3 );
 
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__   /* Will be true if not defined */
     /* Untested on BE */

From 226aca195f6e3835070eb19b7239d9fa79a2110d Mon Sep 17 00:00:00 2001
From: Tom Cosgrove
Date: Wed, 16 Mar 2022 14:11:07 +0000
Subject: [PATCH 3/3] Fix running of all.sh on macOS

Was getting 'dd: unknown operand status'

Signed-off-by: Tom Cosgrove
---
 tests/scripts/all.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/scripts/all.sh b/tests/scripts/all.sh
index cc11dcf8f..f29312722 100755
--- a/tests/scripts/all.sh
+++ b/tests/scripts/all.sh
@@ -3111,7 +3111,7 @@ run_component () {
     local dd_cmd
     dd_cmd=(dd if=/dev/urandom of=./tests/seedfile bs=64 count=1)
     case $OSTYPE in
-        linux*|freebsd*|openbsd*|darwin*) dd_cmd+=(status=none)
+        linux*|freebsd*|openbsd*) dd_cmd+=(status=none)
     esac
     "${dd_cmd[@]}"
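One quick sanity check for these patches is to hash the FIPS 180-4 "abc" test vector through the public API in each configuration: the C-only, IF_PRESENT and ONLY builds must all produce the same digest. A minimal sketch, assuming an mbed TLS 3.x build linked against libmbedcrypto:

    /* Known-answer check: SHA-512("abc") from FIPS 180-4. Build once with
     * MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT (or _ONLY) and once without;
     * every build must print "ok". */
    #include <stdio.h>
    #include <string.h>
    #include "mbedtls/sha512.h"

    static const unsigned char expected[64] = {
        0xdd, 0xaf, 0x35, 0xa1, 0x93, 0x61, 0x7a, 0xba,
        0xcc, 0x41, 0x73, 0x49, 0xae, 0x20, 0x41, 0x31,
        0x12, 0xe6, 0xfa, 0x4e, 0x89, 0xa9, 0x7e, 0xa2,
        0x0a, 0x9e, 0xee, 0xe6, 0x4b, 0x55, 0xd3, 0x9a,
        0x21, 0x92, 0x99, 0x2a, 0x27, 0x4f, 0xc1, 0xa8,
        0x36, 0xba, 0x3c, 0x23, 0xa3, 0xfe, 0xeb, 0xbd,
        0x45, 0x4d, 0x44, 0x23, 0x64, 0x3c, 0xe8, 0x0e,
        0x2a, 0x9a, 0xc9, 0x4f, 0xa5, 0x4c, 0xa4, 0x9f
    };

    int main( void )
    {
        unsigned char digest[64];

        /* Final argument 0 selects SHA-512 (1 would select SHA-384) */
        if( mbedtls_sha512( (const unsigned char *) "abc", 3, digest, 0 ) != 0 )
            return( 1 );

        puts( memcmp( digest, expected, sizeof( expected ) ) == 0 ? "ok" : "FAIL" );
        return( 0 );
    }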