bn_mul.h: add ARM DSP optimized MULADDC code
The Cortex M4, M7 MCUs and the Cortex A CPUs support the ARM DSP instructions, and especially the umaal instruction which greatly speed up MULADDC code. In addition the patch switched the ASM constraints to registers instead of memory, giving the opportunity for the compiler to load them the best way. The speed improvement is variable depending on the crypto operation and the CPU. Here are the results on a Cortex M4, a Cortex M7 and a Cortex A8. All tests have been done with GCC 6.3 using -O2. RSA uses a RSA-4096 key. ECDSA uses a secp256r1 curve EC key pair. +--------+--------+--------+ | M4 | M7 | A8 | +----------------+--------+--------+--------+ | ECDSA signing | +6.3% | +7.9% | +4.1% | +----------------+--------+--------+--------+ | RSA signing | +43.7% | +68.3% | +26.3% | +----------------+--------+--------+--------+ | RSA encryption | +3.4% | +9.7% | +3.6% | +----------------+--------+--------+--------+ | RSA decryption | +43.0% | +67.8% | +22.8% | +----------------+--------+--------+--------+ I ran the whole testsuite on the Cortex A8 Linux environment, and it all passes.
This commit is contained in:
parent
c4bd8ec5ed
commit
16b1bd8932
1 changed files with 17 additions and 0 deletions
|
@ -630,6 +630,23 @@
|
|||
"r6", "r7", "r8", "r9", "cc" \
|
||||
);
|
||||
|
||||
#elif defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)
|
||||
|
||||
#define MULADDC_INIT \
|
||||
asm(
|
||||
|
||||
#define MULADDC_CORE \
|
||||
"ldr r0, [%0], #4 \n\t" \
|
||||
"ldr r1, [%1] \n\t" \
|
||||
"umaal r1, %2, %3, r0 \n\t" \
|
||||
"str r1, [%1], #4 \n\t"
|
||||
|
||||
#define MULADDC_STOP \
|
||||
: "=r" (s), "=r" (d), "=r" (c) \
|
||||
: "r" (b), "0" (s), "1" (d), "2" (c) \
|
||||
: "r0", "r1", "memory" \
|
||||
);
|
||||
|
||||
#else
|
||||
|
||||
#define MULADDC_INIT \
|
||||
|
|
Loading…
Reference in a new issue