Merge pull request #8716 from mschulz-at-hilscher/feature/gcm_largetable

Use large GCM tables
This commit is contained in:
Tom Cosgrove 2024-02-23 16:25:38 +00:00 committed by GitHub
commit 817772a6ca
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 256 additions and 111 deletions

View file

@ -0,0 +1,4 @@
Features
* Add support for 8-bit GCM tables for Shoup's algorithm to speed up GCM
operations when hardware-accelerated AES is not present. Improves
performance by around 30% on 64-bit Intel; 125% on Armv7-M.

View file

@ -46,6 +46,12 @@ extern "C" {
#if !defined(MBEDTLS_GCM_ALT)
/* Number of entries in the precomputed table held in the GCM context
 * (first dimension of its H member): 256 when MBEDTLS_GCM_LARGE_TABLE is
 * defined, 16 otherwise. Each entry is two 64-bit words. */
#if defined(MBEDTLS_GCM_LARGE_TABLE)
#define MBEDTLS_GCM_HTABLE_SIZE 256
#else
#define MBEDTLS_GCM_HTABLE_SIZE 16
#endif
/**
* \brief The GCM context structure.
*/
@ -53,18 +59,18 @@ typedef struct mbedtls_gcm_context {
#if defined(MBEDTLS_BLOCK_CIPHER_C)
mbedtls_block_cipher_context_t MBEDTLS_PRIVATE(block_cipher_ctx); /*!< The cipher context used. */
#else
mbedtls_cipher_context_t MBEDTLS_PRIVATE(cipher_ctx); /*!< The cipher context used. */
mbedtls_cipher_context_t MBEDTLS_PRIVATE(cipher_ctx); /*!< The cipher context used. */
#endif
uint64_t MBEDTLS_PRIVATE(HL)[16]; /*!< Precalculated HTable low. */
uint64_t MBEDTLS_PRIVATE(HH)[16]; /*!< Precalculated HTable high. */
uint64_t MBEDTLS_PRIVATE(len); /*!< The total length of the encrypted data. */
uint64_t MBEDTLS_PRIVATE(add_len); /*!< The total length of the additional data. */
unsigned char MBEDTLS_PRIVATE(base_ectr)[16]; /*!< The first ECTR for tag. */
unsigned char MBEDTLS_PRIVATE(y)[16]; /*!< The Y working value. */
unsigned char MBEDTLS_PRIVATE(buf)[16]; /*!< The buf working value. */
int MBEDTLS_PRIVATE(mode); /*!< The operation to perform:
#MBEDTLS_GCM_ENCRYPT or
#MBEDTLS_GCM_DECRYPT. */
uint64_t MBEDTLS_PRIVATE(H)[MBEDTLS_GCM_HTABLE_SIZE][2]; /*!< Precalculated HTable. */
uint64_t MBEDTLS_PRIVATE(len); /*!< The total length of the encrypted data. */
uint64_t MBEDTLS_PRIVATE(add_len); /*!< The total length of the additional data. */
unsigned char MBEDTLS_PRIVATE(base_ectr)[16]; /*!< The first ECTR for tag. */
unsigned char MBEDTLS_PRIVATE(y)[16]; /*!< The Y working value. */
unsigned char MBEDTLS_PRIVATE(buf)[16]; /*!< The buf working value. */
unsigned char MBEDTLS_PRIVATE(mode); /*!< The operation to perform:
#MBEDTLS_GCM_ENCRYPT or
#MBEDTLS_GCM_DECRYPT. */
unsigned char MBEDTLS_PRIVATE(acceleration); /*!< The acceleration to use. */
}
mbedtls_gcm_context;

View file

@ -2800,6 +2800,22 @@
*/
#define MBEDTLS_GCM_C
/**
* \def MBEDTLS_GCM_LARGE_TABLE
*
* Enable large pre-computed tables for Galois/Counter Mode (GCM).
* Can significantly increase throughput on systems without GCM hardware
* acceleration (e.g., AESNI, AESCE).
*
* The mbedtls_gcm_context size will increase by 3840 bytes.
* The code size will increase by roughly 344 bytes.
*
* Module: library/gcm.c
*
* Requires: MBEDTLS_GCM_C
*/
//#define MBEDTLS_GCM_LARGE_TABLE
/**
* \def MBEDTLS_HKDF_C
*

View file

@ -41,6 +41,12 @@
#if !defined(MBEDTLS_GCM_ALT)
/* Used to select the acceleration mechanism.
 * gcm_set_acceleration() stores exactly one of these values in the
 * context's acceleration field; gcm_gen_table() and gcm_mult() switch on
 * that field to pick the matching code path. */
#define MBEDTLS_GCM_ACC_SMALLTABLE 0
#define MBEDTLS_GCM_ACC_LARGETABLE 1
#define MBEDTLS_GCM_ACC_AESNI 2
#define MBEDTLS_GCM_ACC_AESCE 3
/*
* Initialize a context
*/
@ -49,6 +55,39 @@ void mbedtls_gcm_init(mbedtls_gcm_context *ctx)
memset(ctx, 0, sizeof(mbedtls_gcm_context));
}
/*
 * Choose the multiplication backend for this context and record it in
 * ctx->acceleration.
 *
 * The table-based default is fixed at compile time:
 * MBEDTLS_GCM_ACC_LARGETABLE when MBEDTLS_GCM_LARGE_TABLE is defined,
 * MBEDTLS_GCM_ACC_SMALLTABLE otherwise. Where the corresponding code is
 * compiled in, a run-time capability check may then override it with
 * MBEDTLS_GCM_ACC_AESNI or MBEDTLS_GCM_ACC_AESCE. The AESCE check is
 * evaluated last, so it wins if both checks succeed.
 */
static inline void gcm_set_acceleration(mbedtls_gcm_context *ctx)
{
#if defined(MBEDTLS_GCM_LARGE_TABLE)
    unsigned char selected = MBEDTLS_GCM_ACC_LARGETABLE;
#else
    unsigned char selected = MBEDTLS_GCM_ACC_SMALLTABLE;
#endif

#if defined(MBEDTLS_AESNI_HAVE_CODE)
    /* With CLMUL support, we need only h, not the rest of the table */
    if (mbedtls_aesni_has_support(MBEDTLS_AESNI_CLMUL)) {
        selected = MBEDTLS_GCM_ACC_AESNI;
    }
#endif

#if defined(MBEDTLS_AESCE_HAVE_CODE)
    if (MBEDTLS_AESCE_HAS_SUPPORT()) {
        selected = MBEDTLS_GCM_ACC_AESCE;
    }
#endif

    /* Single store of the final choice. */
    ctx->acceleration = selected;
}
/*
 * Derive one table entry from another: dst = src shifted right by one bit,
 * with a conditional XOR of 0xE1 into the first byte.
 *
 * Both operands are treated as 16 bytes holding a 128-bit big-endian value
 * (word [0] is the high half, word [1] the low half):
 *  - each 64-bit half is loaded big-endian, shifted right by one bit and
 *    stored back big-endian;
 *  - the bit shifted out of the high half (bit 0 of byte 7) is carried into
 *    the top bit of byte 8;
 *  - if the bit shifted out of the low half (bit 0 of byte 15) is set, 0xE1
 *    is XORed into byte 0 (presumably the GCM reduction constant).
 *
 * Byte 15 of src is read after dst has been written, so dst and src must
 * not overlap.
 */
static inline void gcm_gen_table_rightshift(uint64_t dst[2], const uint64_t src[2])
{
    uint8_t *out = (uint8_t *) dst;
    const uint8_t *in = (const uint8_t *) src;
    uint64_t half;

    /* Low half first, then pull in the carry from the high half. */
    half = MBEDTLS_GET_UINT64_BE(&src[1], 0);
    MBEDTLS_PUT_UINT64_BE(half >> 1, &dst[1], 0);
    if (in[7] & 0x01) {
        out[8] |= 0x80;
    }

    /* High half, then fold the bit that fell off the low end back in. */
    half = MBEDTLS_GET_UINT64_BE(&src[0], 0);
    MBEDTLS_PUT_UINT64_BE(half >> 1, &dst[0], 0);
    if (in[15] & 0x01) {
        out[0] ^= 0xE1;
    }
}
/*
* Precompute small multiples of H, that is set
* HH[i] || HL[i] = H times i,
@ -60,11 +99,8 @@ void mbedtls_gcm_init(mbedtls_gcm_context *ctx)
static int gcm_gen_table(mbedtls_gcm_context *ctx)
{
int ret, i, j;
uint64_t hi, lo;
uint64_t vl, vh;
unsigned char h[16];
memset(h, 0, 16);
uint64_t u64h[2] = { 0 };
uint8_t *h = (uint8_t *) u64h;
#if defined(MBEDTLS_BLOCK_CIPHER_C)
ret = mbedtls_block_cipher_encrypt(&ctx->block_cipher_ctx, h, h);
@ -76,53 +112,48 @@ static int gcm_gen_table(mbedtls_gcm_context *ctx)
return ret;
}
/* pack h as two 64-bits ints, big-endian */
hi = MBEDTLS_GET_UINT32_BE(h, 0);
lo = MBEDTLS_GET_UINT32_BE(h, 4);
vh = (uint64_t) hi << 32 | lo;
gcm_set_acceleration(ctx);
hi = MBEDTLS_GET_UINT32_BE(h, 8);
lo = MBEDTLS_GET_UINT32_BE(h, 12);
vl = (uint64_t) hi << 32 | lo;
/* 8 = 1000 corresponds to 1 in GF(2^128) */
ctx->HL[8] = vl;
ctx->HH[8] = vh;
/* MBEDTLS_GCM_HTABLE_SIZE/2 = 1000 corresponds to 1 in GF(2^128) */
ctx->H[MBEDTLS_GCM_HTABLE_SIZE/2][0] = u64h[0];
ctx->H[MBEDTLS_GCM_HTABLE_SIZE/2][1] = u64h[1];
switch (ctx->acceleration) {
#if defined(MBEDTLS_AESNI_HAVE_CODE)
/* With CLMUL support, we need only h, not the rest of the table */
if (mbedtls_aesni_has_support(MBEDTLS_AESNI_CLMUL)) {
return 0;
}
case MBEDTLS_GCM_ACC_AESNI:
return 0;
#endif
#if defined(MBEDTLS_AESCE_HAVE_CODE)
if (MBEDTLS_AESCE_HAS_SUPPORT()) {
return 0;
}
case MBEDTLS_GCM_ACC_AESCE:
return 0;
#endif
/* 0 corresponds to 0 in GF(2^128) */
ctx->HH[0] = 0;
ctx->HL[0] = 0;
default:
/* 0 corresponds to 0 in GF(2^128) */
ctx->H[0][0] = 0;
ctx->H[0][1] = 0;
for (i = 4; i > 0; i >>= 1) {
uint32_t T = (vl & 1) * 0xe1000000U;
vl = (vh << 63) | (vl >> 1);
vh = (vh >> 1) ^ ((uint64_t) T << 32);
for (i = MBEDTLS_GCM_HTABLE_SIZE/4; i > 0; i >>= 1) {
gcm_gen_table_rightshift(ctx->H[i], ctx->H[i*2]);
}
ctx->HL[i] = vl;
ctx->HH[i] = vh;
}
#if !defined(MBEDTLS_GCM_LARGE_TABLE)
/* pack elements of H as 64-bits ints, big-endian */
for (i = MBEDTLS_GCM_HTABLE_SIZE/2; i > 0; i >>= 1) {
MBEDTLS_PUT_UINT64_BE(ctx->H[i][0], &ctx->H[i][0], 0);
MBEDTLS_PUT_UINT64_BE(ctx->H[i][1], &ctx->H[i][1], 0);
}
#endif
for (i = 2; i <= 8; i *= 2) {
uint64_t *HiL = ctx->HL + i, *HiH = ctx->HH + i;
vh = *HiH;
vl = *HiL;
for (j = 1; j < i; j++) {
HiH[j] = vh ^ ctx->HH[j];
HiL[j] = vl ^ ctx->HL[j];
}
for (i = 2; i < MBEDTLS_GCM_HTABLE_SIZE; i <<= 1) {
for (j = 1; j < i; j++) {
mbedtls_xor_no_simd((unsigned char *) ctx->H[i+j],
(unsigned char *) ctx->H[i],
(unsigned char *) ctx->H[j],
16);
}
}
}
return 0;
@ -181,6 +212,80 @@ int mbedtls_gcm_setkey(mbedtls_gcm_context *ctx,
return 0;
}
#if defined(MBEDTLS_GCM_LARGE_TABLE)
/*
 * Reduction table for the 8-bit (256-entry) table multiplication.
 *
 * gcm_mult_largetable() indexes this table with the accumulator byte that
 * is shifted out on each step and XORs the entry into the first two bytes
 * of the accumulator. Entries are laid out so that the low byte of each
 * value lands in accumulator byte 0 and the high byte in accumulator byte 1
 * (e.g. last8[0x80] = 0x00e1 puts 0xe1 into byte 0).
 *
 * NOTE(review): presumably the 8-bit counterpart of last4
 * (last8[x] = x times P^128) -- confirm against the table generator.
 */
static const uint16_t last8[256] = {
0x0000, 0xc201, 0x8403, 0x4602, 0x0807, 0xca06, 0x8c04, 0x4e05,
0x100e, 0xd20f, 0x940d, 0x560c, 0x1809, 0xda08, 0x9c0a, 0x5e0b,
0x201c, 0xe21d, 0xa41f, 0x661e, 0x281b, 0xea1a, 0xac18, 0x6e19,
0x3012, 0xf213, 0xb411, 0x7610, 0x3815, 0xfa14, 0xbc16, 0x7e17,
0x4038, 0x8239, 0xc43b, 0x063a, 0x483f, 0x8a3e, 0xcc3c, 0x0e3d,
0x5036, 0x9237, 0xd435, 0x1634, 0x5831, 0x9a30, 0xdc32, 0x1e33,
0x6024, 0xa225, 0xe427, 0x2626, 0x6823, 0xaa22, 0xec20, 0x2e21,
0x702a, 0xb22b, 0xf429, 0x3628, 0x782d, 0xba2c, 0xfc2e, 0x3e2f,
0x8070, 0x4271, 0x0473, 0xc672, 0x8877, 0x4a76, 0x0c74, 0xce75,
0x907e, 0x527f, 0x147d, 0xd67c, 0x9879, 0x5a78, 0x1c7a, 0xde7b,
0xa06c, 0x626d, 0x246f, 0xe66e, 0xa86b, 0x6a6a, 0x2c68, 0xee69,
0xb062, 0x7263, 0x3461, 0xf660, 0xb865, 0x7a64, 0x3c66, 0xfe67,
0xc048, 0x0249, 0x444b, 0x864a, 0xc84f, 0x0a4e, 0x4c4c, 0x8e4d,
0xd046, 0x1247, 0x5445, 0x9644, 0xd841, 0x1a40, 0x5c42, 0x9e43,
0xe054, 0x2255, 0x6457, 0xa656, 0xe853, 0x2a52, 0x6c50, 0xae51,
0xf05a, 0x325b, 0x7459, 0xb658, 0xf85d, 0x3a5c, 0x7c5e, 0xbe5f,
0x00e1, 0xc2e0, 0x84e2, 0x46e3, 0x08e6, 0xcae7, 0x8ce5, 0x4ee4,
0x10ef, 0xd2ee, 0x94ec, 0x56ed, 0x18e8, 0xdae9, 0x9ceb, 0x5eea,
0x20fd, 0xe2fc, 0xa4fe, 0x66ff, 0x28fa, 0xeafb, 0xacf9, 0x6ef8,
0x30f3, 0xf2f2, 0xb4f0, 0x76f1, 0x38f4, 0xfaf5, 0xbcf7, 0x7ef6,
0x40d9, 0x82d8, 0xc4da, 0x06db, 0x48de, 0x8adf, 0xccdd, 0x0edc,
0x50d7, 0x92d6, 0xd4d4, 0x16d5, 0x58d0, 0x9ad1, 0xdcd3, 0x1ed2,
0x60c5, 0xa2c4, 0xe4c6, 0x26c7, 0x68c2, 0xaac3, 0xecc1, 0x2ec0,
0x70cb, 0xb2ca, 0xf4c8, 0x36c9, 0x78cc, 0xbacd, 0xfccf, 0x3ece,
0x8091, 0x4290, 0x0492, 0xc693, 0x8896, 0x4a97, 0x0c95, 0xce94,
0x909f, 0x529e, 0x149c, 0xd69d, 0x9898, 0x5a99, 0x1c9b, 0xde9a,
0xa08d, 0x628c, 0x248e, 0xe68f, 0xa88a, 0x6a8b, 0x2c89, 0xee88,
0xb083, 0x7282, 0x3480, 0xf681, 0xb884, 0x7a85, 0x3c87, 0xfe86,
0xc0a9, 0x02a8, 0x44aa, 0x86ab, 0xc8ae, 0x0aaf, 0x4cad, 0x8eac,
0xd0a7, 0x12a6, 0x54a4, 0x96a5, 0xd8a0, 0x1aa1, 0x5ca3, 0x9ea2,
0xe0b5, 0x22b4, 0x64b6, 0xa6b7, 0xe8b2, 0x2ab3, 0x6cb1, 0xaeb0,
0xf0bb, 0x32ba, 0x74b8, 0xb6b9, 0xf8bc, 0x3abd, 0x7cbf, 0xbebe
};
/*
 * Sets output to x times H using the 256-entry (8-bit) table.
 *
 * output: 16-byte result buffer.
 * x:      16-byte input block; each byte indexes H directly.
 * H:      precomputed table, used here as raw 16-byte entries.
 *
 * The accumulator is manipulated as 16 bytes in memory. The two branches
 * below perform the same byte-level steps; they differ only in the shift
 * direction and table read needed to get that effect on a big-endian or
 * little-endian host.
 */
static void gcm_mult_largetable(uint8_t *output, const uint8_t *x, uint64_t H[256][2])
{
int i;
uint64_t u64z[2];
/* u16z and u8z are alternative views of the same 16-byte accumulator. */
uint16_t *u16z = (uint16_t *) u64z;
uint8_t *u8z = (uint8_t *) u64z;
uint8_t rem;
u64z[0] = 0;
u64z[1] = 0;
if (MBEDTLS_IS_BIG_ENDIAN) {
/* Process x[15] down to x[1]; x[0] is folded in after the loop. */
for (i = 15; i > 0; i--) {
mbedtls_xor_no_simd(u8z, u8z, (uint8_t *) H[x[i]], 16);
/* Remember the byte about to be shifted out; it selects the reduction. */
rem = u8z[15];
/* Move bytes 8..14 up one position in memory; byte 8 becomes 0. */
u64z[1] >>= 8;
/* Carry byte 7 across the word boundary before word 0 is shifted. */
u8z[8] = u8z[7];
/* Move bytes 0..6 up one position in memory; byte 0 becomes 0. */
u64z[0] >>= 8;
/* Presumably a little-endian read of the entry, so that bytes 0 and 1
 * receive the same values as in the little-endian branch. */
u16z[0] ^= MBEDTLS_GET_UINT16_LE(&last8[rem], 0);
}
} else {
/* Same steps as above; on a little-endian host a left shift moves
 * bytes towards higher addresses. */
for (i = 15; i > 0; i--) {
mbedtls_xor_no_simd(u8z, u8z, (uint8_t *) H[x[i]], 16);
rem = u8z[15];
u64z[1] <<= 8;
u8z[8] = u8z[7];
u64z[0] <<= 8;
u16z[0] ^= last8[rem];
}
}
/* Final byte: XOR in its table entry without a further shift and write
 * the result straight to output. */
mbedtls_xor_no_simd(output, u8z, (uint8_t *) H[x[0]], 16);
}
#else
/*
* Shoup's method for multiplication use this table with
* last4[x] = x times P^128
@ -194,6 +299,47 @@ static const uint16_t last4[16] =
0x9180, 0x8da0, 0xa9c0, 0xb5e0
};
/*
 * Shift the 128-bit accumulator z (z[0] = high word, z[1] = low word) right
 * by four bits and XOR in the last4 entry selected by the four bits that
 * were shifted out.
 */
static inline void gcm_smalltable_shift4(uint64_t z[2])
{
    const unsigned char dropped = (unsigned char) z[1] & 0xf;

    z[1] = (z[0] << 60) | (z[1] >> 4);
    z[0] >>= 4;
    z[0] ^= (uint64_t) last4[dropped] << 48;
}

/*
 * Sets output to x times H using the 16-entry (4-bit) table.
 *
 * output: 16-byte result buffer; receives the accumulator as two
 *         big-endian 64-bit words.
 * x:      16-byte input block, consumed one nibble at a time from the low
 *         nibble of x[15] to the high nibble of x[0].
 * H:      precomputed table; entries are combined with 64-bit integer
 *         shifts here, so they are used as host-order words.
 */
static void gcm_mult_smalltable(uint8_t *output, const uint8_t *x, uint64_t H[16][2])
{
    uint64_t acc[2];
    uint8_t *acc_bytes = (uint8_t *) acc;
    int idx;

    /* Last byte: the accumulator starts as the entry for its low nibble,
     * so no XOR is needed before the first shift. */
    acc[0] = H[x[15] & 0xf][0];
    acc[1] = H[x[15] & 0xf][1];
    gcm_smalltable_shift4(acc);
    mbedtls_xor_no_simd(acc_bytes, acc_bytes, (uint8_t *) H[(x[15] >> 4) & 0xf], 16);

    /* Remaining bytes: shift, add low-nibble entry, shift, add high-nibble
     * entry. */
    for (idx = 14; idx >= 0; idx--) {
        const unsigned char low_nibble = x[idx] & 0xf;
        const unsigned char high_nibble = (x[idx] >> 4) & 0xf;

        gcm_smalltable_shift4(acc);
        mbedtls_xor_no_simd(acc_bytes, acc_bytes, (uint8_t *) H[low_nibble], 16);

        gcm_smalltable_shift4(acc);
        mbedtls_xor_no_simd(acc_bytes, acc_bytes, (uint8_t *) H[high_nibble], 16);
    }

    MBEDTLS_PUT_UINT64_BE(acc[0], output, 0);
    MBEDTLS_PUT_UINT64_BE(acc[1], output, 8);
}
#endif
/*
* Sets output to x times H using the precomputed tables.
* x and output are seen as elements of GF(2^128) as in [MGV].
@ -201,71 +347,31 @@ static const uint16_t last4[16] =
static void gcm_mult(mbedtls_gcm_context *ctx, const unsigned char x[16],
unsigned char output[16])
{
int i = 0;
unsigned char lo, hi, rem;
uint64_t zh, zl;
switch (ctx->acceleration) {
#if defined(MBEDTLS_AESNI_HAVE_CODE)
if (mbedtls_aesni_has_support(MBEDTLS_AESNI_CLMUL)) {
unsigned char h[16];
/* mbedtls_aesni_gcm_mult needs big-endian input */
MBEDTLS_PUT_UINT32_BE(ctx->HH[8] >> 32, h, 0);
MBEDTLS_PUT_UINT32_BE(ctx->HH[8], h, 4);
MBEDTLS_PUT_UINT32_BE(ctx->HL[8] >> 32, h, 8);
MBEDTLS_PUT_UINT32_BE(ctx->HL[8], h, 12);
mbedtls_aesni_gcm_mult(output, x, h);
return;
}
#endif /* MBEDTLS_AESNI_HAVE_CODE */
#if defined(MBEDTLS_AESCE_HAVE_CODE)
if (MBEDTLS_AESCE_HAS_SUPPORT()) {
unsigned char h[16];
/* mbedtls_aesce_gcm_mult needs big-endian input */
MBEDTLS_PUT_UINT32_BE(ctx->HH[8] >> 32, h, 0);
MBEDTLS_PUT_UINT32_BE(ctx->HH[8], h, 4);
MBEDTLS_PUT_UINT32_BE(ctx->HL[8] >> 32, h, 8);
MBEDTLS_PUT_UINT32_BE(ctx->HL[8], h, 12);
mbedtls_aesce_gcm_mult(output, x, h);
return;
}
case MBEDTLS_GCM_ACC_AESNI:
mbedtls_aesni_gcm_mult(output, x, (uint8_t *) ctx->H[MBEDTLS_GCM_HTABLE_SIZE/2]);
break;
#endif
lo = x[15] & 0xf;
#if defined(MBEDTLS_AESCE_HAVE_CODE)
case MBEDTLS_GCM_ACC_AESCE:
mbedtls_aesce_gcm_mult(output, x, (uint8_t *) ctx->H[MBEDTLS_GCM_HTABLE_SIZE/2]);
break;
#endif
zh = ctx->HH[lo];
zl = ctx->HL[lo];
for (i = 15; i >= 0; i--) {
lo = x[i] & 0xf;
hi = (x[i] >> 4) & 0xf;
if (i != 15) {
rem = (unsigned char) zl & 0xf;
zl = (zh << 60) | (zl >> 4);
zh = (zh >> 4);
zh ^= (uint64_t) last4[rem] << 48;
zh ^= ctx->HH[lo];
zl ^= ctx->HL[lo];
}
rem = (unsigned char) zl & 0xf;
zl = (zh << 60) | (zl >> 4);
zh = (zh >> 4);
zh ^= (uint64_t) last4[rem] << 48;
zh ^= ctx->HH[hi];
zl ^= ctx->HL[hi];
#if defined(MBEDTLS_GCM_LARGE_TABLE)
case MBEDTLS_GCM_ACC_LARGETABLE:
gcm_mult_largetable(output, x, ctx->H);
break;
#else
case MBEDTLS_GCM_ACC_SMALLTABLE:
gcm_mult_smalltable(output, x, ctx->H);
break;
#endif
}
MBEDTLS_PUT_UINT32_BE(zh >> 32, output, 0);
MBEDTLS_PUT_UINT32_BE(zh, output, 4);
MBEDTLS_PUT_UINT32_BE(zl >> 32, output, 8);
MBEDTLS_PUT_UINT32_BE(zl, output, 12);
return;
}
int mbedtls_gcm_starts(mbedtls_gcm_context *ctx,

View file

@ -5036,6 +5036,19 @@ component_test_aes_only_128_bit_keys_have_builtins () {
programs/test/selftest
}
# Build the default configuration with MBEDTLS_GCM_LARGE_TABLE enabled and
# the PADLOCK, AESNI and AESCE modules disabled, then run the test suites.
# Disabling the hardware modules presumably ensures the table-based GCM
# path is the one exercised.
component_test_gcm_largetable () {
    msg "build: default config + GCM_LARGE_TABLE - AESNI_C - AESCE_C"
    scripts/config.py set MBEDTLS_GCM_LARGE_TABLE
    scripts/config.py unset MBEDTLS_PADLOCK_C
    scripts/config.py unset MBEDTLS_AESNI_C
    scripts/config.py unset MBEDTLS_AESCE_C
    make CFLAGS='-O2 -Werror -Wall -Wextra'
    # The option is enabled above, so report it with "+" as the build
    # message does.
    msg "test: default config + GCM_LARGE_TABLE - AESNI_C - AESCE_C"
    make test
}
component_test_aes_fewer_tables () {
msg "build: default config with AES_FEWER_TABLES enabled"
scripts/config.py set MBEDTLS_AES_FEWER_TABLES