Pack the iota round constants

This saves ~160 bytes of code size, at the cost of a bit of localized
complexity in the code. The impact on performance is measurable but small
(<5% observed on x86_64) and can go either way (there's a calculation vs
memory bandwidth compromise).

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
This commit is contained in:
Gilles Peskine 2024-02-13 18:14:58 +01:00
parent 8fe2e36de5
commit f8b983c855

View file

@ -26,14 +26,35 @@
#define XOR_BYTE 0x6
static const uint64_t rc[24] = {
0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
/* Precomputed masks for the iota transform.
*
* Each round uses a 64-bit mask value. In each mask value, only
* bits whose position is of the form 2^k-1 (0, 1, 3, 7, 15, 31, 63)
* can be set, thus only 7 of the 64 bits need to be stored per round.
*
* We use a compressed encoding of the mask where bits 63, 31 and 15
* are moved to bits 6, 5 and 4 respectively (bits 0-3 and 7 stay in
* place). This allows us to make each mask value 1 byte rather than
* 8 bytes, saving 7*24 = 168 bytes of data (with perhaps a little
* variation due to alignment). Decompressing this requires a little
* code, but much less than the savings on the table.
*
* The impact on performance depends on the platform and compiler.
* There's a bit more computation, but less memory bandwidth. A quick
* benchmark on x86_64 shows a 7% speed improvement with GCC and a
* 5% speed penalty with Clang, compared to the naive uint64_t[24] table.
* YMMV.
*/
/* Helper macro placing the three high mask bits in otherwise unused
 * positions of the packed byte: bit 63 -> bit 6, bit 31 -> bit 5,
 * bit 15 -> bit 4. Each argument must be 0 or 1; arguments are
 * parenthesized so that any expression can be passed safely. */
#define H(b63, b31, b15) ((b63) << 6 | (b31) << 5 | (b15) << 4)
/* One packed byte per round index. H(...) carries mask bits 63, 31 and
 * 15; the hex literal carries mask bits 7 and 0-3 at their original
 * positions. */
static const uint8_t iota_r_packed[24] = {
    H(0, 0, 0) | 0x01, H(0, 0, 1) | 0x82, H(1, 0, 1) | 0x8a, H(1, 1, 1) | 0x00,
    H(0, 0, 1) | 0x8b, H(0, 1, 0) | 0x01, H(1, 1, 1) | 0x81, H(1, 0, 1) | 0x09,
    H(0, 0, 0) | 0x8a, H(0, 0, 0) | 0x88, H(0, 1, 1) | 0x09, H(0, 1, 0) | 0x0a,
    H(0, 1, 1) | 0x8b, H(1, 0, 0) | 0x8b, H(1, 0, 1) | 0x89, H(1, 0, 1) | 0x03,
    H(1, 0, 1) | 0x02, H(1, 0, 0) | 0x80, H(0, 0, 1) | 0x0a, H(1, 1, 0) | 0x0a,
    H(1, 1, 1) | 0x81, H(1, 0, 1) | 0x80, H(0, 1, 0) | 0x01, H(1, 1, 1) | 0x08,
};
#undef H
static const uint8_t rho[24] = {
1, 62, 28, 27, 36, 44, 6, 55, 20,
@ -132,7 +153,11 @@ static void keccak_f1600(mbedtls_sha3_context *ctx)
s[24] ^= (~lane[0]) & lane[1];
/* Iota */
s[0] ^= rc[round];
/* Decompress the round masks (see definition of iota_r_packed) */
s[0] ^= ((iota_r_packed[round] & 0x40ull) << 57 |
(iota_r_packed[round] & 0x20ull) << 26 |
(iota_r_packed[round] & 0x10ull) << 11 |
(iota_r_packed[round] & 0x8f));
}
}