Pack the iota round constants
This saves ~160 bytes of code size, at the cost of a bit of localized complexity in the code. The impact on performance is measurable but small (<5% observed on x86_64) and can go either way (there's a calculation vs memory bandwidth compromise). Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
This commit is contained in:
parent
8fe2e36de5
commit
f8b983c855
1 changed files with 33 additions and 8 deletions
|
@ -26,14 +26,35 @@
|
|||
|
||||
#define XOR_BYTE 0x6
|
||||
|
||||
static const uint64_t rc[24] = {
|
||||
0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
|
||||
0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
|
||||
0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
|
||||
0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
|
||||
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
|
||||
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
|
||||
/* Precomputed masks for the iota transform.
|
||||
*
|
||||
* Each round uses a 64-bit mask value. In each mask value, only
|
||||
* bits whose position is of the form 2^k-1 can be set, thus only
|
||||
* 7 of 64 bits of the mask need to be known for each mask value.
|
||||
*
|
||||
* We use a compressed encoding of the mask where bits 63, 31 and 15
|
||||
* are moved to bits 6, 5 and 4 respectively. This allows us to make each mask value
|
||||
* 1 byte rather than 8 bytes, saving 7*24 = 168 bytes of data (with
|
||||
* perhaps a little variation due to alignment). Decompressing this
|
||||
* requires a little code, but much less than the savings on the table.
|
||||
*
|
||||
* The impact on performance depends on the platform and compiler.
|
||||
* There's a bit more computation, but less memory bandwidth. A quick
|
||||
* benchmark on x86_64 shows a 7% speed improvement with GCC and a
|
||||
* 5% speed penalty with Clang, compared to the naive uint64_t[24] table.
|
||||
* YMMV.
|
||||
*/
|
||||
/* Helper macro to place the three high mask bits (63, 31 and 15) into the
 * otherwise unused positions 6, 5 and 4 of a packed byte. Every argument is
 * parenthesized so that any expression can safely be passed. */
#define H(b63, b31, b15) (((b63) << 6) | ((b31) << 5) | ((b15) << 4))
/* Packed iota round masks, one byte per round.
 * Bits 0-3 and 7 hold the mask bits at those same positions;
 * bits 6, 5 and 4 hold mask bits 63, 31 and 15 respectively. */
static const uint8_t iota_r_packed[24] = {
    H(0, 0, 0) | 0x01, H(0, 0, 1) | 0x82, H(1, 0, 1) | 0x8a, H(1, 1, 1) | 0x00,
    H(0, 0, 1) | 0x8b, H(0, 1, 0) | 0x01, H(1, 1, 1) | 0x81, H(1, 0, 1) | 0x09,
    H(0, 0, 0) | 0x8a, H(0, 0, 0) | 0x88, H(0, 1, 1) | 0x09, H(0, 1, 0) | 0x0a,
    H(0, 1, 1) | 0x8b, H(1, 0, 0) | 0x8b, H(1, 0, 1) | 0x89, H(1, 0, 1) | 0x03,
    H(1, 0, 1) | 0x02, H(1, 0, 0) | 0x80, H(0, 0, 1) | 0x0a, H(1, 1, 0) | 0x0a,
    H(1, 1, 1) | 0x81, H(1, 0, 1) | 0x80, H(0, 1, 0) | 0x01, H(1, 1, 1) | 0x08,
};
#undef H
|
||||
|
||||
static const uint8_t rho[24] = {
|
||||
1, 62, 28, 27, 36, 44, 6, 55, 20,
|
||||
|
@ -132,7 +153,11 @@ static void keccak_f1600(mbedtls_sha3_context *ctx)
|
|||
s[24] ^= (~lane[0]) & lane[1];
|
||||
|
||||
/* Iota */
|
||||
s[0] ^= rc[round];
|
||||
/* Decompress the round masks (see definition of iota_r_packed) */
|
||||
s[0] ^= ((iota_r_packed[round] & 0x40ull) << 57 |
|
||||
(iota_r_packed[round] & 0x20ull) << 26 |
|
||||
(iota_r_packed[round] & 0x10ull) << 11 |
|
||||
(iota_r_packed[round] & 0x8f));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue