nixpkgs-suyu/nixos/tests/acme.nix
Ryan Lahfa 0b0726ae0b
Merge pull request #205983 from m1cr0man/acme-test-fix
nixos/acme: Increase number of retries in testing
2022-12-22 02:19:19 +01:00

675 lines
26 KiB
Nix

{ pkgs, lib, ... }: let
commonConfig = ./common/acme/client;
dnsServerIP = nodes: nodes.dnsserver.networking.primaryIPAddress;
dnsScript = nodes: let
dnsAddress = dnsServerIP nodes;
in pkgs.writeShellScript "dns-hook.sh" ''
set -euo pipefail
echo '[INFO]' "[$2]" 'dns-hook.sh' $*
if [ "$1" = "present" ]; then
${pkgs.curl}/bin/curl --data '{"host": "'"$2"'", "value": "'"$3"'"}' http://${dnsAddress}:8055/set-txt
else
${pkgs.curl}/bin/curl --data '{"host": "'"$2"'"}' http://${dnsAddress}:8055/clear-txt
fi
'';
dnsConfig = nodes: {
dnsProvider = "exec";
dnsPropagationCheck = false;
credentialsFile = pkgs.writeText "wildcard.env" ''
EXEC_PATH=${dnsScript nodes}
EXEC_POLLING_INTERVAL=1
EXEC_PROPAGATION_TIMEOUT=1
EXEC_SEQUENCE_INTERVAL=1
'';
};
documentRoot = pkgs.runCommand "docroot" {} ''
mkdir -p "$out"
echo hello world > "$out/index.html"
'';
vhostBase = {
forceSSL = true;
locations."/".root = documentRoot;
};
vhostBaseHttpd = {
forceSSL = true;
inherit documentRoot;
};
simpleConfig = {
security.acme = {
certs."http.example.test" = {
listenHTTP = ":80";
};
};
networking.firewall.allowedTCPPorts = [ 80 ];
};
# Base specialisation config for testing general ACME features
webserverBasicConfig = {
services.nginx.enable = true;
services.nginx.virtualHosts."a.example.test" = vhostBase // {
enableACME = true;
};
};
# Generate specialisations for testing a web server
mkServerConfigs = { server, group, vhostBaseData, extraConfig ? {} }: let
baseConfig = { nodes, config, specialConfig ? {} }: lib.mkMerge [
{
security.acme = {
defaults = (dnsConfig nodes);
# One manual wildcard cert
certs."example.test" = {
domain = "*.example.test";
};
};
users.users."${config.services."${server}".user}".extraGroups = ["acme"];
services."${server}" = {
enable = true;
virtualHosts = {
# Run-of-the-mill vhost using HTTP-01 validation
"${server}-http.example.test" = vhostBaseData // {
serverAliases = [ "${server}-http-alias.example.test" ];
enableACME = true;
};
# Another which inherits the DNS-01 config
"${server}-dns.example.test" = vhostBaseData // {
serverAliases = [ "${server}-dns-alias.example.test" ];
enableACME = true;
# Set acmeRoot to null instead of using the default of "/var/lib/acme/acme-challenge"
# webroot + dnsProvider are mutually exclusive.
acmeRoot = null;
};
# One using the wildcard certificate
"${server}-wildcard.example.test" = vhostBaseData // {
serverAliases = [ "${server}-wildcard-alias.example.test" ];
useACMEHost = "example.test";
};
};
};
# Used to determine if service reload was triggered
systemd.targets."test-renew-${server}" = {
wants = [ "acme-${server}-http.example.test.service" ];
after = [ "acme-${server}-http.example.test.service" "${server}-config-reload.service" ];
};
}
specialConfig
extraConfig
];
in {
"${server}".configuration = { nodes, config, ... }: baseConfig {
inherit nodes config;
};
# Test that server reloads when an alias is removed (and subsequently test removal works in acme)
"${server}-remove-alias".configuration = { nodes, config, ... }: baseConfig {
inherit nodes config;
specialConfig = {
# Remove an alias, but create a standalone vhost in its place for testing.
# This configuration results in certificate errors as useACMEHost does not imply
# append extraDomains, and thus we can validate the SAN is removed.
services."${server}" = {
virtualHosts."${server}-http.example.test".serverAliases = lib.mkForce [];
virtualHosts."${server}-http-alias.example.test" = vhostBaseData // {
useACMEHost = "${server}-http.example.test";
};
};
};
};
# Test that the server reloads when only the acme configuration is changed.
"${server}-change-acme-conf".configuration = { nodes, config, ... }: baseConfig {
inherit nodes config;
specialConfig = {
security.acme.certs."${server}-http.example.test" = {
keyType = "ec384";
# Also test that postRun is exec'd as root
postRun = "id | grep root";
};
};
};
};
in {
name = "acme";
meta = {
maintainers = lib.teams.acme.members;
# Hard timeout in seconds. Average run time is about 7 minutes.
timeout = 1800;
};
nodes = {
# The fake ACME server which will respond to client requests
acme = { nodes, ... }: {
imports = [ ./common/acme/server ];
networking.nameservers = lib.mkForce [ (dnsServerIP nodes) ];
};
# A fake DNS server which can be configured with records as desired
# Used to test DNS-01 challenge
dnsserver = { nodes, ... }: {
networking.firewall.allowedTCPPorts = [ 8055 53 ];
networking.firewall.allowedUDPPorts = [ 53 ];
systemd.services.pebble-challtestsrv = {
enable = true;
description = "Pebble ACME challenge test server";
wantedBy = [ "network.target" ];
serviceConfig = {
ExecStart = "${pkgs.pebble}/bin/pebble-challtestsrv -dns01 ':53' -defaultIPv6 '' -defaultIPv4 '${nodes.webserver.networking.primaryIPAddress}'";
# Required to bind on privileged ports.
AmbientCapabilities = [ "CAP_NET_BIND_SERVICE" ];
};
};
};
# A web server which will be the node requesting certs
webserver = { nodes, config, ... }: {
imports = [ commonConfig ];
networking.nameservers = lib.mkForce [ (dnsServerIP nodes) ];
networking.firewall.allowedTCPPorts = [ 80 443 ];
# OpenSSL will be used for more thorough certificate validation
environment.systemPackages = [ pkgs.openssl ];
# Set log level to info so that we can see when the service is reloaded
services.nginx.logError = "stderr info";
specialisation = {
# Tests HTTP-01 verification using Lego's built-in web server
http01lego.configuration = simpleConfig;
renew.configuration = lib.mkMerge [
simpleConfig
{
# Pebble provides 5 year long certs,
# needs to be higher than that to test renewal
security.acme.certs."http.example.test".validMinDays = 9999;
}
];
# Tests that account creds can be safely changed.
accountchange.configuration = lib.mkMerge [
simpleConfig
{
security.acme.certs."http.example.test".email = "admin@example.test";
}
];
# First derivation used to test general ACME features
general.configuration = { ... }: let
caDomain = nodes.acme.test-support.acme.caDomain;
email = config.security.acme.defaults.email;
# Exit 99 to make it easier to track if this is the reason a renew failed
accountCreateTester = ''
test -e accounts/${caDomain}/${email}/account.json || exit 99
'';
in lib.mkMerge [
webserverBasicConfig
{
# Used to test that account creation is collated into one service.
# These should not run until after acme-finished-a.example.test.target
systemd.services."b.example.test".preStart = accountCreateTester;
systemd.services."c.example.test".preStart = accountCreateTester;
services.nginx.virtualHosts."b.example.test" = vhostBase // {
enableACME = true;
};
services.nginx.virtualHosts."c.example.test" = vhostBase // {
enableACME = true;
};
}
];
# Test OCSP Stapling
ocsp-stapling.configuration = { ... }: lib.mkMerge [
webserverBasicConfig
{
security.acme.certs."a.example.test".ocspMustStaple = true;
services.nginx.virtualHosts."a.example.test" = {
extraConfig = ''
ssl_stapling on;
ssl_stapling_verify on;
'';
};
}
];
# Validate service relationships by adding a slow start service to nginx' wants.
# Reproducer for https://github.com/NixOS/nixpkgs/issues/81842
slow-startup.configuration = { ... }: lib.mkMerge [
webserverBasicConfig
{
systemd.services.my-slow-service = {
wantedBy = [ "multi-user.target" "nginx.service" ];
before = [ "nginx.service" ];
preStart = "sleep 5";
script = "${pkgs.python3}/bin/python -m http.server";
};
services.nginx.virtualHosts."slow.example.test" = {
forceSSL = true;
enableACME = true;
locations."/".proxyPass = "http://localhost:8000";
};
}
];
# Test lego internal server (listenHTTP option)
# Also tests useRoot option
lego-server.configuration = { ... }: {
security.acme.useRoot = true;
security.acme.certs."lego.example.test" = {
listenHTTP = ":80";
group = "nginx";
};
services.nginx.enable = true;
services.nginx.virtualHosts."lego.example.test" = {
useACMEHost = "lego.example.test";
onlySSL = true;
};
};
# Test compatibility with Caddy
# It only supports useACMEHost, hence not using mkServerConfigs
} // (let
baseCaddyConfig = { nodes, config, ... }: {
security.acme = {
defaults = (dnsConfig nodes);
# One manual wildcard cert
certs."example.test" = {
domain = "*.example.test";
};
};
users.users."${config.services.caddy.user}".extraGroups = ["acme"];
services.caddy = {
enable = true;
virtualHosts."a.exmaple.test" = {
useACMEHost = "example.test";
extraConfig = ''
root * ${documentRoot}
'';
};
};
};
in {
caddy.configuration = baseCaddyConfig;
# Test that the server reloads when only the acme configuration is changed.
"caddy-change-acme-conf".configuration = { nodes, config, ... }: lib.mkMerge [
(baseCaddyConfig {
inherit nodes config;
})
{
security.acme.certs."example.test" = {
keyType = "ec384";
};
}
];
# Test compatibility with Nginx
}) // (mkServerConfigs {
server = "nginx";
group = "nginx";
vhostBaseData = vhostBase;
})
# Test compatibility with Apache HTTPD
// (mkServerConfigs {
server = "httpd";
group = "wwwrun";
vhostBaseData = vhostBaseHttpd;
extraConfig = {
services.httpd.adminAddr = config.security.acme.defaults.email;
};
});
};
# The client will be used to curl the webserver to validate configuration
client = { nodes, ... }: {
imports = [ commonConfig ];
networking.nameservers = lib.mkForce [ (dnsServerIP nodes) ];
# OpenSSL will be used for more thorough certificate validation
environment.systemPackages = [ pkgs.openssl ];
};
};
testScript = { nodes, ... }:
let
caDomain = nodes.acme.test-support.acme.caDomain;
newServerSystem = nodes.webserver.config.system.build.toplevel;
switchToNewServer = "${newServerSystem}/bin/switch-to-configuration test";
in
# Note, wait_for_unit does not work for oneshot services that do not have RemainAfterExit=true,
# this is because a oneshot goes from inactive => activating => inactive, and never
# reaches the active state. Targets do not have this issue.
''
import time
TOTAL_RETRIES = 20
class BackoffTracker(object):
delay = 1
increment = 1
def handle_fail(self, retries, message) -> int:
assert retries < TOTAL_RETRIES, message
print(f"Retrying in {self.delay}s, {retries + 1}/{TOTAL_RETRIES}")
time.sleep(self.delay)
# Only increment after the first try
if retries == 0:
self.delay += self.increment
self.increment *= 2
return retries + 1
backoff = BackoffTracker()
def switch_to(node, name):
# On first switch, this will create a symlink to the current system so that we can
# quickly switch between derivations
root_specs = "/tmp/specialisation"
node.execute(
f"test -e {root_specs}"
f" || ln -s $(readlink /run/current-system)/specialisation {root_specs}"
)
switcher_path = f"/run/current-system/specialisation/{name}/bin/switch-to-configuration"
rc, _ = node.execute(f"test -e '{switcher_path}'")
if rc > 0:
switcher_path = f"/tmp/specialisation/{name}/bin/switch-to-configuration"
node.succeed(
f"{switcher_path} test"
)
# Ensures the issuer of our cert matches the chain
# and matches the issuer we expect it to be.
# It's a good validation to ensure the cert.pem and fullchain.pem
# are not still selfsigned afer verification
def check_issuer(node, cert_name, issuer):
for fname in ("cert.pem", "fullchain.pem"):
actual_issuer = node.succeed(
f"openssl x509 -noout -issuer -in /var/lib/acme/{cert_name}/{fname}"
).partition("=")[2]
print(f"{fname} issuer: {actual_issuer}")
assert issuer.lower() in actual_issuer.lower()
# Ensure cert comes before chain in fullchain.pem
def check_fullchain(node, cert_name):
subject_data = node.succeed(
f"openssl crl2pkcs7 -nocrl -certfile /var/lib/acme/{cert_name}/fullchain.pem"
" | openssl pkcs7 -print_certs -noout"
)
for line in subject_data.lower().split("\n"):
if "subject" in line:
print(f"First subject in fullchain.pem: {line}")
assert cert_name.lower() in line
return
assert False
def check_connection(node, domain, retries=0):
result = node.succeed(
"openssl s_client -brief -verify 2 -CAfile /tmp/ca.crt"
f" -servername {domain} -connect {domain}:443 < /dev/null 2>&1"
)
for line in result.lower().split("\n"):
if "verification" in line and "error" in line:
retries = backoff.handle_fail(retries, f"Failed to connect to https://{domain}")
return check_connection(node, domain, retries)
def check_connection_key_bits(node, domain, bits, retries=0):
result = node.succeed(
"openssl s_client -CAfile /tmp/ca.crt"
f" -servername {domain} -connect {domain}:443 < /dev/null"
" | openssl x509 -noout -text | grep -i Public-Key"
)
print("Key type:", result)
if bits not in result:
retries = backoff.handle_fail(retries, f"Did not find expected number of bits ({bits}) in key")
return check_connection_key_bits(node, domain, bits, retries)
def check_stapling(node, domain, retries=0):
# Pebble doesn't provide a full OCSP responder, so just check the URL
result = node.succeed(
"openssl s_client -CAfile /tmp/ca.crt"
f" -servername {domain} -connect {domain}:443 < /dev/null"
" | openssl x509 -noout -ocsp_uri"
)
print("OCSP Responder URL:", result)
if "${caDomain}:4002" not in result.lower():
retries = backoff.handle_fail(retries, "OCSP Stapling check failed")
return check_stapling(node, domain, retries)
def download_ca_certs(node, retries=0):
exit_code, _ = node.execute("curl https://${caDomain}:15000/roots/0 > /tmp/ca.crt")
exit_code_2, _ = node.execute(
"curl https://${caDomain}:15000/intermediate-keys/0 >> /tmp/ca.crt"
)
if exit_code + exit_code_2 > 0:
retries = backoff.handle_fail(retries, "Failed to connect to pebble to download root CA certs")
return download_ca_certs(node, retries)
start_all()
dnsserver.wait_for_unit("pebble-challtestsrv.service")
client.wait_for_unit("default.target")
client.succeed(
'curl --data \'{"host": "${caDomain}", "addresses": ["${nodes.acme.networking.primaryIPAddress}"]}\' http://${dnsServerIP nodes}:8055/add-a'
)
acme.wait_for_unit("network-online.target")
acme.wait_for_unit("pebble.service")
download_ca_certs(client)
# Perform http-01 w/ lego test first
with subtest("Can request certificate with Lego's built in web server"):
switch_to(webserver, "http01lego")
webserver.wait_for_unit("acme-finished-http.example.test.target")
check_fullchain(webserver, "http.example.test")
check_issuer(webserver, "http.example.test", "pebble")
# Perform renewal test
with subtest("Can renew certificates when they expire"):
hash = webserver.succeed("sha256sum /var/lib/acme/http.example.test/cert.pem")
switch_to(webserver, "renew")
webserver.wait_for_unit("acme-finished-http.example.test.target")
check_fullchain(webserver, "http.example.test")
check_issuer(webserver, "http.example.test", "pebble")
hash_after = webserver.succeed("sha256sum /var/lib/acme/http.example.test/cert.pem")
assert hash != hash_after
# Perform account change test
with subtest("Handles email change correctly"):
hash = webserver.succeed("sha256sum /var/lib/acme/http.example.test/cert.pem")
switch_to(webserver, "accountchange")
webserver.wait_for_unit("acme-finished-http.example.test.target")
check_fullchain(webserver, "http.example.test")
check_issuer(webserver, "http.example.test", "pebble")
hash_after = webserver.succeed("sha256sum /var/lib/acme/http.example.test/cert.pem")
# Has to do a full run to register account, which creates new certs.
assert hash != hash_after
# Perform general tests
switch_to(webserver, "general")
with subtest("Can request certificate with HTTP-01 challenge"):
webserver.wait_for_unit("acme-finished-a.example.test.target")
check_fullchain(webserver, "a.example.test")
check_issuer(webserver, "a.example.test", "pebble")
webserver.wait_for_unit("nginx.service")
check_connection(client, "a.example.test")
with subtest("Runs 1 cert for account creation before others"):
webserver.wait_for_unit("acme-finished-b.example.test.target")
webserver.wait_for_unit("acme-finished-c.example.test.target")
check_connection(client, "b.example.test")
check_connection(client, "c.example.test")
with subtest("Certificates and accounts have safe + valid permissions"):
# Nginx will set the group appropriately when enableACME is used
group = "nginx"
webserver.succeed(
f"test $(stat -L -c '%a %U %G' /var/lib/acme/a.example.test/*.pem | tee /dev/stderr | grep '640 acme {group}' | wc -l) -eq 5"
)
webserver.succeed(
f"test $(stat -L -c '%a %U %G' /var/lib/acme/.lego/a.example.test/**/a.example.test* | tee /dev/stderr | grep '600 acme {group}' | wc -l) -eq 4"
)
webserver.succeed(
f"test $(stat -L -c '%a %U %G' /var/lib/acme/a.example.test | tee /dev/stderr | grep '750 acme {group}' | wc -l) -eq 1"
)
webserver.succeed(
f"test $(find /var/lib/acme/accounts -type f -exec stat -L -c '%a %U %G' {{}} \\; | tee /dev/stderr | grep -v '600 acme {group}' | wc -l) -eq 0"
)
# Selfsigned certs tests happen late so we aren't fighting the system init triggering cert renewal
with subtest("Can generate valid selfsigned certs"):
webserver.succeed("systemctl clean acme-a.example.test.service --what=state")
webserver.succeed("systemctl start acme-selfsigned-a.example.test.service")
check_fullchain(webserver, "a.example.test")
check_issuer(webserver, "a.example.test", "minica")
# Check selfsigned permissions
webserver.succeed(
f"test $(stat -L -c '%a %U %G' /var/lib/acme/a.example.test/*.pem | tee /dev/stderr | grep '640 acme {group}' | wc -l) -eq 5"
)
# Will succeed if nginx can load the certs
webserver.succeed("systemctl start nginx-config-reload.service")
with subtest("Correctly implements OCSP stapling"):
switch_to(webserver, "ocsp-stapling")
webserver.wait_for_unit("acme-finished-a.example.test.target")
check_stapling(client, "a.example.test")
with subtest("Can request certificate with HTTP-01 using lego's internal web server"):
switch_to(webserver, "lego-server")
webserver.wait_for_unit("acme-finished-lego.example.test.target")
webserver.wait_for_unit("nginx.service")
webserver.succeed("echo HENLO && systemctl cat nginx.service")
webserver.succeed("test \"$(stat -c '%U' /var/lib/acme/* | uniq)\" = \"root\"")
check_connection(client, "a.example.test")
check_connection(client, "lego.example.test")
with subtest("Can request certificate with HTTP-01 when nginx startup is delayed"):
webserver.execute("systemctl stop nginx")
switch_to(webserver, "slow-startup")
webserver.wait_for_unit("acme-finished-slow.example.test.target")
check_issuer(webserver, "slow.example.test", "pebble")
webserver.wait_for_unit("nginx.service")
check_connection(client, "slow.example.test")
with subtest("Works with caddy"):
switch_to(webserver, "caddy")
webserver.wait_for_unit("acme-finished-example.test.target")
webserver.wait_for_unit("caddy.service")
# FIXME reloading caddy is not sufficient to load new certs.
# Restart it manually until this is fixed.
webserver.succeed("systemctl restart caddy.service")
check_connection(client, "a.example.test")
with subtest("security.acme changes reflect on caddy"):
switch_to(webserver, "caddy-change-acme-conf")
webserver.wait_for_unit("acme-finished-example.test.target")
webserver.wait_for_unit("caddy.service")
# FIXME reloading caddy is not sufficient to load new certs.
# Restart it manually until this is fixed.
webserver.succeed("systemctl restart caddy.service")
check_connection_key_bits(client, "a.example.test", "384")
domains = ["http", "dns", "wildcard"]
for server, logsrc in [
("nginx", "journalctl -n 30 -u nginx.service"),
("httpd", "tail -n 30 /var/log/httpd/*.log"),
]:
wait_for_server = lambda: webserver.wait_for_unit(f"{server}.service")
with subtest(f"Works with {server}"):
try:
switch_to(webserver, server)
# Skip wildcard domain for this check ([:-1])
for domain in domains[:-1]:
webserver.wait_for_unit(
f"acme-finished-{server}-{domain}.example.test.target"
)
except Exception as err:
_, output = webserver.execute(
f"{logsrc} && ls -al /var/lib/acme/acme-challenge"
)
print(output)
raise err
wait_for_server()
for domain in domains[:-1]:
check_issuer(webserver, f"{server}-{domain}.example.test", "pebble")
for domain in domains:
check_connection(client, f"{server}-{domain}.example.test")
check_connection(client, f"{server}-{domain}-alias.example.test")
test_domain = f"{server}-{domains[0]}.example.test"
with subtest(f"Can reload {server} when timer triggers renewal"):
# Switch to selfsigned first
webserver.succeed(f"systemctl clean acme-{test_domain}.service --what=state")
webserver.succeed(f"systemctl start acme-selfsigned-{test_domain}.service")
check_issuer(webserver, test_domain, "minica")
webserver.succeed(f"systemctl start {server}-config-reload.service")
webserver.succeed(f"systemctl start test-renew-{server}.target")
check_issuer(webserver, test_domain, "pebble")
check_connection(client, test_domain)
with subtest("Can remove an alias from a domain + cert is updated"):
test_alias = f"{server}-{domains[0]}-alias.example.test"
switch_to(webserver, f"{server}-remove-alias")
webserver.wait_for_unit(f"acme-finished-{test_domain}.target")
wait_for_server()
check_connection(client, test_domain)
rc, _s = client.execute(
f"openssl s_client -CAfile /tmp/ca.crt -connect {test_alias}:443"
" </dev/null 2>/dev/null | openssl x509 -noout -text"
f" | grep DNS: | grep {test_alias}"
)
assert rc > 0, "Removed extraDomainName was not removed from the cert"
with subtest("security.acme changes reflect on web server"):
# Switch back to normal server config first, reset everything.
switch_to(webserver, server)
wait_for_server()
switch_to(webserver, f"{server}-change-acme-conf")
webserver.wait_for_unit(f"acme-finished-{test_domain}.target")
wait_for_server()
check_connection_key_bits(client, test_domain, "384")
'';
}