nixpkgs-suyu/nixos/tests/slurm.nix

143 lines
3.8 KiB
Nix
Raw Normal View History

import ./make-test.nix ({ lib, ... }:
let
# Shared MUNGE key; the test script writes it to /etc/munge/munge.key on
# every machine. It is a fixed, publicly visible value used only by this test.
mungekey = "mungeverryweakkeybuteasytointegratoinatest";
2015-12-25 15:55:07 +01:00
# Slurm options shared by the machines of the test cluster; each machine
# that uses it merges this set into its own `services.slurm`.
slurmconfig = {
  # Host name of the machine acting as the Slurm controller.
  controlMachine = "control";

  # One partition ("debug") spanning the three compute nodes.
  partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
  nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];

  # Accounting goes through slurmdbd on the host named "dbd".
  extraConfig = ''
    AccountingStorageHost=dbd
    AccountingStorageType=accounting_storage/slurmdbd
  '';
};
in {
name = "slurm";
meta.maintainers = [ lib.maintainers.markuskowa ];
2015-12-25 15:55:07 +01:00
# Machines of the test cluster: one controller, one submit host, one
# accounting database host and three compute nodes.
nodes =
  let
    # Builds a machine module that enables the given Slurm role options and
    # merges the shared cluster settings (`slurmconfig`) on top of them.
    #
    # TODO slurmd port and slurmctld port should be configurations and
    # automatically allowed by the firewall.
    slurmMachine = roleOptions:
      { ... }:
      {
        networking.firewall.enable = false;
        services.slurm = roleOptions // slurmconfig;
      };

    # Compute node with the Slurm client role; reused for node1..node3.
    computeNode = slurmMachine { client.enable = true; };
  in {
    # Machine with the Slurm server role.
    control = slurmMachine { server.enable = true; };

    # Machine with only `enableStools` set (no client or server role).
    submit = slurmMachine { enableStools = true; };

    # Accounting host: slurmdbd plus the MySQL instance configured below.
    dbd =
      { pkgs, ... }:
      {
        networking.firewall.enable = false;

        services.slurm.dbdserver.enable = true;

        services.mysql = {
          enable = true;
          package = pkgs.mysql;

          # Database "slurm_acct_db" and a "slurm" user with full access to it.
          ensureDatabases = [ "slurm_acct_db" ];
          ensureUsers = [
            {
              name = "slurm";
              ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
            }
          ];

          extraOptions = ''
            # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
            innodb_buffer_pool_size=1024M
            innodb_log_file_size=64M
            innodb_lock_wait_timeout=900
          '';
        };
      };

    node1 = computeNode;
    node2 = computeNode;
    node3 = computeNode;
  };
2015-12-25 15:55:07 +01:00
# Perl test-driver script. It installs the shared MUNGE key on every machine,
# then (re)starts slurmdbd, slurmctld and slurmd in that order, runs a job on
# all three compute nodes and finally checks that the job was recorded by the
# accounting database.
testScript =
  ''
    startAll;

    # Set up authentication across the cluster
    foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3))
    {
      $node->waitForUnit("default.target");

      # -p keeps this step from failing if the directory already exists
      $node->succeed("mkdir -p /etc/munge");
      $node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
      $node->succeed("chmod 0400 /etc/munge/munge.key");
      $node->succeed("chown munge:munge /etc/munge/munge.key");

      $node->succeed("systemctl restart munged");
      $node->waitForUnit("munged");
    };

    # Restart the services since they have probably failed due to the munge init
    # failure
    subtest "can_start_slurmdbd", sub {
      $dbd->succeed("systemctl restart slurmdbd");
      $dbd->waitForUnit("slurmdbd.service");
      $dbd->waitForOpenPort(6819);
    };

    # there needs to be an entry for the current
    # cluster in the database before slurmctld is restarted
    subtest "add_account", sub {
      $control->succeed("sacctmgr -i add cluster default");
      # check for cluster entry
      $control->succeed("sacctmgr list cluster | awk '{ print \$1 }' | grep default");
    };

    subtest "can_start_slurmctld", sub {
      $control->succeed("systemctl restart slurmctld");
      $control->waitForUnit("slurmctld.service");
    };

    subtest "can_start_slurmd", sub {
      foreach my $node (($node1,$node2,$node3))
      {
        $node->succeed("systemctl restart slurmd.service");
        $node->waitForUnit("slurmd");
      }
    };

    # Test that the cluster works and can distribute jobs;
    subtest "run_distributed_command", sub {
      # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
      # The output must contain the 3 different names
      $submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
    };

    subtest "check_slurm_dbd", sub {
      # find the srun job from above in the database; poll until the record
      # shows up instead of sleeping for a fixed time, which is timing-dependent
      $control->waitUntilSucceeds("sacct | grep hostname");
    };
  '';
})