63746cac08
This PR refactor CUDA setup hooks, and in particular autoAddOpenGLRunpath and autoAddCudaCompatRunpathHook, that were using a lot of code in common (in fact, I introduced the latter by copy pasting most of the bash script of the former). This is not satisfying for maintenance, as a recent patch showed, because we need to duplicate changes to both hooks. This commit abstract the common part in a single shell script that applies a generic patch action to every elf file in the output. For autoAddOpenGLRunpath the action is just addOpenGLRunpath (now addDriverRunpath), and is few line function for autoAddCudaCompatRunpathHook. Doing so, we also takes the occasion to use the newer addDriverRunpath instead of the previous addOpenGLRunpath, and rename the CUDA hook to reflect that as well. Co-Authored-By: Connor Baker <connor.baker@tweag.io>
139 lines
4.2 KiB
Nix
139 lines
4.2 KiB
Nix
{ lib
|
|
, gcc11Stdenv
|
|
, fetchFromGitHub
|
|
, catch2
|
|
, cmake
|
|
, cudaPackages_10_2
|
|
, cudaPackages_11_8
|
|
, cudaPackages_12
|
|
, fmt_9
|
|
, git
|
|
, jsoncpp
|
|
, libevent
|
|
, plog
|
|
, python3
|
|
, symlinkJoin
|
|
, tclap_1_4
|
|
, yaml-cpp
|
|
}:
|
|
let
|
|
# Flags copied from DCGM's libevent build script
|
|
libevent-nossl = libevent.override { sslSupport = false; };
|
|
libevent-nossl-static = libevent-nossl.overrideAttrs (super: {
|
|
CFLAGS = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
|
|
CXXFLAGS = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
|
|
configureFlags = super.configureFlags ++ [ "--disable-shared" "--with-pic" ];
|
|
});
|
|
|
|
jsoncpp-static = jsoncpp.override { enableStatic = true; };
|
|
|
|
# DCGM depends on 3 different versions of CUDA at the same time.
|
|
# The runtime closure, thankfully, is quite small because most things
|
|
# are statically linked.
|
|
cudaPackageSetByVersion = [
|
|
{
|
|
version = "10";
|
|
# Nixpkgs cudaPackages_10 doesn't have redist packages broken out.
|
|
pkgSet = [
|
|
cudaPackages_10_2.cudatoolkit
|
|
cudaPackages_10_2.cudatoolkit.lib
|
|
];
|
|
}
|
|
{
|
|
version = "11";
|
|
pkgSet = getCudaPackages cudaPackages_11_8;
|
|
}
|
|
{
|
|
version = "12";
|
|
pkgSet = getCudaPackages cudaPackages_12;
|
|
}
|
|
];
|
|
|
|
# Select needed redist packages from cudaPackages
|
|
# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
|
|
getCudaPackages = p: with p; [
|
|
cuda_cccl
|
|
cuda_cudart
|
|
cuda_nvcc
|
|
cuda_nvml_dev
|
|
libcublas
|
|
libcufft
|
|
libcurand
|
|
];
|
|
|
|
# Builds CMake code to add CUDA paths for include and lib.
|
|
mkAppendCudaPaths = { version, pkgSet }:
|
|
let
|
|
# The DCGM CMake assumes that the folder containing cuda.h contains all headers, so we must
|
|
# combine everything together for headers to work.
|
|
# It would be more convenient to use symlinkJoin on *just* the include subdirectories
|
|
# of each package, but not all of them have an include directory and making that work
|
|
# is more effort than it's worth for this temporary, build-time package.
|
|
combined = symlinkJoin {
|
|
name = "cuda-combined-${version}";
|
|
paths = pkgSet;
|
|
};
|
|
# The combined package above breaks the build for some reason so we just configure
|
|
# each package's library path.
|
|
libs = lib.concatMapStringsSep " " (x: ''"${x}/lib"'') pkgSet;
|
|
in ''
|
|
list(APPEND Cuda${version}_INCLUDE_PATHS "${combined}/include")
|
|
list(APPEND Cuda${version}_LIB_PATHS ${libs})
|
|
'';
|
|
|
|
# gcc11 is required by DCGM's very particular build system
|
|
# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/build.sh#L22
|
|
in gcc11Stdenv.mkDerivation rec {
|
|
pname = "dcgm";
|
|
version = "3.2.5"; # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.
|
|
|
|
src = fetchFromGitHub {
|
|
owner = "NVIDIA";
|
|
repo = "DCGM";
|
|
rev = "refs/tags/v${version}";
|
|
hash = "sha256-iMyYOr3dSpdRV2S/TlB/tEOAWYhK09373ZRbd5vzogQ=";
|
|
};
|
|
|
|
# Add our paths to the CUDA paths so FindCuda.cmake can find them.
|
|
EXTRA_CUDA_PATHS = lib.concatMapStringsSep "\n" mkAppendCudaPaths cudaPackageSetByVersion;
|
|
prePatch = ''
|
|
echo "$EXTRA_CUDA_PATHS"$'\n'"$(cat cmake/FindCuda.cmake)" > cmake/FindCuda.cmake
|
|
'';
|
|
|
|
hardeningDisable = [ "all" ];
|
|
|
|
strictDeps = true;
|
|
|
|
nativeBuildInputs = [
|
|
# autoAddDriverRunpath does not actually depend on or incur any dependency
|
|
# of cudaPackages. It merely adds an impure, non-Nix PATH to the RPATHs of
|
|
# executables that need to use cuda at runtime.
|
|
cudaPackages_12.autoAddDriverRunpath
|
|
|
|
cmake
|
|
git
|
|
python3
|
|
];
|
|
|
|
buildInputs = [
|
|
plog.dev # header-only
|
|
tclap_1_4 # header-only
|
|
|
|
catch2
|
|
fmt_9
|
|
jsoncpp-static
|
|
libevent-nossl-static
|
|
yaml-cpp
|
|
];
|
|
|
|
disallowedReferences = lib.concatMap (x: x.pkgSet) cudaPackageSetByVersion;
|
|
|
|
meta = with lib; {
|
|
description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs.";
|
|
homepage = "https://developer.nvidia.com/dcgm";
|
|
license = licenses.asl20;
|
|
maintainers = teams.deshaw.members;
|
|
mainProgram = "dcgmi";
|
|
platforms = platforms.linux;
|
|
};
|
|
}
|