diff --git a/pkgs/os-specific/linux/dcgm/default.nix b/pkgs/os-specific/linux/dcgm/default.nix
new file mode 100644
index 000000000000..36c7e3ca6880
--- /dev/null
+++ b/pkgs/os-specific/linux/dcgm/default.nix
@@ -0,0 +1,159 @@
+{ lib
+, callPackage
+, gcc11Stdenv
+, fetchFromGitHub
+, addOpenGLRunpath
+, catch2
+, cmake
+, cudaPackages_10_2
+, cudaPackages_11_8
+, cudaPackages_12
+, fmt_9
+, git
+, jsoncpp
+, libevent
+, plog
+, python3
+, symlinkJoin
+, tclap_1_4
+, yaml-cpp
+}:
+let
+  # Static, position-independent libevent built without SSL support.
+  # Flags copied from DCGM's libevent build script
+  libevent-nossl = libevent.override { sslSupport = false; };
+  libevent-nossl-static = libevent-nossl.overrideAttrs (super: {
+    CFLAGS = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
+    CXXFLAGS = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
+    # `or [ ]` keeps the override evaluating if libevent stops setting configureFlags.
+    configureFlags = (super.configureFlags or [ ]) ++ [ "--disable-shared" "--with-pic" ];
+  });
+
+  # jsoncpp with its static library enabled.
+  jsoncpp-static = jsoncpp.override { enableStatic = true; };
+
+  # DCGM depends on 3 different versions of CUDA at the same time.
+  # The runtime closure, thankfully, is quite small because most things
+  # are statically linked.
+  # Each entry pairs a CUDA major version with the packages providing it.
+  cudaPackageSetByVersion = [
+    {
+      version = "10";
+      # Nixpkgs cudaPackages_10 doesn't have redist packages broken out.
+      pkgSet = [
+        cudaPackages_10_2.cudatoolkit
+        cudaPackages_10_2.cudatoolkit.lib
+      ];
+    }
+    {
+      version = "11";
+      pkgSet = getCudaPackages cudaPackages_11_8;
+    }
+    {
+      version = "12";
+      pkgSet = getCudaPackages cudaPackages_12;
+    }
+  ];
+
+  # Select needed redist packages from cudaPackages
+  # C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
+  getCudaPackages = p: with p; [
+    cuda_cccl
+    cuda_cudart
+    cuda_nvcc
+    cuda_nvml_dev
+    libcublas
+    libcufft
+    libcurand
+  ];
+
+  # Builds CMake code to add CUDA paths for include and lib.
+  # Takes one entry of cudaPackageSetByVersion and returns the CMake snippet as a string.
+  mkAppendCudaPaths = { version, pkgSet }:
+    let
+      # The DCGM CMake assumes that the folder containing cuda.h contains all headers, so we must
+      # combine everything together for headers to work.
+      # It would be more convenient to use symlinkJoin on *just* the include subdirectories
+      # of each package, but not all of them have an include directory and making that work
+      # is more effort than it's worth for this temporary, build-time package.
+      combined = symlinkJoin {
+        name = "cuda-combined-${version}";
+        paths = pkgSet;
+      };
+      # The combined package above breaks the build for some reason so we just configure
+      # each package's library path.
+      libs = lib.concatMapStringsSep " " (x: ''"${x}/lib"'') pkgSet;
+    in ''
+      list(APPEND Cuda${version}_INCLUDE_PATHS "${combined}/include")
+      list(APPEND Cuda${version}_LIB_PATHS ${libs})
+    '';
+
+# gcc11 is required by DCGM's very particular build system
+# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/build.sh#L22
+in gcc11Stdenv.mkDerivation rec {
+  pname = "dcgm";
+  version = "3.1.8";
+
+  src = fetchFromGitHub {
+    owner = "NVIDIA";
+    repo = "DCGM";
+    rev = "refs/tags/v${version}";
+    hash = "sha256-OXqXkP2ZUNPzafGIgJ0MKa39xB84keVFFYl+JsHgnks=";
+  };
+
+  # Add our paths to the CUDA paths so FindCuda.cmake can find them.
+  # The generated CMake snippet reaches the builder as an environment variable
+  # and prePatch prepends it to the existing cmake/FindCuda.cmake.
+  EXTRA_CUDA_PATHS = lib.concatMapStringsSep "\n" mkAppendCudaPaths cudaPackageSetByVersion;
+  prePatch = ''
+    echo "$EXTRA_CUDA_PATHS"$'\n'"$(cat cmake/FindCuda.cmake)" > cmake/FindCuda.cmake
+  '';
+
+  hardeningDisable = [ "all" ];
+
+  # Tools that are executed on the build machine.
+  nativeBuildInputs = [
+    addOpenGLRunpath
+    cmake
+    git
+    python3
+  ];
+
+  # Libraries and headers that are compiled or linked into DCGM target the host
+  # platform, so they are buildInputs rather than nativeBuildInputs.
+  buildInputs = [
+    catch2
+    fmt_9
+    yaml-cpp
+
+    jsoncpp-static
+    jsoncpp-static.dev
+    libevent-nossl-static
+    libevent-nossl-static.dev
+    plog.dev # header-only
+    tclap_1_4 # header-only
+  ];
+
+  # libcuda.so must be found at runtime because it is supplied by the NVIDIA
+  # driver. autoAddOpenGLRunpathHook breaks on the statically linked exes.
+  postFixup = ''
+    find "$out/bin" "$out/lib" -type f -executable -print0 | while IFS= read -r -d "" f; do
+      if isELF "$f" && [[ $(patchelf --print-needed "$f" || true) == *libcuda.so* ]]; then
+        addOpenGLRunpath "$f"
+      fi
+    done
+  '';
+
+  # Fail the build if any CUDA package from cudaPackageSetByVersion is
+  # referenced by the output.
+  disallowedReferences = lib.concatMap (x: x.pkgSet) cudaPackageSetByVersion;
+
+  meta = with lib; {
+    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";
+    homepage = "https://developer.nvidia.com/dcgm";
+    license = licenses.asl20;
+    maintainers = teams.deshaw.members;
+    mainProgram = "dcgmi";
+    platforms = platforms.linux;
+  };
+}
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index 38c00cfc82b5..aff829e92770 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -541,6 +541,8 @@ with pkgs;
 
   dbip-country-lite = callPackage ../data/misc/dbip-country-lite { };
 
+  dcgm = callPackage ../os-specific/linux/dcgm { };
+
   dhallDirectoryToNix = callPackage ../build-support/dhall/directory-to-nix.nix { };
 
   dhallPackageToNix = callPackage ../build-support/dhall/package-to-nix.nix { };