dcgm: init at 3.1.8

2023-04-24 23:42:44 -04:00 · 2023-04-24 23:42:44 -04:00 · 1cdc3752a5
commit 1cdc3752a5
parent 5ba94f8629
2 changed files with 149 additions and 0 deletions
--- a/pkgs/os-specific/linux/dcgm/default.nix
+++ b/pkgs/os-specific/linux/dcgm/default.nix
@ -0,0 +1,147 @@
+{ lib
+, callPackage
+, gcc11Stdenv
+, fetchFromGitHub
+, addOpenGLRunpath
+, catch2
+, cmake
+, cudaPackages_10_2
+, cudaPackages_11_8
+, cudaPackages_12
+, fmt_9
+, git
+, jsoncpp
+, libevent
+, plog
+, python3
+, symlinkJoin
+, tclap_1_4
+, yaml-cpp
+}:
+let
+  # SSL-free, static-only, PIC libevent; the flags are taken from DCGM's libevent build script.
+  libevent-nossl = libevent.override { sslSupport = false; };
+  libevent-nossl-static = libevent-nossl.overrideAttrs (prev: {
+    configureFlags = prev.configureFlags ++ [ "--disable-shared" "--with-pic" ];
+    CFLAGS = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
+    CXXFLAGS = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
+  });
+
+  jsoncpp-static = jsoncpp.override { enableStatic = true; };
+
+  # DCGM is built against three CUDA major versions (10, 11 and 12) at the same time.
+  # Each entry's `version` is spliced into the Cuda<version>_* CMake list names below.
+  # The runtime closure, thankfully, is quite small because most things are statically linked.
+  cudaPackageSetByVersion = [
+    {
+      version = "10";
+      # Nixpkgs' CUDA 10 package set has no separate redist packages, so use the whole toolkit.
+      pkgSet = [
+        cudaPackages_10_2.cudatoolkit
+        cudaPackages_10_2.cudatoolkit.lib
+      ];
+    }
+    {
+      version = "11";
+      pkgSet = getCudaPackages cudaPackages_11_8;
+    }
+    {
+      version = "12";
+      pkgSet = getCudaPackages cudaPackages_12;
+    }
+  ];
+
+  # Picks the redist packages DCGM needs out of a single cudaPackages set.
+  # C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
+  getCudaPackages = cudaPkgs: [
+    cudaPkgs.cuda_cccl
+    cudaPkgs.cuda_cudart
+    cudaPkgs.cuda_nvcc
+    cudaPkgs.cuda_nvml_dev
+    cudaPkgs.libcublas
+    cudaPkgs.libcufft
+    cudaPkgs.libcurand
+  ];
+
+  # Emits the CMake snippet that appends one CUDA version's include and lib paths.
+  mkAppendCudaPaths = { version, pkgSet }:
+    let
+      # DCGM's CMake expects the folder holding cuda.h to hold every CUDA header, so
+      # the headers have to be served from one merged tree.
+      # Joining only the include subdirectories would be tidier, but some packages
+      # have no include directory, and handling that is not worth the effort for a
+      # temporary, build-time-only package.
+      mergedTree = symlinkJoin {
+        paths = pkgSet;
+        name = "cuda-combined-${version}";
+      };
+      # Using the merged tree for libraries breaks the build for some reason, so
+      # every package's own lib directory is listed instead.
+      libDirs = lib.concatMapStringsSep " " (pkg: ''"${pkg}/lib"'') pkgSet;
+    in ''
+      list(APPEND Cuda${version}_INCLUDE_PATHS "${mergedTree}/include")
+      list(APPEND Cuda${version}_LIB_PATHS ${libDirs})
+    '';
+
+# gcc11 is required by DCGM's very particular build system
+# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/build.sh#L22
+in gcc11Stdenv.mkDerivation rec {
+  pname = "dcgm";
+  version = "3.1.8";
+
+  src = fetchFromGitHub {
+    owner = "NVIDIA";
+    repo = "DCGM";
+    rev = "refs/tags/v${version}"; # upstream tag name is derived from `version`
+    hash = "sha256-OXqXkP2ZUNPzafGIgJ0MKa39xB84keVFFYl+JsHgnks="; # must be updated together with `version`
+  };
+
+  # Prepend the generated CUDA path snippets to cmake/FindCuda.cmake so it can find our packages.
+  EXTRA_CUDA_PATHS = lib.concatStringsSep "\n" (map mkAppendCudaPaths cudaPackageSetByVersion);
+  prePatch = ''
+    echo "$EXTRA_CUDA_PATHS"$'\n'"$(cat cmake/FindCuda.cmake)" > cmake/FindCuda.cmake
+  '';
+
+  hardeningDisable = [ "all" ]; # NOTE(review): turns off every hardening flag; the reason is not recorded here — confirm it is still needed
+
+  nativeBuildInputs = [
+    addOpenGLRunpath
+    cmake
+    git
+    python3
+  ];
+
+  # Libraries and headers are host-platform inputs, so they go here rather than in nativeBuildInputs.
+  buildInputs = [
+    catch2
+    fmt_9
+    yaml-cpp
+    jsoncpp-static
+    jsoncpp-static.dev
+    libevent-nossl-static
+    libevent-nossl-static.dev
+    plog.dev # header-only
+    tclap_1_4 # header-only
+  ];
+
+  # libcuda.so comes from the NVIDIA driver and must be found at runtime: run addOpenGLRunpath on
+  # each executable ELF that needs it (autoAddOpenGLRunpathHook breaks on the statically linked exes).
+  postFixup = ''
+    find "$out/bin" "$out/lib" -type f -executable -print0 | while IFS= read -r -d "" f; do
+      if isELF "$f" && [[ $(patchelf --print-needed "$f" || true) == *libcuda.so* ]]; then
+        addOpenGLRunpath "$f"
+      fi
+    done
+  '';
+
+  disallowedReferences = lib.concatMap (cudaSet: cudaSet.pkgSet) cudaPackageSetByVersion; # every CUDA package of every version
+
+  meta = with lib; {
+    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";
+    homepage = "https://developer.nvidia.com/dcgm";
+    license = licenses.asl20;
+    maintainers = teams.deshaw.members;
+    mainProgram = "dcgmi";
+    platforms = platforms.linux;
+  };
+}
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@ -541,6 +541,8 @@ with pkgs;

  dbip-country-lite = callPackage ../data/misc/dbip-country-lite { };

+  dcgm = callPackage ../os-specific/linux/dcgm { };
+
  dhallDirectoryToNix = callPackage ../build-support/dhall/directory-to-nix.nix { };

  dhallPackageToNix = callPackage ../build-support/dhall/package-to-nix.nix { };