Merge pull request #233581 from SomeoneSerge/CUDAToolkit_ROOT
cudaPackages.setupCudaHook: init
This commit is contained in:
commit
5825210d3f
11 changed files with 278 additions and 67 deletions
|
@ -24,6 +24,7 @@ args@
|
|||
, libkrb5
|
||||
, krb5
|
||||
, makeWrapper
|
||||
, markForCudatoolkitRootHook
|
||||
, ncurses5
|
||||
, numactl
|
||||
, nss
|
||||
|
@ -31,6 +32,7 @@ args@
|
|||
, python3 # FIXME: CUDAToolkit 10 may still need python27
|
||||
, pulseaudio
|
||||
, requireFile
|
||||
, setupCudaHook
|
||||
, stdenv
|
||||
, backendStdenv # E.g. gcc11Stdenv, set in extension.nix
|
||||
, unixODBC
|
||||
|
@ -80,11 +82,15 @@ backendStdenv.mkDerivation rec {
|
|||
addOpenGLRunpath
|
||||
autoPatchelfHook
|
||||
autoAddOpenGLRunpathHook
|
||||
markForCudatoolkitRootHook
|
||||
] ++ lib.optionals (lib.versionOlder version "11") [
|
||||
libsForQt5.wrapQtAppsHook
|
||||
] ++ lib.optionals (lib.versionAtLeast version "11.8") [
|
||||
qt6Packages.wrapQtAppsHook
|
||||
];
|
||||
depsTargetTargetPropagated = [
|
||||
setupCudaHook
|
||||
];
|
||||
buildInputs = lib.optionals (lib.versionOlder version "11") [
|
||||
libsForQt5.qt5.qtwebengine
|
||||
freeglut
|
||||
|
@ -280,24 +286,12 @@ backendStdenv.mkDerivation rec {
|
|||
sed -i "1 i#define _BITS_FLOATN_H" "$out/include/host_defines.h"
|
||||
'' +
|
||||
# Point NVCC at a compatible compiler
|
||||
# FIXME: redist cuda_nvcc copy-pastes this code
|
||||
# Refer to comments in the overrides for cuda_nvcc for explanation
|
||||
# CUDA_TOOLKIT_ROOT_DIR is legacy,
|
||||
# Cf. https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
|
||||
# NOTE: We unconditionally set -Xfatbin=-compress-all, which reduces the size of the compiled
|
||||
# binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
|
||||
# the default set of CUDA capabilities we build can regularly cause this to occur (for
|
||||
# example, with Magma).
|
||||
''
|
||||
mkdir -p $out/nix-support
|
||||
cat <<EOF >> $out/nix-support/setup-hook
|
||||
cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out'
|
||||
cmakeFlags+=' -DCUDA_HOST_COMPILER=${backendStdenv.cc}/bin'
|
||||
cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${backendStdenv.cc}/bin'
|
||||
if [ -z "\''${CUDAHOSTCXX-}" ]; then
|
||||
export CUDAHOSTCXX=${backendStdenv.cc}/bin;
|
||||
fi
|
||||
export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${backendStdenv.cc}/bin -Xfatbin=-compress-all'
|
||||
EOF
|
||||
|
||||
# Move some libraries to the lib output so that programs that
|
||||
|
|
|
@ -38,10 +38,39 @@ final: prev: let
|
|||
|
||||
cudaFlags = final.callPackage ./flags.nix {};
|
||||
|
||||
# Internal hook, used by cudatoolkit and cuda redist packages
|
||||
# to accommodate automatic CUDAToolkit_ROOT construction
|
||||
markForCudatoolkitRootHook = (final.callPackage
|
||||
({ makeSetupHook }:
|
||||
makeSetupHook
|
||||
{ name = "mark-for-cudatoolkit-root-hook"; }
|
||||
./hooks/mark-for-cudatoolkit-root-hook.sh)
|
||||
{ });
|
||||
|
||||
# Normally propagated by cuda_nvcc or cudatoolkit through their depsHostHostPropagated
|
||||
setupCudaHook = (final.callPackage
|
||||
({ makeSetupHook, backendStdenv }:
|
||||
makeSetupHook
|
||||
{
|
||||
name = "setup-cuda-hook";
|
||||
|
||||
substitutions.ccRoot = "${backendStdenv.cc}";
|
||||
|
||||
# Required in addition to ccRoot as otherwise bin/gcc is looked up
|
||||
# when building CMakeCUDACompilerId.cu
|
||||
substitutions.ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
|
||||
}
|
||||
./hooks/setup-cuda-hook.sh)
|
||||
{ });
|
||||
|
||||
in
|
||||
{
|
||||
inherit
|
||||
backendStdenv
|
||||
cudatoolkit
|
||||
cudaFlags;
|
||||
cudaFlags
|
||||
markForCudatoolkitRootHook
|
||||
setupCudaHook;
|
||||
|
||||
saxpy = final.callPackage ./saxpy { };
|
||||
}
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
# shellcheck shell=bash
|
||||
|
||||
markForCUDAToolkit_ROOT() {
|
||||
mkdir -p "${prefix}/nix-support"
|
||||
touch "${prefix}/nix-support/include-in-cudatoolkit-root"
|
||||
}
|
||||
|
||||
fixupOutputHooks+=(markForCUDAToolkit_ROOT)
|
|
@ -0,0 +1,5 @@
|
|||
# shellcheck shell=bash
|
||||
|
||||
# CMake's enable_language(CUDA) runs a compiler test and it doesn't account for
|
||||
# CUDAToolkit_ROOT. We have to help it locate libcudart
|
||||
export NVCC_APPEND_FLAGS+=" -L@cudartRoot@/lib -I@cudartRoot@/include"
|
|
@ -0,0 +1,68 @@
|
|||
# shellcheck shell=bash
|
||||
|
||||
echo Sourcing setup-cuda-hook >&2
|
||||
|
||||
extendCUDAToolkit_ROOT() {
|
||||
if [[ -f "$1/nix-support/include-in-cudatoolkit-root" ]] ; then
|
||||
addToSearchPathWithCustomDelimiter ";" CUDAToolkit_ROOT "$1"
|
||||
|
||||
if [[ -d "$1/include" ]] ; then
|
||||
addToSearchPathWithCustomDelimiter ";" CUDAToolkit_INCLUDE_DIR "$1/include"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
addEnvHooks "$targetOffset" extendCUDAToolkit_ROOT
|
||||
|
||||
setupCUDAToolkitCompilers() {
|
||||
echo Executing setupCUDAToolkitCompilers >&2
|
||||
|
||||
if [[ -n "${dontSetupCUDAToolkitCompilers-}" ]] ; then
|
||||
return
|
||||
fi
|
||||
|
||||
# Point NVCC at a compatible compiler
|
||||
|
||||
# For CMake-based projects:
|
||||
# https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
|
||||
# https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
|
||||
# https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html
|
||||
|
||||
export cmakeFlags+=" -DCUDA_HOST_COMPILER=@ccFullPath@"
|
||||
export cmakeFlags+=" -DCMAKE_CUDA_HOST_COMPILER=@ccFullPath@"
|
||||
|
||||
# For non-CMake projects:
|
||||
# We prepend --compiler-bindir to nvcc flags.
|
||||
# Downstream packages can override these, because NVCC
|
||||
# uses the last --compiler-bindir it gets on the command line.
|
||||
# FIXME: this results in "incompatible redefinition" warnings.
|
||||
# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
|
||||
if [ -z "${CUDAHOSTCXX-}" ]; then
|
||||
export CUDAHOSTCXX="@ccFullPath@";
|
||||
fi
|
||||
|
||||
export NVCC_PREPEND_FLAGS+=" --compiler-bindir=@ccRoot@/bin"
|
||||
|
||||
# NOTE: We set -Xfatbin=-compress-all, which reduces the size of the compiled
|
||||
# binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
|
||||
# the default set of CUDA capabilities we build can regularly cause this to occur (for
|
||||
# example, with Magma).
|
||||
#
|
||||
# @SomeoneSerge: original comment was made by @ConnorBaker in .../cudatoolkit/common.nix
|
||||
if [[ -z "${dontCompressFatbin-}" ]]; then
|
||||
export NVCC_PREPEND_FLAGS+=" -Xfatbin=-compress-all"
|
||||
fi
|
||||
|
||||
# CMake's enable_language(CUDA) runs a compiler test and it doesn't account for
|
||||
# CUDAToolkit_ROOT. We have to help it locate libcudart
|
||||
if [[ -z "${nvccDontPrependCudartFlags-}" ]] ; then
|
||||
export NVCC_APPEND_FLAGS+=" -L@cudartRoot@/lib -I@cudartRoot@/include"
|
||||
fi
|
||||
}
|
||||
|
||||
setupCMakeCUDAToolkit_ROOT() {
|
||||
export cmakeFlags+=" -DCUDAToolkit_INCLUDE_DIR=$CUDAToolkit_INCLUDE_DIR -DCUDAToolkit_ROOT=$CUDAToolkit_ROOT"
|
||||
}
|
||||
|
||||
postHooks+=(setupCUDAToolkitCompilers)
|
||||
preConfigureHooks+=(setupCMakeCUDAToolkit_ROOT)
|
|
@ -4,6 +4,7 @@
|
|||
, fetchurl
|
||||
, autoPatchelfHook
|
||||
, autoAddOpenGLRunpathHook
|
||||
, markForCudatoolkitRootHook
|
||||
}:
|
||||
|
||||
pname:
|
||||
|
@ -28,6 +29,7 @@ backendStdenv.mkDerivation {
|
|||
# directory to the rpath of all ELF binaries.
|
||||
# Check e.g. with `patchelf --print-rpath path/to/my/binary
|
||||
autoAddOpenGLRunpathHook
|
||||
markForCudatoolkitRootHook
|
||||
];
|
||||
|
||||
buildInputs = [
|
||||
|
|
|
@ -27,35 +27,27 @@ in
|
|||
inherit (prev.backendStdenv) cc;
|
||||
in
|
||||
{
|
||||
# Required by cmake's enable_language(CUDA) to build a test program
|
||||
# When implementing cross-compilation support: this is
|
||||
# final.pkgs.targetPackages.cudaPackages.cuda_cudart
|
||||
env.cudartRoot = "${prev.lib.getDev final.cuda_cudart}";
|
||||
|
||||
# Point NVCC at a compatible compiler
|
||||
# FIXME: non-redist cudatoolkit copy-pastes this code
|
||||
|
||||
# For CMake-based projects:
|
||||
# https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
|
||||
# https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
|
||||
# https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html
|
||||
|
||||
# For non-CMake projects:
|
||||
# We prepend --compiler-bindir to nvcc flags.
|
||||
# Downstream packages can override these, because NVCC
|
||||
# uses the last --compiler-bindir it gets on the command line.
|
||||
# FIXME: this results in "incompatible redefinition" warnings.
|
||||
# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
|
||||
# NOTE: We unconditionally set -Xfatbin=-compress-all, which reduces the size of the
|
||||
# compiled binaries. If binaries grow over 2GB, they will fail to link. This is a problem
|
||||
# for us, as the default set of CUDA capabilities we build can regularly cause this to
|
||||
# occur (for example, with Magma).
|
||||
postInstall = (oldAttrs.postInstall or "") + ''
|
||||
mkdir -p $out/nix-support
|
||||
cat <<EOF >> $out/nix-support/setup-hook
|
||||
cmakeFlags+=' -DCUDA_HOST_COMPILER=${cc}/bin'
|
||||
cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${cc}/bin'
|
||||
if [ -z "\''${CUDAHOSTCXX-}" ]; then
|
||||
export CUDAHOSTCXX=${cc}/bin;
|
||||
fi
|
||||
export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${cc}/bin -Xfatbin=-compress-all'
|
||||
EOF
|
||||
'';
|
||||
# Desiredata: whenever a package (e.g. magma) adds cuda_nvcc to
|
||||
# nativeBuildInputs (offsets `(-1, 0)`), magma should also source the
|
||||
# setupCudaHook, i.e. we want it the hook to be propagated into the
|
||||
# same nativeBuildInputs.
|
||||
#
|
||||
# Logically, cuda_nvcc should include the hook in depsHostHostPropagated,
|
||||
# so that the final offsets for the propagated hook would be `(-1, 0) +
|
||||
# (0, 0) = (-1, 0)`.
|
||||
#
|
||||
# In practice, TargetTarget appears to work:
|
||||
# https://gist.github.com/fd80ff142cd25e64603618a3700e7f82
|
||||
depsTargetTargetPropagated = [
|
||||
final.setupCudaHook
|
||||
];
|
||||
});
|
||||
|
||||
cuda_nvprof = prev.cuda_nvprof.overrideAttrs (oldAttrs: {
|
||||
|
|
12
pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt
Normal file
12
pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt
Normal file
|
@ -0,0 +1,12 @@
|
|||
cmake_minimum_required(VERSION 3.25)
|
||||
project(saxpy LANGUAGES CXX CUDA)
|
||||
|
||||
find_package(CUDAToolkit REQUIRED COMPONENTS cudart cublas)
|
||||
|
||||
add_executable(saxpy saxpy.cu)
|
||||
target_link_libraries(saxpy PUBLIC CUDA::cublas CUDA::cudart m)
|
||||
target_compile_features(saxpy PRIVATE cxx_std_14)
|
||||
target_compile_options(saxpy PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
|
||||
--expt-relaxed-constexpr>)
|
||||
|
||||
install(TARGETS saxpy)
|
50
pkgs/development/compilers/cudatoolkit/saxpy/default.nix
Normal file
50
pkgs/development/compilers/cudatoolkit/saxpy/default.nix
Normal file
|
@ -0,0 +1,50 @@
|
|||
{ autoAddOpenGLRunpathHook
|
||||
, backendStdenv
|
||||
, cmake
|
||||
, cuda_cccl
|
||||
, cuda_cudart
|
||||
, cudaFlags
|
||||
, cuda_nvcc
|
||||
, lib
|
||||
, libcublas
|
||||
, setupCudaHook
|
||||
, stdenv
|
||||
}:
|
||||
|
||||
backendStdenv.mkDerivation {
|
||||
pname = "saxpy";
|
||||
version = "unstable-2023-07-11";
|
||||
|
||||
src = ./.;
|
||||
|
||||
buildInputs = [
|
||||
libcublas
|
||||
cuda_cudart
|
||||
cuda_cccl
|
||||
];
|
||||
nativeBuildInputs = [
|
||||
cmake
|
||||
|
||||
# NOTE: this needs to be pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc for
|
||||
# cross-compilation to work. This should work automatically once we move to
|
||||
# spliced scopes. Delete this comment once that happens
|
||||
cuda_nvcc
|
||||
|
||||
# Alternatively, we could remove the propagated hook from cuda_nvcc and add
|
||||
# directly:
|
||||
# setupCudaHook
|
||||
autoAddOpenGLRunpathHook
|
||||
];
|
||||
|
||||
cmakeFlags = [
|
||||
"-DCMAKE_VERBOSE_MAKEFILE=ON"
|
||||
"-DCMAKE_CUDA_ARCHITECTURES=${with cudaFlags; builtins.concatStringsSep ";" (map dropDot cudaCapabilities)}"
|
||||
];
|
||||
|
||||
meta = {
|
||||
description = "A simple (Single-precision AX Plus Y) FindCUDAToolkit.cmake example for testing cross-compilation";
|
||||
license = lib.licenses.mit;
|
||||
maintainers = lib.teams.cuda.members;
|
||||
platforms = lib.platforms.unix;
|
||||
};
|
||||
}
|
68
pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu
Normal file
68
pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu
Normal file
|
@ -0,0 +1,68 @@
|
|||
#include <cublas_v2.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <vector>
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
static inline void check(cudaError_t err, const char *context) {
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err));
|
||||
std::exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
#define CHECK(x) check(x, #x)
|
||||
|
||||
__global__ void saxpy(int n, float a, float *x, float *y) {
|
||||
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (i < n)
|
||||
y[i] = a * x[i] + y[i];
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
setbuf(stderr, NULL);
|
||||
fprintf(stderr, "Start\n");
|
||||
|
||||
int rtVersion, driverVersion;
|
||||
CHECK(cudaRuntimeGetVersion(&rtVersion));
|
||||
CHECK(cudaDriverGetVersion(&driverVersion));
|
||||
|
||||
fprintf(stderr, "Runtime version: %d\n", rtVersion);
|
||||
fprintf(stderr, "Driver version: %d\n", driverVersion);
|
||||
|
||||
constexpr int N = 1 << 10;
|
||||
|
||||
std::vector<float> xHost(N), yHost(N);
|
||||
for (int i = 0; i < N; i++) {
|
||||
xHost[i] = 1.0f;
|
||||
yHost[i] = 2.0f;
|
||||
}
|
||||
|
||||
fprintf(stderr, "Host memory initialized, copying to the device\n");
|
||||
fflush(stderr);
|
||||
|
||||
float *xDevice, *yDevice;
|
||||
CHECK(cudaMalloc(&xDevice, N * sizeof(float)));
|
||||
CHECK(cudaMalloc(&yDevice, N * sizeof(float)));
|
||||
|
||||
CHECK(cudaMemcpy(xDevice, xHost.data(), N * sizeof(float),
|
||||
cudaMemcpyHostToDevice));
|
||||
CHECK(cudaMemcpy(yDevice, yHost.data(), N * sizeof(float),
|
||||
cudaMemcpyHostToDevice));
|
||||
fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");
|
||||
|
||||
saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, xDevice, yDevice);
|
||||
fprintf(stderr, "Scheduled a kernel call\n");
|
||||
CHECK(cudaGetLastError());
|
||||
|
||||
CHECK(cudaMemcpy(yHost.data(), yDevice, N * sizeof(float),
|
||||
cudaMemcpyDeviceToHost));
|
||||
|
||||
float maxError = 0.0f;
|
||||
for (int i = 0; i < N; i++)
|
||||
maxError = max(maxError, abs(yHost[i] - 4.0f));
|
||||
fprintf(stderr, "Max error: %f\n", maxError);
|
||||
|
||||
CHECK(cudaFree(xDevice));
|
||||
CHECK(cudaFree(yDevice));
|
||||
}
|
|
@ -86,29 +86,6 @@ let
|
|||
# "75" -> "750" Cf. https://bitbucket.org/icl/magma/src/f4ec79e2c13a2347eff8a77a3be6f83bc2daec20/CMakeLists.txt#lines-273
|
||||
"${minArch'}0";
|
||||
|
||||
cuda-common-redist = with cudaPackages; [
|
||||
libcublas # cublas_v2.h
|
||||
libcusparse # cusparse.h
|
||||
];
|
||||
|
||||
# Build-time dependencies
|
||||
cuda-native-redist = symlinkJoin {
|
||||
name = "cuda-native-redist-${cudaVersion}";
|
||||
paths = with cudaPackages; [
|
||||
cuda_cudart # cuda_runtime.h
|
||||
cuda_nvcc
|
||||
] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
|
||||
cuda_nvprof # <cuda_profiler_api.h>
|
||||
] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
|
||||
cuda_profiler_api # <cuda_profiler_api.h>
|
||||
] ++ cuda-common-redist;
|
||||
};
|
||||
|
||||
# Run-time dependencies
|
||||
cuda-redist = symlinkJoin {
|
||||
name = "cuda-redist-${cudaVersion}";
|
||||
paths = cuda-common-redist;
|
||||
};
|
||||
in
|
||||
|
||||
assert (builtins.match "[^[:space:]]*" gpuTargetString) != null;
|
||||
|
@ -128,16 +105,22 @@ stdenv.mkDerivation {
|
|||
ninja
|
||||
gfortran
|
||||
] ++ lists.optionals cudaSupport [
|
||||
cuda-native-redist
|
||||
cudaPackages.cuda_nvcc
|
||||
];
|
||||
|
||||
buildInputs = [
|
||||
libpthreadstubs
|
||||
lapack
|
||||
blas
|
||||
] ++ lists.optionals cudaSupport [
|
||||
cuda-redist
|
||||
] ++ lists.optionals rocmSupport [
|
||||
] ++ lists.optionals cudaSupport (with cudaPackages; [
|
||||
cuda_cudart
|
||||
libcublas # cublas_v2.h
|
||||
libcusparse # cusparse.h
|
||||
] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
|
||||
cuda_nvprof # <cuda_profiler_api.h>
|
||||
] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
|
||||
cuda_profiler_api # <cuda_profiler_api.h>
|
||||
]) ++ lists.optionals rocmSupport [
|
||||
hip
|
||||
hipblas
|
||||
hipsparse
|
||||
|
|
Loading…
Reference in a new issue