CMake

CMake是一个现代化C++、CUDA、C构建系统,对于大型项目来说,一个易用的构建系统是非常重要的,能够减少环境的配置时间,动态库的配置时间。因此熟练掌握cmake的用法是C++程序员一个标准的技能,能够让我们快速构建系统编译以及运行。

关于CUDA的CMake

CUDA是NVIDIA开发的并行计算库,能够让我们快速编写CUDA代码然后通过CUDA RUNTIME进行控制RTX等硬件运行。但是配置一个好的CUDA运行环境是非常不容易的,因此本文记录一下如何搭建一个CUDA的CMake系统。

环境

  • CUDA 11.7
  • RTX 2080ti

CMake语法

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
1. CMAKE_MINIMUM_REQUIRED(VERSION 3.24.1)

2. SET(PROJECT_NAME demo)

3. PROJECT(${PROJECT_NAME} LANGUAGES CXX CUDA)
4. set(CMAKE_CUDA_ARCHITECTURES 75)

5. include(GNUInstallDirs)

6. find_package(CUDAToolkit)

7. add_executable(${PROJECT_NAME} main.cu)
8. target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cublas)
  1. CMake的版本,因为有些CMake关键字在低版本中是不存在的,因此需要指定版本号
  2. 定义项目的名称,并将名称放入PROJECT_NAME变量中
  3. 定义项目使用的语言, 具体参考project Supported languages include C, CXX (i.e. C++), CUDA, OBJC (i.e. Objective-C), OBJCXX, Fortran, HIP, ISPC, and ASM. By default C and CXX are enabled if no language options are given. Specify language NONE, or use the LANGUAGES keyword and list no languages, to skip enabling any languages.
  4. 设置CUDA架构的生成代码的号 如set(CMAKE_CUDA_ARCHITECTURES 75) 对应sm75
  5. 包含标准install GNUInstallDirs¶
  6. 寻找cuda工具库 FindCUDAToolkit 还有各种各样的find_package
  7. 添加可执行文件
  8. 定义link

用法

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cublas_v2.h>
#include <cuda_runtime.h>

#include "cbblas_utils.h"

using data_type = double;

int main(int argc, char *argv[]) {
    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    const int m = 2;
    const int n = 2;
    const int k = 2;
    const int lda = 2;
    const int ldb = 2;
    const int ldc = 2;
    /*
     *   A = | 1.0 | 2.0 |
     *       | 3.0 | 4.0 |
     *
     *   B = | 5.0 | 6.0 |
     *       | 7.0 | 8.0 |
     */

    const std::vector<data_type> A = {1.0, 8.0, 3.0, 4.0};
    const std::vector<data_type> B = {5.0, 6.0, 7.0, 8.0};
    std::vector<data_type> C(m * n);
    const data_type alpha = 1.0;
    const data_type beta = 0.0;

    data_type *d_A = nullptr;
    data_type *d_B = nullptr;
    data_type *d_C = nullptr;

    cublasOperation_t transa = CUBLAS_OP_N;
    cublasOperation_t transb = CUBLAS_OP_N;

    printf("A\n");
    print_matrix(m, k, A.data(), lda);
    printf("=====\n");

    printf("B\n");
    print_matrix(k, n, B.data(), ldb);
    printf("=====\n");

    /* step 1: create cublas handle, bind a stream */
    CUBLAS_CHECK(cublasCreate(&cublasH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    /* step 2: copy data to device */
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), sizeof(data_type) * A.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_B), sizeof(data_type) * B.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_C), sizeof(data_type) * C.size()));

    CUDA_CHECK(cudaMemcpyAsync(d_A, A.data(), sizeof(data_type) * A.size(), cudaMemcpyHostToDevice,
                               stream));
    CUDA_CHECK(cudaMemcpyAsync(d_B, B.data(), sizeof(data_type) * B.size(), cudaMemcpyHostToDevice,
                               stream));

    while(1) {
        /* step 3: compute */
    CUBLAS_CHECK(
        cublasDgemm(cublasH, transa, transb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc));
    }
    

    /* step 4: copy data to host */
    CUDA_CHECK(cudaMemcpyAsync(C.data(), d_C, sizeof(data_type) * C.size(), cudaMemcpyDeviceToHost,
                               stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    /*
     *   C = | 23.0 | 31.0 |
     *       | 34.0 | 46.0 |
     */

    printf("C\n");
    print_matrix(m, n, C.data(), ldc);
    printf("=====\n");

    /* free resources */
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    CUBLAS_CHECK(cublasDestroy(cublasH));

    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}

构建

mkdir build && cd build
cmake ..
-- The CXX compiler identification is GNU 7.5.0
-- The CUDA compiler identification is NVIDIA 11.7.64
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /usr/local/cuda-11.7/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Found CUDAToolkit: /usr/local/cuda-11.7/include (found version "11.7.64") 
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
-- Looking for pthread_create in pthreads
-- Looking for pthread_create in pthreads - not found
-- Looking for pthread_create in pthread
-- Looking for pthread_create in pthread - found
-- Found Threads: TRUE  
11.7.64
-- Configuring done
-- Generating done
-- Build files have been written to: /home/lay/work/study/soft/cc/demo/build
make

具体的代码在CUDALibrarySamples

这样就可以运行了。

reference

cmake-cuda