In the previous post we implemented the CPU-side operations on our data structures; today we implement their GPU counterparts.

CUDA is NVIDIA's parallel computing platform. It combines the strengths of the CPU and the GPU and is aimed at compute-intensive parallel workloads: the GPU is used to speed up the data-parallel sections of compute-heavy applications. It is one of the most important compute-acceleration platforms around.

Be aware that there are few really good GPU learning resources online; for the most part you have to work through the official documentation.

Create a new file, cuda_device_api.h, declaring the GPU-side device operations.

#ifndef DLSYS_RUNTIME_CUDA_DEVICE_API_H_
#define DLSYS_RUNTIME_CUDA_DEVICE_API_H_

#include "c_runtime_api.h"
#include "device_api.h"
#include <cuda_runtime.h>

#include <assert.h>
#include <string>

class CUDADeviceAPI : public DeviceAPI {
public:
  void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final;

  void FreeDataSpace(DLContext ctx, void *ptr) final;

  void CopyDataFromTo(const void *from, void *to, size_t size,
                      DLContext ctx_from, DLContext ctx_to,
                      DLStreamHandle stream) final;

  void StreamSync(DLContext ctx, DLStreamHandle stream) final;
};

This interface is identical to the CPU version.

Implementing the GPU device operations

Create cuda_device_api.cc. (Note the .cc extension: the Makefile at the end of this post compiles src/*.cc with g++ and src/*.cu with nvcc, so a .cpp file would silently be skipped.)

#include "./cuda_device_api.h"
#include <cassert>
#include <cuda_runtime.h>
#include <iostream>

// Error-checking macro for CUDA calls. cudaErrorCudartUnloading is
// tolerated so that frees during process shutdown do not abort.
#define CUDA_CALL(func)                                                        \
  {                                                                            \
    cudaError_t e = (func);                                                    \
    assert((e == cudaSuccess) || (e == cudaErrorCudartUnloading));             \
  }

static void GPUCopy(const void *from, void *to, size_t size,
                    cudaMemcpyKind kind, cudaStream_t stream) {
  if (stream != 0) {
    CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
  } else {
    CUDA_CALL(cudaMemcpy(to, from, size, kind));
  }
}

void *CUDADeviceAPI::AllocDataSpace(DLContext ctx, size_t size,
                                    size_t alignment) {
  // std::cout << "allocating cuda data" << std::endl;
  CUDA_CALL(cudaSetDevice(ctx.device_id));
  assert((256 % alignment) == 0U); // CUDA allocations are aligned to 256 bytes
  void *ret;
  CUDA_CALL(cudaMalloc(&ret, size));
  return ret;
}

void CUDADeviceAPI::FreeDataSpace(DLContext ctx, void *ptr) {
  CUDA_CALL(cudaSetDevice(ctx.device_id));
  CUDA_CALL(cudaFree(ptr));
}

void CUDADeviceAPI::CopyDataFromTo(const void *from, void *to, size_t size,
                                   DLContext ctx_from, DLContext ctx_to,
                                   DLStreamHandle stream) {
  // std::cout << "copying cuda data" << std::endl;
  cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
  if (ctx_from.device_type == kGPU && ctx_to.device_type == kGPU) {
    CUDA_CALL(cudaSetDevice(ctx_from.device_id));
    if (ctx_from.device_id == ctx_to.device_id) {
      GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
    } else {
      CUDA_CALL(cudaMemcpyPeerAsync(to, ctx_to.device_id, from,
                                    ctx_from.device_id, size, cu_stream));
    }
  } else if (ctx_from.device_type == kGPU && ctx_to.device_type == kCPU) {
    CUDA_CALL(cudaSetDevice(ctx_from.device_id));
    GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
  } else if (ctx_from.device_type == kCPU && ctx_to.device_type == kGPU) {
    CUDA_CALL(cudaSetDevice(ctx_to.device_id));
    GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
  } else {
    std::cerr << "expected copy from GPU, to GPU, or between GPUs" << std::endl;
  }
}

void CUDADeviceAPI::StreamSync(DLContext ctx, DLStreamHandle stream) {
  CUDA_CALL(cudaSetDevice(ctx.device_id));
  CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
}

Together these implement the four operations: AllocDataSpace, FreeDataSpace, CopyDataFromTo, and StreamSync.
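
To make the contract concrete, here is a minimal, hypothetical round trip through the API. It assumes only the declarations above (DLContext exposes device_id and device_type, as used in the code); treat it as a sketch rather than part of the runtime:

#include "cuda_device_api.h"

int main() {
  CUDADeviceAPI gpu;
  DLContext cpu_ctx, gpu_ctx;
  cpu_ctx.device_id = 0;
  cpu_ctx.device_type = kCPU;
  gpu_ctx.device_id = 0;
  gpu_ctx.device_type = kGPU;

  float host_src[4] = {1.f, 2.f, 3.f, 4.f};
  float host_dst[4] = {0.f};

  // Allocate on the GPU, then copy host -> device -> host on the default stream.
  void *dev = gpu.AllocDataSpace(gpu_ctx, sizeof(host_src), 64);
  gpu.CopyDataFromTo(host_src, dev, sizeof(host_src), cpu_ctx, gpu_ctx, nullptr);
  gpu.CopyDataFromTo(dev, host_dst, sizeof(host_src), gpu_ctx, cpu_ctx, nullptr);
  gpu.StreamSync(gpu_ctx, nullptr); // synchronize the default stream
  gpu.FreeDataSpace(gpu_ctx, dev);  // host_dst now holds {1, 2, 3, 4}
  return 0;
}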

We also need to register the GPU backend in the DeviceAPIManager constructor in c_runtime_api.cc; api_ is indexed by DLDeviceType, so after this change any kGPU context resolves to the CUDA implementation:

  DeviceAPIManager() {
    std::fill(api_.begin(), api_.end(), nullptr);
    static CPUDeviceAPI cpu_device_api_inst;
    static CUDADeviceAPI gpu_device_api_inst;
    api_[kCPU] = static_cast<DeviceAPI *>(&cpu_device_api_inst);
    api_[kGPU] = static_cast<DeviceAPI *>(&gpu_device_api_inst);
  }

Declaring the GPU kernels

With the device API in place, the next step is the set of GPU compute methods the framework needs. We declare the following functions:

  int DLGpuArraySet(DLArrayHandle arr, float value);
  int DLGpuBroadcastTo(const DLArrayHandle input, DLArrayHandle output);
  int DLGpuReduceSumAxisZero(const DLArrayHandle input, DLArrayHandle output);
  int DLGpuMatrixElementwiseAdd(const DLArrayHandle matA,
                                const DLArrayHandle matB, DLArrayHandle output);
  int DLGpuMatrixElementwiseAddByConst(const DLArrayHandle input, float val,
                                       DLArrayHandle output);
  int DLGpuMatrixElementwiseMultiply(
      const DLArrayHandle matA, const DLArrayHandle matB, DLArrayHandle output);
  int DLGpuMatrixMultiplyByConst(const DLArrayHandle input, float val,
                                 DLArrayHandle output);
  int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA,
                          const DLArrayHandle matB, bool transposeB,
                          DLArrayHandle matC);
  int DLGpuRelu(const DLArrayHandle input, DLArrayHandle output);

  int DLGpuReluGradient(const DLArrayHandle input, const DLArrayHandle in_grad,
                        DLArrayHandle output);
  int DLGpuSoftmax(const DLArrayHandle input, DLArrayHandle output);
  int DLGpuSoftmaxCrossEntropy(const DLArrayHandle input_a,
                               const DLArrayHandle input_b,
                               DLArrayHandle output);

These correspond to operations commonly used in deep learning:

  • Filling an array with a value on the GPU
  • Broadcasting
  • Reduce-sum over the first axis
  • Elementwise matrix multiplication
  • Multiplying a matrix by a constant
  • Matrix multiplication
  • Elementwise matrix addition
  • Adding a constant to a matrix
  • ReLU activation
  • ReLU gradient
  • Softmax
  • Softmax cross-entropy

Implementing the GPU kernels

The kernel code below has to live in a .cu file under src (for example src/gpu_op.cu; the name is up to you) so that nvcc rather than g++ compiles it; the Makefile at the end makes exactly this split.

  1. Filling an array with a value
__global__ void array_set_kernel(float *array, float value, int n) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        array[index] = value;
    }
}


int DLGpuArraySet(DLArrayHandle arr, float value) {
    int n = 1;
    for (int i = 0; i < arr->ndim; i++) {
        n = n * arr->shape[i];
    }

    float *array_data = (float *) arr->data;

    int threads_per_block = 1024;
    int num_blocks = (n + threads_per_block - 1) / threads_per_block;

    array_set_kernel<<<num_blocks, threads_per_block>>>(array_data, value, n);
    return 0;
}
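With n = 2500 and 1024 threads per block, for example, this launches ceil(2500 / 1024) = 3 blocks; the index guard in the kernel simply masks off the surplus threads of the last block. The same launch pattern recurs in every elementwise kernel below.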
  2. Broadcast
__global__ void broadcast_to_kernel(const float *input_data,
                                    float *output_data,
                                    index_t input_n,
                                    index_t output_n) {
    index_t idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < output_n) {
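        // idx % input_n tiles the input across the output, which implements
        // broadcasting over prepended (leading) axes.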
        output_data[idx] = input_data[idx % input_n];
    }
}


int DLGpuBroadcastTo(const DLArrayHandle input, DLArrayHandle output) {
    index_t input_n = 1;
    for (int i = 0; i < input->ndim; i++)
        input_n *= input->shape[i];

    index_t output_n = 1;
    for (int i = 0; i < output->ndim; i++)
        output_n *= output->shape[i];

    const float *input_data = (const float *) input->data;
    float *output_data = (float *) output->data;

    int thread_per_block = 512;
    int n_blocks = (output_n + thread_per_block - 1) / thread_per_block;
    broadcast_to_kernel<<<n_blocks, thread_per_block>>>(input_data, output_data,
                                                        input_n, output_n);
    return 0;
}
  3. Reduce-sum over axis 0
__global__ void reduced_sum_axis_zero(const float *input_data, float *output_data, int input_n, int output_n) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < output_n) {
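        // Treat the input as shape (input_n / output_n, output_n) and sum
        // over axis 0: each thread accumulates one output element.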
        output_data[idx] = 0.0;
        for (int i = 0; i < input_n / output_n; i++) {
            output_data[idx] += input_data[i * output_n + idx];
        }
    }
}

int DLGpuReduceSumAxisZero(const DLArrayHandle input, DLArrayHandle output) {
    int input_n = 1;
    for (int i = 0; i < input->ndim; i++) {
        input_n *= input->shape[i];
    }

    int output_n = 1;
    for (int i = 0; i < output->ndim; i++) {
        output_n *= output->shape[i];
    }

    const float *input_data = (const float *) input->data;
    float *output_data = (float *) output->data;

    int thread_per_block = 1024;
    int n_blocks = (output_n + thread_per_block - 1) / thread_per_block;

    reduced_sum_axis_zero<<<n_blocks, thread_per_block>>>(input_data, output_data, input_n, output_n);
    return 0;
}
  4. Elementwise matrix addition
__global__ void matrix_elementwise_add(const float *a, const float *b, float *c,
                                       int n) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        c[index] = a[index] + b[index];
    }
}

int DLGpuMatrixElementwiseAdd(const DLArrayHandle matA,
                              const DLArrayHandle matB, DLArrayHandle output) {
    int n = 1;
    for (int i = 0; i < output->ndim; i++) {
        n = n * output->shape[i];
    }
    const float *data_A = (const float *) matA->data;
    const float *data_B = (const float *) matB->data;
    float *data_output = (float *) output->data;

    int threads_per_block = 1024;
    int num_blocks = (n + threads_per_block - 1) / threads_per_block;

    matrix_elementwise_add<<<num_blocks, threads_per_block>>>(data_A, data_B,
                                                              data_output, n);
    return 0;
}
  5. Multiplying a matrix by a constant
__global__ void matrix_multiply_by_const(const float *d_input, float *d_output,
                                         float val, int n) {
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < n) {
        d_output[index] = d_input[index] * val;
    }
}

int DLGpuMatrixMultiplyByConst(const DLArrayHandle input, float val,
                               DLArrayHandle output) {
    int n = 1;
    for (int i = 0; i < input->ndim; i++) {
        n *= input->shape[i];
    }

    const float *input_data = (const float *) input->data;
    float *output_data = (float *) output->data;
    int threads_per_block = 1024;
    int num_blocks = (n + threads_per_block - 1) / threads_per_block;
    matrix_multiply_by_const<<<num_blocks, threads_per_block>>>(input_data,
                                                                output_data, val, n);
    return 0;
}
  6. Adding a constant to a matrix (DLGpuMatrixElementwiseAddByConst) follows the same pattern; a sketch is given right below.
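
A minimal sketch of how DLGpuMatrixElementwiseAddByConst can look (it is not shown in the original post; it is simply the multiply-by-constant kernel with * replaced by +):

__global__ void matrix_elementwise_add_by_const(const float *d_input,
                                                float *d_output, float val,
                                                int n) {
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < n) {
        d_output[index] = d_input[index] + val;
    }
}

int DLGpuMatrixElementwiseAddByConst(const DLArrayHandle input, float val,
                                     DLArrayHandle output) {
    int n = 1;
    for (int i = 0; i < input->ndim; i++) {
        n *= input->shape[i];
    }
    const float *input_data = (const float *) input->data;
    float *output_data = (float *) output->data;
    int threads_per_block = 1024;
    int num_blocks = (n + threads_per_block - 1) / threads_per_block;
    matrix_elementwise_add_by_const<<<num_blocks, threads_per_block>>>(
        input_data, output_data, val, n);
    return 0;
}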

  7. Elementwise matrix multiplication (DLGpuMatrixElementwiseMultiply) likewise mirrors the elementwise-add kernel; a sketch follows, together with one for matrix multiplication.
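
Again a sketch, not code from the original post:

__global__ void matrix_elementwise_multiply(const float *a, const float *b,
                                            float *c, int n) {
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < n) {
        c[index] = a[index] * b[index];
    }
}

int DLGpuMatrixElementwiseMultiply(const DLArrayHandle matA,
                                   const DLArrayHandle matB,
                                   DLArrayHandle output) {
    int n = 1;
    for (int i = 0; i < output->ndim; i++) {
        n *= output->shape[i];
    }
    const float *data_A = (const float *) matA->data;
    const float *data_B = (const float *) matB->data;
    float *data_output = (float *) output->data;
    int threads_per_block = 1024;
    int num_blocks = (n + threads_per_block - 1) / threads_per_block;
    matrix_elementwise_multiply<<<num_blocks, threads_per_block>>>(
        data_A, data_B, data_output, n);
    return 0;
}

The remaining declared operation, DLGpuMatrixMultiply, does not fit the elementwise pattern. A common approach (and presumably why the Makefile below links -lcublas) is to delegate to cuBLAS. cuBLAS assumes column-major storage while DLArray data here is row-major, so the sketch below computes the column-major product C^T = op(B)^T * op(A)^T, which is why the operands are swapped. Treat this as an illustrative sketch under those assumptions; the #include belongs at the top of the file:

#include <cublas_v2.h>

int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA,
                        const DLArrayHandle matB, bool transposeB,
                        DLArrayHandle matC) {
    // Reuse one handle across calls; cublasCreate is comparatively expensive.
    static cublasHandle_t handle = nullptr;
    if (handle == nullptr) {
        cublasCreate(&handle);
    }
    const float *a = (const float *) matA->data;
    const float *b = (const float *) matB->data;
    float *c = (float *) matC->data;
    cublasOperation_t transB = transposeB ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasOperation_t transA = transposeA ? CUBLAS_OP_T : CUBLAS_OP_N;
    int m = (int) matC->shape[1];                                  // columns of C
    int n = (int) matC->shape[0];                                  // rows of C
    int k = (int) (transposeA ? matA->shape[0] : matA->shape[1]);  // inner dim
    const float alpha = 1.0f, beta = 0.0f;
    // Leading dimensions are those of the row-major buffers viewed
    // column-major, i.e. the row width of each stored matrix.
    cublasSgemm(handle, transB, transA, m, n, k, &alpha,
                b, transposeB ? k : m,
                a, transposeA ? n : k,
                &beta, c, m);
    return 0;
}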

  8. ReLU activation

__global__ void relu_kernel(const float *input, float *output, int n) {
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < n) {
        float element = input[index];
        if (element <= 0) {
            output[index] = 0;
        } else {
            output[index] = element;
        }
    }
}

int DLGpuRelu(const DLArrayHandle input, DLArrayHandle output) {
    int n = 1;
    for (int i = 0; i < input->ndim; i++) {
        n *= input->shape[i];
    }

    const float *input_data = (const float *) input->data;
    float *output_data = (float *) output->data;
    int threads_per_block = 1024;
    int num_blocks = (n + threads_per_block - 1) / threads_per_block;
    relu_kernel<<<num_blocks, threads_per_block>>>(input_data, output_data, n);
    return 0;
}
  9. ReLU gradient
__global__ void relu_gradient_kernel(const float *input, float *output,
                                     const float *in_grad, int n) {
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < n) {
        float element = input[index];
        if (element <= 0) {
            output[index] = 0;
        } else {
            output[index] = in_grad[index];
        }
    }
}

int DLGpuReluGradient(const DLArrayHandle input, const DLArrayHandle in_grad,
                      DLArrayHandle output) {
    int n = 1;
    for (int i = 0; i < input->ndim; i++) {
        n *= input->shape[i];
    }

    const float *input_data = (const float *) input->data;
    float *output_data = (float *) output->data;
    const float *in_grad_data = (const float *) in_grad->data;
    int threads_per_block = 1024;
    int num_blocks = (n + threads_per_block - 1) / threads_per_block;

    relu_gradient_kernel<<<num_blocks, threads_per_block>>>(input_data,
                                                            output_data, in_grad_data, n);
    return 0;
}
  10. Softmax
__global__ void softmax_kernel(int64_t nrow, int64_t ncol,
                               const float *input_data,
                               float *output_data) {

    int y = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    if (y >= nrow) {
        return;
    }
    input_data += y * ncol;
    output_data += y * ncol;
    // Find the maximum of the row.
    float maxval = *input_data;
    for (int x = 1; x < ncol; ++x) {
        maxval = max(maxval, input_data[x]);
    }
    float sum = 0;
    for (int x = 0; x < ncol; ++x) {
        sum += exp(input_data[x] - maxval);
    }
    for (int x = 0; x < ncol; ++x) {
        output_data[x] = exp(input_data[x] - maxval) / sum;
    }
}


int DLGpuSoftmax(const DLArrayHandle input, DLArrayHandle output) {
    assert(input->ndim == 2);
    assert(output->ndim == 2);
    int64_t nrow = input->shape[0];
    int64_t ncol = input->shape[1];
    const float *input_data = (const float *) input->data;
    float *output_data = (float *) output->data;
    // One thread handles one row. A thread block is capped at 1024 threads on
    // current GPUs, so launch as many 1-D blocks as needed instead of growing
    // threads.y past that limit within a single block.
    int threads_per_block = 1024;
    int num_blocks = (int) ((nrow + threads_per_block - 1) / threads_per_block);
    softmax_kernel<<<num_blocks, threads_per_block>>>(nrow, ncol, input_data,
                                                      output_data);
    return 0;
}
  11. Softmax cross-entropy

// Cross-entropy loss:
// np.mean(-np.sum(y_ * np.log(softmax(y)), axis=1), keepdims=True)
__global__ void matrix_softmax_cross_entropy_kernel(int nrow, int ncol,
                                                    const float *input_a,
                                                    const float *input_b,
                                                    float *output) {
  // Dynamic shared memory, size provided at kernel launch.
  extern __shared__ float loss_per_row[];
  // Two dimensional thread blocks.
  int y = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x +
          threadIdx.x;
  if (y >= nrow) {
    return;
  }
  input_a += y * ncol;
  input_b += y * ncol;
  float maxval = *input_a;
  // Find max for a row.
  for (int x = 1; x < ncol; ++x) {
    maxval = max(maxval, input_a[x]);
  }
  // Deduct by max for a row, and raise to exp.
  float sum = 0;
  for (int x = 0; x < ncol; ++x) {
    sum += exp(input_a[x] - maxval);
  }
  // Compute per-row loss.
  float loss = 0;
  for (int x = 0; x < ncol; ++x) {
    loss -= input_b[x] * log(exp(input_a[x] - maxval) / sum);
  }
  loss_per_row[y] = loss;
  __syncthreads();
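  // NOTE: this reduction is only valid because the whole batch lives in a
  // single thread block (see the <<<1, threads>>> launch below);
  // __syncthreads() does not synchronize across blocks.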
  // Compute reduce_mean across rows.
  float mean_loss = 0;
  // Use a single thread to reduce mean across rows.
  if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
    for (int i = 0; i < nrow; ++i) {
      mean_loss += loss_per_row[i];
    }
    mean_loss /= nrow;
    output[0] = mean_loss;
  }
}
int DLGpuSoftmaxCrossEntropy(const DLArrayHandle input_a,
                             const DLArrayHandle input_b,
                             DLArrayHandle output) {
    assert(input_a->ndim == 2);
    assert(input_b->ndim == 2);
    assert(output->ndim == 1);
    assert(input_a->shape[0] == input_b->shape[0] &&
           input_a->shape[1] == input_b->shape[1]);
    int nrow = input_a->shape[0];
    // The kernel reduces loss_per_row across the whole batch inside one
    // thread block, so nrow must fit in a single block (at most 1024 threads).
    assert(nrow <= 1024);
    int ncol = input_a->shape[1];
    const float *input_data_a = (const float *) input_a->data;
    const float *input_data_b = (const float *) input_b->data;
    float *output_data = (float *) output->data;
    dim3 threads;
    threads.x = nrow;
    // The third launch parameter sizes the dynamic shared memory backing
    // loss_per_row.
    matrix_softmax_cross_entropy_kernel<<<1, threads, nrow * sizeof(float)>>>(
        nrow, ncol, input_data_a, input_data_b, output_data);
    return 0;
}

At this point all of the data-structure operations are done; next we compile them into a shared library.

In the directory that sits alongside src (the project root), create a Makefile.

CUDA_DIR = /usr/local/cuda

CC_SRCS := $(wildcard src/*.cc)
CC_OBJS := ${CC_SRCS:src/%.cc=build/obj/%.o}
CUDA_SRCS := $(wildcard src/*.cu)
CUDA_OBJS := ${CUDA_SRCS:src/%.cu=build/obj/%.o}
OBJS := $(CC_OBJS) $(CUDA_OBJS)

CC = g++
WARNINGS = -Wall -Wfatal-errors -Wno-unused -Wno-unused-result
CC_FLAGS = -std=c++11 -fPIC $(WARNINGS) -I$(CUDA_DIR)/include
LD_FLAGS = -L$(CUDA_DIR)/lib64 -lcuda -lcudart -lcublas

NVCC = nvcc
NVCC_FLAGS = -std=c++11 --compiler-options '-fPIC'
ARCH = -gencode arch=compute_30,code=sm_30 \
       -gencode arch=compute_35,code=sm_35 \
       -gencode arch=compute_50,code=[sm_50,compute_50] \
       -gencode arch=compute_52,code=[sm_52,compute_52]

all: build/lib/libc_runtime_api.so

build/lib/libc_runtime_api.so: $(OBJS)
	@mkdir -p build/lib
	$(CC) -shared $^ -o $@ $(LD_FLAGS)

build/obj/%.o: src/%.cc
	@mkdir -p build/obj
	$(CC) $(CC_FLAGS) -c $< -o $@

build/obj/%.o: src/%.cu
	@mkdir -p build/obj
	$(NVCC) $(ARCH) $(NVCC_FLAGS) -c $< -o $@

clean:
	rm -rf build

.PHONY: clean

Open a Linux shell and run make; this produces build/lib/libc_runtime_api.so, and everything from here on revolves around that shared library. One caveat: the ARCH list above targets fairly old GPUs, and recent CUDA toolkits have dropped compute_30, so trim or extend the -gencode entries to match your toolkit and hardware.

The GPU implementation draws on a fair amount of math; a good reference for the softmax derivation is How to implement a neural network Intermezzo.

Next, we will have Python call into this .so for a seamless bridge.