CUDA Dynamic Parallelism, bad performance
We are having performance issues when using CUDA Dynamic Parallelism. At the moment, CDP is performing at least 3X slower than a traditional approach. We made the simplest reproducible code to show the issue: increment the value of every element of an array by +1, i.e.
a[0,0,0,0,0,0,0,.....,0] --> kernel +1 --> a[1,1,1,1,1,1,1,1,1]
The point of this simple example is just to see whether CDP can perform on par with the other approaches, or whether there are serious overheads.
The code is here:
#include <stdio.h>
#include <cuda.h>
#define BLOCKSIZE 512
__global__ void kernel_parent(int *a, int n, int N);
__global__ void kernel_simple(int *a, int n, int N, int offset);
// N is the total array size
// n is the worksize for a kernel (one third of N)
__global__ void kernel_parent(int *a, int n, int N){
cudaStream_t s1, s2;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid == 0){
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
kernel_simple<<< grid, block, 0, s1 >>> (a, n, N, n);
kernel_simple<<< grid, block, 0, s2 >>> (a, n, N, 2*n);
}
a[tid] += 1;
}
__global__ void kernel_simple(int *a, int n, int N, int offset){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int pos = tid + offset;
if(pos < N){
a[pos] += 1;
}
}
int main(int argc, char **argv){
if(argc != 3){
fprintf(stderr, "run as ./prog n method\nn multiple of 32 eg: 1024, 1048576 (1024^2), 4194304 (2048^2), 16777216 (4096^2)\nmethod:\n0 (traditional) \n1 (dynamic parallelism)\n2 (three kernels using unique streams)\n");
exit(EXIT_FAILURE);
}
int N = atoi(argv[1])*3;
int method = atoi(argv[2]);
// init array as 0
int *ah, *ad;
printf("genarray of 3*N = %i.......", N); fflush(stdout);
ah = (int*)malloc(sizeof(int)*N);
for(int i=0; i<N; ++i){
ah[i] = 0;
}
printf("done\n"); fflush(stdout);
// malloc and copy array to gpu
printf("cudaMemcpy:Host->Device..........", N); fflush(stdout);
cudaMalloc(&ad, sizeof(int)*N);
cudaMemcpy(ad, ah, sizeof(int)*N, cudaMemcpyHostToDevice);
printf("done\n"); fflush(stdout);
// kernel launch (timed)
cudaStream_t s1, s2, s3;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s3, cudaStreamNonBlocking);
cudaEvent_t start, stop;
float rtime = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
printf("Kernel...........................", N); fflush(stdout);
if(method == 0){
// CLASSIC KERNEL LAUNCH
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block >>> (ad, N, N, 0);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else if(method == 1){
// DYNAMIC PARALLELISM
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_parent<<< grid, block, 0, s1 >>> (ad, N/3, N);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else{
// THREE CONCURRENT KERNEL LAUNCHES USING STREAMS
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block, 0, s1 >>> (ad, N/3, N, 0);
kernel_simple<<< grid, block, 0, s2 >>> (ad, N/3, N, N/3);
kernel_simple<<< grid, block, 0, s3 >>> (ad, N/3, N, 2*(N/3));
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
printf("done\n"); fflush(stdout);
printf("cudaMemcpy:Device->Host..........", N); fflush(stdout);
cudaMemcpy(ah, ad, sizeof(int)*N, cudaMemcpyDeviceToHost);
printf("done\n"); fflush(stdout);
printf("checking result.................."); fflush(stdout);
for(int i=0; i<N; ++i){
if(ah[i] != 1){
fprintf(stderr, "bad element: a[%i] = %i\n", i, ah[i]);
exit(EXIT_FAILURE);
}
}
printf("done\n"); fflush(stdout);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&rtime, start, stop);
printf("rtime: %f ms\n", rtime); fflush(stdout);
return EXIT_SUCCESS;
}
Can be compiled with
nvcc -arch=sm_35 -rdc=true -lineinfo -lcudadevrt -use_fast_math main.cu -o prog
This example can compute the result with 3 methods:
- Simple Kernel: just a single classic kernel doing a +1 pass over the whole array.
- Dynamic Parallelism: from main(), launch a parent kernel that does +1 on the range [0, N/3) and also launches two child kernels. The first child does +1 on the range [N/3, 2*N/3), the second child on the range [2*N/3, N). The children are launched on different streams so they can run concurrently.
- Three Streams from Host: this one just launches three kernels from main(), one for each third of the array, each on its own non-blocking stream.
I get the following profiles for method 0 (simple kernel), for method 1 (dynamic parallelism), and for method 2 (Three Streams from Host). The running times are as follows:
➜ simple-cdp git:(master) ✗ ./prog 16777216 0
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 1.140928 ms
➜ simple-cdp git:(master) ✗ ./prog 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 5.790048 ms
➜ simple-cdp git:(master) ✗ ./prog 16777216 2
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 1.011936 ms
The main problem, visible from the profiles, is that with the Dynamic Parallelism method the parent kernel takes an excessive amount of time to close after the two child kernels have finished, which is what makes it 3X to 4X slower. Even in the worst case, if all three kernels (parent and two children) ran serially, it should take much less: there is N/3 of work for each kernel, so the whole parent kernel should take roughly as long as three child kernels, which is much less than what is measured. Is there a way to solve this problem?
EDIT: The serialization of the child kernels, as well as of method 2, has been explained by Robert Crovella in the comments (many thanks). The fact that the kernels ran serially does not invalidate the problem described in bold text (at least not for now).
2 Answers
Calls into the device runtime are "expensive", just as calls into the host runtime are expensive. In this case, it seems you are calling into the device runtime to create streams for every thread, even though this code only requires them for thread 0.
By modifying your code so that stream creation is requested only for thread 0, we can get timing parity between the case where we use separate streams for the child kernel launches and the case where we do not:
$ cat t370.cu
#include <stdio.h>
#define BLOCKSIZE 512
__global__ void kernel_parent(int *a, int n, int N);
__global__ void kernel_simple(int *a, int n, int N, int offset);
// N is the total array size
// n is the worksize for a kernel (one third of N)
__global__ void kernel_parent(int *a, int n, int N){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid == 0){
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
#ifdef USE_STREAMS
cudaStream_t s1, s2;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
kernel_simple<<< grid, block, 0, s1 >>> (a, n, N, n);
kernel_simple<<< grid, block, 0, s2 >>> (a, n, N, 2*n);
#else
kernel_simple<<< grid, block >>> (a, n, N, n);
kernel_simple<<< grid, block >>> (a, n, N, 2*n);
#endif
// these next 2 lines add noticeably to the overall timing
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("oops1: %d\n", (int)err);
}
a[tid] += 1;
}
__global__ void kernel_simple(int *a, int n, int N, int offset){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int pos = tid + offset;
if(pos < N){
a[pos] += 1;
}
}
int main(int argc, char **argv){
if(argc != 3){
fprintf(stderr, "run as ./prog n method\nn multiple of 32 eg: 1024, 1048576 (1024^2), 4194304 (2048^2), 16777216 (4096^2)\nmethod:\n0 (traditional) \n1 (dynamic parallelism)\n2 (three kernels using unique streams)\n");
exit(EXIT_FAILURE);
}
int N = atoi(argv[1])*3;
int method = atoi(argv[2]);
// init array as 0
int *ah, *ad;
printf("genarray of 3*N = %i.......", N); fflush(stdout);
ah = (int*)malloc(sizeof(int)*N);
for(int i=0; i<N; ++i){
ah[i] = 0;
}
printf("done\n"); fflush(stdout);
// malloc and copy array to gpu
printf("cudaMemcpy:Host->Device..........", N); fflush(stdout);
cudaMalloc(&ad, sizeof(int)*N);
cudaMemcpy(ad, ah, sizeof(int)*N, cudaMemcpyHostToDevice);
printf("done\n"); fflush(stdout);
// kernel launch (timed)
cudaStream_t s1, s2, s3;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s3, cudaStreamNonBlocking);
cudaEvent_t start, stop;
float rtime = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
printf("Kernel...........................", N); fflush(stdout);
if(method == 0){
// CLASSIC KERNEL LAUNCH
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block >>> (ad, N, N, 0);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else if(method == 1){
// DYNAMIC PARALLELISM
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_parent<<< grid, block, 0, s1 >>> (ad, N/3, N);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else{
// THREE CONCURRENT KERNEL LAUNCHES USING STREAMS
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block, 0, s1 >>> (ad, N/3, N, 0);
kernel_simple<<< grid, block, 0, s2 >>> (ad, N/3, N, N/3);
kernel_simple<<< grid, block, 0, s3 >>> (ad, N/3, N, 2*(N/3));
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
printf("done\n"); fflush(stdout);
printf("cudaMemcpy:Device->Host..........", N); fflush(stdout);
cudaMemcpy(ah, ad, sizeof(int)*N, cudaMemcpyDeviceToHost);
printf("done\n"); fflush(stdout);
printf("checking result.................."); fflush(stdout);
for(int i=0; i<N; ++i){
if(ah[i] != 1){
fprintf(stderr, "bad element: a[%i] = %i\n", i, ah[i]);
exit(EXIT_FAILURE);
}
}
printf("done\n"); fflush(stdout);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&rtime, start, stop);
printf("rtime: %f ms\n", rtime); fflush(stdout);
return EXIT_SUCCESS;
}
$ nvcc -arch=sm_52 -rdc=true -lcudadevrt -o t370 t370.cu
$ ./t370 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 6.925632 ms
$ nvcc -arch=sm_52 -rdc=true -lcudadevrt -o t370 t370.cu -DUSE_STREAMS
$ ./t370 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 6.673568 ms
$
Although not included in the test results above, according to my testing this also brings the CUDA Dynamic Parallelism (CDP) case (1) into "approximate parity" with the non-CDP cases (0, 2). Note that we can shave about 1 ms (!) off the time by dispensing with the cudaGetLastError() call in the parent kernel (which I added to your code).
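For reference, here is a minimal sketch (my own variant, not code from the answer) of what the parent kernel looks like with that device-side error check compiled out; the DP_ERRCHECK macro name is hypothetical, and you would build with -DDP_ERRCHECK to re-enable the check:
__global__ void kernel_parent_nocheck(int *a, int n, int N){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid == 0){
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
kernel_simple<<< grid, block >>> (a, n, N, n);
kernel_simple<<< grid, block >>> (a, n, N, 2*n);
#ifdef DP_ERRCHECK
// device-side error check; per the note above it adds roughly 1 ms to the timing
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("oops1: %d\n", (int)err);
#endif
}
a[tid] += 1;
}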
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
using thrust::host_vector;
using thrust::device_vector;
#define BLOCKSIZE 512
__global__ void child(int* a)
{
if (threadIdx.x == 0 && blockIdx.x == 0)
a[0]++;
}
__global__ void parent(int* a)
{
if (threadIdx.x == 0 && blockIdx.x == 0)
child<<<gridDim, blockDim>>>(a);
}
#define NBLOCKS 1024
#define NTHREADS 1024
#define BENCHCOUNT 1000
template<typename Lambda>
void runBench(Lambda arg, int* rp, const char* name)
{
// "preheat" the GPU
for (int i = 0; i < 100; i++)
child<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp);
cudaEvent_t start, stop;
float rtime = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (int i = 0; i < BENCHCOUNT; i++)
arg();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&rtime, start, stop);
printf("=== %s ===\n", name);
printf("time: %f ms\n", rtime/BENCHCOUNT); fflush(stdout);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaDeviceSynchronize();
}
int main(int argc, char **argv)
{
host_vector<int> hv(1);
hv[0] = 0xAABBCCDD;
device_vector<int> dv(1);
dv = hv;
int* rp = thrust::raw_pointer_cast(&dv[0]);
auto benchFun = [&](void) {
child<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp); };
runBench(benchFun, rp, "Single kernel launch");
auto benchFun2 = [&](void) {
for (int j = 0; j < 2; j++)
child<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp);
};
runBench(benchFun2, rp, "2x sequential kernel launch");
auto benchFunDP = [&](void) {
parent<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp); };
runBench(benchFunDP, rp, "Nested kernel launch");
}
To build/run:
- Copy/paste the code above into dpar.cu
- nvcc -arch=sm_52 -rdc=true -std=c++11 -lcudadevrt -o dpar dpar.cu
- ./dpar
On my p5000 laptop it prints:
=== Single kernel launch ===
time: 0.014297 ms
=== 2x sequential kernel launch ===
time: 0.030468 ms
=== Nested kernel launch ===
time: 0.083820 ms
So the overhead is quite substantial; in my case it looks like about 43 microseconds.
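One practical implication (a sketch of my own, not part of the original post): since each device-side launch costs tens of microseconds, merging the two child launches of the question's kernel_parent into a single child grid covering [n, N) halves that per-launch cost. The kernel_child_merged and kernel_parent_merged names below are hypothetical, and BLOCKSIZE is assumed to be defined as in the question's code:
__global__ void kernel_child_merged(int *a, int N, int offset){
int pos = blockIdx.x * blockDim.x + threadIdx.x + offset;
if(pos < N){
a[pos] += 1;
}
}
__global__ void kernel_parent_merged(int *a, int n, int N){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid == 0){
dim3 block(BLOCKSIZE, 1, 1);
// one grid sized for the remaining two thirds -> a single nested launch instead of two
dim3 grid( (2*n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
kernel_child_merged<<< grid, block >>> (a, N, n);
}
a[tid] += 1;
}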