CUDA cuSOLVER performance
I am using the code below, which calls the cusolverDnSgesvd routine, to compute an SVD. Unfortunately, the performance is much worse than on the CPU, and when I profile the code the nvprof report shows a very large number of DtoH and HtoD transfers (each moving only a few bytes). As a result the performance is poor and most of the time is spent on data transfers.
Is there another way to use cusolverDnSgesvd that avoids so many transfers between the host and the device?
I am using a K40m and CUDA 8.0.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "global.h"
/********/
/* MAIN */
/********/
int main(){
// --- gesvd only supports Nrows >= Ncols
// --- column major memory ordering
const long long Nrows = 200*200;
const long long Ncols = 232;
// --- cuSOLVE input/output parameters/arrays
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolverDnCreate(&solver_handle);
// --- Setting the host, Nrows x Ncols matrix
float *h_A = (float *)malloc(Nrows * Ncols * sizeof(float));
for(int j = 0; j < Nrows; j++)
for(int i = 0; i < Ncols; i++)
h_A[j + i*Nrows] = (i + j*j) * sqrt((float)(i + j));
// --- Setting the device matrix and moving the host matrix to the device
float *d_A; gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(float)));
gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));
// --- host side SVD results space
float *h_U = (float *)malloc(Nrows * Nrows * sizeof(float));
float *h_V = (float *)malloc(Ncols * Ncols * sizeof(float));
float *h_S = (float *)malloc(min(Nrows, Ncols) * sizeof(float));
// --- device side SVD workspace and matrices
float *d_U; gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows * sizeof(float)));
float *d_V; gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols * sizeof(float)));
float *d_S; gpuErrchk(cudaMalloc(&d_S, min(Nrows, Ncols) * sizeof(float)));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
// --- CUDA SVD initialization
cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
// --- CUDA SVD execution
// cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'N', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, /*d_V*/NULL, Ncols, work, work_size, NULL, devInfo));
int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n";
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("Elapsed time(ms): %f\n", milliseconds);
// --- Moving the results from device to host
gpuErrchk(cudaMemcpy(h_S, d_S, min(Nrows, Ncols) * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols * sizeof(float), cudaMemcpyDeviceToHost));
#if 0
std::cout << "Singular values\n";
for(int i = 0; i < min(Nrows, Ncols); i++)
std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;
std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
for(int j = 0; j < Nrows; j++) {
printf("\n");
for(int i = 0; i < Nrows; i++)
printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
}
std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
for(int i = 0; i < Ncols; i++) {
printf("\n");
for(int j = 0; j < Ncols; j++)
printf("V[%i,%i]=%f\n",i,j,h_V[j*Ncols + i]);
}
#endif
cusolverDnDestroy(solver_handle);
return 0;
}
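From the API call summary below, almost all of the traffic goes through cudaMemcpyAsync (over five thousand calls), while my own code only issues a handful of explicit cudaMemcpy calls, so the transfers appear to be issued inside cusolverDnSgesvd itself. To make that easier to see on the profiler timeline, the call can be bracketed with an NVTX range; a minimal sketch, assuming nvToolsExt.h is available with the toolkit and the binary is linked with -lnvToolsExt:

#include <nvToolsExt.h>   // NVTX markers; link with -lnvToolsExt

// ... same setup, allocations and bufferSize query as in main() above ...

nvtxRangePushA("cusolverDnSgesvd");      // open a named range on the profiler timeline
cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'N', Nrows, Ncols, d_A, Nrows,
                                 d_S, d_U, Nrows, /*d_V*/NULL, Ncols,
                                 work, work_size, NULL, devInfo));
gpuErrchk(cudaDeviceSynchronize());      // make sure all library work falls inside the range
nvtxRangePop();                          // close the range

With this in place, nvvp groups the small DtoH/HtoD copies under the range, which confirms where they come from.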
Here is the nvprof output:
Time(%) Time Calls Avg Min Max Name
59.07% 22.2271s 693 32.074ms 21.185us 100.61ms void ger_kernel<float, float, int=256, int=5, bool=0>(cublasGerParams<float, float>)
35.05% 13.1878s 463 28.483ms 684.72us 61.554ms void gemv2T_kernel_val<float, float, float, int=128, int=16, int=2, int=2, bool=0>(int, int, float, float const *, int, float const *, int, float, float*, int)
5.62% 2.11345s 2322 910.18us 1.9840us 2.09625s [CUDA memcpy DtoH]
0.11% 39.940ms 1 39.940ms 39.940ms 39.940ms void orgqr_set_submatrix_to_identity<float, int=5, int=3>(int, int, int, float*, int)
0.10% 35.770ms 223 160.41us 36.033us 308.49us void gemv2N_kernel_val<float, float, float, int=128, int=4, int=4, int=4, int=11>(float, float, cublasGemv2Params_v2<float, float, float>)
0.03% 11.258ms 3017 3.7310us 864ns 4.3219ms [CUDA memcpy HtoD]
0.02% 9.1377ms 924 9.8890us 7.6480us 13.728us void nrm2_kernel<float, float, float, int=1, int=0, int=128, int=0>(cublasNrm2Params<float, float>)
0.01% 2.5833ms 694 3.7220us 2.2080us 4.3840us void scal_kernel_val<float, float, int=0>(cublasScalParamsVal<float, float>)
0.00% 490.35us 1 490.35us 490.35us 490.35us void lacpy_kernel<float, int=5, int=3>(int, int, float const *, int, float*, int, int, int)
0.00% 453.22us 694 653ns 464ns 3.8180us [CUDA memset]
0.00% 78.242us 4 19.560us 19.073us 20.160us void gemv2N_kernel_val<float, float, float, int=128, int=2, int=4, int=4, int=11>(float, float, cublasGemv2Params_v2<float, float, float>)
0.00% 34.816us 3 11.605us 10.560us 12.832us void gemv2N_kernel_val<float, float, float, int=128, int=1, int=4, int=4, int=11>(float, float, cublasGemv2Params_v2<float, float, float>)
==30278== API calls:
Time(%) Time Calls Avg Min Max Name
93.23% 35.5561s 5333 6.6672ms 3.2230us 162.11ms cudaMemcpyAsync
5.51% 2.10125s 6 350.21ms 23.917us 2.09666s cudaMemcpy
1.15% 439.19ms 9 48.799ms 9.0750us 431.30ms cudaMalloc
0.07% 26.216ms 3006 8.7210us 5.8260us 410.28us cudaLaunch
0.01% 5.5753ms 2316 2.4070us 1.7040us 19.727us cudaStreamSynchronize
0.01% 2.9150ms 695 4.1940us 250ns 15.356us cudaMemsetAsync
0.01% 2.0314ms 7644 265ns 140ns 409.17us cudaSetupArgument
0.00% 1.3169ms 5317 247ns 138ns 4.7720us cudaGetLastError
0.00% 1.2719ms 462 2.7530us 2.2360us 10.603us cudaFuncGetAttributes
0.00% 1.0627ms 462 2.3000us 1.6970us 6.4350us cudaEventQuery
0.00% 713.82us 3006 237ns 148ns 13.563us cudaConfigureCall
0.00% 642.52us 91 7.0600us 314ns 253.80us cuDeviceGetAttribute
0.00% 518.37us 464 1.1170us 796ns 11.617us cudaEventRecord
0.00% 393.04us 1 393.04us 393.04us 393.04us cuDeviceTotalMem
0.00% 340.13us 5 68.025us 832ns 252.27us cudaFree
0.00% 57.216us 1 57.216us 57.216us 57.216us cuDeviceGetName
0.00% 16.432us 16 1.0270us 575ns 5.5690us cudaEventCreateWithFlags
0.00% 10.828us 16 676ns 394ns 2.0930us cudaEventDestroy
0.00% 10.743us 2 5.3710us 2.1210us 8.6220us cudaThreadSynchronize
0.00% 10.621us 20 531ns 351ns 2.7090us cudaDeviceGetAttribute
0.00% 6.5030us 3 2.1670us 392ns 4.8860us cuDeviceGetCount
0.00% 5.3180us 2 2.6590us 1.2370us 4.0810us cudaEventCreate
0.00% 4.1060us 1 4.1060us 4.1060us 4.1060us cudaEventElapsedTime
0.00% 3.6600us 2 1.8300us 433ns 3.2270us cudaGetDevice
0.00% 2.9400us 1 2.9400us 2.9400us 2.9400us cudaEventSynchronize
0.00% 2.4090us 3 803ns 509ns 1.1370us cuDeviceGet
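As far as I understand, newer toolkits (CUDA 9.0 and later, so not the 8.0 install I have on this machine) also expose a Jacobi-based SVD, cusolverDnSgesvdj, which is supposed to keep more of the iteration on the device. For reference, this is roughly how that call would look with the same buffers as above; a sketch based on the documented gesvdj API, untested on my side:

// Sketch only: cusolverDnSgesvdj requires CUDA 9.0+, so it does not run on my CUDA 8.0 setup.
// It reuses solver_handle, d_A, d_S, d_U, d_V and devInfo from the code above; unlike gesvd,
// gesvdj returns V (not V^T) and has an "economy" mode selected with econ = 1.
gesvdjInfo_t gesvdj_params = NULL;
cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));

int lwork = 0;
cusolveSafeCall(cusolverDnSgesvdj_bufferSize(solver_handle,
                                             CUSOLVER_EIG_MODE_VECTOR,   // also compute U and V
                                             1,                          // econ = 1: economy-size factors
                                             Nrows, Ncols,
                                             d_A, Nrows,
                                             d_S,
                                             d_U, Nrows,
                                             d_V, Ncols,
                                             &lwork, gesvdj_params));
float *d_work; gpuErrchk(cudaMalloc(&d_work, lwork * sizeof(float)));

cusolveSafeCall(cusolverDnSgesvdj(solver_handle,
                                  CUSOLVER_EIG_MODE_VECTOR, 1,
                                  Nrows, Ncols,
                                  d_A, Nrows,
                                  d_S,
                                  d_U, Nrows,
                                  d_V, Ncols,
                                  d_work, lwork,
                                  devInfo, gesvdj_params));

cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));

If the gesvdj path, or some other way of calling cusolverDnSgesvd on CUDA 8.0, is known to avoid this per-column host/device traffic, that is exactly what I am looking for.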