CUDA cuSOLVER performance

I am using the code below, which calls the cusolverDnSgesvd routine, to compute an SVD. Unfortunately, the performance is much worse than on the CPU, and when I profile the code, the nvprof report shows a great many DtoH and HtoD transfers, each moving only a few bytes. As a result, performance is poor and most of the time is spent on data transfers.

I was wondering whether there is another way to use cusolverDnSgesvd that avoids this much traffic between host and device?

I am using a K40m and CUDA 8.0.
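One direction that might avoid the chatter, if a toolkit upgrade is an option (the K40m is supported well beyond CUDA 8.0), is the Jacobi-based routine cusolverDnSgesvdj that was added in CUDA 9.0; it operates entirely on device-resident data. A rough sketch of what the call might look like, reusing the device buffers (d_A, d_S, d_U, d_V), the handle, and the error-check macros from my code below; this is illustrative only, not something I can run on CUDA 8.0:

        // Sketch only: cusolverDnSgesvdj requires CUDA 9.0 or newer.
        // Note: unlike gesvd, gesvdj returns V itself, not V**T.
        gesvdjInfo_t gesvdj_params;
        cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));

        int lwork = 0;
        cusolveSafeCall(cusolverDnSgesvdj_bufferSize(
                solver_handle, CUSOLVER_EIG_MODE_VECTOR, /*econ=*/1,
                (int)Nrows, (int)Ncols, d_A, (int)Nrows, d_S,
                d_U, (int)Nrows, d_V, (int)Ncols, &lwork, gesvdj_params));
        float *d_work;  gpuErrchk(cudaMalloc(&d_work, lwork * sizeof(float)));

        // econ = 1 yields the economy-size factors (U is Nrows x Ncols),
        // which also avoids forming the huge full U of the 'A' option.
        cusolveSafeCall(cusolverDnSgesvdj(
                solver_handle, CUSOLVER_EIG_MODE_VECTOR, /*econ=*/1,
                (int)Nrows, (int)Ncols, d_A, (int)Nrows, d_S,
                d_U, (int)Nrows, d_V, (int)Ncols, d_work, lwork,
                devInfo, gesvdj_params));

        cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));

For completeness, here is my current (slow) code: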

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <iomanip>
#include <algorithm>            // std::min
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>

#include <cusolverDn.h>
#include <cuda_runtime_api.h>

#include "global.h"
/********/
/* MAIN */
/********/
int main(){

        // --- gesvd only supports Nrows >= Ncols
        // --- column major memory ordering

        const long long Nrows = 200*200;
        const long long Ncols = 232;

        // --- cuSOLVE input/output parameters/arrays
        int work_size = 0;
        int *devInfo;                   gpuErrchk(cudaMalloc(&devInfo,          sizeof(int)));

        // --- CUDA solver initialization
        cusolverDnHandle_t solver_handle;
        cusolverDnCreate(&solver_handle);

        // --- Setting the host, Nrows x Ncols matrix
        float *h_A = (float *)malloc(Nrows * Ncols * sizeof(float));
        for(int j = 0; j < Nrows; j++)
                for(int i = 0; i < Ncols; i++)
                        h_A[j + i*Nrows] = (i + j*j) * sqrt((float)(i + j));

        // --- Setting the device matrix and moving the host matrix to the device
        float *d_A;                     gpuErrchk(cudaMalloc(&d_A,              Nrows * Ncols * sizeof(float)));
        gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));

        // --- host side SVD results space
        float *h_U = (float *)malloc(Nrows * Nrows     * sizeof(float));
        float *h_V = (float *)malloc(Ncols * Ncols     * sizeof(float));
        float *h_S = (float *)malloc(std::min(Nrows, Ncols) * sizeof(float));

        // --- device side SVD workspace and matrices
        float *d_U;                     gpuErrchk(cudaMalloc(&d_U,      Nrows * Nrows     * sizeof(float)));
        float *d_V;                     gpuErrchk(cudaMalloc(&d_V,      Ncols * Ncols     * sizeof(float)));
        float *d_S;                     gpuErrchk(cudaMalloc(&d_S,      std::min(Nrows, Ncols) * sizeof(float)));


        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaEventRecord(start);

        // --- CUDA SVD initialization
        cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
        float *work;    gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

        // --- CUDA SVD execution: jobu = 'A' computes the full Nrows x Nrows U,
        // --- jobvt = 'N' skips V**T entirely (the full 'A','A' variant is kept below for reference)
//      cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
        cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'N', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, /*d_V*/NULL, Ncols, work, work_size, NULL, devInfo));
        int devInfo_h = 0;      gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
        if (devInfo_h != 0) std::cout   << "Unsuccessful SVD execution\n\n";

        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float milliseconds = 0;
        cudaEventElapsedTime(&milliseconds, start, stop);
        printf("Elapsed time(ms): %f\n", milliseconds);

        // --- Moving the results from device to host
        gpuErrchk(cudaMemcpy(h_S, d_S, std::min(Nrows, Ncols) * sizeof(float), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows     * sizeof(float), cudaMemcpyDeviceToHost));
        // --- d_V was not computed above (jobvt = 'N'), so copying it back would read uninitialized memory
//      gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols     * sizeof(float), cudaMemcpyDeviceToHost));
#if 0
        std::cout << "Singular values\n";
        for(int i = 0; i < std::min(Nrows, Ncols); i++)
                std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;

        std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
        for(int j = 0; j < Nrows; j++) {
                printf("\n");
                for(int i = 0; i < Nrows; i++)
                        printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
        }

        std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
        for(int i = 0; i < Ncols; i++) {
                printf("\n");
                for(int j = 0; j < Ncols; j++)
                        printf("V[%i,%i]=%f\n",i,j,h_V[j*Ncols + i]);
        }
#endif
        // --- Cleanup
        cusolverDnDestroy(solver_handle);
        cudaEventDestroy(start);        cudaEventDestroy(stop);
        gpuErrchk(cudaFree(devInfo));   gpuErrchk(cudaFree(work));
        gpuErrchk(cudaFree(d_A));       gpuErrchk(cudaFree(d_U));
        gpuErrchk(cudaFree(d_V));       gpuErrchk(cudaFree(d_S));
        free(h_A); free(h_U); free(h_V); free(h_S);

        return 0;

}
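As an aside, my own explicit copies are only six, but they are large (d_U alone is 40000 x 40000 floats, about 6.4 GB). Pinning those host buffers with cudaMallocHost should at least run the bulk copies at full PCIe bandwidth, although it obviously cannot remove the thousands of tiny library-internal transfers. A sketch (h_U_pinned is just an illustrative name):

        // Page-locked (pinned) host memory: speeds up the explicit bulk
        // copies; the transfers issued inside cuSOLVER are unaffected.
        float *h_U_pinned;
        gpuErrchk(cudaMallocHost(&h_U_pinned, Nrows * Nrows * sizeof(float)));
        gpuErrchk(cudaMemcpy(h_U_pinned, d_U, Nrows * Nrows * sizeof(float), cudaMemcpyDeviceToHost));
        // ... use h_U_pinned wherever h_U was used ...
        gpuErrchk(cudaFreeHost(h_U_pinned));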

Here is the nvprof output:

Time(%)      Time     Calls       Avg       Min       Max  Name
 59.07%  22.2271s       693  32.074ms  21.185us  100.61ms  void ger_kernel<float, float, int=256, int=5, bool=0>(cublasGerParams<float, float>)
 35.05%  13.1878s       463  28.483ms  684.72us  61.554ms  void gemv2T_kernel_val<float, float, float, int=128, int=16, int=2, int=2, bool=0>(int, int, float, float const *, int, float const *, int, float, float*, int)
  5.62%  2.11345s      2322  910.18us  1.9840us  2.09625s  [CUDA memcpy DtoH]
  0.11%  39.940ms         1  39.940ms  39.940ms  39.940ms  void orgqr_set_submatrix_to_identity<float, int=5, int=3>(int, int, int, float*, int)
  0.10%  35.770ms       223  160.41us  36.033us  308.49us  void gemv2N_kernel_val<float, float, float, int=128, int=4, int=4, int=4, int=11>(float, float, cublasGemv2Params_v2<float, float, float>)
  0.03%  11.258ms      3017  3.7310us     864ns  4.3219ms  [CUDA memcpy HtoD]
  0.02%  9.1377ms       924  9.8890us  7.6480us  13.728us  void nrm2_kernel<float, float, float, int=1, int=0, int=128, int=0>(cublasNrm2Params<float, float>)
  0.01%  2.5833ms       694  3.7220us  2.2080us  4.3840us  void scal_kernel_val<float, float, int=0>(cublasScalParamsVal<float, float>)
  0.00%  490.35us         1  490.35us  490.35us  490.35us  void lacpy_kernel<float, int=5, int=3>(int, int, float const *, int, float*, int, int, int)
  0.00%  453.22us       694     653ns     464ns  3.8180us  [CUDA memset]
  0.00%  78.242us         4  19.560us  19.073us  20.160us  void gemv2N_kernel_val<float, float, float, int=128, int=2, int=4, int=4, int=11>(float, float, cublasGemv2Params_v2<float, float, float>)
  0.00%  34.816us         3  11.605us  10.560us  12.832us  void gemv2N_kernel_val<float, float, float, int=128, int=1, int=4, int=4, int=11>(float, float, cublasGemv2Params_v2<float, float, float>)

==30278== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 93.23%  35.5561s      5333  6.6672ms  3.2230us  162.11ms  cudaMemcpyAsync
  5.51%  2.10125s         6  350.21ms  23.917us  2.09666s  cudaMemcpy
  1.15%  439.19ms         9  48.799ms  9.0750us  431.30ms  cudaMalloc
  0.07%  26.216ms      3006  8.7210us  5.8260us  410.28us  cudaLaunch
  0.01%  5.5753ms      2316  2.4070us  1.7040us  19.727us  cudaStreamSynchronize
  0.01%  2.9150ms       695  4.1940us     250ns  15.356us  cudaMemsetAsync
  0.01%  2.0314ms      7644     265ns     140ns  409.17us  cudaSetupArgument
  0.00%  1.3169ms      5317     247ns     138ns  4.7720us  cudaGetLastError
  0.00%  1.2719ms       462  2.7530us  2.2360us  10.603us  cudaFuncGetAttributes
  0.00%  1.0627ms       462  2.3000us  1.6970us  6.4350us  cudaEventQuery
  0.00%  713.82us      3006     237ns     148ns  13.563us  cudaConfigureCall
  0.00%  642.52us        91  7.0600us     314ns  253.80us  cuDeviceGetAttribute
  0.00%  518.37us       464  1.1170us     796ns  11.617us  cudaEventRecord
  0.00%  393.04us         1  393.04us  393.04us  393.04us  cuDeviceTotalMem
  0.00%  340.13us         5  68.025us     832ns  252.27us  cudaFree
  0.00%  57.216us         1  57.216us  57.216us  57.216us  cuDeviceGetName
  0.00%  16.432us        16  1.0270us     575ns  5.5690us  cudaEventCreateWithFlags
  0.00%  10.828us        16     676ns     394ns  2.0930us  cudaEventDestroy
  0.00%  10.743us         2  5.3710us  2.1210us  8.6220us  cudaThreadSynchronize
  0.00%  10.621us        20     531ns     351ns  2.7090us  cudaDeviceGetAttribute
  0.00%  6.5030us         3  2.1670us     392ns  4.8860us  cuDeviceGetCount
  0.00%  5.3180us         2  2.6590us  1.2370us  4.0810us  cudaEventCreate
  0.00%  4.1060us         1  4.1060us  4.1060us  4.1060us  cudaEventElapsedTime
  0.00%  3.6600us         2  1.8300us     433ns  3.2270us  cudaGetDevice
  0.00%  2.9400us         1  2.9400us  2.9400us  2.9400us  cudaEventSynchronize
  0.00%  2.4090us         3     803ns     509ns  1.1370us  cuDeviceGet
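For reference, this is roughly the kind of CPU baseline I have in mind for the comparison (an illustrative sketch only, assuming a LAPACKE build such as OpenBLAS or MKL, with the same 40000 x 232 column-major problem and the same jobu = 'A', jobvt = 'N' options as the GPU call):

#include <lapacke.h>    // hypothetical baseline via LAPACKE (OpenBLAS/MKL)
#include <stdlib.h>

int main(){
        const int m = 40000, n = 232;
        float *a      = (float *)malloc((size_t)m * n * sizeof(float));
        float *s      = (float *)malloc(n * sizeof(float));
        float *u      = (float *)malloc((size_t)m * m * sizeof(float));
        float *vt     = (float *)malloc((size_t)n * n * sizeof(float));  // untouched with jobvt = 'N'
        float *superb = (float *)malloc((n - 1) * sizeof(float));        // sgesvd scratch

        // ... fill a column-major, exactly as h_A in the CUDA code ...

        lapack_int info = LAPACKE_sgesvd(LAPACK_COL_MAJOR, 'A', 'N',
                                         m, n, a, m, s, u, m, vt, n, superb);
        return (int)info;
}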
