Необъяснимое поведение cuda, связанное с памятью

Так что в основном я взял свой код на C++ (который работает правильно) и переписал его на cuda (у меня нет опыта работы с cuda). Одна часть кода (метод execute ()) не работает правильно, и я действительно не знаю почему.

Поэтому мой вопрос заключается в том, что именно означает ошибку "неопределенная ошибка запуска" во время cudaMemcpy и почему это происходит в моем коде.

Мой второй вопрос: почему переменные backup_ans и ans отличаются, когда они вычисляют одно и то же?

#include "stdio.h"
#include <algorithm>

__device__ unsigned int primes[1024];
__device__ long long n = 1ll<<32; // #unsigned_integers




__device__ int hashh(long long x) {
      return (x>>1)%1024;
}

// compute (x^e)%n
__device__ unsigned long long mulmod(unsigned long long x,unsigned long long e,unsigned long long n) {
    unsigned long long ans = 1;
    while(e>0) {
        if(e&1) ans = (ans*x)%n;
        x = (x*x)%n;
        e>>=1;
    }
    return ans;
}

// determine whether n is strong probable prime base a or not.
// n is ODD
__device__ int is_SPRP(unsigned long long a,unsigned long long n) {
  int d=0;
  unsigned long long t = n-1;
  while(t%2==0) {
      ++d;
      t>>=1;
  }
  unsigned long long x = mulmod(a,t,n);
  if(x==1) return 1; 
  for(int i=0;i<d;++i) {
      if(x==n-1) return 1;
      x=(x*x)%n;
  }
  return 0;
}
__device__ int prime(long long x) {
        return is_SPRP((unsigned long long)primes[(((long long)0xAFF7B4*x)>>7)%1024],(unsigned long long)x);
}

// copy all unsigned COMPOSITE ingeters which are not congruent to zero modulo 2,3,5,7 and their hashh value = 0; 
// count of those elements store in c
// 335545 is just magic constant to distribute all integers equally on all 400*32 threads
__global__ void find(unsigned int *out,unsigned int *c) {
    unsigned int buff[4096];
    int local_c = 0;
    long long b = 121+(threadIdx.x+blockIdx.x*blockDim.x)*335545;
    long long e = b+335545;
    if(b%2==0) ++b;
    for(long long i=b;i<e && i<n;i+=2) {
        if(i%3==0 || i%5==0 || i%7==0 || prime(i)) continue;
        if(hashh(i)==0) {
            buff[local_c++]=(unsigned int)i;
            if(local_c==4096) {
                int start = atomicAdd(c,local_c);
                for(int i=0;i<local_c;++i) out[i+start]=buff[i];
                local_c=0;
            }
        }
    }
    int start = atomicAdd(c,local_c);
    for(int i=0;i<local_c;++i) out[i+start]=buff[i];
}

// find base for which all elements in input are NOT SPRP. base is from {2,..,34} stored in 32bit uint
__global__ void solve(unsigned int *input, unsigned int *count,unsigned int *backup, unsigned int *ans) {
      __shared__ unsigned int s[32];
    unsigned int dif = (*count)/(blockDim.x*gridDim.x) +1;
    unsigned int b = (threadIdx.x+blockIdx.x*blockDim.x)*dif;
    unsigned int e = b+dif>(*count)?(*count):b+dif;
    unsigned int mysol = 0;
    for(long long i = 2; i<33; ++i) {
          int sol = 1;
          // each thread doing its part
          for(unsigned int j = b; j<e ; ++j) {
              //is some element is sprp base i break
              if(is_SPRP((unsigned long long)i,(unsigned long long)input[j])!=0) {
              sol=0;
              break;
              }
          }
          // if all elements passed store base to mysol
          if(sol==1) mysol|=1<<(i-2);
    }
    s[threadIdx.x] = mysol;
    // save thread_result
    backup[threadIdx.x+blockDim.x*blockIdx.x] = mysol;
    __syncthreads();
    // compute global resulte and store it to ans
    if(threadIdx.x==0) {
          unsigned int global_sol = ~0;
          for(int i=0;i<blockDim.x;++i) global_sol&=s[i];
          atomicAnd(ans,global_sol);
    }
}


int main(void) {
// number of blocks & thread for solve
const int blocks = 400;
const int threads = 32;

unsigned int prms[] = { 17, 11, 6, 60, 7, 13, 11, 34, 13, 2, 3, 37, 13, 11, 38, 2, 7, 105, 2, 7, 42, 11, 7, 3, 6, 15, 53, 44, 6, 6, 5, 15, 54, 7, 35, 10, 10, 15, 10, 10, 17, 17, 11, 10, 15, 43, 7, 5, 5, 3, 7, 43, 34, 2, 34, 2, 68, 53, 39, 10, 7, 6, 11, 2, 5, 2, 7, 2, 6, 5, 15, 40, 3, 5, 5, 2, 2, 10, 47, 13, 7, 43, 6, 7, 5, 6, 6, 13, 6, 35, 6, 15, 6, 13, 40, 10, 11, 2, 7, 2, 2, 3, 13, 3, 11, 15, 10, 5, 11, 14, 7, 11, 47, 5, 2, 2, 6, 2, 5, 55, 6, 5, 7, 2, 6, 58, 35, 11, 5, 12, 17, 6, 10, 12, 6, 6, 2, 53, 2, 2, 13, 5, 14, 7, 15, 6, 13, 62, 10, 6, 3, 7, 7, 3, 14, 5, 14, 73, 15, 11, 11, 6, 5, 17, 10, 5, 3, 37, 51, 10, 7, 5, 38, 12, 5, 11, 5, 7, 6, 5, 6, 40, 43, 57, 10, 13, 7, 15, 2, 10, 34, 7, 39, 10, 5, 3, 6, 13, 11, 5, 10, 43, 10, 5, 3, 14, 5, 2, 5, 41, 5, 39, 46, 2, 10, 2, 5, 12, 3, 2, 2, 5, 15, 43, 17, 41, 2, 13, 15, 38, 11, 11, 3, 34, 5, 6, 3, 7, 2, 37, 5, 6, 10, 17, 35, 2, 15, 6, 7, 5, 3, 13, 13, 12, 34, 2, 12, 10, 15, 13, 2, 2, 34, 6, 6, 5, 2, 7, 13, 3, 6, 11, 39, 42, 7, 2, 6, 39, 47, 3, 17, 5, 13, 7, 2, 47, 3, 7, 6, 11, 17, 37, 48, 7, 37, 11, 7, 10, 3, 14, 39, 14, 15, 43, 17, 2, 12, 7, 13, 5, 3, 6, 34, 37, 3, 17, 13, 2, 5, 10, 10, 44, 37, 2, 2, 10, 10, 7, 3, 7, 2, 7, 5, 43, 43, 11, 15, 51, 13, 17, 10, 11, 2, 5, 34, 17, 2, 2, 42, 6, 6, 5, 47, 15, 2, 12, 7, 3, 10, 15, 3, 7, 12, 12, 15, 43, 14, 7, 58, 13, 10, 6, 6, 38, 34, 5, 5, 13, 38, 6, 11, 10, 6, 7, 2, 55, 2, 13, 5, 11, 44, 15, 17, 2, 40, 2, 15, 13, 6, 2, 3, 3, 3, 3, 6, 39, 5, 11, 17, 37, 5, 7, 6, 10, 6, 12, 7, 5, 14, 10, 12, 71, 10, 35, 6, 11, 3, 2, 38, 3, 2, 34, 10, 17, 42, 2, 12, 6, 6, 11, 40, 12, 10, 6, 10, 2, 3, 3, 56, 11, 7, 42, 2, 38, 12, 2, 2, 13, 40, 12, 6, 5, 5, 59, 15, 38, 5, 5, 5, 7, 2, 10, 7, 2, 17, 10, 11, 6, 6, 6, 2, 10, 6, 54, 2, 82, 3, 34, 14, 15, 44, 5, 46, 2, 13, 5, 12, 13, 11, 10, 39, 5, 40, 3, 60, 3, 42, 11, 3, 46, 17, 3, 2, 37, 6, 42, 12, 14, 3, 12, 66, 13, 34, 7, 3, 13, 3, 11, 2, 13, 12, 38, 34, 5, 40, 10, 14, 6, 14, 11, 38, 58, 2, 48, 5, 15, 5, 73, 3, 37, 5, 11, 10, 5, 5, 13, 2, 10, 13, 34, 17, 3, 7, 47, 2, 2, 10, 15, 3, 3, 13, 6, 34, 13, 10, 13, 3, 6, 41, 10, 6, 2, 6, 2, 6, 2, 6, 6, 37, 10, 44, 35, 13, 51, 2, 7, 53, 5, 40, 5, 2, 37, 11, 15, 11, 13, 2, 5, 2, 6, 10, 17, 15, 43, 39, 17, 2, 12, 10, 15, 17, 7, 13, 3, 7, 15, 37, 5, 15, 7, 6, 10, 51, 2, 2, 40, 61, 2, 13, 13, 11, 2, 5, 34, 5, 5, 7, 2, 2, 2, 11, 3, 6, 13, 6, 17, 11, 10, 7, 46, 15, 7, 14, 35, 11, 7, 10, 6, 11, 40, 11, 2, 39, 7, 6, 66, 5, 3, 6, 5, 11, 10, 2, 10, 7, 13, 2, 45, 34, 6, 35, 2, 11, 5, 59, 75, 10, 17, 14, 17, 17, 17, 2, 11, 7, 10, 6, 11, 6, 56, 34, 35, 11, 14, 12, 41, 40, 17, 40, 3, 11, 7, 37, 14, 7, 13, 7, 5, 2, 10, 6, 39, 2, 7, 37, 35, 10, 5, 15, 2, 7, 38, 34, 11, 17, 5, 6, 10, 3, 6, 7, 7, 43, 14, 2, 43, 3, 2, 47, 7, 35, 7, 3, 53, 2, 10, 10, 10, 60, 10, 6, 2, 6, 10, 5, 7, 57, 53, 13, 3, 35, 38, 15, 42, 3, 3, 12, 2, 10, 3, 38, 54, 13, 10, 11, 7, 13, 7, 2, 12, 39, 10, 54, 2, 12, 38, 10, 12, 12, 5, 15, 6, 10, 13, 5, 15, 10, 13, 6, 41, 40, 14, 12, 10, 11, 40, 5, 11, 10, 2, 5, 2, 13, 6, 2, 13, 5, 2, 10, 15, 5, 5, 10, 34, 13, 2, 5, 14, 5, 6, 5, 13, 3, 43, 6, 13, 11, 50, 3, 6, 6, 12, 15, 11, 37, 7, 69, 11, 14, 14, 7, 43, 5, 35, 11, 35, 11, 11, 34, 34, 39, 14, 11, 2, 10, 53, 6, 11, 2, 11, 60, 39, 11, 6, 15, 40, 17, 47, 34, 50, 7, 59, 47, 5, 13, 39, 5, 6, 53, 10, 14, 5, 51, 5, 7, 5, 6, 77, 7, 12, 7, 42, 2, 5, 2, 6, 60, 10, 13, 10, 6, 47, 6, 15, 17, 10, 11, 10, 12, 7, 7, 10, 17, 34, 5, 10, 7, 7, 2, 6, 10, 38, 2, 15, 6, 13, 7, 13, 2, 3, 13, 5, 3, 17, 2, 5, 15, 11, 39, 7, 39, 10, 10, 2, 6, 13, 3, 5, 17, 6, 14, 10, 37, 44, 3, 34, 5, 11, 7, 12, 2, 5, 3, 12, 3, 2, 3, 133, 12, 2, 2, 2, 3, 34, 14, 41, 2, 37, 11, 2, 6, 11, 6, 7, 15, 11, 35, 13, 6, 5, 2, 14, 7, 2 };

printf("primes_copy: %s\n",cudaGetErrorString(cudaMemcpyToSymbol(primes,prms,1024*4)));

/*-----*/

// allocate buffers
unsigned int *dev_input,*dev_count;
printf("alloc_input: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_input,sizeof(int)*(1<<23))));
printf("alloc_count: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_count,4)));
printf("memset_count: %s\n",cudaGetErrorString(cudaMemset(dev_count,0,4)));
find<<<400,32>>>(dev_input,dev_count);
cudaDeviceSynchronize();

unsigned int count;
printf("copy_count: %s\n",cudaGetErrorString(cudaMemcpy(&count,dev_count,4,cudaMemcpyDeviceToHost)));

// sort found elements just to make debbug easier, it is not necessary
unsigned int *backup_numbers = new unsigned int[1000000];
printf("copy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup_numbers,dev_input,4*count,cudaMemcpyDeviceToHost)));
std::sort(backup_numbers,backup_numbers+count);
printf("copy_S_backup: %s\n",cudaGetErrorString(cudaMemcpy(dev_input,backup_numbers,4*count,cudaMemcpyHostToDevice)));
delete[] backup_numbers;

printf("\nsize: %u\n",count);

// allocate buffers
unsigned int *dev_backup, *dev_ans;
printf("malloc_backup: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_backup,sizeof(int)*blocks*threads)));
printf("malloc_ans: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_ans,4)));
printf("memset_ans: %s\n",cudaGetErrorString(cudaMemset(dev_ans,0xFF,4)));

solve<<<blocks,threads>>>(dev_input,dev_count,dev_backup,dev_ans);
cudaDeviceSynchronize();

unsigned int ans,*backup;
printf("memcpy_ans: %s\n",cudaGetErrorString(cudaMemcpy(&ans,dev_ans,4,cudaMemcpyDeviceToHost)));
backup = new unsigned int[400*32];
printf("memcpy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup,dev_backup,4*blocks*threads,cudaMemcpyDeviceToHost)));
unsigned int backup_ans = ~0;

// compute global result using backuped thread_results
// notice backup_ans and ans MUST be the same, but they are NOT (WHY!)
for(int i=0;i<threads*blocks;++i) backup_ans&=backup[i];
printf("ans: %u\nbackup_ans %u\n",ans,backup_ans);
printf("%u\n",backup[48]);

delete[] backup;
cudaFree(dev_ans);
cudaFree(dev_backup);
cudaFree(dev_count);
cudaFree(dev_input);
}

Весь код, кроме метода execute (), работает как намерение. Метод solve () просто вычисляет ерунду (потому что backup_ans и ans различаются), а также выдает ошибку "неопределенная ошибка запуска" на последних двух cudaMemcpy. Когда я бегу решить<<<1,1>>>(...) я получил

ans: 134816642 backup_ans 432501552

но когда я бегу решать<<<400,32>>>(...) это дает мне

ans: 134816642 backup_ans 0 (правильный ответ должен быть 0)

Во всех ситуациях он должен вычислять backup_ans=ans=0

Любой совет, что я делаю неправильно, будет полезен.

Код для генерации primes.bin

#include <cstdlib>
#include <stdio.h>
using namespace std;

const unsigned long long n = 1ll<<32;
const int buffer_size = 2000000;

typedef unsigned char uch;
typedef unsigned int uint;
typedef unsigned long long ull;

uch *primes;

int prime(long long x) {
if(x==2) return 1;
if(x%2==0) return 0;
long long pos = x/16;
long long index = (x&15)>>1;
return (1<<index)&(~(primes[pos]));
}
void eratosten_sieve(void) {
  long long pos;
  long long index;
  for(long long i=3;i*i<n;++i) {
      if(!prime(i)) continue;
      for(long long j=i*i;j<n;j+=(i<<1)) {
      pos = j/16;
      index = ((j&15)>>1);
      primes[pos]|=(1<<index);
      }
  }

}

int main(void) {
primes = new uch[(n/16)+1];
for(long long i=0;i<(n/16)+1;++i) primes[i]=0;
printf("generating\n");
eratosten_sieve(); 
int l = n/16 +1;
printf("writing\n");
FILE *f = fopen("primes.bin","wb");
fwrite(primes,1,l,f);
fclose(f);
printf("done\n");
delete[] primes;
}

PS: я компилирую это с помощью nvcc -arch compute_11

CUDA Driver Version / Runtime Version          5.5 / 5.5
CUDA Capability Major/Minor version number:    1.1
Total amount of global memory:                 1023 MBytes (1073020928 bytes)
(14) Multiprocessors, (  8) CUDA Cores/MP:     112 CUDA Cores
GPU Clock rate:                                1500 MHz (1.50 GHz)
Memory Clock rate:                             900 Mhz
Memory Bus Width:                              256-bit
Maximum Texture Dimension Size (x,y,z)         1D=(8192), 2D=(65536, 32768), 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers  1D=(8192), 512 layers
Maximum Layered 2D Texture Size, (num) layers  2D=(8192, 8192), 512 layers
Total amount of constant memory:               65536 bytes
Total amount of shared memory per block:       16384 bytes
Total number of registers available per block: 8192
Warp size:                                     32
Maximum number of threads per multiprocessor:  768
Maximum number of threads per block:           512
Max dimension size of a thread block (x,y,z): (512, 512, 64)
Max dimension size of a grid size    (x,y,z): (65535, 65535, 1)
Maximum memory pitch:                          2147483647 bytes
Texture alignment:                             256 bytes
Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
Run time limit on kernels:                     Yes
Integrated GPU sharing Host Memory:            No
Support host page-locked memory mapping:       Yes
Alignment requirement for Surfaces:            Yes
Device has ECC support:                        Disabled
Device supports Unified Addressing (UVA):      No
Device PCI Bus ID / PCI location ID:           1 / 0
Compute Mode:
  < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.5, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce 9800 GT
Result = PASS

1 ответ

Решение

ОК, тебе не хватает памяти. Мне потребовалось некоторое время, чтобы понять, потому что я не думал о большом статическом распределении:

__device__ unsigned char primes[(1<<28)+1];

Обычно, когда людям не хватает памяти, они обнаруживают это на cudaMalloc операция. В вашем случае у вашего GPU есть 1 ГБ памяти, и я предполагаю, что вы также размещаете на нем дисплей (вы не ответили на этот вопрос). Посмотрите, сколько свободной памяти есть в nvidia-smi -a На выходе это будет выглядеть примерно так:

FB Memory Usage
    Total                       : 1535 MiB
    Used                        : 3 MiB
    Free                        : 1532 MiB

Ваши номера будут меньше - мы заботимся о бесплатной линии.

Ваши динамические распределения (т.е. от cudaMalloc) выделяют около 350МБ. Но запуск ядра приводит в действие статическое распределение, и тогда ваш общий объем увеличивается до 700 МБ (2^28 превышает 250 МБ). Если у вас есть дисплей, работающий на этом графическом процессоре, он будет использовать часть 1 ГБ памяти, в результате чего вам не хватит для запуска ядра, требующего 700 МБ.

Если вы хотите работать на этом графическом процессоре, посмотрите, сможете ли вы как-то уменьшить размер вашей проблемы.

И всегда хорошо делать правильную проверку ошибок cuda, но кроме этой проблемы ваш код, кажется, работает без ошибок для меня на устройствах с большим объемом памяти.

Другие вопросы по тегам