Необъяснимое поведение cuda, связанное с памятью
Так что в основном я взял свой код на C++ (который работает правильно) и переписал его на cuda (у меня нет опыта работы с cuda). Одна часть кода (метод execute ()) не работает правильно, и я действительно не знаю почему.
Поэтому мой вопрос заключается в том, что именно означает ошибку "неопределенная ошибка запуска" во время cudaMemcpy и почему это происходит в моем коде.
Мой второй вопрос: почему переменные backup_ans и ans отличаются, когда они вычисляют одно и то же?
#include "stdio.h"
#include <algorithm>
__device__ unsigned int primes[1024];
__device__ long long n = 1ll<<32; // #unsigned_integers
__device__ int hashh(long long x) {
return (x>>1)%1024;
}
// compute (x^e)%n
__device__ unsigned long long mulmod(unsigned long long x,unsigned long long e,unsigned long long n) {
unsigned long long ans = 1;
while(e>0) {
if(e&1) ans = (ans*x)%n;
x = (x*x)%n;
e>>=1;
}
return ans;
}
// determine whether n is strong probable prime base a or not.
// n is ODD
__device__ int is_SPRP(unsigned long long a,unsigned long long n) {
int d=0;
unsigned long long t = n-1;
while(t%2==0) {
++d;
t>>=1;
}
unsigned long long x = mulmod(a,t,n);
if(x==1) return 1;
for(int i=0;i<d;++i) {
if(x==n-1) return 1;
x=(x*x)%n;
}
return 0;
}
__device__ int prime(long long x) {
return is_SPRP((unsigned long long)primes[(((long long)0xAFF7B4*x)>>7)%1024],(unsigned long long)x);
}
// copy all unsigned COMPOSITE ingeters which are not congruent to zero modulo 2,3,5,7 and their hashh value = 0;
// count of those elements store in c
// 335545 is just magic constant to distribute all integers equally on all 400*32 threads
__global__ void find(unsigned int *out,unsigned int *c) {
unsigned int buff[4096];
int local_c = 0;
long long b = 121+(threadIdx.x+blockIdx.x*blockDim.x)*335545;
long long e = b+335545;
if(b%2==0) ++b;
for(long long i=b;i<e && i<n;i+=2) {
if(i%3==0 || i%5==0 || i%7==0 || prime(i)) continue;
if(hashh(i)==0) {
buff[local_c++]=(unsigned int)i;
if(local_c==4096) {
int start = atomicAdd(c,local_c);
for(int i=0;i<local_c;++i) out[i+start]=buff[i];
local_c=0;
}
}
}
int start = atomicAdd(c,local_c);
for(int i=0;i<local_c;++i) out[i+start]=buff[i];
}
// find base for which all elements in input are NOT SPRP. base is from {2,..,34} stored in 32bit uint
__global__ void solve(unsigned int *input, unsigned int *count,unsigned int *backup, unsigned int *ans) {
__shared__ unsigned int s[32];
unsigned int dif = (*count)/(blockDim.x*gridDim.x) +1;
unsigned int b = (threadIdx.x+blockIdx.x*blockDim.x)*dif;
unsigned int e = b+dif>(*count)?(*count):b+dif;
unsigned int mysol = 0;
for(long long i = 2; i<33; ++i) {
int sol = 1;
// each thread doing its part
for(unsigned int j = b; j<e ; ++j) {
//is some element is sprp base i break
if(is_SPRP((unsigned long long)i,(unsigned long long)input[j])!=0) {
sol=0;
break;
}
}
// if all elements passed store base to mysol
if(sol==1) mysol|=1<<(i-2);
}
s[threadIdx.x] = mysol;
// save thread_result
backup[threadIdx.x+blockDim.x*blockIdx.x] = mysol;
__syncthreads();
// compute global resulte and store it to ans
if(threadIdx.x==0) {
unsigned int global_sol = ~0;
for(int i=0;i<blockDim.x;++i) global_sol&=s[i];
atomicAnd(ans,global_sol);
}
}
int main(void) {
// number of blocks & thread for solve
const int blocks = 400;
const int threads = 32;
unsigned int prms[] = { 17, 11, 6, 60, 7, 13, 11, 34, 13, 2, 3, 37, 13, 11, 38, 2, 7, 105, 2, 7, 42, 11, 7, 3, 6, 15, 53, 44, 6, 6, 5, 15, 54, 7, 35, 10, 10, 15, 10, 10, 17, 17, 11, 10, 15, 43, 7, 5, 5, 3, 7, 43, 34, 2, 34, 2, 68, 53, 39, 10, 7, 6, 11, 2, 5, 2, 7, 2, 6, 5, 15, 40, 3, 5, 5, 2, 2, 10, 47, 13, 7, 43, 6, 7, 5, 6, 6, 13, 6, 35, 6, 15, 6, 13, 40, 10, 11, 2, 7, 2, 2, 3, 13, 3, 11, 15, 10, 5, 11, 14, 7, 11, 47, 5, 2, 2, 6, 2, 5, 55, 6, 5, 7, 2, 6, 58, 35, 11, 5, 12, 17, 6, 10, 12, 6, 6, 2, 53, 2, 2, 13, 5, 14, 7, 15, 6, 13, 62, 10, 6, 3, 7, 7, 3, 14, 5, 14, 73, 15, 11, 11, 6, 5, 17, 10, 5, 3, 37, 51, 10, 7, 5, 38, 12, 5, 11, 5, 7, 6, 5, 6, 40, 43, 57, 10, 13, 7, 15, 2, 10, 34, 7, 39, 10, 5, 3, 6, 13, 11, 5, 10, 43, 10, 5, 3, 14, 5, 2, 5, 41, 5, 39, 46, 2, 10, 2, 5, 12, 3, 2, 2, 5, 15, 43, 17, 41, 2, 13, 15, 38, 11, 11, 3, 34, 5, 6, 3, 7, 2, 37, 5, 6, 10, 17, 35, 2, 15, 6, 7, 5, 3, 13, 13, 12, 34, 2, 12, 10, 15, 13, 2, 2, 34, 6, 6, 5, 2, 7, 13, 3, 6, 11, 39, 42, 7, 2, 6, 39, 47, 3, 17, 5, 13, 7, 2, 47, 3, 7, 6, 11, 17, 37, 48, 7, 37, 11, 7, 10, 3, 14, 39, 14, 15, 43, 17, 2, 12, 7, 13, 5, 3, 6, 34, 37, 3, 17, 13, 2, 5, 10, 10, 44, 37, 2, 2, 10, 10, 7, 3, 7, 2, 7, 5, 43, 43, 11, 15, 51, 13, 17, 10, 11, 2, 5, 34, 17, 2, 2, 42, 6, 6, 5, 47, 15, 2, 12, 7, 3, 10, 15, 3, 7, 12, 12, 15, 43, 14, 7, 58, 13, 10, 6, 6, 38, 34, 5, 5, 13, 38, 6, 11, 10, 6, 7, 2, 55, 2, 13, 5, 11, 44, 15, 17, 2, 40, 2, 15, 13, 6, 2, 3, 3, 3, 3, 6, 39, 5, 11, 17, 37, 5, 7, 6, 10, 6, 12, 7, 5, 14, 10, 12, 71, 10, 35, 6, 11, 3, 2, 38, 3, 2, 34, 10, 17, 42, 2, 12, 6, 6, 11, 40, 12, 10, 6, 10, 2, 3, 3, 56, 11, 7, 42, 2, 38, 12, 2, 2, 13, 40, 12, 6, 5, 5, 59, 15, 38, 5, 5, 5, 7, 2, 10, 7, 2, 17, 10, 11, 6, 6, 6, 2, 10, 6, 54, 2, 82, 3, 34, 14, 15, 44, 5, 46, 2, 13, 5, 12, 13, 11, 10, 39, 5, 40, 3, 60, 3, 42, 11, 3, 46, 17, 3, 2, 37, 6, 42, 12, 14, 3, 12, 66, 13, 34, 7, 3, 13, 3, 11, 2, 13, 12, 38, 34, 5, 40, 10, 14, 6, 14, 11, 38, 58, 2, 48, 5, 15, 5, 73, 3, 37, 5, 11, 10, 5, 5, 13, 2, 10, 13, 34, 17, 3, 7, 47, 2, 2, 10, 15, 3, 3, 13, 6, 34, 13, 10, 13, 3, 6, 41, 10, 6, 2, 6, 2, 6, 2, 6, 6, 37, 10, 44, 35, 13, 51, 2, 7, 53, 5, 40, 5, 2, 37, 11, 15, 11, 13, 2, 5, 2, 6, 10, 17, 15, 43, 39, 17, 2, 12, 10, 15, 17, 7, 13, 3, 7, 15, 37, 5, 15, 7, 6, 10, 51, 2, 2, 40, 61, 2, 13, 13, 11, 2, 5, 34, 5, 5, 7, 2, 2, 2, 11, 3, 6, 13, 6, 17, 11, 10, 7, 46, 15, 7, 14, 35, 11, 7, 10, 6, 11, 40, 11, 2, 39, 7, 6, 66, 5, 3, 6, 5, 11, 10, 2, 10, 7, 13, 2, 45, 34, 6, 35, 2, 11, 5, 59, 75, 10, 17, 14, 17, 17, 17, 2, 11, 7, 10, 6, 11, 6, 56, 34, 35, 11, 14, 12, 41, 40, 17, 40, 3, 11, 7, 37, 14, 7, 13, 7, 5, 2, 10, 6, 39, 2, 7, 37, 35, 10, 5, 15, 2, 7, 38, 34, 11, 17, 5, 6, 10, 3, 6, 7, 7, 43, 14, 2, 43, 3, 2, 47, 7, 35, 7, 3, 53, 2, 10, 10, 10, 60, 10, 6, 2, 6, 10, 5, 7, 57, 53, 13, 3, 35, 38, 15, 42, 3, 3, 12, 2, 10, 3, 38, 54, 13, 10, 11, 7, 13, 7, 2, 12, 39, 10, 54, 2, 12, 38, 10, 12, 12, 5, 15, 6, 10, 13, 5, 15, 10, 13, 6, 41, 40, 14, 12, 10, 11, 40, 5, 11, 10, 2, 5, 2, 13, 6, 2, 13, 5, 2, 10, 15, 5, 5, 10, 34, 13, 2, 5, 14, 5, 6, 5, 13, 3, 43, 6, 13, 11, 50, 3, 6, 6, 12, 15, 11, 37, 7, 69, 11, 14, 14, 7, 43, 5, 35, 11, 35, 11, 11, 34, 34, 39, 14, 11, 2, 10, 53, 6, 11, 2, 11, 60, 39, 11, 6, 15, 40, 17, 47, 34, 50, 7, 59, 47, 5, 13, 39, 5, 6, 53, 10, 14, 5, 51, 5, 7, 5, 6, 77, 7, 12, 7, 42, 2, 5, 2, 6, 60, 10, 13, 10, 6, 47, 6, 15, 17, 10, 11, 10, 12, 7, 7, 10, 17, 34, 5, 10, 7, 7, 2, 6, 10, 38, 2, 15, 6, 13, 7, 13, 2, 3, 13, 5, 3, 17, 2, 5, 15, 11, 39, 7, 39, 10, 10, 2, 6, 13, 3, 5, 17, 6, 14, 10, 37, 44, 3, 34, 5, 11, 7, 12, 2, 5, 3, 12, 3, 2, 3, 133, 12, 2, 2, 2, 3, 34, 14, 41, 2, 37, 11, 2, 6, 11, 6, 7, 15, 11, 35, 13, 6, 5, 2, 14, 7, 2 };
printf("primes_copy: %s\n",cudaGetErrorString(cudaMemcpyToSymbol(primes,prms,1024*4)));
/*-----*/
// allocate buffers
unsigned int *dev_input,*dev_count;
printf("alloc_input: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_input,sizeof(int)*(1<<23))));
printf("alloc_count: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_count,4)));
printf("memset_count: %s\n",cudaGetErrorString(cudaMemset(dev_count,0,4)));
find<<<400,32>>>(dev_input,dev_count);
cudaDeviceSynchronize();
unsigned int count;
printf("copy_count: %s\n",cudaGetErrorString(cudaMemcpy(&count,dev_count,4,cudaMemcpyDeviceToHost)));
// sort found elements just to make debbug easier, it is not necessary
unsigned int *backup_numbers = new unsigned int[1000000];
printf("copy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup_numbers,dev_input,4*count,cudaMemcpyDeviceToHost)));
std::sort(backup_numbers,backup_numbers+count);
printf("copy_S_backup: %s\n",cudaGetErrorString(cudaMemcpy(dev_input,backup_numbers,4*count,cudaMemcpyHostToDevice)));
delete[] backup_numbers;
printf("\nsize: %u\n",count);
// allocate buffers
unsigned int *dev_backup, *dev_ans;
printf("malloc_backup: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_backup,sizeof(int)*blocks*threads)));
printf("malloc_ans: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_ans,4)));
printf("memset_ans: %s\n",cudaGetErrorString(cudaMemset(dev_ans,0xFF,4)));
solve<<<blocks,threads>>>(dev_input,dev_count,dev_backup,dev_ans);
cudaDeviceSynchronize();
unsigned int ans,*backup;
printf("memcpy_ans: %s\n",cudaGetErrorString(cudaMemcpy(&ans,dev_ans,4,cudaMemcpyDeviceToHost)));
backup = new unsigned int[400*32];
printf("memcpy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup,dev_backup,4*blocks*threads,cudaMemcpyDeviceToHost)));
unsigned int backup_ans = ~0;
// compute global result using backuped thread_results
// notice backup_ans and ans MUST be the same, but they are NOT (WHY!)
for(int i=0;i<threads*blocks;++i) backup_ans&=backup[i];
printf("ans: %u\nbackup_ans %u\n",ans,backup_ans);
printf("%u\n",backup[48]);
delete[] backup;
cudaFree(dev_ans);
cudaFree(dev_backup);
cudaFree(dev_count);
cudaFree(dev_input);
}
Весь код, кроме метода execute (), работает как намерение. Метод solve () просто вычисляет ерунду (потому что backup_ans и ans различаются), а также выдает ошибку "неопределенная ошибка запуска" на последних двух cudaMemcpy. Когда я бегу решить<<<1,1>>>(...) я получил
ans: 134816642 backup_ans 432501552
но когда я бегу решать<<<400,32>>>(...) это дает мне
ans: 134816642 backup_ans 0 (правильный ответ должен быть 0)
Во всех ситуациях он должен вычислять backup_ans=ans=0
Любой совет, что я делаю неправильно, будет полезен.
Код для генерации primes.bin
#include <cstdlib>
#include <stdio.h>
using namespace std;
const unsigned long long n = 1ll<<32;
const int buffer_size = 2000000;
typedef unsigned char uch;
typedef unsigned int uint;
typedef unsigned long long ull;
uch *primes;
int prime(long long x) {
if(x==2) return 1;
if(x%2==0) return 0;
long long pos = x/16;
long long index = (x&15)>>1;
return (1<<index)&(~(primes[pos]));
}
void eratosten_sieve(void) {
long long pos;
long long index;
for(long long i=3;i*i<n;++i) {
if(!prime(i)) continue;
for(long long j=i*i;j<n;j+=(i<<1)) {
pos = j/16;
index = ((j&15)>>1);
primes[pos]|=(1<<index);
}
}
}
int main(void) {
primes = new uch[(n/16)+1];
for(long long i=0;i<(n/16)+1;++i) primes[i]=0;
printf("generating\n");
eratosten_sieve();
int l = n/16 +1;
printf("writing\n");
FILE *f = fopen("primes.bin","wb");
fwrite(primes,1,l,f);
fclose(f);
printf("done\n");
delete[] primes;
}
PS: я компилирую это с помощью nvcc -arch compute_11
CUDA Driver Version / Runtime Version 5.5 / 5.5
CUDA Capability Major/Minor version number: 1.1
Total amount of global memory: 1023 MBytes (1073020928 bytes)
(14) Multiprocessors, ( 8) CUDA Cores/MP: 112 CUDA Cores
GPU Clock rate: 1500 MHz (1.50 GHz)
Memory Clock rate: 900 Mhz
Memory Bus Width: 256-bit
Maximum Texture Dimension Size (x,y,z) 1D=(8192), 2D=(65536, 32768), 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers 1D=(8192), 512 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(8192, 8192), 512 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 8192
Warp size: 32
Maximum number of threads per multiprocessor: 768
Maximum number of threads per block: 512
Max dimension size of a thread block (x,y,z): (512, 512, 64)
Max dimension size of a grid size (x,y,z): (65535, 65535, 1)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 256 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): No
Device PCI Bus ID / PCI location ID: 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.5, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce 9800 GT
Result = PASS
1 ответ
ОК, тебе не хватает памяти. Мне потребовалось некоторое время, чтобы понять, потому что я не думал о большом статическом распределении:
__device__ unsigned char primes[(1<<28)+1];
Обычно, когда людям не хватает памяти, они обнаруживают это на cudaMalloc
операция. В вашем случае у вашего GPU есть 1 ГБ памяти, и я предполагаю, что вы также размещаете на нем дисплей (вы не ответили на этот вопрос). Посмотрите, сколько свободной памяти есть в nvidia-smi -a
На выходе это будет выглядеть примерно так:
FB Memory Usage
Total : 1535 MiB
Used : 3 MiB
Free : 1532 MiB
Ваши номера будут меньше - мы заботимся о бесплатной линии.
Ваши динамические распределения (т.е. от cudaMalloc
) выделяют около 350МБ. Но запуск ядра приводит в действие статическое распределение, и тогда ваш общий объем увеличивается до 700 МБ (2^28 превышает 250 МБ). Если у вас есть дисплей, работающий на этом графическом процессоре, он будет использовать часть 1 ГБ памяти, в результате чего вам не хватит для запуска ядра, требующего 700 МБ.
Если вы хотите работать на этом графическом процессоре, посмотрите, сможете ли вы как-то уменьшить размер вашей проблемы.
И всегда хорошо делать правильную проверку ошибок cuda, но кроме этой проблемы ваш код, кажется, работает без ошибок для меня на устройствах с большим объемом памяти.