配列の最初の要素のみを5.0に設定しようとしています(たとえば)。つまり、スレッドの1つだけが値を設定し、残りのスレッドは何もしません。
これが私の完全なコードです
#include <stdio.h>
#include <cuda.h>
#define GPUERRCHK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void writeBuf( char * fName, float * out_frame, int dim )
{
FILE * fp = fopen( fName, "w+" );
int baseIndx = 0;
for( int i=0 ; i<dim ; i++ )
{
for( int j=0 ; j<dim ; j++ )
{
fprintf( fp, "%f ", out_frame[ dim + j ] );
}
baseIndx += dim;
fprintf( fp, "\n" );
}
fclose( fp );
}
__global__ void kernel( float * s1, float * s2, int dim, int * hx, int *hy, float *hT, int nHeaters )
{
int x = threadIdx.x + blockIdx.x*blockDim.x;
int y = threadIdx.y + blockIdx.y*blockDim.y;
int offset = x + y*blockDim.x*gridDim.x;
if( offset < 1 )
{
s2[0] = 1.0;
}
__syncthreads();
}
int main()
{
srand48( time(NULL) );
int dim = 1024;
float *dev_s1, *dev_s2;
GPUERRCHK( cudaMalloc( (void**)&dev_s1, dim*dim * sizeof(float) ));
GPUERRCHK( cudaMalloc( (void**)&dev_s2, dim*dim * sizeof(float) ));
GPUERRCHK( cudaMemset( dev_s1, 0x00, dim*dim * sizeof(float) ));
GPUERRCHK( cudaMemset( dev_s2, 0x00, dim*dim * sizeof(float) ));
//heaters
int *dev_hx, *dev_hy;
float *dev_hT;
int nHeaters = 20;
GPUERRCHK( cudaMalloc( (void**)&dev_hx, nHeaters * sizeof(int) ));
GPUERRCHK( cudaMalloc( (void**)&dev_hy, nHeaters * sizeof(int) ));
GPUERRCHK( cudaMalloc( (void**)&dev_hT, nHeaters * sizeof(float) ));
//init heaters on cpu
int * hx, *hy;
float * hT;
hx = (int*) malloc( nHeaters * sizeof(int) );
hy = (int*) malloc( nHeaters * sizeof(int) );
hT = (float*) malloc( nHeaters * sizeof(float) );
for( int i=0 ; i<nHeaters ; i++ )
{
hx[i] = (int) ((float)drand48() * (float)dim) + 5;
hy[i] = (int) (drand48() * dim) + 5;
hT[i] = (float) (drand48() * 100) + 50;
}
//transfer hx, hy, hT to GPU
GPUERRCHK( cudaMemcpy( dev_hx, hx, nHeaters * sizeof(int), cudaMemcpyHostToDevice ));
GPUERRCHK( cudaMemcpy( dev_hy, hy, nHeaters * sizeof(int), cudaMemcpyHostToDevice ));
GPUERRCHK( cudaMemcpy( dev_hT, hT, nHeaters * sizeof(float), cudaMemcpyHostToDevice ));
float *out_frame = (float *) malloc( dim*dim*sizeof(float) );
// run kernel
int nThreadsPerBlock = 16;
int nBlockX = (dim+nThreadsPerBlock-1)/nThreadsPerBlock;
int nBlockY = (dim+nThreadsPerBlock-1)/nThreadsPerBlock;
kernel<<< dim3(nBlockX, nBlockY), dim3(nThreadsPerBlock, nThreadsPerBlock) >>>( dev_s1, dev_s2, dim, dev_hx, dev_hy, dev_hT, nHeaters );
GPUERRCHK( cudaPeekAtLastError() );
GPUERRCHK( cudaDeviceSynchronize() );
// collect result
GPUERRCHK( cudaMemcpy( out_frame, dev_s2, dim*dim * sizeof(float), cudaMemcpyDeviceToHost ) );
int f=1;
char fName[100];
snprintf( fName, 100, "out/file_%04d.data", f );
writeBuf( fName, out_frame, dim );
cudaFree( dev_s1 );
cudaFree( dev_s2 );
free( out_frame );
}
これを実行すると、ファイルにすべてゼロが含まれます。達成しようとしていることをどのように達成しますか?何が問題なのですか?