私の 1D 波動方程式は、C/C++ よりも CUDA で実装すると遅くなります。誰が私が間違っているのか教えてもらえますか? これが私のコードです:
__global__ void Solver1d(float* up, float* u, float* um)
{
int id;
float dx,dt;
dx = (float)L/n;
dt = (float)dx/c;
float r= c*((float)dt/dx);
float R = r*r;
// index mapping between data and threads
id = threadIdx.x + blockIdx.x*blockDim.x;
// Allowing all threads in the range of valids data to execute
if (id<n)
{
if(id==0)
{
up[id]=0;
}
else if(id==n-1)
{
up[n-1]=0;
}
else
{
up[id] = 2*u[id]-um[id]+R*(u[id+1]-2*u[id]+u[id-1]);
}
}
}
// main program
int main(int argc, char *argv[])
{
// declare all variables
int i;
float inner,L2_exact,ue[n],dx,dt;
dx = (float)L/n;
dt = (float)(0.05*dx/c); // Max time step
float r= c*((float)dt/dx);
float R = r*r;
// Allocate memory on host
//float u=(float *)malloc((n)*sizeof(float));
//float um=(float *)malloc((n)*sizeof(float));
float up[n],um[n],u[n];
//Pointers for device memory allocation
float *dev_up, *dev_u, *dev_um;
// Allocating memory to device (GPU)
HANDLE_ERROR(cudaMalloc((void**)&dev_up, n*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&dev_u, n*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&dev_um, n*sizeof(float)));
cudaEvent_t start, stop;
float elapsedTime;
// Start timer
HANDLE_ERROR(cudaEventCreate( &start ));
HANDLE_ERROR(cudaEventCreate( &stop ));
HANDLE_ERROR(cudaEventRecord( start,0 ));
//Initialize the stream
cudaStream_t stream;
HANDLE_ERROR(cudaStreamCreate( &stream ));
//Initial condition
for(i=0;i<n;i++)
{
u[i]=sin(2*PI*i*dx);
//printf("Initialization ok\n");
}
// Enforcing special formula for t = -1
for(i=1;i<n-1 ;i++)
{
um[0] = 0;
um[n-1] = 0;
um[i] = u[i] + 0.5*R*(u[i-1] - 2*u[i] + u[i+1]); //+ 0.5*dt*dt*f(i*dx,t)
//printf("um is runing fine\n");
}
// setting blocks and threads numbers
int noThreads=128;
dim3 dimBlock(noThreads,1,1);
dim3 dimGrid(1+n/(noThreads-1),1,1);
// move u and um to GPU
HANDLE_ERROR(cudaMemcpy(dev_u, u, n*sizeof(float), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_um, um, n*sizeof(float), cudaMemcpyHostToDevice));
float t=0;
//int counter=0;
while(t<=T)
{
//counter++;
t += dt;
Solver1d<<<dimGrid,dimBlock>>>(dev_up,dev_u,dev_um);
// cudaDeviceSynchronize();
for(i=0;i<n;i++)
{
um[i] = u[i];
u[i] = up[i];
}
}
HANDLE_ERROR(cudaEventRecord( stop,0 ));
HANDLE_ERROR(cudaEventSynchronize( stop ));
HANDLE_ERROR(cudaEventElapsedTime( &elapsedTime,start,stop ));
HANDLE_ERROR(cudaEventDestroy( start ));
HANDLE_ERROR(cudaEventDestroy( stop ));
printf("elapsed time: %lf sec\n",elapsedTime/1000);
// move the solution up from GPU to CPU
HANDLE_ERROR(cudaMemcpy(up, dev_up, n*sizeof(float), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(u, dev_u, n*sizeof(float), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(um, dev_um, n*sizeof(float), cudaMemcpyDeviceToHost));
int j;
float L2cpuSolution=0.0;
float L2gpuSolution=0.0;
float ERROR_PERCENTAGE=0.0;
// Verification with exact solution
for(j=0;j<(n);j++)
{
//printf("up[%d]=%.12g\n",j,up[j]);
ue[j]=0.5*(sin(2*PI*(j*dx+c*T))+sin(2*PI*(j*dx-c*T)));
//printf("um[%d]=%.12g\n",j,um[j]);
inner += (ue[j]-up[j])*(ue[j]-up[j]);
L2cpuSolution += ue[j]*ue[j];
L2gpuSolution += up[j]*up[j];
}
L2cpuSolution = sqrt(L2cpuSolution)/n;
L2gpuSolution = sqrt(L2gpuSolution)/n;
L2_exact = sqrt(inner/(n));
ERROR_PERCENTAGE = 100*(L2_exact/L2cpuSolution);
printf("L2_exact=%lf\n",L2_exact);
printf("gpul2=%lf, and cpuL2=%lf \n",L2gpuSolution,L2cpuSolution);
printf("ERROR_PERCENTAGE= %lf\n", ERROR_PERCENTAGE);
// Free device memory
cudaFree(dev_up);
cudaFree(dev_u);
cudaFree(dev_um);
return 0;
}