cuda - cudaプログラミングにおけるfloat型とdouble型の違いは何ですか?

Question

私はcudaプログラミングの初心者です。私のプログラム (共有メモリを使用した行列乗算) では、block_size=20 を定義しました。行列が 1200*1200 の場合、プログラムは double 要素で動作しますが、float 要素では動作しません (要素が float の場合、840*840 行列で動作します)。私の質問は、float 型が double より小さいことはわかっていますが、なぜそれが起こるのかということです。

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
#include <stdio.h>
#define BLOCK_SIZE 20
typedef struct {
int width;
int height;
int stride; 
float* elements;
} Matrix;
// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
return A.elements[row * A.stride + col];
}
// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col,
float value)
{
A.elements[row * A.stride + col] = value;
}
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
Matrix Asub;

Asub.width = BLOCK_SIZE;
Asub.height = BLOCK_SIZE;
Asub.stride = A.stride;
Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row+ BLOCK_SIZE * col];
return Asub;
}
// Thread block size
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{

// Load A and B to device memory
Matrix d_A;
d_A.width = d_A.stride = A.width; d_A.height = A.height;
siz e_t size = A.width * A.height * sizeof(float);
cudaMalloc((void **)&d_A.elements, size);
cudaMemcpy(d_A.elements, A.elements, size,
cudaMemcpyHostToDevice);
Matrix d_B; 
d_B.width = d_B.stride = B.width; d_B.height = B.height;
size = B.width * B.height * sizeof(float);
cudaMalloc((void **)&d_B.elements, size);
cudaMemcpy(d_B.elements, B.elements, size,
cudaMemcpyHostToDevice);
// Allocate C in device memory
Matrix d_C;
d_C.width = d_C.stride = C.width; d_C.height = C.height;
size = C.width * C.height * sizeof(float);
cudaMalloc((void **)&d_C.elements, size);
// Invoke kernel
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
//dim3 dimBlock(C.height, C.width);
//dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
dim3 dimGrid((B.width+dimBlock.x-1) / dimBlock.x, (A.height+dimBlock.y-1) /dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
// Read C from device memory
cudaMemcpy(C.elements, d_C.elements, size,
cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// Block row and column
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
// Each thread block computes one sub-matrix Csub of C
Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
// Each thread computes one element of Csub
// by accumulating results into Cvalue
float Cvalue = 0;
// Thread row and column within Csub
int row = threadIdx.y;
int col = threadIdx.x;
// Loop over all the sub-matrices of A and B that are
// required to compute Csub
// Multiply each pair of sub-matrices together
// and accumulate the results
for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
// Get sub-matrix Asub of A
Matrix Asub = GetSubMatrix(A, blockRow, m);
// Get sub-matrix Bsub of B
Matrix Bsub = GetSubMatrix(B, m, blockCol);
// Shared memory used to store Asub and Bsub respectively
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load Asub and Bsub from device memory to shared memory
// Each thread loads one element of each sub-matrix
As[row][col] = GetElement(Asub, row, col);
Bs[row][col] = GetElement(Bsub, row, col);
// Synchronize to make sure the sub-matrices are loaded
// before starting the computation
 __syncthreads();
// Multiply Asub and Bsub together
for (int e = 0; e < BLOCK_SIZE; ++e)
Cvalue += As[row][e] * Bs[e][col];
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write Csub to device memory
// Each thread writes one element
SetElement(Csub, row, col, Cvalue);
}
//////////////////////////////////////////////////////////
/// print_matrix function ///////////////////////////
////////////////////////////////////////////////////////
void print_matrix(float *c,int row,int col){
for (int i = 0; i < row; ++i){
for (int j = 0; j < col; ++j)
printf("%f ",c[col*i +j]);
printf("\n\n");
}
}
//////////////////////////////////////////////////////////
/// random_init function ///////////////////////////
////////////////////////////////////////////////////////
void random_init(float *a,int size){
for(int i=0;i<size;i++)
a[i]=rand()%10;
}
////////////////////////////////////////////////////////
int main(void){

//////////////////////////////////////////////////////\|/
cudaEvent_t start,stop;
///////////////////////////////////////////////////////|\

Matrix A,B,C;
A.width=1200;
A.height=1200;/////
B.width=1200;/////
B.height=1200;
C.width=B.width;
C.height=A.height;

size_t size = A.width * A.height * sizeof(float);
A.elements = (float *)malloc(size);
//random_init(A.elements,A.width * A.height );
size = B.width * B.height * sizeof(float);
B.elements= (float *)malloc(size);
//random_init(B.elements,B.width * B.height);
size = C.width * C.height * sizeof(float);
C.elements= (float *)malloc(size);
for(int i=0;i<A.width*A.height;i++)
A.elements[i]=1;
for(int i=0;i<B.width*B.height;i++)
B.elements[i]=1;
printf("matrix A(%d,%d) & matrix B(%d,%d) & matrix   C(%d,%d)\n",A.width,A.height,B.width,
B.height,C.width,C.height);
//////////////////////////////////////////////////////\|/
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
///////////////////////////////////////////////////////|\

MatMul(A,B,C);
//////////////////////////////////////////////////////\|/
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime,start,stop);
printf("Time to genreat : %3.5f ms\n",elapsedTime);
///////////////////////////////////////////////////////|\
printf("\nC\n");
//print_matrix(C.elements,C.height,C.width);


printf("C[%d]=%f\n",0,C.elements[0]);
printf("C[%d]=%f\n",C.width -1,C.elements[C.width-1]);
printf("C[%d]=%f\n",(C.width * C.height)-1,C.elements[(C.width * C.height)-1]);

getchar();
return(0);
}

score 2 · Accepted Answer

The following message:

"“display driver stopped responding and has recovered”"

is an indication that you have run into a windows TDR event.

Under windows, kernels that take too long to execute will cause the windows display watchdog timer to reset the display device, which will cause CUDA code execution to be terminated. Kernels that require more than about 2 seconds to execute may run into this.

If you search on "windows TDR" you will find other descriptions and possible methods to work around this. You might also investigate why your code is taking longer to execute after you make the changes.

cuda - cudaプログラミングにおけるfloat型とdouble型の違いは何ですか?

1 に答える 1

Related

Reference