c - CUDA を使用しているときに malloc() と calloc() が機能していないように見えるのはなぜですか?

Question

CUDA で使用すると、malloc() / calloc() を使用した動的メモリ割り当てが正しく機能していないようです。

確認のために、calloc() を使用して次のコードを書きました。配列には必要なメモリが割り当てられているようで、値を代入することもできます。しかし、カーネルから行列要素を出力すると、ゴミ値しか表示されませんでした。cudaMemcpy() に問題があるのかもしれないと思いましたが、**A の代わりに A[5][5] のように宣言すると、コードは完璧に動作します。

また、memset() を使用すると「コアダンプ」エラーが発生します。

malloc() / calloc() をエラーなしで使えるように手伝ってくれる人はいますか?

#include<stdio.h>

// Prints, from the device, one element of dA per launched thread.
// dA is read as a flat int array: element (gi, gj) is fetched from dA[gi*5+gj].
__global__ void threads(int* dA)
{
 // Global thread coordinates along x and y.
 int gi=threadIdx.x+(blockIdx.x*blockDim.x);
 int gj=threadIdx.y+(blockIdx.y*blockDim.y);

 // NOTE(review): the row stride is hard-coded to 5, while main() in this listing
 // fills rows of C=4 ints and allocates R*C=20 ints on the device; with gi up to 4
 // and gj up to 3 the index reaches 23 — confirm the intended stride.
 printf("global Id in X= %d, in Y =%d, E= %d\n", gi,gj,dA[gi*5+gj]);
}

// Builds an R x C host matrix as an array of row pointers, prints it, copies
// `size` bytes starting at A to the device and launches the `threads` kernel.
// This is the listing the question is about; the comments below point out where
// the host layout and the device copy disagree.
int main(int argc, char** argv)
{
 int **A, *dA;
 int R=5, C=4;
 // Byte count of R*C contiguous ints.
 int size=R*C*sizeof(int);

 // A is an array of R row pointers, not a contiguous block of R*C ints.
 A=(int **)calloc(R, sizeof(int*));

 // Each row is a separate allocation of C ints.
 for(int i=0; i<R; i++)
    A[i]=(int *)calloc(C, sizeof(int));

 // The disabled call below would zero `size` bytes starting at A, i.e. it would
 // overwrite the row pointers stored in A (whose allocation is only
 // R*sizeof(int*) bytes), so the A[i][j] writes that follow would go through
 // zeroed pointers.
// memset(A, 0, size);

 // Fill element (i, j) with its row-major index i*C+j.
 for(int i=0; i<R; i++)
   {
   for(int j=0; j<C; j++)
      A[i][j]=i*C+j;
   }

// Print the matrix on the host before the transfer.
printf(" \n Before \n");
for(int i=0; i<R; i++)
   {
    for(int j=0; j<C; j++)
        printf("%d ",A[i][j]);
    printf("\n");
   }

// Return codes of the CUDA calls below are not checked.
cudaMalloc((int**) &dA, size);
// The source is A itself, so what gets copied is the array of host row pointers
// (and whatever bytes follow it up to `size`), not the int elements in the rows.
cudaMemcpy(dA, A, size, cudaMemcpyHostToDevice);

// One block per (row, column) pair, one thread per block.
dim3 nblocks(R,C);
dim3 nthreads(1);

threads<<<nblocks, nthreads>>>(dA);
// Wait for the kernel to finish.
cudaDeviceSynchronize();

cudaFree(dA);
// Only the pointer array is released here; the R rows allocated above are not freed.
free(A);
return 0;
}

score 3 · Accepted Answer

コードの問題は、ホスト関数である malloc と calloc の使用とは関係ありません。問題は、ダブルポインターと、それらを CUDA カーネルに渡す方法を正しく処理していないことです。Robert Crovella が指摘したように、適切なエラーチェックを行っていれば、実装に何が欠けているかをより深く理解できたはずです。

以下は、プログラムの動作するバージョンです。これは、「cuda 2D array problem」で talonmies が提供した回答を適用したものにすぎません。

#include<stdio.h>
#include<conio.h>

/**
 * Report a failed CUDA runtime call on stderr and optionally terminate.
 *
 * @param code   Status value returned by a CUDA runtime API call.
 * @param file   Source file of the call site (the macro below passes __FILE__).
 * @param line   Source line of the call site (the macro below passes __LINE__).
 * @param Abort  When true (the default), exit the process using `code` as the
 *               exit status after printing the message.
 *
 * `file` is `const char*` because __FILE__ is a string literal; binding a
 * literal to a non-const `char*` is not valid in modern C++.
 */
inline void GPUassert(cudaError_t code, const char* file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (Abort) exit(code);
    }
}

// Wrap a CUDA runtime call so that a failure is reported with its call site.
// The do/while(0) form makes the macro behave as exactly one statement, so it
// stays safe inside unbraced if/else bodies.
#define GPUerrchk(ans) do { GPUassert((ans), __FILE__, __LINE__); } while (0)

// Prints one matrix element per block.
// dA is an array of row pointers; blockIdx.x picks the row and blockIdx.y the
// column, so element (row, col) is read as dA[row][col].
__global__ void threads(int* dA[]) {
    const int row = blockIdx.x;
    const int col = blockIdx.y;

    // Fetch the row pointer first, then the element inside that row.
    const int* rowPtr = dA[row];
    const int element = rowPtr[col];

    printf("global Id in X= %i, in Y =%i, E= %i\n", row, col, element);
}

/**
 * Builds an R x C host matrix as an array of row pointers, mirrors that layout
 * on the device (one device allocation per row plus a device array holding the
 * row addresses), launches `threads` with one block per element, and releases
 * every allocation before returning.
 *
 * Returns 0 on success and 1 if a host allocation fails; CUDA failures are
 * reported and handled by GPUerrchk.
 */
int main(int argc, char** argv)
{
    const int R=5, C=4;

    // --- Host matrix: R separately allocated rows of C ints
    int** A=(int**)calloc(R,sizeof(int*));
    if (A == NULL) { fprintf(stderr, "host allocation failed\n"); return 1; }
    for(int i=0; i<R; i++) {
        A[i]=(int*)calloc(C,sizeof(int));
        if (A[i] == NULL) { fprintf(stderr, "host allocation failed\n"); return 1; }
    }
    for(int i=0; i<R; i++) for(int j=0; j<C; j++) A[i][j]=i*C+j;

    printf("Before transfer \n");
    for(int i=0; i<R; i++) { for(int j=0; j<C; j++) { printf("%d ",A[i][j]); } printf("\n"); }
    printf("\n");

    // --- Create an array of R pointers on the host
    int** h_A = (int**)malloc(R*sizeof(int*));
    if (h_A == NULL) { fprintf(stderr, "host allocation failed\n"); return 1; }
    for(int i=0; i<R;i++){
        // --- For each array pointer, allocate space for C ints on the device
        GPUerrchk(cudaMalloc((void**)&h_A[i], C*sizeof(int)));
        // --- Copy the rows of A from host to device at the address determined by h_A[i]
        GPUerrchk(cudaMemcpy(h_A[i], &A[i][0], C*sizeof(int), cudaMemcpyHostToDevice));
    }

    // --- Create an array of R pointers on the device
    int **d_A = NULL;
    GPUerrchk(cudaMalloc((void**)&d_A, R*sizeof(int*)));
    // --- Copy the addresses of the rows of the device matrix from host to device
    GPUerrchk(cudaMemcpy(d_A, h_A, R*sizeof(int*), cudaMemcpyHostToDevice));

    // --- One block per matrix element, one thread per block
    dim3 nblocks(R,C);
    dim3 nthreads(1);

    printf("After transfer \n");
    threads<<<nblocks, nthreads>>>(d_A);
    // --- Launch-configuration errors show up here
    GPUerrchk(cudaPeekAtLastError());

    // --- Errors raised while the kernel runs show up at the synchronization point
    GPUerrchk(cudaDeviceSynchronize());

    getch();

    // --- Release device memory: the pointer table first, then each row
    GPUerrchk(cudaFree(d_A));
    for(int i=0; i<R; i++) GPUerrchk(cudaFree(h_A[i]));

    // --- Release host memory
    free(h_A);
    for(int i=0; i<R; i++) free(A[i]);
    free(A);

    return 0;

}

「cuda 2D array problem」でも強調されているように、この煩雑な配列処理を避けるために、2D 配列を 1D にフラット化することを常にお勧めします。

c - CUDA を使用しているときに malloc() と calloc() が機能していないように見えるのはなぜですか?

1 件の回答

Related

Reference