c - cuda マグマ行列 - 行列加算カーネル

Question

magmablas_sgeadd_q カーネルと同様の形式を使用してみましたが、適切な出力が得られず、実行するたびに異なる出力が得られます。私が使用したコードを以下に示します。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

#define BLK_X 2
#define BLK_Y 1

__global__ void matrixAdd2( const float *dA, const float *dB, float *dC, int m, int n)
{
int ldda = m;
int lddb = m;

int ind = blockIdx.x*BLK_X + threadIdx.x;
int iby = blockIdx.y*BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind < m ) {
    dA += ind + iby*ldda;
    dB += ind + iby*lddb;
    if ( full ) 
    {
        // full block-column
        #pragma unroll
        for( int j=0; j < BLK_Y; ++j )
        {
            dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
            printf("A is %f, B is %f, C is %f  \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
        }
    }
    else 
    {
        // partial block-column
        for( int j=0; j < BLK_Y && iby+j < n; ++j )
        {
            dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
            printf("parital: A is %f, B is %f, C is %f  \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
        }
    }
}
}



int main ( void )
{

int m = 4; // a - mxn matrix
int n = 2; // b - mxn matrix

size_t size =  m * n * sizeof(float);


printf("Matrix addition of %d rows and %d columns \n", m, n);

// allocate matrices on the host

float *h_A = (float *)malloc(size); // a- mxn matrix on the host
float *h_B = (float *)malloc(size); // b- mxn matrix on the host
float *h_C = (float *)malloc(size); // b- mxn matrix on the host


// Initialize the host input matrixs
for (int i = 0; i < m; ++i)
{
    for (int j = 0; j < n ; j ++)
    {
        h_A[i*m+j] = rand()/(float)RAND_MAX;
        h_B[i*m+j] = rand()/(float)RAND_MAX;

    }
}

// Allocate the device input matrix A   
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device

// Allocate the device input matrix B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);

// Allocate the device output matrix C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);

// Copy the host input matrixs A and B in host memory to the device input    matrixs in device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

// defining number of threads and blocks    
dim3 threads( BLK_X, 1 );
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y) );


// Launching kernel     
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);

// Copy the device result matrix in device memory to the host result matrix in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

//print A matrix 
printf("Matrix A");
for (int i = 0; i < m; i++)
{
    for (int j = 0; j < n; j++)
   {
        printf(" %f", h_A[i*m+j]);

    }
    printf("\n");
}

// print B matrix if required
printf("Matrix B");
for (int i = 0; i < m; i++)
{
    for (int j = 0; j < n; j++)
    {

        printf(" %f", h_B[i*m+j]);

    }
    printf("\n");
}

//Error checkng
printf("Matrix C ");
for (int i = 0; i < m; i++)
{
    for (int j = 0; j < n; j++)
   {    
        printf("%f", h_C[i*m+j]);
        if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j] )
        { 
            flag = flag + 1;
        }
    }
    printf("\n");
}

if(flag==m*n)
{
printf("Test PASSED\n");
}


// Free device global memory
err = cudaFree(d_A);

err = cudaFree(d_B);

err = cudaFree(d_C);

// Free host memory
free(h_A);
free(h_B);
free(h_C);


err = cudaDeviceReset();
printf("Done\n");
return 0;

}

私が得た出力：

4 行 2 列の行列加算ホストメモリから CUDA デバイスに入力データをコピー 2 スレッドの 4 ブロックで CUDA カーネルを起動 CUDA デバイスからホストメモリに出力データをコピー Aは 0.000000、B は 0.364784、C は 0.364784 Aは0.000000、Bは0.952230、Cは0.952230 Aは0.000000、Bは0.000000、Cは0.000000、Cは0.000000 Aは0.000000、Bは0.000000、Cは 0.840188、Bは0.394383、Cは1.234571 Aです。 0.798440、C は 1.581539 Aは 0.911647、B は 0.197551、C は 1.109199 Aは 0.335223、B は 0.768230、C は 1.103452

マトリックス A

0.840188 0.783099 0.911647 0.335223 0.277775 0.477397 0.364784 0.952230

マトリックス B

0.394383 0.798440 0.197551 0.768230 0.553970 0.628871 0.000000 0.000000

マトリックス C

0.0000000.000000 0.0000000.000000 0.0000000.000000 0.0000000.000000

コードに問題がある場合はお知らせください。

ありがとうございました

score 1 · Accepted Answer

私が見つけた2つのコーディングエラーがありました：

このメソッドを使用して行列dAとdBカーネルのベースポインタを「バンプ」する場合、matrix のベースポインタに対しても同じことを行う必要がありdCます。
```
if ( ind < m ) {
    dA += ind + iby*ldda;
    dB += ind + iby*lddb;
    dC += ind + iby*lddb;  // add this line
```
ネストされた for ループのホストコードが正しくインデックス付けされていません。外側のループは行全体にインデックスを作成することを目的としており、行nがありますが、外側のループがm行全体にインデックスを作成できるようにしています。
```
for (int i = 0; i < m; ++i)
{
    for (int j = 0; j < n ; j ++)
```
したがって、ここで実際のインデックス計算を行うと、次のようになります。
```
    h_A[i*m+j] = rand()/(float)RAND_MAX;
```
範囲外のインデックスを作成しています。(i*mの一部の値では、行列のサイズを超えていますi) この問題は、ホストコード内のネストされたすべての for ループで繰り返されます。修正は、 ,ループのm,n境界を逆にすることです。ij

次のコードでは、これらのエラーが修正されています (さらに、省略した変数定義のいくつかの追加 -現在投稿したコードerrでflagは定義されていないため、コンパイルエラーが発生します)。正しく実行され、正しい結果が生成されるようです。

$ cat t1213.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

#define BLK_X 2
#define BLK_Y 1

__global__ void matrixAdd2( const float *dA, const float *dB, float *dC, int m, int n)
{
int ldda = m;
int lddb = m;

int ind = blockIdx.x*BLK_X + threadIdx.x;
int iby = blockIdx.y*BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind < m ) {
    dA += ind + iby*ldda;
    dB += ind + iby*lddb;
    dC += ind + iby*lddb;
    if ( full )
    {
        // full block-column
        #pragma unroll
        for( int j=0; j < BLK_Y; ++j )
        {
            dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
            printf("A is %f, B is %f, C is %f  \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
        }
    }
    else
    {
        // partial block-column
        for( int j=0; j < BLK_Y && iby+j < n; ++j )
        {
            dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
            printf("parital: A is %f, B is %f, C is %f  \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
        }
    }
}
}



int main ( void )
{

int m = 4; // a - mxn matrix
int n = 2; // b - mxn matrix

size_t size =  m * n * sizeof(float);


printf("Matrix addition of %d rows and %d columns \n", m, n);

// allocate matrices on the host

float *h_A = (float *)malloc(size); // a- mxn matrix on the host
float *h_B = (float *)malloc(size); // b- mxn matrix on the host
float *h_C = (float *)malloc(size); // b- mxn matrix on the host


// Initialize the host input matrixs
for (int i = 0; i < n; ++i)
{
    for (int j = 0; j < m ; j ++)
    {
        h_A[i*m+j] = rand()/(float)RAND_MAX;
        h_B[i*m+j] = rand()/(float)RAND_MAX;

    }
}

// Allocate the device input matrix A
float *d_A = NULL;
cudaError_t err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device

// Allocate the device input matrix B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);

// Allocate the device output matrix C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);

// Copy the host input matrixs A and B in host memory to the device input    matrixs in device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

// defining number of threads and blocks
dim3 threads( BLK_X, BLK_Y );
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y) );


// Launching kernel
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);

// Copy the device result matrix in device memory to the host result matrix in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

//print A matrix
printf("Matrix A");
for (int i = 0; i < n; i++)
{
    for (int j = 0; j < m; j++)
   {
        printf(" %f", h_A[i*m+j]);

    }
    printf("\n");
}

// print B matrix if required
printf("Matrix B");
for (int i = 0; i < n; i++)
{
    for (int j = 0; j < m; j++)
    {

        printf(" %f", h_B[i*m+j]);

    }
    printf("\n");
}
int flag = 0;
//Error checkng
printf("Matrix C ");
for (int i = 0; i < n; i++)
{
    for (int j = 0; j < m; j++)
   {
        printf("%f", h_C[i*m+j]);
        if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j] )
        {
            flag = flag + 1;
        }
    }
    printf("\n");
}

if(flag==m*n)
{
printf("Test PASSED\n");
}


// Free device global memory
err = cudaFree(d_A);

err = cudaFree(d_B);

err = cudaFree(d_C);

// Free host memory
free(h_A);
free(h_B);
free(h_C);


err = cudaDeviceReset();
printf("Done\n");
return 0;

}
$ nvcc -o t1213 t1213.cu
$ cuda-memcheck ./t1213
========= CUDA-MEMCHECK
Matrix addition of 4 rows and 2 columns
Copy input data from the host memory to the CUDA device
Copy output data from the CUDA device to the host memory
A is 0.277775, B is 0.553970, C is 0.831745
A is 0.477397, B is 0.628871, C is 1.106268
A is 0.364784, B is 0.513401, C is 0.878185
A is 0.952230, B is 0.916195, C is 1.868425
A is 0.911647, B is 0.197551, C is 1.109199
A is 0.335223, B is 0.768230, C is 1.103452
A is 0.840188, B is 0.394383, C is 1.234571
A is 0.783099, B is 0.798440, C is 1.581539
Matrix A 0.840188 0.783099 0.911647 0.335223
 0.277775 0.477397 0.364784 0.952230
Matrix B 0.394383 0.798440 0.197551 0.768230
 0.553970 0.628871 0.513401 0.916195
Matrix C 1.2345711.5815391.1091991.103452
0.8317451.1062680.8781851.868425
Test PASSED
Done
========= ERROR SUMMARY: 0 errors
$

c - cuda マグマ 行列 - 行列加算カーネル

1 に答える 1

Related

Reference

c - cuda マグマ行列 - 行列加算カーネル