
I am trying to write a simple matrix-multiplication program that continuously adds the product of two matrices to a third result matrix (essentially giving the GPU a workout while I measure power consumption with a separate device).

The problem occurs when I specify a large number of iterations. I have tried this with several combinations of BLOCK_SIZE and matrix dimension values. Reducing the matrix dimensions lets me increase the number of iterations, but BLOCK_SIZE has to be the square root of the matrix dimension (the matrices are square).

The resulting error in this case is a freeze of 39 seconds (regardless of the iteration value, as long as it is "too many"), followed by an output matrix of all zeros. Interestingly, I ran it once with 20,000 iterations and it worked fine. When I ran it again, I got the freeze error.

Any ideas? Thanks in advance!

Kernel:

//********************************************************************
// matrixMultiplication_kernel.cu
//
// Kernel for a basic CUDA matrix multiplication program.
//********************************************************************

#ifndef MATRIXMULTIPLICATION_KERNEL
#define MATRIXMULTIPLICATION_KERNEL

#define BLOCK_SIZE 16 // Set thread block size
#define colsA 256     // Set matrix A column dimension
#define rowsA 256     // Set matrix A row dimension
#define colsB 256     // Set matrix B column dimension
#define rowsB colsA   // Set matrix B row dimension
#define colsC colsB   // Set matrix C column dimension
#define rowsC rowsA   // Set matrix C row dimension

//--------------------------------------------------------------------
// matrixMultiplication() - Multiplies matrixA and matrixB, storing
//                          the result in device memory for matrixC.
//
// PRE:  matrixA, matrixB, and matrixC are float pointers; numColsA
//       and numColsB are integers.
// POST: The result of multiplying matrixA and matrixB is stored in
//       matrixC.
//--------------------------------------------------------------------
__global__ void matrixMultiplication(float * matrixA, float * matrixB,
                     float * matrixC, int numColsA,
                     int numColsB) {

    /* Declare matrix-multiplication holder value outside of the for
       loop (initialized to zero so the products accumulate correctly) */
    float val = 0.0f;

    /* Set block and thread index positions */
    int blockX = blockIdx.x;
    int blockY = blockIdx.y;
    int threadX = threadIdx.x;
    int threadY = threadIdx.y;

    /*
    Set starting and ending indices of the first sub-matrix of A
    and sub-matrix size for matrix A
    */
    int startA = numColsA * BLOCK_SIZE * blockY;
    int endA = startA + numColsA - 1;
    int subSizeA = BLOCK_SIZE;

    /*
    Set starting index of the first sub-matrix of B and sub-matrix
    size for matrix B
    */
    int startB = BLOCK_SIZE * blockX;
    int subSizeB = BLOCK_SIZE * colsB;

    /* Perform matrix multiplication 20000 times */
    for (int iteration = 0; iteration < 20000; iteration++) {

        /* Loop through matrix A and matrix B's sub-matrices */
        for (int i = startA, j = startB; i <= endA; i += subSizeA,
             j += subSizeB) {

            /*
                Declare shared memory arrays for matrix A and B
                sub-matrices
            */
            __shared__ float subA[BLOCK_SIZE][BLOCK_SIZE];
            __shared__ float subB[BLOCK_SIZE][BLOCK_SIZE];

            /* Fill sub-matrices */
            subA[threadY][threadX] =
                matrixA[i + colsA * threadY + threadX];
            subB[threadY][threadX] =
                matrixB[j + colsB * threadY + threadX];

            /* Ensure that the matrices are loaded */
            __syncthreads();

            /* Loop through the block */
            for (int k = 0; k < BLOCK_SIZE; ++k) {

                /* Compute product of two matrix indices */
                val += subA[threadY][k] * subB[k][threadX];
            }

            /*
                Ensure completion before the next set of sub-matrices
                begin computation
            */
            __syncthreads();
        }

        /* Set device memory for this sub-matrix */
        int position = colsB * BLOCK_SIZE * blockY + BLOCK_SIZE * blockX;
        matrixC[position + colsB * threadY + threadX] = val;
    }
}

#endif
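
With the values defined above, each thread block computes one 16x16 tile of the 256x256 result, so the matrix dimensions have to be exact multiples of BLOCK_SIZE for every element of C to be covered. A quick sketch of that arithmetic (illustration only, not part of the original kernel):

/* Illustration only: the grid/tile arithmetic implied by the #defines above */
int tilesAcross = colsC / BLOCK_SIZE;        /* 256 / 16 = 16 block columns  */
int tilesDown   = rowsC / BLOCK_SIZE;        /* 256 / 16 = 16 block rows     */
int totalBlocks = tilesAcross * tilesDown;   /* 256 blocks of 16x16 threads  */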

Host:

//********************************************************************
// matrixMultiplication.cu
//
// A basic CUDA matrix multiplication program.
//********************************************************************

/* Include necessary libraries and kernel */
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <matrixMultiplication_kernel.cu>

/* Function declarations */
void fillMatrix(float * matrix, int numIndices);

//*************
// Main Program
//*************
int main(int argc, char** argv) {

    /* Declare device memory */
    float * deviceA;
    float * deviceB;
    float * deviceC;

    srand(2013); // Set random seed

    /* Determine total number of indices in each matrix */
    unsigned int numIndicesA = colsA * rowsA;
    unsigned int numIndicesB = colsB * rowsB;
    unsigned int numIndicesC = colsC * rowsC;

    /* Determine memory size of each matrix */
    unsigned int memoryA = sizeof(float) * numIndicesA;
    unsigned int memoryB = sizeof(float) * numIndicesB;
    unsigned int memoryC = sizeof(float) * numIndicesC;

    /* Allocate memory for each matrix */
    float * matrixA = (float *) malloc(memoryA);
    float * matrixB = (float *) malloc(memoryB);
    float * matrixC = (float *) malloc(memoryC);

    /* Set contents of matrices A and B (matrix C is all zeros) */
    fillMatrix(matrixA, numIndicesA);
    fillMatrix(matrixB, numIndicesB); 

    /* Allocate device memory for each matrix */
    cudaMalloc((void **) &deviceA, memoryA);
    cudaMalloc((void **) &deviceB, memoryB);
    cudaMalloc((void **) &deviceC, memoryC);

    /* Copy host memory to device memory for matrices A and B */
    cudaMemcpy(deviceA, matrixA, memoryA, cudaMemcpyHostToDevice);
    cudaMemcpy(deviceB, matrixB, memoryB, cudaMemcpyHostToDevice);

    /* Set thread count to BLOCK_SIZE x BLOCK_SIZE */
    dim3 tCount(BLOCK_SIZE, BLOCK_SIZE);

    /* Set thread block count */
    dim3 tbCount((colsC / tCount.x), (rowsC / tCount.y));

    /* Run kernel */
    matrixMultiplication <<< tbCount, tCount >>> (deviceA, deviceB,
                          deviceC, colsA,
                          colsB);

    /* Copy device memory to host memory for matrix C */
    cudaMemcpy(matrixC, deviceC, memoryC, cudaMemcpyDeviceToHost);

    for(int i = 0; i < 256; i++) {
        printf("%f ", matrixC[i]);
    }
    printf("\n");

    /* Free up host and device memory for each matrix */
    free(matrixA);
    free(matrixB);
    free(matrixC);
    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);
}

//--------------------------------------------------------------------
// fillMatrix - Assigns a random float value to each index of the
//              matrix.
//
// PRE:  matrix is a pointer to a block of bytes in memory; numIndices
//       is the number of indices in the matrix being instantiated.
// POST: Each index of the matrix has been filled with random float
//       values.
//--------------------------------------------------------------------
void fillMatrix(float * matrix, int numIndices) {

    /* Loop through each index of the matrix */
    for (int i = 0; i < numIndices; ++i) {

        /*
            Assign a random float between 0 and 1 for this index of
            the matrix
        */
        matrix[i] = rand() / (float)RAND_MAX;
    }
}
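
Note that neither the kernel launch nor the following cudaMemcpy in the host code above is checked for errors. A minimal check placed right after the launch (a sketch using the standard CUDA runtime error API, not part of the original program) would report a failure such as a launch timeout instead of silently printing zeros:

/* Hypothetical diagnostic snippet, not in the original main() */
cudaError_t launchErr = cudaGetLastError();        /* error from the launch itself */
cudaError_t syncErr   = cudaDeviceSynchronize();   /* error from kernel execution  */
if (launchErr != cudaSuccess || syncErr != cudaSuccess) {
    fprintf(stderr, "Kernel failed: %s / %s\n",
            cudaGetErrorString(launchErr), cudaGetErrorString(syncErr));
}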

Makefile:

GCC = nvcc
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart

matrixMultiplication.o:     matrixMultiplication.cu
                    $(GCC)  $(INCLUDES) -c matrixMultiplication.cu -o $@ 

matrixMultiplication:       matrixMultiplication.o
        $(GCC)  -o $@ matrixMultiplication.o $(CUDA_LIBS)

clean:
        $(RM)   *.o *~

1 Answer


Problem solved! It was a system timeout issue caused by the long-running kernel. Switching to terminal-only mode allowed me to get around the problem.

Thanks for the help, everyone!
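
For completeness, another common way to stay under the display watchdog is to drive the iterations from the host so that each individual launch finishes quickly. This is a sketch only: it assumes the kernel's internal 20,000-iteration loop has been removed and that the kernel accumulates into matrixC (e.g. "+=" instead of "=" on the final store).

/* Sketch: host-side iteration so each launch stays well under the watchdog limit */
cudaMemset(deviceC, 0, memoryC);               /* zero the accumulator matrix      */
for (int iteration = 0; iteration < 20000; iteration++) {
    matrixMultiplication <<< tbCount, tCount >>> (deviceA, deviceB,
                                                  deviceC, colsA, colsB);
}
cudaDeviceSynchronize();                       /* wait before copying C back       */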

Answered on 2013-03-26T03:26:38.843