c++ - CUDAの結果は非常に大きな配列を使用してガベージを返しますが、エラーは報告されません

Question

サイズnのデバイスとホスト配列を作成するテストプログラムを作成してから、デバイス配列の各場所に定数値0.95fを割り当てるn個のスレッドを作成するカーネルを起動します。完了後、デバイスアレイがホストアレイにコピーされ、すべてのエントリが合計され、最終的な合計が表示されます。

以下のプログラムは、約6000万個のfloatまでの配列サイズでは正常に動作し、正しい結果を非常に速く返します。しかし7000万個にすると、プログラムはしばらくハングしたように見え、最終的に合計としてNaNを返します。6000万個での実行後にホスト配列を調べると0.95fが正しく格納されていますが、7000万個での実行後に調べるとNaNで埋まっています。私の知る限り、どのCUDA呼び出しもエラーを返していません。

私は2GBのGT 640M（Compute Capability 3.0）を使用しており、最大ブロックサイズは1024、グリッドの最大次元は2147483647です。

同様のことを達成するためのより良い方法があると確信しており、提案を聞きたいと思います。しかし、ここで何がうまくいかなかったのかを理解して、そこから学ぶことができるようにしたいと思います。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <fstream>

/**
 * Reports a failed CUDA runtime call.
 *
 * The handler is deliberately non-fatal: it only prints a diagnostic and
 * returns, so the caller keeps running after an error.
 *
 * @param status  Result code returned by a CUDA runtime API call.
 */
void cudaErrorHandler(cudaError_t status)
{
    if(status != cudaSuccess)
    {
        // Print the runtime's own description, newline-terminated and on
        // stderr, so the failure can be identified and is not glued onto
        // whatever the program prints next on stdout.
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }
}

// Writes 0.95f into _Results, at most one element per thread.
// The element index is derived from blockIdx.x together with the x/y
// components of blockDim and threadIdx; blockIdx.y/z are not consulted.
// Threads whose index is not below _TotalCombinations write nothing.
__global__ void addKernel(float* _Results, int _TotalCombinations)
{
    const unsigned int threadsInBlock = blockDim.x * blockDim.y;
    const unsigned int blockStart     = threadsInBlock * blockIdx.x;
    const unsigned int offsetInBlock  = blockDim.x * threadIdx.y + threadIdx.x;
    const unsigned int element        = blockStart + offsetInBlock;

    // The limit is compared as an unsigned value.
    const unsigned int limit = static_cast<unsigned int>(_TotalCombinations);
    if(element >= limit)
        return;

    _Results[element] = 0.95f;
}

// Thread-block dimensions used for the kernel launch.
#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
// Static block size of 32*32 (1024). The expansion is parenthesized so the
// macro behaves as a single value inside larger expressions
// (e.g. n / BLOCK_SIZE would otherwise evaluate as (n / 32) * 32).
#define BLOCK_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y)
// Routes the status returned by a CUDA runtime call through cudaErrorHandler.
#define CUDA_CALL(x) cudaErrorHandler(x)

/**
 * Test driver: fills a device array with 0.95f using addKernel, copies it
 * back to the host, sums the entries and prints the total.
 *
 * Every CUDA runtime call is routed through CUDA_CALL, which only reports
 * failures; execution continues after an error is printed.
 *
 * @return 0 always.
 */
int main()
{
    // The number of simulations to run
    const unsigned int totalCombinations = 45000000;

    // Threads per block, computed from the two dimensions directly so the
    // value does not depend on how BLOCK_SIZE is parenthesized.
    const unsigned int threadsPerBlock = BLOCK_DIM_X * BLOCK_DIM_Y;

    // Number of blocks needed to cover every element (ceiling division).
    // Written as (n - 1) / t + 1 to avoid overflow for large n; an empty
    // problem still launches one block, as the original counting loop did.
    const unsigned int gridsize =
        (totalCombinations == 0) ? 1 : (totalCombinations - 1) / threadsPerBlock + 1;

    // Allocate host memory; the trailing () value-initializes it to zero.
    float* host_results = new float[totalCombinations]();
    float* dev_results = 0;

    CUDA_CALL(cudaSetDevice(0));

    // Allocate device memory
    CUDA_CALL(cudaMalloc((void**)&dev_results, totalCombinations * sizeof(float)));

    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    const dim3 grid(gridsize);

    // Launch kernel
    addKernel<<<grid, block>>>(dev_results, totalCombinations);

    // A kernel launch returns no status of its own. A rejected launch
    // configuration (for example a grid larger than the limit of the
    // architecture the code was compiled for) is only visible through
    // cudaGetLastError(), so it must be queried explicitly.
    CUDA_CALL(cudaGetLastError());

    // Wait for the kernel to finish; reports errors raised during execution.
    CUDA_CALL(cudaDeviceSynchronize());

    // Copy device data back to host
    CUDA_CALL(cudaMemcpy(host_results, dev_results, totalCombinations * sizeof(float), cudaMemcpyDeviceToHost));

    // Total the results in the host array (accumulated in double)
    double total = 0.0;
    for(unsigned int i = 0; i < totalCombinations; i++)
        total += host_results[i];

    // Print results to screen
    printf("Total %f\n", total);

    // Release device and host buffers
    CUDA_CALL(cudaFree(dev_results));
    delete[] host_results;

    return 0;
}

score 7 · Accepted Answer

お気づきのとおり、エラー処理方法が機能していません。以下に、私が頻繁に使用するエラーチェック方法を使ったコードのバージョンを貼り付けました。その配列サイズで失敗する理由は、グリッドサイズ（1Dグリッドを起動しています）がX次元の最大グリッドサイズ（デフォルトでは65535、つまりCompute Capability 2.xまでの上限）を超えているためです。より大きなグリッド次元を利用したい場合（Compute Capability 3.0での上限は2^31-1です）は、-arch=sm_30スイッチを付けてコンパイルする必要があります。

参考までに、私が頻繁に使用するエラーチェック方法を示すコードのバージョンを示します。

#include <stdio.h>
#include <fstream>


// Queries the runtime's last-error state via cudaGetLastError(). When it is
// not cudaSuccess, prints msg, the runtime's error string and the source
// location to stderr, then terminates the process with exit code 1.
// msg is inserted through a %s conversion.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t checkedStatus_ = cudaGetLastError(); \
        if (checkedStatus_ == cudaSuccess) \
            break; /* nothing to report; leaves only this do/while */ \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(checkedStatus_), \
                __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } while (0)

// Stores 0.95f into _Results[index] for each thread whose index is below
// _TotalCombinations. The index combines blockIdx.x with the x/y components
// of blockDim and threadIdx; each thread performs at most one store.
__global__ void addKernel(float* _Results, int _TotalCombinations)
{
    // Offset of this thread's row inside its block, then of the block itself
    const unsigned int rowOffset   = blockDim.x * threadIdx.y;
    const unsigned int blockOffset = blockDim.x * blockDim.y * blockIdx.x;
    const unsigned int index       = blockOffset + rowOffset + threadIdx.x;

    // Out-of-range threads do nothing; the bound is compared as unsigned.
    const bool inRange = index < static_cast<unsigned int>(_TotalCombinations);
    if(!inRange)
    {
        return;
    }

    _Results[index] = 0.95f;
}

// Thread-block dimensions used for the kernel launch.
#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
// Static block size of 32*32 (1024). The expansion is parenthesized so the
// macro behaves as a single value inside larger expressions
// (e.g. n / BLOCK_SIZE would otherwise evaluate as (n / 32) * 32).
#define BLOCK_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y)

/**
 * Test driver: fills a device array with 0.95f using addKernel, copies it
 * back to the host, sums the entries and prints the total.
 *
 * After every CUDA runtime call and after the kernel launch,
 * cudaCheckErrors inspects the runtime's last error and terminates the
 * process with a diagnostic if one is pending.
 *
 * @return 0 on success (failures exit through cudaCheckErrors).
 */
int main()
{
    // The number of simulations to run
    const unsigned int totalCombinations = 65000000;

    // Threads per block, computed from the two dimensions directly so the
    // value does not depend on how BLOCK_SIZE is parenthesized.
    const unsigned int threadsPerBlock = BLOCK_DIM_X * BLOCK_DIM_Y;

    // Number of blocks needed to cover every element (ceiling division).
    // Written as (n - 1) / t + 1 to avoid overflow for large n; an empty
    // problem still launches one block, as the original counting loop did.
    const int gridsize = (totalCombinations == 0)
        ? 1
        : static_cast<int>((totalCombinations - 1) / threadsPerBlock + 1);

    printf("gridsize = %d, blocksize = %d\n", gridsize, BLOCK_DIM_X * BLOCK_DIM_Y);

    // Allocate host memory; the trailing () value-initializes it to zero.
    float* host_results = new float[totalCombinations]();
    float* dev_results = 0;

    cudaSetDevice(0);
    cudaCheckErrors("cudaSetDevice fail");

    // Allocate device memory
    cudaMalloc((void**)&dev_results, totalCombinations * sizeof(float));
    cudaCheckErrors("cudaMalloc fail");

    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    const dim3 grid(gridsize);

    // Launch kernel; the check right after it catches a rejected launch
    // configuration, which the launch statement itself cannot report.
    addKernel<<<grid, block>>>(dev_results, totalCombinations);
    cudaCheckErrors("kernel fail");

    // Wait for the kernel to finish
    cudaDeviceSynchronize();
    cudaCheckErrors("sync fail");

    // Copy device data back to host
    cudaMemcpy(host_results, dev_results, totalCombinations * sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");

    // Total the results in the host array (accumulated in double)
    double total = 0.0;
    for(unsigned int i = 0; i < totalCombinations; i++)
        total += host_results[i];

    // Print results to screen
    printf("Total %f\n", total);

    // Release device and host buffers
    cudaFree(dev_results);
    cudaCheckErrors("cudaFree fail");
    delete[] host_results;

    return 0;
}

c++ - CUDAの結果は非常に大きな配列を使用してガベージを返しますが、エラーは報告されません

1 に答える 1

Related

Reference