cuda - ホストとGPUでのCUDA追加の結果が異なる

Question

カラー写真を撮り、そのグレーバージョンを返す関数があります。ホストでシーケンシャルコードを実行すると、すべてが完全に機能します。デバイスで実行した場合、結果はわずかに異なります（1000の1ピクセルは、正しい値と比較して+1または-1のいずれかです）。

これはコンバージョンと関係があると思いますが、よくわかりません。これは私が使用するコードです：

    __global__ void rgb2gray_d (unsigned char *deviceImage, unsigned char *deviceResult, const int height, const int width){
    /* calculate the global thread id*/
    int threadsPerBlock  = blockDim.x * blockDim.y;
    int threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y;
    int blockNumInGrid   = blockIdx.x  + gridDim.x  * blockIdx.y;

    int globalThreadNum = blockNumInGrid * threadsPerBlock + threadNumInBlock;
    int i = globalThreadNum;

    float grayPix = 0.0f;
    float r = static_cast< float >(deviceImage[i]);
    float g = static_cast< float >(deviceImage[(width * height) + i]);
    float b = static_cast< float >(deviceImage[(2 * width * height) + i]);
    grayPix = (0.3f * r) + (0.59f * g) + (0.11f * b);

    deviceResult[i] = static_cast< unsigned char > (grayPix);
}

void rgb2gray(unsigned char *inputImage, unsigned char *grayImage, const int width, const int height, NSTimer &timer) {

    unsigned char *deviceImage;
    unsigned char *deviceResult;

    int initialBytes = width * height * 3;  
    int endBytes =  width * height * sizeof(unsigned char);

    unsigned char grayImageSeq[endBytes];

    cudaMalloc((void**) &deviceImage, initialBytes);
    cudaMalloc((void**) &deviceResult, endBytes);
    cudaMemset(deviceResult, 0, endBytes);
    cudaMemset(deviceImage, 0, initialBytes);

    cudaError_t err = cudaMemcpy(deviceImage, inputImage, initialBytes, cudaMemcpyHostToDevice);    

    // Convert the input image to grayscale 
    rgb2gray_d<<<width * height / 256, 256>>>(deviceImage, deviceResult, height, width);
    cudaDeviceSynchronize();

    cudaMemcpy(grayImage, deviceResult, endBytes, cudaMemcpyDeviceToHost);

    ////// Sequential
    for ( int y = 0; y < height; y++ ) {
             for ( int x = 0; x < width; x++ ) {
                   float grayPix = 0.0f;
                   float r = static_cast< float >(inputImage[(y * width) + x]);
                   float g = static_cast< float >(inputImage[(width * height) + (y * width) + x]);
                   float b = static_cast< float >(inputImage[(2 * width * height) + (y * width) + x]);

                   grayPix = (0.3f * r) + (0.59f * g) + (0.11f * b);
                   grayImageSeq[(y * width) + x] = static_cast< unsigned char > (grayPix);
              }
        }

    //compare sequential and cuda and print pixels that are wrong
    for (int i = 0; i < endBytes; i++)
    {
        if (grayImage[i] != grayImageSeq[i])
        cout << i << "-" << static_cast< unsigned int >(grayImage[i]) <<
                 " should be " << static_cast< unsigned int >(grayImageSeq[i]) << endl;
        }

    cudaFree(deviceImage);
    cudaFree(deviceResult);
}

初期画像はCImgであるため、初期画像の幅*高さ*3を割り当てます。

私はGeForceGTX480に取り組んでいます。

score 5 · Accepted Answer

ついに答えを見つけました。CUDAは、単精度と倍精度の両方で乗算加算を自動的に融合します。以下の1、セクション4.4のドキュメントを使用して、なんとか修正できました。する代わりに

grayPix = (0.3f * r) + (0.59f * g) + (0.11f * b);

私は今やっています

grayPix = __fadd_rn(__fadd_rn(__fmul_rn(0.3f, r),__fmul_rn(0.59f, g)), __fmul_rn(0.11f, b));

これにより、乗算のマージが無効になり、乗算と加算の融合命令に追加されます。

NVIDIAGPUの浮動小数点およびIEEE754コンプライアンス

score 1 · Accepted Answer

浮動小数点演算は、デバイスコードとホストコードでわずかに異なる結果を生成する可能性があります。

これが当てはまる理由は複数あります。これらの2つの関数は、2つの異なるコンパイラによって、2つの異なる浮動小数点ハードウェア実装で実行される2つの異なるバイナリプログラムにコンパイルされることを考慮する必要があります。

たとえば、浮動小数点の計算が異なる順序で実行される場合、丸め誤差によって異なる結果が生じる可能性があります。

また、x86アーキテクチャCPUで32ビット（浮動小数点）または64ビット（倍精度）の浮動小数点表現を使用して浮動小数点計算を実行する場合、浮動小数点の計算は、内部で80ビットの精度を使用するFPUユニットによって実行され、結果は次のようになります。次に、floatデータ型の場合は32ビットに、doubleデータ型の場合は64ビットに切り捨てられます。

GPUのALUは、浮動小数点演算に32ビット精度を使用します（floatデータ型を使用していると想定）。

浮動小数点表現と算術のトピックについて説明している優れた記事は、ここにあります。

cuda - ホストとGPUでのCUDA追加の結果が異なる

2 に答える 2

Related

Reference