cuda - CUDA カーネルは CPU より遅い

Question

私は CUDA が初めてなので、おそらく何か間違ったことをしているのだと思います。必要なのは、2 つのバイナリベクトルに対する論理 AND 演算だけです。ベクトルの長さは 2048000 です。Matlab の C mex ファイルと CUDA カーネルとで、論理 AND の速度を比較しました。CPU 上の C は、CUDA よりも約 5% 高速です。カーネルの実行のみを測定したことに注意してください (メモリ転送は含みません)。私の環境は i7 930 と 9800GT です。

##MEX file testCPU.c:##

#include "mex.h"
/*
 * MEX entry point: element-wise AND of the first two columns of a logical
 * matrix.
 *
 * prhs[0]  logical matrix with at least two columns (column-major, as MATLAB
 *          stores it); column 1 and column 2 are the two operands.
 * plhs[0]  newly created logical column vector with one entry per input row.
 *
 * Raises a MATLAB error if the input is missing, is not logical, or has
 * fewer than two columns.
 */
void mexFunction( int nlhs, mxArray *plhs[],
        int nrhs, const mxArray *prhs[] ) {

    const unsigned char *vars;
    unsigned char *output;
    mwSize i, len;

    if (nrhs < 1 || !mxIsLogical(prhs[0]) || mxGetN(prhs[0]) < 2) {
        mexErrMsgTxt("testCPU: expected a logical matrix with at least two columns.");
    }

    /* Rows per column; the second column starts len elements after the first. */
    len = mxGetM(prhs[0]);

    /* mxGetData returns the raw element buffer; mxGetPr is only meant for
       double data and yields a double pointer. */
    vars = (const unsigned char *)mxGetData(prhs[0]);
    plhs[0] = mxCreateLogicalMatrix(len, 1);
    output = (unsigned char *)mxGetData(plhs[0]);

    for (i = 0; i < len; i++) {
        output[i] = vars[i] & vars[len + i];
    }
}

コンパイル

mex testCPU.c

ベクトルを作成する

% Build a 2048000-by-2 logical matrix of random values: randi draws 1 or 2,
% subtracting 1 gives 0 or 1, and the double negation converts to logical.
vars = ~~(randi(2,2048000,2)-1);

測定速度:

% Time a single call of the testCPU MEX function with tic/toc.
tic;testCPU(vars);toc;

CUDA :

#CUDA file testGPU.cu#
#include "mex.h"
#include "cuda.h"

/*
 * Element-wise logical AND of two byte vectors, one element per thread.
 *
 * in   device buffer holding both operands back to back: the first operand
 *      occupies in[0..N-1], the second in[N..2N-1].
 * out  device buffer receiving N results (1 if both inputs are non-zero,
 *      otherwise 0).
 * N    number of elements in each operand.
 *
 * Launch with a 1D grid of 1D blocks covering at least N threads.
 */
__global__ void logical_and(unsigned char* in, unsigned char* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    // Threads past the end of the data (grid larger than N) must do nothing,
    // otherwise they would read and write out of bounds.
    if (idx < N) {
        out[idx] = in[idx] && in[idx+N];
    }
}


/*
 * MEX entry point: ANDs two 2048000-element byte vectors on the GPU and
 * prints the time taken by each stage (allocation, upload, kernel, download).
 *
 * prhs[0]  array whose data buffer holds the two operands back to back
 *          (at least 4096000 elements; read as unsigned char).
 * plhs[0]  newly created 2048000x1 logical matrix with the result.
 *
 * Each stage is timed with CUDA events on stream 0; the value printed after
 * each time is the result of cudaGetLastError() at that point.
 * Raises a MATLAB error if the input is missing or too small.
 */
void mexFunction( int nlhs, mxArray *plhs[],
        int nrhs, const mxArray *prhs[] ) {
    
    // Elements per operand; the input holds two operands back to back.
    const int N = 2048000;
    unsigned char *vars, *output, *gpu, *gpures;
    
    // Reject inputs too small for the fixed-size host-to-device copy below.
    if (nrhs < 1 || mxGetNumberOfElements(prhs[0]) < (size_t)(2*N)) {
        mexErrMsgTxt("testGPU: input must contain at least 4096000 elements.");
    }
    
    vars = (unsigned char*)mxGetData(prhs[0]);
    
    plhs[0] = mxCreateLogicalMatrix(N, 1);
    output = (unsigned char*)mxGetData(plhs[0]);       
       
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float dt_ms;
    
    // input GPU malloc
    cudaEventRecord(start, 0);
    cudaMalloc( (void **) &gpu, sizeof(unsigned char)*2*N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU input malloc: %f ms, %i\n", dt_ms, cudaGetLastError());
    
    // output GPU malloc
    cudaEventRecord(start, 0);
    cudaMalloc( (void **) &gpures, sizeof(unsigned char)*N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU output malloc: %f ms, %i\n", dt_ms, cudaGetLastError());
    
    // copy from CPU to GPU
    cudaEventRecord(start, 0);
    cudaMemcpy( gpu, vars, sizeof(unsigned char)*2*N, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("copy input from CPU to GPU: %f ms, %i\n", dt_ms, cudaGetLastError());
    
    // One thread per element. N is an exact multiple of the block size, so
    // the integer division leaves no partial block.
    dim3 dimBlock(32);
    printf("thread count: %i\n", dimBlock.x);
    dim3 dimGrid(N/dimBlock.x);
    printf("block count: %i\n", dimGrid.x);
    
    // --- KERNEL ---
    cudaEventRecord(start, 0);
    logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, N);
    cudaEventRecord(stop, 0);
    // Waiting on the stop event also waits for the kernel queued before it.
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU kernel: %f ms, %i\n", dt_ms, cudaGetLastError());
    
    // result from GPU to CPU
    cudaEventRecord(start, 0);
    cudaMemcpy( output, gpures, sizeof(unsigned char)*N, cudaMemcpyDeviceToHost );
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("copy output from GPU to CPU: %f ms, %i\n", dt_ms, cudaGetLastError());
    
    
    cudaFree(gpu);
    cudaFree(gpures);
    
    // Release the timing events; without this every MEX call leaks two events.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    
}

コンパイル：

 nvmex -f nvmexopts_9.bat testGPU.cu 
-I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\include" 
-L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\lib\x64" -lcudart -lcufft

出力：

GPU input malloc: 0.772160 ms, 0
GPU output malloc: 0.041728 ms, 0
copy input from CPU to GPU: 1.494784 ms, 0
thread count: 32
block count: 64000
*** GPU kernel: 3.761216 ms, 0 ***
copy output from GPU to CPU: 1.203488 ms, 0

このコードに問題はないでしょうか？CPU は CUDA カーネルより約 0.1 ms 高速でした。512 までのさまざまなスレッド数 (32 の倍数) を試しましたが、32 が最速でした。&& の代わりに演算子 & を使用すると、ほぼ 1 ミリ秒遅くなりました。

9800GTってそんなに弱いの？現在のメインストリームカード (GTX460、560) ではどの程度のスピードアップが期待できますか?

ありがとうございました

編集: talonmies のコメントに基づいて、これらの変更を加えました:

カーネル関数:

/*
 * Element-wise bitwise AND of two byte vectors, four bytes per thread.
 *
 * in   device buffer holding both operands back to back as uchar4 values:
 *      the first operand occupies in[0..N-1], the second in[N..2N-1].
 * out  device buffer receiving N uchar4 results.
 * N    number of uchar4 elements in each operand (byte count divided by 4).
 *
 * Launch with a 1D grid of 1D blocks covering at least N threads.
 */
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    // Threads past the end of the data must not touch memory.
    if (idx >= N) {
        return;
    }
    // Read each operand as a whole uchar4 value, combine the components in
    // registers, and write the result back as a whole value.
    const uchar4 a = in[idx];
    const uchar4 b = in[idx+N];
    out[idx] = make_uchar4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
}

main 関数：

// Device buffers, addressed as packed groups of four bytes.
uchar4 *gpu;
uchar4 *gpures;

// Threads per block: 32 was the slowest; 64, 128, 256 and 512 were similar.
dim3 dimBlock(128, 1, 1);
// 512000 uchar4 elements in total, so the block count is four times smaller
// than with one byte per thread.
dim3 dimGrid(512000/dimBlock.x, 1, 1);

出力：

GPU input malloc: 0.043360 ms, 0
GPU output malloc: 0.038592 ms, 0
copy input from CPU to GPU: 1.499584 ms, 0
thread count: 128
block count: 4000
*** GPU kernel: 0.131296 ms, 0 ***
copy output from GPU to CPU: 1.281120 ms, 0

これは正しいのでしょうか？約 30 倍の高速化です！話がうますぎるように思えますが、結果は正しいです :) この特定のタスクで GTX560 はどれくらい速くなるでしょうか? ありがとうございます

編集2：

次のコードは

// Variant under discussion: each component of the result is computed and
// stored in one statement, reading in[idx] and in[idx+N] component by component.
// in holds both operands back to back (second operand starts at in[N]);
// one thread handles one uchar4 element.
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    // Flat global thread index for a 1D launch.
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}

自動的に次のように変換されます:

// Variant under discussion: the same computation written in three explicit
// phases through a local uchar4 buffer.
// in holds both operands back to back (second operand starts at in[N]);
// one thread handles one uchar4 element.
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    // Flat global thread index for a 1D launch.
    int idx = blockIdx.x*blockDim.x+threadIdx.x;  
    uchar4 buff;

    // Phase 1: read all four components of the first operand.
    buff.x = in[idx].x;
    buff.y = in[idx].y;
    buff.z = in[idx].z;
    buff.w = in[idx].w;

    // Phase 2: AND in the four components of the second operand.
    buff.x &= in[idx+N].x;
    buff.y &= in[idx+N].y;
    buff.z &= in[idx+N].z;
    buff.w &= in[idx+N].w;

    // Phase 3: write the four result components.
    out[idx].x = buff.x;
    out[idx].y = buff.y;
    out[idx].z = buff.z;
    out[idx].w = buff.w;
}

コンパイラによって？

それが正しければ、合体アクセス (coalesced access) についての私の混乱に説明がつきます。in[idx] & in[idx+N] は連続していないメモリにアクセスするため、非合体アクセスになると思っていました。しかし実際には、in[idx] と in[idx+N] は 2 回の合体アクセスでロードされます。uchar4 の長さは 4 バイトであり、合体アクセスの場合、アドレスは (1.1 デバイスでは) 64 バイトに揃える必要があるため、N は 16 の任意の倍数にすることができます。私の理解は正しいですか？

score 2 · Accepted Answer

talonmies が指摘しているように、データにバイト単位でアクセスして処理しているため、最適とは言えません。命令レベルの並列性やバッファリングされた読み取り/書き込みなど、検討に値する一連の手法は、Vasily Volkov による NVIDIA ウェビナー「Better Performance at Lower Occupancy (低占有率でのパフォーマンス向上)」にまとめられています。

一言で言えば、あなたがしたいことは、各スレッドで、合体した方法でいくつか uint4を読み、それらを処理し、そしてそれらを保存することです。

アップデート

次のようにコードを書き直しても違いはありますか？

/*
 * Bitwise AND of two vectors of 32-bit words, chunksize words per thread.
 *
 * in   device buffer holding both operands back to back: the first operand
 *      occupies in[0..N-1], the second in[N..2N-1].
 * out  device buffer receiving N result words.
 * N    number of 32-bit words in each operand.
 *
 * Each block covers blockDim.x*chunksize consecutive words; within one pass
 * (fixed k) consecutive threads touch consecutive words. chunksize must be a
 * compile-time constant defined before this kernel.
 */
__global__ void logical_and(unsigned int* in, unsigned int* out, int N) {
    int idx = blockIdx.x*blockDim.x*chunksize+threadIdx.x;
    unsigned int buff[chunksize];
    // All loads of the first operand are issued before any result is needed.
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ ) {
        int pos = blockDim.x*k + idx;
        // Positions past the end of the data are skipped so that a grid
        // overshooting N never reads out of bounds.
        buff[k] = ( pos < N ) ? in[ pos ] : 0u;
    }
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ ) {
        int pos = blockDim.x*k + idx;
        if ( pos < N )
            buff[k] &= in[ pos + N ];
    }
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ ) {
        int pos = blockDim.x*k + idx;
        if ( pos < N )
            out[ pos ] = buff[k];
    }
}

chunksize は、どこかで #define された定数であると仮定していることに注意してください。例えば:

// Number of 32-bit words each thread of logical_and processes.
#define chunksize 4

そして、起動するブロックの数をその数 (chunksize) で割る必要があります。また、unsigned int を使用しましたが、これは単に 4 つの uchar をパックしたものです。呼び出し元の関数では、それに応じてポインターをキャストする必要がある場合があります。

score 1 · Accepted Answer

What I think is happening is called false sharing. I think the problem is that the byte-sized regions you are trying to write from your threads are producing a massive race condition, because different threads are trying to write to the same word-aligned address. I'm not sure of the details on the GPU, but on the CPU, when different threads try to write to memory in the same 256-byte aligned region (called a cache line), they will continuously block each other, causing your overall performance to plummet.

cuda - CUDA カーネルは CPU より遅い

編集: talonmies のコメントに基づいて、これらの変更を加えました:

編集2：

2 件の回答

Related

Reference