cuda - cuda カーネルが同時に実行されない

Question

2.0 機能を持つ Nvidia Quadro 4000 の同時カーネル実行プロパティを調査しようとしています。

次のように同じように実行される 2 つの異なるストリームを使用します。

固定メモリの H2D 2 つの異なるチャンクをコピーする
カーネルを実行
D2H の 2 つの異なるチャンクを固定メモリにコピーします。

両方のストリームのカーネルはまったく同じで、実行時間はそれぞれ 190 ミリ秒です。

Visual プロファイラー (バージョン 5.0) では、両方のカーネルが同時に実行を開始すると予想していましたが、20 ミリ秒しかオーバーラップしていませんでした。コードサンプルは次のとおりです。

enter code here

//initiate the streams
        cudaStream_t stream0,stream1;
        CHK_ERR(cudaStreamCreate(&stream0));
        CHK_ERR(cudaStreamCreate(&stream1));
        //allocate the memory on the GPU for stream0
        CHK_ERR(cudaMalloc((void **)&def_img0, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&ref_img0, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outY_img0,width_size_for_out*height_size_for_out*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outX_img0,width_size_for_out*height_size_for_out*sizeof(char)));
        //allocate the memory on the GPU for stream1
        CHK_ERR(cudaMalloc((void **)&def_img1, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&ref_img1, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outY_img1,width_size_for_out*height_size_for_out*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outX_img1,width_size_for_out*height_size_for_out*sizeof(char)));

        //allocate page-locked memory for stream0
        CHK_ERR(cudaHostAlloc((void**)&host01, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host02, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host03, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host04, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));

        //allocate page-locked memory for stream1
        CHK_ERR(cudaHostAlloc((void**)&host11, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host12, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host13, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host14, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));


        memcpy(host01,in1,width*height*sizeof(char));
        memcpy(host02,in2,width*height*sizeof(char));

        memcpy(host11,in1,width*height*sizeof(char));
        memcpy(host12,in2,width*height*sizeof(char));



        cudaEvent_t start, stop;
        float time;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        dim3 dimBlock(CUDA_BLOCK_DIM, CUDA_BLOCK_DIM);
        dim3 Grid((width-SEARCH_RADIUS*2-1)/(dimBlock.x*4)+1, (height-SEARCH_RADIUS*2-1)/(dimBlock.y*4)+1);

        cudaEventRecord(start,0);
        // --------------------
        // Copy images to device
        // --------------------
        //enqueue copies of def stream0 and stream1
        CHK_ERR(cudaMemcpyAsync(def_img0, host01,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
        CHK_ERR(cudaMemcpyAsync(def_img1, host11,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
        //enqueue copies of ref stream0 and stream1
        CHK_ERR(cudaMemcpyAsync(ref_img0, host02,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
        CHK_ERR(cudaMemcpyAsync(ref_img1, host12,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));

        CHK_ERR(cudaStreamSynchronize(stream0));
        CHK_ERR(cudaStreamSynchronize(stream1));

        //CALLING KERNEL
        //enqueue kernel in stream0 and stream1
        TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream0>>>(def_img0+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img0,outX_img0,outY_img0,width,width_size_for_out)),"exhaustiveSearchKernel stream0");
        TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream1>>>(def_img1+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img1,outX_img1,outY_img1,width,width_size_for_out)),"exhaustiveSearchKernel stream1");


        //Copy result back
        CHK_ERR(cudaMemcpyAsync(host03, outX_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
        CHK_ERR(cudaMemcpyAsync(host13, outX_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));

        CHK_ERR(cudaMemcpyAsync(host04, outY_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
        CHK_ERR(cudaMemcpyAsync(host14, outY_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));


        CHK_ERR(cudaStreamSynchronize(stream0));
        CHK_ERR(cudaStreamSynchronize(stream1));

        cudaEventRecord( stop, 0 );
        cudaEventSynchronize( stop );
        cudaEventElapsedTime( &time, start, stop );
        printf("Elapsed time=%f ms\n",time);

        memcpy(outX,host03,width_size_for_out*height_size_for_out*sizeof(char));
        memcpy(outY,host04,width_size_for_out*height_size_for_out*sizeof(char));


        cudaEventDestroy( start ); 
        cudaEventDestroy( stop );
        CHK_ERR(cudaStreamDestroy(stream0));
        CHK_ERR(cudaStreamDestroy(stream1));

        CHK_ERR(cudaDeviceReset());


    }

score 3 · Accepted Answer

計算能力2.x-3.0

コンピューティング機能2.x-3.0デバイスには、単一のハードウェアワークキューがあります。CUDAドライバーはコマンドをワークキューにプッシュします。GPUホストはコマンドを読み取り、コピーエンジンまたはCUDA Work Distributor（CWD）に作業をディスパッチします。CUDAドライバーは、同期コマンドをハードウェア作業キューに挿入して、同じストリームでの作業が同時に実行されないようにします。ホストが同期コマンドを実行すると、依存する作業が完了するまでホストは停止します。

カーネルの同時実行により、グリッドが小さすぎてGPU全体を埋められない場合、またはグリッドにテール効果がある場合（スレッドブロックのサブセットが他のスレッドブロックよりもはるかに長く実行される場合）、GPUの使用率が向上します。

ケース1：1つのストリームで連続したカーネル

アプリケーションが同じストリームで2つのカーネルを連続して起動する場合、CUDAドライバーによって挿入された同期コマンドは、最初のカーネルが完了するまで2番目のカーネルをCWDにディスパッチしません。

ケース2：2つのストリームでの連続したカーネル起動

アプリケーションが異なるストリームで2つのカーネルを起動した場合、ホストはコマンドを読み取り、コマンドをCWDにディスパッチします。CWDは最初のグリッドをラスタライズし（順序はアーキテクチャに依存します）、スレッドブロックをSMにディスパッチします。最初のグリッドからのすべてのスレッドブロックがディスパッチされた場合にのみ、CWDは2番目のグリッドからスレッドブロックをディスパッチします。

計算能力3.5

コンピューティング機能3.5では、GPUの使用率を向上させるためにいくつかの新機能が導入されました。これらには以下が含まれます：-HyperQは複数の独立したハードウェアワークキューをサポートします。-動的並列処理により、デバイスコードで新しい作業を開始できます。-CWD容量が32グリッドに増加しました。

資力

cuda - cuda カーネルが同時に実行されない

1 に答える 1

Related

Reference