cuda - CUDA: GPU へのデータのアップロードで Thrust が非常に遅いのはなぜですか?

Question

私は GPU の世界に不慣れで、プログラムを書くために CUDA をインストールしました。Thrust ライブラリを試してみたところ、データを GPU にアップロードするときに非常に遅いことがわかりました。決して性能の低くない私のデスクトップでも、ホストからデバイスへの転送は約 35MB/s しか出ません。なぜでしょうか？

環境: Visual Studio 2012、CUDA 5.0、GTX760、Intel-i7、Windows 7 x64

GPU 帯域幅テスト: ここに画像の説明を入力

ホストからデバイスへ、またはその逆の転送速度は少なくとも 11 GB/秒であると想定されています。しかし、そうではありませんでした！

テストプログラムは次のとおりです。

#include <iostream>
#include <ctime>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

// Number of doubles to transfer: 32 * 2^22 = 2^27 elements
// (about 1.07 GB when sizeof(double) is 8).
// Parenthesized so the shift is evaluated before any surrounding operator;
// without the parentheses `sizeof(double)*N` parses as `(sizeof(double)*32)<<22`.
#define N (32<<22)

// Coarse host<->device copy benchmark built on Thrust containers.
//
// Prints the elapsed wall-clock time, in whole seconds, for three steps:
// filling a host vector, constructing a device vector from it, and copying
// the device vector back to the host.
//
// Caveats on the numbers this prints:
//   * time(0) has one-second resolution, so anything faster than a second
//     is reported as 0.
//   * The "upload" step constructs d_vec, so it measures the device
//     allocation together with the host-to-device copy.
int main(void)
{
    using namespace std;

    cout<<"GPU bandwidth test via thrust, data size: "<< (sizeof(double)*N) / 1000000000.0 <<" Gbytes"<<endl;
    cout<<"============program start=========="<<endl;

    // Keep the timestamp as time_t; storing it in an int narrows the value.
    time_t now = time(0);
    cout<<"Initializing h_vec...";
    thrust::host_vector<double> h_vec(N,0.0);
    cout<<"time spent: "<<time(0)-now<<"secs"<<endl;

    now = time(0);
    cout<<"Uploading data to GPU...";
    // Copy-construction: allocates device storage and copies h_vec into it.
    thrust::device_vector<double> d_vec = h_vec;
    cout<<"time spent: "<<time(0)-now<<"secs"<<endl;

    now = time(0);
    cout<<"Downloading data to h_vec...";
    // h_vec already holds N elements, so this is a copy into existing storage.
    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
    cout<<"time spent: "<<time(0)-now<<"secs"<<endl<<endl;

    // Windows-only: keeps the console window open until a key is pressed.
    system("PAUSE");
    return 0;
}

プログラム出力: ここに画像の説明を入力

ダウンロード速度: 1 秒未満。公称 11GB/秒と比較するとかなり理にかなっています。
アップロード速度: 1.07374GB /32 秒は約 33.5 MB/秒になり、まったく意味がありません。

どなたか理由をご存じですか？それとも Thrust 固有の問題なのでしょうか？

ありがとう！！

score 9 · Accepted Answer

あなたの比較にはいくつかの欠陥があり、そのうちのいくつかはコメントでカバーされています.

割り当ての影響を排除する必要があります。これを行うには、最初に「ウォームアップ」転送をいくつか実行します。
「起動」効果を排除する必要があります。これを行うには、最初に「ウォームアップ」転送をいくつか実行します。
データを比較するときは、bandwidthTest が PINNED（固定）メモリ割り当てを使用しているのに対し、Thrust はそれを使用しないことに注意してください。したがって、Thrust のデータ転送速度は遅くなります。これは通常、約 2 倍の差として現れます（つまり、固定メモリ転送は通常、ページング可能なメモリ転送よりも約 2 倍高速です）。bandwidthTest とより適切に比較したい場合は、--memory=pageable スイッチを付けて実行してください。
タイミング関数の選択が最適ではない可能性があります。cudaEvents は、CUDA 操作のタイミングを計るのに非常に信頼性があります。

適切なタイミングを行うコードは次のとおりです。

$ cat t213.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/fill.h>

#define DSIZE ((1UL<<20)*32)

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what){
  if (status == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Prints the transfer rate for DSIZE ints moved in `et` milliseconds,
// expressed in MB/s with 1 MB = 1048576 bytes.
static void printBandwidth(float et){
  std::cout<<"apparent bandwidth: " << (((DSIZE*sizeof(int))/(et/(float)1000))/((float)1048576)) << " MB/s" << std::endl;
}

// Times one host-to-device and one device-to-host thrust::copy of DSIZE ints
// using CUDA events, after a warm-up copy in each direction.
//
// Both vectors are allocated up front, so the timed regions contain only the
// copies. Each phase fills the source with a distinct value (1..4) and prints
// the first element of the destination so the output shows which copy landed.
//
// Returns 0 on success, 1 if a CUDA runtime call reports an error.
int main(){

  thrust::device_vector<int> d_data(DSIZE);
  thrust::host_vector<int> h_data(DSIZE);
  float et = 0.0f;
  cudaEvent_t start, stop;
  if (!cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")) return 1;
  if (!cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) return 1;

  // Warm-up: one untimed copy in each direction.
  thrust::fill(h_data.begin(), h_data.end(), 1);
  thrust::copy(h_data.begin(), h_data.end(), d_data.begin());

  std::cout<< "warm up iteration " << d_data[0] << std::endl;
  thrust::fill(d_data.begin(), d_data.end(), 2);
  thrust::copy(d_data.begin(), d_data.end(), h_data.begin());
  std::cout<< "warm up iteration " << h_data[0] << std::endl;

  // Timed host -> device copy.
  thrust::fill(h_data.begin(), h_data.end(), 3);
  if (!cudaOk(cudaEventRecord(start), "cudaEventRecord(start)")) return 1;
  thrust::copy(h_data.begin(), h_data.end(), d_data.begin());
  if (!cudaOk(cudaEventRecord(stop), "cudaEventRecord(stop)")) return 1;
  if (!cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")) return 1;
  // Elapsed time is reported in milliseconds; it is printed in seconds below.
  if (!cudaOk(cudaEventElapsedTime(&et, start, stop), "cudaEventElapsedTime")) return 1;
  std::cout<<"host to device iteration " << d_data[0] << " elapsed time: " << (et/(float)1000) << std::endl;
  printBandwidth(et);

  // Timed device -> host copy.
  thrust::fill(d_data.begin(), d_data.end(), 4);
  if (!cudaOk(cudaEventRecord(start), "cudaEventRecord(start)")) return 1;
  thrust::copy(d_data.begin(), d_data.end(), h_data.begin());
  if (!cudaOk(cudaEventRecord(stop), "cudaEventRecord(stop)")) return 1;
  if (!cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")) return 1;
  if (!cudaOk(cudaEventElapsedTime(&et, start, stop), "cudaEventElapsedTime")) return 1;
  std::cout<<"device to host iteration " << h_data[0] << " elapsed time: " << (et/(float)1000) << std::endl;
  printBandwidth(et);

  // Release the timing events now that both measurements are done.
  if (!cudaOk(cudaEventDestroy(start), "cudaEventDestroy(start)")) return 1;
  if (!cudaOk(cudaEventDestroy(stop), "cudaEventDestroy(stop)")) return 1;

  std::cout << "finished" << std::endl;
  return 0;
}

私はコンパイルします（私はcc2.0デバイスを備えたPCIE Gen2システムを持っています）

$ nvcc -O3 -arch=sm_20 -o t213 t213.cu

実行すると、次の結果が得られます。

$ ./t213
warm up iteration 1
warm up iteration 2
host to device iteration 3 elapsed time: 0.0476644
apparent bandwidth: 2685.44 MB/s
device to host iteration 4 elapsed time: 0.0500736
apparent bandwidth: 2556.24 MB/s
finished
$

私のシステムは PCIE Gen2 なので、bandwidthTest はどちらの方向でも約 6GB/s と報告します。したがって、この結果は正しいように見えます。Thrust は固定メモリではなくページング可能なメモリを使用するため、得られる帯域幅はその約半分、つまり 3GB/s 程度になり、実際に Thrust は約 2.5GB/s を報告しています。

比較のために、ページング可能なメモリを使用した、私のシステムでの帯域幅テストを次に示します。

$ /usr/local/cuda/samples/bin/linux/release/bandwidthTest --memory=pageable
[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: Quadro 5000
 Quick Mode

 Host to Device Bandwidth, 1 Device(s)
 PAGEABLE Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     2718.2

 Device to Host Bandwidth, 1 Device(s)
 PAGEABLE Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     2428.2

 Device to Device Bandwidth, 1 Device(s)
 PAGEABLE Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     99219.1

$

cuda - CUDA: GPU へのデータのアップロードで Thrust が非常に遅いのはなぜですか?

1 に答える 1

Related

Reference