c++ - VS2010/Cuda4.2 と比較して、VS2013/Cuda7.0 では CUFFT が 1000 倍遅い

Question

次の単純な CUFFT コードを、以下の 2 つの環境で実行しました。

VS 2013 と Cuda 7.0
VS 2010 と Cuda 4.2

Cuda 7.0 を使用した VS 2013 のほうが、約 1000 倍遅いことがわかりました。このコードの実行時間は、VS 2010 では 0.6 ms、VS 2013 では 520 ms でした。どちらも平均値です。

#include "stdafx.h"
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "cufft.h"
typedef cuComplex Complex;
#include <iostream>
using namespace std;
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess so calls can be chained with &&.
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Reports a failed cuFFT call on stderr.
// Returns true when `status` is CUFFT_SUCCESS so calls can be chained with &&.
static bool checkCufft(cufftResult status, const char *what)
{
    if (status == CUFFT_SUCCESS)
        return true;
    cerr << what << " failed with cufftResult " << static_cast<int>(status) << endl;
    return false;
}

// Times one in-place forward C2C FFT of SIZE points with CUDA events and
// prints the elapsed milliseconds.
//
// The start event is recorded before the host allocation, so the measured
// interval covers host setup, device allocation, both copies, plan creation
// and the FFT itself, not only the transform.
//
// Returns 0 on success, 1 if any allocation, CUDA call or cuFFT call failed
// (a diagnostic is written to stderr and no time is printed).
int _tmain(int argc, _TCHAR* argv[])
{
    const int SIZE = 10000;
    const int BATCH = 1;
    const size_t bytes = SIZE*sizeof(Complex);

    // Everything that needs releasing starts out empty so the cleanup at the
    // bottom is safe regardless of where a failure happened.
    cudaEvent_t start = 0, stop = 0;
    Complex *h_col = 0;
    Complex *d_col = 0;
    cufftHandle plan = 0;
    bool planCreated = false;
    float milliseconds = 0;

    bool ok = checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)")
           && checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)")
           && checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");

    if (ok)
    {
        h_col = (Complex*)malloc(bytes);
        if (h_col == 0)
        {
            cerr << "malloc failed" << endl;
            ok = false;
        }
    }
    if (ok)
    {
        for (int i = 0; i < SIZE; i++)
        {
            h_col[i].x = static_cast<float>(i);
            h_col[i].y = static_cast<float>(i);
        }
    }

    ok = ok
      && checkCuda(cudaMalloc((void**)&d_col, bytes), "cudaMalloc")
      && checkCuda(cudaMemcpy(d_col, h_col, bytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(host to device)");

    if (ok)
    {
        planCreated = checkCufft(cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCH), "cufftPlan1d");
        ok = planCreated;
    }

    ok = ok
      && checkCufft(cufftExecC2C(plan, d_col, d_col, CUFFT_FORWARD), "cufftExecC2C")
      && checkCuda(cudaMemcpy(h_col, d_col, bytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(device to host)")
      && checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)")
      && checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize")
      && checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    // The plan is destroyed only after the elapsed time has been read, so
    // plan teardown is not part of the measurement.
    if (planCreated)
        cufftDestroy(plan);
    if (ok)
        cout << milliseconds;

    // Release everything the original leaked. cudaFree(0) and free(0) are no-ops.
    cudaFree(d_col);
    free(h_col);
    if (stop)
        cudaEventDestroy(stop);
    if (start)
        cudaEventDestroy(start);

    return ok ? 0 : 1;
}

コードは、同じコンピューター、同じ OS、同じグラフィックスカードで、すぐに次々と実行されました。どちらの場合も構成は x64 リリースでした。C++ コンパイラを使用してファイルをコンパイルするか、CUDA C/C++ を使用してファイルをコンパイルするかを選択できます。両方のプロジェクトで両方のオプションを試しましたが、違いはありませんでした。

これを修正するためのアイデアはありますか?

FWIW、VS 2013のCuda 6.5でCuda 7と同じ結果が得られます

score 6 · Accepted Answer

cufft ライブラリは 4.2 から 7.0 の間にかなり大きくなり、その結果、初期化時間が大幅に長くなりました。この初期化時間を要因から取り除けば、実行時間の差は 1000 倍よりはるかに小さいことがわかると思います。

これを示す変更されたコードを次に示します。

$ cat t807.cu
#include <cufft.h>
#include <cuComplex.h>
typedef cuComplex Complex;
#include <iostream>
using namespace std;
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess so calls can be chained with &&.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Reports a failed cuFFT call on stderr.
// Returns true when `status` is CUFFT_SUCCESS so calls can be chained with &&.
static bool cufftOk(cufftResult status, const char *what)
{
    if (status == CUFFT_SUCCESS)
        return true;
    cerr << what << " failed with cufftResult " << static_cast<int>(status) << endl;
    return false;
}

// Runs the work shared by both timed passes and prints the elapsed time.
//
// The caller must already have recorded `start`. This function fills `h_col`
// with (i, i), copies it to `d_col`, creates a 1-D C2C plan of `size` points,
// runs an in-place forward FFT, copies the result back, records and waits on
// `stop`, and prints the milliseconds between `start` and `stop` followed by
// a newline. The plan is destroyed after the elapsed time has been read.
//
// Returns true on success; on failure a diagnostic is written to stderr and
// nothing is printed to stdout.
static bool finishTimedPass(cudaEvent_t start, cudaEvent_t stop,
                            Complex *h_col, Complex *d_col, int size)
{
    const int BATCH = 1;
    const size_t bytes = size*sizeof(Complex);

    for (int i = 0; i < size; i++)
    {
        h_col[i].x = static_cast<float>(i);
        h_col[i].y = static_cast<float>(i);
    }
    if (!cudaOk(cudaMemcpy(d_col, h_col, bytes, cudaMemcpyHostToDevice),
                "cudaMemcpy(host to device)"))
        return false;

    cufftHandle plan;
    if (!cufftOk(cufftPlan1d(&plan, size, CUFFT_C2C, BATCH), "cufftPlan1d"))
        return false;

    float milliseconds = 0;
    const bool ok =
           cufftOk(cufftExecC2C(plan, d_col, d_col, CUFFT_FORWARD), "cufftExecC2C")
        && cudaOk(cudaMemcpy(h_col, d_col, bytes, cudaMemcpyDeviceToHost),
                  "cudaMemcpy(device to host)")
        && cudaOk(cudaEventRecord(stop), "cudaEventRecord(stop)")
        && cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize")
        && cudaOk(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    cufftDestroy(plan);
    if (ok)
        cout << milliseconds << endl;
    return ok;
}

// Times the same FFT workload twice and prints one elapsed time per line.
//
// Pass 1 records `start` before the host and device buffers are allocated, so
// its time includes those allocations plus whatever one-time setup the first
// cuFFT plan/exec incurs. Pass 2 reuses the buffers and repeats the fill,
// copies, plan creation and FFT, so the second number shows the cost once
// that first-use setup is out of the way.
//
// Returns 0 on success, 1 if any allocation, CUDA call or cuFFT call failed.
int main(int argc, char* argv[])
{
    const int SIZE = 10000;

    // Start out empty so the cleanup at the bottom is safe after any failure.
    cudaEvent_t start = 0, stop = 0;
    Complex *h_col = 0;
    Complex *d_col = 0;

    bool ok = cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")
           && cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Pass 1: allocations happen inside the timed interval.
    ok = ok && cudaOk(cudaEventRecord(start), "cudaEventRecord(start)");
    if (ok)
    {
        h_col = (Complex*)malloc(SIZE*sizeof(Complex));
        if (h_col == 0)
        {
            cerr << "malloc failed" << endl;
            ok = false;
        }
    }
    ok = ok
      && cudaOk(cudaMalloc((void**)&d_col, SIZE*sizeof(Complex)), "cudaMalloc")
      && finishTimedPass(start, stop, h_col, d_col, SIZE);

    // Pass 2: same work on the already-allocated buffers.
    ok = ok
      && cudaOk(cudaEventRecord(start), "cudaEventRecord(start)")
      && finishTimedPass(start, stop, h_col, d_col, SIZE);

    // Release everything the original leaked. cudaFree(0) and free(0) are no-ops.
    cudaFree(d_col);
    free(h_col);
    if (stop)
        cudaEventDestroy(stop);
    if (start)
        cudaEventDestroy(start);

    return ok ? 0 : 1;
}
$ nvcc -o t807 t807.cu -lcufft
$ ./t807
94.8298
1.44778
$

上記の 2 番目の数値は、基本的に同じコードを、cufft の初期化を除いた状態で(初期化は最初のパスで済んでいるため)実行したときの時間を表しています。

c++ - VS2010/Cuda4.2 と比較して、VS2013/Cuda7.0 では CUFFT が 1000 倍遅い

回答 1 件

Related

Reference