performance - 最大のパフォーマンスでCUDAの行列列を正規化する方法は?

Question

CUDAで行列列を効果的に正規化するには?

私の行列は列優先で格納され、通常のサイズは 2000x200 です。

この操作は、次の matlab コードで表すことができます。

A = rand(2000,200);

A = exp(A);
A = A./repmat(sum(A,1), [size(A,1) 1]);

これは、Thrust、cuBLAS、および/または cuNPP によって効果的に実行できますか?

4 つのカーネルを含む迅速な実装を以下に示します。

特に cublasDgemv() によって実装される列の合計ステップのパフォーマンスを向上させるために、これらを 1 つまたは 2 つのカーネルで実行できるかどうか疑問に思っています。

#include <cuda.h>
#include <curand.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/iterator/constant_iterator.h>
#include <math.h>

struct Exp
{
    __host__ __device__ void operator()(double& x)
    {
        x = exp(x);
    }
};

struct Inv
{
    __host__ __device__ void operator()(double& x)
    {
        x = (double) 1.0 / x;
    }
};

int main()
{
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
    cublasHandle_t hd;
    curandGenerator_t rng;
    cublasCreate(&hd);
    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);

    const size_t m = 2000, n = 200;
    const double c1 = 1.0;
    const double c0 = 0.0;

    thrust::device_vector<double> A(m * n);
    thrust::device_vector<double> sum(1 * n);
    thrust::device_vector<double> one(m * n, 1.0);

    double* pA = thrust::raw_pointer_cast(&A[0]);
    double* pSum = thrust::raw_pointer_cast(&sum[0]);
    double* pOne = thrust::raw_pointer_cast(&one[0]);

    for (int i = 0; i < 100; i++)
    {
        curandGenerateUniformDouble(rng, pA, A.size());


        thrust::for_each(A.begin(), A.end(), Exp());

        cublasDgemv(hd, CUBLAS_OP_T, m, n,
                &c1, pA, m, pOne, 1, &c0, pSum, 1);

        thrust::for_each(sum.begin(), sum.end(), Inv());

        cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pA, m, pSum, 1, pA, m);
    }

    curandDestroyGenerator(rng);
    cublasDestroy(hd);

    return 0;
}

score 5 · Accepted Answer

M2090 と CUDA 5.0 での 3 つのアプローチのパフォーマンスを比較しました。

[173.179 us]質問に示されているcublasの実装
thrust::reduce_by_key[733.734 us] @talonmiesからの純粋な Thrust 実装
[1.508 ms] 純粋なスラストの実装thrust::inclusive_scan_by_key

$A_{2,000 x 200} のパフォーマンス$

次のことがわかります。

この場合、cublas が最高のパフォーマンスを発揮します。
両方thrust::reduce_by_keyとthrust::inclusive_scan_by_key複数のカーネルを起動すると、余分なオーバーヘッドが発生します。
thrust::inclusive_scan_by_keyよりもはるかに多くのデータを DRAM に書き込みますthrust::reduce_by_key。これは、カーネル時間が長くなる理由の 1 つになる可能性があります。
cublas とスラストアプローチの主なパフォーマンスの違いは、行列の列の合計です。thrust::reduce_by_key可変長のセグメントを削減するように設計されているため、推力は遅くなりcublas_gemv()ますが、固定長のセグメント (行/列) にのみ適用できます。

行列 A がカーネル起動のオーバーヘッドを無視できるほど大きい場合でも、cublas アプローチは最高のパフォーマンスを発揮します。A_{20,000 x 2,000} のプロファイリング結果は次のとおりです。

$A_{20,000 x 2,000} のパフォーマンス$

@talonmies で示されているように、最初のfor_each操作と呼び出しを融合すると、パフォーマンスがさらに向上する可能性がありますが、 .cublasSgemvthrust::reduce_by_key

3 つのアプローチのコードを以下に示します。

#include <cuda.h>
#include <curand.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <math.h>

struct Exp: public thrust::unary_function<double, double>
{
    __host__ __device__ double operator()(double x)
    {
        return exp(x);
    }
};

struct Inv: public thrust::unary_function<double, double>
{
    __host__ __device__ double operator()(double x)
    {
        return (double) 1.0 / x;
    }
};

template<typename T>
struct MulC: public thrust::unary_function<T, T>
{
    T C;
    __host__ __device__ MulC(T c) :
        C(c)
    {
    }
    __host__ __device__ T operator()(T x)
    {
        return x * C;
    }
};

template<typename T>
struct line2col: public thrust::unary_function<T, T>
{
    T C;
    __host__ __device__ line2col(T C) :
            C(C)
    {
    }

    __host__ __device__ T operator()(T i)
    {
        return i / C;
    }
};

int main()
{
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
    cublasHandle_t hd;
    curandGenerator_t rng;
    cublasCreate(&hd);
    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);

    const size_t m = 2000, n = 200;
    const double c1 = 1.0;
    const double c0 = 0.0;

    thrust::device_vector<double> A(m * n);
    thrust::device_vector<double> B(m * n);
    thrust::device_vector<double> C(m * n);
    thrust::device_vector<double> sum1(1 * n);
    thrust::device_vector<double> sum2(1 * n);
    thrust::device_vector<double> one(m * n, 1);

    double* pA = thrust::raw_pointer_cast(&A[0]);
    double* pB = thrust::raw_pointer_cast(&B[0]);
    double* pSum1 = thrust::raw_pointer_cast(&sum1[0]);
    double* pSum2 = thrust::raw_pointer_cast(&sum2[0]);
    double* pOne = thrust::raw_pointer_cast(&one[0]);

    curandGenerateUniformDouble(rng, pA, A.size());

    const int count = 2;

    for (int i = 0; i < count; i++)
    {
        thrust::transform(A.begin(), A.end(), B.begin(), Exp());
        cublasDgemv(hd, CUBLAS_OP_T, m, n, &c1, pB, m, pOne, 1, &c0, pSum1, 1);
        thrust::transform(sum1.begin(), sum1.end(), sum1.begin(), Inv());
        cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pB, m, pSum2, 1, pB, m);
    }

    for (int i = 0; i < count; i++)
    {
        thrust::reduce_by_key(
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)),
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(),
                thrust::make_transform_iterator(A.begin(), Exp()),
                thrust::make_discard_iterator(),
                sum2.begin());
        thrust::transform(
                A.begin(), A.end(),
                thrust::make_permutation_iterator(
                        sum2.begin(),
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))),
                C.begin(),
                thrust::divides<double>());
    }

    for (int i = 0; i < count; i++)
    {
        thrust::inclusive_scan_by_key(
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)),
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(),
                thrust::make_transform_iterator(A.begin(), Exp()),
                C.begin());
        thrust::copy(
                thrust::make_permutation_iterator(
                        C.begin() + m - 1,
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))),
                thrust::make_permutation_iterator(
                        C.begin() + m - 1,
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))) + n,
                sum2.begin());
        thrust::transform(
                A.begin(), A.end(),
                thrust::make_permutation_iterator(
                        sum2.begin(),
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))),
                C.begin(),
                thrust::divides<double>());
    }

    curandDestroyGenerator(rng);
    cublasDestroy(hd);

    return 0;
}

score 3 · Accepted Answer

for_each最初の操作とcublasSgemv呼び出しを 1 つの呼び出しに融合できるはずですreduce_by_key。ファンクターを次のように定義/再定義する場合:

struct Accessor : public thrust::unary_function<int,int>
{
    int lda;
    __host__ __device__ Accessor(int _lda) : lda(_lda) {};
    __host__ __device__ int operator()(const int& idx)
    {
        return idx/lda;
    }
};

struct Exp : public thrust::unary_function<double,double>
{
    __host__ __device__ double operator()(const double& x)
    {
        return exp(x);
    }
};

struct Inv : public thrust::unary_function<double,double>
{
    __host__ __device__ double operator()(const double& x)
    {
        return double(1.0) / x;
    }
};

次に、正規化された出力を次のように計算できます。

Accessor columns(m);
thrust::reduce_by_key(
        thrust::make_transform_iterator(thrust::make_counting_iterator(int(0)), columns),
        thrust::make_transform_iterator(thrust::make_counting_iterator(int(m*n)), columns),
        thrust::make_transform_iterator(A.begin(), Exp()),
        thrust::make_discard_iterator(),
        sum.begin());

thrust::for_each(sum.begin(), sum.end(), Inv());

cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pA, m, pSum, 1, pA, m);

[免責事項: すべてのコードはブラウザーで記述され、テストされていません。自己責任で使用してください]

カーネル呼び出しの数を減らすこととは別に、ファンシーイテレータを使用すると、大きなユニットマトリックスが不要になり、メモリフットプリントとメモリトランザクションの総数を減らして、合計とべき乗演算を実行できます。

score 2 · Accepted Answer

次の方法でArrayFireを使用できます

array A = randu(2000, 2000);
A = exp(A);
A /= tile(sum(A, 0), A.dims(0), 1);

スラストでもこれを行うことができます。しかし、(単純なベクトルとは対照的に) 行列を使用する場合は、効率的ではない for ループで実行する必要があります。

免責事項私は Accelereyes の開発者で、arrayfire に取り組んでいます。

編集要求に応じて、新しいベンチマークの生成に取り組んでいます。

EDITexpこのベンチマークにより、コードにパフォーマンスのバグが見つかりました。見直して修正中です。

performance - 最大のパフォーマンスでCUDAの行列列を正規化する方法は?

3 に答える 3

Related

Reference