cuda - CUDAの行列の行または列を減らす

Question

cuBLAS行列演算を実行するためにCUDAを使用しています。

行列の行（または列）を合計する必要があります。現在、行列に1のベクトルを乗算して実行していますが、これはそれほど効率的ではないようです。

より良い方法はありますか？に何も見つかりませんでしたcuBLAS。

score 6 · Accepted Answer

cublas_gemv()独自のカーネルを手動で作成することを検討している場合を除き、実際には、を使用して行列に1のベクトルを乗算することは非常に効率的な方法です。

のmem帯域幅を簡単にプロファイリングできますcublas_gemv()。これは、マトリックスデータ全体を1回読み取るだけの場合に非常に近く、マトリックスの行/列の合計の理論上のピークパフォーマンスと見なすことができます。

余分な操作「x1.0」は、次の理由でパフォーマンスを大幅に低下させることはありません。

cublas_gemv()基本的にはmem帯域幅にバインドされた操作であり、余分な算術命令がボトルネックになることはありません。
FMA命令は、命令スループットをさらに低下させます。
mem of onesベクトルは通常、マトリックスのmemよりもはるかに小さく、GPUで簡単にキャッシュして、mem帯域幅に減らすことができます。

cublas_gemv()また、マトリックスレイアウトの問題に対処するのに役立ちます。行/列メジャーおよび任意のパディングで機能します。

これについても同様の質問をしました。私の実験では、行列行の合計の別のアプローチである、をcublas_gemv()使用したセグメント化された削減よりも優れていることが示されています。Thrust::reduce_by_key

score 1 · Accepted Answer

同じトピックに関する有用な回答を含むこれに関連する投稿は、次のURLで入手できます。

CUDAで行列行を減らす

と

CUDAで行列列を減らします。

ここで、同じ行列による行の乗算によって行列の列を減らすアプローチを一般化して、ベクトルの集合の線形結合を実行する方法を指摘したいと思います。言い換えれば、次のベクトル基底展開を計算したい場合

ここでf(x_m)、は関数のサンプルでありf(x)、\psi_n「s」は基本関数であり、c_n「s」は展開係数です。「」は行列\psi_nに編成され、係数は行ベクトルに編成され、次を使用してベクトルx行列の乗算を計算できます。。N x Mc_ncublas<t>gemv

以下に、完全に機能する例を報告します。

#include <cublas_v2.h>

#include <thrust/device_vector.h>
#include <thrust/random.h>

#include <stdio.h>
#include <iostream>

#include "Utilities.cuh"

/********************************************/
/* LINEAR COMBINATION FUNCTION - FLOAT CASE */
/********************************************/
void linearCombination(const float * __restrict__ d_coeff, const float * __restrict__ d_basis_functions_real, float * __restrict__ d_linear_combination,
                       const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle) {

    float alpha = 1.f;
    float beta  = 0.f;
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha, d_basis_functions_real, N_sampling_points, 
                               d_coeff, 1, &beta, d_linear_combination, 1));

}

void linearCombination(const double * __restrict__ d_coeff, const double * __restrict__ d_basis_functions_real, double * __restrict__ d_linear_combination,
                       const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle) {

    double alpha = 1.;
    double beta  = 0.;
    cublasSafeCall(cublasDgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha, d_basis_functions_real, N_sampling_points, 
                               d_coeff, 1, &beta, d_linear_combination, 1));

}

/********/
/* MAIN */
/********/
int main()
{
    const int N_basis_functions = 5;     // --- Number of rows                  -> Number of basis functions
    const int N_sampling_points = 8;     // --- Number of columns               -> Number of sampling points of the basis functions

    // --- Random uniform integer distribution between 10 and 99
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(10, 99);

    // --- Matrix allocation and initialization
    thrust::device_vector<float> d_basis_functions_real(N_basis_functions * N_sampling_points);
    for (size_t i = 0; i < d_basis_functions_real.size(); i++) d_basis_functions_real[i] = (float)dist(rng);

    thrust::device_vector<double> d_basis_functions_double_real(N_basis_functions * N_sampling_points);
    for (size_t i = 0; i < d_basis_functions_double_real.size(); i++) d_basis_functions_double_real[i] = (double)dist(rng);

    /************************************/
    /* COMPUTING THE LINEAR COMBINATION */
    /************************************/
    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    thrust::device_vector<float>  d_linear_combination_real(N_sampling_points);
    thrust::device_vector<double> d_linear_combination_double_real(N_sampling_points);
    thrust::device_vector<float>  d_coeff_real(N_basis_functions, 1.f);
    thrust::device_vector<double> d_coeff_double_real(N_basis_functions, 1.);

    linearCombination(thrust::raw_pointer_cast(d_coeff_real.data()), thrust::raw_pointer_cast(d_basis_functions_real.data()), thrust::raw_pointer_cast(d_linear_combination_real.data()),
                      N_basis_functions, N_sampling_points, handle);
    linearCombination(thrust::raw_pointer_cast(d_coeff_double_real.data()), thrust::raw_pointer_cast(d_basis_functions_double_real.data()), thrust::raw_pointer_cast(d_linear_combination_double_real.data()),
                      N_basis_functions, N_sampling_points, handle);

    /*************************/
    /* DISPLAYING THE RESULT */
    /*************************/
    std::cout << "Real case \n\n";
    for(int j = 0; j < N_sampling_points; j++) {
        std::cout << "Column " << j << " - [ ";
        for(int i = 0; i < N_basis_functions; i++)
            std::cout << d_basis_functions_real[i * N_sampling_points + j] << " ";
        std::cout << "] = " << d_linear_combination_real[j] << "\n";
    }

    std::cout << "\n\nDouble real case \n\n";
    for(int j = 0; j < N_sampling_points; j++) {
        std::cout << "Column " << j << " - [ ";
        for(int i = 0; i < N_basis_functions; i++)
            std::cout << d_basis_functions_double_real[i * N_sampling_points + j] << " ";
        std::cout << "] = " << d_linear_combination_double_real[j] << "\n";
    }

    return 0;
}

cuda - CUDAの行列の行または列を減らす

2 に答える 2

Related

Reference