cuda - CUDA Thrustを使用してインデックスリストで行列から行を収集する方法

Question

これは一見単純な問題ですが、CUDA Thrust でこれを行うエレガントな方法がわかりません。

2 次元行列 NxM と、すべての行のサブセット (つまり、L < N) であり、規則的ではないサイズ L の目的の行インデックスのベクトルがあります (基本的に、7,11,13,205 のような不規則なリストです...等。）。行列は、推力デバイスベクトルの行ごとに格納されます。インデックスの配列もデバイスベクトルです。ここに私の2つの質問があります：

新しい行列 LxM を形成する元の NxM 行列から目的の行をコピーする最も効率的な方法は何ですか?
目的の行に属する要素のみを逆参照する元の NxM 行列の反復子を作成することは可能ですか?

ご助力ありがとうございます。

score 3 · Accepted Answer

あなたが求めているのは、かなり単純なストリーム圧縮の問題のように思えます。推力でそれを行うのに特に問題はありませんが、いくつかのひねりがあります。コピーする行を選択するには、ストリーム圧縮アルゴリズムで使用できるステンシルまたはキーが必要です。これは、コピーする行のリストを使用して検索または選択操作によって構築する必要があります。

これを行う手順の例は、次のようになります。

入力行列の任意のエントリの行番号を返す反復子を作成します。スラストには非常に便利な機能がcounting_iteratorあり、transform_iteratorこれを組み合わせてこれを行うことができます
その行番号反復子の検索を実行して、コピーする行のリストに一致するエントリを見つけます。thrust::binary searchこれに使用できます。検索により、ストリーム圧縮操作のステンシルが得られます
ステンシルを使用thrust::copy_ifして入力行列でストリーム圧縮を実行するために使用します。

多くの作業と中間ステップのように思えますが、カウントと変換の反復子は実際には中間デバイスベクトルを生成しません。必要な唯一の中間ストレージはステンシル配列です。これはブール値 (つまり m*n バイト) にすることができます。

コードの完全な例:

#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/device_vector.h>
#include <cstdio>

struct div_functor : public thrust::unary_function<int,int>
{
    int m;
    div_functor(int _m) : m(_m) {};

    __host__ __device__
    int operator()(int x) const
    {
        return x / m;
    }
};

struct is_true
{
    __host__ __device__
    bool operator()(bool x) { return x; }
};


int main(void)
{

    // dimensions of the problem
    const int m=20, n=5, l=4;

    // Counting iterator for generating sequential indices

    // Sample matrix containing 0...(m*n)
    thrust::counting_iterator<float> indices(0.f);
    thrust::device_vector<float> in_matrix(m*n);
    thrust::copy(indices, indices+(m*n), in_matrix.begin());

    // device vector contain rows to select
    thrust::device_vector<int> select(l);
    select[0] = 1;
    select[1] = 4;
    select[2] = 9;
    select[3] = 16;

    // construct device iterator supplying row numbers via a functor
    typedef thrust::counting_iterator<int> counter;
    typedef thrust::transform_iterator<div_functor, counter> rowIterator;
    rowIterator rows_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), div_functor(n));
    rowIterator rows_end = rows_begin + (m*n);

    // constructor a stencil array which indicates which entries will be copied
    thrust::device_vector<bool> docopy(m*n);
    thrust::binary_search(select.begin(), select.end(), rows_begin, rows_end, docopy.begin());

    // use stream compaction on the matrix with the stencil array
    thrust::device_vector<float> out_matrix(l*n);
    thrust::copy_if(in_matrix.begin(), in_matrix.end(), docopy.begin(), out_matrix.begin(), is_true());

    for(int i=0; i<(l*n); i++) {
        float val = out_matrix[i];
        printf("%i %f\n", i, val);
    }
}

（通常の免責事項：自己責任で使用してください）

私がする唯一のコメントについては、copy_if直接使用できるバイナリステンシルが既にあることを考えると、呼び出しの述語が少し冗長に感じるということですが、バイナリステンシルを直接。同様に、ストリーム圧縮呼び出しで行のリストを直接使用する賢明な方法を思いつきませんでした。これを推力で行うもっと効率的な方法があるかもしれませんが、少なくともこれで始めることができます。

あなたのコメントから、スペースが狭く、バイナリ検索とステンシル作成の追加のメモリオーバーヘッドがアプリケーションにとって法外なものであるようです。その場合、Roger Dahl の回答に対するコメントで提供したアドバイスに従い、代わりにカスタムコピーカーネルを使用します。スラストデバイスベクトルは、カーネルに直接渡すことができるポインターにキャストできるthrust::raw_pointer_castため ( )、既存のスラストコードに干渉する必要はありません。行ごとにスレッドのブロックを使用してコピーすることをお勧めします。これにより、読み取りと書き込みの合体が可能になり、thrust::copy行ごとに使用するよりもはるかに優れたパフォーマンスが得られます。非常に単純な実装は次のようになります (推力の例のほとんどを再利用します)。

#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <cstdio>

__global__ 
void rowcopykernel(const float *in, float *out, const int *list, const int m, const int n, const int l)
{
    __shared__ const float * inrowp; 
    __shared__ float * outrowp;

    if (threadIdx.x == 0) {
        inrowp = (blockIdx.x < l) ? in + (n*list[blockIdx.x]) : 0;
        outrowp = out + (n*blockIdx.x);
    }
    __syncthreads();

    for(int i=threadIdx.x; (inrowp != 0) && (i<n); i+=blockDim.x) {
        *(outrowp+i) = *(inrowp+i);
    }
}

int main(void)
{
    // dimensions of the problem
    const int m=20, n=5, l=4;

    // Sample matrix containing 0...(m*n)
    thrust::counting_iterator<float> indices(0.f);
    thrust::device_vector<float> in_matrix(m*n);
    thrust::copy(indices, indices+(m*n), in_matrix.begin());

    // device vector contain rows to select
    thrust::device_vector<int> select(l);
    select[0] = 1;
    select[1] = 4;
    select[2] = 9;
    select[3] = 16;

    // Output matrix
    thrust::device_vector<float> out_matrix(l*n);

    // raw pointer to thrust vectors
    int * selp = thrust::raw_pointer_cast(&select[0]);
    float * inp = thrust::raw_pointer_cast(&in_matrix[0]);
    float * outp = thrust::raw_pointer_cast(&out_matrix[0]);

    dim3 blockdim = dim3(128);
    dim3 griddim = dim3(l);
    rowcopykernel<<<griddim,blockdim>>>(inp, outp, selp, m, n, l);

    for(int i=0; i<(l*n); i++) {
        float val = out_matrix[i];
        printf("%i %f\n", i, val);
    }
}

(標準免責事項: 自己責任で使用してください)。

実行パラメータの選択はより手の込んだものにすることができますが、それ以外の場合は、それで十分です。行が非常に小さい場合は、ブロックではなく行ごとにワープを使用して調査することをお勧めします (したがって、1 つのブロックが複数の行をコピーします)。出力行が 65535 を超える場合は、2D グリッドを使用するか、コードを変更して各ブロックが複数の行を実行するようにする必要があります。しかし、推力ベースのソリューションと同様に、これで始められるはずです。

score 1 · Accepted Answer

スラストに固執していない場合は、Arrafireをチェックしてください。

驚くべきことに、このライブラリは添字インデックス作成をネイティブにサポートしているため、わずか数行のコードで問題を解決できます。

const int N = 7, M = 5;

float L_host[] = {3, 6, 4, 1}; 
int szL = sizeof(L_host) / sizeof(float); 

// generate random NxM matrix with cuComplex data 
array A = randu(N, M, c32); 
// array used to index rows
array L(szL, 1, L_host);

print(A);
print(L);

array B = A(L,span); // copy selected rows of A
print(B);

そして結果：

A =
       0.7402 +       0.9210i       0.6814 +       0.2920i       0.5786 +       0.5538i       0.2133 +       0.4131i       0.7305 +       0.9400i
       0.0390 +       0.9690i       0.3194 +       0.8109i       0.3557 +       0.7229i       0.0328 +       0.5360i       0.8432 +       0.6116i
       0.9251 +       0.4464i       0.1541 +       0.4452i       0.2783 +       0.6192i       0.7214 +       0.3546i       0.2674 +       0.0208i
       0.6673 +       0.1099i       0.2080 +       0.6110i       0.5876 +       0.3750i       0.2527 +       0.9847i       0.8331 +       0.7218i
       0.4702 +       0.5132i       0.3073 +       0.4156i       0.2405 +       0.4148i       0.9200 +       0.1872i       0.6087 +       0.6301i
       0.7762 +       0.2948i       0.2343 +       0.8793i       0.0937 +       0.6326i       0.1820 +       0.5984i       0.5298 +       0.8127i
       0.7140 +       0.3585i       0.6462 +       0.9264i       0.2849 +       0.7793i       0.7082 +       0.0421i       0.0593 +       0.4797i


L = (row indices)
       3.0000 
       6.0000 
       4.0000 
       1.0000




B =
       0.6673 +       0.1099i       0.2080 +       0.6110i       0.5876 +       0.3750i       0.2527 +       0.9847i       0.8331 +       0.7218i
       0.7140 +       0.3585i       0.6462 +       0.9264i       0.2849 +       0.7793i       0.7082 +       0.0421i       0.0593 +       0.4797i
       0.4702 +       0.5132i       0.3073 +       0.4156i       0.2405 +       0.4148i       0.9200 +       0.1872i       0.6087 +       0.6301i
       0.0390 +       0.9690i       0.3194 +       0.8109i       0.3557 +       0.7229i       0.0328 +       0.5360i       0.8432 +       0.6116i

また、かなり高速に動作します。次のコードを使用して、サイズ 2000 x 2000 の cuComplex の配列でこれをテストしました。

float *g_data = 0, *g_data2 = 0;
int g_N = 2000, g_M = 2000, // matrix of size g_N x g_M
       g_L = 400;          // copy g_L rows
void af_test() 
{   
  array A(g_N, g_M, (cuComplex *)g_data, afDevicePointer);
  array L(g_L, 1, g_data2, afDevicePointer);
  array B = (A(L, span));
  std::cout << "sz: " << B.elements() << "\n";
}

int main()
{
   // input matrix N x M of cuComplex
   array in = randu(g_N, g_M, c32);
   g_data = (float *)in.device< cuComplex >();
   // generate unique row indices
   array in2 = setunique(floor(randu(g_L) * g_N));
   print(in2);
   g_data2 = in2.device<float>();

   const int N_ITERS = 30;
   try {
     info();
     af::sync();
     timer::tic();

     for(int i = 0; i < N_ITERS; i++) {
       af_test();      
     }
     af::sync();
     printf("af:  %.5f seconds\n",  timer::toc() / N_ITERS);


   } catch (af::exception& e) {
     fprintf(stderr, "%s\n", e.what());
  }
  in.unlock();
  in2.unlock();
}

score 0 · Accepted Answer

Thrust でこれを行う方法はないと思いますが、操作はメモリバウンドになるため、この操作を最大限のパフォーマンスで実行するカーネルを簡単に作成できるはずです。ベクトルにあるインデックスと同じ数のスレッドを作成するだけです。各スレッドで 1 つの行のソースアドレスと宛先アドレスを計算し、それを使用memcpy()して行をコピーします。

また、後続の処理ステップを設定して行にアクセスできるかどうかを慎重に検討することもできます。これにより、メモリをシャッフルするだけの高価な「圧縮」操作全体を回避できます。行のアドレス指定が少し複雑になったとしても (追加のメモリルックアップと乗算など)、全体的なパフォーマンスは大幅に向上する可能性があります。

cuda - CUDA Thrustを使用してインデックスリストで行列から行を収集する方法

3 に答える 3

Related

Reference