cuda - サイズの異なる 3 つのベクトルに対する Thrust の複雑な変換 (transform)

Question

こんにちは。C++ で書いた次のループを Thrust に変換しようとしましたが、同じ結果が得られませんでした。何かアイデアはありますか? ありがとうございます。

C++ コード

// Serial reference: for each i, adds the sum over j of binv[i*n+j] * d[j] to values[i].
// The i*n+j indexing treats binv as an n x n matrix stored row by row.
for (i=0;i<n;i++) 
    for (j=0;j<n;j++) 
      values[i]=values[i]+(binv[i*n+j]*d[j]);

Thrust コード

// Asker's attempt: zero `values`, then run a single thrust::transform over a zipped range of
// (values[idx / n], binv[idx], d[idx % n]) and write function1's result to values[idx / n].
// Every group of n consecutive idx values maps to the same values[idx / n] element.
// NOTE(review): in the end tuple the two permutation iterators are advanced by n while the
// middle component is binv.end(); confirm how many elements Thrust actually processes.
thrust::fill(values.begin(), values.end(), 0);
thrust::transform(make_zip_iterator(make_tuple(
                thrust::make_permutation_iterator(values.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n))),
                binv.begin(),
                thrust::make_permutation_iterator(d.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexModFunctor(n))))),
                make_zip_iterator(make_tuple(
                thrust::make_permutation_iterator(values.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n))) + n,
                binv.end(),
                thrust::make_permutation_iterator(d.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexModFunctor(n))) + n)),
                thrust::make_permutation_iterator(values.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n))),
                function1()
                );

Thrust のファンクター

// Unary functor mapping a flat index to idx / n (integer division by the stored n).
// For a matrix indexed as i*n+j this recovers the row index i.
struct IndexDivFunctor: thrust::unary_function<int, int>
{
  int n;  // divisor; must be non-zero when operator() is invoked

  IndexDivFunctor(int n_) : n(n_) {}

  // const-qualified: the call does not modify the functor, so it can also be
  // invoked through const objects and references.
  __host__ __device__
  int operator()(int idx) const
  {
    return idx / n;
  }
};

// Unary functor mapping a flat index to idx % n (remainder of division by the stored n).
// For a matrix indexed as i*n+j this recovers the column index j.
struct IndexModFunctor: thrust::unary_function<int, int>
{
  int n;  // modulus; must be non-zero when operator() is invoked

  IndexModFunctor(int n_) : n(n_) {}

  // const-qualified: the call does not modify the functor, so it can also be
  // invoked through const objects and references.
  __host__ __device__
  int operator()(int idx) const
  {
    return idx % n;
  }
};


// Multiply-add functor over a 3-element tuple:
// returns get<0>(v) + get<1>(v) * get<2>(v), converted to double.
struct function1
{
  // const-qualified: the functor holds no state, so the call never mutates it.
  template <typename Tuple>
  __host__ __device__
  double operator()(Tuple v) const
  {
    return thrust::get<0>(v) + thrust::get<1>(v) * thrust::get<2>(v);
  }
};

score 4 · Accepted Answer

まず、いくつかの一般的なコメント。あなたのループ

// Serial reference: for each i, adds the sum over j of B[i*n+j] * d[j] to v[i].
// The i*n+j indexing treats B as an n x n matrix stored row by row.
for (i=0;i<n;i++) 
    for (j=0;j<n;j++) 
      v[i]=v[i]+(B[i*n+j]*d[j]);

標準のBLAS gemv操作と同等です

$$y \leftarrow \alpha A x + \beta y$$

(上のループでは $A = B$、$x = d$、$y = v$、$\alpha = \beta = 1$ に相当します。)

ここで、行列は行優先 (row-major) 順に格納されているものとします。デバイス上でこれを行う最適な方法は、Thrust のプリミティブを組み合わせて構築することではなく、CUBLAS を使用することです。

とはいえ、あなたが投稿した Thrust コードが逐次コードと同じ動作をすることは決してありません。表示されている誤差は、浮動小数点演算の非結合性に起因するものではありません。thrust::transform は基本的に、与えられたファンクターを入力イテレーター範囲のすべての要素に適用し、その結果を出力イテレーターに格納します。投稿されたループと同じ結果を得るには、thrust::transform の呼び出しで、投稿された fmad ファンクターの演算を (n*n) 回実行する必要がありますが、明らかにそうなっていません。さらに、thrust::transform が総和/リダクション操作をメモリ競合に対して安全な方法で実行するという保証はありません。

正しい解決策は、おそらく次のようなものになるでしょう。

thrust::transform を使用して、B と d の要素の (n*n) 個の積を計算します。
thrust::reduce_by_key を使用して積を行ごとの部分和にリダクションし、Bd を生成します。
thrust::transform を使用して、得られた行列ベクトル積を v に加算し、最終結果を生成します。

コードでは、最初に次のようなファンクターを定義します。

// Product functor over a 2-element tuple:
// returns get<0>(v) * get<1>(v), converted to double.
struct functor
{
  // const-qualified: the functor holds no state, so the call never mutates it.
  template <typename Tuple>
  __host__ __device__
  double operator()(Tuple v) const
  {
    return thrust::get<0>(v) * thrust::get<1>(v);
  }
};

次に、次のようにして行列とベクトルの乗算を計算します

  typedef thrust::device_vector<int> iVec;
  typedef thrust::device_vector<double> dVec;

  typedef thrust::counting_iterator<int> countIt;
  // NOTE: despite the names, columnIt yields idx / n (the row index of element idx in the
  // row-major matrix) and rowIt yields idx % n (its column index).
  typedef thrust::transform_iterator<IndexDivFunctor, countIt> columnIt;
  typedef thrust::transform_iterator<IndexModFunctor, countIt> rowIt;

  // Assuming the following allocations on the device
  dVec B(n*n), v(n), d(n);

  // Index sequences over the n*n matrix elements:
  // cv_* -> idx / n, used below as the reduction key (one key per matrix row);
  // rv_* -> idx % n, used below to pick the element of d matching each matrix element.
  columnIt cv_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n));
  columnIt cv_end   = cv_begin + (n*n);

  rowIt rv_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexModFunctor(n));
  rowIt rv_end   = rv_begin + (n*n);

  // Step 1: temp[idx] = B[idx] * d[idx % n] for all n*n matrix elements.
  dVec temp(n*n);
  thrust::transform(thrust::make_zip_iterator(
                      thrust::make_tuple(
                        B.begin(),
                        thrust::make_permutation_iterator(d.begin(),rv_begin) ) ),
                    thrust::make_zip_iterator(
                      thrust::make_tuple(
                        B.end(),
                        // Same element base (d.begin()) as the begin iterator: a permutation
                        // range is delimited by its index iterator, not by the element iterator.
                        thrust::make_permutation_iterator(d.begin(),rv_end) ) ),
                    temp.begin(),
                    functor());

  // Step 2: sum each run of n consecutive products sharing the key idx / n, giving Bd = B*d.
  iVec outkey(n);
  dVec Bd(n);
  thrust::reduce_by_key(cv_begin, cv_end, temp.begin(), outkey.begin(), Bd.begin());
  // Step 3: v = v + Bd.
  thrust::transform(v.begin(), v.end(), Bd.begin(), v.begin(), thrust::plus<double>());

もちろん、これは CUBLAS の dgemv のような、専用に設計された行列ベクトル積のコードを使用する場合と比べると、非常に非効率な計算方法です。

score 0 · Accepted Answer

あなたの結果はどのくらい違いますか？まったく違う答えですか、それとも最後の桁だけ違うのですか? ループは 1 回だけ実行されますか、それともある種の反復プロセスですか?

浮動小数点演算、特に値の加算や乗算を繰り返す演算では、精度の問題により結合則が成り立ちません。さらに、fast-math (高速演算) 最適化を使用している場合、演算が IEEE に準拠しない可能性があります。

まず、浮動小数点数に関するこのウィキペディアのセクションをチェックしてください: http://en.wikipedia.org/wiki/Floating_point#Accuracy_problems

cuda - サイズの異なる 3 つのベクトルに対する Thrust の複雑な変換 (transform)

回答 2 件

Related

Reference