cuda - CUDA Cを使用してスパース配列をすばやく圧縮するにはどうすればよいですか？

Question

概要

デバイスメモリ内に配列[A - B - - - C]がありますが、必要なのは[A B C]です。CUDA Cを使用する最も速い方法は何ですか？

コンテクスト

デバイス（GPU）メモリに整数の配列Aがあります。各反復で、0より大きいいくつかの要素をランダムに選択し、それらから1を引きます。0に等しい要素のソートされたルックアップ配列Lを維持します。

Array A:
       @ iteration i: [0 1 0 3 3 2 0 1 2 3]
   @ iteration i + 1: [0 0 0 3 2 2 0 1 2 3]

Lookup for 0-elements L:
       @ iteration i: [0 - 2 - - - 6 - - -]  ->  want compacted form: [0 2 6]
   @ iteration i + 1: [0 1 2 - - - 6 - - -]  ->  want compacted form: [0 1 2 6]

（ここでは、要素1と4をランダムに選択し、それらから1を減算します。私のCUDA Cでの実装では、各スレッドがAの1つの要素にマップされるため、データの競合を防ぎ、並べ替えられた順序を維持するために、ルックアップ配列はスパースになっています（例：[0 2 6 1]ではなく[0 1 2 6]）。）

後で、0に等しい要素に対してのみいくつかの操作を実行します。したがって、スレッドを0要素にマップできるように、スパースルックアップ配列Lを圧縮する必要があります。

そのため、CUDA Cを使用してデバイスメモリ上のスパースアレイを圧縮する最も効率的な方法は何ですか？

どうもありがとう。

score 3 · Accepted Answer

私が持っているとしましょう：

int V[] = {1, 2, 0, 0, 5};

そして、私の望ましい結果は次のとおりです。

int R[] = {1, 2, 5}

実際には、ゼロの要素を削除するか、ゼロ以外の場合にのみ要素をコピーします。

#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <stdio.h>
#define SIZE 5

// cudaCheckErrors(msg): reads (and clears) the last CUDA runtime error via
// cudaGetLastError(). If an error is pending, prints `msg`, the CUDA error
// string and the source location to stderr, then terminates with exit(1).
// Place it immediately after the CUDA call whose result should be verified.
// The do { ... } while (0) wrapper makes the macro behave as one statement.
// The local is named without a leading double underscore because such
// identifiers are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(cuda_check_err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

  // Unary predicate for thrust::copy_if: selects the elements to keep.
  // Callable from both host and device code.
  struct is_not_zero
  {
    // Returns true when x is non-zero. Const-qualified so the predicate can
    // be invoked through a const object or const reference.
    __host__ __device__
    bool operator()(const int x) const
    {
      return (x != 0);
    }
  };



// Copies a small host array to the device, compacts its non-zero elements
// with thrust::copy_if, copies the compacted elements back and prints them.
int main(){

  int V[] = {1, 2, 0, 0, 5};
  int R[] = {0, 0, 0, 0, 0};
  int *d_V, *d_R;

  cudaMalloc((void **)&d_V, SIZE*sizeof(int));
  cudaCheckErrors("cudaMalloc1 fail");
  cudaMalloc((void **)&d_R, SIZE*sizeof(int));
  cudaCheckErrors("cudaMalloc2 fail");

  cudaMemcpy(d_V, V, SIZE*sizeof(int), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy1 fail");

  // Wrap the raw device pointers so Thrust dispatches to the device backend.
  thrust::device_ptr<int> dp_V(d_V);
  thrust::device_ptr<int> dp_R(d_R);

  // copy_if returns an iterator one past the last element it wrote; the
  // distance from the start of the output is the number of kept elements.
  thrust::device_ptr<int> dp_R_end =
      thrust::copy_if(dp_V, dp_V + SIZE, dp_R, is_not_zero());
  cudaCheckErrors("copy_if fail");
  const int count = (int)(dp_R_end - dp_R);

  // Only the first `count` entries of d_R were written by copy_if; the rest
  // of the cudaMalloc'd buffer is uninitialized, so do not copy it back.
  cudaMemcpy(R, d_R, count*sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy2 fail");

  for (int i = 0; i<count; i++)
    printf("R[%d]: %d\n", i, R[i]);

  // Release the device buffers.
  cudaFree(d_V);
  cudaCheckErrors("cudaFree1 fail");
  cudaFree(d_R);
  cudaCheckErrors("cudaFree2 fail");

  return 0;


}

構造体の定義は、ゼロ要素をテストするファンクターを提供します。Thrustでは、カーネルはなく、デバイスコードを直接記述していないことに注意してください。それはすべて舞台裏で起こります。そして、この質問をThrustのチュートリアルに変えないように、クイックスタートガイドに精通することを強くお勧めします。

コメントを確認した後、この修正バージョンのコードはcuda4.0の問題を回避できると思います。

#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <stdio.h>
#define SIZE 5

  // Unary predicate for thrust::copy_if: selects the elements to keep.
  // Callable from both host and device code.
  struct is_not_zero
  {
    // Returns true when x is non-zero. Const-qualified so the predicate can
    // be invoked through a const object or const reference.
    __host__ __device__
    bool operator()(const int x) const
    {
      return (x != 0);
    }
  };



// Same compaction as a raw-pointer version would do, but using Thrust
// containers: builds a device_vector from a host array, compacts its
// non-zero elements with thrust::copy_if, and prints the kept elements.
int main(){

  int V[] = {1, 2, 0, 0, 5};
  int R[] = {0, 0, 0, 0, 0};

  thrust::host_vector<int> h_V(V, V+SIZE);
  thrust::device_vector<int> d_V = h_V;
  // Output is sized for the worst case (every element kept) and zero-filled.
  thrust::device_vector<int> d_R(SIZE, 0);

  // copy_if returns an iterator one past the last element it wrote; the
  // distance from d_R.begin() is the number of kept elements.
  thrust::device_vector<int>::iterator d_R_end =
      thrust::copy_if(d_V.begin(), d_V.end(), d_R.begin(), is_not_zero());
  const int count = (int)(d_R_end - d_R.begin());

  thrust::host_vector<int> h_R = d_R;

  thrust::copy(h_R.begin(), h_R.end(), R);

  // Print only the compacted prefix rather than a hard-coded element count.
  for (int i = 0; i<count; i++)
    printf("R[%d]: %d\n", i, R[i]);

  return 0;


}

cuda - CUDA Cを使用してスパース配列をすばやく圧縮するにはどうすればよいですか？

概要

コンテクスト

1 に答える 1

Related

Reference