sorting - CUDA スラストと sort_by_key

Question

要素の配列 A (double) を並べ替えて、その配列 A のキーの配列 B を返す CUDA の並べ替えアルゴリズムを探していますsort_by_key。Thrust ライブラリの関数は知っていますが、要素 A の配列を変わらないまま。私に何ができる？

私のコードは次のとおりです。

void sortCUDA(double V[], int P[], int N) {

        real_t *Vcpy = (double*) malloc(N*sizeof(double));
        memcpy(Vcpy,V,N*sizeof(double));

        thrust::sort_by_key(V, V + N, P);
        free(Vcpy);
}

スラストアルゴリズムを、シーケンシャル CPU にある他のアルゴリズムと比較しています

N               mergesort       sortCUDA
113             0.000008        0.000010
226             0.000018        0.000016
452             0.000036        0.000020
905             0.000061        0.000034
1810            0.000135        0.000071
3621            0.000297        0.000156
7242            0.000917        0.000338
14484           0.001421        0.000853
28968           0.003069        0.001931
57937           0.006666        0.003939
115874          0.014435        0.008025
231749          0.031059        0.016718
463499          0.067407        0.039848
926999          0.148170        0.118003
1853998         0.329005        0.260837
3707996         0.731768        0.544357
7415992         1.638445        1.073755
14831984        3.668039        2.150179
115035495       39.276560       19.812200
230070990       87.750377       39.762915
460141980       200.940501      74.605219

推力性能は悪くないのですが、OMPを使えば簡単にCPU時間稼げると思います

これはmemcpyのせいだと思います

解決：

void thrustSort(double V[], int P[], int N)
{
        thrust::device_vector<int> d_P(N);
        thrust::device_vector<double> d_V(V, V + N);
        thrust::sequence(d_P.begin(), d_P.end());

        thrust::sort_by_key(d_V.begin(), d_V.end(), d_P.begin());

        thrust::copy(d_P.begin(),d_P.end(),P);
}

ここで、V はソートする my double 値です

score 2 · Accepted Answer

比較演算子を変更して、値ではなくキーを並べ替えることができます。@Robert Crovella は、生のデバイスポインタをホストから割り当てることができないことを正しく指摘しました。変更されたアルゴリズムは次のとおりです。

struct cmp : public binary_function<int,int,bool>
{
  cmp(const double *ptr) : rawA(ptr) { }

  __host__ __device__ bool operator()(const int i, const int j) const 
  {return rawA[i] > rawA[j];}

   const double *rawA; // an array in global mem
}; 

void sortkeys(double *A, int n) {
  // move data to the gpu
  thrust::device_vector<double> devA(A, A + n);
  double *rawA = thrust::raw_pointer_cast(devA.data());

  thrust::device_vector<int> B(n);
  // initialize keys
  thrust::sequence(B.begin(), B.end());
  thrust::sort(B.begin(), B.end(), cmp(rawA));
  // B now contains the sorted keys
 }

そして、これがarrayfireの代替です。arrayfire ソリューションは 2 つの追加の配列を使用するため、どちらがより効率的かはわかりませんが、

void sortkeys(double *A, int n) {
   af::array devA(n, A, af::afHost);
   af::array vals, indices;
   // sort and populate vals/indices arrays
   af::sort(vals, indices, devA);
   std::cout << devA << "\n" << indices << "\n";
}

score 0 · Accepted Answer

@asmによって提供された回答（私はそれを機能させることができませんでした）に基づいて、このコードは私にとっては機能しているようで、キーのみをソートします。ただし、キーが（double）値に対応するシーケンス0、1、2、3、4...にある場合に限定されると思います。これは「インデックス値」ソートであるため、おそらくインデックス付きコピーを実行することにより、キーの任意のシーケンスの場合に拡張できます。ただし、インデックスシーケンスを生成してから元のキーを再配置するプロセスが、元の値データを新しいベクトルにコピーするよりも速くなるかどうかはわかりません（任意のキーの場合）。

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>

using namespace std;

__device__  double *rawA; // an array in global mem

struct cmp : public binary_function<int, int, bool>
{
  __host__ __device__  bool operator()(const int i, const int j) const
  {return ( rawA[i] < rawA[j]);}
};

void sortkeys(double *A, int n) {
  // move data to the gpu
  thrust::device_vector<double> devA(A, A + n);
//  rawA = thrust::raw_pointer_cast(&(devA[0]));
  double *test = raw_pointer_cast(devA.data());
  cudaMemcpyToSymbol(rawA, &test, sizeof(double *));

  thrust::device_vector<int> B(n);
  // initialize keys
  thrust::sequence(B.begin(), B.end());
  thrust::sort(B.begin(), B.end(), cmp());
  // B now contains the sorted keys
  thrust::host_vector<int> hostB = B;
  for (int i=0; i<hostB.size(); i++)
    std::cout << hostB[i] << " ";
  std::cout<<std::endl;
  for (int i=0; i<hostB.size(); i++)
    std::cout << A[hostB[i]] << " ";
  std::cout<<std::endl;
 }


int main(){

  double C[] = {0.7, 0.3, 0.4, 0.2, 0.6, 1.2, -0.5, 0.5, 0.0, 10.0};
  sortkeys(C, 9);
  std::cout << std::endl;
  return 0;
}

score 0 · Accepted Answer

この配列の大きさは? 速度の点で最も効率的な方法は、メモリが利用可能な場合、ソートする前に元の配列を複製することです。

sorting - CUDA スラストと sort_by_key

3 に答える 3

Related

Reference