cuda - サブマトリックス計算

Question

行列の 2 つのサブ行列間のペアごとの距離を計算したいと考えています。たとえば、行列 A (MxN) と、その行列 B1 (mxn) と B2 (kxt) の 2 つのブロックがあります。より具体的には、B2 の他のすべての要素からの B1(1,1) 要素の距離を計算し、すべての B1 要素に対してこのプロセスを実行したいと考えています。より明確にするために、B1とB2は行列のコンパクトな部分ではない可能性があり、基本的に私が知っている情報は、行列A上のB1とB2の要素の座標です。ここに例があります。

for(int i = 0; i < nRowsCoordsB1 ; i++ ){//nRows of B1
  for(int j = 0; j < nRowsCoordsB2 ; j++ ){//nRows of B2

    //CoordsofB1 is a nRowsB1x2 matrix that contains the element coordinates of the B1 sub matrix

    a_x = CoordsofB1[ i ]; //take the x coord of the corresponding row i
    a_y = CoordsofB1[ i + nRowsCoordsB1 ]; //take the y coord of the corresponding row

    b_x = CoordsofB2[ j ];
    b_y = CoordsofB2[ j + nRowsCoordsB2 ];

    int element1 = A[ a_x + a_y*nRowsofA ];
    int element2 = A[ b_x + b_y*nRowsofA ] ;
    sum +=abs( element1 - element2 ) ;

  }
}
*Output = sum/(float)(numberOfElementsofB1*numberOfElementsofB2);

今、私はCUDAで計算をスピードアップしたいと思っています:)私はCudaの視点が初めてなので、少し複雑でした。これで、Matrix レベルでブロックスレッドを割り当てるロジックは理解できたと思いますが、ここでは、CoordsofB1 と CoordsofB2 というサイズの異なる 2 つの異なるマトリックス部分があるため、それらにアクセスする方法について少し混乱します。座標を作成し、穴マトリックスで使用します。制約を使用して A で作業する必要があると考えましたが、明確な考えはありませんでした。

また、for ループの最後で合計が数量で除算されるという事実は、cuda 変換コードで誰を組み合わせるかについて私を混乱させます。

あらゆる提案、スニペット、例、参照は素晴らしいでしょう。

PS: 列優先の順序を使用する理由は、コードが matlab で評価されるためです。

更新:最大のサブ行列 B1 または B2 のサイズに等しいサイズのスレッドブロックを割り当て、正しい条件を使用してそれらを操作できますか? どうすればいいのかわからなかったので、最後の行にコメントします。コメントはありますか？

int r = blockDim.x * blockIdx.x + threadIdx.x; // rows
if( r < nRowsCoordsB1 ){       

  a_x = CoordsofB1[ r ]; 
  a_y = CoordsofB1[ r + nRowsCoordsB1 ]; 
  if( r < nRowsCoordsB2 ;){

    b_x = CoordsofB2[ r ];
    b_y = CoordsofB2[ r + nRowsCoordsB2 ];
    int element1 = A[ a_x + a_y*nRowsofA ];
    int element2 = A[ b_x + b_y*nRowsofA ] ;
    sum +=abs( element1 - element2 ) ;

  }
}
//*Output = sum/(float)(numberOfElementsofB1*numberOfElementsofB2);

ここにスケッチここに画像の説明を入力

B1 と B2 内の各要素の座標があり、値の差を計算したい

[ (B1(1,1) - B2(1,1)) + (B1(1,1) - B2(1,2)) + ... + (B1(1,1) - B2(:,: )) ] +

[ (B1(1,2) - B2(1,1)) + (B1(1,2) - B2(1,2)) + ... + (B1(1,2) - B2(:,: )) ] +

[ (B1(:,:) - B2(1,1)) + (B1(:,:) - B2(1,2)) + ... + (B1(:,:) - B2(:,:) )) ] .

score 1 · Accepted Answer

おそらく、2D スレッドグリッドを使用した以下のソリューションは、エリックがスラストを使用して問題をさらに洞察する代わりになる可能性があります。

以下のコードスニペットは、概念のみを説明するためのものです。これはテストされていないコードです。

2D グリッド

との要素間に関係するすべての絶対値の差を含むpartial_distancesサイズの行列を定義します。あなたが持っているファイルでnRowsCoordsB1 X nRowsCoordsB2B1B2main

__global__ void distance_calculator(int* partial_distances, int* CoordsofB1, int* CoordsofB2, int nRowsCoordsB1, int nRowsCoordsB2) {

    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;

    int a_x = CoordsofB1[i]; 
    int a_y = CoordsofB1[i+nRowsCoordsB1];

    int b_x = CoordsofB2[j];
    int b_y = CoordsofB2[j+nRowsCoordsB2];

    partial_distances[j*nRowsCoordsB1+i] = abs(A[a_x+a_y*nRowsofA]-A[b_x+b_y*nRowsofA]);

}

int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }

#define BLOCKSIZE 32

int main() {

    int* partial_distances; cudaMalloc((void**)&partial_distances,nRowsCoordsB1*nRowsCoordsB2*sizeof(int));

    dim3 BlocSize(BLOCKSIZE,BLOCKSIZE);
    dim3 GridSize;
    GridSize.x = iDivUp(nRowsCoordsB1,BLOCKSIZE);
    GridSize.y = iDivUp(nRowsCoordsB2,BLOCKSIZE);

    distance_calculator<<<GridSize,BlockSize>>>(partial_distances,CoordsofB1,CoordsofB2,nRowsCoordsB1,nRowsCoordsB2);

   REDUCTION_STEP

}

はREDUCTION_STEP、の特定の要素に対応するすべての要素を合計する 1D リダクションカーネルへの反復呼び出しとして実装できますB1。

別の方法として、動的並列処理を使用してリダクションルーチンをカーネル内で直接呼び出すこともできますが、これは使用しているカードには適していません。

score 1 · Accepted Answer

私がそれを正しく理解していれば、あなたがやろうとしていることは次のmatlabコードで書くことができます。

rep_B1 = repmat(B1(:),  1, length(B2(:)) );
rep_B2 = repmat(B2(:)', length(B1(:), 1) );
absdiff_B1B2 = abs(rep_B1 - repB2);
Result = mean( absdiff_B1B2(:) );

リダクションの前にabsdiff_B1B2、サイズlength(B1(:))xの行列length(B2(:))、つまりm*nxがあることに気付くでしょうk*t(上記のコードを 1 つの CUDA カーネルに実装した場合、この行列は決してグローバル mem に保存されません)。このマトリックスを 16x16 サブマトリックスに分割し、サブマトリックスごとに 1 つの 256 スレッドブロックを使用してワークロードを GPU に分解できます。

一方、推力を使用して生活を楽にすることもできます。

アップデート

B1とB2はの部分行列であるためA、最初にを使用cudaMemcpy2D()してそれらを線形空間にコピーし、次にカーネルを使用して行列を構築し、縮小することができabsdiff_B1B2ます。

最終的な正規化操作 (コードの最後の行) については、CPU で行うことができます。

absdiff_B1B2スラストを使用して、単一のカーネルで行列を構築および削減する方法を示すコードを次に示します。ただし、構築手順は共有メモリを使用せず、最適化されていないことがわかります。共有メモリを使用してさらに最適化すると、パフォーマンスが向上します。

#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>

template<typename T>
struct abs_diff
{
    inline __host__ __device__ T operator()(const T& x, const T& y)
    {
        return abs(x - y);
    }
};

int main()
{
    using namespace thrust::placeholders;

    const int m = 98;
    const int n = 87;
    int k = 76;
    int t = 65;
    double result;

    thrust::device_vector<double> B1(m * n, 1.0);
    thrust::device_vector<double> B2(k * t, 2.0);

    result = thrust::inner_product(
            thrust::make_permutation_iterator(
                    B1.begin(),
                    thrust::make_transform_iterator(
                            thrust::make_counting_iterator(0),
                            _1 % (m * n))),
            thrust::make_permutation_iterator(
                    B1.begin(),
                    thrust::make_transform_iterator(
                            thrust::make_counting_iterator(0),
                            _1 % (m * n))) + (m * n * k * t),
            thrust::make_permutation_iterator(
                    B2.begin(),
                    thrust::make_transform_iterator(
                            thrust::make_counting_iterator(0),
                            _1 / (m * n))),
            0.0,
            thrust::plus<double>(),
            abs_diff<double>());
    result /= m * n * k * t;

    std::cout << result << std::endl;

    return 0;
}

cuda - サブマトリックス計算

2 に答える 2

アップデート

Related

Reference