cuda - CUDA での行列の行の合計 (行優先または列優先のいずれかで格納)

Question

CUDA で行列の行を合計する問題に取り組んでいます。次の例を挙げています。

次の20 * 4配列があるとします。

2 次元配列を (行優先または列優先のいずれかで) 1 次元配列にフラット化した後、各スレッドを別の行に割り当て、その行のコストを計算する必要があります。

たとえば、
スレッド 1 は次のコストを計算する必要があり、1 2 3 4
スレッド 2 は次のコストを計算する必要があります。4 1 2 3

どうすればCUDAでそれを行うことができますか?

返信ありがとうございます

score 1 · Accepted Answer

#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256

__global__ void mykernel(int *costdata, int rows, int cols, int *results){
  int tidx = threadIdx.x + blockDim.x*blockIdx.x;
  if (tidx < rows){
    int mycost = 0;
    for (int i = 0; i < cols; i++)
       mycost += costdata[(tidx*cols)+i];
    results[tidx] = mycost;
    }
  }

int main(){
  //define and initialize host and device storage for cost and results
  int *d_costdata, *h_costdata, *d_results, *h_results;
  h_results = (int *)malloc(MROWS*sizeof(int));
  h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = rand()%4;
  cudaMalloc((void **)&d_results, MROWS*sizeof(int));
  cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int));
  //copy cost data from host to device
  cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice);
  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
  // copy results back from device to host
  cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost);
  for (int i=0; i<MROWS; i++){
    int loc_cost = 0;
    for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
    printf("cost[%d]: host= %d, device = %d\n", i, loc_cost, h_results[i]);
    }
  }

これは、各行の「コスト」が各行の要素の合計であることを前提としています。別の「コスト」関数がある場合は、それに応じてカーネル for ループのアクティビティを変更できます。これは、C スタイルの行優先のデータストレージ (1 2 3 4 4 1 2 3 3 4 1 2 など) も想定しています。

代わりに列優先のストレージ (1 4 3 など) を使用すると、データの読み取りを完全に結合できるため、パフォーマンスをわずかに向上させることができます。次に、カーネルコードは次のようになります。

for (int i = 0; i < cols; i++)
  mycost += costdata[(i*rows)+tidx];

また、すべての CUDA API 呼び出しとカーネル呼び出しで適切な cuda エラーチェックを使用する必要があります。

編集:以下のコメントで説明されているように、行優先のストレージの場合、状況によっては、基本型ではなく 16 バイトの量をロードすることを選択することで、メモリ効率が向上する場合があります。以下は、このアイデアを任意の次元と (多かれ少なかれ) 任意の基本型に実装する修正版です。

#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>

#define MROWS 1742
#define NCOLS 801
#define nTPB 256

typedef double mytype;

__host__ int sizetype(){
  int size = 0;
  if ((typeid(mytype) == typeid(float)) || (typeid(mytype) == typeid(int)) || (typeid(mytype) == typeid(unsigned int)))
      size = 4;
  else if (typeid(mytype) == typeid(double))
      size = 8;
  else if ((typeid(mytype) == typeid(unsigned char)) || (typeid(mytype) == typeid(char)))
      size = 1;
  return size;
  }


template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
  int chunk = 16/size;  // assumes size is a factor of 16
  int tidx = threadIdx.x + blockDim.x*blockIdx.x;
  if (tidx < rows){
    T *myrowptr = (T *)(((unsigned char *)costdata) + tidx*pitch);
    T mycost = (T)0;
    int count = 0;
    while (count < cols){
      if ((cols-count)>=chunk){
      // read 16 bytes
        int4 temp = *((int4 *)(myrowptr + count));
        int bcount = 16;
        int j = 0;
        while (bcount > 0){
          mycost += *(((T *)(&temp)) + j++);
          bcount -= size;
          count++;}
        }
      else {
      // read one quantity at a time
        for (; count < cols; count++)
          mycost += myrowptr[count];
        }
    results[tidx] = mycost;
    }
  }
}

int main(){
  int typesize = sizetype();
  if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
  //define and initialize host and device storage for cost and results
  mytype *d_costdata, *h_costdata, *d_results, *h_results;
  h_results = (mytype *)malloc(MROWS*sizeof(mytype));
  h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = (mytype)(rand()%4);
  size_t pitch = 0;
  cudaMalloc((void **)&d_results, MROWS*sizeof(mytype));
  cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS);
  //copy cost data from host to device
  cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype),  MROWS, cudaMemcpyHostToDevice);

  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
  // copy results back from device to host
  cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost);
  for (int i=0; i<MROWS; i++){
    mytype loc_cost = (mytype)0;
    for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
    if ((i < 10) && (typesize > 1))
      std::cout <<"cost[" << i << "]: host= " << loc_cost << ", device = " << h_results[i] << std::endl;
    if (loc_cost != h_results[i]){ std::cout << "mismatch at index" << i << "should be:" << loc_cost << "was:" << h_results[i] << std::endl; return 1; }
    }
  std::cout << "Results are correct!" << std::endl;
  }

cuda - CUDA での行列の行の合計 (行優先または列優先のいずれかで格納)

1 に答える 1

Related

Reference