matrix - 行列の断片的な処理-CUDA

Question

さて、処理したい（N x N）行列があるとしましょう。このマトリックスは私のコンピューターにとっては非常に大きく、一度にデバイスに送信しようとすると、「メモリ不足エラー」が発生します。

では、マトリックスのセクションをデバイスに送信する方法はありますか？私がそれを行うために見ることができる1つの方法は、ホスト上のマトリックスの一部をコピーし、次にこれらの管理可能なコピーされた部分をホストからデバイスに送信し、最後にそれらを元に戻すことです。

これは私が試したものですが、forループのcudaMemcpyはエラーコード11、「無効な引数」を返します。

// Host matrix is h_N x h_N floats; h_size_m is the byte size of ONE row.
int h_N = 10000;
size_t h_size_m = h_N*sizeof(float);
// NOTE(review): h_size_m*h_size_m == h_N*h_N*sizeof(float)*sizeof(float),
// i.e. sizeof(float) times more bytes than h_N*h_N floats occupy.
h_A  = (float*)malloc(h_size_m*h_size_m);

// Each chunk sent to the device is meant to hold d_N x d_N floats;
// d_size_m is the byte size of one row of such a chunk.
int d_N = 2500;
size_t d_size_m = d_N*sizeof(float);

InitializeMatrices(h_N);

int i;
// Number of d_N*d_N-element chunks contained in the h_N*h_N-element matrix.
int iterations = (h_N*h_N)/(d_N*d_N);

for( i = 0; i < iterations; i++ ) 
{
    // Start of the i-th chunk: the pointer advances by d_N*d_N floats per iteration.
    float* h_array_ref = h_A+(i*d_N*d_N);
    // NOTE(review): the byte count d_size_m*d_size_m is d_N*d_N*sizeof(float)*sizeof(float),
    // which is sizeof(float) times larger than the d_N*d_N floats of one chunk.
    // No cudaMalloc for d_A is visible in this snippet — TODO confirm it is allocated
    // elsewhere with at least this many bytes.
    cudasafe( cudaMemcpy(d_A, h_array_ref, d_size_m*d_size_m, cudaMemcpyHostToDevice), "cudaMemcpy");
    // NOTE(review): d_A is freed inside the loop and not re-allocated here, so every
    // iteration after the first copies into an already-freed pointer — presumably
    // related to the reported "invalid argument" error; verify.
    cudasafe( cudaFree(d_A), "cudaFree(d_A)" );
}

上記のコードで達成しようとしているのは、次のようなことです。マトリックス全体をデバイスに送信する代わりに、マトリックス内のある位置へのポインターを渡し、デバイス上では作業に必要な分だけのメモリを確保します。ループの次の反復ではマトリックス内でポインターを先に進め、以降も同様に繰り返します。

score 4 · Accepted Answer

これを実行できるだけでなく（問題がこの方法でサブ配列に簡単に分解されると仮定して）、パフォーマンスのために実行すると非常に便利な場合があります。説明した基本的なアプローチが機能するようになったら、非同期メモリコピーとダブルバッファリングの使用を開始して、メモリ転送時間の一部を、すでにカードにあるものの計算に費やした時間とオーバーラップさせることができます。

しかし、まずは単純なものを動作させましょう。以下は1次元の例です（ベクトルにスカラーを乗算し、別のスカラーを加算します）が、線形化された2次元配列を使う場合も同じです。重要な部分は次のとおりです。

/* Allocate the two device buffers once, sized for one full batch, and reuse
 * them for every batch instead of allocating/freeing inside the loop. */
CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
tick(&gputimer);

int nbatches = 0;
for (int nstart=0; nstart < n; nstart+=batchsize) {

    /* The final batch is shorter when batchsize does not divide n. */
    int size=batchsize;
    if ((nstart + batchsize) > n) size = n - nstart;

    /* Upload only the current slice of the host input. */
    CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );

    /* Ceil-divide so that nblocks*blocksize threads cover all `size` elements. */
    blocksize = (size+nblocks-1)/nblocks;
    cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);

    /* A kernel launch reports no error by itself: an invalid configuration
     * (e.g. a blocksize above the device limit) would otherwise go unnoticed
     * and the copy below would return stale data. The status is stored in a
     * variable first because cudaGetLastError() clears the error it returns. */
    cudaError_t launch_err = cudaGetLastError();
    CHK_CUDA( launch_err );

    /* Blocking copy of this batch's results into the matching host slice. */
    CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );

    nbatches++;
}
gputime = tock(&gputimer);

/* Release the device buffers once, after all batches are done. */
CHK_CUDA( cudaFree(xd) );
CHK_CUDA( cudaFree(yd) );

開始時にバッファを割り当て、すべての処理が終わるまでループします。各反復では、ホストからデバイスへのコピーを実行し、カーネルを起動してから、結果をホストへコピーして戻します。最後にバッファを解放します。

完全なコードは

#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <cuda.h>
#include <sys/time.h>
#include <math.h>

/*
 * Abort with a message if a CUDA runtime call did not return cudaSuccess.
 *
 * The argument is evaluated exactly once and its result is kept in a local,
 * so a failing call is not issued a second time just to format the message.
 * The do { } while (0) wrapper makes the macro behave as a single statement,
 * so it can be followed by ';' inside an unbraced if/else.
 */
#define CHK_CUDA(e) \
    do { \
        const cudaError_t chk_cuda_err_ = (e); \
        if (chk_cuda_err_ != cudaSuccess) { \
            fprintf(stderr,"Error: %s\n", cudaGetErrorString(chk_cuda_err_)); \
            exit(-1); \
        } \
    } while (0)

/*
 * Kernel: yd[i] = a*xd[i] + b for every i in [0, n).
 *
 * Each thread handles at most one element, selected by its flat index in a
 * 1D launch. Threads whose index is >= n do nothing, so the launch must
 * provide at least n threads in total for the whole input to be processed.
 */
__global__ void cuda_saxpb(const float *xd, const float a, const float b,
                           float *yd, const int n) {

    const int idx = blockIdx.x*blockDim.x + threadIdx.x;

    /* Guard the tail: the launch may contain more threads than elements. */
    if (idx >= n) return;

    yd[idx] = a*xd[idx]+b;
}

/*
 * Host reference implementation: y[k] = a*x[k] + b for k in [0, n).
 * Writes nothing when n <= 0.
 */
void cpu_saxpb(const float *x, float a, float b, float *y, int n) {

    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        y[k] = a*xk+b;
    }
}

/* Forward declarations; the definitions follow main(). */

/* Command-line parsing: fills the run parameters through the pointers. */
int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b);
/* Wall-clock timing helpers: tick() records a start time, tock() returns
 * the seconds elapsed since that start time. */
void tick(struct timeval *timer);
double tock(struct timeval *timer);

/*
 * Computes y = a*x + b once on the CPU and once on the GPU, where the GPU
 * version processes the vector in batches of `batchsize` elements so that
 * only one batch has to reside in device memory at a time. Prints both
 * timings and the accumulated absolute difference between the two results.
 *
 * Returns 0 on success (and after printing help), 1 if a host allocation
 * fails. CUDA errors terminate the program through CHK_CUDA.
 */
int main(int argc, char **argv) {
    int n=1000;
    int nblocks=10;
    int batchsize=100;
    float a = 5.;
    float b = -1.;
    int err;
    float *x, *y, *ycuda;
    float *xd, *yd;
    double abserr;
    int blocksize;
    int i;
    struct timeval cputimer;
    struct timeval gputimer;
    double cputime, gputime;

    err = get_options(argc, argv, &n, &batchsize, &nblocks, &a, &b);
    /* Non-zero means help was printed: stop before emitting any diagnostics. */
    if (err) return 0;

    if (batchsize > n) {
        fprintf(stderr, "Resetting batchsize to size of vector, %d\n", n);
        batchsize = n;
    }

    x = (float *)malloc(n*sizeof(float));
    if (!x) return 1;

    y = (float *)malloc(n*sizeof(float));
    if (!y) {free(x); return 1;}

    ycuda = (float *)malloc(n*sizeof(float));
    if (!ycuda) {free(y); free(x); return 1;}

    /* malloc() does not initialise memory: give the input well-defined values
     * so that the CPU and GPU runs compare deterministic data rather than
     * whatever happened to be in the buffer. */
    for (i=0;i<n;i++) {
        x[i] = (float)i;
    }

    /* run CPU code */

    tick(&cputimer);
    cpu_saxpb(x, a, b, y, n);
    cputime = tock(&cputimer);

    /* run GPU code */

    /* only have to allocate once: both buffers hold one full batch */
    CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
    CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
    tick(&gputimer);

    int nbatches = 0;
    for (int nstart=0; nstart < n; nstart+=batchsize) {

        /* The final batch is shorter when batchsize does not divide n. */
        int size=batchsize;
        if ((nstart + batchsize) > n) size = n - nstart;

        CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );

        /* Ceil-divide so that nblocks*blocksize threads cover all `size` elements. */
        blocksize = (size+nblocks-1)/nblocks;
        cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);

        /* A kernel launch reports no error by itself: an invalid configuration
         * (e.g. a blocksize above the device limit) would otherwise go
         * unnoticed and stale data would be copied back below. The status is
         * stored first because cudaGetLastError() clears the error it returns. */
        cudaError_t launch_err = cudaGetLastError();
        CHK_CUDA( launch_err );

        /* Blocking copy of this batch's results into the matching host slice. */
        CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );

        nbatches++;
    }
    gputime = tock(&gputimer);

    CHK_CUDA( cudaFree(xd) );
    CHK_CUDA( cudaFree(yd) );

    /* Accumulate the element-wise absolute difference between both results. */
    abserr = 0.;
    for (i=0;i<n;i++) {
        abserr += fabs(ycuda[i] - y[i]);
    }

    printf("Y = a*X + b, problemsize = %d\n", n);
    printf("CPU time = %lg millisec.\n", cputime*1000.);
    printf("GPU time = %lg millisec (done with %d batches of %d).\n",
                  gputime*1000., nbatches, batchsize);
    printf("CUDA and CPU results differ by %lf\n", abserr);

    free(x);
    free(y);
    free(ycuda);
    return 0;
}


/*
 * Parse the command line with getopt_long and store accepted values through
 * the output pointers.
 *
 *   n  : number of values     (--nvals / -n, accepted range 1..500000)
 *   s  : batch size           (--batchsize / -s, accepted range 1..50000)
 *   nb : number of blocks     (--nblocks / -B, accepted range 1..1000 and <= *n)
 *   a,b: scalar parameters    (--a / -a, --b / -b)
 *
 * A rejected value leaves the corresponding output untouched and prints a
 * message to stderr that names the value which stays in use. The -B check
 * compares against *n as it is at that point of parsing, so it sees a -n
 * value only if -n came earlier on the command line.
 *
 * Returns 1 after printing the help text (--help / -h), 0 otherwise.
 */
int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b) {

  const struct option long_options[] = {
    {"nvals"     , required_argument, 0, 'n'},
    {"nblocks"   , required_argument, 0, 'B'},
    {"batchsize" , required_argument, 0, 's'},
    {"a", required_argument, 0, 'a'},
    {"b", required_argument, 0, 'b'},
    {"help",      no_argument, 0, 'h'},
    {0, 0, 0, 0}};

  /* getopt_long returns int. Storing it in a plain char makes the -1
   * end-of-options test fail wherever char is unsigned, looping forever. */
  int c;
  int option_index;
  int tempint;

  while ((c = getopt_long(argc, argv, "n:B:a:b:s:h", long_options, &option_index)) != -1) {

    switch(c) {
      case 'n': tempint = atoi(optarg);
          if (tempint < 1 || tempint > 500000) {
            fprintf(stderr,"%s: Cannot use number of points %s;\n  Using %d\n", argv[0], optarg, *n);
          } else {
            *n = tempint;
          }
          break;

      case 's': tempint = atoi(optarg);
          if (tempint < 1 || tempint > 50000) {
            fprintf(stderr,"%s: Cannot use batch size %s;\n  Using %d\n", argv[0], optarg, *s);
          } else {
            *s = tempint;
          }
          break;

      case 'B': tempint = atoi(optarg);
          if (tempint < 1 || tempint > 1000 || tempint > *n) {
            fprintf(stderr,"%s: Cannot use number of blocks %s;\n  Using %d\n", argv[0], optarg, *nb);
          } else {
            *nb = tempint;
          }
          break;

      case 'a': *a = (float)atof(optarg);
          break;

      case 'b': *b = (float)atof(optarg);
          break;

      case 'h':
          puts("Calculates y[i] = a*x[i] + b on the GPU.");
          puts("Options: ");
          puts("    --nvals=N      (-n N): Set the number of values in y,x.");
          puts("    --batchsize=N  (-s N): Set the number of values to transfer at a time.");
          puts("    --nblocks=N    (-B N): Set the number of blocks used.");
          puts("    --a=X          (-a X): Set the parameter a.");
          puts("    --b=X          (-b X): Set the parameter b.");
          puts("");
          return +1;

      default:
          /* Unrecognised option or missing argument: getopt_long has already
           * printed a diagnostic; keep parsing the remaining arguments. */
          break;
    }
  }

  return 0;
}

/* Record the current wall-clock time in *timer as the start of an interval. */
void tick(struct timeval *timer) {
    (void) gettimeofday(timer, NULL);
}

/*
 * Return the wall-clock time in seconds that has elapsed since *timer,
 * which is expected to hold a time previously obtained with gettimeofday().
 */
double tock(struct timeval *timer) {
    struct timeval now;
    gettimeofday(&now, NULL);

    /* The microsecond difference may be negative; adding the whole-second
     * difference yields the correct total. */
    const double frac_seconds = (now.tv_usec-timer->tv_usec)/1.0e6;
    return frac_seconds + (now.tv_sec - timer->tv_sec);
}

これを実行すると、次のようになります。

$  ./batched-saxpb --nvals=10240 --batchsize=10240 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.072 millisec.
GPU time = 0.117 millisec (done with 1 batches of 10240).
CUDA and CPU results differ by 0.000000

$ ./batched-saxpb --nvals=10240 --batchsize=5120 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.066 millisec.
GPU time = 0.133 millisec (done with 2 batches of 5120).
CUDA and CPU results differ by 0.000000

$ ./batched-saxpb --nvals=10240 --batchsize=2560 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.067 millisec.
GPU time = 0.167 millisec (done with 4 batches of 2560).
CUDA and CPU results differ by 0.000000

この場合、GPU時間は長くなりますが（より多くのメモリコピーを実行しています）、答えは同じままです。

編集：このコードの元のバージョンには、タイミングの目的でカーネルの複数の反復を実行するオプションがありましたが、このコンテキストでは不必要に混乱するため、削除されました。

matrix - 行列の断片的な処理-CUDA

1 に答える 1

Related

Reference