c++ - std::inner_product with omp

Question

C++ の std::inner_product() を omp.h ライブラリ (OpenMP) で並列化することは可能ですか? 残念ながら、gcc の新しいバージョンで利用できる __gnu_parallel::inner_product() は使用できません。inner_product を独自に実装して並列化できることはわかっていますが、標準的な手段を使用したいと考えています。

score 2 · Accepted Answer

短い答え：いいえ。

inner_product のようなアルゴリズムの要点は、ループを抽象化してユーザーから隠すことです。しかし、アルゴリズムを並列化するには、そのループ自体を並列化する必要があります (#pragma omp parallel for、または parallel sections を介して)。どちらの方法も本質的にコード構造内のループに結び付いているため、ループが簡単に並列化できる場合でも (おそらくそうでしょう)、並列化を適用するには関数の内部に OpenMP プラグマを配置する必要があります。

score 2 · Accepted Answer

Hristo のコメントを受けて補足すると、配列をスレッド間で分割し、各サブ配列に対して inner_product を呼び出してから、何らかのリダクション操作で部分結果を結合することで、これを実現できます。

#include <iostream>
#include <numeric>
#include <omp.h>

#include <sys/time.h>
void tick(struct timeval *t);
double tock(struct timeval *t);

/*
 * Benchmark comparing two ways of computing the inner product of two arrays
 * of nelements long ints (a[i] = i+1, b[i] = 1):
 *   1. the arrays are split into one contiguous slice per OpenMP thread, each
 *      thread calls std::inner_product on its slice, and the partial results
 *      are combined with an OpenMP sum reduction;
 *   2. a single std::inner_product call over the whole arrays.
 * For each variant the sum and the elapsed time reported by tick()/tock() are
 * printed, followed by the closed-form value nelements*(nelements+1)/2.
 *
 * Returns 0.
 */
int main (int argc, char **argv) {
  const long int nelements=1000000;
  // new[] leaves the elements uninitialized; their first write is the
  // parallel initialization loop below.
  long int *a = new long int[nelements];
  long int *b = new long int[nelements];
  long int sum = 0;
  struct timeval t;
  double elapsed;

  #pragma omp parallel for
  for (long int i=0; i<nelements; i++) {
        a[i] = i+1;
        b[i] = 1;
  }

  tick(&t);
  // nelements is const-qualified. Listing it as firstprivate keeps
  // default(none) acceptable both to compilers that treat const variables as
  // predetermined shared (OpenMP <= 3.1 rules) and to those that require
  // every referenced variable to be listed (OpenMP >= 4.0 rules).
  #pragma omp parallel default(none) reduction(+:sum) shared(a,b) firstprivate(nelements)
  {
       // Ask for the team size inside the region that does the work, so the
       // partitioning always matches the team that actually executes it.
       const long int nthreads = omp_get_num_threads();
       const long int tid = omp_get_thread_num();
       const long int nitems = nelements/nthreads;
       const long int start = tid*nitems;
       // The last thread also takes the remainder of the integer division.
       const long int end = (tid == nthreads-1) ? nelements : start + nitems;

       sum += std::inner_product(a+start, a+end, b+start, 0L);
  }
  elapsed = tock(&t);

  std::cout << "using omp: sum = " << sum << " time = " << elapsed << std::endl;

  delete [] a;
  delete [] b;

  // Fresh allocations, initialized serially, for the single-threaded run.
  a = new long int[nelements];
  b = new long int[nelements];
  sum = 0;

  for (long int i=0; i<nelements; i++) {
        a[i] = i+1;
        b[i] = 1;
  }
  tick(&t);
  sum = std::inner_product( a, a+nelements, b, 0L);
  elapsed = tock(&t);

  std::cout << "single threaded: sum = " << sum << " time = " << elapsed << std::endl;

  // With b[i] == 1 the inner product is 1 + 2 + ... + nelements.
  std::cout << "correct answer: sum = " << (nelements)*(nelements+1)/2 << std::endl ;

  delete [] a;
  delete [] b;

  return 0;
}

/* Starts a timing interval by storing the current time of day in *t. */
void tick(struct timeval *t) {
    // No timezone information is requested, hence the null second argument.
    gettimeofday(t, nullptr);
}

/*
 * Returns the time, in seconds, elapsed between the instant stored in *t and
 * the moment of this call. The value is negative when *t lies in the future.
 */
double tock(struct timeval *t) {
    struct timeval current;
    gettimeofday(&current, nullptr);

    // Seconds and microseconds are differenced separately and then combined.
    const double whole_seconds = static_cast<double>(current.tv_sec - t->tv_sec);
    const double microseconds  = static_cast<double>(current.tv_usec - t->tv_usec);
    return whole_seconds + (microseconds/1000000.);
}

これを実行すると、予想よりも高速化されます。

$ for NT in 1 2 4 8; do export OMP_NUM_THREADS=${NT}; echo; echo "NTHREADS=${NT}";./inner; done

NTHREADS=1
using omp: sum = 500000500000 time = 0.004675
single threaded: sum = 500000500000 time = 0.004765
correct answer: sum = 500000500000

NTHREADS=2
using omp: sum = 500000500000 time = 0.002317
single threaded: sum = 500000500000 time = 0.004773
correct answer: sum = 500000500000

NTHREADS=4
using omp: sum = 500000500000 time = 0.001205
single threaded: sum = 500000500000 time = 0.004758
correct answer: sum = 500000500000

NTHREADS=8
using omp: sum = 500000500000 time = 0.000617
single threaded: sum = 500000500000 time = 0.004784
correct answer: sum = 500000500000

c++ - std::inner_product with omp

2 件の回答

Related

Reference