c - OpenMPとreduction（）

Question

単純に3つの関数があります。1つは制御関数で、次の2つの関数はOpenMPを使用して少し異なる方法で実行されます。しかし、関数thread1はthread2とcontrolとは別のスコアを与え、その理由がわかりません。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

float function(float x){
    return pow(x,pow(x,sin(x)));
}



 float integrate(float begin, float end, int count){
    float score = 0 , width = (end-begin)/(1.0*count), i=begin, y1, y2;


    for(i = 0; i<count; i++){
            score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
 }
    return score; 
 }




 float thread1(float begin, float end, int count){
    float score = 0 , width = (end-begin)/(1.0*count), y1, y2;

    int i;
    #pragma omp parallel for reduction(+:score) private(y1,i) shared(count)
    for(i = 0; i<count; i++){
        y1 = ((function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0);
       score = score + y1;
    }

    return score;
 }


 float thread2(float begin, float end, int count){
    float score = 0 , width = (end-begin)/(1.0*count), y1, y2;

    int i;
    float * tab = (float*)malloc(count * sizeof(float));

    #pragma omp parallel for
    for(i = 0; i<count; i++){
            tab[i] = (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    for(i=0; i<count; i++)
            score += tab[i];
    return score;
  }


  unsigned long long int rdtsc(void){
     unsigned long long int x;
     unsigned a, d;

    __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));

    return ((unsigned long long)a) | (((unsigned long long)d) << 32);
   }






   int main(int argc, char** argv){
        unsigned long long counter = 0;


    //test
       counter = rdtsc();
       printf("control: %f \n ",integrate (atof(argv[1]), atof(argv[2]), atoi(argv[3])));
       printf("control count: %lld \n",rdtsc()-counter);
        counter = rdtsc();
       printf("thread1: %f \n ",thread1(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
       printf("thread1 count: %lld \n",rdtsc()-counter);
        counter = rdtsc();
       printf("thread2: %f \n ",thread2(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
       printf("thread2 count: %lld \n",rdtsc()-counter);

       return 0;
      }

ここに簡単な答えがあります：

 gcc -fopenmp zad2.c -o zad -pg -lm
 env OMP_NUM_THREADS=2 ./zad 3 13 100000
 control: 5407308.500000 
 control count: 138308058 
 thread1: 5407494.000000 
 thread1 count: 96525618 
 thread2: 5407308.500000 
 thread2 count: 104770859

アップデート：

わかりました。これをより迅速に実行し、期間の値を2回カウントしないようにしました。

double thread3(double begin, double end, int count){
     double score = 0 , width = (end-begin)/(1.0*count), yp, yk;    
     int i,j, k;

     #pragma omp parallel private (yp,yk) 
     {
       int thread_num = omp_get_num_threads();
       k = count / thread_num;

    #pragma omp for private(i) reduction(+:score) 
    for(i=0; i<thread_num; i++){
        yp = function(begin + i*k*width);
        yk = function(begin + (i*k+1)*width);
        score += (yp + yk) * width / 2.0;
        for(j=i*k +1; j<(i+1)*k; j++){
            yp = yk;
            yk = function(begin + (j+1)*width);
            score  += (yp + yk) * width / 2.0;
        }
    }

  #pragma omp for private(i) reduction(+:score) 
  for(i = k*thread_num; i<count; i++)
    score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
 }  
   return score;
 }

しかし、いくつかのテストの結果、スコアは正しい値に近いが、等しくないことがわかりました。スレッドの1つが開始しない場合があります。OpenMpを使用していないときは、値は正しいです。

score 15 · Accepted Answer

非常に強くピークのある関数（x ^{（x ^sin（x）））}を統合しています。これは、統合する範囲で7桁以上をカバーします。これは32ビット浮動小数点数の制限であるため、数値を合計する順序によっては問題が発生します。これはOpenMPのことではなく、単なる数値感度のことです。

統合機能

したがって、たとえば、この完全にシリアルなコードが同じ積分を実行していると考えてください。

#include <stdio.h>
#include <math.h>

float function(float x){
    return pow(x,pow(x,sin(x)));
}

int main(int argc, char **argv) {

    const float begin=3., end=13.;
    const int count = 100000;
    const float width=(end-begin)/(1.*count);

    float integral1=0., integral2=0., integral3=0.;

    /* left to right */
    for (int i=0; i<count; i++) {
         integral1 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    /* right to left */
    for (int i=count-1; i>=0; i--) {
         integral2 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    /* centre outwards, first right-to-left, then left-to-right */
    for (int i=count/2; i<count; i++) {
         integral3 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }
    for (int i=count/2-1; i>=0; i--) {
         integral3 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    printf("Left to right: %lf\n", integral1);
    printf("Right to left: %lf\n", integral2);
    printf("Centre outwards: %lf\n", integral3);

    return 0;
}

これを実行すると、次のようになります。

$ ./reduce
Left to right: 5407308.500000
Right to left: 5407430.000000
Centre outwards: 5407335.500000

-あなたが見るのと同じ種類の違い。2つのスレッドで合計を行うと、必然的に合計の順序が変わるため、答えが変わります。

ここにはいくつかのオプションがあります。これが単なるテストの問題であり、この関数が実際に統合するものを表していない場合は、すでに問題がない可能性があります。それ以外の場合は、別の数値的方法を使用すると役立つ場合があります。

しかし、ここでも簡単な解決策があります。数値の範囲がの範囲を超えているfloatため、答えは合計の順序に非常に敏感ですが、の範囲内に快適に収まりdouble、問題はそれほど深刻ではありません。sに変更することdoubleは、すべての魔法の解決策ではないことに注意してください。場合によっては、問題を延期したり、数値的方法の欠陥を紙に書いたりすることができます。しかし、ここでは実際に根本的な問題にかなりうまく対処しています。float上記のすべてのsをsに変更すると、次のようになりdoubleます。

$ ./reduce
Left to right: 5407589.272885
Right to left: 5407589.272885
Centre outwards: 5407589.272885

一方、この関数を範囲（18,23）に統合する必要がある場合は、doubleでも節約できません。

c - OpenMPとreduction（）

アップデート：

1 に答える 1

Related

Reference