c++ - QuickHull の並列化: OpenMP はわずかなスピードアップを実現し、TBB は負のスピードアップを実現

Question

OpenMP と TBB を使用した共有メモリプログラミングに関して言えば、私は初心者です。

データポイントのセットの凸包を見つけるために、QuickHull アルゴリズム ( http://en.wikipedia.org/wiki/QuickHull ) の並列バージョンを実装しています。( http://en.wikipedia.org/wiki/Convex_hull )。

基本的に、次のタスクを並行して実行できます。

左端と右端の点 (P と Q) を見つけます。
これらの 2 点 (P と Q) の線の接続に従って、データセット全体を分割します。
これら 2 つのセットのそれぞれについて、最後の分割が発生した線 (PQ) から最も遠い点を取得します。
データを最も遠い点 (C) に基づいて 2 つのセットに分割します。1 つは線 PC の右側にあるすべての要素を含み、もう 1 つは線 QC の右側にあるすべての要素を含みます。

パート 3 と 4 は、すべてのサブセットが空になるまで再帰的に行われることに注意してください。

最初に、ほとんどを使用して OpenMP でこれを行いました#pragma omp parallel for...。しかし、スピードアップが2倍を超えることはないので、個人的には何か間違っていると思います。次に、スピードアップを比較するために Intel TBB を使用して実装を行いましたが、これは負のスピードアップをもたらしました (大規模なデータセットの場合でも)。TBB を使用して、tbb::parallel_for() と tbb::parallel_reduce() の両方を使用しました。

基本的に、私の質問は 2 つの部分に分けることができます: 1) OpenMP の実装 2) TBB の実装

パート1

以下のベンチマークでわかるように、データセットのサイズが大きくなると、十分なスレッドを使用すると速度も向上します。

最大 10,000 要素の小さなセットのランタイム 100,000 要素から始まる大規模なセットのランタイム OpenMP とシーケンシャルアルゴリズムの比較を高速化

高速化が 2 倍を超えていないことに注意してください。個人的には、このアルゴリズムでは非常に悪いと思います。これは、大部分が並列化可能であるためです。関連するコードは次のとおりです。

     void ompfindHull(POINT_VECTOR &input, Point* p1, Point* p2, POINT_VECTOR& output){
        // If there are no points in the set... just stop. This is the stopping criteria for the recursion. :)
        if (input.empty() || input.size() == 0) return;

        int num_threads = omp_get_max_threads();
        // Get the point that is the farthest from the p1-p2 segment
        Point** farthest_sub = new Point*[num_threads];
        double** distance_sub = new double*[num_threads];
        int thread_id;
        #pragma omp parallel private (thread_id)
        {
            thread_id = omp_get_thread_num();
            farthest_sub[thread_id] = input[0];
            distance_sub[thread_id] = new double(0);

            #pragma omp for
                for (int index = 1; index < input.size(); index++){
                    Point*a = p1;
                    Point*b = p2;
                    Point*c = input[index];

                    double distance = ( ( b->x - a->x ) * ( a->y - c->y ) ) - ( ( b->y - a->y ) * ( a->x - c->x ) );
                    distance = distance >= 0 ? distance : -distance;

                    double cur_distance = *distance_sub[thread_id];
                    if (cur_distance < distance){
                        farthest_sub[thread_id] = input[index];
                        distance_sub[thread_id] = new double(distance);
                    }
                }
        }

        Point* farthestPoint = farthest_sub[0];
        int distance = *distance_sub[0];
        for (int index = 1; index < num_threads; index++){
            if (distance < *distance_sub[index]){
                farthestPoint = farthest_sub[index];
            }
        }

        delete [] farthest_sub;
        delete [] distance_sub;

        // Add the farthest point to the output as it is part of the convex hull.
        output.push_back(farthestPoint);

        // Split in two sets.
        // The first one contains points right from p1 - farthestPoint
        // The second one contains points right from farthestPoint - p2
        vector<POINT_VECTOR> left_sub(num_threads), right_sub(num_threads);
        #pragma omp parallel private(thread_id)
        {
            thread_id = omp_get_thread_num();
            #pragma omp for
            for (size_t index = 0; index < input.size(); index++){
                Point* curPoint = input[index];
                if (curPoint != farthestPoint){
                    if (getPosition(p1, farthestPoint, curPoint) == RIGHT){
                        left_sub[thread_id].push_back(curPoint);
                    } else if (getPosition(farthestPoint, p2, curPoint) == RIGHT){
                        right_sub[thread_id].push_back(curPoint);
                    }
                }
            }
        }

        //Merge all vectors into a single vector :)
        POINT_VECTOR left, right;
        for (int index=0; index < num_threads; index++){
            left.insert(left.end(), left_sub[index].begin(), left_sub[index].end());
            right.insert(right.end(), right_sub[index].begin(), right_sub[index].end());
        }

        input.clear();


        // We do more recursion :)
        ompfindHull(left, p1, farthestPoint, output);
        ompfindHull(right, farthestPoint, p2, output);
     }

     double ompquickHull(POINT_VECTOR input, POINT_VECTOR& output){
        Timer timer;
        timer.start();

        // Find the left- and rightmost point.
        // We get the number of available threads.
        int num_threads = omp_get_max_threads();
        int thread_id;
        POINT_VECTOR minXPoints(num_threads);
        POINT_VECTOR maxXPoints(num_threads);

        // Devide all the points in subsets between several threads. For each of these subsets
        // we need to find the minX and maxX
        #pragma omp parallel shared(minXPoints,maxXPoints, input) private(thread_id)
        {
            thread_id = omp_get_thread_num();
            minXPoints[thread_id] = input[0];
            maxXPoints[thread_id] = input[0];

            int index;
            #pragma omp for
            for (index = 1; index < input.size(); index++)
            {
                Point* curPoint = input[index];
                if (curPoint->x > maxXPoints[thread_id]->x){
                    maxXPoints[thread_id] = curPoint;
                } else if (curPoint->x < minXPoints[thread_id]->x) {
                    minXPoints[thread_id] = curPoint;
                }
            }

            #pragma omp barrier

        }

        // We now have all the minX and maxX points of every single subset. We now use
        // these values to find the overall min and max X-point.
        Point* minXPoint = input[0], *maxXPoint = input[0];
        for (int index = 0; index < num_threads; index++){
            if (minXPoint->x > minXPoints[index]->x){
                minXPoint = minXPoints[index];
            }

            if (maxXPoint->x < maxXPoints[index]->x){
                maxXPoint = maxXPoints[index];
            }
        }

        // These points are sure to be part of the convex hull, so add them
        output.push_back(minXPoint);
        output.push_back(maxXPoint);

        // Now we have to split the set of point in subsets.
        // The first one containing all points above the line
        // The second one containing all points below the line
        const int size = input.size();
        vector<POINT_VECTOR> left_sub(num_threads), right_sub(num_threads);

        #pragma omp parallel private(thread_id)
        {
            thread_id = omp_get_thread_num();
            #pragma omp for
            for (unsigned int index = 0; index < input.size(); index++){
                Point* curPoint = input[index];
                if (curPoint != minXPoint || curPoint != maxXPoint){
                    if (getPosition(minXPoint, maxXPoint, curPoint) == RIGHT){
                        left_sub[thread_id].push_back(curPoint);
                    }
                    else if (getPosition(maxXPoint, minXPoint, curPoint) == RIGHT){
                        right_sub[thread_id].push_back(curPoint);
                    }
                }
            }
        }

        //Merge all vectors into a single vector :)
        POINT_VECTOR left, right;
        for (int index=0; index < num_threads; index++){
            left.insert(left.end(), left_sub[index].begin(), left_sub[index].end());
            right.insert(right.end(), right_sub[index].begin(), right_sub[index].end());
        }

        // We now have the initial two points belonging to the hill
        // We also split all the points into a group containing points left of AB and a group containing points right of of AB
        // We now recursively find all other points belonging to the convex hull.

        ompfindHull(left,minXPoint, maxXPoint, output);
        ompfindHull(right, maxXPoint, minXPoint, output);

        timer.end();

        return timer.getTimeElapsed();
     }

コードの大部分が並列化可能であるのに、8 コアを使用して 2 倍のスピードアップのみを達成するのが正常かどうかを知っている人はいますか? そうでない場合、ここで何が間違っているのですか!?

パート2

本当の問題はこれからです...

TBB 実装で同じテストを実行すると、次の結果が得られます。小さなデータセットのランタイム大規模なデータセットのランタイム高速化グラフ

ご覧のとおり、並列実装の実行時間は常に逐次実装の実行時間を超えています。スピードアップグラフに関しては、スピードアップは 1 未満です。つまり、マイナスのスピードアップです。

私が作成したさまざまな構造体のコードは次のとおりです。

ご了承くださいtypedef tbb::concurrent_vector<Point*> CPOINT_VECTOR

編集: Arch のコメントを適用しました。

    class FindExtremum{
public:
    enum ExtremumType{
        MINIMUM,MAXIMUM
    };

public:
    FindExtremum(CPOINT_VECTOR& points):fPoints(points), fMinPoint(points[0]), fMaxPoint(points[0]){}
    FindExtremum(const FindExtremum& extremum, tbb::split):fPoints(extremum.fPoints), fMinPoint(extremum.fMinPoint), fMaxPoint(extremum.fMaxPoint){}

    void join(const FindExtremum& other){
        Point* curMinPoint = other.fMinPoint;
        Point* curMaxPoint = other.fMaxPoint;

        if (isLargerThan(curMinPoint, MINIMUM)){
            fMinPoint = curMinPoint;
        }

        if (isSmallerThan(curMaxPoint, MAXIMUM)){
            fMaxPoint = curMaxPoint;
        }
    }

    void operator()(const BLOCKED_RANGE& range){
        for (size_t index = range.begin(); index < range.end(); index++){
            Point* curPoint = fPoints[index];

            if (isLargerThan(curPoint, MINIMUM)){
                fMinPoint = curPoint;
            }

            if (isSmallerThan(curPoint, MAXIMUM)){
                fMaxPoint = curPoint;
            }
        }
    }

private:
    bool isSmallerThan(const Point* point, const ExtremumType& type){
        switch (type){
        case MINIMUM:
            return fMinPoint->x < point->x;
        case MAXIMUM:
            return fMaxPoint->x < point->x;
        }
    }

    bool isLargerThan(const Point* point, const ExtremumType& type){
        return !isSmallerThan(point, type);
    }

public:
    Point* getMaxPoint(){
        return this->fMaxPoint;
    }

    Point* getMinPoint(){
        return this->fMinPoint;
    }

public:
    CPOINT_VECTOR fPoints;
    Point* fMinPoint;
    Point* fMaxPoint;

};

class Splitter{
public:
    Splitter(const CPOINT_VECTOR& points, Point* point1, Point* point2,
            Point* farthestPoint, CPOINT_VECTOR* left, CPOINT_VECTOR* right, int grainsize):
        fPoints(points), p1(point1), p2(point2), farthestPoint(farthestPoint), fLeft(left), fRight(right), fGrain(grainsize)
    {
        //fLeft = new tbb::concurrent_vector<Point*>();
        //fRight = new tbb::concurrent_vector<Point*>();
        //fLeft = new vector<Point*>();
        //fRight = new vector<Point*>();
    };

    Splitter(const Splitter& splitter, tbb::split):
        fPoints(splitter.fPoints), p1(splitter.p1), p2(splitter.p2), farthestPoint(splitter.farthestPoint),
        fLeft(splitter.fLeft), fRight(splitter.fRight), fGrain(splitter.fGrain){}

    void operator()(const BLOCKED_RANGE& range) const{
        const int grainsize = fGrain;
        Point** left = new Point*[grainsize];
        Point** right = new Point*[grainsize];
        int leftcounter = 0;
        int rightcounter = 0;
        for (size_t index = range.begin(); index < range.end(); index++){
            Point* curPoint = fPoints[index];
            if (curPoint != farthestPoint){
                if (getPosition(p1, farthestPoint, curPoint) == RIGHT){
                    left[leftcounter++] = curPoint;
                } else if (getPosition(farthestPoint, p2, curPoint) == RIGHT){
                    right[rightcounter++] = curPoint;
                }
            }
        }
        appendVector(left,leftcounter,*fLeft);
        appendVector(right,rightcounter,*fRight);
    }

public:
    Point* p1;
    Point* p2;
    Point* farthestPoint;
    int fGrain;
    CPOINT_VECTOR* fLeft;
    CPOINT_VECTOR* fRight;
    CPOINT_VECTOR fPoints;

};

class InitialSplitter{
public:
    InitialSplitter(const CPOINT_VECTOR& points, CPOINT_VECTOR* left, CPOINT_VECTOR* right,
                    Point* point1, Point* point2, int grainsize):
            fPoints(points), p1(point1), p2(point2), fLeft(left), fRight(right), fGrain(grainsize){}

    InitialSplitter(const InitialSplitter& splitter, tbb::split):
        fPoints(splitter.fPoints), p1(splitter.p1), p2(splitter.p2),
        fLeft(splitter.fLeft), fRight(splitter.fRight), fGrain(splitter.fGrain){
    }

    void operator()(const BLOCKED_RANGE& range) const{
        const int grainsize = fGrain;
        Point** left = new Point*[grainsize];
        Point** right = new Point*[grainsize];
        int leftcounter = 0;
        int rightcounter = 0;
        for (size_t index = range.begin(); index < range.end(); index++){
            Point* curPoint = fPoints[index];
            if (curPoint != p1 || curPoint != p2){
                if (getPosition(p1, p2, curPoint) == RIGHT){
                    left[leftcounter++] = curPoint;
                } else if (getPosition(p2, p1, curPoint) == RIGHT){
                    right[rightcounter++] = curPoint;
                }
            }
        }
        appendVector(left,leftcounter,*fLeft);
        appendVector(right,rightcounter,*fRight);
    }

public:
    CPOINT_VECTOR fPoints;
    int fGrain;
    Point* p1;
    Point* p2;
    CPOINT_VECTOR* fLeft;
    CPOINT_VECTOR* fRight;
};

class FarthestPointFinder{
public:
    FarthestPointFinder(const CPOINT_VECTOR& points, Point* p1, Point* p2):
        fPoints(points), fFarthestPoint(points[0]),fDistance(-1), p1(p1), p2(p2){}

    FarthestPointFinder(const FarthestPointFinder& fpf, tbb::split):
        fPoints(fpf.fPoints), fFarthestPoint(fpf.fFarthestPoint),fDistance(-1), p1(fpf.p1), p2(fpf.p2){}

    void operator()(const BLOCKED_RANGE& range){
        for (size_t index = range.begin(); index < range.end(); index++){
            Point* curPoint = fPoints[index];
            double curDistance = distance(p1,p2,curPoint);
            if (curDistance > fDistance){
                fFarthestPoint = curPoint;
                fDistance = curDistance;
            }
        }
    }

    void join(const FarthestPointFinder& other){
        if (fDistance < other.fDistance){
            fFarthestPoint = other.fFarthestPoint;
            fDistance = other.fDistance;
        }
    }

public:
    Point* getFarthestPoint(){
        return this->fFarthestPoint;
    }

public:
    CPOINT_VECTOR fPoints;
    Point* fFarthestPoint;
    int fDistance;
    Point* p1;
    Point* p2;
};

QuickHull コードが続きます。

   void tbbfindHull(CPOINT_VECTOR &input, Point* p1, Point* p2, POINT_VECTOR& output, int max_threads){
    // If there are no points in the set... just stop. This is the stopping criteria for the recursion. :)
    if (input.empty() || input.size() == 0) return;
    else if (input.size() == 1) {
        output.push_back(input[0]);
        return;
    }

    // Get the point that is the farthest from the p1-p2 segment

    int GRAINSIZE = ((double)input.size())/max_threads;

    FarthestPointFinder fpf(input, p1, p2);
    tbb::parallel_reduce(BLOCKED_RANGE(0,input.size(),GRAINSIZE), fpf);
    Point *farthestPoint = fpf.getFarthestPoint();

    // Add the farthest point to the output as it is part of the convex hull.
    output.push_back(farthestPoint);

    // Split in two sets.
    // The first one contains points right from p1 - farthestPoint
    // The second one contains points right from farthestPoint - p2

    CPOINT_VECTOR* left = new CPOINT_VECTOR();
    CPOINT_VECTOR* right = new CPOINT_VECTOR();

    Splitter splitter(input,p1,p2,farthestPoint, left, right, GRAINSIZE);
    tbb::parallel_for(BLOCKED_RANGE(0,input.size(), GRAINSIZE), splitter);

    // We do more recursion :)
    tbbfindHull(*left, p1, farthestPoint, output, max_threads);
    tbbfindHull(*right, farthestPoint, p2, output, max_threads);

}

/**
 * Calling the quickHull algorithm!
 */
double tbbquickHull(POINT_VECTOR input_o, POINT_VECTOR& output, int max_threads){

    CPOINT_VECTOR input;
    for (int i =0; i < input_o.size(); i++){
        input.push_back(input_o[i]);
    }

    int GRAINSIZE = input.size()/max_threads;
    Timer timer;
    timer.start();

    // Find the left- and rightmost point.
    FindExtremum fextremum(input);
    tbb::parallel_reduce(BLOCKED_RANGE(0, input.size(),GRAINSIZE), fextremum);

    Point* minXPoint = fextremum.getMinPoint();
    Point* maxXPoint = fextremum.getMaxPoint();

    // These points are sure to be part of the convex hull, so add them
    output.push_back(minXPoint);
    output.push_back(maxXPoint);

    // Now we have to split the set of point in subsets.
    // The first one containing all points above the line
    // The second one containing all points below the line
    CPOINT_VECTOR* left = new CPOINT_VECTOR;
    CPOINT_VECTOR* right = new CPOINT_VECTOR;

    //Timer temp1;
    //temp1.start();
    InitialSplitter splitter(input, left, right, minXPoint, maxXPoint, GRAINSIZE);
    tbb::parallel_for(BLOCKED_RANGE(0, input.size(),GRAINSIZE), splitter);
    // We now have the initial two points belonging to the hill
    // We also split all the points into a group containing points left of AB and a group containing points right of of AB
    // We now recursively find all other points belonging to the convex hull.

    tbbfindHull(*left,minXPoint, maxXPoint, output, max_threads);
    tbbfindHull(*right, maxXPoint, minXPoint, output, max_threads);
    timer.end();

    return timer.getTimeElapsed();
}

TBB では、コードのさまざまな並列部分のタイミングを計るときに、いくつかの異常に気付くことがありました。tbb::parallel_for() を使用してサブセット全体を最初InitialSplitterに 2 つのサブセットに分割するには、対応する OpenMP バージョンのランタイム全体とほぼ同じ時間がかかりますが、この時間は、異なる数のスレッドが使用されても変わりません。InitialSplittertbb::parallel_for() への引数として渡される -object内でタイミングを取ると、大幅なスピードアップが見られるため、これは奇妙です。' operator() メソッドで繰り返される for ループはInitialSplitters、スレッド数が増えると予想される速度向上を示しています。

tbb::parallel_for()たとえば、インスタンスを取得する初期化のような単一のInitialSplitterインスタンスが、OpenMP 実装全体を実行するのと同じくらい時間がかかるのは非常に奇妙だと思います。-operator()tbb::parallel_for()内のタイミングがほぼ線形のスピードアップを観察できる一方で、スピードアップなしの周りのタイミングが観察できるのは非常に奇妙だと思います...InitialSplitters

私を助けてくれる人はここにいますか!?

前もって感謝します！

score 1 · Accepted Answer

役に立つと思われるコメントがいくつかあります。

一般に、スレッド番号 (thread_id) に直接アクセスすることは避けようとします。スレッド数と同じサイズの配列を定義する代わりに、変数を並列ブロックで定義する必要があります (これにより、変数が自動的に非公開になります)。次に、並列ブロックの後にスレッド数をループする代わりに、アトミック、クリティカル、または単一のアプローチを使用する必要があります (ここではどれが最適かはわかりません)。このようなもの。

Point* farthestPoint;
//int distance = *distance_sub[0];  //is this a bug shouldn't distance be a double?
double distance = 0
#pragma omp parallel private
{
    Point farthest_sub = input[0];
    double distance_sub = 0;
     #pragma omp for nowait
     for (int index = 1; index < input.size(); index++){
     // for loop code
     }
     #pragma omp critical 
     {        
         if (distance < distance_sub){
            farthestPoint = farthest_sub;
         }
     }
     #pragma omp barrier 
     //next part of code 
}

現在直面している問題の 1 つは、偽の共有です。各スレッドは、同じキャッシュライン (配列 distance_sub[num_threads] など) 内の配列に書き込もうとしています。並列ブロック内で値を宣言するときに OpenMP が何をするかはわかりませんが、偽の共有を避けるために値を割り当てる可能性が高いと思われます。

特に少数の要素で、OpenMP を何度も呼び出さないようにする必要がある別のコメント。OpenMP にはオーバーヘッドがあります。バリアなどを使用して、1 つの並列ブロックでできるだけ多くを取得しようとします。

また、あなたのコードでint distance = *distance_sub[0]はこれはバグですか? 距離は倍じゃないの？

最後に、衒学的なポイント。私はあなたが8コアを持っているとは思わない。Intel ハイパースレッディングにより、おそらく 4 つのコアと 8 つのハードウェアスレッドを使用しています。この区別が重要になる場合があります。

c++ - QuickHull の並列化: OpenMP はわずかなスピードアップを実現し、TBB は負のスピードアップを実現

パート1

パート2

編集: Arch のコメントを適用しました。

2 に答える 2

Related

Reference