OpenMP と TBB を使用した共有メモリ プログラミングに関して言えば、私は初心者です。
データ ポイントのセットの凸包を見つけるために、QuickHull アルゴリズム ( http://en.wikipedia.org/wiki/QuickHull ) の並列バージョンを実装しています。( http://en.wikipedia.org/wiki/Convex_hull )。
基本的に、次のタスクを並行して実行できます。
- 左端と右端の点 (P と Q) を見つけます。
- これらの 2 点 (P と Q) の線の接続に従って、データセット全体を分割します。
- これら 2 つのセットのそれぞれについて、最後の分割が発生した線 (PQ) から最も遠い点を取得します。
- データを最も遠い点 (C) に基づいて 2 つのセットに分割します。1 つは線 PC の右側にあるすべての要素を含み、もう 1 つは線 QC の右側にあるすべての要素を含みます。
パート 3 と 4 は、すべてのサブセットが空になるまで再帰的に行われることに注意してください。
最初に、ほとんどを使用して OpenMP でこれを行いました#pragma omp parallel for...
。しかし、スピードアップが2倍を超えることはないので、個人的には何か間違っていると思います。次に、スピードアップを比較するために Intel TBB を使用して実装を行いましたが、これは負のスピードアップをもたらしました (大規模なデータ セットの場合でも)。TBB を使用して、tbb::parallel_for() と tbb::parallel_reduce() の両方を使用しました。
基本的に、私の質問は 2 つの部分に分けることができます: 1) OpenMP の実装 2) TBB の実装
パート1
以下のベンチマークでわかるように、データ セットのサイズが大きくなると、十分なスレッドを使用すると速度も向上します。
高速化が 2 倍を超えていないことに注意してください。個人的には、このアルゴリズムでは非常に悪いと思います。これは、大部分が並列化可能であるためです。関連するコードは次のとおりです。
void ompfindHull(POINT_VECTOR &input, Point* p1, Point* p2, POINT_VECTOR& output){
// If there are no points in the set... just stop. This is the stopping criteria for the recursion. :)
if (input.empty() || input.size() == 0) return;
int num_threads = omp_get_max_threads();
// Get the point that is the farthest from the p1-p2 segment
Point** farthest_sub = new Point*[num_threads];
double** distance_sub = new double*[num_threads];
int thread_id;
#pragma omp parallel private (thread_id)
{
thread_id = omp_get_thread_num();
farthest_sub[thread_id] = input[0];
distance_sub[thread_id] = new double(0);
#pragma omp for
for (int index = 1; index < input.size(); index++){
Point*a = p1;
Point*b = p2;
Point*c = input[index];
double distance = ( ( b->x - a->x ) * ( a->y - c->y ) ) - ( ( b->y - a->y ) * ( a->x - c->x ) );
distance = distance >= 0 ? distance : -distance;
double cur_distance = *distance_sub[thread_id];
if (cur_distance < distance){
farthest_sub[thread_id] = input[index];
distance_sub[thread_id] = new double(distance);
}
}
}
Point* farthestPoint = farthest_sub[0];
int distance = *distance_sub[0];
for (int index = 1; index < num_threads; index++){
if (distance < *distance_sub[index]){
farthestPoint = farthest_sub[index];
}
}
delete [] farthest_sub;
delete [] distance_sub;
// Add the farthest point to the output as it is part of the convex hull.
output.push_back(farthestPoint);
// Split in two sets.
// The first one contains points right from p1 - farthestPoint
// The second one contains points right from farthestPoint - p2
vector<POINT_VECTOR> left_sub(num_threads), right_sub(num_threads);
#pragma omp parallel private(thread_id)
{
thread_id = omp_get_thread_num();
#pragma omp for
for (size_t index = 0; index < input.size(); index++){
Point* curPoint = input[index];
if (curPoint != farthestPoint){
if (getPosition(p1, farthestPoint, curPoint) == RIGHT){
left_sub[thread_id].push_back(curPoint);
} else if (getPosition(farthestPoint, p2, curPoint) == RIGHT){
right_sub[thread_id].push_back(curPoint);
}
}
}
}
//Merge all vectors into a single vector :)
POINT_VECTOR left, right;
for (int index=0; index < num_threads; index++){
left.insert(left.end(), left_sub[index].begin(), left_sub[index].end());
right.insert(right.end(), right_sub[index].begin(), right_sub[index].end());
}
input.clear();
// We do more recursion :)
ompfindHull(left, p1, farthestPoint, output);
ompfindHull(right, farthestPoint, p2, output);
}
double ompquickHull(POINT_VECTOR input, POINT_VECTOR& output){
Timer timer;
timer.start();
// Find the left- and rightmost point.
// We get the number of available threads.
int num_threads = omp_get_max_threads();
int thread_id;
POINT_VECTOR minXPoints(num_threads);
POINT_VECTOR maxXPoints(num_threads);
// Devide all the points in subsets between several threads. For each of these subsets
// we need to find the minX and maxX
#pragma omp parallel shared(minXPoints,maxXPoints, input) private(thread_id)
{
thread_id = omp_get_thread_num();
minXPoints[thread_id] = input[0];
maxXPoints[thread_id] = input[0];
int index;
#pragma omp for
for (index = 1; index < input.size(); index++)
{
Point* curPoint = input[index];
if (curPoint->x > maxXPoints[thread_id]->x){
maxXPoints[thread_id] = curPoint;
} else if (curPoint->x < minXPoints[thread_id]->x) {
minXPoints[thread_id] = curPoint;
}
}
#pragma omp barrier
}
// We now have all the minX and maxX points of every single subset. We now use
// these values to find the overall min and max X-point.
Point* minXPoint = input[0], *maxXPoint = input[0];
for (int index = 0; index < num_threads; index++){
if (minXPoint->x > minXPoints[index]->x){
minXPoint = minXPoints[index];
}
if (maxXPoint->x < maxXPoints[index]->x){
maxXPoint = maxXPoints[index];
}
}
// These points are sure to be part of the convex hull, so add them
output.push_back(minXPoint);
output.push_back(maxXPoint);
// Now we have to split the set of point in subsets.
// The first one containing all points above the line
// The second one containing all points below the line
const int size = input.size();
vector<POINT_VECTOR> left_sub(num_threads), right_sub(num_threads);
#pragma omp parallel private(thread_id)
{
thread_id = omp_get_thread_num();
#pragma omp for
for (unsigned int index = 0; index < input.size(); index++){
Point* curPoint = input[index];
if (curPoint != minXPoint || curPoint != maxXPoint){
if (getPosition(minXPoint, maxXPoint, curPoint) == RIGHT){
left_sub[thread_id].push_back(curPoint);
}
else if (getPosition(maxXPoint, minXPoint, curPoint) == RIGHT){
right_sub[thread_id].push_back(curPoint);
}
}
}
}
//Merge all vectors into a single vector :)
POINT_VECTOR left, right;
for (int index=0; index < num_threads; index++){
left.insert(left.end(), left_sub[index].begin(), left_sub[index].end());
right.insert(right.end(), right_sub[index].begin(), right_sub[index].end());
}
// We now have the initial two points belonging to the hill
// We also split all the points into a group containing points left of AB and a group containing points right of of AB
// We now recursively find all other points belonging to the convex hull.
ompfindHull(left,minXPoint, maxXPoint, output);
ompfindHull(right, maxXPoint, minXPoint, output);
timer.end();
return timer.getTimeElapsed();
}
コードの大部分が並列化可能であるのに、8 コアを使用して 2 倍のスピードアップのみを達成するのが正常かどうかを知っている人はいますか? そうでない場合、ここで何が間違っているのですか!?
パート2
本当の問題はこれからです...
TBB 実装で同じテストを実行すると、次の結果が得られます。
ご覧のとおり、並列実装の実行時間は常に逐次実装の実行時間を超えています。スピードアップ グラフに関しては、スピードアップは 1 未満です。つまり、マイナスのスピードアップです。
私が作成したさまざまな構造体のコードは次のとおりです。
ご了承くださいtypedef tbb::concurrent_vector<Point*> CPOINT_VECTOR
編集: Arch のコメントを適用しました。
class FindExtremum{
public:
enum ExtremumType{
MINIMUM,MAXIMUM
};
public:
FindExtremum(CPOINT_VECTOR& points):fPoints(points), fMinPoint(points[0]), fMaxPoint(points[0]){}
FindExtremum(const FindExtremum& extremum, tbb::split):fPoints(extremum.fPoints), fMinPoint(extremum.fMinPoint), fMaxPoint(extremum.fMaxPoint){}
void join(const FindExtremum& other){
Point* curMinPoint = other.fMinPoint;
Point* curMaxPoint = other.fMaxPoint;
if (isLargerThan(curMinPoint, MINIMUM)){
fMinPoint = curMinPoint;
}
if (isSmallerThan(curMaxPoint, MAXIMUM)){
fMaxPoint = curMaxPoint;
}
}
void operator()(const BLOCKED_RANGE& range){
for (size_t index = range.begin(); index < range.end(); index++){
Point* curPoint = fPoints[index];
if (isLargerThan(curPoint, MINIMUM)){
fMinPoint = curPoint;
}
if (isSmallerThan(curPoint, MAXIMUM)){
fMaxPoint = curPoint;
}
}
}
private:
bool isSmallerThan(const Point* point, const ExtremumType& type){
switch (type){
case MINIMUM:
return fMinPoint->x < point->x;
case MAXIMUM:
return fMaxPoint->x < point->x;
}
}
bool isLargerThan(const Point* point, const ExtremumType& type){
return !isSmallerThan(point, type);
}
public:
Point* getMaxPoint(){
return this->fMaxPoint;
}
Point* getMinPoint(){
return this->fMinPoint;
}
public:
CPOINT_VECTOR fPoints;
Point* fMinPoint;
Point* fMaxPoint;
};
class Splitter{
public:
Splitter(const CPOINT_VECTOR& points, Point* point1, Point* point2,
Point* farthestPoint, CPOINT_VECTOR* left, CPOINT_VECTOR* right, int grainsize):
fPoints(points), p1(point1), p2(point2), farthestPoint(farthestPoint), fLeft(left), fRight(right), fGrain(grainsize)
{
//fLeft = new tbb::concurrent_vector<Point*>();
//fRight = new tbb::concurrent_vector<Point*>();
//fLeft = new vector<Point*>();
//fRight = new vector<Point*>();
};
Splitter(const Splitter& splitter, tbb::split):
fPoints(splitter.fPoints), p1(splitter.p1), p2(splitter.p2), farthestPoint(splitter.farthestPoint),
fLeft(splitter.fLeft), fRight(splitter.fRight), fGrain(splitter.fGrain){}
void operator()(const BLOCKED_RANGE& range) const{
const int grainsize = fGrain;
Point** left = new Point*[grainsize];
Point** right = new Point*[grainsize];
int leftcounter = 0;
int rightcounter = 0;
for (size_t index = range.begin(); index < range.end(); index++){
Point* curPoint = fPoints[index];
if (curPoint != farthestPoint){
if (getPosition(p1, farthestPoint, curPoint) == RIGHT){
left[leftcounter++] = curPoint;
} else if (getPosition(farthestPoint, p2, curPoint) == RIGHT){
right[rightcounter++] = curPoint;
}
}
}
appendVector(left,leftcounter,*fLeft);
appendVector(right,rightcounter,*fRight);
}
public:
Point* p1;
Point* p2;
Point* farthestPoint;
int fGrain;
CPOINT_VECTOR* fLeft;
CPOINT_VECTOR* fRight;
CPOINT_VECTOR fPoints;
};
class InitialSplitter{
public:
InitialSplitter(const CPOINT_VECTOR& points, CPOINT_VECTOR* left, CPOINT_VECTOR* right,
Point* point1, Point* point2, int grainsize):
fPoints(points), p1(point1), p2(point2), fLeft(left), fRight(right), fGrain(grainsize){}
InitialSplitter(const InitialSplitter& splitter, tbb::split):
fPoints(splitter.fPoints), p1(splitter.p1), p2(splitter.p2),
fLeft(splitter.fLeft), fRight(splitter.fRight), fGrain(splitter.fGrain){
}
void operator()(const BLOCKED_RANGE& range) const{
const int grainsize = fGrain;
Point** left = new Point*[grainsize];
Point** right = new Point*[grainsize];
int leftcounter = 0;
int rightcounter = 0;
for (size_t index = range.begin(); index < range.end(); index++){
Point* curPoint = fPoints[index];
if (curPoint != p1 || curPoint != p2){
if (getPosition(p1, p2, curPoint) == RIGHT){
left[leftcounter++] = curPoint;
} else if (getPosition(p2, p1, curPoint) == RIGHT){
right[rightcounter++] = curPoint;
}
}
}
appendVector(left,leftcounter,*fLeft);
appendVector(right,rightcounter,*fRight);
}
public:
CPOINT_VECTOR fPoints;
int fGrain;
Point* p1;
Point* p2;
CPOINT_VECTOR* fLeft;
CPOINT_VECTOR* fRight;
};
class FarthestPointFinder{
public:
FarthestPointFinder(const CPOINT_VECTOR& points, Point* p1, Point* p2):
fPoints(points), fFarthestPoint(points[0]),fDistance(-1), p1(p1), p2(p2){}
FarthestPointFinder(const FarthestPointFinder& fpf, tbb::split):
fPoints(fpf.fPoints), fFarthestPoint(fpf.fFarthestPoint),fDistance(-1), p1(fpf.p1), p2(fpf.p2){}
void operator()(const BLOCKED_RANGE& range){
for (size_t index = range.begin(); index < range.end(); index++){
Point* curPoint = fPoints[index];
double curDistance = distance(p1,p2,curPoint);
if (curDistance > fDistance){
fFarthestPoint = curPoint;
fDistance = curDistance;
}
}
}
void join(const FarthestPointFinder& other){
if (fDistance < other.fDistance){
fFarthestPoint = other.fFarthestPoint;
fDistance = other.fDistance;
}
}
public:
Point* getFarthestPoint(){
return this->fFarthestPoint;
}
public:
CPOINT_VECTOR fPoints;
Point* fFarthestPoint;
int fDistance;
Point* p1;
Point* p2;
};
QuickHull コードが続きます。
void tbbfindHull(CPOINT_VECTOR &input, Point* p1, Point* p2, POINT_VECTOR& output, int max_threads){
// If there are no points in the set... just stop. This is the stopping criteria for the recursion. :)
if (input.empty() || input.size() == 0) return;
else if (input.size() == 1) {
output.push_back(input[0]);
return;
}
// Get the point that is the farthest from the p1-p2 segment
int GRAINSIZE = ((double)input.size())/max_threads;
FarthestPointFinder fpf(input, p1, p2);
tbb::parallel_reduce(BLOCKED_RANGE(0,input.size(),GRAINSIZE), fpf);
Point *farthestPoint = fpf.getFarthestPoint();
// Add the farthest point to the output as it is part of the convex hull.
output.push_back(farthestPoint);
// Split in two sets.
// The first one contains points right from p1 - farthestPoint
// The second one contains points right from farthestPoint - p2
CPOINT_VECTOR* left = new CPOINT_VECTOR();
CPOINT_VECTOR* right = new CPOINT_VECTOR();
Splitter splitter(input,p1,p2,farthestPoint, left, right, GRAINSIZE);
tbb::parallel_for(BLOCKED_RANGE(0,input.size(), GRAINSIZE), splitter);
// We do more recursion :)
tbbfindHull(*left, p1, farthestPoint, output, max_threads);
tbbfindHull(*right, farthestPoint, p2, output, max_threads);
}
/**
* Calling the quickHull algorithm!
*/
double tbbquickHull(POINT_VECTOR input_o, POINT_VECTOR& output, int max_threads){
CPOINT_VECTOR input;
for (int i =0; i < input_o.size(); i++){
input.push_back(input_o[i]);
}
int GRAINSIZE = input.size()/max_threads;
Timer timer;
timer.start();
// Find the left- and rightmost point.
FindExtremum fextremum(input);
tbb::parallel_reduce(BLOCKED_RANGE(0, input.size(),GRAINSIZE), fextremum);
Point* minXPoint = fextremum.getMinPoint();
Point* maxXPoint = fextremum.getMaxPoint();
// These points are sure to be part of the convex hull, so add them
output.push_back(minXPoint);
output.push_back(maxXPoint);
// Now we have to split the set of point in subsets.
// The first one containing all points above the line
// The second one containing all points below the line
CPOINT_VECTOR* left = new CPOINT_VECTOR;
CPOINT_VECTOR* right = new CPOINT_VECTOR;
//Timer temp1;
//temp1.start();
InitialSplitter splitter(input, left, right, minXPoint, maxXPoint, GRAINSIZE);
tbb::parallel_for(BLOCKED_RANGE(0, input.size(),GRAINSIZE), splitter);
// We now have the initial two points belonging to the hill
// We also split all the points into a group containing points left of AB and a group containing points right of of AB
// We now recursively find all other points belonging to the convex hull.
tbbfindHull(*left,minXPoint, maxXPoint, output, max_threads);
tbbfindHull(*right, maxXPoint, minXPoint, output, max_threads);
timer.end();
return timer.getTimeElapsed();
}
TBB では、コードのさまざまな並列部分のタイミングを計るときに、いくつかの異常に気付くことがありました。tbb::parallel_for() を使用してサブセット全体を最初InitialSplitter
に 2 つのサブセットに分割するには、対応する OpenMP バージョンのランタイム全体とほぼ同じ時間がかかりますが、この時間は、異なる数のスレッドが使用されても変わりません。InitialSplitter
tbb::parallel_for() への引数として渡される -object内でタイミングを取ると、大幅なスピードアップが見られるため、これは奇妙です。' operator() メソッドで繰り返される for ループはInitialSplitters
、スレッド数が増えると予想される速度向上を示しています。
tbb::parallel_for()
たとえば、インスタンスを取得する初期化のような単一のInitialSplitter
インスタンスが、OpenMP 実装全体を実行するのと同じくらい時間がかかるのは非常に奇妙だと思います。-operator()tbb::parallel_for()
内のタイミングがほぼ線形のスピードアップを観察できる一方で、スピードアップなしの周りのタイミングが観察できるのは非常に奇妙だと思います...InitialSplitters
私を助けてくれる人はここにいますか!?
前もって感謝します!