c++ - parallel_for 実装の最適化

Question

Microsoft の PPL の parallel_for ループを使っているコードがあったのですが、Linux と Mac に移植することになり、独自のバージョンを作成する必要がありました。期待どおりに動作し、パフォーマンスもそれなりに出ていますが、ほかの条件が同一の PPL の parallel_for ループと比べて 20% ほど遅くなります。

補足すると、通常は 1 万回から 10 万回の反復を実行しますが、各反復で行う処理は平方根と乗算の 2 つだけです。ただし、これはインタラクティブなアプリケーション向けなので、非常に高速に実行する必要があります。

私は C++11 にまだ不慣れなので、経験豊富な方に私の実装を見ていただき、なぜ遅いのか、何を改善できるのかについてフィードバックをいただければ幸いです。

/// Runs userLambda(i) once for every index i in [start, end), splitting the
/// range into contiguous chunks that execute concurrently as std::async tasks.
///
/// @tparam THREADS_PER_CORE  Multiplier applied to
///         std::thread::hardware_concurrency() to choose the number of tasks.
/// @param start       First index (inclusive).
/// @param end         One past the last index. Nothing runs when end <= start.
/// @param userLambda  Callable invoked once per index. It is invoked from
///                    several tasks at once, so it must tolerate concurrent calls.
///
/// Blocks until every chunk has finished. An exception escaping userLambda is
/// rethrown to the caller when the corresponding future is collected.
template<size_t THREADS_PER_CORE = 1>
void parallel_forMine(size_t start, size_t end, const std::function<void(size_t)> &userLambda)
{
    // Empty or inverted range: bail out before the unsigned subtraction below
    // could wrap around.
    if (end <= start)
        return;

    const size_t total = end - start;

    // hardware_concurrency() is allowed to return 0 when the value is unknown
    // (and THREADS_PER_CORE could be 0); always keep at least one worker so the
    // division below is well defined.
    size_t threadCount = static_cast<size_t>(std::thread::hardware_concurrency()) * THREADS_PER_CORE;
    if (threadCount == 0)
        threadCount = 1;

    // Never create more tasks than there are iterations.
    threadCount = std::min(threadCount, total);

    // Ceiling division so the chunks cover the whole range.
    const size_t blockSize = total / threadCount + (total % threadCount != 0 ? 1 : 0);

    std::vector<std::future<void>> futures;
    futures.reserve(threadCount);

    for (size_t blockStart = start; blockStart < end; )
    {
        // Clamp the last chunk to `end` without risking overflow of
        // blockStart + blockSize.
        const size_t blockEnd = (end - blockStart > blockSize) ? blockStart + blockSize : end;

        // Chunk bounds are captured by value because they change every
        // iteration; userLambda outlives the tasks since we wait on them below.
        futures.push_back(std::async(std::launch::async, [blockStart, blockEnd, &userLambda]
        {
            for (size_t i = blockStart; i < blockEnd; ++i)
                userLambda(i);
        }));

        blockStart = blockEnd;
    }

    // Wait for every chunk; get() also propagates any stored exception.
    for (std::future<void> &f : futures)
        f.get();
}

完全なテストコードは以下のとおりです。

#include "stdafx.h" //nothing in there in this test

#include <ppl.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <functional>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

/// Runs userLambda(i) once for every index i in [start, end), splitting the
/// range into contiguous chunks that execute concurrently as std::async tasks.
///
/// @tparam THREADS_PER_CORE  Multiplier applied to
///         std::thread::hardware_concurrency() to choose the number of tasks.
/// @param start       First index (inclusive).
/// @param end         One past the last index. Nothing runs when end <= start.
/// @param userLambda  Callable invoked once per index. It is invoked from
///                    several tasks at once, so it must tolerate concurrent calls.
///
/// Blocks until every chunk has finished. An exception escaping userLambda is
/// rethrown to the caller when the corresponding future is collected.
template<size_t THREADS_PER_CORE = 1>
void parallel_forMine(size_t start, size_t end, const std::function<void(size_t)> &userLambda)
{
    // Empty or inverted range: bail out before the unsigned subtraction below
    // could wrap around.
    if (end <= start)
        return;

    const size_t total = end - start;

    // hardware_concurrency() is allowed to return 0 when the value is unknown
    // (and THREADS_PER_CORE could be 0); always keep at least one worker so the
    // division below is well defined.
    size_t threadCount = static_cast<size_t>(std::thread::hardware_concurrency()) * THREADS_PER_CORE;
    if (threadCount == 0)
        threadCount = 1;

    // Never create more tasks than there are iterations.
    threadCount = std::min(threadCount, total);

    // Ceiling division so the chunks cover the whole range.
    const size_t blockSize = total / threadCount + (total % threadCount != 0 ? 1 : 0);

    std::vector<std::future<void>> futures;
    futures.reserve(threadCount);

    for (size_t blockStart = start; blockStart < end; )
    {
        // Clamp the last chunk to `end` without risking overflow of
        // blockStart + blockSize.
        const size_t blockEnd = (end - blockStart > blockSize) ? blockStart + blockSize : end;

        // Chunk bounds are captured by value because they change every
        // iteration; userLambda outlives the tasks since we wait on them below.
        futures.push_back(std::async(std::launch::async, [blockStart, blockEnd, &userLambda]
        {
            for (size_t i = blockStart; i < blockEnd; ++i)
                userLambda(i);
        }));

        blockStart = blockEnd;
    }

    // Wait for every chunk; get() also propagates any stored exception.
    for (std::future<void> &f : futures)
        f.get();
}



/// Benchmark driver: runs the same numeric workload serially, through the PPL
/// Concurrency::parallel_for, and through parallel_forMine, printing the elapsed
/// time of each phase in microseconds, then cross-checks the three result
/// vectors and waits for a key press before exiting.
int main()
{
    using Clock = std::chrono::high_resolution_clock;

    const int outerCount = 1000;     // number of result slots / parallel iterations
    const int innerCount = 1000000;  // repetitions per slot (the artificial workload)

    // Elapsed time since `since` in microseconds. duration_cast makes the unit
    // explicit instead of assuming the clock's native tick period.
    const auto elapsedMicros = [](Clock::time_point since)
    {
        return std::chrono::duration_cast<std::chrono::microseconds>(Clock::now() - since).count();
    };

    //serial execution
    std::vector<double> valueSerial(outerCount);
    const auto startSerial = Clock::now();
    for (int i = 0; i < outerCount; i++)
        for (int j = 0; j < innerCount; j++)
            // std::-qualified math on an explicit double: avoids accidentally
            // picking the integer ::abs overload.
            valueSerial[i] += std::sqrt(std::abs(std::cos(std::sin(std::sqrt(static_cast<double>(i))))));
    const auto durationSerial = elapsedMicros(startSerial);
    // std::endl on purpose: flush so each timing shows up as soon as its phase ends.
    std::cout << durationSerial << " Serial" << std::endl;


    //PPL parallel for
    std::vector<double> valueParallelForPPL(outerCount);
    const auto startParallelForPPL = Clock::now();
    Concurrency::parallel_for(size_t(0), size_t(outerCount), [&](size_t i)
    {
        for (int j = 0; j < innerCount; j++)
            valueParallelForPPL[i] += std::sqrt(std::abs(std::cos(std::sin(std::sqrt(static_cast<double>(i))))));
    });
    const auto durationParallelForPPL = elapsedMicros(startParallelForPPL);
    std::cout << durationParallelForPPL << " PPL parallel for" << std::endl;


    //my parallel for
    std::vector<double> valueParallelFor(outerCount);
    const auto startParallelFor = Clock::now();
    parallel_forMine(0, outerCount, [&](size_t i)
    {
        for (int j = 0; j < innerCount; j++)
            valueParallelFor[i] += std::sqrt(std::abs(std::cos(std::sin(std::sqrt(static_cast<double>(i))))));
    });
    const auto durationParallelFor = elapsedMicros(startParallelFor);
    std::cout << durationParallelFor << " My parallel for" << std::endl;


    //only really to make sure the compiler doesn't optimize everything away
    // NOTE(review): exact comparison presumably holds because every variant
    // performs the same additions in the same order per slot - confirm.
    for (size_t i = 0; i < valueSerial.size(); i++)
        if (valueSerial[i] != valueParallelFor[i] || valueParallelFor[i] != valueParallelForPPL[i])
            std::cout << "error";


    // Keep the console window open until the user presses a key.
    std::cin.get();

    return 0;
}

c++ - parallel_for 実装の最適化

回答 0 件

Related

Reference