c# - ヒストグラム関数の並列化

Question

32bppArgbビットマップからヒストグラムを計算する単純な関数について、通常バージョンと並列バージョンを実装しました。通常バージョンは1920x1080の画像で約0.03秒かかりますが、並列バージョンは0.07秒かかります。

スレッド化のオーバーヘッドは本当にそれほど大きいのでしょうか？この処理を高速化できる、Parallel.For以外の仕組みはありますか？30fpsのビデオを扱っているので、これを高速化する必要があります。

簡略化されたコードは次のとおりです。

/// <summary>
/// Per-channel (alpha, red, green, blue) histogram of a 32 bits-per-pixel bitmap.
/// Each channel has 256 bins; the MaxX fields hold the largest bin count of each
/// channel and MaxT holds the largest bin count across all four channels.
/// </summary>
public sealed class Histogram
{
    // Number of distinct values a single 8-bit channel can take.
    private const int BinCount = 256;

    // The pixel loop reads four bytes (B, G, R, A) per pixel.
    private const int BytesPerPixel = 4;

    public int MaxA = 0;
    public int MaxR = 0;
    public int MaxG = 0;
    public int MaxB = 0;
    public int MaxT = 0;

    public int [] A = null;
    public int [] R = null;
    public int [] G = null;
    public int [] B = null;

    /// <summary>Creates an empty histogram with all bins and maxima set to zero.</summary>
    public Histogram ()
    {
        this.A = new int [BinCount];
        this.R = new int [BinCount];
        this.G = new int [BinCount];
        this.B = new int [BinCount];

        this.Initialize();
    }

    /// <summary>Resets every bin and every maximum to zero.</summary>
    public void Initialize ()
    {
        this.MaxA = 0;
        this.MaxR = 0;
        this.MaxG = 0;
        this.MaxB = 0;
        this.MaxT = 0;

        System.Array.Clear(this.A, 0, this.A.Length);
        System.Array.Clear(this.R, 0, this.R.Length);
        System.Array.Clear(this.G, 0, this.G.Length);
        System.Array.Clear(this.B, 0, this.B.Length);
    }

    /// <summary>
    /// Locks the whole bitmap as 32bpp ARGB, computes its histogram and unlocks it again.
    /// </summary>
    /// <param name="bitmap">Bitmap to analyse.</param>
    /// <param name="parallel">When true, rows are processed with Parallel.For.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="bitmap"/> is null.</exception>
    public void ComputeHistogram (System.Drawing.Bitmap bitmap, bool parallel = false)
    {
        if (bitmap == null)
        {
            throw new System.ArgumentNullException("bitmap");
        }

        System.Drawing.Imaging.BitmapData data = bitmap.LockBits
        (
            new System.Drawing.Rectangle(0, 0, bitmap.Width, bitmap.Height),
            System.Drawing.Imaging.ImageLockMode.ReadOnly,
            System.Drawing.Imaging.PixelFormat.Format32bppArgb
        );

        try
        {
            this.ComputeHistogram(data, parallel);
        }
        finally
        {
            // Single unlock point, reached on success and on failure alike.
            bitmap.UnlockBits(data);
        }
    }

    /// <summary>
    /// Computes the histogram of already-locked 32 bits-per-pixel bitmap data.
    /// Previous contents of this instance are discarded.
    /// </summary>
    /// <param name="data">Locked bitmap data laid out as B, G, R, A bytes per pixel.</param>
    /// <param name="parallel">When true, rows are processed with Parallel.For.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="data"/> is null.</exception>
    /// <exception cref="System.ArgumentException">The data is not 32 bits per pixel.</exception>
    public void ComputeHistogram (System.Drawing.Imaging.BitmapData data, bool parallel = false)
    {
        if (data == null)
        {
            throw new System.ArgumentNullException("data");
        }

        // The pixel loop reads four bytes per pixel; any other layout would read out of bounds.
        if (System.Drawing.Image.GetPixelFormatSize(data.PixelFormat) != 8 * BytesPerPixel)
        {
            throw new System.ArgumentException("Only 32 bits-per-pixel bitmap data is supported.", "data");
        }

        this.Initialize();

        int height = data.Height;
        int rowBytes = data.Width * BytesPerPixel;

        // Keep the sign of the stride: it is negative for bottom-up bitmaps, where
        // Scan0 addresses the first scan line and later rows sit at lower addresses.
        long stride = data.Stride;
        System.IntPtr scan0 = data.Scan0;

        if (parallel)
        {
            object gate = new object();

            // Every worker counts into private bins and merges them once at the end,
            // so the hot loop never writes to memory shared between threads.
            System.Threading.Tasks.Parallel.For<int [][]>
            (
                0,
                height,
                new System.Threading.Tasks.ParallelOptions() { MaxDegreeOfParallelism = System.Environment.ProcessorCount },
                () => NewChannelBins(),
                (y, loopState, local) =>
                {
                    unsafe
                    {
                        AccumulateRow(((byte*) scan0) + (stride * y), rowBytes, local [0], local [1], local [2], local [3]);
                    }

                    return local;
                },
                local =>
                {
                    lock (gate)
                    {
                        AddInto(this.B, local [0]);
                        AddInto(this.G, local [1]);
                        AddInto(this.R, local [2]);
                        AddInto(this.A, local [3]);
                    }
                }
            );
        }
        else
        {
            unsafe
            {
                for (int y = 0; y < height; y++)
                {
                    AccumulateRow(((byte*) scan0) + (stride * y), rowBytes, this.B, this.G, this.R, this.A);
                }
            }
        }

        this.MaxA = MaxOf(this.A);
        this.MaxR = MaxOf(this.R);
        this.MaxG = MaxOf(this.G);
        this.MaxB = MaxOf(this.B);

        this.MaxT = System.Math.Max(System.Math.Max(this.MaxA, this.MaxR), System.Math.Max(this.MaxG, this.MaxB));
    }

    // Allocates one set of zeroed bins per channel, indexed 0 = B, 1 = G, 2 = R, 3 = A.
    private static int [][] NewChannelBins ()
    {
        return new int [][] { new int [BinCount], new int [BinCount], new int [BinCount], new int [BinCount] };
    }

    // Counts the B, G, R, A bytes of one scan line into the supplied bins.
    private static unsafe void AccumulateRow (byte* row, int rowBytes, int [] b, int [] g, int [] r, int [] a)
    {
        for (int x = 0; x < rowBytes; x += BytesPerPixel)
        {
            b [row [x + 0]]++;
            g [row [x + 1]]++;
            r [row [x + 2]]++;
            a [row [x + 3]]++;
        }
    }

    // Adds each source bin onto the matching target bin.
    private static void AddInto (int [] target, int [] source)
    {
        for (int i = 0; i < target.Length; i++)
        {
            target [i] += source [i];
        }
    }

    // Returns the largest bin count, or 0 when all bins are empty.
    private static int MaxOf (int [] bins)
    {
        int max = 0;

        for (int i = 0; i < bins.Length; i++)
        {
            if (max < bins [i])
            {
                max = bins [i];
            }
        }

        return max;
    }
}

score 8 · Accepted Answer

まず、Parallel ループに大きなバグがあります。

複数のスレッドが共有配列にアクセスし、インクリメントし、更新することになります。同じイメージでサンプルコードを複数回実行するだけで、固有の競合状態により、結果が大きく異なります。

しかし、それはあなたが尋ねたものではありません。

並列実装でパフォーマンスが低下する理由については、簡単に言えば、各並列タスクの本体で行っている作業量が、新しいタスクの作成やスケジューリングなどの「スピンアップコスト」を相殺するのに十分ではない可能性が高い、ということです。

おそらくより重要なのは、メモリ内をあちこち飛び回るアクセスによって、L1/L2 キャッシュを激しくスラッシングさせているのではないかという点です。各タスクスレッドは、必要になりそうなデータをキャッシュメモリにロードしようとしますが、あちこちにインデックスアクセスするため一貫したアクセスパターンにならず、ビットマップバッファーや内部配列にアクセスするたびにキャッシュミスが発生している可能性があります。

また、unsafe コードを使用せずに、同等のパフォーマンスでビットマップのデータを読み取り専用で取得する方法もあります。実際、まずはそれをやってみましょう。

LockBits を呼び出すと、アンマネージメモリへのポインタを取得できます。それをマネージ配列にコピーしてみましょう：

// Lock the whole bitmap for reading as 32bpp ARGB.
System.Drawing.Imaging.BitmapData data = bitmap.LockBits
(
    new System.Drawing.Rectangle(0, 0, bitmap.Width, bitmap.Height),
    System.Drawing.Imaging.ImageLockMode.ReadOnly,
    System.Drawing.Imaging.PixelFormat.Format32bppArgb
);

int imageStride;
int imageHeight;
byte[] buffer;

try
{
    // Captured now because they are still needed after the bitmap is unlocked.
    imageStride = data.Stride;
    imageHeight = data.Height;

    // Managed copy of the pixel data.
    // NOTE(review): assumes a top-down bitmap (positive stride) - confirm for bottom-up sources.
    buffer = new byte[imageStride * imageHeight];

    // Block-copy the scan data from unmanaged memory into the managed buffer.
    Marshal.Copy(data.Scan0, buffer, 0, buffer.Length);
}
finally
{
    // Release the lock even when the allocation or the copy throws.
    bitmap.UnlockBits(data);
}

ComputeHistogram(buffer, imageStride, imageHeight, parallel);

さて、競合状態については、Interlocked呼び出しを使用してカウントを増やすことにより、かなりパフォーマンスの高い方法でこれを克服できます（注!!! マルチスレッドプログラミングは難しいです。ここでの私の解決策が完璧ではない可能性は十分にあります!）

/// <summary>
/// Computes the histogram from a managed copy of 32bpp pixel data laid out as
/// B, G, R, A bytes per pixel. Previous contents of this instance are discarded.
/// </summary>
/// <param name="data">Pixel bytes, <paramref name="stride"/> bytes per row.</param>
/// <param name="stride">Number of bytes per row in <paramref name="data"/>.</param>
/// <param name="height">Number of rows in <paramref name="data"/>.</param>
/// <param name="parallel">When true, rows are processed with Parallel.For.</param>
public void ComputeHistogram (byte[] data, int stride, int height, bool parallel = false)
{
    this.Initialize();

    if (parallel)
    {
        System.Threading.Tasks.Parallel.For
        (
            0,
            height,
            new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
            y =>
            {
                int startIndex = stride * y;
                int endIndex = startIndex + stride;
                for (int x = startIndex; x < endIndex; x += 4)
                {
                    // Several rows are counted concurrently into the same bins,
                    // so every increment has to be atomic.
                    Interlocked.Increment(ref this.B[data[x]]);
                    Interlocked.Increment(ref this.G[data[x+1]]);
                    Interlocked.Increment(ref this.R[data[x+2]]);
                    Interlocked.Increment(ref this.A[data[x+3]]);
                }
            }
        );
    }
    else
    {
        // Only one thread touches the bins here, so plain increments are safe.
        int endIndex = stride * height;
        for (int x = 0; x < endIndex; x += 4)
        {
            this.B[data[x]]++;
            this.G[data[x+1]]++;
            this.R[data[x+2]]++;
            this.A[data[x+3]]++;
        }
    }

    // Largest bin per channel, then the largest across all four channels.
    this.MaxA = this.A.Max();
    this.MaxR = this.R.Max();
    this.MaxG = this.G.Max();
    this.MaxB = this.B.Max();
    this.MaxT = new[] { this.MaxA, this.MaxB, this.MaxG, this.MaxR }.Max();
}

では、これは実行時の動作にどのような影響を与えるのでしょうか?

それほど大きくは変わりませんが、少なくとも並列版は正しい結果を計算するようになりました。:)

非常に簡易なテストコードを使って計測すると:

// Quick timing rig: averages the wall-clock time of loading the bitmap and
// computing its histogram over several runs, once sequentially and once in parallel.
void Main()
{
    const int runCount = 10;

    foreach (var useParallel in new[] { false, true })
    {
        var totalRunTime = TimeSpan.Zero;
        var sw = new Stopwatch();

        for (int run = 0; run < runCount; run++)
        {
            // Start every run from a clean heap so earlier garbage does not skew the timing.
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();

            sw.Reset();
            sw.Start();

            // Dispose the bitmap deterministically so the file and GDI+ handles are
            // released after every run; the direct cast fails loudly on a non-bitmap image.
            using (var bmp = (Bitmap) Bitmap.FromFile(@"c:\temp\banner.bmp"))
            {
                var hist = new Histogram();
                hist.ComputeHistogram(bmp, useParallel);

                // Stop inside the using block so disposal is not part of the measurement.
                sw.Stop();
            }

            totalRunTime = totalRunTime.Add(sw.Elapsed);
        }

        Console.WriteLine("Parallel={0}, Avg={1} ms", useParallel, totalRunTime.TotalMilliseconds / runCount);
    }
}

次のような結果が得られます。

Parallel=False, Avg=1.69777 ms
Parallel=True, Avg=5.33584 ms

ご覧のとおり、元の質問にはまだ対応していません。:)

それでは、並列作業を「より良く」することに挑戦しましょう。

タスクに「より多くの仕事を与える」ことが何をするか見てみましょう:

if (parallel)
{
    // Rows handled by one loop iteration; larger batches amortise the scheduling cost.
    var batchSize = 2;

    // Round up so that the trailing rows still get processed when height is not
    // a multiple of batchSize.
    var batchCount = (height + batchSize - 1) / batchSize;
    var totalBytes = stride * height;

    System.Threading.Tasks.Parallel.For
    (
        0,
        batchCount,
        new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
        batch =>
        {
            int startIndex = stride * batch * batchSize;

            // The last batch may be shorter than batchSize rows; clamp it to the buffer end.
            int endIndex = Math.Min(startIndex + (stride * batchSize), totalBytes);

            for (int x = startIndex; x < endIndex; x += 4)
            {
                // Several batches are counted concurrently into the same bins,
                // so every increment has to be atomic.
                Interlocked.Increment(ref this.B[data[x]]);
                Interlocked.Increment(ref this.G[data[x+1]]);
                Interlocked.Increment(ref this.R[data[x+2]]);
                Interlocked.Increment(ref this.A[data[x+3]]);
            }
        }
    );
}

結果：

Parallel=False, Avg=1.70273 ms
Parallel=True, Avg=4.82591 ms

おお、それは有望に見えます...batchSize を変えるとどうなるでしょうか？

このようにテストリグを変更してみましょう。

// Timing rig that sweeps the batch size over powers of two (1 .. 512) and prints
// the average load-plus-histogram time for each setting, sequentially and in parallel.
void Main()
{
    const int runCount = 10;

    foreach (var useParallel in new[] { false, true })
    {
        for (int batchSize = 1; batchSize < 1024; batchSize <<= 1)
        {
            var totalRunTime = TimeSpan.Zero;
            var sw = new Stopwatch();

            for (int run = 0; run < runCount; run++)
            {
                // Start every run from a clean heap so earlier garbage does not skew the timing.
                GC.Collect();
                GC.WaitForPendingFinalizers();
                GC.Collect();

                sw.Reset();
                sw.Start();

                // Dispose the bitmap deterministically so the file and GDI+ handles are
                // released after every run; the direct cast fails loudly on a non-bitmap image.
                using (var bmp = (Bitmap) Bitmap.FromFile(@"c:\temp\banner.bmp"))
                {
                    var hist = new Histogram();
                    hist.ComputeHistogram(bmp, useParallel, batchSize);

                    // Stop inside the using block so disposal is not part of the measurement.
                    sw.Stop();
                }

                totalRunTime = totalRunTime.Add(sw.Elapsed);
            }

            Console.WriteLine("Parallel={0}, BatchSize={1} Avg={2} ms", useParallel, batchSize, totalRunTime.TotalMilliseconds / runCount);
        }
    }
}

結果: (非並列版は変わらないため、parallel=true のみを表示)

Parallel=True, BatchSize=1 Avg=5.57644 ms
Parallel=True, BatchSize=2 Avg=5.49982 ms
Parallel=True, BatchSize=4 Avg=5.20434 ms
Parallel=True, BatchSize=8 Avg=5.1721 ms
Parallel=True, BatchSize=16 Avg=5.00405 ms
Parallel=True, BatchSize=32 Avg=4.44973 ms
Parallel=True, BatchSize=64 Avg=2.28332 ms
Parallel=True, BatchSize=128 Avg=1.39957 ms
Parallel=True, BatchSize=256 Avg=1.29156 ms
Parallel=True, BatchSize=512 Avg=1.28656 ms

バッチサイズが 64 ～ 128 の範囲に達すると、一種の漸近線に近づいているように見えますが、もちろん、結果はビットマップのサイズなどによって異なる場合があります。

これが役立つことを願っています！本番ビルドが完了するのを待っていた 1 日からの楽しい気晴らしでした。:)

score 1 · Accepted Answer

スレッドの作成にはかなりのオーバーヘッドがあります。実行はシングルスレッドバージョンよりも大幅に高速に実行される可能性がありますが、完了が速すぎてこの初期オーバーヘッドを補うことができません。

これをすべてのフレームで行うと、速度が低下するだけです。

ただし、スレッドプールを手動で作成し、作業を手動で割り当て、フレームごとにスレッドを再利用すると、フレーム 2 または 3 までに、コードがシングルスレッドバージョンを超えていることに気付く場合があります。

c# - ヒストグラム関数の並列化

2 に答える 2

Related

Reference