openmp - OpenMP によるガウスぼかしアルゴリズムの並列化

Question

OpenMP を使用してガウスぼかし関数を並列化しようとしていましたが、OpenMP は初めてで、2 つの for ループを並列化しようとすると (スレッドごとにプライベートにする必要がある変数はないと思います)、実行が以前よりもさらに遅くなり、出力が異なりました。それで、私は何か悪いことをしましたか？より速く実行するにはどうすればよいですか?

void gaussian_blur(float **src, float **dst, int w, int h, float sigma)
{
    int x, y, i;
    int ksize = (int)(sigma * 2.f * 4.f + 1) | 1;
    int halfk = ksize / 2;
    float scale = -0.5f/(sigma*sigma);
    float sum = 0.f;
    float *kernel, *ringbuf;
    int xmax = w - halfk;
    int ymax = h - halfk;

    // if sigma too small, just copy src to dst
    if (ksize <= 1)
    {
        for (y = 0; y < h; y++)
            for (x = 0; x < w; x++)
                dst[y][x] = src[y][x];
        return;
    }

    // create Gaussian kernel
    kernel = malloc(ksize * sizeof(float));
    ringbuf = malloc(ksize * sizeof(float));

    #pragma omp parallel for reduction(+ : sum)
    for (i = 0; i < ksize; i++)
    {
        float x = (float)(i - halfk);
        float t = expf(scale * x * x);
        kernel[i] = t;
        sum += t;
    }

    scale = 1.f / sum;
    #pragma omp parallel for
    for (i = 0; i < ksize; i++)
        kernel[i] *= scale;

    // blur each row
    #pragma omp parallel for // this is the for loop I parallelized but ended up with wrong output and running slower
    for (y = 0; y < h; y++)
    {
        int x1;
        int bufi0 = ksize-1;
        float tmp = src[y][0];

        for (x1 = 0; x1 < halfk  ; x1++) ringbuf[x1] = tmp;
        for (; x1 < ksize-1; x1++) ringbuf[x1] = src[y][x1-halfk];

        for (x1 = 0; x1 < w; x1++)
        {
            if(x1 < xmax)
                ringbuf[bufi0++] = src[y][x1+halfk];
            else
                ringbuf[bufi0++] = src[y][w-1];
            if (bufi0 == ksize) bufi0 = 0;
            dst[y][x1] = convolve(kernel, ringbuf, ksize, bufi0);
        }
    }

    // blur each column
    #pragma omp parallel for // this is the for loop I parallelized but ended up with wrong output and running slower
    for (x = 0; x < w; x++)
    {
        int y1;
        int bufi0 = ksize-1;
        float tmp = dst[0][x];
        for (y1 = 0; y1 < halfk  ; y1++) ringbuf[y1] = tmp;
        for (     ; y1 < ksize-1; y1++) ringbuf[y1] = dst[y1-halfk][x];

        for (y1 = 0; y1 < h; y1++)
        {
            if(y1 < ymax)
                ringbuf[bufi0++] = dst[y1+halfk][x];
            else
                ringbuf[bufi0++] = dst[h-1][x];

            if (bufi0 == ksize) bufi0 = 0;
            dst[y1][x] = convolve(kernel, ringbuf, ksize, bufi0);
        }
    }

    // clean up
    free(kernel);
    free(ringbuf);
}

score 2 · Accepted Answer

私はあなたのコードを修正したかもしれません。畳み込み関数を投稿していないため、確実に言うのは難しいですが、それが重要かどうかはわかりません。少なくとも 2 つのバグがあります。ringbuf配列に競合状態があります。これを修正するために、配列をスレッド数倍に拡張します。

ringbuf = (float*)malloc(nthreads*ksize * sizeof(float));

配列にアクセスするには、次のようにします

int ithread = omp_get_thread_num();
ringbuf[ksize*ithread + x1]

ringbuf編集:並列ブロック内で定義するコードをいくつか追加しました。そうすれば、スレッド番号に基づいて ringbuf にアクセスする必要がなくなります。

2 番目のエラーはibufi0変数です。私はこのような新しいものを定義しました

const int ibufi0_fix = (x1+ksize-1)%ksize;

以下は、それを確認するために使用したコードです。畳み込み関数に置き換えます。これはまだ非常に非効率的である可能性があることに注意してください。キャッシュミスや誤った共有などのキャッシュの問題がある可能性があります (特に垂直にたたみ込みを行う場合)。うまくいけば、しかし、イメージは今正しいでしょう。

編集: これはインテルの論文で、AVX でこれを最適に行う方法を示しています。キャッシュミスを最小限に抑えるように最適化されています。ただし、スレッド化に最適化されているかどうかはわかりません。 http://software.intel.com/en-us/articles/iir-gaussian-blur-filter-implementation-using-intel-advanced-vector-extensions

SSE/AVX も使用する独自の関数を書いています (これが実際に OpenMP を学び始めた理由です)。行列乗算と画像フィルタリングには多くの類似点があるため、最初に行列乗算を最適化する方法を学び、すぐにガウスぼかしを行います...

#include "math.h"
#include "omp.h"
#include "stdio.h"
#include <nmmintrin.h>

float convolve(const float *kernel, const float *ringbuf, const int ksize, const int bufi0) {
    float sum = 0.0f;
    for(int i=0; i<ksize; i++) {
        sum += kernel[i]*ringbuf[i];
    }
    return sum;
}


void gaussian_blur(float *src, float *dst, int w, int h, float sigma, int nthreads)
{
    int x, y, i;
    int ksize = (int)(sigma * 2.f * 4.f + 1) | 1;
    int halfk = ksize / 2;
    printf("ksize %d\n", ksize);
    float scale = -0.5f/(sigma*sigma);
    float sum = 0.f;
    float *kernel, *ringbuf;
    int xmax = w - halfk;
    int ymax = h - halfk;

    // if sigma too small, just copy src to dst
    if (ksize <= 1)
    {
        for (y = 0; y < h; y++)
            for (x = 0; x < w; x++)
                dst[y*w + x] = src[y*w + x];
        return;
    }

    // create Gaussian kernel
    //kernel = malloc(ksize * sizeof(float));
    kernel =  (float*)_mm_malloc(ksize * sizeof(float),16);
    //ringbuf = malloc(ksize * sizeof(float));
    ringbuf = (float*)_mm_malloc(nthreads*ksize * sizeof(float),16);

    #pragma omp parallel for reduction(+ : sum) if(nthreads>1)
    for (i = 0; i < ksize; i++)
    {
        float x = (float)(i - halfk);
        float t = expf(scale * x * x);
        kernel[i] = t;
        sum += t;
    }

    scale = 1.f / sum;
    #pragma omp parallel for if(nthreads>1)
    for (i = 0; i < ksize; i++)
        kernel[i] *= scale;

    // blur each row
    #pragma omp parallel for if(nthreads>1)// this is the for loop I parallelized but ended up with wrong output and running slower
    for (y = 0; y < h; y++)
    {
        int ithread = omp_get_thread_num();
        //printf("nthread %d\n", nthread);
        int x1;
        int bufi0 = ksize-1;
        float tmp = src[y*w + 0];
        for (x1 = 0; x1 < halfk  ; x1++) ringbuf[ksize*ithread + x1] = tmp;
        for (; x1 < ksize-1; x1++) ringbuf[ksize*ithread + x1] = src[y*w + x1-halfk];
        for (x1 = 0; x1 < w; x1++)
        {
            const int ibufi0_fix = (x1+ksize-1)%ksize;

            if(x1 < xmax)
                ringbuf[ksize*ithread + ibufi0_fix] = src[y*w + x1+halfk];
            else
                ringbuf[ksize*ithread + ibufi0_fix] = src[y*w + w-1];
            if (bufi0 == ksize) bufi0 = 0;
            dst[y*w + x1] = convolve(kernel, &ringbuf[ksize*ithread], ksize, bufi0);
        }
    }
    // blur each column
    #pragma omp parallel for if(nthreads>1)// this is the for loop I parallelized but ended up with wrong output and running slower
    for (x = 0; x < w; x++)
    {
        int ithread = omp_get_thread_num();
        int y1;
        int bufi0 = ksize-1;
        float tmp = dst[0*w + x];
        for (y1 = 0; y1 < halfk  ; y1++) ringbuf[ksize*ithread + y1] = tmp;
        for (     ; y1 < ksize-1; y1++) ringbuf[ksize*ithread + y1] = dst[(y1-halfk)*w + x];

        for (y1 = 0; y1 < h; y1++)
        {
            const int ibufi0_fix = (y1+ksize-1)%ksize;
            if(y1 < ymax)
                ringbuf[ibufi0_fix] = dst[(y1+halfk)*w + x];
            else
                ringbuf[ibufi0_fix] = dst[(h-1)*w + x];

            if (bufi0 == ksize) bufi0 = 0;
            dst[y1*w + x] = convolve(kernel, &ringbuf[ksize*ithread], ksize, bufi0);
        }
    }

    // clean up
    _mm_free(kernel);
    _mm_free(ringbuf);
}

int compare(float *dst1, float *dst2, const int n) {
    int error = 0;
    for(int i=0; i<n; i++) {
        if(*dst1 != *dst2) error++;
    }
    return error;
}


int main() {
    const int w = 20;
    const int h = 20;

    float *src =  (float*)_mm_malloc(w*h*sizeof(float),16);
    float *dst1 =  (float*)_mm_malloc(w*h*sizeof(float),16);
    float *dst2 =  (float*)_mm_malloc(w*h*sizeof(float),16);
    for(int i=0; i<w*h; i++) {
        src[i] = i;
    }

    gaussian_blur(src, dst1, w, h, 1.0f, 1);
    gaussian_blur(src, dst2, w, h, 1.0f, 4);
    int error = compare(dst1, dst2, w*h);
    printf("error %d\n", error);
    _mm_free(src);
    _mm_free(dst1);
    _mm_free(dst2);
}

ringbuf編集: Hristo のコメントに基づいて、並列ブロック内を定義するコードを次に示します。同等である必要があります。

#include "math.h"
#include "omp.h"
#include "stdio.h"
#include <nmmintrin.h>

float convolve(const float *kernel, const float *ringbuf, const int ksize, const int bufi0) {
    float sum = 0.0f;
    for(int i=0; i<ksize; i++) {
        sum += kernel[i]*ringbuf[i];
    }
    return sum;
}


void gaussian_blur(float *src, float *dst, int w, int h, float sigma, int nthreads)
{
    int x, y, i;
    int ksize = (int)(sigma * 2.f * 4.f + 1) | 1;
    int halfk = ksize / 2;
    printf("ksize %d\n", ksize);
    float scale = -0.5f/(sigma*sigma);
    float sum = 0.f;
    float *kernel;
    int xmax = w - halfk;
    int ymax = h - halfk;

    // if sigma too small, just copy src to dst
    if (ksize <= 1)
    {
        for (y = 0; y < h; y++)
            for (x = 0; x < w; x++)
                dst[y*w + x] = src[y*w + x];
        return;
    }

    // create Gaussian kernel
    //kernel = malloc(ksize * sizeof(float));
    kernel =  (float*)_mm_malloc(ksize * sizeof(float),16);

    #pragma omp parallel for reduction(+ : sum) if(nthreads>1)
    for (i = 0; i < ksize; i++)
    {
        float x = (float)(i - halfk);
        float t = expf(scale * x * x);
        kernel[i] = t;
        sum += t;
    }

    scale = 1.f / sum;
    #pragma omp parallel for if(nthreads>1)
    for (i = 0; i < ksize; i++)
        kernel[i] *= scale;

    // blur each row
    //#pragma omp parallel for if(nthreads>1)// this is the for loop I parallelized but ended up with wrong output and running slower
    #pragma omp parallel if(nthreads>1) 
    {
        float *ringbuf = (float*)_mm_malloc(ksize * sizeof(float),16);
        #pragma omp for// this is the for loop I parallelized but ended up with wrong output and running slower
        for (y = 0; y < h; y++)
        {
            //printf("nthread %d\n", nthread);
            int x1;
            int bufi0 = ksize-1;
            float tmp = src[y*w + 0];
            for (x1 = 0; x1 < halfk  ; x1++) ringbuf[x1] = tmp;
            for (; x1 < ksize-1; x1++) ringbuf[x1] = src[y*w + x1-halfk];
            for (x1 = 0; x1 < w; x1++)
            {
                const int ibufi0_fix = (x1+ksize-1)%ksize;

                if(x1 < xmax)
                    ringbuf[ibufi0_fix] = src[y*w + x1+halfk];
                else
                    ringbuf[ibufi0_fix] = src[y*w + w-1];
                if (bufi0 == ksize) bufi0 = 0;
                dst[y*w + x1] = convolve(kernel, ringbuf, ksize, bufi0);
            }
        }
        _mm_free(ringbuf);
    }

    // blur each column
    #pragma omp parralel if(ntheads>1)
    {
        float *ringbuf = (float*)_mm_malloc(ksize * sizeof(float),16);
        #pragma omp for// this is the for loop I parallelized but ended up with wrong output and running slower
        for (x = 0; x < w; x++)
        {
            int y1;
            int bufi0 = ksize-1;
            float tmp = dst[0*w + x];
            for (y1 = 0; y1 < halfk  ; y1++) ringbuf[y1] = tmp;
            for (     ; y1 < ksize-1; y1++) ringbuf[y1] = dst[(y1-halfk)*w + x];

            for (y1 = 0; y1 < h; y1++)
            {
                const int ibufi0_fix = (y1+ksize-1)%ksize;
                if(y1 < ymax)
                    ringbuf[ibufi0_fix] = dst[(y1+halfk)*w + x];
                else
                    ringbuf[ibufi0_fix] = dst[(h-1)*w + x];

                if (bufi0 == ksize) bufi0 = 0;
                dst[y1*w + x] = convolve(kernel, ringbuf, ksize, bufi0);
            }
        }
        _mm_free(ringbuf);
    }
    // clean up
    _mm_free(kernel);
}

int compare(float *dst1, float *dst2, const int n) {
    int error = 0;
    for(int i=0; i<n; i++) {
        if(*dst1 != *dst2) error++;
    }
    return error;
}


int main() {
    const int w = 20;
    const int h = 20;

    float *src =  (float*)_mm_malloc(w*h*sizeof(float),16);
    float *dst1 =  (float*)_mm_malloc(w*h*sizeof(float),16);
    float *dst2 =  (float*)_mm_malloc(w*h*sizeof(float),16);
    for(int i=0; i<w*h; i++) {
        src[i] = i;
    }

    gaussian_blur(src, dst1, w, h, 1.0f, 1);
    gaussian_blur(src, dst2, w, h, 1.0f, 4);
    int error = compare(dst1, dst2, w*h);
    printf("error %d\n", error);
    _mm_free(src);
    _mm_free(dst1);
    _mm_free(dst2);
}

openmp - OpenMP によるガウスぼかしアルゴリズムの並列化

2 に答える 2

Related

Reference