c++ - 非常に遅い双一次内挿（OpenCVと比較して）

Question

/// Resizes `src` with bilinear interpolation and returns the resized image.
///
/// @tparam T     Pixel type. It is indexed per channel with operator[] and
///               cv::DataType<T>::channels gives its channel count
///               (the benchmark in this file uses cv::Vec3b).
/// @param src    Source image.
/// @param dsize  Requested output size. When its area is 0 the output size is
///               (src.rows * dy) x (src.cols * dx) instead.
/// @param dx     Horizontal scale factor; only used when dsize has zero area.
/// @param dy     Vertical scale factor; only used when dsize has zero area.
/// @return       The interpolated image.
template<typename T>
cv::Mat_<T> const bilinear_interpolation(cv::Mat_<T> const &src, cv::Size dsize,
                                     float dx, float dy)
{
    bool const size_from_factors = dsize.area() == 0;
    cv::Mat_<T> dst = size_from_factors ? cv::Mat_<T>(src.rows * dy, src.cols * dx)
                                        : cv::Mat_<T>(dsize);

    // Distance travelled in the source for every step of one destination pixel.
    // Using (size - 1) keeps the "+ 1" neighbour reads below inside the source.
    float const step_x = static_cast<float>((src.cols - 1)) / dst.cols;
    float const step_y = static_cast<float>((src.rows - 1)) / dst.rows;

    for (int out_row = 0; out_row != dst.rows; ++out_row)
    {
        // Integer part selects the upper source row, the fraction is the
        // weight of the row below it.
        float const src_y = out_row * step_y;
        int const top = static_cast<int>(src_y);
        float const w_bottom = src_y - top;
        float const w_top = 1 - w_bottom;

        // Channels of one destination row are written sequentially.
        auto *out = &dst(out_row, 0)[0];

        for (int out_col = 0; out_col != dst.cols; ++out_col)
        {
            // Same split for the horizontal axis.
            float const src_x = out_col * step_x;
            int const left = static_cast<int>(src_x);
            float const w_right = src_x - left;
            float const w_left = 1 - w_right;

            // Combined weights of the four neighbouring source pixels.
            float const w_top_left = w_top * w_left;
            float const w_top_right = w_top * w_right;
            float const w_bottom_left = w_bottom * w_left;
            float const w_bottom_right = w_bottom * w_right;

            T const &top_left = src(top, left);
            T const &top_right = src(top, left + 1);
            T const &bottom_left = src(top + 1, left);
            T const &bottom_right = src(top + 1, left + 1);

            for (int ch = 0; ch != cv::DataType<T>::channels; ++ch)
            {
                *out++ = w_top_left * top_left[ch] +
                         w_top_right * top_right[ch] +
                         w_bottom_left * bottom_left[ch] +
                         w_bottom_right * bottom_right[ch];
            }
        }
    }

    return dst;
}

これは双一次補間の実装です。512*512の画像（"lena.png"）を2048*2048に拡大するために使用します。処理の完了に0.195秒かかりますが、OpenCVのcv::resize（GPUバージョンではありません）は0.026秒しかかかりません。何が私のプログラムをそれほど遅くしているのかわかりません（OpenCVは私の実装よりもほぼ750％高速です）。OpenCVのresizeのソースコードを見たいのですが、その実装が見つかりません。

OpenCVのresizeがこれほど速い理由、あるいは私の双一次補間がこれほど遅い理由をご存知ですか？

    {
        // Benchmark of the hand-written bilinear_interpolation().
        // NOTE(review): timeEstimate<> is not defined in this snippet; presumably a
        // scoped timer that reports the elapsed time when it leaves this block — confirm.
        timeEstimate<> time;
        // View the input as a 3-channel 8-bit image.
        cv::Mat_<cv::Vec3b> const src = input;
        // An empty cv::Size() makes the function derive the output size from dx/dy.
        // The returned image is discarded; only the run time is of interest here.
        bilinear_interpolation(src, cv::Size(), dx, dy);
        std::cout << "bilinear" << std::endl;
    }

    {
        // Benchmark of OpenCV's own bilinear resize (cv::INTER_LINEAR).
        // NOTE(review): timeEstimate<> is not defined in this snippet; presumably a
        // scoped timer that reports the elapsed time when it leaves this block — confirm.
        timeEstimate<> time;
        // cv::resize allocates the output itself, so start from an empty Mat.
        // Cloning the input here would add a deep copy to the measured time
        // only for the buffer to be thrown away again by the resize.
        cv::Mat output;
        // An empty cv::Size() makes resize derive the output size from dx/dy.
        cv::resize(input, output, cv::Size(), dx, dy, cv::INTER_LINEAR);
        std::cout << "bilinear cv" << std::endl;
    }

コンパイラ：MinGW 4.6.2、OS：Windows 7 64ビット、CPU：Intel® i3-2330M（2.2GHz）

score 5 · Accepted Answer

OpenCVのバージョンを高速化する主な理由は2つあります。

OpenCVは、サイズ変更を「分離可能な操作」として実装します。つまり、2つのステップで実行されます。画像は水平方向に引き伸ばされ、次に垂直方向に引き伸ばされます。この手法により、少ない算術演算を使用してサイズ変更を行うことができます。
手作業でコーディングされたSSE最適化。

score 2 · Accepted Answer

最近、CPUベースのグラフィックコードにバイリニアアップスケーリングを追加したときに同じ質問がありました。

まず、次の設定でコードを実行しました。

OS：VM内のXubuntu 20
コンパイラ：gcc 9.3.0
OpenCVバージョン：4.2.0
CPU：i3-6100u（2.3 GHz）
ソースビットマップサイズ：512x512
宛先ビットマップサイズ：2048x2048

OpenCVが4.2msかかったのに対し、あなたのコードは92msかかったことがわかりました。そのため、2012年に質問したときよりも、今ではその差はさらに大きくなっています。それ以来、OpenCVはさらに最適化されていると思います。

（この時点で、WindowsでVisual Studio 2013を使用するように切り替え、x64ターゲット用に構築しました）。

固定小数点演算を使用するようにコードを変換すると、時間が30ミリ秒に短縮されました。データを整数として保持するため、固定小数点演算が役立ちます。入力データと出力データは整数です。それらをフロートに変換して再び戻す必要があるのはコストがかかります。GCC 9.3を使い続けていれば、VS 2013よりも高速なコードが生成されることが多いので、スピードアップはさらに進んだと思います。とにかく、コードは次のとおりです。

// One pixel, accessible either as a single word or as separate channels.
// NOTE(review): the overlay of `c` and the channel bytes assumes a 32-bit
// `unsigned`; which bits of `c` hold which channel depends on byte order.
typedef union {
    unsigned c;                             // whole pixel as one word
    struct { unsigned char b, g, r, a; };   // channels, stored in the order b, g, r, a
} DfColour;

// A bitmap; the code below addresses it as pixels[y * width + x].
typedef struct _DfBitmap {
    int width, height;
    DfColour *pixels;
} DfBitmap;

// Scales `src` by `scale` into `dst` with bilinear interpolation, using
// fixed-point arithmetic only.
//
// src   - source bitmap.
// dst   - destination bitmap; the caller allocates it. At most
//         (int)(scale * src->width) x (int)(scale * src->height) pixels are
//         written, clipped to the destination's own size.
// scale - scale factor applied to both axes; must be greater than 0.
//
// Only r, g and b are written; the alpha byte of `dst` is left untouched.
// Nothing is written when `scale` is not positive or `src` is empty.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // A non-positive (or NaN) scale would divide by zero / go negative below.
    if (!(scale > 0.0f) || src->width <= 0 || src->height <= 0)
        return;

    // Source step per destination pixel with 16 fractional bits.
    // NOTE(review): the 255 (instead of 256) slightly shrinks the mapping,
    // presumably to keep the "+1" neighbours inside the source. That is not
    // enough for small bitmaps, hence the explicit clamping further down.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // Never write outside the destination bitmap.
    if (dstH > dst->height) dstH = dst->height;
    if (dstW > dst->width) dstW = dst->width;

    // For every output pixel...
    for (int y = 0; y < dstH; y++) {
        // Upper bits: source row. Lowest 8 bits: weight of the row below it.
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        // Clamp the lower neighbour to the last row instead of reading past
        // the end of the source.
        DfColour *srcRowBelow = (srcY + 1 < src->height) ? srcRow + src->width : srcRow;

        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < dstW; x++, dstPixel++) {
            // Perform bilinear interpolation on 2x2 src pixels.

            // Upper bits: source column. Lowest 8 bits: weight of the column
            // to the right of it.
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            // Clamp the right neighbour to the last column of the row.
            int srcXRight = (srcX + 1 < src->width) ? srcX + 1 : srcX;

            unsigned r = 0, g = 0, b = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[srcX];
            unsigned w = (weightX * weightY) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 1,0
            srcPixel = &srcRow[srcXRight];
            w = (weightX2 * weightY) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 1,1
            srcPixel = &srcRowBelow[srcXRight];
            w = (weightX2 * weightY2) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 0,1
            srcPixel = &srcRowBelow[srcX];
            w = (weightX * weightY2) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // The four weights add up to (at most) 256, so drop 8 bits.
            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }
}

より良いアルゴリズムに切り替えると、時間が19.5msに短縮されました。Andrey Kamaevの回答が述べているように、より優れたアルゴリズムは、垂直方向と水平方向のサイズ変更を2つの別々のパスに分割することで機能します。デスティネーションビットマップは、最初のパスの出力用の一時的なストレージスペースとして使用されます。2番目のパスのXトラバーサルは、必要なデータの上書きを回避するために逆方向になります。コードは次のとおりです。

// Scales `src` by `scale` into `dst` with bilinear interpolation, done as two
// separate fixed-point passes: first vertically, then horizontally.
//
// src   - source bitmap.
// dst   - destination bitmap; the caller allocates it. It doubles as the
//         scratch buffer for the output of the first pass.
// scale - scale factor applied to both axes. Only upscaling is supported:
//         the in-place second pass needs every destination row to be at
//         least as wide as a source row.
//
// Only r, g and b are written; the alpha byte of `dst` is left untouched.
// Nothing is written when `scale` is not positive, `src` is empty, or the
// destination rows are narrower than the source rows.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // A non-positive (or NaN) scale would divide by zero / go negative below.
    if (!(scale > 0.0f) || src->width <= 0 || src->height <= 0)
        return;

    // Source step per destination pixel with 16 fractional bits.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // Never write outside the destination bitmap.
    if (dstH > dst->height) dstH = dst->height;
    if (dstW > dst->width) dstW = dst->width;

    // Pass 1 stores src->width pixels in each destination row, so the rows
    // must be able to hold them.
    if (dstW < src->width)
        return;

    // Pass 1: vertical interpolation. Produces dstH rows that are still
    // src->width pixels wide, stored at the start of each destination row.
    for (int y = 0; y < dstH; y++) {
        // Upper bits: source row. Lowest 8 bits: weight of the row below it.
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        // Clamp the lower neighbour to the last row instead of reading past
        // the end of the source.
        DfColour *srcRowBelow = (srcY + 1 < src->height) ? srcRow + src->width : srcRow;

        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < src->width; x++, dstPixel++) {
            unsigned r = 0, g = 0, b = 0;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            r += srcPixel->r * weightY;
            g += srcPixel->g * weightY;
            b += srcPixel->b * weightY;

            // Pixel 1,0
            srcPixel = &srcRowBelow[x];
            r += srcPixel->r * weightY2;
            g += srcPixel->g * weightY2;
            b += srcPixel->b * weightY2;

            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }

    // Pass 2: horizontal interpolation, in place. Walking from right to left
    // means a pixel is only overwritten after everything that still needs its
    // pass-1 value has been computed. Column 0 maps onto itself with full
    // weight, so it is already final and is skipped.
    for (int y = 0; y < dstH; y++) {
        DfColour *dstRow = &dst->pixels[y * dst->width];

        for (int x = dstW - 1; x > 0; x--) {
            // Upper bits: source column. Lowest 8 bits: weight of the column
            // to the right of it.
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            // Clamp the right neighbour to the last column pass 1 produced.
            int srcXRight = (srcX + 1 < src->width) ? srcX + 1 : srcX;

            unsigned r = 0, g = 0, b = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            r += srcPixel->r * weightX;
            g += srcPixel->g * weightX;
            b += srcPixel->b * weightX;

            // Pixel 0,1
            srcPixel = &dstRow[srcXRight];
            r += srcPixel->r * weightX2;
            g += srcPixel->g * weightX2;
            b += srcPixel->b * weightX2;

            DfColour *dstPixel = &dstRow[x];
            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }
}

シンプルなポータブルSIMDスキームを使用すると、時間が16.5msに短縮されました。SIMDスキームは、SSE/AVXのような独自の命令セット拡張を使用しません。代わりに、ハックを使用して、赤と青のチャネルを32ビット整数で格納および操作できるようにします。AVXの実装ほど高速ではありませんが、単純であるという利点があります。コードは次のとおりです。

// Scales `src` by `scale` into `dst` with bilinear interpolation, done as two
// separate fixed-point passes (vertical, then horizontal). Two channels are
// processed at once by keeping them in one 32-bit word.
//
// src   - source bitmap.
// dst   - destination bitmap; the caller allocates it. It doubles as the
//         scratch buffer for the output of the first pass.
// scale - scale factor applied to both axes. Only upscaling is supported:
//         the in-place second pass needs every destination row to be at
//         least as wide as a source row.
//
// Nothing is written when `scale` is not positive, `src` is empty, or the
// destination rows are narrower than the source rows.
//
// NOTE(review): the packed arithmetic assumes a 32-bit `unsigned` whose bytes
// 0 and 2 are the b and r channels (little-endian layout) — confirm for the
// target platform.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // A non-positive (or NaN) scale would divide by zero / go negative below.
    if (!(scale > 0.0f) || src->width <= 0 || src->height <= 0)
        return;

    // Source step per destination pixel with 16 fractional bits.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // Never write outside the destination bitmap.
    if (dstH > dst->height) dstH = dst->height;
    if (dstW > dst->width) dstW = dst->width;

    // Pass 1 stores src->width pixels in each destination row, so the rows
    // must be able to hold them.
    if (dstW < src->width)
        return;

    // Pass 1: vertical interpolation into the start of each destination row.
    for (int y = 0; y < dstH; y++) {
        // Upper bits: source row. Lowest 8 bits: weight of the row below it.
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        // Clamp the lower neighbour to the last row instead of reading past
        // the end of the source.
        DfColour *srcRowBelow = (srcY + 1 < src->height) ? srcRow + src->width : srcRow;

        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < src->width; x++, dstPixel++) {
            unsigned rb = 0, g = 0;

            // The mask keeps two channels 16 bits apart. The two weights add
            // up to 256, so each weighted sum stays below 2^16 and cannot
            // carry into the neighbouring channel.

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY;
            g += srcPixel->g * weightY;

            // Pixel 1,0
            srcPixel = &srcRowBelow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY2;
            g += srcPixel->g * weightY2;

            // Writing the whole word also fills the remaining two bytes with
            // leftovers of the shift; the green byte is replaced right after.
            dstPixel->c = rb >> 8;
            dstPixel->g = g >> 8;
        }
    }

    // Pass 2: horizontal interpolation, in place. Walking from right to left
    // means a pixel is only overwritten after everything that still needs its
    // pass-1 value has been computed. Column 0 maps onto itself with full
    // weight, so it is already final and is skipped.
    for (int y = 0; y < dstH; y++) {
        DfColour *dstRow = &dst->pixels[y * dst->width];

        for (int x = dstW - 1; x > 0; x--) {
            // Upper bits: source column. Lowest 8 bits: weight of the column
            // to the right of it.
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            // Clamp the right neighbour to the last column pass 1 produced.
            int srcXRight = (srcX + 1 < src->width) ? srcX + 1 : srcX;

            unsigned rb = 0, g = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            rb += (srcPixel->c & 0xff00ff) * weightX;
            g += srcPixel->g * weightX;

            // Pixel 0,1
            srcPixel = &dstRow[srcXRight];
            rb += (srcPixel->c & 0xff00ff) * weightX2;
            g += srcPixel->g * weightX2;

            DfColour *dstPixel = &dstRow[x];
            dstPixel->c = rb >> 8;
            dstPixel->g = g >> 8;
        }
    }
}

X軸パスとY軸パスを別々に保ったまま、行（Y）方向のループだけを1つにまとめることができます。これにより、キャッシュの局所性が向上し、コードが少し単純になります。2つのパスのループを再結合すると、時間が14.6msに短縮されます。コードは次のとおりです。

// Scales `src` by `scale` into `dst` with bilinear interpolation. Every
// destination row is first interpolated vertically and then, while it is
// still recently touched, horizontally. Two channels are processed at once by
// keeping them in one 32-bit word.
//
// src   - source bitmap.
// dst   - destination bitmap; the caller allocates it. Each of its rows
//         doubles as the scratch buffer for the vertical step.
// scale - scale factor applied to both axes. Only upscaling is supported:
//         the in-place horizontal step needs every destination row to be at
//         least as wide as a source row.
//
// Nothing is written when `scale` is not positive, `src` is empty, or the
// destination rows are narrower than the source rows.
//
// NOTE(review): the packed arithmetic assumes a 32-bit `unsigned` whose bytes
// 0 and 2 are the b and r channels (little-endian layout) — confirm for the
// target platform.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // A non-positive (or NaN) scale would divide by zero / go negative below.
    if (!(scale > 0.0f) || src->width <= 0 || src->height <= 0)
        return;

    // Source step per destination pixel with 16 fractional bits.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // Never write outside the destination bitmap.
    if (dstH > dst->height) dstH = dst->height;
    if (dstW > dst->width) dstW = dst->width;

    // The vertical step stores src->width pixels in each destination row, so
    // the rows must be able to hold them.
    if (dstW < src->width)
        return;

    for (int y = 0; y < dstH; y++) {
        // Upper bits: source row. Lowest 8 bits: weight of the row below it.
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstRow = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        // Clamp the lower neighbour to the last row instead of reading past
        // the end of the source.
        DfColour *srcRowBelow = (srcY + 1 < src->height) ? srcRow + src->width : srcRow;

        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        // Vertical step: fills the first src->width pixels of this row.
        for (int x = 0; x < src->width; x++) {
            unsigned rb = 0, g = 0;

            // The mask keeps two channels 16 bits apart. The two weights add
            // up to 256, so each weighted sum stays below 2^16 and cannot
            // carry into the neighbouring channel.

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY;
            g += srcPixel->g * weightY;

            // Pixel 1,0
            srcPixel = &srcRowBelow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY2;
            g += srcPixel->g * weightY2;

            // Writing the whole word also fills the remaining two bytes with
            // leftovers of the shift; the green byte is replaced right after.
            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }

        // Horizontal step, in place. Walking from right to left means a pixel
        // is only overwritten after everything that still needs its previous
        // value has been computed. Column 0 maps onto itself with full
        // weight, so it is already final and is skipped.
        for (int x = dstW - 1; x > 0; x--) {
            unsigned rb = 0, g = 0;

            // Upper bits: source column. Lowest 8 bits: weight of the column
            // to the right of it.
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            // Clamp the right neighbour to the last column the vertical step
            // produced.
            int srcXRight = (srcX + 1 < src->width) ? srcX + 1 : srcX;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            rb += (srcPixel->c & 0xff00ff) * weightX;
            g += srcPixel->g * weightX;

            // Pixel 0,1
            srcPixel = &dstRow[srcXRight];
            rb += (srcPixel->c & 0xff00ff) * weightX2;
            g += srcPixel->g * weightX2;

            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }
    }
}

この時点では、コードはまだシングルスレッドです。私のCPUには、2つの物理コアと合計4つのスレッドがあります。OpenCVは私のマシンで2つのスレッドを使用しています。2つのスレッドを使用するようにコードを変換すると、時間が約8ミリ秒に短縮されると思います。

本格的なAVX SIMD実装への変換がおそらく必要ですが、4msに到達するために他にどのようなトリックが必要かはわかりません。

score 0 · Accepted Answer

少し遅い回答かもしれませんが、アプリケーションをデバッグモードで実行していないかどうかも確認してください。OpenCVはライブラリであり、おそらくリリース用に（コンパイラの最適化を有効にして）コンパイルされています。

c++ - 非常に遅い双一次内挿（OpenCVと比較して）

3 に答える 3

Related

Reference