c++ - ショートからフロートへのキャストをスピードアップしますか?

Question

私のコードには、ボトルネックになっている C++ の short から float へのキャストがあります。

コードは、ネイティブショートであるハードウェアデバイスバッファから変換されます。これは、ファンシーフォトンカウンタからの入力を表します。

// Scalar baseline from the question: every 16-bit sample in `source` is widened
// to int, multiplied by the precomputed reciprocal 1/value, and stored as float.
// Note: the `value` declared inside the loop shadows the outer `value` that was
// used to build `factor`.
float factor=  1.0f/value;
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
    int value = source[i];//ushort -> int
    destination[i] = value*factor;//int*float->float
}

いくつかの詳細

値は 0 から 2^16-1 の範囲で、高感度カメラのピクセル値を表します
i7 プロセッサ (SSE 4.2 および 4.1 である i7 960) を搭載したマルチコア x86 マシンを使用しています。
ソースは 8 ビット境界に揃えられます (ハードウェアデバイスの要件)。
W*H は常に 8 で割り切れます。ほとんどの場合、W と H は 8 で割り切れます。

これは私を悲しませます、私にできることはありますか？

私はVisual Studios 2012を使用しています...

score 10 · Accepted Answer

基本的な SSE4.1 の実装は次のとおりです。

//  Converts 8 samples per iteration. As the answer states, this assumes that
//  source and destination are 16-byte aligned (aligned load/store intrinsics
//  are used) and that W*H is a multiple of 8 (there is no tail handling).
//  _mm_cvtepu16_epi32 is an SSE4.1 instruction.
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < W*H; i += 8)
{
    //  Load 8 16-bit ushorts.
    //  vi = {a,b,c,d,e,f,g,h}
    __m128i vi = _mm_load_si128((const __m128i*)(source + i));

    //  Convert to 32-bit integers
    //  vi0 = {a,0,b,0,c,0,d,0}
    //  vi1 = {e,0,f,0,g,0,h,0}
    //  (the upper 64 bits are first moved down so the same zero-extension
    //  can be applied to elements e..h)
    __m128i vi0 = _mm_cvtepu16_epi32(vi);
    __m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));

    //  Convert to float
    __m128 vf0 = _mm_cvtepi32_ps(vi0);
    __m128 vf1 = _mm_cvtepi32_ps(vi1);

    //  Multiply
    vf0 = _mm_mul_ps(vf0,factor);
    vf1 = _mm_mul_ps(vf1,factor);

    //  Store
    _mm_store_ps(destination + i + 0,vf0);
    _mm_store_ps(destination + i + 4,vf1);
}

これは、次のことを前提としています。

source と destination の両方が 16 バイトにアラインされています。
W*Hは 8 の倍数です。

このループをさらに展開することで、より良い結果が得られる可能性があります。（下記参照）

ここでの考え方は次のとおりです。

8 つのショートを単一の SSE レジスタにロードします。
レジスターを 2 つに分割します。1 つは下の 4 つのショートで、もう 1 つは上の 4 つのショートです。
両方のレジスタを 32 ビット整数にゼロ拡張します。
両方を float に変換します。
係数を掛けます。
destination に格納します。

編集：

このタイプの最適化を行ってからしばらく経ったので、先に進んでループを展開しました。

Core i7 920 @ 3.5 GHz
Visual Studio 2012 - リリース x64:

Original Loop      : 4.374 seconds
Vectorize no unroll: 1.665
Vectorize unroll 2 : 1.416

さらにアンローリングすると、リターンが減少しました。

テストコードは次のとおりです。

#include <smmintrin.h>
#include <time.h>
#include <iostream>
#include <malloc.h>
using namespace std;


// Scalar reference conversion: destination[i] = source[i] * (1 / value) for
// every i in [0, size).
//
// The buffer carries unsigned 16-bit counts in a `short` array, and the
// vectorized variants in this file zero-extend each sample. The sample is
// therefore reinterpreted as unsigned before widening, so this reference agrees
// with them over the whole 0..65535 range instead of going negative at 32768.
//
// destination : output array with room for at least `size` floats
// source      : input array of at least `size` 16-bit samples
// value       : divisor; its reciprocal is computed once, outside the loop
// size        : number of elements to convert (nothing is done when <= 0)
void default_loop(float *destination,const short* source,float value,int size){
    const float factor = 1.0f / value;
    for (int i = 0; i < size; i++)
    {
        // Distinct name: the original local shadowed the `value` parameter.
        const int sample = static_cast<unsigned short>(source[i]);
        destination[i] = sample*factor;
    }
}
// SIMD conversion, 8 samples per iteration:
// destination[i] = (unsigned 16-bit source[i]) * (1 / value).
//
// Changes relative to the straightforward SSE4.1 version:
//  * The vector loop only covers whole groups of 8; the remaining 0..7
//    elements are converted by a scalar tail, so no element beyond `size` is
//    read or written when size is not a multiple of 8.
//  * Zero-extension interleaves the 16-bit lanes with a zero register
//    (punpcklwd / punpckhwd). That is plain SSE2, gives the same values as
//    _mm_cvtepu16_epi32, and needs no extra shuffle for the upper half.
//
// Preconditions: `source` and `destination` must be 16-byte aligned because
// aligned load/store intrinsics are used.
void vectorize8_unroll1(float *destination,const short* source,float value,int size){
    const float scalar_factor = 1.0f / value;
    const __m128 factor = _mm_set1_ps(scalar_factor);
    const __m128i zero = _mm_setzero_si128();

    //  Largest multiple of 8 that does not exceed size (0 for size <= 0).
    const int vector_end = size > 0 ? size - size % 8 : 0;

    int i = 0;
    for (; i < vector_end; i += 8)
    {
        //  Load 8 16-bit ushorts.
        __m128i vi = _mm_load_si128((const __m128i*)(source + i));

        //  Zero-extend to 32-bit integers: low four lanes, then high four.
        __m128i vi0 = _mm_unpacklo_epi16(vi,zero);
        __m128i vi1 = _mm_unpackhi_epi16(vi,zero);

        //  Convert to float
        __m128 vf0 = _mm_cvtepi32_ps(vi0);
        __m128 vf1 = _mm_cvtepi32_ps(vi1);

        //  Multiply
        vf0 = _mm_mul_ps(vf0,factor);
        vf1 = _mm_mul_ps(vf1,factor);

        //  Store
        _mm_store_ps(destination + i + 0,vf0);
        _mm_store_ps(destination + i + 4,vf1);
    }

    //  Scalar tail for the last size % 8 elements.
    for (; i < size; i++)
    {
        const int sample = static_cast<unsigned short>(source[i]);
        destination[i] = sample*scalar_factor;
    }
}
// SIMD conversion, unrolled to 16 samples per iteration:
// destination[i] = (unsigned 16-bit source[i]) * (1 / value).
//
// Changes relative to the straightforward unrolled SSE4.1 version:
//  * The unrolled loop only runs while at least 16 elements remain; one
//    further group of 8 and then a scalar tail finish the job. Sizes that are
//    not a multiple of 16 (for example 8008) no longer read or write past
//    `size`.
//  * Zero-extension interleaves with a zero register (SSE2 punpcklwd /
//    punpckhwd), which yields the same values as _mm_cvtepu16_epi32 without
//    the separate "split into two registers" shuffle.
//
// Preconditions: `source` and `destination` must be 16-byte aligned because
// aligned load/store intrinsics are used.
void vectorize8_unroll2(float *destination,const short* source,float value,int size){
    const float scalar_factor = 1.0f / value;
    const __m128 factor = _mm_set1_ps(scalar_factor);
    const __m128i zero = _mm_setzero_si128();

    int i = 0;

    //  Main loop: two independent groups of 8 per iteration.
    for (; size - i >= 16; i += 16)
    {
        __m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
        __m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));

        //  Zero-extend to 32-bit integers (high halves first, since a0/a1
        //  are overwritten by their own low halves).
        __m128i b0 = _mm_unpackhi_epi16(a0,zero);
        __m128i b1 = _mm_unpackhi_epi16(a1,zero);
        a0 = _mm_unpacklo_epi16(a0,zero);
        a1 = _mm_unpacklo_epi16(a1,zero);

        //  Convert to float
        __m128 c0 = _mm_cvtepi32_ps(a0);
        __m128 d0 = _mm_cvtepi32_ps(b0);
        __m128 c1 = _mm_cvtepi32_ps(a1);
        __m128 d1 = _mm_cvtepi32_ps(b1);

        //  Multiply
        c0 = _mm_mul_ps(c0,factor);
        d0 = _mm_mul_ps(d0,factor);
        c1 = _mm_mul_ps(c1,factor);
        d1 = _mm_mul_ps(d1,factor);

        //  Store
        _mm_store_ps(destination + i +  0,c0);
        _mm_store_ps(destination + i +  4,d0);
        _mm_store_ps(destination + i +  8,c1);
        _mm_store_ps(destination + i + 12,d1);
    }

    //  At most one leftover group of 8.
    for (; size - i >= 8; i += 8)
    {
        __m128i a0 = _mm_load_si128((const __m128i*)(source + i));
        __m128 c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(a0,zero));
        __m128 d0 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(a0,zero));
        _mm_store_ps(destination + i + 0,_mm_mul_ps(c0,factor));
        _mm_store_ps(destination + i + 4,_mm_mul_ps(d0,factor));
    }

    //  Scalar tail for the last size % 8 elements.
    for (; i < size; i++)
    {
        const int sample = static_cast<unsigned short>(source[i]);
        destination[i] = sample*scalar_factor;
    }
}
// Writes the sum of the first `size` floats of `destination` to standard
// output, followed by a newline and a flush. The elements are added in index
// order, starting from zero.
void print_sum(const float *destination,int size){
    float total = 0;
    for (int remaining = size; remaining > 0; --remaining){
        total += destination[size - remaining];
    }
    std::cout << total << std::endl;
}

// Benchmark driver: fills an 8000-element input buffer, then times each of the
// three conversion routines over `iterations` repeated calls. After each timed
// section it prints the elapsed CPU time in seconds (from clock()) and the sum
// of the output buffer, so the three variants can be compared for agreement.
int main(){

    int size = 8000;

    //  16-byte aligned buffers, as required by the aligned SSE loads/stores.
    //  NOTE(review): the _mm_malloc results are not checked for failure.
    short *source       = (short*)_mm_malloc(size * sizeof(short), 16);
    float *destination  = (float*)_mm_malloc(size * sizeof(float), 16);

    //  Input is the ramp 0..size-1.
    for (int i = 0; i < size; i++){
        source[i] = i;
    }

    //  Divisor handed to every routine (double literal narrowed to float).
    float value = 1.1;

    int iterations = 1000000;
    clock_t start;

    //  Default Loop
    start = clock();
    for (int it = 0; it < iterations; it++){
        default_loop(destination,source,value,size);
    }
    cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
    print_sum(destination,size);

    //  Vectorize 8, no unroll
    start = clock();
    for (int it = 0; it < iterations; it++){
        vectorize8_unroll1(destination,source,value,size);
    }
    cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
    print_sum(destination,size);

    //  Vectorize 8, unroll 2
    start = clock();
    for (int it = 0; it < iterations; it++){
        vectorize8_unroll2(destination,source,value,size);
    }
    cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
    print_sum(destination,size);

    _mm_free(source);
    _mm_free(destination);

    //  Keeps the console window open; "pause" is a Windows shell command.
    system("pause");
}

score 9 · Accepted Answer

私は最善の答えを持っていると信じています。私の結果は Mystical のものよりもはるかに高速です。それらは SSE2 のみを必要としますが、SSE3、SSE4、AVX、および利用可能な場合は AVX2 を利用します。コードを変更する必要はありません。再コンパイルするだけです。

8008、64000、および 2560*1920 = 4915200 の 3 つのサイズを実行しました。いくつかの異なるバリエーションを試しました。最も重要なものを以下にリストします。関数 vectorize8_unroll2 は Mystical の関数です。私はその改良版として vectorize8_unroll2_parallel を作りました。関数 vec16_loop_unroll2_fix と vec16_loop_unroll2_parallel_fix は、Mystical の関数よりも優れていると私が信じている私の関数です。これらの関数は、AVX でコンパイルすると自動的に AVX を使用しますが、SSE4 や SSE2 でも正常に動作します。

さらに、「W*H は常に 8 で割り切れます。ほとんどの場合、W と H は 8 で割り切れます」と書きました。したがって、すべての場合に W*H が 16 で割り切れるとは限りません。サイズが 16 の倍数でない場合、 Mystical の関数 vectorize8_unroll2 にはバグがあります (彼のコードで size=8008 を試してみると、私の言いたいことがわかるでしょう)。私のコードにはそのようなバグはありません。

ベクトル化には Agner Fog の vectorclass を使用しています。lib や dll ファイルではなく、ほんの数個のヘッダーファイルです。並列化には OpenMP を使用します。結果の一部を次に示します。

Intel Xeon E5630 @2.53GHz (supports upto SSE4.2)    
size 8008, size2 8032, iterations 1000000

                        default_loop time: 7.935 seconds, diff 0.000000
                  vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 1.878 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 1.253 seconds, diff 0.000000
     vec16_loop_unroll2_parallel_fix time: 1.151 seconds, diff 0.000000

size 64000, size2 64000, iterations 100000
                        default_loop time: 6.387 seconds, diff 0.000000
                  vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 2.195 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 0.439 seconds, diff 0.000000
     vec16_loop_unroll2_parallel_fix time: 0.432 seconds, diff 0.000000

size 4915200, size2 4915200, iterations 1000
                        default_loop time: 5.125 seconds, diff 0.000000
                  vectorize8_unroll2 time: 3.496 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 3.490 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 3.119 seconds, diff 0.000000
     vec16_loop_unroll2_parallel_fix time: 3.127 seconds, diff 0.000000

編集:この回答の最後に、GCC を使用して AVX を搭載したシステムの結果を追加しました。

以下はコードです。多くのクロスチェックを行い、多くのバリエーションをテストしているため、コードが長く見えるだけです。http://www.agner.org/optimize/#vectorclassで vectorclass をダウンロードします。ヘッダーファイル (vectorclass.h、instrset.h、vectorf128.h、vectorf256.h、vectorf256e.h、vectori128.h、vectori256.h、vectori256e.h) をコンパイル元のディレクトリにコピーします。C++/CommandLine の下に /D__SSE4_2__ を追加します。リリースモードでコンパイルします。AVX を搭載した CPU を使用している場合は、代わりに /arch:AVX を入力してください。C++ プロパティ/言語の下に OpenMP サポートを追加します。

In GCC
SSE4.2: g++ foo.cpp -o foo_gcc -O3 -msse4.2 -fopenmp
AVX: g++ foo.cpp -o foo_gcc -O3 -mavx -fopenmp

以下のコードでは、関数 vec16_loop_unroll2 と vec16_loop_unroll2_parallel は配列のサイズが 32 の倍数である必要があります。配列のサイズを 32 の倍数に変更することができます (これは size2 が参照するものです)。それが不可能な場合は、そのような制限のない関数 vec16_loop_unroll2_fix と vec16_loop_unroll2_parallel_fix を使用することができます。どちらにしても速度は同じです。

#include <stdio.h>
#include "vectorclass.h"
#include "omp.h"

#define ROUND_DOWN(x, s) ((x) & ~((s)-1))

// Portable aligned allocation: returns `size` bytes whose address is a multiple
// of `align`. With MSVC this forwards to _aligned_malloc; elsewhere it uses
// posix_memalign and yields a null pointer when that call reports an error.
// Memory obtained here is released with aligned_free().
inline void* aligned_malloc(size_t size, size_t align) {
#ifdef _MSC_VER
    return _aligned_malloc(size, align);
#else
    void *block;
    return posix_memalign(&block, align, size) ? 0 : block;
#endif
}

// Releases memory obtained from aligned_malloc(): _aligned_free with MSVC,
// plain free() everywhere else.
inline void aligned_free(void *ptr) {
#ifndef _MSC_VER
    free(ptr);
#else
    _aligned_free(ptr);
#endif
}

// Scalar reference conversion: each of the first `size` unsigned 16-bit
// samples is widened to int, multiplied by 1/value (computed once) and written
// to the matching slot of `destination`.
void default_loop(float *destination, const unsigned short* source, float value, int size){
    const float scale = 1.0f/value;
    const unsigned short *in = source;
    float *out = destination;
    for (int left = size; left > 0; --left) {
        const int sample = *in++;
        *out++ = sample*scale;
    }
}


// Same scalar conversion as default_loop, with the iterations handed to OpenMP
// via `parallel for`. Each iteration reads source[idx] and writes only
// destination[idx].
void default_loop_parallel(float *destination, const unsigned short* source, float value, int size){
    const float scale = 1.0f / value;
    #pragma omp parallel for
    for (int idx = 0; idx < size; ++idx) {
        const int sample = source[idx];
        destination[idx] = sample*scale;
    }
}

// vectorclass version of the conversion, 8 samples per iteration.
// Vec8us / Vec4ui / Vec4f and extend_low / extend_high / to_float come from
// vectorclass.h, which is not shown here; the step comments below are inferred
// from those names -- confirm against the vectorclass manual.
// The loop advances by 8 with no tail handling, so when size is not a multiple
// of 8 the last iteration touches elements at indices >= size.
void vec8_loop(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 8) {
    // Load 8 samples starting at source[i].
    Vec8us vi = Vec8us().load(source + i);
    // Widen the lower and upper four samples to 32-bit unsigned.
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    // Convert to float and scale by 1/value.
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    // Write results to destination[i..i+3] and destination[i+4..i+7].
    vf0.store(destination + i);
    vf1.store(destination + i + 4);
  }
}

// vec8_loop unrolled by two: each iteration handles two groups of 8 samples
// (offsets 0 and 8 from i). The loop advances by 16 with no tail handling, so
// when size is not a multiple of 16 the last iteration touches elements at
// indices >= size.
void vec8_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 16) {
    // First group: source[i .. i+7] -> destination[i .. i+7].
    Vec8us vi = Vec8us().load(source + i);
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 4);

    // Second group: source[i+8 .. i+15] -> destination[i+8 .. i+15].
    Vec8us vi_new = Vec8us().load(source + i + 8);
    Vec4ui vi2 = extend_low(vi_new);
    Vec4ui vi3 = extend_high(vi_new);
    Vec4f vf2 = to_float(vi2);
    Vec4f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 8);
    vf3.store(destination + i + 12);
  }
}

// vec8_loop with the iterations handed to OpenMP via `parallel for`. Each
// iteration works on its own group of 8 elements starting at index i.
// As in vec8_loop there is no tail handling for sizes that are not a multiple
// of 8.
void vec8_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  #pragma omp parallel for
  for (int i = 0; i < size; i += 8) {
    Vec8us vi = Vec8us().load(source + i);
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i);
    vf1.store(destination + i + 4);
  }
}

// vec8_loop_unroll2 with the iterations handed to OpenMP via `parallel for`.
// Each iteration converts 16 elements starting at index i; there is no tail
// handling for sizes that are not a multiple of 16.
void vec8_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  #pragma omp parallel for
  for (int i = 0; i < size; i += 16) {
    // First group: offsets 0..7 from i.
    Vec8us vi = Vec8us().load(source + i);
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 4);

    // Second group: offsets 8..15 from i.
    Vec8us vi_new = Vec8us().load(source + i + 8);
    Vec4ui vi2 = extend_low(vi_new);
    Vec4ui vi3 = extend_high(vi_new);
    Vec4f vf2 = to_float(vi2);
    Vec4f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 8);
    vf3.store(destination + i + 12);
  }
}

// vectorclass version using the 16-lane types: each iteration loads 16 samples
// and writes two groups of 8 floats. Vec16us / Vec8ui / Vec8f come from
// vectorclass.h (not shown here); the accompanying answer says they use AVX
// when the file is compiled for it.
// The loop advances by 16 with no tail handling, so when size is not a
// multiple of 16 the last iteration touches elements at indices >= size.
void vec16_loop(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 16) {
    Vec16us vi = Vec16us().load(source + i);
    // Lower / upper eight samples widened to 32-bit unsigned.
    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i);
    vf1.store(destination + i + 8);
  }
}

// vec16_loop unrolled by two: each iteration converts 32 samples (two groups
// of 16 at offsets 0 and 16 from i). The loop advances by 32 with no tail
// handling, so the buffers must cover size rounded up to a multiple of 32;
// otherwise the last iteration touches elements past the end.
void vec16_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 32) {
    // First group: source[i .. i+15] -> destination[i .. i+15].
    Vec16us vi = Vec16us().load(source + i);

    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 8);

    // Second group: source[i+16 .. i+31] -> destination[i+16 .. i+31].
    Vec16us vi_new = Vec16us().load(source + i + 16);

    Vec8ui vi2 = extend_low(vi_new);
    Vec8ui vi3 = extend_high(vi_new);
    Vec8f vf2 = to_float(vi2);
    Vec8f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 16);
    vf3.store(destination + i + 24);

  }
}

// vec16_loop_unroll2 with tail handling: the vector loop covers only the
// largest multiple of 32 not exceeding size (ROUND_DOWN(size, 32)), and the
// remaining elements are converted one at a time by the scalar loop that
// follows. No element at an index >= size is read or written.
void vec16_loop_unroll2_fix(float *destination, const unsigned short* source, float value, int size) {
    float factor=  1.0f/value;
    // i is shared by both loops: the scalar loop resumes where the vector loop
    // stopped.
    int i = 0;
    for (; i <ROUND_DOWN(size, 32); i += 32) {
    // First group of 16.
    Vec16us vi = Vec16us().load(source + i);

    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 8);

    // Second group of 16.
    Vec16us vi_new = Vec16us().load(source + i + 16);

    Vec8ui vi2 = extend_low(vi_new);
    Vec8ui vi3 = extend_high(vi_new);
    Vec8f vf2 = to_float(vi2);
    Vec8f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 16);
    vf3.store(destination + i + 24);

    }
    // Scalar tail: the last size % 32 elements.
    for (; i < size; i++) {
        int value = source[i];
        destination[i] = value*factor;
    }

}

// vec16_loop with the iterations handed to OpenMP via `parallel for`. Each
// iteration converts the 16 elements starting at index i; there is no tail
// handling for sizes that are not a multiple of 16.
void vec16_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  #pragma omp parallel for
  for (int i = 0; i < size; i += 16) {
    Vec16us vi = Vec16us().load(source + i);
    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i);
    vf1.store(destination + i + 8);
  }
}

// vec16_loop_unroll2 with the iterations handed to OpenMP via `parallel for`.
// Each iteration converts 32 elements starting at index i; there is no tail
// handling, so the buffers must cover size rounded up to a multiple of 32.
void vec16_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
    float factor=  1.0f/value;
    #pragma omp parallel for
    for (int i = 0; i < size; i += 32) {
        // First group of 16.
        Vec16us vi = Vec16us().load(source + i); 
        Vec8ui vi0 = extend_low(vi);
        Vec8ui vi1 = extend_high(vi);
        Vec8f vf0 = to_float(vi0);
        Vec8f vf1 = to_float(vi1);
        vf0*=factor;
        vf1*=factor;
        vf0.store(destination + i + 0);
        vf1.store(destination + i + 8);

        // Second group of 16.
        Vec16us vi_new = Vec16us().load(source + i + 16);
        Vec8ui vi2 = extend_low(vi_new);
        Vec8ui vi3 = extend_high(vi_new);
        Vec8f vf2 = to_float(vi2);
        Vec8f vf3 = to_float(vi3);
        vf2*=factor;
        vf3*=factor;
        vf2.store(destination + i + 16);
        vf3.store(destination + i + 24);
    }
}

// OpenMP-parallel, 32-samples-per-iteration conversion with tail handling.
//
// The vector loop covers the largest multiple of 32 not exceeding `size`; the
// remaining size % 32 elements are converted by a serial scalar loop, so no
// element at an index >= size is read or written.
//
// Vec16us / Vec8ui / Vec8f and extend_low / extend_high / to_float come from
// vectorclass.h (not shown here).
//
// destination : output array, at least `size` floats
// source      : input array, at least `size` unsigned 16-bit samples
// value       : divisor; its reciprocal is computed once up front
// size        : number of elements to convert
void vec16_loop_unroll2_parallel_fix(float *destination, const unsigned short* source, float value, int size) {
    const float factor = 1.0f/value;
    // Evaluated once so that both loops agree on where the vector part ends.
    // (The original also declared an `int i` here that both loops shadowed.)
    const int vector_end = ROUND_DOWN(size, 32);

    #pragma omp parallel for
    for (int i = 0; i < vector_end; i += 32) {
        // First group of 16: source[i .. i+15].
        Vec16us vi = Vec16us().load(source + i);
        Vec8ui vi0 = extend_low(vi);
        Vec8ui vi1 = extend_high(vi);
        Vec8f vf0 = to_float(vi0);
        Vec8f vf1 = to_float(vi1);
        vf0*=factor;
        vf1*=factor;
        vf0.store(destination + i + 0);
        vf1.store(destination + i + 8);

        // Second group of 16: source[i+16 .. i+31].
        Vec16us vi_new = Vec16us().load(source + i + 16);
        Vec8ui vi2 = extend_low(vi_new);
        Vec8ui vi3 = extend_high(vi_new);
        Vec8f vf2 = to_float(vi2);
        Vec8f vf3 = to_float(vi3);
        vf2*=factor;
        vf3*=factor;
        vf2.store(destination + i + 16);
        vf3.store(destination + i + 24);
    }

    // Scalar tail, run serially after the parallel loop.
    for (int i = vector_end; i < size; i++) {
        const int sample = source[i];
        destination[i] = sample*factor;
    }
}

// Intrinsics version (from the first answer), 8 samples per iteration.
// Preconditions visible in the code: aligned load/store intrinsics are used,
// so source and destination must be 16-byte aligned; the loop advances by 8
// with no tail handling, so when size is not a multiple of 8 the last
// iteration touches elements at indices >= size. _mm_cvtepu16_epi32 is SSE4.1.
void vectorize8_unroll1(float *destination,const unsigned short* source,float value,int size){
    __m128 factor = _mm_set1_ps(1.0f / value);
    for (int i = 0; i < size; i += 8)
    {
        //  Load 8 16-bit ushorts.
        __m128i vi = _mm_load_si128((const __m128i*)(source + i));

        //  Convert to 32-bit integers
        //  (low four directly; high four after moving the upper 64 bits down)
        __m128i vi0 = _mm_cvtepu16_epi32(vi);
        __m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));

        //  Convert to float
        __m128 vf0 = _mm_cvtepi32_ps(vi0);
        __m128 vf1 = _mm_cvtepi32_ps(vi1);

        //  Multiply
        vf0 = _mm_mul_ps(vf0,factor);
        vf1 = _mm_mul_ps(vf1,factor);

        //  Store
        _mm_store_ps(destination + i + 0,vf0);
        _mm_store_ps(destination + i + 4,vf1);
    }
}

// Intrinsics version (from the first answer), unrolled to 16 samples per
// iteration. Source and destination must be 16-byte aligned (aligned
// load/store intrinsics). The loop advances by 16 with no tail handling: when
// size is not a multiple of 16 (e.g. 8008) the last iteration reads and writes
// 8 elements past `size`, which is the bug the accompanying answer points out.
void vectorize8_unroll2(float *destination,const unsigned short* source,float value,int size){
    __m128 factor = _mm_set1_ps(1.0f / value);
    for (int i = 0; i < size; i += 16)
    {
        //  Two independent groups of 8 samples.
        __m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
        __m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));

        //  Split into two registers
        //  (b0/b1 hold the upper four samples of a0/a1 in their low halves)
        __m128i b0 = _mm_unpackhi_epi64(a0,a0);
        __m128i b1 = _mm_unpackhi_epi64(a1,a1);

        //  Convert to 32-bit integers
        a0 = _mm_cvtepu16_epi32(a0);
        b0 = _mm_cvtepu16_epi32(b0);
        a1 = _mm_cvtepu16_epi32(a1);
        b1 = _mm_cvtepu16_epi32(b1);

        //  Convert to float
        __m128 c0 = _mm_cvtepi32_ps(a0);
        __m128 d0 = _mm_cvtepi32_ps(b0);
        __m128 c1 = _mm_cvtepi32_ps(a1);
        __m128 d1 = _mm_cvtepi32_ps(b1);

        //  Multiply
        c0 = _mm_mul_ps(c0,factor);
        d0 = _mm_mul_ps(d0,factor);
        c1 = _mm_mul_ps(c1,factor);
        d1 = _mm_mul_ps(d1,factor);

        //  Store
        _mm_store_ps(destination + i +  0,c0);
        _mm_store_ps(destination + i +  4,d0);
        _mm_store_ps(destination + i +  8,c1);
        _mm_store_ps(destination + i + 12,d1);
    }
}

// vectorize8_unroll1 with the iterations handed to OpenMP via `parallel for`.
// Same preconditions: 16-byte aligned buffers, and no tail handling for sizes
// that are not a multiple of 8.
void vectorize8_unroll1_parallel(float *destination,const unsigned short* source,float value,int size){
    __m128 factor = _mm_set1_ps(1.0f / value);
    #pragma omp parallel for
    for (int i = 0; i < size; i += 8)
    {
        //  Load 8 16-bit ushorts.
        __m128i vi = _mm_load_si128((const __m128i*)(source + i));

        //  Convert to 32-bit integers
        __m128i vi0 = _mm_cvtepu16_epi32(vi);
        __m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));

        //  Convert to float
        __m128 vf0 = _mm_cvtepi32_ps(vi0);
        __m128 vf1 = _mm_cvtepi32_ps(vi1);

        //  Multiply
        vf0 = _mm_mul_ps(vf0,factor);
        vf1 = _mm_mul_ps(vf1,factor);

        //  Store
        _mm_store_ps(destination + i + 0,vf0);
        _mm_store_ps(destination + i + 4,vf1);
    }
}



// vectorize8_unroll2 with the iterations handed to OpenMP via `parallel for`.
// Same preconditions: 16-byte aligned buffers, and no tail handling -- when
// size is not a multiple of 16 the last iteration touches 8 elements past
// `size`.
void vectorize8_unroll2_parallel(float *destination,const unsigned short* source,float value,int size){
    __m128 factor = _mm_set1_ps(1.0f / value);
    #pragma omp parallel for
    for (int i = 0; i < size; i += 16)
    {
        //  Two independent groups of 8 samples.
        __m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
        __m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));

        //  Split into two registers
        __m128i b0 = _mm_unpackhi_epi64(a0,a0);
        __m128i b1 = _mm_unpackhi_epi64(a1,a1);

        //  Convert to 32-bit integers
        a0 = _mm_cvtepu16_epi32(a0);
        b0 = _mm_cvtepu16_epi32(b0);
        a1 = _mm_cvtepu16_epi32(a1);
        b1 = _mm_cvtepu16_epi32(b1);

        //  Convert to float
        __m128 c0 = _mm_cvtepi32_ps(a0);
        __m128 d0 = _mm_cvtepi32_ps(b0);
        __m128 c1 = _mm_cvtepi32_ps(a1);
        __m128 d1 = _mm_cvtepi32_ps(b1);

        //  Multiply
        c0 = _mm_mul_ps(c0,factor);
        d0 = _mm_mul_ps(d0,factor);
        c1 = _mm_mul_ps(c1,factor);
        d1 = _mm_mul_ps(d1,factor);

        //  Store
        _mm_store_ps(destination + i +  0,c0);
        _mm_store_ps(destination + i +  4,d0);
        _mm_store_ps(destination + i +  8,c1);
        _mm_store_ps(destination + i + 12,d1);
    }
}

// Copies the first `size` floats of `a` into `b` (note the direction: the
// first argument is the source, the second the destination).
// The unused accumulator the original declared has been removed.
void copy_arrays(float* a, float*b, const int size) {
    for(int i=0; i<size; i++) {
        b[i] = a[i];
    }
}

// Compares the first `size` elements of `a` and `b`.
//
// Returns 0 when every pair is equal. At the first pair that differs, the
// index, both values and their difference are printed, and that difference
// a[i] - b[i] is returned.
//
// The original left the loop before adding the offending difference to its
// running sum, so it returned 0 even when the arrays disagreed and the "diff"
// column printed by the caller could never show a mismatch.
float compare_arrays(float* a, float*b, const int size) {
    for(int i=0; i<size; i++) {
        const float diff = a[i] - b[i];
        if(diff!=0)  {
            printf("i %d, a[i] %f, b[i] %f, diff %f\n", i, a[i], b[i], diff);
            return diff;
        }
    }
    return 0;
}

// Fills a[0 .. size-1] with pseudo-random 16-bit values: each rand() result is
// scaled to a float in [0, 1], multiplied by 65536, truncated to int and
// stored as unsigned short (so a ratio of exactly 1 wraps around to 0).
void randomize_array(unsigned short* a, const int size) {
    unsigned short *out = a;
    for (int remaining = size; remaining > 0; --remaining) {
        const float unit = (float)rand()/RAND_MAX;
        *out++ = (int)(65536*unit);
    }
}

void run(int size, int iterations) {
    int rd = ROUND_DOWN(size, 32);
    int size2 = rd == size ? size : rd + 32;
    float value = 1.1f;

    printf("size %d, size2 %d, iterations %d\n", size, size2, iterations);
    unsigned short* source = (unsigned short*)aligned_malloc(size2*sizeof(short), 16);
    float* destination = (float*)aligned_malloc(size2*sizeof(float), 16);
    float* destination_old = (float*)aligned_malloc(size2*sizeof(float), 16);
    float* destination_ref = (float*)aligned_malloc(size2*sizeof(float), 16);

    void (*fp[16])(float *destination, const unsigned short* source, float value, int size);
    fp[0] = default_loop;
    fp[1] = vec8_loop;
    fp[2] = vec8_loop_unroll2;
    fp[3] = vec16_loop;
    fp[4] = vec16_loop_unroll2;
    fp[5] = vec16_loop_unroll2_fix;
    fp[6] = vectorize8_unroll1;
    fp[7] = vectorize8_unroll2;

    fp[8] = default_loop_parallel;
    fp[9] = vec8_loop_parallel;
    fp[10] = vec8_loop_unroll2_parallel;
    fp[11] = vec16_loop_parallel;
    fp[12] = vec16_loop_unroll2_parallel;
    fp[13] = vec16_loop_unroll2_parallel_fix;
    fp[14] = vectorize8_unroll1_parallel;
    fp[15] = vectorize8_unroll2_parallel;

    char* func_str[] = {"default_loop", "vec8_loop", "vec8_loop_unrool2", "vec16_loop", "vec16_loop_unroll2", "vec16_loop_unroll2_fix", "vectorize8_unroll1", "vectorize8_unroll2",
        "default_loop_parallel", "vec8_loop_parallel", "vec8_loop_unroll2_parallel","vec16_loop_parallel", "vec16_loop_unroll2_parallel", "vec16_loop_unroll2_parallel_fix",
        "vectorize8_unroll1_parallel", "vectorize8_unroll2_parallel"};

    randomize_array(source, size2);

    copy_arrays(destination_old, destination_ref, size);
    fp[0](destination_ref, source, value, size);

    for(int i=0; i<16; i++) {
        copy_arrays(destination_old, destination, size);
        double dtime = omp_get_wtime();
        for (int it = 0; it < iterations; it++){
            fp[i](destination, source, value, size);
        }
        dtime = omp_get_wtime() - dtime;
        float diff = compare_arrays(destination, destination_ref, size);
        printf("%40s time: %.3f seconds, diff %f\n", func_str[i], dtime, diff);
    }
    printf("\n");
    aligned_free(source);
    aligned_free(destination);
    aligned_free(destination_old);
    aligned_free(destination_ref);
}
// Runs the benchmark for three array sizes; the iteration count shrinks as the
// array grows.
int main() {
    const int cases[3][2] = {
        {8008,      1000000},
        {64000,     100000},
        {2560*1920, 1000},
    };
    for (int c = 0; c < 3; ++c) {
        run(cases[c][0], cases[c][1]);
    }
}

AVX を備えたシステムで GCC を使用した結果です。GCC はループを自動的にベクトル化します (Visual Studio は short 型のために失敗しますが、int を試すと機能します)。そのため、手書きのベクトル化コードではほとんど利得がありません。ただし、配列のサイズによっては、複数のスレッドを使用すると役立つ場合があります。小さな配列サイズ 8008 の場合、OpenMP はより悪い結果をもたらします。ただし、より大きな配列サイズ 128000 の場合、OpenMP を使用すると、はるかに優れた結果が得られます。最大の配列サイズ 4915200 の場合、完全にメモリバウンドになり、OpenMP は役に立ちません。

i7-2600k @ 4.4GHz
size 8008, size2 8032, iterations 1000000
                        default_loop time: 1.319 seconds, diff 0.000000          
              vec16_loop_unroll2_fix time: 1.167 seconds, diff 0.000000
                  vectorize8_unroll2 time: 1.227 seconds, diff 0.000000                
         vec16_loop_unroll2_parallel time: 1.528 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 1.381 seconds, diff 0.000000

size 128000, size2 128000, iterations 100000
                        default_loop time: 2.902 seconds, diff 0.000000                     
              vec16_loop_unroll2_fix time: 2.838 seconds, diff 0.000000
                  vectorize8_unroll2 time: 2.844 seconds, diff 0.000000         
     vec16_loop_unroll2_parallel_fix time: 0.706 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 0.672 seconds, diff 0.000000

size 4915200, size2 4915200, iterations 1000
                        default_loop time: 2.313 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 2.309 seconds, diff 0.000000    
                  vectorize8_unroll2 time: 2.318 seconds, diff 0.000000                
     vec16_loop_unroll2_parallel_fix time: 2.353 seconds, diff 0.000000         
         vectorize8_unroll2_parallel time: 2.349 seconds, diff 0.000000

score 5 · Accepted Answer

私のマシン [クアッドコア Athlon、3.3 GHz、16 GB の RAM] で SSE 組み込み関数を使用し、g++ -O2最適化 [1] を行うと、約 2.5 ～ 3 倍の速度が得られます。インラインアセンブラーで同じことを行う関数も作成しましたが、それほど高速ではありません (繰り返しますが、これは私のマシンに当てはまります。他のマシンで自由に実行してください)。

H * W のさまざまなサイズを試しましたが、ほぼ同じ結果が得られました。

[1] g++ -O3 を使用すると、4 つの関数すべてで同じ時間が得られます。-O3 では明らかに「コードの自動ベクトル化」が有効になるためです。したがって、コンパイラが同様の自動ベクトル化機能をサポートしていると仮定すると、全体が少し時間の無駄になりました。

結果

convert_naive                  sum=4373.98 t=7034751 t/n=7.03475
convert_naive                  sum=4373.98 t=7266738 t/n=7.26674
convert_naive                  sum=4373.98 t=7006154 t/n=7.00615
convert_naive                  sum=4373.98 t=6815329 t/n=6.81533
convert_naive                  sum=4373.98 t=6820318 t/n=6.82032
convert_unroll4                sum=4373.98 t=8103193 t/n=8.10319
convert_unroll4                sum=4373.98 t=7276156 t/n=7.27616
convert_unroll4                sum=4373.98 t=7028181 t/n=7.02818
convert_unroll4                sum=4373.98 t=7074258 t/n=7.07426
convert_unroll4                sum=4373.98 t=7081518 t/n=7.08152
convert_sse_intrinsic          sum=4373.98 t=3377290 t/n=3.37729
convert_sse_intrinsic          sum=4373.98 t=3227018 t/n=3.22702
convert_sse_intrinsic          sum=4373.98 t=3007898 t/n=3.0079
convert_sse_intrinsic          sum=4373.98 t=3253366 t/n=3.25337
convert_sse_intrinsic          sum=4373.98 t=5576068 t/n=5.57607
convert_sse_inlineasm          sum=4373.98 t=3470887 t/n=3.47089
convert_sse_inlineasm          sum=4373.98 t=2838492 t/n=2.83849
convert_sse_inlineasm          sum=4373.98 t=2828556 t/n=2.82856
convert_sse_inlineasm          sum=4373.98 t=2789052 t/n=2.78905
convert_sse_inlineasm          sum=4373.98 t=3176522 t/n=3.17652

コード

#include <iostream>
#include <iomanip>
#include <cstdlib> 
#include <cstring>
#include <xmmintrin.h>
#include <emmintrin.h>


#define W 1000
#define H 1000

// Reads the CPU time-stamp counter with the `rdtsc` instruction and returns it
// as a single 64-bit value. Uses GCC-style extended inline assembly.
static __inline__ unsigned long long rdtsc(void)
{
    unsigned hi, lo;
    // rdtsc delivers the low 32 bits in EAX ("=a") and the high 32 bits in
    // EDX ("=d").
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

// Scalar baseline: converts the W*H signed 16-bit samples in `source` to
// floats scaled by 1/32767, one element per iteration.
void convert_naive(short *source, float *destination)
{
    const float scale = 1.0f/32767;
    const short *in = source;
    float *out = destination;
    for (int remaining = W*H; remaining > 0; --remaining)
    {
        const int sample = *in++;
        *out++ = sample*scale;
    }
}


// Scalar conversion manually unrolled by four: every iteration reads four
// consecutive samples and then writes the four scaled floats. The loop steps
// by 4 up to W*H with no tail handling.
void convert_unroll4(short *source, float *destination)
{
    const float scale = 1.0f/32767;
    for (int base = 0; base < W*H; base += 4)
    {
        const short *in = source + base;
        float *out = destination + base;

        // All four loads come before the stores, as in a hand-unrolled loop.
        const int s0 = in[0];
        const int s1 = in[1];
        const int s2 = in[2];
        const int s3 = in[3];

        out[0] = s0*scale;
        out[1] = s1*scale;
        out[2] = s2*scale;
        out[3] = s3*scale;
    }
}


// SSE2 conversion, four samples per iteration: the W*H signed 16-bit samples
// in `source` are converted to floats scaled by 1/32767.
//
// Changes relative to the original:
//  * No MMX `__m64` values and no brace-initialised vector types (a GCC
//    extension); only SSE/SSE2 intrinsics are used.
//  * Sign extension takes two steps instead of three: interleaving the samples
//    with themselves puts each sample in the upper 16 bits of its 32-bit lane,
//    and an arithmetic right shift by 16 then sign-extends it.
//
// Preconditions: `destination` must be 16-byte aligned (aligned store); W*H is
// processed in steps of 4 with no tail handling.
void convert_sse_intrinsic(short *source, float *destination)
{
    const __m128 factor = _mm_set1_ps(1.0f/32767);
    for (int i = 0; i < W*H; i+=4)
    {
        // Load four 16-bit samples into the low 64 bits of the register.
        const __m128i packed = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(source + i));
        // Each 32-bit lane becomes (s << 16) | s; the shift sign-extends s.
        const __m128i value = _mm_srai_epi32(_mm_unpacklo_epi16(packed, packed), 16);
        const __m128  fval  = _mm_cvtepi32_ps(value);
        // destination[i..i+3] = value[0..3] * factor
        _mm_store_ps(destination + i, _mm_mul_ps(fval, factor));
    }
}

// Hand-written x86-64 SSE2 version (GCC extended asm): converts the W*H signed
// 16-bit samples in `source` to floats scaled by 1/32767, 16 samples (32 input
// bytes, 64 output bytes) per loop iteration.
//
// Register use: rax = byte offset into source (the destination offset is
// 2*rax), xmm1 = zero, xmm2 = factor, xmm0/xmm3/xmm4/xmm5 = the four groups of
// four samples being converted.
//
// Fixes relative to the original:
//  * The loop exits with `jb` instead of `jbe`. With `jbe` the body ran once
//    more after rax had reached the byte size, reading 16 samples and writing
//    16 floats past the end of the buffers.
//  * The clobber list names every register the code writes (xmm2, xmm4 and
//    xmm5 were missing) plus "memory" and "cc", since the asm stores through
//    `destination` and changes the flags.
//
// Preconditions: `destination` is 16-byte aligned (movaps), and the byte size
// of the input is a non-zero multiple of 32 (the loop body always runs at
// least once and has no tail handling).
void convert_sse_inlineasm(short *source, float *destination)
{
    __m128 factor =  { 1.0f/32767, 1.0f/32767, 1.0f/32767, 1.0f/32767 };
    __asm__ __volatile__(
    "\t pxor       %%xmm1, %%xmm1\n"
    "\t movaps     %3, %%xmm2\n"
    "\t mov        $0, %%rax\n"
    "1:"
    "\t movq       (%1, %%rax), %%xmm0\n"
    "\t movq       8(%1, %%rax), %%xmm3\n"
    "\t movq       16(%1, %%rax), %%xmm4\n"
    "\t movq       24(%1, %%rax), %%xmm5\n"
    "\t punpcklwd  %%xmm1, %%xmm0\n"
    "\t pslld      $16, %%xmm0\n"
    "\t psrad      $16, %%xmm0\n"
    "\t cvtdq2ps   %%xmm0, %%xmm0\n"
    "\t mulps      %%xmm2, %%xmm0\n"
    "\t punpcklwd  %%xmm1, %%xmm3\n"
    "\t pslld      $16, %%xmm3\n"
    "\t psrad      $16, %%xmm3\n"
    "\t cvtdq2ps   %%xmm3, %%xmm3\n"
    "\t mulps      %%xmm2, %%xmm3\n"
    "\t punpcklwd  %%xmm1, %%xmm4\n"
    "\t pslld      $16, %%xmm4\n"
    "\t psrad      $16, %%xmm4\n"
    "\t cvtdq2ps   %%xmm4, %%xmm4\n"
    "\t mulps      %%xmm2, %%xmm4\n"
    "\t punpcklwd  %%xmm1, %%xmm5\n"
    "\t pslld      $16, %%xmm5\n"
    "\t psrad      $16, %%xmm5\n"
    "\t cvtdq2ps   %%xmm5, %%xmm5\n"
    "\t mulps      %%xmm2, %%xmm5\n"
    "\t movaps     %%xmm0, (%0, %%rax, 2)\n"
    "\t movaps     %%xmm3, 16(%0, %%rax, 2)\n"
    "\t movaps     %%xmm4, 32(%0, %%rax, 2)\n"
    "\t movaps     %%xmm5, 48(%0, %%rax, 2)\n"
    "\t addq       $32, %%rax\n"
    "\t cmpq       %2, %%rax\n"
    "\t jb         1b\n"
    : /* no outputs */ 
    : "r" (destination), "r" (source), "i"(sizeof(*source) * H * W), "m"(factor):
      "rax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory", "cc");
}




short inbuffer[W * H] __attribute__ ((aligned (16)));
float outbuffer[W * H + 16] __attribute__ ((aligned (16)));
#ifdef DEBUG
float outbuffer2[W * H];
#endif


typedef void (*func)(short *source, float *destination);

// One row of the benchmark table: a printable name and the conversion routine
// to time. The member order matters: BM(x) brace-initialises { name, fn }.
struct BmEntry
{
    const char *name;   // label printed in the result line
    func  fn;           // routine called as fn(inbuffer, outbuffer)
};

// Times one run of e.fn over the global buffers and prints a result line:
// the routine name, the sum of the W*H outputs, the elapsed time-stamp-counter
// ticks, and ticks per element.
//
// The output buffer is cleared first so results left by a previous routine
// cannot leak into the sum. When DEBUG is defined, every output element is
// additionally compared with convert_naive and mismatches are printed.
//
// The cross-check is guarded with `#ifdef DEBUG`, matching the `#ifdef DEBUG`
// that declares outbuffer2; the original `#if DEBUG` failed to preprocess when
// DEBUG was defined with an empty value.
void bm(BmEntry& e)
{
    memset(outbuffer, 0, sizeof(outbuffer));
    const unsigned long long start = rdtsc();
    e.fn(inbuffer, outbuffer);
    const unsigned long long t = rdtsc() - start;

    float sum = 0;
    for(int i = 0; i < W * H; i++)
    {
        sum += outbuffer[i];
    }

#ifdef DEBUG
    convert_naive(inbuffer, outbuffer2);
    for(int i = 0; i < W * H; i++)
    {
        if (outbuffer[i] != outbuffer2[i])
        {
            std::cout << i << ":: " << inbuffer[i] << ": "
                      << outbuffer[i] << " != " << outbuffer2[i]
                      << std::endl;
        }
    }
#endif

    std::cout << std::left << std::setw(30) << e.name << " sum=" << sum << " t=" << t <<
    " t/n=" << (double)t / (W * H) << std::endl;
}


#define BM(x) { #x, x }


BmEntry table[] = 
{
    BM(convert_naive),
    BM(convert_unroll4),
    BM(convert_sse_intrinsic),
    BM(convert_sse_inlineasm),
};


// Fills the input buffer with a ramp (the index truncated to short) and runs
// every benchmark table entry five times.
int main()
{
    for(int i = 0; i < W * H; i++)
    {
        inbuffer[i] = (short)i;
    }

    // Element count as int, so the loop below does not compare a signed index
    // with the unsigned result of sizeof.
    const int entry_count = static_cast<int>(sizeof(table)/sizeof(table[0]));
    for(int i = 0; i < entry_count; i++)
    {
        for(int j = 0; j < 5; j++)
            bm(table[i]);
    }
    return 0;
}

score 2 · Accepted Answer

OpenMP を使用して、CPU のすべてのコアを使用できます。次のようにするだけで簡単です。

#include <omp.h>
// The question's scalar loop with its iterations handed to OpenMP via
// `parallel for`. Each iteration reads source[i] and writes only
// destination[i]. The `value` declared inside the loop shadows the outer
// `value` used to build `factor`.
float factor=  1.0f/value;
#pragma omp parallel for 
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
    int value = source[i];//ushort -> int
    destination[i] = value*factor;//int*float->float
}

これは前のプログラムに基づく結果です。次のように追加するだけです。

#pragma omp parallel for 
for (int it = 0; it < iterations; it++){
 ...
}

そして、これが結果です

beta@beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -fopenmp

beta@beta-PC ~
$ opt
0.748
2.90873e+007
0.484
2.90873e+007
0.796
2.90873e+007


beta@beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -O3


beta@beta-PC ~
$ opt
1.404
2.90873e+007
1.404
2.90873e+007
1.404
2.90873e+007

. .

結果は、openmp で 100% の改善を示しています。Visual C++ は openmp もサポートしています。

score 1 · Accepted Answer

あなたは式を近似しようとすることができます

float factor = 1.0f/value;

この式を、numerator と denominator の両方が int である分数 numerator/denominator で近似します。これは、アプリケーションで必要な精度で、次のように行うことができます。

int denominator = 10000;
int numerator = factor * denominator;

次に、次のような整数演算で計算を行うことができます

int value = source[i];
// Scale by the rational approximation numerator/denominator of `factor`.
// (The original divided by `numerator`, which cancels the multiplication and
// just yields `value` again.)
destination[i] = (value * numerator) / denominator;

オーバーフローに注意する必要があります。おそらく、計算のために long (あるいは 64 ビットシステムでは long long) に切り替える必要があります。

c++ - ショートからフロートへのキャストをスピードアップしますか?

7 に答える 7

Related

Reference