c++ - 単純な SSE ループは非 SSE バージョンより遅い

Question

SSE による float[4] の加算と、標準の float[4] の加算を比較しようとしています。デモとして、SSE を使う場合と使わない場合のそれぞれで、加算した各成分の総和を計算します。

#include <iostream>
#include <vector>

// Plain holder for four float components; a default-constructed Point4 has
// every component set to zero.
struct Point4
{
  // Zero each of the four components in turn.
  Point4()
  {
    for (unsigned int index = 0; index < 4; ++index)
    {
      data[index] = 0;
    }
  }

  // The four components, stored contiguously.
  float data[4];
};

// Scalar baseline for the comparison: adds two fixed 4-component points
// component by component, accumulates every component sum into one float, and
// prints the accumulated value as "total: <value>".
//
// iterations: number of passes of the outer loop. Defaults to 1000000000, the
//             count that was previously hard-coded as 1e9, so a plain
//             Standard() call does the same amount of work as before.
//
// Note: the accumulator is a float, so with very large iteration counts the
// running sum grows past what float can represent exactly and the printed
// value is not the exact mathematical total.
void Standard(unsigned int iterations = 1000000000u)
{
  Point4 a;
  a.data[0] = 1.0f;
  a.data[1] = 2.0f;
  a.data[2] = 3.0f;
  a.data[3] = 4.0f;

  Point4 b;
  b.data[0] = 1.0f;
  b.data[1] = 6.0f;
  b.data[2] = 3.0f;
  b.data[3] = 5.0f;

  float total = 0.0f;
  // Integer loop bound: comparing the unsigned counter against the double
  // literal 1e9 converted the counter to double for every loop test.
  for(unsigned int i = 0; i < iterations; ++i)
  {
    // Scalar path: each component sum is folded into the total immediately.
    for(unsigned int component = 0; component < 4; ++component)
    {
      total += a.data[component] + b.data[component];
    }
  }

  std::cout << "total: " << total << std::endl;
}

// Vector variant of the comparison: adds two fixed 4-float vectors with a
// single vector expression per iteration, accumulates lane-wise into a third
// vector, then sums the four lanes once at the end and prints the result as
// "total: <value>".
//
// iterations: number of passes of the accumulation loop. Defaults to
//             1000000000, the count that was previously hard-coded as 1e9, so
//             a plain Vectorized() call does the same amount of work as before.
//
// Relies on the GCC/Clang vector_size extension. The vectors are initialised
// with brace lists and read with operator[] instead of being accessed through
// a C-style (float*) cast of their address.
void Vectorized(unsigned int iterations = 1000000000u)
{
  typedef float v4sf __attribute__ (( vector_size(4*sizeof(float)) ));

  const v4sf a = {1.0f, 2.0f, 3.0f, 4.0f};
  const v4sf b = {1.0f, 6.0f, 3.0f, 5.0f};

  // Lane-wise accumulator, starting at zero in every lane.
  v4sf result = {0.0f, 0.0f, 0.0f, 0.0f};

  // Integer loop bound: comparing the unsigned counter against the double
  // literal 1e9 converted the counter to double for every loop test.
  for(unsigned int i = 0; i < iterations; ++i)
  {
    result += a + b; // Vectorized operation
  }

  // Sum the components of the result (Standard() does this with its
  // "total += " inside the loop; here it happens once, after the loop).
  float total = 0.0f;
  for(unsigned int component = 0; component < 4; ++component)
  {
    total += result[component];
  }
  std::cout << "total: " << total << std::endl;
}

// Entry point: runs one benchmark variant per build so that it can be timed
// on its own. Only the vector variant is currently enabled.
int main()
{
  // Scalar variant, currently disabled:
  // Standard();
  Vectorized();
  return 0;
}

ところが、ベクトル化したメソッド (約 0.4 秒) よりも標準のメソッド (約 0.2 秒) の方が高速なようです。v4sf の各成分を合計する for ループが原因でしょうか？ 2 つの手法の速度差を計測しつつ、出力を比較して結果に違いがないことも確認できる、より適切な演算はありますか？

score 1 · Accepted Answer

SSE 版の方が遅い理由は、反復ごとに 4 回、SSE レジスターからスカラーレジスターへ値をアンパックする必要があり、そのオーバーヘッドがベクトル化された加算で得られる利得を上回るためです。逆アセンブリを見れば、より明確に把握できるはずです。

あなたがやりたいことは次のとおりだと思います（SSEの方が高速です）：

// Fragment: result, a, b, total and resultPointer are declared outside this
// snippet (presumably as in the question's Vectorized() — confirm).
// NOTE(review): this loop runs 1e6 iterations, whereas the question's loops
// use 1e9.
// Accumulate into result inside the loop; the per-component reduction is
// deferred until the loop has finished.
for(unsigned int i = 0; i < 1e6; ++i)
{
    result += a + b; // Vectorized operation
}

// Sum the components of the result (this is done with the "total += " in the Standard() loop).
for(unsigned int component = 0; component < 4; ++component)
{
    total += resultPointer[component];
}

また、次のようにすればさらに高速になる可能性があります。

// Fragment: result0..result3, resultPointer0..resultPointer3, a, b and total
// are declared outside this snippet (presumably each resultPointerN views the
// floats of resultN — confirm).
// Four separate accumulators are each updated once per iteration, and the trip
// count is divided by four, so the number of "a + b" accumulations matches a
// single-accumulator loop of 1e6 iterations.
for(unsigned int i = 0; i < 1e6/4; ++i)
{
    result0 += a + b; // Vectorized operation
    result1 += a + b; // Vectorized operation
    result2 += a + b; // Vectorized operation
    result3 += a + b; // Vectorized operation
}

// Sum the components of all four results (this is done with the "total += " in the Standard() loop).
for(unsigned int component = 0; component < 4; ++component)
{
    total += resultPointer0[component];
    total += resultPointer1[component];
    total += resultPointer2[component];
    total += resultPointer3[component];
}

c++ - 単純な SSE ループは非 SSE バージョンより遅い

1 に答える 1

Related

Reference