タイトなループ内で双線形補間を実装し、SSEで最適化しようとしていますが、まったく高速化されません。
コードは次のとおりです。非SIMDバージョンでは、struct Vec3f { float x, y, z; } のように定義され、
乗算演算子と加算演算子が実装された単純なベクトル構造体を使用しています。
#ifdef USE_SIMD
// SIMD path: bilinear blend of the four neighbouring pixels using MSVC inline
// SSE assembly. The blended result is left in mc11 (see the final movaps);
// unlike the #else branch, no `colour` variable is defined in this branch.
// NOTE(review): eight 4-float vectors are assembled from scalars for only four
// multiplies and three adds; the setup cost likely dominates -- profile to confirm.

// Fetch the four corner pixels; pixelCache is indexed as y * size.x + x.
const Color c11 = pixelCache[y1 * size.x + x1];
const Color c12 = pixelCache[y2 * size.x + x1];
const Color c22 = pixelCache[y2 * size.x + x2];
const Color c21 = pixelCache[y1 * size.x + x2];
// Pack each pixel into a 16-byte-aligned float[4]: element 0 is a 1.0 filler,
// elements 1..3 hold B, G, R. The alignment is required by movaps below.
__declspec(align(16)) float mc11[4] = { 1.0, c11.GetB(), c11.GetG(), c11.GetR() };
__declspec(align(16)) float mc12[4] = { 1.0, c12.GetB(), c12.GetG(), c12.GetR() };
__declspec(align(16)) float mc22[4] = { 1.0, c22.GetB(), c22.GetG(), c22.GetR() };
__declspec(align(16)) float mc21[4] = { 1.0, c21.GetB(), c21.GetG(), c21.GetR() };
// Per-corner interpolation weights. They are not divided by
// (x2-x1)*(y2-y1) -- presumably the cell is one pixel wide/high; confirm.
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);
// Broadcast each weight into elements 1..3; element 0 stays 1.0, so element 0
// of the final sum is 1*1 added four times (4.0) and carries no colour data.
__declspec(align(16)) float ms11[4] = {1.0, s11, s11, s11};
__declspec(align(16)) float ms12[4] = {1.0, s12, s12, s12};
__declspec(align(16)) float ms22[4] = {1.0, s22, s22, s22};
__declspec(align(16)) float ms21[4] = {1.0, s21, s21, s21};
// xmm0..xmm3 = pixels, xmm4..xmm7 = weights; multiply pairwise, then
// accumulate into xmm0 in the order c11 + c12 + c22 + c21 and write the
// result back over mc11.
__asm {
movaps xmm0, mc11
movaps xmm1, mc12
movaps xmm2, mc22
movaps xmm3, mc21
movaps xmm4, ms11
movaps xmm5, ms12
movaps xmm6, ms22
movaps xmm7, ms21
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
addps xmm0, xmm3
movaps mc11, xmm0
}
#else
// Scalar path: convert the four corner pixels with toFloat and blend them
// through Vec3f's multiplication and addition operators.
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
// Weighted sum of the corners; terms are added in the order c11, c21, c12, c22
// (a different order from the SIMD branch's c11, c12, c22, c21).
const Vec3f colour =
c11*(x2-x)*(y2-y) +
c21*(x-x1)*(y2-y) +
c12*(x2-x)*(y-y1) +
c22*(x-x1)*(y-y1);
#endif
レジスタを再利用するようにasmコードを並べ替えてみましたが(最終的に使用するxmmレジスタは3つだけになりました)、効果はありませんでした。また、組み込み関数(intrinsics)も試してみました。
// Bilinear interpolation of the four neighbouring pixels using SSE intrinsics.
// Fetch the corner pixels (pixelCache is indexed as y * size.x + x) and convert
// them with toFloat.
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
// Per-corner interpolation weights. They are not divided by
// (x2-x1)*(y2-y1) -- presumably the cell is one pixel wide/high; confirm.
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);
// _mm_set_ps takes its arguments from the highest lane to the lowest, so each
// vector holds r in lane 0, g in lane 1, b in lane 2 and a 1.f filler in lane 3.
__m128 mc11 = _mm_set_ps(1.f, c11.b, c11.g, c11.r);
__m128 mc12 = _mm_set_ps(1.f, c12.b, c12.g, c12.r);
__m128 mc22 = _mm_set_ps(1.f, c22.b, c22.g, c22.r);
__m128 mc21 = _mm_set_ps(1.f, c21.b, c21.g, c21.r);
// Weight vectors: the weight in lanes 0..2, 1.f in lane 3 (so lane 3 of the
// final sum is 4.0 and carries no colour data).
__m128 ms11 = _mm_set_ps(1.f, s11, s11, s11);
__m128 ms12 = _mm_set_ps(1.f, s12, s12, s12);
__m128 ms22 = _mm_set_ps(1.f, s22, s22, s22);
__m128 ms21 = _mm_set_ps(1.f, s21, s21, s21);
// Scale each corner by its weight.
mc11 = _mm_mul_ps(mc11, ms11);
mc12 = _mm_mul_ps(mc12, ms12);
mc22 = _mm_mul_ps(mc22, ms22);
mc21 = _mm_mul_ps(mc21, ms21);
// Accumulate into mc11 in the order c11 + c12 + c22 + c21.
mc11 = _mm_add_ps(mc11, mc12);
mc11 = _mm_add_ps(mc11, mc22);
mc11 = _mm_add_ps(mc11, mc21);
Vec3f colour;
// Unaligned store of all four lanes into colour.array.
// NOTE(review): this writes 4 floats; if Vec3f stores only three floats the
// fourth lane lands past the end of colour -- confirm colour.array has room.
_mm_storeu_ps(colour.array, mc11);
しかし、これも効果はありませんでした。何か見落としているのでしょうか、それともここでこれ以上の高速化は不可能なのでしょうか?