これは、私の SSE ライブラリの一部です。大量のデータを処理するとき、私は常に AoS の代わりに SoA を使用します。また、__m128/__m256 の演算子のオーバーロードにより、C/C++ アルゴリズムを SIMD に簡単に変換できます。
SSE/AVX はメモリ操作に非常に敏感であるため、ロード/ストアはこのライブラリではラップしていません。不適切なメモリ アクセスは何十 CPU サイクルものストールを引き起こし、計算を停滞させるからです。
// Operator overloads on raw __m128 values, so scalar float algorithms can be
// ported to 4-wide SIMD with minimal source changes.
// The comparison operators return per-lane bit masks (all-ones where the
// comparison holds, all-zeros where it does not), NOT booleans — combine them
// with the bitwise &, | overloads or feed them to _mm_merge_ps below.

// Arithmetic.
__forceinline __m128 operator+(__m128 lhs, __m128 rhs) { return _mm_add_ps(lhs, rhs); }
__forceinline __m128 operator-(__m128 lhs, __m128 rhs) { return _mm_sub_ps(lhs, rhs); }
__forceinline __m128 operator*(__m128 lhs, __m128 rhs) { return _mm_mul_ps(lhs, rhs); }
__forceinline __m128 operator/(__m128 lhs, __m128 rhs) { return _mm_div_ps(lhs, rhs); }
// Bitwise (typically used to combine comparison masks).
__forceinline __m128 operator&(__m128 lhs, __m128 rhs) { return _mm_and_ps(lhs, rhs); }
__forceinline __m128 operator|(__m128 lhs, __m128 rhs) { return _mm_or_ps(lhs, rhs); }
// Per-lane comparisons -> bit masks.
__forceinline __m128 operator<(__m128 lhs, __m128 rhs)  { return _mm_cmplt_ps(lhs, rhs); }
__forceinline __m128 operator>(__m128 lhs, __m128 rhs)  { return _mm_cmpgt_ps(lhs, rhs); }
__forceinline __m128 operator<=(__m128 lhs, __m128 rhs) { return _mm_cmple_ps(lhs, rhs); }
__forceinline __m128 operator>=(__m128 lhs, __m128 rhs) { return _mm_cmpge_ps(lhs, rhs); }
__forceinline __m128 operator!=(__m128 lhs, __m128 rhs) { return _mm_cmpneq_ps(lhs, rhs); }
__forceinline __m128 operator==(__m128 lhs, __m128 rhs) { return _mm_cmpeq_ps(lhs, rhs); }
// Per-lane select: for each lane, returns r where the mask bits are set and
// l where they are clear (the pre-SSE4.1 equivalent of _mm_blendv_ps).
// 'm' is expected to be a full-lane mask, e.g. the result of a comparison above.
__forceinline __m128 _mm_merge_ps(__m128 m, __m128 l, __m128 r)
{
	const __m128 keep_l = _mm_andnot_ps(m, l); // lanes where mask is clear
	const __m128 keep_r = _mm_and_ps(m, r);    // lanes where mask is set
	return _mm_or_ps(keep_l, keep_r);
}
// SoA bundle of four 3D points: x, y, z each hold the same component of four
// points, one per __m128 lane.
struct TPoint4
{
// Default ctor intentionally leaves the lanes uninitialized (no cost on hot paths).
TPoint4() {}
// Broadcasts a single point's components into all four lanes.
TPoint4(const D3DXVECTOR3& a) :x(_mm_set1_ps(a.x)), y(_mm_set1_ps(a.y)), z(_mm_set1_ps(a.z)) {}
TPoint4(__m128 a, __m128 b, __m128 c) :x(a), y(b), z(c) {}
// Loads x/y/z from three consecutive __m128s (a[0..2]).
TPoint4(const __m128* a) :x(a[0]), y(a[1]), z(a[2]) {}
// Packs four points into lanes. NOTE(review): _mm_set_ps takes arguments
// high-to-low, so lane 0 receives 'd' and lane 3 receives 'a' — confirm callers
// expect this reversed order (use _mm_setr_ps if lane 0 should be 'a').
TPoint4(const D3DXVECTOR3& a, const D3DXVECTOR3& b, const D3DXVECTOR3& c, const D3DXVECTOR3& d) :x(_mm_set_ps(a.x,b.x,c.x,d.x)), y(_mm_set_ps(a.y,b.y,c.y,d.y)), z(_mm_set_ps(a.z,b.z,c.z,d.z)) {}
// Implicit decay to the component array (x, y, z are contiguous via the union).
operator __m128* () { return &x; }
operator const __m128* () const { return &x; }
TPoint4 operator+(const TPoint4& r) const { return TPoint4(x+r.x, y+r.y, z+r.z); }
TPoint4 operator-(const TPoint4& r) const { return TPoint4(x-r.x, y-r.y, z-r.z); }
TPoint4 operator*(__m128 r) const { return TPoint4(x * r, y * r, z * r); }
TPoint4 operator/(__m128 r) const { return TPoint4(x / r, y / r, z / r); }
// index 0/1/2 -> x/y/z; no bounds check.
__m128 operator[](int index) const { return _val[index]; }
// Named and indexed views of the same 48-byte storage.
// NOTE(review): reading the union member other than the one last written is a
// widely supported compiler extension (MSVC/GCC/Clang), not strict ISO C++.
union
{
struct
{
__m128 x, y, z;
};
struct
{
__m128 _val[3];
};
};
};
// Per-lane 3D cross product: *result = l x r for each of the four point slots.
// Returns 'result' (D3DX-style chaining).
// Fix: all three components are computed into temporaries before any store, so
// the function is correct when 'result' aliases 'l' or 'r' — the previous code
// wrote result->x first and then read the (clobbered) l->x / r->x when
// computing y and z.
__forceinline TPoint4* TPoint4Cross(TPoint4* result, const TPoint4* l, const TPoint4* r)
{
const __m128 cx = (l->y * r->z) - (l->z * r->y);
const __m128 cy = (l->z * r->x) - (l->x * r->z);
const __m128 cz = (l->x * r->y) - (l->y * r->x);
result->x = cx;
result->y = cy;
result->z = cz;
return result;
}
// Per-lane dot product of two SoA point bundles (same left-to-right
// accumulation order as x*x + y*y + z*z, so results are bit-identical).
__forceinline __m128 TPoint4Dot(const TPoint4* l, const TPoint4* r)
{
__m128 acc = _mm_mul_ps(l->x, r->x);
acc = _mm_add_ps(acc, _mm_mul_ps(l->y, r->y));
acc = _mm_add_ps(acc, _mm_mul_ps(l->z, r->z));
return acc;
}
// Per-lane normalization of four points; returns 'result' for chaining.
// Uses rsqrtps, which is a fast approximation (roughly 12-bit relative
// precision); a zero-length lane produces +inf components.
__forceinline TPoint4* TPoint4Normalize(TPoint4* result, const TPoint4* l)
{
const __m128 len_sq = (l->x * l->x) + (l->y * l->y) + (l->z * l->z);
const __m128 inv_len = _mm_rsqrt_ps(len_sq);
result->x = l->x * inv_len;
result->y = l->y * inv_len;
result->z = l->z * inv_len;
return result;
}
// Per-lane Euclidean length of four points (exact sqrtps, unlike the
// approximate rsqrtps used by TPoint4Normalize).
__forceinline __m128 TPoint4Length(const TPoint4* l)
{
const __m128 len_sq = (l->x * l->x) + (l->y * l->y) + (l->z * l->z);
return _mm_sqrt_ps(len_sq);
}
// Component-wise per-lane select between two point bundles: lanes where the
// mask bits are set come from 'r', clear lanes from 'l'. Returns 'result'.
__forceinline TPoint4* TPoint4Merge(TPoint4* result, __m128 mask, const TPoint4* l, const TPoint4* r)
{
for (int i = 0; i < 3; ++i)
result->_val[i] = _mm_merge_ps(mask, l->_val[i], r->_val[i]);
return result;
}
// Shared SSE constants, defined in a translation unit outside this view.
// The comments below are inferred from the names — confirm at the definition site.
extern __m128 g_zero4;    // presumably all lanes 0.0f
extern __m128 g_one4;     // presumably all lanes 1.0f
extern __m128 g_fltMax4;  // presumably all lanes FLT_MAX
extern __m128 g_mask4;    // purpose unclear from this file — see definition
extern __m128 g_epsilon4; // presumably a small per-lane epsilon for comparisons