assembly - x86 asmに書き換えることによるintersectRaySphere（C言語のプロシージャ）の最適化（方法は？）

Question

こんにちは。アセンブリについてはあまり詳しくありません。次の関数をx86（32ビットのFPUまたはSSE2）アセンブリで書き直して最適化する方法を考えています。正しいアセンブリに書き直せたら、速度が上がるかどうかをテストするつもりです（length()とdot()もアセンブリで記述する必要があります）。このコードは私の単純なリアルタイムレイトレーサーで使用しており、正しく動作していますが、私はアセンブリでの最適化があまり得意ではありません。

    /*
     * Intersect a ray with a sphere.
     *
     * rO - ray origin
     * rV - ray direction
     *      NOTE(review): v - sqrt(d) is a distance only when rV has unit
     *      length - confirm that callers normalize it.
     * sO - sphere centre
     * sR - sphere radius
     *
     * Returns v - sqrt(d) when the discriminant d is non-negative, where
     * v = dot(Q, rV) and Q = sO - rO; returns -1.0f when d is negative.
     */
    inline float intersectRaySphere(float3* rO, float3* rV, float3* sO, float sR)
    {
        /* Automatic storage instead of a function-local static: the static
         * made the routine non-reentrant and forced Q through memory. */
        float3 Q = sub(sO, rO);

        float v = dot(&Q, rV);

        /* |Q|^2 taken directly as dot(Q, Q); the previous code computed
         * length(Q) (a square root) only to square it again. */
        float qLenSq = dot(&Q, &Q);

        float d = sR * sR - (qLenSq - v * v);

        /* If there was no intersection, return -1 */
        if (d < 0.0f) {
            return -1.0f;
        }

        /* Return the distance to the [first] intersecting point */
        return v - sqrtf(d);
    }

前もって感謝します

//編集

    /* Three single-precision components, stored in the order x, y, z. */
    struct float3
    {
        float x, y, z;
    };


    /*
     * Euclidean length of the vector pointed to by v:
     * sqrt(x*x + y*y + z*z).
     */
    inline float length(float3* v) {
        /* sqrtf keeps the whole computation in single precision; sqrt()
         * promoted the float sum to double and truncated the result back. */
        return sqrtf(v->x * v->x + v->y * v->y + v->z * v->z);
    }

   /* Dot product of the two vectors pointed to by a and b. */
   inline float dot(float3* a, float3* b) {
       return a->x * b->x + a->y * b->y + a->z * b->z;
   }

デモ用の実行ファイルはこちらです（C版もまだ最適化していません）：

dl.dropbox.com/u/42887985/re29.zip

どなたか、lengthやdot（あるいはここには載せていないnormalize）用の良いFPUアセンブリルーチンを教えていただけないでしょうか？ 交差判定の手続き全体を書いていただけるのが一番ありがたいのですが ;-)

score 2 · Accepted Answer

__asm
    {
    // Length of a 3-component float vector: sqrt(x^2 + y^2 + z^2).
    // [float3] must be 16-byte aligned and 16 bytes readable; the fourth
    // lane is loaded but never contributes to the result.
    // [result] must be a 16-byte aligned block of four floats; every lane
    // receives the length.
    movaps xmm0,[float3]          //x, y, z and the padding lane into xmm0
    mulps xmm0,xmm0               //each lane squared: x^2, y^2, z^2, pad^2
    movaps xmm1,xmm0              //copy of the squares
    shufps xmm1,xmm1,01010101b    //xmm1 = y^2 in all 4 lanes
    movhlps xmm2,xmm0             //lowest lane of xmm2 = z^2 (other lanes of xmm2 are never read)
    addss xmm0,xmm1               //lowest lane = x^2 + y^2
    addss xmm0,xmm2               //lowest lane = x^2 + y^2 + z^2
    sqrtss xmm0,xmm0              //scalar square-root of the sum
    shufps xmm0,xmm0,0            //copy the length to all 4 lanes
    movaps [result],xmm0
    }

完全に最適化されたコードよりは遅いかもしれませんが、ベクトル長の計算には十分な速度のはずです。ベクトルは16バイト境界に整列させる必要があります。整列させられない場合は、movapsをmovupsに変更してください。このコードが動作したら、次のディレクティブを置くことでパフォーマンスをさらに向上させることができます

align 16

これをmovaps xmm0,[float3]の直前に置くと、コード自体も16バイト境界に整列されます。そのうえで各命令のバイト数を確認し、コード長が最適な長さ（16バイトの倍数）になるようにしてください。SSE2より後の拡張（SSE3、SSE4、AVX）には水平演算を行うベクトル命令（例：SSE3のhaddps、SSE4.1のdpps）があり、1つの命令だけで結果を得ることができます。

編集：2番目の命令の「mm0, xmm0」を「xmm0, xmm0」に修正しました。

ここにいくつかのリストがあります：

http://softpixel.com/~cwright/programming/simd/sse2.php

score 2 · Accepted Answer

これは、SSEに変換するための「優れた」関数ではありません。実際にはほとんど何も並列ではありません。それでは、一度に4つの光線と交差するように関数を変更しましょう。また、光線がAOS（構造体の配列）ではなくSOA（配列の構造体）に格納されていると便利です。

これらの変更により、次のようになる可能性があります（テストされていません）。

// Intersects four rays with one sphere, one ray per SSE lane.
// The ray origins (rOx/rOy/rOz) and directions (rVx/rVy/rVz) are read as
// packed groups of four floats per component; the sphere centre
// (sOx, sOy, sOz) and radius sR are scalars broadcast to every lane.
// Per lane, xmm0 ends up holding -1.0 where the sign mask is set and
// v - sqrt(d) elsewhere.
// NOTE(review): the operands are written symbolically ([rOx], sOx, ...);
// this looks like pseudo-assembly rather than something an assembler
// accepts as-is - confirm how the pointer parameters get dereferenced.
// NOTE(review): movaps with a memory operand needs 16-byte alignment, so
// the arrays and [minus1] are presumably aligned - confirm.
inline void intersect4RaysSphere(
 float* rOx, float* rOy, float* rOz,
 float* rVx, float* rVy, float* rVz,
 float sOx, float sOy, float sOz,
 float sR)
{
    // calculate Q = sphere centre - ray origin, one component per register
    movss xmm0, sOx
    movss xmm1, sOy
    movss xmm2, sOz
    // broadcast each scalar centre component to all four lanes
    shufps xmm0, xmm0, 0
    shufps xmm1, xmm1, 0
    shufps xmm2, xmm2, 0
    subps xmm0, [rOx]
    subps xmm1, [rOy]
    subps xmm2, [rOz]
    // calculate pow(dot(Q, rV), 2) in xmm3
    movaps xmm3, [rVx]
    movaps xmm4, [rVy]
    movaps xmm5, [rVz]
    mulps xmm3, xmm0
    mulps xmm4, xmm1
    mulps xmm5, xmm2
    addps xmm3, xmm4
    addps xmm3, xmm5
    // keep the unsquared dot product v in xmm4 for the final subtraction
    movaps xmm4, xmm3
    mulps xmm3, xmm3
    // calculate pow(length(Q), 2)
    // there's no point in taking the square root only to then square it
    mulps xmm0, xmm0
    mulps xmm1, xmm1
    mulps xmm2, xmm2
    addps xmm0, xmm1
    addps xmm0, xmm2
    // calculate d = sR*sR - (length(Q)^2 - v*v)
    movss xmm1, sR
    mulss xmm1, xmm1
    shufps xmm1, xmm1, 0
    subps xmm0, xmm3
    subps xmm1, xmm0
    sqrtps xmm1, xmm1
    // test for intersection
    // at this point:
    // xmm3 = v * v
    // xmm4 = v
    // xmm1 = sqrt(d)
    movaps xmm0, [minus1]  // memory location with { -1.0, -1.0, -1.0, -1.0 }
    // xmm4 = v - sqrt(d)
    subps xmm4, xmm1
    // build a per-lane mask by replicating the sign bit of sqrt(d)
    // NOTE(review): d itself was overwritten by sqrtps, so this relies on
    // sqrtps giving a sign-bit-set result for negative d - confirm
    psrad xmm1, 31
    // select -1 where the mask is set, otherwise v - sqrt(d)
    andps xmm0, xmm1
    andnps xmm1, xmm4
    orps xmm0, xmm1
    // NOTE(review): the function is declared void and the result is left in
    // xmm0; a bare ret inside a compiler-generated frame would skip the
    // epilogue - confirm this is meant as a naked routine / pseudo-code
    ret
}

組み込み関数を含むバージョン（わずかにテストされています-コンパイル可能であり、OKアセンブリを生成するようです）：

/*
 * Intersect four rays with one sphere, one ray per SSE lane.
 *
 * rOx, rOy, rOz - x/y/z components of the four ray origins (4 floats each)
 * rVx, rVy, rVz - x/y/z components of the four ray directions (4 floats each)
 *                 NOTE(review): v - sqrt(d) is a distance only for unit
 *                 direction vectors - confirm callers normalize them.
 * sOx, sOy, sOz - sphere centre
 * sR            - sphere radius
 *
 * All six arrays are read with _mm_load_ps and therefore must be 16-byte
 * aligned.
 *
 * Returns, per lane, -1.0f where d < 0 (no intersection) and v - sqrt(d)
 * otherwise, with Q = centre - origin, v = dot(Q, rV) and
 * d = sR*sR - (dot(Q, Q) - v*v).
 */
__m128 intersect4RaysSphere(
     float* rOx, float* rOy, float* rOz,
     float* rVx, float* rVy, float* rVz,
     float sOx, float sOy, float sOz,
     float sR)
{
    /* Q = sphere centre - ray origin, one register per component. */
    const __m128 Qx = _mm_sub_ps(_mm_set1_ps(sOx), _mm_load_ps(rOx));
    const __m128 Qy = _mm_sub_ps(_mm_set1_ps(sOy), _mm_load_ps(rOy));
    const __m128 Qz = _mm_sub_ps(_mm_set1_ps(sOz), _mm_load_ps(rOz));

    /* v = dot(Q, rV) */
    const __m128 v = _mm_add_ps(_mm_mul_ps(Qx, _mm_load_ps(rVx)),
                     _mm_add_ps(_mm_mul_ps(Qy, _mm_load_ps(rVy)),
                                _mm_mul_ps(Qz, _mm_load_ps(rVz))));
    const __m128 vsquared = _mm_mul_ps(v, v);

    /* |Q|^2 - no square root is needed because only the square is used. */
    const __m128 lengthQsquared = _mm_add_ps(_mm_mul_ps(Qx, Qx),
                                  _mm_add_ps(_mm_mul_ps(Qy, Qy),
                                             _mm_mul_ps(Qz, Qz)));

    const __m128 sr = _mm_set1_ps(sR);
    const __m128 d = _mm_sub_ps(_mm_mul_ps(sr, sr),
                                _mm_sub_ps(lengthQsquared, vsquared));

    /* An ordered compare gives the same decision as the scalar `d < 0`
     * test; the earlier sign-bit shift also flagged d == -0.0f as a miss. */
    const __m128 miss = _mm_cmplt_ps(d, _mm_setzero_ps());

    /* Lanes with negative d produce NaN here; they are masked out below. */
    const __m128 distance = _mm_sub_ps(v, _mm_sqrt_ps(d));

    /* Per-lane select: -1.0f where miss, distance elsewhere. */
    return _mm_or_ps(_mm_and_ps(miss, _mm_set1_ps(-1.0f)),
                     _mm_andnot_ps(miss, distance));
}

assembly - x86 asmに書き換えることによるintersectRaySphere（C言語のプロシージャ）の最適化（方法は？）

2 に答える 2

Related

Reference