cuda - CUDA PTX f32.f32 テクスチャ読み取り

Question

浮動小数点インデックスを直接使用して CUDA テクスチャから読み取ることは可能ですか?たとえば、PTX 命令 tex.1d.v4.f32.f32 を使う場合です。

これにより、.ptx ファイルを見ると 2 つの命令が節約されているように見え、これはベンチマーク時のパフォーマンス向上にも反映されています。ただし、かなり重大な欠点として、問題なく実行されているように見えても、期待した結果が得られません。

以下のコードは、この問題を示しています。

#include "cuda.h"
#include <cstdio>
#include <cstdlib>
#include <thrust/device_vector.h>

//global 1D texture reference whose element type is float, declared with
//cudaReadModeElementType; it is bound to device memory in main() via
//cudaBindTexture and referred to by name ("tex") inside the inline PTX
//of tex_load
texture<float, cudaTextureType1D, cudaReadModeElementType> tex;

//hand-written PTX texture lookup on the global texture `tex` using the
//float-coordinate form tex.1d.v4.f32.f32
//  idx     : coordinate passed to the instruction as an f32 operand
//  returns : the first of the four f32 components the instruction produces
//NOTE(review): with `tex` bound through cudaBindTexture this lookup is
//reported to yield 0.0 where tex1Dfetch yields the stored value. Presumably
//float coordinates are only honoured for CUDA-array-backed textures and
//linear-memory bindings need the .s32 coordinate form - confirm against the
//PTX ISA before relying on this function.
__device__
float tex_load(float idx)
{
    //the instruction always writes a 4-component vector; only c0 is used
    float c0, c1, c2, c3;
    asm("tex.1d.v4.f32.f32 {%0, %1, %2, %3}, [tex, {%4}];"
        : "=f"(c0), "=f"(c1), "=f"(c2), "=f"(c3)
        : "f"(idx));
    return c0;
}

//kernel: read coordinate 0 from the texture twice - once with the built-in
//tex1Dfetch and once with the hand-written tex_load - and print both values
//on one line so they can be compared
__global__ void read(){
    const float coord = 0.0f;
    const float via_fetch = tex1Dfetch(tex, coord);
    const float via_ptx = tex_load(coord);
    printf("tex1Dfetch: %f    tex_load: %f\n", via_fetch, via_ptx);
}

//print a readable message and terminate if a CUDA runtime call failed
//  status : value returned by the runtime call
//  what   : short label identifying the call in the error message
static void check_cuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

//host driver: put one float on the device, bind it to the global texture,
//run the comparison kernel once and unbind again
//returns 0 on success; terminates with EXIT_FAILURE if any CUDA call fails
int main()
{
    //create a vector of size 1 with x[0]=3.14 (float literal, no double round-trip)
    thrust::device_vector<float> x(1, 3.14f);
    float* x_ptr = thrust::raw_pointer_cast(x.data());

    //bind the texture over the whole vector instead of a hard-coded size
    check_cuda(cudaBindTexture(0, tex, x_ptr, x.size() * sizeof(float)),
               "cudaBindTexture");

    //launch a single thread single block kernel
    read<<<1,1>>>();
    //a launch returns no status itself: query it explicitly
    check_cuda(cudaGetLastError(), "read kernel launch");
    //wait for the kernel so execution errors surface here and the device
    //printf output is flushed before the texture is unbound
    check_cuda(cudaDeviceSynchronize(), "read kernel execution");

    check_cuda(cudaUnbindTexture(tex), "cudaUnbindTexture");
    return 0;
}

いくつかのカード (K40、C2070) といくつかの CUDA バージョン (6.0、7.0) でこれを試しましたが、すべて同じ出力が得られます。

tex1Dfetch: 3.140000    tex_load: 0.000000

これは可能ですか、それとも見当違いのことをしているのでしょうか?

cuda - CUDA PTX f32.f32 テクスチャ読み取り

1 に答える 1

Related

Reference