
I have put together the following program, which is close to the SDK sample "Simple Layered Texture".

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, kernels
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h>  // helper for shared that are common to CUDA SDK samples

#define EXIT_WAIVED 2

static char *sSDKname = "simpleLayeredTexture";

// includes, kernels
// declare texture reference for layered 2D float texture
// Note: The "dim" field in the texture reference template is now deprecated.
// Instead, please use a texture type macro such as cudaTextureType1D, etc.

typedef int TYPE;

texture<TYPE, cudaTextureType2DLayered> tex;

////////////////////////////////////////////////////////////////////////////////
//! Transform a layer of a layered 2D texture using texture lookups
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel(TYPE *g_odata, int width, int height, int layer)
{
    // calculate this thread's data point
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // 0.5f offset and division are necessary to access the original data points
    // in the texture (such that bilinear interpolation will not be activated).
    // For details, see also CUDA Programming Guide, Appendix D
    float u = (x+0.5f) / (float) width;
    float v = (y+0.5f) / (float) height;

    // read from texture, do expected transformation and write to global memory
    TYPE sample = tex2DLayered(tex, u, v, layer);
    g_odata[layer*width*height + y*width + x] = sample;

    printf("Sample %d\n", sample);
}


////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", sSDKname);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    bool bResult = true;

    // get number of SMs on this GPU
    cudaDeviceProp deviceProps;

    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
    printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

    if (deviceProps.major < 2)
    {
        printf("%s requires SM >= 2.0 to support Texture Arrays.  Test will be waived... \n", sSDKname);
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    // generate input data for layered texture
    unsigned int width=16, height=16, num_layers = 5;
    unsigned int size = width * height * num_layers * sizeof(TYPE);
    TYPE *h_data = (TYPE *) malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++)
        {
            h_data[layer*width*height + i] = 15;//(float)i;
        }

    // this is the expected transformation of the input data (the expected output)
    TYPE *h_data_ref = (TYPE *) malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++)
        {
            h_data_ref[layer*width*height + i] = h_data[layer*width*height + i];
        }

    // allocate device memory for result
    TYPE *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **) &d_data, size));

    // allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<TYPE>();
    cudaArray *cu_3darray;
    checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos = make_cudaPos(0,0,0);
    myparms.dstPos = make_cudaPos(0,0,0);
    myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(TYPE), width, height);
    myparms.dstArray = cu_3darray;
    myparms.extent = make_cudaExtent(width, height, num_layers);
    myparms.kind = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

    // set texture parameters
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
//    tex.filterMode = cudaFilterModeLinear;
    tex.filterMode = cudaFilterModePoint;
    tex.normalized = true;  // access with normalized texture coordinates

    // Bind the array to the texture
    checkCudaErrors(cudaBindTextureToArray(tex, cu_3darray, channelDesc));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has 8 x 8 threads\n",
           width, height, dimGrid.x, dimGrid.y);

    transformKernel<<< dimGrid, dimBlock >>>(d_data, width, height, 0);  // warmup (for better timing)

    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // execute the kernel
    for (unsigned int layer = 0; layer < num_layers; layer++)
        transformKernel<<< dimGrid, dimBlock, 0 >>>(d_data, width, height, layer);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (width *height *num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    TYPE *h_odata = (TYPE *) malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
    bResult = compareData(h_odata, h_data_ref, width*height*num_layers, MIN_EPSILON_ERROR, 0.0f);

    printf("Host sample: %d == %d\n", h_data_ref[0], h_odata[0]);

    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

    cudaDeviceReset();

    if (bResult)
        printf("Success!");
    else
        printf("Failure!");

    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

With int (or uint) as TYPE, the output is correct. With float, wrong results are produced, namely always 0 (even though the SDK's compareData function claims everything is fine!?). I am starting to believe there is a bug in CUDA. I am using version 5.0 on a Kepler K20.

Suggestions or test results are very welcome. The code should run as-is.

Thanks in advance, Ben

Edit: The OS is Linux (Ubuntu 12.04.2 LTS) x86_64 3.2.0-38-generic.


1 Answer


The problem here is that if you change only this:

typedef int TYPE;

to this:

typedef float TYPE;

then the following line in the kernel is no longer correct:

printf("Sample %d\n", sample);
               ^^

because the printf format specifier %d is not correct for the type float. If you change that specifier to %f, you get the expected output, as shown in the full listing and run below.
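For reference, the mismatch can be isolated in a few lines. The following is a minimal standalone sketch (a hypothetical test program, not part of the SDK sample); built with nvcc, the %d line prints garbage (often 0) while the %f line prints 15.000000:

#include <cstdio>
#include <cuda_runtime.h>

// Print the same float value with both specifiers to compare the output.
__global__ void specifierDemo()
{
    float sample = 15.0f;
    printf("with %%d: %d\n", sample);   // mismatched specifier: undefined output (often 0 or garbage)
    printf("with %%f: %f\n", sample);   // matching specifier: prints 15.000000
}

int main()
{
    specifierDemo<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}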

$ cat t1519.cu
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, kernels
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h>  // helper for shared that are common to CUDA SDK samples

#define EXIT_WAIVED 2

static char *sSDKname = "simpleLayeredTexture";

// includes, kernels
// declare texture reference for layered 2D float texture
// Note: The "dim" field in the texture reference template is now deprecated.
// Instead, please use a texture type macro such as cudaTextureType1D, etc.

typedef float TYPE;

texture<TYPE, cudaTextureType2DLayered> tex;

////////////////////////////////////////////////////////////////////////////////
//! Transform a layer of a layered 2D texture using texture lookups
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel(TYPE *g_odata, int width, int height, int layer)
{
    // calculate this thread's data point
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // 0.5f offset and division are necessary to access the original data points
    // in the texture (such that bilinear interpolation will not be activated).
    // For details, see also CUDA Programming Guide, Appendix D
    float u = (x+0.5f) / (float) width;
    float v = (y+0.5f) / (float) height;

    // read from texture, do expected transformation and write to global memory
    TYPE sample = tex2DLayered(tex, u, v, layer);
    g_odata[layer*width*height + y*width + x] = sample;

    printf("Sample %f\n", sample);
}


////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", sSDKname);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    bool bResult = true;

    // get number of SMs on this GPU
    cudaDeviceProp deviceProps;

    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
    printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

    if (deviceProps.major < 2)
    {
        printf("%s requires SM >= 2.0 to support Texture Arrays.  Test will be waived... \n", sSDKname);
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    // generate input data for layered texture
    unsigned int width=16, height=16, num_layers = 5;
    unsigned int size = width * height * num_layers * sizeof(TYPE);
    TYPE *h_data = (TYPE *) malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++)
        {
            h_data[layer*width*height + i] = 15;//(float)i;
        }

    // this is the expected transformation of the input data (the expected output)
    TYPE *h_data_ref = (TYPE *) malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++)
        {
            h_data_ref[layer*width*height + i] = h_data[layer*width*height + i];
        }

    // allocate device memory for result
    TYPE *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **) &d_data, size));

    // allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<TYPE>();
    cudaArray *cu_3darray;
    checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos = make_cudaPos(0,0,0);
    myparms.dstPos = make_cudaPos(0,0,0);
    myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(TYPE), width, height);
    myparms.dstArray = cu_3darray;
    myparms.extent = make_cudaExtent(width, height, num_layers);
    myparms.kind = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

    // set texture parameters
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
//    tex.filterMode = cudaFilterModeLinear;
    tex.filterMode = cudaFilterModePoint;
    tex.normalized = true;  // access with normalized texture coordinates

    // Bind the array to the texture
    checkCudaErrors(cudaBindTextureToArray(tex, cu_3darray, channelDesc));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has 8 x 8 threads\n",
           width, height, dimGrid.x, dimGrid.y);

    transformKernel<<< dimGrid, dimBlock >>>(d_data, width, height, 0);  // warmup (for better timing)

    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // execute the kernel
    for (unsigned int layer = 0; layer < num_layers; layer++)
        transformKernel<<< dimGrid, dimBlock, 0 >>>(d_data, width, height, layer);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (width *height *num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    TYPE *h_odata = (TYPE *) malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
    bResult = compareData(h_odata, h_data_ref, width*height*num_layers, MIN_EPSILON_ERROR, 0.0f);

    printf("Host sample: %d == %d\n", h_data_ref[0], h_odata[0]);

    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

    cudaDeviceReset();

    if (bResult)
        printf("Success!");
    else
        printf("Failure!");

    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1519.cu -o t1519
t1519.cu(15): warning: conversion from a string literal to "char *" is deprecated

t1519.cu(15): warning: conversion from a string literal to "char *" is deprecated

[user2@dc10 misc]$ cuda-memcheck ./t1519
========= CUDA-MEMCHECK
[simpleLayeredTexture] - Starting...
GPU Device 0: "Tesla V100-PCIE-32GB" with compute capability 7.0

CUDA device [Tesla V100-PCIE-32GB] has 80 Multi-Processors SM 7.0
Covering 2D data array of 16 x 16: Grid size is 2 x 2, each block has 8 x 8 threads
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
...
Sample 15.000000
Sample 15.000000
Sample 15.000000
Processing time: 13.991 msec
0.09 Mtexlookups/sec
Comparing kernel output to expected data
Host sample: 8964432 == 1
Success!========= ERROR SUMMARY: 0 errors
$

Note that the final output line is still not correct, because the wrong format specifier in that printf has not been fixed:

printf("Host sample: %d == %d\n", h_data_ref[0], h_odata[0]);
answered 2019-09-28T21:29:01.443