cudaBindTexture2D の呼び出しでエラーが発生しています。問題を再現するために次のコードを作成しましたが、こちらは正常に動作します:
#include "cuda.h"
#include <stdio.h>
// Wraps a CUDA runtime call so that a failure is reported together with the call site (file and line).
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
// Returns silently when err is cudaSuccess; otherwise prints the numeric error code and its
// message string to stderr and terminates the process with exit status -1.
inline void __checkCudaErrors(cudaError err, const char *file, const int line )
{
    if(err == cudaSuccess)
    {
        return;
    }
    const int code = (int)err;
    const char *message = cudaGetErrorString( err );
    fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n", file, line, code, message);
    exit(-1);
}
// 2D float texture reference that main() binds to linear device memory.
texture<float,2> myTex;
// Minimal repro: uploads a 656x480 float image, binds it to myTex as a 2D texture,
// then unbinds and releases everything. Every CUDA call is checked; any failure
// prints the error and exits through checkCudaErrors.
int main(int argc, char* argv[])
{
    const int width = 656;
    const int height = 480;
    const size_t pixelCount = (size_t)width * (size_t)height;
    const size_t imageBytes = pixelCount * sizeof(float);

    // Host image filled with a ramp so every texel has a distinct value.
    float* input = new float[pixelCount];
    for(size_t i = 0; i < pixelCount; ++i)
    {
        input[i] = (float)i;
    }

    float* inputDevice = NULL;
    checkCudaErrors(cudaMalloc((void**)&inputDevice, imageBytes));
    checkCudaErrors(cudaMemcpy(inputDevice, input, imageBytes, cudaMemcpyHostToDevice));
    // The host copy is no longer needed once the upload has completed.
    delete[] input;

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    // NOTE(review): the pitch is the tightly packed row size; cudaBindTexture2D has a
    // device-specific pitch alignment requirement -- cudaMallocPitch would guarantee it.
    checkCudaErrors(cudaBindTexture2D(0, myTex, inputDevice, desc, width, height, sizeof(float) * width));

    checkCudaErrors(cudaUnbindTexture(myTex));
    checkCudaErrors(cudaFree(inputDevice));
    return 0;
}
しかし、実際のプロジェクトでは、ほぼ同じはずのコードが動作しません。
texture<float,2> texInput;
/* a lot of code here, but nothing with texInput */
void CUDAConv::DoConvolution(float* input, float* kernel1D, float* resultMap, unsigned char* rMap, unsigned char* orientMap, int width, int height, int kernelSize)
{
int fDim = (int)(floor((sqrt((float)(width * width + height * height)) / 2 + 0.5f)));
//Lock* locks = new Lock[width * height];
int dim = fDim * 2 + 1;
devWidth = width;
devHeight = height;
// allocate memory on GPU for the summing images
checkCudaErrors(cudaMalloc((void**)&inputDevice, width * height * sizeof(float)));
checkCudaErrors(cudaMemcpy(inputDevice, input, width * height * sizeof(float), cudaMemcpyHostToDevice));
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
checkCudaErrors(cudaMalloc((void**)&kernel1DDevice, kernelSize));
checkCudaErrors(cudaBindTexture2D(0, texInput, inputDevice, desc, width, height, sizeof(float) * width));
checkCudaErrors(cudaMalloc((void**)&radiusDevice, width * height));
checkCudaErrors(cudaMalloc((void**)&orientationDevice, width * height));
checkCudaErrors(cudaMalloc((void**)&resultDevice, width * height * sizeof(float)));
checkCudaErrors(cudaMemset(resultDevice, 0x00, width * height * sizeof(float)));
checkCudaErrors(cudaMemcpy(kernel1DDevice, kernel1D, kernelSize * sizeof(float), cudaMemcpyHostToDevice));
for(int i = 0; i < angles; ++i)
{
checkCudaErrors(cudaMalloc((void**)&sumUpImage[i], angles * width * height * sizeof(float)));
checkCudaErrors(cudaMemset(&sumUpImage[i], 0x00, width * height * angles * sizeof(float)));
checkCudaErrors(cudaMalloc((void**)&rotationImage[i], angles * dim * dim * sizeof(float)));
checkCudaErrors(cudaMemset(&rotationImage[i], 0x00, dim * dim * angles * sizeof(float)));
}
// do all convolution calculations in the Convolution function
convolution <<<1, angles>>> (/*locks, */inputDevice, kernel1DDevice, rotationImage, sumUpImage, resultDevice, radiusDevice, orientationDevice, devWidth, devHeight, angles);
checkCudaErrors(cudaMemcpy(resultMap, resultDevice, width * height * sizeof(float), cudaMemcpyDeviceToHost));
// free memory allocated on the GPU
for(int i = 0; i < angles; ++i)
{
checkCudaErrors(cudaFree(sumUpImage[i]));
}
//free(locks);
cudaUnbindTexture(texInput);
cudaFree(inputDevice);
cudaFree(kernel1DDevice);
cudaFree(radiusDevice);
cudaFree(orientationDevice);
cudaFree(resultDevice);
}
結果としてcudaBindTexture2Dで発生するエラーは次のとおりです。
CUDA_Conv.cu(203) : CUDA Runtime API error 18: invalid texture reference.
デバッガで texInput を確認すると myTex と同じ内容に見えるため、何が起きているのか分かりません。
環境は Visual Studio 2010 と CUDA 4.2 です。