c++ - 大きな画像でのCUDAメモリ割り当ての問題

Question

画像からヒストグラムを作成する関数があります（逐次バージョンをもとにした宿題です）。

// Host-side driver for the GPU histogram: allocates the host/device histogram
// buffers, uploads the grayscale image, runs histogram1D and copies the
// result back. Every CUDA call is checked and any failure aborts with -1, so
// a failed allocation, copy or kernel launch can no longer be mistaken for an
// all-zero histogram.
CImg< unsigned char > histogramImage = CImg< unsigned char >(BAR_WIDTH * HISTOGRAM_SIZE, HISTOGRAM_SIZE, 1, 1);

// calloc hands back zero-initialised memory, replacing the malloc + memset pair.
unsigned int *histogram = static_cast< unsigned int * >(calloc(HISTOGRAM_SIZE, sizeof(unsigned int)));
if (histogram == NULL)
{
    std::cout << "ERROR: Failed malloc" << std::endl;
    return -1;
}

// Compute the byte count in size_t so that width * height cannot overflow int.
const size_t grayImageBytes = static_cast< size_t >(grayImage.width()) * static_cast< size_t >(grayImage.height()) * sizeof(unsigned char);

cuda_err = cudaMemset(gpuImage, 0, grayImageBytes);
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed cudaMemset" << " (" << cudaGetErrorString(cuda_err) << ")" << std::endl;
    return -1;
}

cuda_err = cudaMemcpy(gpuImage, grayImage, grayImageBytes, cudaMemcpyHostToDevice);
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed cudaMemcpy" << " (" << cudaGetErrorString(cuda_err) << ")" << std::endl;
    return -1;
}

unsigned int *gpuhistogram;
cuda_err = cudaMalloc((void **)(&gpuhistogram), HISTOGRAM_SIZE * sizeof(unsigned int));
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed cudaMalloc" << " (" << cudaGetErrorString(cuda_err) << ")" << std::endl;
    // gpuhistogram is not a valid device pointer here; do not continue with it.
    return -1;
}

cuda_err = cudaMemset(gpuhistogram, 0, HISTOGRAM_SIZE * sizeof(unsigned int));
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed cudaMemset" << " (" << cudaGetErrorString(cuda_err) << ")" << std::endl;
    return -1;
}

histogram1D(gpuImage, histogramImage, grayImage.width(), grayImage.height(), gpuhistogram, HISTOGRAM_SIZE, BAR_WIDTH, total, gridSize, blockSize);

// histogram1D returns void, and a kernel launch reports no error by itself.
// An invalid launch configuration (e.g. a grid dimension above the device
// limit) is only visible through cudaGetLastError().
cuda_err = cudaGetLastError();
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed histogram kernel" << " (" << cudaGetErrorString(cuda_err) << ")" << std::endl;
    return -1;
}

cuda_err = cudaMemcpy(histogram, gpuhistogram, HISTOGRAM_SIZE * sizeof(unsigned int), cudaMemcpyDeviceToHost);
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed cudaMemcpy" << " (" << cudaGetErrorString(cuda_err) << ")" << std::endl;
    // The host histogram does not hold device results; stop here.
    return -1;
}

これが呼び出す関数は次のとおりです。

/**
 * Launches the histo kernel over a device-resident grayscale image, waits for
 * it to finish and prints the elapsed kernel time.
 *
 * @param grayImage   device pointer to the input pixels, passed to the kernel
 * @param width       row length in pixels, passed to the kernel
 * @param histogram   device pointer to the bin counters, passed to the kernel
 * @param grid_size   grid dimensions used for the launch
 * @param block_size  block dimensions used for the launch
 *
 * histogramImage, height, HISTOGRAM_SIZE, BAR_WIDTH and timer are accepted for
 * interface compatibility but are not used by this function.
 *
 * Launch and execution errors are reported on stderr. The launch error is
 * only peeked at (not cleared), so a caller can still retrieve it with
 * cudaGetLastError().
 */
void histogram1D(unsigned char *grayImage, unsigned char *histogramImage, const int width, const int height, unsigned int *histogram, const unsigned int HISTOGRAM_SIZE, const unsigned int BAR_WIDTH, NSTimer &timer, dim3 grid_size, dim3 block_size) {

    NSTimer kernelTime = NSTimer("kernelTime", false, false);

    kernelTime.start();
    histo <<< grid_size, block_size >>> (grayImage, histogram, width);
    // A kernel launch returns nothing: an invalid configuration (for example a
    // grid dimension above the device limit) simply does not run the kernel.
    const cudaError_t launchStatus = cudaPeekAtLastError();
    // Faults raised while the kernel executes surface at the next synchronisation.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    kernelTime.stop();

    if (launchStatus != cudaSuccess) {
        cerr << "ERROR: histo kernel launch failed: " << cudaGetErrorString(launchStatus) << endl;
    } else if (syncStatus != cudaSuccess) {
        cerr << "ERROR: histo kernel execution failed: " << cudaGetErrorString(syncStatus) << endl;
    }

    cout << fixed << setprecision(6);
    cout << "histogram1D (kernel): \t\t" << kernelTime.getElapsed() << " seconds." << endl;
}

カーネル関数は次のとおりです。

/**
 * Histogram kernel: each thread reads one pixel and atomically increments the
 * bin selected by that pixel's value.
 *
 * Launch layout: 2-D grid of 2-D blocks; thread (x, y) handles the pixel at
 * column x, row y.
 *
 * @param inputImage  device pointer to the pixels, row-major with row length `width`
 * @param histogram   device pointer to the bin counters; must have one entry
 *                    for every possible pixel value (256 for unsigned char)
 * @param width       row length in pixels
 *
 * Precondition: the grid must not extend past the last image row. The image
 * height is not available to this kernel, so rows cannot be bounds-checked here.
 */
__global__ void histo(unsigned char *inputImage, unsigned int *histogram, int width)
{
    const int x = threadIdx.x + (blockIdx.x * blockDim.x);
    const int y = threadIdx.y + (blockIdx.y * blockDim.y);

    // When the grid is wider than the image, threads past the row end would
    // wrap into the next row (counting its pixels twice) or, on the last row,
    // read beyond the buffer.
    if (x >= width)
    {
        return;
    }

    // Index in size_t so that y * width cannot overflow int on large images.
    const size_t pixel = static_cast< size_t >(y) * static_cast< size_t >(width) + static_cast< size_t >(x);
    const unsigned int index = static_cast< unsigned int >(inputImage[pixel]);
    atomicAdd(&histogram[index], 1u);
}

問題は次のとおりです。1024x1024 から 3543x2480 までのサイズの画像でこれを呼び出すと、正しく動作します。しかし、8192x8192 の画像が1つあり、その場合は関数から戻っても *histogram の値がすべて 0 のままです。いろいろ試した限りでは、*gpuhistogram のメモリ割り当てに関係しているように思えます（unsigned int なら十分な大きさのはずですよね？）。というのも、逐次バージョンは正しく動作するからです。どうすれば修正できますか？何か考えはありますか？

score 0 · Accepted Answer

カードを確認してください。ウィキペディアから：

| 技術仕様 ＼ 計算能力（バージョン） | 1.0 / 1.1 / 1.2 / 1.3 | 2.x | 3.0 / 3.5 |
|---|---|---|---|
| スレッドブロックのグリッドの最大次元数 | 2 | 3 | 3 |
| スレッドブロックのグリッドの最大 x、y、または z 次元 | 65535 | 65535 | 2^31-1 |

また、このヒストグラムの性能はCPUコードよりも悪いと思われます。値が256通りであることを前提に、共有メモリなどを使ってください。コツは、1ブロックあたりのスレッド数をビンの数と同じ（1ブロックあたり256スレッド）にすることです。著者の収入を損ないたくないので、詳細は『CUDA by Example』（2010年）を参照してください。

score 0 · Accepted Answer

補足です。ミハイルさんの回答に従って、現在は次のようにしています。

/**
 * Launches the shared-memory histo kernel over a device-resident grayscale
 * image, waits for it to finish and prints the elapsed kernel time in
 * milliseconds.
 *
 * @param grayImage  device pointer to the input pixels, passed to the kernel
 * @param width      image width in pixels, passed to the kernel
 * @param height     image height in pixels, passed to the kernel
 * @param histogram  device pointer to the bin counters, passed to the kernel
 *
 * histogramImage, HISTOGRAM_SIZE, BAR_WIDTH, timer, grid_size and block_size
 * are accepted for interface compatibility but are not used: the launch
 * configuration is derived from the device instead.
 *
 * Launch and execution errors are reported on stderr. The launch error is
 * only peeked at (not cleared), so a caller can still retrieve it with
 * cudaGetLastError().
 */
void histogram1D(unsigned char *grayImage, unsigned char *histogramImage, const int width, const int height, unsigned int *histogram, const unsigned int HISTOGRAM_SIZE, const unsigned int BAR_WIDTH, NSTimer &timer, dim3 grid_size, dim3 block_size) {

    NSTimer kernelTime = NSTimer("kernelTime", false, false);

    // Block size is kept at 256, the value the original launch used.
    // NOTE(review): histo presumably relies on 256 threads per block - confirm.
    const unsigned int threadsPerBlock = 256;

    // Two blocks per multiprocessor. The original hard-coded 15 * 2 for one
    // specific device; that value is kept only as a fallback if the query fails.
    int blockCount = 15 * 2;
    int device = 0;
    int multiprocessors = 0;
    if (cudaGetDevice(&device) == cudaSuccess &&
        cudaDeviceGetAttribute(&multiprocessors, cudaDevAttrMultiProcessorCount, device) == cudaSuccess &&
        multiprocessors > 0) {
        blockCount = multiprocessors * 2;
    }

    kernelTime.start();
    // Kernel
    histo <<< blockCount, threadsPerBlock >>> (grayImage, histogram, width, height);
    // A kernel launch returns nothing; configuration errors must be queried.
    const cudaError_t launchStatus = cudaPeekAtLastError();
    // Launches are asynchronous: without this wait the timer would measure only
    // the launch overhead, not the kernel. It is also where faults raised
    // during execution are reported.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    kernelTime.stop();

    if (launchStatus != cudaSuccess) {
        cerr << "ERROR: histo kernel launch failed: " << cudaGetErrorString(launchStatus) << endl;
    } else if (syncStatus != cudaSuccess) {
        cerr << "ERROR: histo kernel execution failed: " << cudaGetErrorString(syncStatus) << endl;
    }

    cout << fixed << setprecision(6);
    cout << "histogram1D (kernel): \t\t" << kernelTime.getElapsed()*1000 << " milliseconds." << endl;
}

カーネルコード：

/**
 * Histogram kernel with per-block privatisation: each block accumulates
 * counts in a 256-entry shared-memory array and then merges that array into
 * the global histogram with one atomic add per bin.
 *
 * Launch layout: 1-D grid of 1-D blocks (only the x components are used).
 * Any block size and any grid size produce the correct result.
 * Shared memory: 256 * sizeof(unsigned int) bytes, statically allocated.
 *
 * @param inputImage  device pointer to width * height pixels
 * @param histogram   device pointer to 256 bin counters; counts are added to
 *                    the existing values, so the caller must zero it first
 * @param width       image width in pixels
 * @param height      image height in pixels
 */
__global__ void histo(unsigned char *inputImage, unsigned int *histogram, int width, int height)
{
    __shared__ unsigned int temp[256];

    // Clear all 256 bins regardless of how many threads the block has.
    for (unsigned int bin = threadIdx.x; bin < 256u; bin += blockDim.x)
    {
        temp[bin] = 0;
    }
    __syncthreads();

    // Pixel count in size_t so that width * height cannot overflow int.
    const size_t pixelCount = (width > 0 && height > 0)
        ? static_cast< size_t >(width) * static_cast< size_t >(height)
        : 0;

    // Stride is the total number of threads in the grid. It was previously
    // blockDim.y * gridDim.x, which for a 1-D block is just gridDim.x and made
    // threads revisit each other's pixels, inflating every count.
    const size_t stride = static_cast< size_t >(blockDim.x) * static_cast< size_t >(gridDim.x);
    size_t i = static_cast< size_t >(blockIdx.x) * static_cast< size_t >(blockDim.x) + threadIdx.x;
    while (i < pixelCount)
    {
        atomicAdd(&temp[inputImage[i]], 1u);
        i += stride;
    }

    // All threads reach this barrier: the loops above have no early exit.
    __syncthreads();

    // Merge this block's partial counts into the global histogram.
    for (unsigned int bin = threadIdx.x; bin < 256u; bin += blockDim.x)
    {
        atomicAdd(&(histogram[bin]), temp[bin]);
    }
}

c++ - 大きな画像でのCUDAメモリ割り当ての問題

2 に答える 2

Related

Reference