
I wrote some CUDA code that basically sums an array. The array size N has to be a power of 2, i.e. 2^x. However, my code is not working correctly; for example, where the output should be 150177410, my code produces 150177408. I have been trying to debug this for the past 5 hours. Any help would be appreciated. Here is the code:

// only for an array size of 2^x and TPB of 2^y, since godata's size equals the number
// of blocks, and the number of blocks is itself a power of 2 when those conditions hold
// works for an arbitrarily large array as long as its size is of the form 2^x


#include <stdio.h>
#include <stdlib.h>

__global__ void computeAddShared(int *in , int *out, int sizeInput){
    //parameters deliberately not named gidata/godata, to emphasize that parameters receive a copy of the address and differ from the host-code pointers
    extern __shared__ float temp[];
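    // NOTE: temp is declared float while in/out are int. A float carries a 24-bit
    // mantissa, so integer sums above 2^24 get rounded: 150177410 is not exactly
    // representable as a float and rounds to 150177408, matching the discrepancy above.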

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int ltid = threadIdx.x;
    temp[ltid] = 0;
    while(tid < sizeInput){
        temp[ltid] += in[tid];
        tid+=gridDim.x * blockDim.x; // to handle array of any size
    }
    __syncthreads();
    int offset = 1;
    while(offset < blockDim.x){
        if(ltid % (offset * 2) == 0){
            temp[ltid] = temp[ltid] + temp[ltid + offset];
        }
        __syncthreads();
        offset*=2;
    }
    if(ltid == 0){
        out[blockIdx.x] = temp[0];
    }

}

int main(){



    int N = 8192;//should be 2^sth
    int size = N;
    int *a = (int*)malloc(N * sizeof(int));
    /* to create the random numbers:
    FILE *f;
    f = fopen("invertedList.txt", "w");
    a[0] = 1 + (rand() % 8);
    fprintf(f, "%d,", a[0]);
    for (int i = 1; i < N; i++){
        a[i] = a[i-1] + (rand() % 8) + 1;
        fprintf(f, "%d,", a[i]);
    }
    fclose(f);
    return 0; */
    FILE *f;
    f = fopen("invertedList.txt","r");
    if( f == NULL){
            printf("File not found\n");
            system("pause");
            exit(1);
    }
    int count = 0 ;
    long actualSum = 0;
    for( int i =0 ; i < N ; i++){
        fscanf(f, "%d,", &a[count]);
        actualSum+=a[count];
        count++;
    }
    fclose(f);
    printf("The actual sum is %d\n",actualSum);
    int* gidata;
    int* godata;
    cudaMalloc((void**)&gidata, N* sizeof(int));
    cudaMemcpy(gidata,a, size * sizeof(int), cudaMemcpyHostToDevice);
    int TPB  = 256;
    int blocks = 10; //to get things kicked off
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    while(blocks != 1 ){
        if(size < TPB){
            TPB  = size; // size is 2^sth
        }
        blocks  = (size+ TPB -1 ) / TPB;
        cudaMalloc((void**)&godata, blocks * sizeof(int));
        computeAddShared<<<blocks, TPB,TPB*sizeof(int)>>>(gidata, godata,size);
        //cudaFree(gidata);
        gidata = godata;
        size = blocks;
    }
    //printf("The error by cuda is %s",cudaGetErrorString(cudaGetLastError()));


    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime; 
    cudaEventElapsedTime(&elapsedTime , start, stop);
    printf("time is %f ms", elapsedTime);
    int *output = (int*)malloc(sizeof(int));
    cudaMemcpy(output, gidata,size *  sizeof(int), cudaMemcpyDeviceToHost);
    // can't free either one earlier, as both point to the same location
    cudaError_t chk = cudaFree(godata);
    if(chk!=0){
        printf("First chk also printed error. Maybe error in my logic\n");
    }

    printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError()));
    printf("The sum of the array is %d\n", output[0]);
    getchar();

    return 0;
}

1 Answer


As talonmies already said, the kernel itself is fine. It is basically the reduce0 kernel of the CUDA SDK, improved by algorithm cascading in the sense of Brent's optimization, the same idea used to improve the reduce5 kernel into reduce6 in that SDK: each thread first accumulates several input elements sequentially before the shared-memory tree reduction begins.

That the kernel works correctly can be shown by the following test code, which compares the performance of the OP's kernel, denoted reduce0_stackoverflow, against the reduce0 kernel of the CUDA SDK. The reduce0_stackoverflow kernel also carries, in comments, the corresponding lines of reduce0 for reference.

For the test case below, reduce0_stackoverflow runs in 0.030ms against 0.049ms for reduce0 on a GeForce GT540M card.

Note that the code below does not require the array size to be a power of 2: when N is smaller than BLOCKSIZE, the thread count is rounded up to the next power of 2, and out-of-range loads are guarded in both kernels.

#include <cstdio>
#include <cstdlib>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#define BLOCKSIZE 256

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}

/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}
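// e.g. nextPow2(15336) == 16384, nextPow2(256) == 256, nextPow2(1) == 1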

/*************************************/
/* CHECK IF A NUMBER IS A POWER OF 2 */
/*************************************/
bool isPow2(unsigned int x)
{
    return ((x&(x-1))==0);
}
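// note: this bit trick also returns true for x == 0, since 0 & (0-1) == 0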

/******************/
/* REDUCE0 KERNEL */
/******************/
/* This reduction interleaves which threads are active by using the modulo
    operator.  This operator is very expensive on GPUs, and the interleaved
    inactivity means that no whole warps are active, which is also very
    inefficient */
template <class T>
__global__ void reduce0(T *g_idata, T *g_odata, unsigned int N)
{
    extern __shared__ T sdata[];

    unsigned int tid    = threadIdx.x;                              // Local thread index
    unsigned int i      = blockIdx.x * blockDim.x + threadIdx.x;    // Global thread index

    // --- Loading data to shared memory
    sdata[tid] = (i < N) ? g_idata[i] : 0;

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory
    for (unsigned int s=1; s < blockDim.x; s *= 2)
    {
        // --- Only the threads with index multiple of 2*s perform additions. Furthermore, modulo arithmetic is slow.       
        if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; }
        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();
    }

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

/***************************/
/* REDUCE0 - STACKOVERFLOW */
/***************************/
template <class T>
__global__ void reduce0_stackoverflow(T *g_idata , T *g_odata, unsigned int N) {

    extern __shared__ T sdata[];

    unsigned int tid    = threadIdx.x;                              // Local thread index
    unsigned int i      = blockIdx.x * blockDim.x + threadIdx.x;    // Global thread index

    // --- Loading data to shared memory
    //sdata[tid] = (i < N) ? g_idata[i] : 0;                        // CUDA SDK
    sdata[tid] = 0;
    while(i < N){
        sdata[tid] += g_idata[i];
        i+=gridDim.x * blockDim.x; // to handle array of any size
    }

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory
    //  for (unsigned int s=1; s < blockDim.x; s *= 2)                  // CUDA SDK
    //  {
    //     if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; }
    //     __syncthreads();
    //  }
    unsigned int s = 1;
    while(s < blockDim.x) 
    {
        // --- Only the threads with index multiple of 2*s perform additions. Furthermore, modulo arithmetic is slow.       
        if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; }

        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();

        s*=2;
    }

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];

}

/********/
/* MAIN */
/********/
int main()
{
    const int N = 15336;

    thrust::device_vector<int> d_vec(N,3);

    int NumThreads  = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
    int NumBlocks   = (N + NumThreads - 1) / NumThreads;

    // when there is only one warp per block, we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(int) : NumThreads * sizeof(int);
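    // (the 2x allocation is inherited from the SDK test harness; reduce0-style kernels
    // never index shared memory past blockDim.x, so the extra space is just a precaution)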

    // --- Creating events for timing
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    thrust::device_vector<int> d_vec_block(NumBlocks);

    /***********/
    /* REDUCE0 */
    /***********/
    cudaEventRecord(start, 0);
    reduce0<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec.data()), thrust::raw_pointer_cast(d_vec_block.data()), N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("reduce0 - Elapsed time:  %3.3f ms \n", time);   

    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
    thrust::host_vector<int> h_vec_block(d_vec_block);
    int sum_reduce0 = 0;
    for (int i=0; i<NumBlocks; i++) sum_reduce0 = sum_reduce0 + h_vec_block[i];
    printf("Result for reduce0 = %i\n",sum_reduce0);

    /**********************************/
    /* REDUCE0 KERNEL - STACKOVERFLOW */
    /**********************************/
    cudaEventRecord(start, 0);
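    // --- reduce0_stackoverflow is launched with half the blocks: its grid-stride
    //     (cascading) loop lets each block accumulate two tiles of the input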
    reduce0_stackoverflow<<<NumBlocks/2, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec.data()), thrust::raw_pointer_cast(d_vec_block.data()), N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("reduce0 - stackoverflow - Elapsed time:  %3.3f ms \n", time);   

    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
    h_vec_block = d_vec_block;
    int sum_reduce0_stackoverflow = 0;
    // --- Only NumBlocks/2 partial sums are valid here, matching the halved launch grid
    for (int i=0; i<NumBlocks/2; i++) sum_reduce0_stackoverflow = sum_reduce0_stackoverflow + h_vec_block[i];
    printf("Result for reduce0_stackoverflow = %i\n",sum_reduce0_stackoverflow);

    getchar();
}
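
As a quick cross-check (not part of the timing comparison above), the same sum can also be computed with Thrust's built-in reduction, which the test code already links against through thrust::device_vector; a minimal sketch:

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <cstdio>

int main()
{
    thrust::device_vector<int> d_vec(15336, 3);
    // thrust::reduce launches its own reduction kernels and returns the sum to the host
    int sum = thrust::reduce(d_vec.begin(), d_vec.end());
    printf("Result for thrust::reduce = %i\n", sum);    // expected: 15336 * 3 = 46008
    return 0;
}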
Answered 2014-09-02T17:39:47