-2

John Conway の Game of Lifeをシミュレートするための CUDA プログラムを作成しましたが、メモリ リークが発生することがあります (存在しないはずのセルが表示されます)。

ここに私のカーネルがあります:

__global__ void gameOfLife(matrix pcuda_main,int lblock,int generations_to_run) {
/************************************
notice: first dimension is one section for read and one for 
write(change purpose every simulation generation). 2 extra section rows
for first and last line not need to check their up and down position evry time
also edge lines ensure that blocks handles edge sectors are having edge lines less that other blocks
************************************/
    __shared__ unsigned int section[2][SECTION_SIZE][CELLS_IN_LINE];
    int i,j;
    unsigned int sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b;
    unsigned int left_top,left,left_bot,right_top,right,right_bot;
    int read_section=0;
    int write_section=1;
    int bx = blockIdx.x;
    int row = SECTION_ROWS*blockIdx.x+threadIdx.x;
    int rowx = threadIdx.x+SECTION_ROWS;

    // I am zeroeing the perimiters lines since they dont loaded with values and can be corrupt while more lines may be zeroes its done for avoiding ifs
    section[0][rowx>>LINE_NUMBER_BITS_OFFSET][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][rowx>>LINE_NUMBER_BITS_OFFSET][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[0][SECTION_ROWS-1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][SECTION_ROWS-1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[0][3*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][3*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[0][2*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][2*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    __syncthreads(); // ensure no crashes between zeroeing perimiter lines and loading data
    // since entire warp access the first and last cell together no extrag edges are needed however only first and last line access lines beyond the edges
    for(i=0;i<CELLS_IN_LINE;i++) {
        if ( bx > 0 ) {
            section[0][rowx-SECTION_ROWS+1][i] = pcuda_main[((row-SECTION_ROWS)<<LINE_NUMBER_BITS_OFFSET)+i];
        }
        section[0][rowx+1][i] = pcuda_main[((row)<<LINE_NUMBER_BITS_OFFSET)+i];
        if ( bx < lblock ) {
            // not last block sector row
            section[0][rowx+SECTION_ROWS+1][i] = pcuda_main[((row+SECTION_ROWS)<<LINE_NUMBER_BITS_OFFSET)+i];
        }
    }
    __syncthreads(); // ensure all data read
    for ( i=0;i<generations_to_run;i++ ) {
        for(j=0;j<CELLS_IN_LINE;j++) {
            if ( bx > 0 ) {
                if ( j > 0 ) {
                    left_top = section[read_section][rowx-SECTION_ROWS][j-1];
                    left = section[read_section][rowx-SECTION_ROWS+1][j-1];
                    left_bot = section[read_section][rowx-SECTION_ROWS+2][j-1];
                } else {
                    left_top = 0;
                    left=0;
                    left_bot=0;
                }
                if ( j<CELLS_IN_LINE_RESIDUE ) {
                    right_top= section[read_section][rowx-SECTION_ROWS][j+1];
                    right= section[read_section][rowx-SECTION_ROWS+1][j+1];
                    right_bot= section[read_section][rowx-SECTION_ROWS+2][j+1];
                } else {
                    right_top = 0;
                    right=0;
                    right_bot=0;
                }

                CELL32(section[write_section][rowx-SECTION_ROWS+1][j],
                    left_top,section[read_section][rowx-SECTION_ROWS][j],right_top,
                    left,section[read_section][rowx-SECTION_ROWS+1][j],right,
                    left_bot,section[read_section][rowx-SECTION_ROWS+2][j],right_bot,
                    sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
            }
            if ( j > 0 ) {
                left_top = section[read_section][rowx][j-1];
                left = section[read_section][rowx+1][j-1];
                left_bot = section[read_section][rowx+2][j-1];
            } else {
                left_top = 0;
                left=0;
                left_bot=0;
            }
            if ( j<CELLS_IN_LINE_RESIDUE ) {
                right_top= section[read_section][rowx][j+1];
                right= section[read_section][rowx+1][j+1];
                right_bot= section[read_section][rowx+2][j+1];
            } else {
                right_top = 0;
                right=0;
                right_bot=0;
            }
            CELL32(section[write_section][rowx+1][j],
                left_top,section[read_section][rowx][j],right_top,
                left,section[read_section][rowx+1][j],right,
                left_bot,section[read_section][rowx+2][j],right_bot,
                sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
            if ( bx < lblock ) {
                if ( j > 0 ) {
                    left_top = section[read_section][rowx+SECTION_ROWS][j-1];
                    left = section[read_section][rowx+SECTION_ROWS+1][j-1];
                    left_bot = section[read_section][rowx+SECTION_ROWS+2][j-1];
                } else {
                    left_top = 0;
                    left=0;
                    left_bot=0;
                }
                if ( j<CELLS_IN_LINE_RESIDUE ) {
                    right_top= section[read_section][rowx+SECTION_ROWS][j+1];
                    right= section[read_section][rowx+SECTION_ROWS+1][j+1];
                    right_bot= section[read_section][rowx+SECTION_ROWS+2][j+1];
                } else {
                    right_top = 0;
                    right=0;
                    right_bot=0;
                }
                CELL32(section[write_section][rowx+SECTION_ROWS+1][j],
                    left_top,section[read_section][rowx+SECTION_ROWS][j],right_top,
                    left,section[read_section][rowx+SECTION_ROWS+1][j],right,
                    left_bot,section[read_section][rowx+SECTION_ROWS+2][j],right_bot,
                    sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
            }
        }
        read_section = read_section^1;
        write_section = write_section^1;
        //printf("passed %u generation for row: %u\n",i,row);
        __syncthreads();
    }

    // now writing back to the global memory notice write section turns into read section after every generation so 
    // I write the read section
    for(i=0;i<CELLS_IN_LINE;i++) {
        pcuda_main[((row)<<LINE_NUMBER_BITS_OFFSET)+i] = section[read_section][rowx+1][i];
    }
    __syncthreads();
}

セル 32 の定義とその依存関係は次のとおりです。

#define HALFADDER(s0,s1,a0,a1)do{s1=(a0)&(a1);s0=(a0)^(a1);}while(0)
#define FULLADDER(s0,s1,a0,a1,a2)do{s1=((a0)&(a1))|((a2)&((a0)^(a1)));s0 =(a2)^((a0)^(a1));}while(0)

#define CELL32(output,top_left,top,top_right,left,cur,right,bot_left,bot,bot_right,sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b)do{FULLADDER(sum_top_b0,sum_top_b1,(top_left<<31)|(top>>1),top,(top_right>>31)|(top<<1));HALFADDER(sum_cur_b0,sum_cur_b1,(left<<31)|(cur>>1),(right>>31)|(cur<<1));FULLADDER(sum_bot_b0,sum_bot_b1,(bot_left<<31)|(bot>>1),bot,((bot_right>>31)|(bot<<1)));FULLADDER(newone,newtwo,sum_bot_b0,sum_cur_b0,sum_top_b0);FULLADDER(newtwo,new4a,newtwo,sum_bot_b1,sum_top_b1);HALFADDER(newtwo,new4b,newtwo,sum_cur_b1);newone=newone|cur;output=newone&newtwo&(~new4a)&(~new4b);}while(0)

これがプライマリ ループとメモリのコピーです。

cudaMalloc((void **)(&pcuda),NUM_CELLS*sizeof(unsigned int));
    //cudaMalloc((void **)(&pdata),ROWS*sizeof(int));
    cudaMemcpy((void *)pcuda,(void *)p,sizeof(unsigned int)*NUM_CELLS,cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    while ( generations > 0 ) {
        if ( generations > SECTION_ROWS ) {
            generations_run = SECTION_ROWS;
        } else {
            generations_run = generations;
        }
        generations -= generations_run;
        printf("running params last_row:%u, generations_run:%u, generations left:%u,grid size:%d, array size in bytes: %u,last cell index: %d\n",
            ROWS-SECTION_ROWS,generations_run,generations,dimGrid.x,NUM_CELLS*sizeof(unsigned int),((ROWS-1)<<LINE_NUMBER_BITS_OFFSET)+CELLS_IN_LINE_RESIDUE);
        gameOfLife<<<dimGrid,dimBlock>>>(pcuda,dimGrid.x-1,generations_run);
        error = cudaDeviceSynchronize();
        if (error != cudaSuccess)
        {
            printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
            lineid = __LINE__;
            err = error;
        }
        else
        {
            printf("GPU Device %d:synchronized\n", devID);
        }
    }
    cudaMemcpy((void *)p,(void *)pcuda,sizeof(int)*NUM_CELLS,cudaMemcpyDeviceToHost);

メモリリークはどこですか?

4

1 に答える 1

3

CUDA ツールキットには cuda-memcheck が付属しており、これはデフォルトでカーネル内の境界外アクセスをチェックします。リークチェッカーなど、他のモードもあります。終了する前に cudaDeviceReset() を呼び出して、ツールが解放されていないデバイス メモリを探す必要があることに注意してください。

于 2013-01-26T22:41:09.383 に答える