cuda - 共有メモリを使用した CUDA の 3D ボックスカーフィルタ

Question

オブジェクトの問題に対して共有メモリを使用すると、実行時間が改善され、速度が向上するかどうかを確認しようとしています。

共有メモリを使用しないカーネル機能

__global__ void  3dc(const int nx, const int ny, const int nz, const float* in1, 
    const float* in2, const float* in3, const float* in4, float* out)
{
    int i, j, k;

    int tidx = threadIdx.x + blockIdx.x*blockDim.x;

    if(tidx < (nx)*(ny)*(nz)){
        k = tidx/((nx)*(ny));
        j = (tidx - k*(nx)*(ny))/(nx);
        i = tidx - k*(nx)*(ny) - j*(nx);

        out[i + nx*j + nx*ny*k] = 
            in1[i     + nx*j     + nx*ny*k    ]+
            in1[(i+1) + nx*j     + nx*ny*k    ]+
            in1[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in1[i     + nx*(j+1) + nx*ny*k    ]+
            in1[i     + nx*j     + nx*ny*(k+1)]+
            in1[(i+1) + nx*j     + nx*ny*(k+1)]+
            in1[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in1[i     + nx*(j+1) + nx*ny*(k+1)]+
            in2[i     + nx*j     + nx*ny*k    ]+
            in2[(i+1) + nx*j     + nx*ny*k    ]+
            in2[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in2[i     + nx*(j+1) + nx*ny*k    ]+
            in2[i     + nx*j     + nx*ny*(k+1)]+
            in2[(i+1) + nx*j     + nx*ny*(k+1)]+
            in2[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in2[i     + nx*(j+1) + nx*ny*(k+1)]+
            in3[i     + nx*j     + nx*ny*k    ]+
            in3[(i+1) + nx*j     + nx*ny*k    ]+
            in3[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in3[i     + nx*(j+1) + nx*ny*k    ]+
            in3[i     + nx*j     + nx*ny*(k+1)]+
            in3[(i+1) + nx*j     + nx*ny*(k+1)]+
            in3[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in3[i     + nx*(j+1) + nx*ny*(k+1)]+
            in4[i     + nx*j     + nx*ny*k    ]+
            in4[(i+1) + nx*j     + nx*ny*k    ]+
            in4[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in4[i     + nx*(j+1) + nx*ny*k    ]+
            in4[i     + nx*j     + nx*ny*(k+1)]+
            in4[(i+1) + nx*j     + nx*ny*(k+1)]+
            in4[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in4[i     + nx*(j+1) + nx*ny*(k+1)];
    } 
} // 3dc

共有メモリを使用したカーネル機能

__global__ void 3d_shared_memory(const int nx, const int ny, const int nz, const float* in1, const float* in2, const float* in3, const float* in4, float* out){
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int idz = blockIdx.z*blockDim.z + threadIdx.z;

    __shared__ float smem1[16][16][4];
    __shared__ float smem2[16][16][4];
    __shared__ float smem3[16][16][4];
    __shared__ float smem4[16][16][4];

    if ((idx < nx) && (idy < ny) && (idz < nz)){
        smem1[threadIdx.x][threadIdx.y][threadIdx.z] = in1[idz * nx * ny + idy * nx + idx];
        smem2[threadIdx.x][threadIdx.y][threadIdx.z] = in2[idz * nx * ny + idy * nx + idx];
        smem3[threadIdx.x][threadIdx.y][threadIdx.z] = in3[idz * nx * ny + idy * nx + idx];
        smem4[threadIdx.x][threadIdx.y][threadIdx.z] = in4[idz * nx * ny + idy * nx + idx];                        
        __syncthreads();

        for(int k = 0; k < 3; k++){
            for(int j = 0; j < 15; j++){
                for(int i = 0; i < 15; i++){
                    out[idz * nx * ny + idy * nx + idx] = smem1[i][j][k] + smem1[i+1][j][k] + smem1[i+1][j+1][k] + smem1[i][j+1][k] + smem1[i][j][k+1] + smem1[i+1][j][k+1] + smem1[i+1][j+1][k+1] + smem1[i][j+1][k+1] +
                        smem2[i][j][k] + smem2[i+1][j][k] + smem2[i+1][j+1][k] + smem2[i][j+1][k] + smem2[i][j][k+1] + smem2[i+1][j][k+1] + smem2[i+1][j+1][k+1] + smem2[i][j+1][k+1] +
                        smem3[i][j][k] + smem3[i+1][j][k] + smem3[i+1][j+1][k] + smem3[i][j+1][k] + smem3[i][j][k+1] + smem3[i+1][j][k+1] + smem3[i+1][j+1][k+1] + smem3[i][j+1][k+1] +
                        smem4[i][j][k] + smem4[i+1][j][k] + smem4[i+1][j+1][k] + smem4[i][j+1][k] + smem4[i][j][k+1] + smem4[i+1][j][k+1] + smem4[i+1][j+1][k+1] + smem4[i][j+1][k+1];
                }
            }
        }

    }

} //3d_shared_memory example

共有メモリコードは常に低速です。この問題のために共有メモリを活用するより良い方法はありますか? 提案をお寄せいただきありがとうございます。

score 2 · Accepted Answer

未回答リストから削除するために、この投稿に遅い回答を提供しています。

基本的に、共有メモリを使用して 3D でボックスカーフィルターを実装しています。上記のコメントで既に述べたものに加えて、共有メモリを使用しているときに速度が向上しない理由として、次の 2 つが考えられます。

共有メモリのロードとストアは結合されません。
ボックスカーのサイズが2.

以下に、グローバルメモリのみを使用する場合と共有メモリを使用する場合を比較するコードを示します。このコードは、Robert Crovella が3d CUDA kernel indexing for image filtering? で投稿したコードを修正したものです? .

に対するこのコードの結果DATASIZE_X x DATASIZE_Y x DATASIZE_Z = 1024 x 1024 x 64:

GT540Mケース

BOXCAR_SIZE            GLOBAL            SHARED
     2                  360ms             342ms
     4                 1292ms             583ms
     6                 3675ms            1166ms

ケプラー K20c ケース

BOXCAR_SIZE            GLOBAL            SHARED
     2                    8ms              16ms
     4                   40ms              33ms
     6                  142ms             102ms

コード：

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BOXCAR_SIZE 6

#define DATASIZE_X 1024
#define DATASIZE_Y 1024
#define DATASIZE_Z 64

#define BLOCKSIZE_X 8
#define BLOCKSIZE_Y 8
#define BLOCKSIZE_Z 8

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*****************************/
/* BOXCAR WITH SHARED MEMORY */
/*****************************/
__global__ void boxcar_shared(int* __restrict__ output, const int* __restrict__ input)
{
    __shared__ int smem[(BLOCKSIZE_Z + (BOXCAR_SIZE-1))][(BLOCKSIZE_Y + (BOXCAR_SIZE-1))][(BLOCKSIZE_X + (BOXCAR_SIZE-1))];

    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int idz = blockIdx.z*blockDim.z + threadIdx.z;

    if ((idx < (DATASIZE_X+BOXCAR_SIZE-1)) && (idy < (DATASIZE_Y+BOXCAR_SIZE-1)) && (idz < (DATASIZE_Z+BOXCAR_SIZE-1))){

        smem[threadIdx.z][threadIdx.y][threadIdx.x]=input[idz*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + idy*(DATASIZE_X+BOXCAR_SIZE-1) + idx];

    if ((threadIdx.z > (BLOCKSIZE_Z - BOXCAR_SIZE)) && (idz < DATASIZE_Z))
        smem[threadIdx.z + (BOXCAR_SIZE-1)][threadIdx.y][threadIdx.x] = input[(idz + (BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + idy*(DATASIZE_X+BOXCAR_SIZE-1) + idx];

    if ((threadIdx.y > (BLOCKSIZE_Y - BOXCAR_SIZE)) && (idy < DATASIZE_Y))
        smem[threadIdx.z][threadIdx.y + (BOXCAR_SIZE-1)][threadIdx.x] = input[idz*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + (idy+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1) + idx];

    if ((threadIdx.x > (BLOCKSIZE_X - BOXCAR_SIZE)) && (idx < DATASIZE_X))
        smem[threadIdx.z][threadIdx.y][threadIdx.x + (BOXCAR_SIZE-1)] = input[idz*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + idy*(DATASIZE_X+BOXCAR_SIZE-1) + (idx+(BOXCAR_SIZE-1))];

    if ((threadIdx.z > (BLOCKSIZE_Z - BOXCAR_SIZE)) && (threadIdx.y > (BLOCKSIZE_Y - BOXCAR_SIZE)) && (idz < DATASIZE_Z) && (idy < DATASIZE_Y))
        smem[threadIdx.z + (BOXCAR_SIZE-1)][threadIdx.y + (BOXCAR_SIZE-1)][threadIdx.x] = input[(idz+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + (idy+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1) + idx];

    if ((threadIdx.z > (BLOCKSIZE_Z - BOXCAR_SIZE)) && (threadIdx.x > (BLOCKSIZE_X - BOXCAR_SIZE)) && (idz < DATASIZE_Z) && (idx < DATASIZE_X))
        smem[threadIdx.z + (BOXCAR_SIZE-1)][threadIdx.y][threadIdx.x + (BOXCAR_SIZE-1)] = input[(idz+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + idy*(DATASIZE_X+BOXCAR_SIZE-1) + (idx+(BOXCAR_SIZE-1))];

    if ((threadIdx.y > (BLOCKSIZE_Y - BOXCAR_SIZE)) && (threadIdx.x > (BLOCKSIZE_X - BOXCAR_SIZE)) && (idy < DATASIZE_Y) && (idx < DATASIZE_X))
        smem[threadIdx.z][threadIdx.y + (BOXCAR_SIZE-1)][threadIdx.x + (BOXCAR_SIZE-1)] = input[idz*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + (idy+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1) + (idx+(BOXCAR_SIZE-1))];

    if ((threadIdx.z > (BLOCKSIZE_Z - BOXCAR_SIZE)) && (threadIdx.y > (BLOCKSIZE_Y - BOXCAR_SIZE)) && (threadIdx.x > (BLOCKSIZE_X - BOXCAR_SIZE)) && (idz < DATASIZE_Z) && (idy < DATASIZE_Y) && (idx < DATASIZE_X))
        smem[threadIdx.z+(BOXCAR_SIZE-1)][threadIdx.y+(BOXCAR_SIZE-1)][threadIdx.x+(BOXCAR_SIZE-1)] = input[(idz+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + (idy+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1) + (idx+(BOXCAR_SIZE-1))];
}

    __syncthreads();

    if ((idx < DATASIZE_X) && (idy < DATASIZE_Y) && (idz < DATASIZE_Z)){

        int temp = 0;

        for (int i=0; i<BOXCAR_SIZE; i++)
            for (int j=0; j<BOXCAR_SIZE; j++)
                for (int k=0; k<BOXCAR_SIZE; k++)
                    temp = temp + smem[threadIdx.z + i][threadIdx.y + j][threadIdx.x + k];

        output[idz*DATASIZE_X*DATASIZE_Y + idy*DATASIZE_X + idx] = temp;
    }
}

/********************************/
/* BOXCAR WITHOUT SHARED MEMORY */
/********************************/
__global__ void boxcar(int* __restrict__ output, const int* __restrict__ input)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int idz = blockIdx.z*blockDim.z + threadIdx.z;

    if ((idx < DATASIZE_X) && (idy < DATASIZE_Y) && (idz < DATASIZE_Z)){

        int temp = 0;
        for (int i=0; i<BOXCAR_SIZE; i++)
            for (int j=0; j<BOXCAR_SIZE; j++)
                for (int k=0; k<BOXCAR_SIZE; k++)
                    temp = temp + input[(k+idz)*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + (j+idy)*(DATASIZE_X+BOXCAR_SIZE-1) + (i+idx)];

        output[idz*DATASIZE_X*DATASIZE_Y + idy*DATASIZE_X + idx] = temp;
    }
}

/********/
/* MAIN */
/********/
int main(void)
{
    int i, j, k, u, v, w, temp;

    // --- these are just for timing
    clock_t t0, t1, t2, t3;
    double t1sum=0.0f;
    double t2sum=0.0f;
    double t3sum=0.0f;

    const int nx = DATASIZE_X;
    const int ny = DATASIZE_Y;
    const int nz = DATASIZE_Z;

    const int wx = BOXCAR_SIZE;
    const int wy = BOXCAR_SIZE;
    const int wz = BOXCAR_SIZE;

    // --- start timing
    t0 = clock();

    // --- CPU memory allocations
    int *input, *output, *ref_output; 
    if ((input  = (int*)malloc(((nx+(wx-1))*(ny+(wy-1))*(nz+(wz-1)))*sizeof(int))) == 0)    { fprintf(stderr, "malloc Fail \n"); return 1; }
    if ((output = (int*)malloc((nx*ny*nz)*sizeof(int))) == 0)                               { fprintf(stderr, "malloc Fail \n"); return 1; }
    if ((ref_output = (int*)malloc((nx*ny*nz)*sizeof(int))) == 0)                               { fprintf(stderr, "malloc Fail \n"); return 1; }

    // --- Data generation
    srand(time(NULL));
    for(int i=0; i<(nz+(wz-1)); i++)
        for(int j=0; j<(ny+(wy-1)); j++)
            for (int k=0; k<(nx+(wx-1)); k++)
                input[i*(ny+(wy-1))*(nx+(wx-1))+j*(nx+(wx-1))+k] = rand(); 

    t1 = clock();

    // --- Allocate GPU space for data and results
    int *d_output, *d_input;  // storage for input
    gpuErrchk(cudaMalloc((void**)&d_input, (((nx+(wx-1))*(ny+(wy-1))*(nz+(wz-1)))*sizeof(int))));
    gpuErrchk(cudaMalloc((void**)&d_output, ((nx*ny*nz)*sizeof(int))));

    // --- Copy data from GPU to CPU
    gpuErrchk(cudaMemcpy(d_input, input, (((nx+(wx-1))*(ny+(wy-1))*(nz+(wz-1)))*sizeof(int)), cudaMemcpyHostToDevice));

    const dim3 blockSize(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z);
    const dim3 gridSize(((DATASIZE_X+BLOCKSIZE_X-1)/BLOCKSIZE_X), ((DATASIZE_Y+BLOCKSIZE_Y-1)/BLOCKSIZE_Y), ((DATASIZE_Z+BLOCKSIZE_Z-1)/BLOCKSIZE_Z));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    boxcar_shared<<<gridSize,blockSize>>>(d_output, d_input);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time:  %3.4f ms \n", time);

    // --- Copy result from GPU to CPU
    gpuErrchk(cudaMemcpy(output, d_output, ((nx*ny*nz)*sizeof(int)), cudaMemcpyDeviceToHost));

    t2 = clock();
    t2sum = ((double)(t2-t1))/CLOCKS_PER_SEC;
    printf(" Device compute took %3.2f seconds.  Beginning host compute.\n", t2sum);

    // --- Host-side computations
    for (int u=0; u<nz; u++)
        for (int v=0; v<ny; v++)
            for (int w=0; w<nx; w++){
                temp = 0;
                for (int i=0; i<wz; i++)
                    for (int j=0; j<wy; j++)
                        for (int k=0; k<wx; k++)
                            temp = temp + input[(i+u)*(ny+(wy-1))*(nx+(wx-1))+(j+v)*(nx+(wx-1))+(k+w)];
                ref_output[u*ny*nx + v*nx + w] = temp;
            }

    t3 = clock();
    t3sum = ((double)(t3-t2))/CLOCKS_PER_SEC;
    printf(" Host compute took %3.2f seconds.  Comparing results.\n", t3sum);

    // --- Check CPU and GPU results
    for (int i=0; i<nz; i++)
        for (int j=0; j<ny; j++)
            for (int k=0; k<nx; k++)
                if (ref_output[i*ny*nx + j*nx + k] != output[i*ny*nx + j*nx + k]) {
                    printf("Mismatch at x= %d, y= %d, z= %d  Host= %d, Device = %d\n", i, j, k, ref_output[i*ny*nx + j*nx + k], output[i*ny*nx + j*nx + k]);
                    return 1;
                }
    printf("Results match!\n");

    // --- Freeing memory
    free(input);
    free(output);
    gpuErrchk(cudaFree(d_input));
    gpuErrchk(cudaFree(d_output));

    cudaDeviceReset();

    return 0;
}

cuda - 共有メモリを使用した CUDA の 3D ボックスカー フィルタ

1 に答える 1

Related

Reference

cuda - 共有メモリを使用した CUDA の 3D ボックスカーフィルタ