2 つの 1D ガウス フィルター (ガウス分離可能性) を使用して、OpenCL に 2D ガウス フィルターを実装しました。
私は畳み込みの2つのバージョンを実装しました:
- 2 つ目は 2 つのカーネルを使用します。1 つは畳み込みを適用し、もう 1 つは画像を転置します (画像の転置には約 7ms - 畳み込みと約 1ms - かかります)。
両方の実装の計算時間を評価したところ、2 つのカーネルを使用した実装は、1 つのカーネルを使用した実装よりも高速であることがわかりました (転置カーネルは畳み込みカーネルを待たなければならないことに注意してください)。
たった 1 つのカーネルのセットアップ時間が 2 つのカーネルのセットアップ時間よりも速くなければならない場合でも、1 つのカーネルを使用した実装が遅い理由を理解するのを手伝ってくれませんか。
両方の実装の OpenCL ソース コードを以下に示します。
1 つのカーネルの実装
__kernel void ConvolutionKernel(__read_only image2d_t srcBuffer,__constant int4 *par, __constant float *filter,__local float4 *cache, __local float4 *temp,__write_only image2d_t dstBuffer) { int width = par[0].x; int height = par[0].y; int widthG = par[0].z; int heightG = par[0].w; int gaussFilterWidth = par[1].x; int margin = 1 + (gaussFilterWidth>>2); int2 location; float4 focus,set; float *s = &set; int m,start; int widthGM = widthG + (margin<<1); const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST; int i = get_global_id(0); int j = get_global_id(1); int iG = get_local_id(0); int jG = get_local_id(1); location.x = i; location.y = j; focus = read_imagef(srcBuffer, sampler, location); cache[(iG+margin) + jG*widthGM] = focus; if(iG == 0) { if(i == 0) { cache[0 + jG*widthGM].x = focus.x; cache[0 + jG*widthGM].y = focus.x; cache[0 + jG*widthGM].z = focus.x; cache[0 + jG*widthGM].w = focus.x; cache[1 + jG*widthGM].x = focus.x; cache[1 + jG*widthGM].y = focus.x; cache[1 + jG*widthGM].z = focus.x; cache[1 + jG*widthGM].w = focus.x; } else { location.x = i-margin; location.y = j; cache[jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i-(margin-1); location.y = j; cache[1 + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } if(iG == ((widthGM-1)-(margin<<1))) { if(i == (width-1)) { cache[(widthGM-2) + jG*widthGM].x = focus.w; cache[(widthGM-2) + jG*widthGM].y = focus.w; cache[(widthGM-2) + jG*widthGM].z = focus.w; cache[(widthGM-2) + jG*widthGM].w = focus.w; cache[(widthGM-1) + jG*widthGM].x = focus.w; cache[(widthGM-1) + jG*widthGM].y = focus.w; cache[(widthGM-1) + jG*widthGM].z = focus.w; cache[(widthGM-1) + jG*widthGM].w = focus.w; } else { location.x = i+margin; location.y = j; cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i+(margin-1); location.y = j; cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } barrier(CLK_LOCAL_MEM_FENCE); float4 bar[10],barX[10],barY[10],barZ[10],barW[10]; float4 *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW; float *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW; float4 gauss[4]; float *gf = &gauss; float4 acc; gf[0] = filter[0]; gf[1] = filter[1]; gf[2] = filter[2]; gf[3] = filter[3]; gf[4] = filter[4]; gf[5] = filter[5]; gf[6] = filter[6]; gf[7] = filter[7]; gf[8] = filter[8]; gf[9] = filter[9]; gf[10]= filter[10]; gf[11]= filter[11]; gf[12]= filter[12]; gf[13]= filter[13]; gf[14]= filter[14]; gf[15]= 0.0f; start = iG + jG*widthGM; fX[0] = cache[start+0].y; fX[1] = fY[0] = cache[start+0].z; fX[2] = fY[1] = fZ[0] = cache[start+0].w; fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x; fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y; fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z; fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w; fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x; fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y; fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z; fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w; fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x; fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y; fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z; fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w; fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x; fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y; fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z; fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f; acc.x = fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15]; acc.y = fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15]; acc.z = fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15]; acc.w = fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15]; temp[iG + jG*widthG] = acc; barrier(CLK_LOCAL_MEM_FENCE); int I,S; I = j >> 2; S = j & 3; if(S == 0) { set.x = temp[iG + (jG+0)*widthG].x; set.y = temp[iG + (jG+1)*widthG].x; set.z = temp[iG + (jG+2)*widthG].x; set.w = temp[iG + (jG+3)*widthG].x; } if(S == 1) { set.x = temp[iG + (jG-1)*widthG].y; set.y = temp[iG + (jG+0)*widthG].y; set.z = temp[iG + (jG+1)*widthG].y; set.w = temp[iG + (jG+2)*widthG].y; } if(S == 2) { set.x = temp[iG + (jG-2)*widthG].z; set.y = temp[iG + (jG-1)*widthG].z; set.z = temp[iG + (jG+0)*widthG].z; set.w = temp[iG + (jG+1)*widthG].z; } if(S == 3) { set.x = temp[iG + (jG-3)*widthG].w; set.y = temp[iG + (jG-2)*widthG].w; set.z = temp[iG + (jG-1)*widthG].w; set.w = temp[iG + (jG+0)*widthG].w; } location.x = I; location.y = (i*4 + S); write_imagef(dstBuffer, location , set); }
2 つのカーネルの実装
畳み込み:
__kernel void ConvolutionKernel(__read_only image2d_t srcBuffer, __constant int4 *par, __constant float *filter, __local float4 *cache, __local float4 *temp, __write_only image2d_t dstBuffer) { int width = par[0].x; int height = par[0].y; int widthG = par[0].z; int heightG = par[0].w; int gaussFilterWidth = par[1].x; int margin = 1 + (gaussFilterWidth>>2); int2 location; float4 focus,set; float *s = &set; int m,start; int widthGM = widthG + (margin<<1); const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST; int i = get_global_id(0); int j = get_global_id(1); int iG = get_local_id(0); int jG = get_local_id(1); location.x = i; location.y = j; focus = read_imagef(srcBuffer, sampler, location); cache[(iG+margin) + jG*widthGM] = focus; if(iG == 0) { if(i == 0) { cache[0 + jG*widthGM].x = focus.x; cache[0 + jG*widthGM].y = focus.x; cache[0 + jG*widthGM].z = focus.x; cache[0 + jG*widthGM].w = focus.x; cache[1 + jG*widthGM].x = focus.x; cache[1 + jG*widthGM].y = focus.x; cache[1 + jG*widthGM].z = focus.x; cache[1 + jG*widthGM].w = focus.x; } else { location.x = i-margin; location.y = j; cache[jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i-(margin-1); location.y = j; cache[1 + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } if(iG == ((widthGM-1)-(margin<<1))) { if(i == (width-1)) { cache[(widthGM-2) + jG*widthGM].x = focus.w; cache[(widthGM-2) + jG*widthGM].y = focus.w; cache[(widthGM-2) + jG*widthGM].z = focus.w; cache[(widthGM-2) + jG*widthGM].w = focus.w; cache[(widthGM-1) + jG*widthGM].x = focus.w; cache[(widthGM-1) + jG*widthGM].y = focus.w; cache[(widthGM-1) + jG*widthGM].z = focus.w; cache[(widthGM-1) + jG*widthGM].w = focus.w; } else { location.x = i+margin; location.y = j; cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i+(margin-1); location.y = j; cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } barrier(CLK_LOCAL_MEM_FENCE); float4 bar[10],barX[10],barY[10],barZ[10],barW[10]; float4 *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW; float *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW; float4 gauss[4]; float *gf = &gauss; float4 acc; gf[0] = filter[0]; gf[1] = filter[1]; gf[2] = filter[2]; gf[3] = filter[3]; gf[4] = filter[4]; gf[5] = filter[5]; gf[6] = filter[6]; gf[7] = filter[7]; gf[8] = filter[8]; gf[9] = filter[9]; gf[10]= filter[10]; gf[11]= filter[11]; gf[12]= filter[12]; gf[13]= filter[13]; gf[14]= filter[14]; gf[15]= 0.0f; start = iG + jG*widthGM; fX[0] = cache[start+0].y; fX[1] = fY[0] = cache[start+0].z; fX[2] = fY[1] = fZ[0] = cache[start+0].w; fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x; fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y; fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z; fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w; fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x; fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y; fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z; fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w; fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x; fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y; fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z; fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w; fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x; fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y; fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z; fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f; acc.x = fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15]; acc.y = fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15]; acc.z = fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15]; acc.w = fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15]; location.x = i; location.y = j; write_imagef(dstBuffer, location , acc); }
転置:
__kernel void TransponseKernel(__read_only image2d_t srcBuffer, __constant int4 *par, __local float4 *temp, __write_only image2d_t dstBuffer) { int width = par[0].x; int height = par[0].y; int widthG = par[0].z; int heightG = par[0].w; int gaussFilterWidth = par[1].x; int margin = 1 + (gaussFilterWidth>>2); int widthGM = widthG + (margin<<1); int2 location; float4 focus,set; const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST; int i = get_global_id(0); int j = get_global_id(1); int iG = get_local_id(0); int jG = get_local_id(1); location.x = i; location.y = j; focus = read_imagef(srcBuffer, sampler, location); temp[iG + jG*widthG] = focus; barrier(CLK_LOCAL_MEM_FENCE); int I,S; I = j >> 2; S = j & 3; if(S == 0) { set.x = temp[iG + (jG+0)*widthG].x; set.y = temp[iG + (jG+1)*widthG].x; set.z = temp[iG + (jG+2)*widthG].x; set.w = temp[iG + (jG+3)*widthG].x; } if(S == 1) { set.x = temp[iG + (jG-1)*widthG].y; set.y = temp[iG + (jG+0)*widthG].y; set.z = temp[iG + (jG+1)*widthG].y; set.w = temp[iG + (jG+2)*widthG].y; } if(S == 2) { set.x = temp[iG + (jG-2)*widthG].z; set.y = temp[iG + (jG-1)*widthG].z; set.z = temp[iG + (jG+0)*widthG].z; set.w = temp[iG + (jG+1)*widthG].z; } if(S == 3) { set.x = temp[iG + (jG-3)*widthG].w; set.y = temp[iG + (jG-2)*widthG].w; set.z = temp[iG + (jG-1)*widthG].w; set.w = temp[iG + (jG+0)*widthG].w; } location.x = I; location.y = (i*4 + S); write_imagef(dstBuffer, location , set); }