OpenCL の仕組みは理解できたと思ったので、ここで質問していますが、理解できないことがいくつかあると思います。
私がやりたいことは、2 つの配列のすべての値の差を取得し、hypot を計算して、最後に最大 hypot 値を取得することです。
double[] arrA = new double[]{1,2,3}
double[] arrB = new double[]{6,7,8}
dx1 = 1 - 1; dx2 = 2 - 1; dx3 = 3 - 1, dx4= 1 - 2;... dxLast = 3 - 3
dy1 = 6 - 6; dy2 = 7 - 6; dy3 = 8 - 6, dy4= 6 - 7;... dyLast = 8 - 8
(Extreme dx and dy will get 0, but i don't care about ignoring those cases by now)
次に、hypot(dx(i), dy(i)) に基づいて各hypotを計算します。これらの値がすべて取得されたら、hypotの最大値を取得します。
String programSource =
"#ifdef cl_khr_fp64 \n"
+ " #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n"
+ "#elif defined(cl_amd_fp64) \n"
+ " #pragma OPENCL EXTENSION cl_amd_fp64 : enable \n"
+ "#else "
+ " #error Double precision floating point not supported by OpenCL implementation.\n"
+ "#endif \n"
+ "__kernel void "
+ "sampleKernel(__global const double *bufferX,"
+ " __global const double *bufferY,"
+ " __local double* scratch,"
+ " __global double* result,"
+ " __const int lengthX,"
+ " __const int lengthY){"
+ " const int index_a = get_global_id(0);"//Get the global indexes for 2D reference
+ " const int index_b = get_global_id(1);"
+ " const int local_index = get_local_id(0);"//Current thread id -> Should be the same as index_a * index_b + index_b;
+ " if (local_index < (lengthX * lengthY)) {"// Load data into local memory
+ " if(index_a < lengthX && index_b < lengthY)"
+ " {"
+ " double dx = (bufferX[index_b] - bufferX[index_a]);"
+ " double dy = (bufferY[index_b] - bufferY[index_a]);"
+ " scratch[local_index] = hypot(dx, dy);"
+ " }"
+ " } "
+ " else {"
+ " scratch[local_index] = 0;"// Infinity is the identity element for the min operation
+ " }"
//Make a Barrier to make sure all values were set into the local array
+ " barrier(CLK_LOCAL_MEM_FENCE);"
//If someone can explain to me the offset thing I'll really apreciate that...
//I just know there is alway a division by 2
+ " for(int offset = get_local_size(0) / 2; offset > 0; offset >>= 1) {"
+ " if (local_index < offset) {"
+ " float other = scratch[local_index + offset];"
+ " float mine = scratch[local_index];"
+ " scratch[local_index] = (mine > other) ? mine : other;"
+ " }"
+ " barrier(CLK_LOCAL_MEM_FENCE);"
//A barrier to make sure that all values where checked
+ " }"
+ " if (local_index == 0) {"
+ " result[get_group_id(0)] = scratch[0];"
+ " }"
+ "}";
この場合、定義された GWG サイズは (100, 100, 0) で、LWI サイズは (10, 10, 0) です。
したがって、この例では、両方の配列のサイズが 10 で、GWG と LWI は次のように取得されます。
//clGetKernelWorkGroupInfo(kernel, device, CL.CL_KERNEL_WORK_GROUP_SIZE, Sizeof.size_t,, null);
long kernel_work_group_size = OpenClUtil.getKernelWorkGroupSize(kernel, device.getCl_device_id(), 3);
//clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, Sizeof.size_t * numValues,, null);
long[] maxSize = device.getMaximumSizes();
maxSize[0] = ( kernel_work_group_size > maxSize[0] ? maxSize[0] : kernel_work_group_size);
maxSize[1] = ( kernel_work_group_size > maxSize[1] ? maxSize[1] : kernel_work_group_size);
maxSize[2] = ( kernel_work_group_size > maxSize[2] ? maxSize[2] : kernel_work_group_size);
// maxSize[2] =
long xMaxSize = (x > maxSize[0] ? maxSize[0] : x);
long yMaxSize = (y > maxSize[1] ? maxSize[1] : y);
long zMaxSize = (z > maxSize[2] ? maxSize[2] : z);
long local_work_size[] = new long[] { xMaxSize, yMaxSize, zMaxSize };
int numWorkGroupsX = 0;
int numWorkGroupsY = 0;
int numWorkGroupsZ = 0;
if(local_work_size[0] != 0)
numWorkGroupsX = (int) ((total + local_work_size[0] - 1) / local_work_size[0]);
if(local_work_size[1] != 0)
numWorkGroupsY = (int) ((total + local_work_size[1] - 1) / local_work_size[1]);
if(local_work_size[2] != 0)
numWorkGroupsZ = (int) ((total + local_work_size[2] - 1) / local_work_size[2]);
long global_work_size[] = new long[] { numWorkGroupsX * local_work_size[0],
numWorkGroupsY * local_work_size[1], numWorkGroupsZ * local_work_size[2]};
問題は、期待される値を取得していないため、より小さなカーネルに基づいていくつかのテストを行い、結果配列で返される [VARIABLE TO TEST VALUES] オブジェクトを変更することにしました。
* The source code of the OpenCL program to execute
private static String programSourceA =
"#ifdef cl_khr_fp64 \n"
+ " #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n"
+ "#elif defined(cl_amd_fp64) \n"
+ " #pragma OPENCL EXTENSION cl_amd_fp64 : enable \n"
+ "#else "
+ " #error Double precision floating point not supported by OpenCL implementation.\n"
+ "#endif \n"
+ "__kernel void "
+ "sampleKernel(__global const double *bufferX,"
+ " __global const double *bufferY,"
+ " __local double* scratch,"
+ " __global double* result,"
+ " __const int lengthX,"
+ " __const int lengthY){"
//Get the global indexes for 2D reference
+ " const int index_a = get_global_id(0);"
+ " const int index_b = get_global_id(1);"
//Current thread id -> Should be the same as index_a * index_b + index_b;
+ " const int local_index = get_local_id(0);"
// Load data into local memory
//Only print values if index_a < ArrayA length
//Only print values if index_b < ArrayB length
//Only print values if local_index < (lengthX * lengthY)
//Only print values if this is the first work group.
+ " if (local_index < (lengthX * lengthY)) {"
+ " if(index_a < lengthX && index_b < lengthY)"
+ " {"
+ " double dx = (bufferX[index_b] - bufferX[index_a]);"
+ " double dy = (bufferY[index_b] - bufferY[index_a]);"
+ " result[local_index] = hypot(dx, dy);"
+ " }"
+ " } "
+ " else {"
// Infinity is the identity element for the min operation
+ " result[local_index] = 0;"
+ " }"
返された値はあまり見られませんが、[VARIABLE TO TEST VALUES] が (index_a * index_b) + index_a の場合、返された配列のほぼすべての値が正しい (index_a * index_b) + index_a 値になります。つまり、次のようになります。
result[0] -> 0
result[1] -> 1
result[2] -> 2
result[97] -> 97
result[98] -> 98
result[99] -> 99