カーネルがデータを間違った場所に書き込んだり、ホストがデータを間違って読み取ったりするという問題に直面しました。同じデータ (データを書き込むインデックス) を異なる型の 2 つのグローバル配列に書き込みます。インデックスが正しいことを確認するために、atom_inc によってインクリメントされるグローバル カウンターが使用されます。この問題は、データがホストの 2 番目のアレイから読み取られるときに発生します。例えば:
.....
output array index: 442: (output1 value:442.0000 output2 value:442)
output array index: 443: (output1 value:443.0000 output2 value:443)
output array index: 444: (output1 value:444.0000 output2 value:444)
output array index: 445: (output1 value:445.0000 output2 value:445)
output array index: 446: (output1 value:446.0000 output2 value:1152892928)
output array index: 447: (output1 value:447.0000 output2 value:447)
output array index: 448: (output1 value:448.0000 output2 value:1152909312)
output array index: 449: (output1 value:449.0000 output2 value:1152917504)
output array index: 450: (output1 value:450.0000 output2 value:1152925696)
......
インデックス 446、448、449、および 450+ でわかるように、output2 に間違った値が含まれています。これの考えられる理由は何ですか?
デバイス: ATI Radeon HD5750
コードサンプル:
#include <stdio.h>
#include <math.h>
#include <OpenCL/OpenCL.h>
// wtf example
const char *programSource =
"__kernel void kernel1(__global uint *counter,\n" \
"__global float *weights,\n" \
"__global uint *weights_pos)\n" \
"{\n"\
"const uint global_size = get_global_size(0);\n" \
"const uint global_id = get_global_id(0);\n" \
"uint local_id = get_local_id(0);\n" \
"if(global_id == 0) {\n" \
"counter[5] = 0; // set index of pos in weights to zero\n" \
"}\n" \
"uint insert_index = atom_inc(&counter[5]);\n" \
"weights[insert_index] = insert_index;\n" \
"weights_pos[insert_index] = insert_index;\n" \
"}";
void art_process_sinogram(const char* tiff_filename,
const float *angles2,
const unsigned int n_angles2,
const unsigned int n_ray2s,
const float distanc2e)
{
/******************************
* OPENCL ENVIRONMENT
*/
cl_int status;
cl_uint numPlatforms = 0;
cl_platform_id *platforms = NULL;
cl_device_id device_id;
//discover platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
//discover devices
cl_uint numDevices = 0;
cl_device_id *devices = NULL;
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
device_id = devices[1];
//create context
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
cl_program program = clCreateProgramWithSource(context, 1, (const char **)&programSource, NULL, &status);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel kernel_weights = clCreateKernel(program, "kernel1", &status);
//create queue
cl_command_queue command_queue1 = clCreateCommandQueue(context, device_id, 0, &status);
/******************************
* HARDWARE PARAMETERS
*/
cl_uint wavefronts_per_SIMD = 7;
size_t global_work_size;
size_t local_work_size = 64;
cl_uint max_compute_units;
clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);
size_t wg_count = max_compute_units * wavefronts_per_SIMD;
global_work_size = wg_count * local_work_size;
/**************************** DATA PART *************************************/
size_t w_portion_size = 768 * sizeof(cl_float);
size_t w_pos_portion_size = 768 * sizeof(cl_uint);
size_t counters_data_size = 6 * sizeof(cl_uint);
cl_uint counters_data[6];
counters_data[0] = 1;
counters_data[1] = 2; // max number of the cells intersected by the ray
counters_data[2] = 3;
counters_data[3] = 4;
counters_data[4] = 5; // same to the number of rays
counters_data[5] = 0; // counter inside kernel
/*****************
* Main buffers
*/
cl_mem weights1_buffer = clCreateBuffer(context,
CL_MEM_READ_WRITE,
w_portion_size,
NULL,
NULL);
cl_mem weights_pos1_buffer = clCreateBuffer(context,
CL_MEM_READ_WRITE,
w_pos_portion_size,
NULL,
NULL);
/*****************
* Supplement buffers (constant)
*/
cl_mem counters_data_buffer = clCreateBuffer(context,
CL_MEM_READ_ONLY,
counters_data_size,
NULL,
&status);
cl_event supplement_buffer_ready[1];
status = clEnqueueWriteBuffer(command_queue1,
counters_data_buffer,
CL_FALSE,
0,
counters_data_size,
counters_data,
0,
NULL,
&supplement_buffer_ready[0]);
status = clSetKernelArg(kernel_weights, 0, sizeof(void *), (void *)&counters_data_buffer);
status = clSetKernelArg(kernel_weights, 1, sizeof(void *), (void *)&weights1_buffer);
status = clSetKernelArg(kernel_weights, 2, sizeof(void *), (void *)&weights_pos1_buffer);
status = clEnqueueNDRangeKernel(command_queue1,
kernel_weights,
1, // work dimensional 1D, 2D, 3D
NULL, // offset
&global_work_size, // total number of WI
&local_work_size, // nomber of WI in WG
1, // num events in wait list
supplement_buffer_ready, // event wait list
NULL); // event
clFinish(command_queue1);
cl_float *output1 = (cl_float *) clEnqueueMapBuffer(command_queue1,
weights1_buffer,//*pmain_weights_buffer,
CL_TRUE,
CL_MAP_READ,
0,
w_portion_size,
0, NULL, NULL, NULL);
cl_uint *output2 = malloc(w_portion_size);
status = clEnqueueReadBuffer(command_queue1, weights_pos1_buffer,
CL_TRUE, 0, w_pos_portion_size, output2,
0, NULL, NULL);
clFinish(command_queue1);
for(int i = 0; i < 790; ++i) {
printf("output array index: %d: (output1 value:%.4f \t output2 value:%d) \n", i, output1[i], output2[i]);
}
}
解決:
カーネルは次のようになります (インデックスを確認する必要があります)。
__kernel void k_1(__global uint *counter,
__global uint *weights,
__global uint2 *weights_pos)
{
const uint global_size = get_global_size(0);
const uint global_id = get_global_id(0);
uint local_id = get_local_id(0);
uint insert_index = atom_inc(&counter[5]);
if(insert_index < 768) {
weights[insert_index]= insert_index;
weights_pos[insert_index].x = insert_index;
weights_pos[insert_index].y = insert_index;
}
}