c++ - OpenCL で間違ったデータを読み取る

Question

カーネルがデータを間違った場所に書き込んだり、ホストがデータを間違って読み取ったりするという問題に直面しました。同じデータ (データを書き込むインデックス) を異なる型の 2 つのグローバル配列に書き込みます。インデックスが正しいことを確認するために、atom_inc によってインクリメントされるグローバルカウンターが使用されます。この問題は、データがホストの 2 番目のアレイから読み取られるときに発生します。例えば：

.....
output array index: 442: (output1 value:442.0000     output2 value:442) 
output array index: 443: (output1 value:443.0000     output2 value:443) 
output array index: 444: (output1 value:444.0000     output2 value:444) 
output array index: 445: (output1 value:445.0000     output2 value:445) 
output array index: 446: (output1 value:446.0000     output2 value:1152892928) 
output array index: 447: (output1 value:447.0000     output2 value:447) 
output array index: 448: (output1 value:448.0000     output2 value:1152909312) 
output array index: 449: (output1 value:449.0000     output2 value:1152917504) 
output array index: 450: (output1 value:450.0000     output2 value:1152925696)
......

インデックス 446、448、449、および 450+ でわかるように、output2 に間違った値が含まれています。これの考えられる理由は何ですか？

デバイス: ATI Radeon HD5750

コードサンプル:

#include <stdio.h>
#include <math.h>
#include <OpenCL/OpenCL.h>

// wtf example
const char *programSource =
"__kernel void kernel1(__global uint *counter,\n" \
"__global float *weights,\n" \
"__global uint *weights_pos)\n" \
"{\n"\
"const uint global_size = get_global_size(0);\n" \
"const uint global_id = get_global_id(0);\n" \
"uint local_id = get_local_id(0);\n" \

"if(global_id == 0) {\n" \
"counter[5] = 0; // set index of pos in weights to zero\n" \
"}\n" \

"uint insert_index = atom_inc(&counter[5]);\n" \
"weights[insert_index] = insert_index;\n" \
"weights_pos[insert_index] = insert_index;\n" \
"}";

void art_process_sinogram(const char* tiff_filename,
                          const float *angles2,
                          const unsigned int n_angles2,
                          const unsigned int n_ray2s,
                          const float distanc2e)
{
  /******************************
   * OPENCL ENVIRONMENT
   */
  cl_int status;
  cl_uint numPlatforms = 0;
  cl_platform_id *platforms = NULL;
  cl_device_id device_id;

  //discover platforms
  status = clGetPlatformIDs(0, NULL, &numPlatforms);
  platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
  status = clGetPlatformIDs(numPlatforms, platforms, NULL);

  //discover devices
  cl_uint numDevices = 0;
  cl_device_id *devices = NULL;

  status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
  devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
  status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
  device_id = devices[1];
  //create context
  cl_context context = NULL;
  context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);

  cl_program program = clCreateProgramWithSource(context, 1, (const char **)&programSource, NULL, &status);
  clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  cl_kernel  kernel_weights = clCreateKernel(program, "kernel1", &status);

  //create queue
  cl_command_queue command_queue1 = clCreateCommandQueue(context, device_id, 0, &status);

  /******************************
   * HARDWARE PARAMETERS
   */
  cl_uint wavefronts_per_SIMD = 7;
  size_t global_work_size;
  size_t local_work_size = 64;

  cl_uint max_compute_units;

  clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);

  size_t wg_count = max_compute_units * wavefronts_per_SIMD;
  global_work_size = wg_count * local_work_size;

  /**************************** DATA PART *************************************/

  size_t w_portion_size = 768 * sizeof(cl_float);
  size_t w_pos_portion_size = 768 * sizeof(cl_uint);

  size_t counters_data_size = 6 * sizeof(cl_uint);
  cl_uint counters_data[6];
  counters_data[0] = 1;
  counters_data[1] = 2; // max number of the cells intersected by the ray
  counters_data[2] = 3;
  counters_data[3] = 4;
  counters_data[4] = 5; // same to the number of rays
  counters_data[5] = 0; // counter inside kernel

  /*****************
   * Main buffers
   */
  cl_mem weights1_buffer = clCreateBuffer(context,
                                          CL_MEM_READ_WRITE,
                                          w_portion_size,
                                          NULL,
                                          NULL);

  cl_mem weights_pos1_buffer = clCreateBuffer(context,
                                              CL_MEM_READ_WRITE,
                                              w_pos_portion_size,
                                              NULL,
                                              NULL);
  /*****************
   * Supplement buffers (constant)
   */
  cl_mem counters_data_buffer = clCreateBuffer(context,
                                               CL_MEM_READ_ONLY,
                                               counters_data_size,
                                               NULL,
                                               &status);


  cl_event supplement_buffer_ready[1];

  status = clEnqueueWriteBuffer(command_queue1,
                                counters_data_buffer,
                                CL_FALSE,
                                0,
                                counters_data_size,
                                counters_data,
                                0,
                                NULL,
                                &supplement_buffer_ready[0]);

  status = clSetKernelArg(kernel_weights, 0, sizeof(void *), (void *)&counters_data_buffer);
  status = clSetKernelArg(kernel_weights, 1, sizeof(void *), (void *)&weights1_buffer);
  status = clSetKernelArg(kernel_weights, 2, sizeof(void *), (void *)&weights_pos1_buffer);

  status = clEnqueueNDRangeKernel(command_queue1,
                                  kernel_weights,
                                  1, // work dimensional 1D, 2D, 3D
                                  NULL, // offset
                                  &global_work_size, // total number of WI
                                  &local_work_size, // nomber of WI in WG
                                  1, // num events in wait list
                                  supplement_buffer_ready,  // event wait list
                                  NULL); // event

  clFinish(command_queue1);
  cl_float *output1 = (cl_float *) clEnqueueMapBuffer(command_queue1,
                                                      weights1_buffer,//*pmain_weights_buffer,
                                                      CL_TRUE,
                                                      CL_MAP_READ,
                                                      0,
                                                      w_portion_size,
                                                      0, NULL, NULL, NULL);
  cl_uint *output2 = malloc(w_portion_size);
  status = clEnqueueReadBuffer(command_queue1, weights_pos1_buffer,
                               CL_TRUE, 0, w_pos_portion_size, output2,
                               0, NULL, NULL);

  clFinish(command_queue1);
  for(int i = 0; i < 790; ++i) {
    printf("output array index: %d: (output1 value:%.4f \t output2 value:%d) \n", i, output1[i], output2[i]);
  }
}

解決：

カーネルは次のようになります (インデックスを確認する必要があります)。

  __kernel void k_1(__global uint *counter, 
                    __global uint *weights, 
                    __global uint2 *weights_pos)
 {
    const uint global_size = get_global_size(0);
    const uint global_id = get_global_id(0);
    uint local_id = get_local_id(0);

    uint insert_index = atom_inc(&counter[5]);
    if(insert_index < 768) {
       weights[insert_index]= insert_index;
       weights_pos[insert_index].x = insert_index;
       weights_pos[insert_index].y = insert_index;
    }
}

score 2 · Accepted Answer

バッファの次元を台無しにしています。

1) バッファにはそれぞれ 768 個の要素が含まれています ( との初期化をw_portion_size参照w_pos_portion_size)

2) 私のマシンのワークグループサイズは 896 です (の初期化を参照wg_count)

3) 790 個の値を出力します。

これとは別に、1 つの概念的なエラーがここにあります。

if(global_id == 0) {
     counter[5] = 0; // set index of pos in weights to zero
}
//atomic increments on counter[5]

最初の仮想プロセッサがこの行を他の行より先に実行するとは想定できません。counter[5]ホスト側で初期化するため、この行を完全に削除する必要があります。(これが問題の原因だと思いますが、再現できません)。

これらの問題を修正した後、コードは正常に動作するようです (Intel 実装)。

score 0 · Accepted Answer

カーネルは次のようになります (インデックスを確認する必要があります)。

__kernel void k_1(__global uint *counter, 
                __global uint *weights, 
                __global uint2 *weights_pos)
{
   const uint global_size = get_global_size(0);
   const uint global_id = get_global_id(0);
   uint local_id = get_local_id(0);

   uint insert_index = atom_inc(&counter[5]);

   if(insert_index < 768) {
     weights[insert_index]= insert_index;
     weights_pos[insert_index].x = insert_index;
    weights_pos[insert_index].y = insert_index;
   }
}

c++ - OpenCL で間違ったデータを読み取る

2 に答える 2

Related

Reference