kernel - WorkGroups ディメンションを変更すると、AMD サンプルリダクションから Opencl Sum Reduction を機能させることができません

Question

// Min-reduction kernel: every work-group writes the smallest value it
// observed to result[get_group_id(0)], i.e. one partial minimum per group.
//
//   buffer  - input values; elements [0, length) are read
//   scratch - local buffer indexed by local id; must hold at least
//             get_local_size(0) floats
//   length  - number of input elements to consider
//   result  - output array, one slot per work-group
__kernel
void reduce(__global float* buffer,
            __local float* scratch,
            __const int length,
            __global float* result) {

  int global_index = get_global_id(0);
  // INFINITY is the identity for min, so a work-item whose index is already
  // >= length contributes nothing to the result.
  float accumulator = INFINITY;
  // Loop sequentially over chunks of input vector
  // Each work-item visits global_id, global_id + global_size, ... so the
  // whole input is covered regardless of how many work-items were launched.
  while (global_index < length) {
    float element = buffer[global_index];
    accumulator = (accumulator < element) ? accumulator : element;
    global_index += get_global_size(0);
  }

  // Perform parallel reduction
  int local_index = get_local_id(0);
  scratch[local_index] = accumulator;
  // Make every work-item's partial value visible before any of them is read.
  barrier(CLK_LOCAL_MEM_FENCE);
  // offset starts at half the local size and is halved with integer division.
  // This only combines every scratch entry when the local size is a power of
  // two: e.g. with a local size of 60 the offsets are 30, 15, 7, 3, 1 and at
  // offset 7 the live entry scratch[14] is never folded in.
  for(int offset = get_local_size(0) / 2;
      offset > 0;
      offset = offset / 2) {
    if (local_index < offset) {
      float other = scratch[local_index + offset];
      float mine = scratch[local_index];
      scratch[local_index] = (mine < other) ? mine : other;
    }
    // Kept outside the if so that all work-items of the group reach it.
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  // A single work-item publishes the group's result.
  if (local_index == 0) {
     result[get_group_id(0)] = scratch[0];
  }
}

これを総和リダクション (sum reduction) として機能するように書き換えました。

// Sum-reduction kernel: every work-group writes the sum of the values it
// accumulated to result[get_group_id(0)], i.e. one partial sum per group.
//
//   buffer  - input values; elements [0, length) are read
//   scratch - local buffer indexed by local id; must hold at least
//             get_local_size(0) floats
//   length  - number of input elements to consider
//   result  - output array, one slot per work-group
__kernel
void reduce(__global float* buffer,
            __local float* scratch,
            __const int length,
            __global float* result) {

  int global_index = get_global_id(0);
  // 0 is the identity for addition, so a work-item whose index is already
  // >= length contributes nothing to the result.
  // NOTE(review): 0.0 is a double-precision literal; 0.0f would avoid
  // depending on how the compiler treats double constants - confirm.
  float accumulator = 0.0;
  // Loop sequentially over chunks of input vector
  // Each work-item visits global_id, global_id + global_size, ... so the
  // whole input is covered regardless of how many work-items were launched.
  while (global_index < length) {
    float element = buffer[global_index];
    accumulator = accumulator + element;
    global_index += get_global_size(0);
  }

  // Perform parallel reduction
  int local_index = get_local_id(0);
  scratch[local_index] = accumulator;
  // Make every work-item's partial sum visible before any of them is read.
  barrier(CLK_LOCAL_MEM_FENCE);
  // offset starts at half the local size and is halved with integer division.
  // This only adds up every scratch entry when the local size is a power of
  // two: e.g. with a local size of 60 the offsets are 30, 15, 7, 3, 1 and at
  // offset 7 the live entry scratch[14] is never added, so the sum is wrong.
  for(int offset = get_local_size(0) / 2;
      offset > 0;
      offset = offset / 2) {
    if (local_index < offset) {
      float other = scratch[local_index + offset];
      float mine = scratch[local_index];
      scratch[local_index] = mine + other;
    }
    // Kept outside the if so that all work-items of the group reach it.
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  // A single work-item publishes the group's partial sum.
  if (local_index == 0) {
     result[get_group_id(0)] = scratch[0];
  }
}

そして、ワークグループを 1 つしか使用しない場合 (つまり、clEnqueueNDRangeKernel() の local_work_size に NULL を与える場合) は問題なく動作しますが、ワークグループの次元を変更しようとすると、手に負えなくなります。(私は OpenCL の初心者です)

私がすることは次のとおりです

// Total number of work-items launched along dimension 0.
#define GLOBAL_DIM 600
// Work-items per work-group. 600 is a multiple of 60, but 60 is not a
// power of two.
#define WORK_DIM 60

// Only element [0] of each array is consumed, because the launch below
// passes a dimension count of 1.
size_t global_1D[3] = {GLOBAL_DIM,1,1};
// Despite its name, this array is the local work size (6th argument of the
// call), not the number of dimensions.
size_t work_dim[3] = {WORK_DIM,1,1};
// Arguments in order: queue, kernel, dimension count (1), global offset
// (NULL), global size, local size, then an empty event wait list and no
// output event.
err = clEnqueueNDRangeKernel(commands, av_velocity_kernel, 1, NULL, global_1D, work_dim, 0, NULL, NULL); //TODO CHECK THIS LINE
// Any non-zero return code is treated as failure: print a message plus the
// text from err_code(err), flush stdout and return EXIT_FAILURE.
if (err)    {
  printf("Error: Failed to execute av_velocity_kernel!\n");            printf("\n%s",err_code(err));   fflush(stdout);      return EXIT_FAILURE;    }

やり方が悪いのでしょうか？？

さらに、#define GLOBAL_DIM 60000 と設定すると（最終的にはこの値が必要になります）、ローカルメモリが不足することに気付きました。複数のワークグループを使用すると、「より多くの」ローカルメモリを取得できますか?

score 0 · Accepted Answer

まず第一に、これらのリダクションカーネルは、ワークグループサイズが 2 の累乗の場合にのみ正しく機能します。たとえばサイズが 60 の場合、offset は 30、15、7 と半分になっていき、offset が 7 の段階で scratch[14] がどの要素にも加算されずに失われます。つまり、60 の代わりに 64 のような値を使用する必要があります。また、GLOBAL_DIM を変更してもローカルメモリが不足することはありません。おそらく、カーネルを呼び出す際に何か間違ったことをしているのでしょう。

kernel - WorkGroups ディメンションを変更すると、AMD サンプル リダクションから Opencl Sum Reduction を機能させることができません

回答 1 件

Related

Reference

kernel - WorkGroups ディメンションを変更すると、AMD サンプルリダクションから Opencl Sum Reduction を機能させることができません