2

Fixstars の例とソース コードを使用します。具体的には、第 5 章の最後のコード (2 つの移動平均 - 別名ゴールデン クロス) を試しています。

http://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/opencl-programming-practice/

コードは次の場所から入手できます。

http://www.fixstars.com/en/opencl/book/sample/

以下に具体例を載せておきます。ただし、簡単に言えば、次のようにコマンド キューを設定することです。

command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);

無効な command_queue (clError) になります。そのため、返されたデータは適切に処理されません。つまり、すべてゼロです。

ただし、 を使用せずに移動平均を 1 つだけ計算するようにコードを設定すると、うまくいきますCL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE。それに対応して、この章より前の Fixstars コードはすべて正常に動作しているようです。

私は、NVIDIA チップを搭載した真新しい MacBook Pro (網膜) に取り組んでいます。だから、NVIDIAの実装の問題なのか、それとも他の側面の問題なのか疑問に思っています。

とにかく、私が最終的にやろうとしているのは、それぞれが異なるパラメーターを持つ複数の (同一の) プロセスを起動することであるため、これは私にとってショーストッパーです。

以下のコード スニペット。ソースにデバッグ出力機能を追加しました。したがって、プログラムの実行時に次のように表示されます。

clGetPlatformIDs からの戻りコード: 成功!
clGetDeviceIDs からの戻りコード: 成功!
clCreateContext からの戻りコード: 成功!
clCreateCommandQueue からの戻りコード: 無効な値
clBuildProgram からの戻りコード: 成功!
clCreateKernel(13) からの戻りコード: 成功!
clCreateKernel(26) からの戻りコード: 成功!
clEnqueueTask(13) からの戻りコード: 無効なコマンド キュー
clEnqueueTask(26) からの戻りコード: 無効なコマンド キュー
結果[25]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)
結果[26]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)
結果[27]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)
結果[28]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)

...残りのデータもすべてゼロです。私はコンパイルしています:

gcc -O2 -c moving_average_vec4p.c<br>
gcc moving_average_vec4p.o -o moving_average_vec4p -framework opencl

---- (ホストコード) moving_average_vec4p.c ----

#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include <stdio.h>

#define NAME_NUM (4) /* Number of stocks */
#define DATA_NUM (100) /* Number of data to process for each stock */

/* Read Stock data */
int stock_array_4[NAME_NUM*DATA_NUM]= {
#include "stock_array_4.txt"
};

/* Moving average width */
#define WINDOW_SIZE_13 (13)
#define WINDOW_SIZE_26 (26)

#define MAX_SOURCE_SIZE (0x100000)

/* DT: added to aid in debugging */

void printCLError (int err) {
  switch (err) {
  case CL_SUCCESS:                            printf("Success!\n"); break;
  case CL_DEVICE_NOT_FOUND:                   printf("Device not found.\n"); break;
  case CL_DEVICE_NOT_AVAILABLE:               printf("Device not available\n"); break;
  case CL_COMPILER_NOT_AVAILABLE:             printf("Compiler not available\n"); break;
  case CL_MEM_OBJECT_ALLOCATION_FAILURE:      printf("Memory object allocation failure\n"); break;
  case CL_OUT_OF_RESOURCES:                   printf("Out of resources\n"); break;
  case CL_OUT_OF_HOST_MEMORY:                 printf("Out of host memory\n"); break;
  case CL_PROFILING_INFO_NOT_AVAILABLE:       printf("Profiling information not available\n"); break;
  case CL_MEM_COPY_OVERLAP:                   printf("Memory copy overlap\n"); break;
  case CL_IMAGE_FORMAT_MISMATCH:              printf("Image format mismatch\n"); break;
  case CL_IMAGE_FORMAT_NOT_SUPPORTED:         printf("Image format not supported\n"); break;
  case CL_BUILD_PROGRAM_FAILURE:              printf("Program build failure\n"); break;
  case CL_MAP_FAILURE:                        printf("Map failure\n"); break;
  case CL_INVALID_VALUE:                      printf("Invalid value\n"); break;
  case CL_INVALID_DEVICE_TYPE:                printf("Invalid device type\n"); break;
  case CL_INVALID_PLATFORM:                   printf("Invalid platform\n"); break;
  case CL_INVALID_DEVICE:                     printf("Invalid device\n"); break;
  case CL_INVALID_CONTEXT:                    printf("Invalid context\n"); break;
  case CL_INVALID_QUEUE_PROPERTIES:           printf("Invalid queue properties\n"); break;
  case CL_INVALID_COMMAND_QUEUE:              printf("Invalid command queue\n"); break;
  case CL_INVALID_HOST_PTR:                   printf("Invalid host pointer\n"); break;
  case CL_INVALID_MEM_OBJECT:                 printf("Invalid memory object\n"); break;
  case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:    printf("Invalid image format descriptor\n"); break;
  case CL_INVALID_IMAGE_SIZE:                 printf("Invalid image size\n"); break;
  case CL_INVALID_SAMPLER:                    printf("Invalid sampler\n"); break;
  case CL_INVALID_BINARY:                     printf("Invalid binary\n"); break;
  case CL_INVALID_BUILD_OPTIONS:              printf("Invalid build options\n"); break;
  case CL_INVALID_PROGRAM:                    printf("Invalid program\n"); break;
  case CL_INVALID_PROGRAM_EXECUTABLE:         printf("Invalid program executable\n"); break;
  case CL_INVALID_KERNEL_NAME:                printf("Invalid kernel name\n"); break;
  case CL_INVALID_KERNEL_DEFINITION:          printf("Invalid kernel definition\n"); break;
  case CL_INVALID_KERNEL:                     printf("Invalid kernel\n"); break;
  case CL_INVALID_ARG_INDEX:                  printf("Invalid argument index\n"); break;
  case CL_INVALID_ARG_VALUE:                  printf("Invalid argument value\n"); break;
  case CL_INVALID_ARG_SIZE:                   printf("Invalid argument size\n"); break;
  case CL_INVALID_KERNEL_ARGS:                printf("Invalid kernel arguments\n"); break;
  case CL_INVALID_WORK_DIMENSION:             printf("Invalid work dimension\n"); break;
  case CL_INVALID_WORK_GROUP_SIZE:            printf("Invalid work group size\n"); break;
  case CL_INVALID_WORK_ITEM_SIZE:             printf("Invalid work item size\n"); break;
  case CL_INVALID_GLOBAL_OFFSET:              printf("Invalid global offset\n"); break;
  case CL_INVALID_EVENT_WAIT_LIST:            printf("Invalid event wait list\n"); break;
  case CL_INVALID_EVENT:                      printf("Invalid event\n"); break;
  case CL_INVALID_OPERATION:                  printf("Invalid operation\n"); break;
  case CL_INVALID_GL_OBJECT:                  printf("Invalid OpenGL object\n"); break;
  case CL_INVALID_BUFFER_SIZE:                printf("Invalid buffer size\n"); break;
  case CL_INVALID_MIP_LEVEL:                  printf("Invalid mip-map level\n"); break;
  default: printf("Unknown\n");
  }
}


int main(void)
{
  cl_platform_id platform_id = NULL;
  cl_uint ret_num_platforms;
  cl_device_id device_id = NULL;
  cl_uint ret_num_devices;
  cl_context context = NULL;
  cl_command_queue command_queue = NULL;
  cl_mem memobj_in = NULL;
  cl_mem memobj_out13 = NULL;
  cl_mem memobj_out26 = NULL;
  cl_program program = NULL;
  cl_kernel kernel13 = NULL;
  cl_kernel kernel26 = NULL;
  cl_event event13, event26;
  size_t kernel_code_size;
  char *kernel_src_str;
  float *result13;
  float *result26;
  cl_int ret;
  FILE *fp;

  int window_num_13 = (int)WINDOW_SIZE_13;
  int window_num_26 = (int)WINDOW_SIZE_26;
  int point_num = (NAME_NUM * DATA_NUM);
  int data_num = (int)DATA_NUM;
  int name_num = (int)NAME_NUM;

  int i, j;
  /* Allocate space to read in kernel code */
  kernel_src_str = (char *)malloc(MAX_SOURCE_SIZE);

  /* Allocate space for the result on the host side */
  result13 = (float *)malloc(point_num*sizeof(float)); /* average over13 weeks */
  result26 = (float *)malloc(point_num*sizeof(float)); /* average over26 weeks */

  /* Get Platform */
  ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
  printf("Return code from clGetPlatformIDs: ");
  printCLError(ret);

  /* Get Device */
  ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
               &ret_num_devices);
  printf("Return code from clGetDeviceIDs: ");
  printCLError(ret);

  /* Create Context */
  context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
  printf("Return code from clCreateContext: ");
  printCLError(ret);

  /* Create Command Queue */

  // DT:  this seems to break it (ie., output is all zeros)
  command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);
  printf("Return code from clCreateCommandQueue: ");
  printCLError(ret);

  /* Read kernel source code */
  fp = fopen("moving_average_vec4.cl", "r");
  kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
  fclose(fp);

  /* Create Program Object */
  program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
                      (const size_t *)&kernel_code_size, &ret);

  /* Compile kernel */
  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
  printf("Return code from clBuildProgram: ");
  printCLError(ret);

  /* Create kernel */
  kernel13 = clCreateKernel(program, "moving_average_vec4", &ret); /* 13 weeks */
  printf("Return code from clCreateKernel(13): ");
  printCLError(ret);

  kernel26 = clCreateKernel(program, "moving_average_vec4", &ret); /* 26 weeks */
  printf("Return code from clCreateKernel(26): ");
  printCLError(ret);

  /* Create buffer for the input data on the device */
  memobj_in = clCreateBuffer(context, CL_MEM_READ_WRITE,
                 point_num * sizeof(int), NULL, &ret);

  /* Create buffer for the result on the device */
  memobj_out13 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                point_num * sizeof(float), NULL, &ret); /* 13 weeks */
  memobj_out26 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                point_num * sizeof(float), NULL, &ret); /* 26 weeks */

  /* Copy input data to the global memory on the device*/
  ret = clEnqueueWriteBuffer(command_queue, memobj_in, CL_TRUE, 0, point_num * sizeof(int), stock_array_4, 0, NULL, NULL);

  /* Set Kernel Arguments (13 weeks) */
  ret = clSetKernelArg(kernel13, 0, sizeof(cl_mem), (void *)&memobj_in);
  ret = clSetKernelArg(kernel13, 1, sizeof(cl_mem), (void *)&memobj_out13);
  ret = clSetKernelArg(kernel13, 2, sizeof(int), (void *)&data_num);
  ret = clSetKernelArg(kernel13, 3, sizeof(int), (void *)&window_num_13);

  /* Submit task to compute the moving average over 13 weeks */

  ret = clEnqueueTask(command_queue, kernel13, 0, NULL, NULL);
  printf("Return code from clEnqueueTask(13): ");
  printCLError(ret);

  /* Set Kernel Arguments (26 weeks) */
  ret = clSetKernelArg(kernel26, 0, sizeof(cl_mem), (void *)&memobj_in);
  ret = clSetKernelArg(kernel26, 1, sizeof(cl_mem), (void *)&memobj_out26);
  ret = clSetKernelArg(kernel26, 2, sizeof(int), (void *)&data_num);
  ret = clSetKernelArg(kernel26, 3, sizeof(int), (void *)&window_num_26);

  /* Submit task to compute the moving average over 26 weeks */
  ret = clEnqueueTask(command_queue, kernel26, 0, NULL, &event26);
  printf("Return code from clEnqueueTask(26): ");
  printCLError(ret);

  // DT:  doesn't seem to help ... ;-(
  ret = clFinish(command_queue);

  /* Copy result for the 13 weeks moving average from device to host */
  ret = clEnqueueReadBuffer(command_queue, memobj_out13, CL_TRUE, 0, point_num * sizeof(float), result13, 1, &event13, NULL);

  /* Copy result for the 26 weeks moving average from device to host */
  ret = clEnqueueReadBuffer(command_queue, memobj_out26, CL_TRUE, 0, point_num * sizeof(float), result26, 1, &event26, NULL);


  /* OpenCL Object Finalization */
  ret = clReleaseKernel(kernel13);
  ret = clReleaseKernel(kernel26);
  ret = clReleaseProgram(program);
  ret = clReleaseMemObject(memobj_in);
  ret = clReleaseMemObject(memobj_out13);
  ret = clReleaseMemObject(memobj_out26);
  ret = clReleaseCommandQueue(command_queue);
  ret = clReleaseContext(context);


  /* Display results */

  /* DT: also added printout to see if actual numeric values are passing through */

  for (i=window_num_26-1; i < data_num; i++) {
    printf("result[%d]:", i );
    for (j=0; j < name_num; j++ ) {
      /* Display whether the 13 week average is greater */
      printf( "[%d] (%f,%f)", (result13[i*NAME_NUM+j] > result26[i*NAME_NUM+j]),result13[i*NAME_NUM+j],result26[i*NAME_NUM+j] );
    }
    printf("\n");
  }

  /* Deallocate memory on the host */
  free(result13);
  free(result26);
  free(kernel_src_str);

  return 0;
}

---- (OPENCL カーネル コード) moving_average_vec4.cl ----

__kernel void moving_average_vec4(__global int4 *values,
                  __global float4 *average,
                  int length,
                  int width)
{
  int i;
  int4 add_value; /* A vector to hold 4 components */

  /* Compute sum for the first "width" elements for 4 stocks */
  add_value = (int4)0;
  for (i=0; i < width; i++) {
    add_value += values[i];
  }
  average[width-1] = convert_float4(add_value);

  /* Compute sum for the (width)th ~ (length-1)th elements for 4 stocks */
  for (i=width; i < length; i++) {
    add_value = add_value - values[i-width] + values[i];
    average[i] = convert_float4(add_value);
  }

  /* Insert zeros to 0th ~ (width-2)th element for 4 stocks*/
  for (i=0; i < width-1; i++) {
    average[i] = (float4)(1.1f);
  }

  /* Compute average of (width-1) ~ (length-1) elements for 4 stocks  */
  for (i=width-1; i < length; i++) {
    average[i] /= (float4)width;
  }
}
4

2 に答える 2

3

アウトオブオーダー コマンド キューをサポートする OpenCL 実装はほとんどありません。オーバーラップする作業については、代わりに複数のコマンド キューと (アウトオブオーダー キューのような) イベントを使用して同期します。

于 2013-06-02T14:13:46.160 に答える
0

ATI HD6950 を搭載した Ubuntu 12.04 PC をセットアップするだけです。上記のコードは、そこで意図したとおりに機能します。したがって、これは今のところMacOSのものだと思います。

ssh 経由で Aquamacs に移動します。

于 2013-05-30T21:48:07.413 に答える