Fixstars の例とソース コードを使用します。具体的には、第 5 章の最後のコード (2 つの移動平均 - 別名ゴールデン クロス) を試しています。
http://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/opencl-programming-practice/
コードは次の場所から入手できます。
http://www.fixstars.com/en/opencl/book/sample/
以下に具体例を載せておきます。ただし、簡単に言えば、次のようにコマンド キューを設定することです。
command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);
無効な command_queue (clError) になります。そのため、返されたデータは適切に処理されません。つまり、すべてゼロです。
ただし、 を使用せずに移動平均を 1 つだけ計算するようにコードを設定すると、うまくいきますCL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
。それに対応して、この章より前の Fixstars コードはすべて正常に動作しているようです。
私は、NVIDIA チップを搭載した真新しい MacBook Pro (網膜) に取り組んでいます。だから、NVIDIAの実装の問題なのか、それとも他の側面の問題なのか疑問に思っています。
とにかく、私が最終的にやろうとしているのは、それぞれが異なるパラメーターを持つ複数の (同一の) プロセスを起動することであるため、これは私にとってショーストッパーです。
以下のコード スニペット。ソースにデバッグ出力機能を追加しました。したがって、プログラムの実行時に次のように表示されます。
clGetPlatformIDs からの戻りコード: 成功! clGetDeviceIDs からの戻りコード: 成功! clCreateContext からの戻りコード: 成功! clCreateCommandQueue からの戻りコード: 無効な値 clBuildProgram からの戻りコード: 成功! clCreateKernel(13) からの戻りコード: 成功! clCreateKernel(26) からの戻りコード: 成功! clEnqueueTask(13) からの戻りコード: 無効なコマンド キュー clEnqueueTask(26) からの戻りコード: 無効なコマンド キュー 結果[25]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000) 結果[26]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000) 結果[27]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000) 結果[28]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)
...残りのデータもすべてゼロです。私はコンパイルしています:
gcc -O2 -c moving_average_vec4p.c<br>
gcc moving_average_vec4p.o -o moving_average_vec4p -framework opencl
---- (ホストコード) moving_average_vec4p.c ----
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include <stdio.h>
#define NAME_NUM (4) /* Number of stocks */
#define DATA_NUM (100) /* Number of data to process for each stock */
/* Read Stock data */
int stock_array_4[NAME_NUM*DATA_NUM]= {
#include "stock_array_4.txt"
};
/* Moving average width */
#define WINDOW_SIZE_13 (13)
#define WINDOW_SIZE_26 (26)
#define MAX_SOURCE_SIZE (0x100000)
/* DT: added to aid in debugging */
void printCLError (int err) {
switch (err) {
case CL_SUCCESS: printf("Success!\n"); break;
case CL_DEVICE_NOT_FOUND: printf("Device not found.\n"); break;
case CL_DEVICE_NOT_AVAILABLE: printf("Device not available\n"); break;
case CL_COMPILER_NOT_AVAILABLE: printf("Compiler not available\n"); break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("Memory object allocation failure\n"); break;
case CL_OUT_OF_RESOURCES: printf("Out of resources\n"); break;
case CL_OUT_OF_HOST_MEMORY: printf("Out of host memory\n"); break;
case CL_PROFILING_INFO_NOT_AVAILABLE: printf("Profiling information not available\n"); break;
case CL_MEM_COPY_OVERLAP: printf("Memory copy overlap\n"); break;
case CL_IMAGE_FORMAT_MISMATCH: printf("Image format mismatch\n"); break;
case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("Image format not supported\n"); break;
case CL_BUILD_PROGRAM_FAILURE: printf("Program build failure\n"); break;
case CL_MAP_FAILURE: printf("Map failure\n"); break;
case CL_INVALID_VALUE: printf("Invalid value\n"); break;
case CL_INVALID_DEVICE_TYPE: printf("Invalid device type\n"); break;
case CL_INVALID_PLATFORM: printf("Invalid platform\n"); break;
case CL_INVALID_DEVICE: printf("Invalid device\n"); break;
case CL_INVALID_CONTEXT: printf("Invalid context\n"); break;
case CL_INVALID_QUEUE_PROPERTIES: printf("Invalid queue properties\n"); break;
case CL_INVALID_COMMAND_QUEUE: printf("Invalid command queue\n"); break;
case CL_INVALID_HOST_PTR: printf("Invalid host pointer\n"); break;
case CL_INVALID_MEM_OBJECT: printf("Invalid memory object\n"); break;
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("Invalid image format descriptor\n"); break;
case CL_INVALID_IMAGE_SIZE: printf("Invalid image size\n"); break;
case CL_INVALID_SAMPLER: printf("Invalid sampler\n"); break;
case CL_INVALID_BINARY: printf("Invalid binary\n"); break;
case CL_INVALID_BUILD_OPTIONS: printf("Invalid build options\n"); break;
case CL_INVALID_PROGRAM: printf("Invalid program\n"); break;
case CL_INVALID_PROGRAM_EXECUTABLE: printf("Invalid program executable\n"); break;
case CL_INVALID_KERNEL_NAME: printf("Invalid kernel name\n"); break;
case CL_INVALID_KERNEL_DEFINITION: printf("Invalid kernel definition\n"); break;
case CL_INVALID_KERNEL: printf("Invalid kernel\n"); break;
case CL_INVALID_ARG_INDEX: printf("Invalid argument index\n"); break;
case CL_INVALID_ARG_VALUE: printf("Invalid argument value\n"); break;
case CL_INVALID_ARG_SIZE: printf("Invalid argument size\n"); break;
case CL_INVALID_KERNEL_ARGS: printf("Invalid kernel arguments\n"); break;
case CL_INVALID_WORK_DIMENSION: printf("Invalid work dimension\n"); break;
case CL_INVALID_WORK_GROUP_SIZE: printf("Invalid work group size\n"); break;
case CL_INVALID_WORK_ITEM_SIZE: printf("Invalid work item size\n"); break;
case CL_INVALID_GLOBAL_OFFSET: printf("Invalid global offset\n"); break;
case CL_INVALID_EVENT_WAIT_LIST: printf("Invalid event wait list\n"); break;
case CL_INVALID_EVENT: printf("Invalid event\n"); break;
case CL_INVALID_OPERATION: printf("Invalid operation\n"); break;
case CL_INVALID_GL_OBJECT: printf("Invalid OpenGL object\n"); break;
case CL_INVALID_BUFFER_SIZE: printf("Invalid buffer size\n"); break;
case CL_INVALID_MIP_LEVEL: printf("Invalid mip-map level\n"); break;
default: printf("Unknown\n");
}
}
int main(void)
{
cl_platform_id platform_id = NULL;
cl_uint ret_num_platforms;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobj_in = NULL;
cl_mem memobj_out13 = NULL;
cl_mem memobj_out26 = NULL;
cl_program program = NULL;
cl_kernel kernel13 = NULL;
cl_kernel kernel26 = NULL;
cl_event event13, event26;
size_t kernel_code_size;
char *kernel_src_str;
float *result13;
float *result26;
cl_int ret;
FILE *fp;
int window_num_13 = (int)WINDOW_SIZE_13;
int window_num_26 = (int)WINDOW_SIZE_26;
int point_num = (NAME_NUM * DATA_NUM);
int data_num = (int)DATA_NUM;
int name_num = (int)NAME_NUM;
int i, j;
/* Allocate space to read in kernel code */
kernel_src_str = (char *)malloc(MAX_SOURCE_SIZE);
/* Allocate space for the result on the host side */
result13 = (float *)malloc(point_num*sizeof(float)); /* average over13 weeks */
result26 = (float *)malloc(point_num*sizeof(float)); /* average over26 weeks */
/* Get Platform */
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
printf("Return code from clGetPlatformIDs: ");
printCLError(ret);
/* Get Device */
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
&ret_num_devices);
printf("Return code from clGetDeviceIDs: ");
printCLError(ret);
/* Create Context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
printf("Return code from clCreateContext: ");
printCLError(ret);
/* Create Command Queue */
// DT: this seems to break it (ie., output is all zeros)
command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);
printf("Return code from clCreateCommandQueue: ");
printCLError(ret);
/* Read kernel source code */
fp = fopen("moving_average_vec4.cl", "r");
kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
/* Create Program Object */
program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
(const size_t *)&kernel_code_size, &ret);
/* Compile kernel */
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
printf("Return code from clBuildProgram: ");
printCLError(ret);
/* Create kernel */
kernel13 = clCreateKernel(program, "moving_average_vec4", &ret); /* 13 weeks */
printf("Return code from clCreateKernel(13): ");
printCLError(ret);
kernel26 = clCreateKernel(program, "moving_average_vec4", &ret); /* 26 weeks */
printf("Return code from clCreateKernel(26): ");
printCLError(ret);
/* Create buffer for the input data on the device */
memobj_in = clCreateBuffer(context, CL_MEM_READ_WRITE,
point_num * sizeof(int), NULL, &ret);
/* Create buffer for the result on the device */
memobj_out13 = clCreateBuffer(context, CL_MEM_READ_WRITE,
point_num * sizeof(float), NULL, &ret); /* 13 weeks */
memobj_out26 = clCreateBuffer(context, CL_MEM_READ_WRITE,
point_num * sizeof(float), NULL, &ret); /* 26 weeks */
/* Copy input data to the global memory on the device*/
ret = clEnqueueWriteBuffer(command_queue, memobj_in, CL_TRUE, 0, point_num * sizeof(int), stock_array_4, 0, NULL, NULL);
/* Set Kernel Arguments (13 weeks) */
ret = clSetKernelArg(kernel13, 0, sizeof(cl_mem), (void *)&memobj_in);
ret = clSetKernelArg(kernel13, 1, sizeof(cl_mem), (void *)&memobj_out13);
ret = clSetKernelArg(kernel13, 2, sizeof(int), (void *)&data_num);
ret = clSetKernelArg(kernel13, 3, sizeof(int), (void *)&window_num_13);
/* Submit task to compute the moving average over 13 weeks */
ret = clEnqueueTask(command_queue, kernel13, 0, NULL, NULL);
printf("Return code from clEnqueueTask(13): ");
printCLError(ret);
/* Set Kernel Arguments (26 weeks) */
ret = clSetKernelArg(kernel26, 0, sizeof(cl_mem), (void *)&memobj_in);
ret = clSetKernelArg(kernel26, 1, sizeof(cl_mem), (void *)&memobj_out26);
ret = clSetKernelArg(kernel26, 2, sizeof(int), (void *)&data_num);
ret = clSetKernelArg(kernel26, 3, sizeof(int), (void *)&window_num_26);
/* Submit task to compute the moving average over 26 weeks */
ret = clEnqueueTask(command_queue, kernel26, 0, NULL, &event26);
printf("Return code from clEnqueueTask(26): ");
printCLError(ret);
// DT: doesn't seem to help ... ;-(
ret = clFinish(command_queue);
/* Copy result for the 13 weeks moving average from device to host */
ret = clEnqueueReadBuffer(command_queue, memobj_out13, CL_TRUE, 0, point_num * sizeof(float), result13, 1, &event13, NULL);
/* Copy result for the 26 weeks moving average from device to host */
ret = clEnqueueReadBuffer(command_queue, memobj_out26, CL_TRUE, 0, point_num * sizeof(float), result26, 1, &event26, NULL);
/* OpenCL Object Finalization */
ret = clReleaseKernel(kernel13);
ret = clReleaseKernel(kernel26);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(memobj_in);
ret = clReleaseMemObject(memobj_out13);
ret = clReleaseMemObject(memobj_out26);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
/* Display results */
/* DT: also added printout to see if actual numeric values are passing through */
for (i=window_num_26-1; i < data_num; i++) {
printf("result[%d]:", i );
for (j=0; j < name_num; j++ ) {
/* Display whether the 13 week average is greater */
printf( "[%d] (%f,%f)", (result13[i*NAME_NUM+j] > result26[i*NAME_NUM+j]),result13[i*NAME_NUM+j],result26[i*NAME_NUM+j] );
}
printf("\n");
}
/* Deallocate memory on the host */
free(result13);
free(result26);
free(kernel_src_str);
return 0;
}
---- (OPENCL カーネル コード) moving_average_vec4.cl ----
__kernel void moving_average_vec4(__global int4 *values,
__global float4 *average,
int length,
int width)
{
int i;
int4 add_value; /* A vector to hold 4 components */
/* Compute sum for the first "width" elements for 4 stocks */
add_value = (int4)0;
for (i=0; i < width; i++) {
add_value += values[i];
}
average[width-1] = convert_float4(add_value);
/* Compute sum for the (width)th ~ (length-1)th elements for 4 stocks */
for (i=width; i < length; i++) {
add_value = add_value - values[i-width] + values[i];
average[i] = convert_float4(add_value);
}
/* Insert zeros to 0th ~ (width-2)th element for 4 stocks*/
for (i=0; i < width-1; i++) {
average[i] = (float4)(1.1f);
}
/* Compute average of (width-1) ~ (length-1) elements for 4 stocks */
for (i=width-1; i < length; i++) {
average[i] /= (float4)width;
}
}