profiling - OpenCL CodeXL Profiler がカーネルを数回実行する

Question

CodeXL (より正確には sprofile) を使用して、OpenCL コードのプロファイルを取ろうとしています。performancecounter モードでプロファイリングすると (トレースオプション -t を使用する場合は発生しません)、常に間違った出力が返されるため、その理由を調べました。いくつか実験した結果、各カーネルが 3 回実行されており、そのために既存のデータを上書きするのではなく変更 (更新) するカーネルでは結果が間違ってしまう、という結論に達しました。次のサンプルプログラムは、この動作を示しています。

私の質問は次のとおりです。なぜそれがそのように動作するのか、そしてそれを止める方法を知っている人はいますか?

私の環境は次のとおりです。OS: Fedora Linux 18、CodeXL バージョン: CodeXL-Linux-1.1.1537.0、グラフィックカード: ATI Technologies Inc Device 6798

実行コマンドは次のとおりです。

   /opt/CodeXL-Linux-1.1.1537.0-x86_64-release/Output_x86_64/release/bin/x86_64/sprofile -o example.csv -w . OpenCLExample

私のコード:

    /**
     * Creates an OpenCL context for the GPU devices of the first platform
     * reported by clGetPlatformIDs.
     *
     * @return the new context, or NULL when no platform could be queried or
     *         the context could not be created.
     */
    cl_context CreateContext()
    {
        cl_uint numPlatforms = 0;
        cl_platform_id firstPlatformId = NULL;

        // Only one platform is requested; without one there is nothing to build on.
        cl_int errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
        if (errNum != CL_SUCCESS || numPlatforms == 0)
            return NULL;

        // Zero-terminated property list binding the context to that platform.
        cl_context_properties contextProperties[] =
        {
            CL_CONTEXT_PLATFORM,
            (cl_context_properties)firstPlatformId,
            0
        };

        cl_context context = clCreateContextFromType(contextProperties,
                                                     CL_DEVICE_TYPE_GPU,
                                                     NULL, NULL, &errNum);
        if (errNum != CL_SUCCESS || context == NULL)
            return NULL;

        return context;
    }

   /**
    * Creates a command queue (properties = 0) on the first device that
    * belongs to `context`.
    *
    * @param context  context whose device list is queried.
    * @param device   out-parameter; receives the first device of the context
    *                 once the device list has been read successfully. It is
    *                 left untouched if the device query fails.
    * @return the new command queue, or NULL if the device list could not be
    *         obtained, is empty, or the queue could not be created.
    */
   cl_command_queue CreateCommandQueue(cl_context context,cl_device_id *device)
   {
        // First call only asks for the size (in bytes) of the device list.
        size_t deviceBufferSize = 0;
        cl_int errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL,
                                         &deviceBufferSize);
        if (errNum != CL_SUCCESS || deviceBufferSize < sizeof(cl_device_id))
            return NULL;   // query failed or the context has no device

        cl_device_id *devices =
            new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];

        // Second call fills the list itself.
        errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize,
                                  devices, NULL);
        if (errNum != CL_SUCCESS)
        {
            delete[] devices;
            return NULL;
        }

        cl_command_queue commandQueue =
            clCreateCommandQueue(context, devices[0], 0, &errNum);
        if (errNum != CL_SUCCESS)
            commandQueue = NULL;

        *device = devices[0];
        delete[] devices;
        return commandQueue;
   }

   /**
    * Reads OpenCL source code from `filename`, creates a program object from
    * it in `context` and builds it.
    *
    * @param context   context the program is created in.
    * @param device    not used by this function; the build is requested with
    *                  a NULL device list.
    * @param filename  path of the kernel source file.
    * @return the built program, or NULL if the file could not be opened, the
    *         program object could not be created, or the build failed (the
    *         program object is released in that case).
    */
   cl_program CreateProgram(cl_context context,cl_device_id device,const char* filename)
   {
        std::ifstream kernelFile(filename, std::ios::in);
        if (!kernelFile.is_open())
            return NULL;

        // Slurp the whole file into one string.
        std::ostringstream oss;
        oss << kernelFile.rdbuf();
        const std::string srcStdStr = oss.str();
        const char *srcStr = srcStdStr.c_str();

        cl_int errNum = CL_SUCCESS;
        cl_program program = clCreateProgramWithSource(context, 1, &srcStr,
                                                       NULL, &errNum);
        if (errNum != CL_SUCCESS || program == NULL)
            return NULL;

        errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
        if (errNum != CL_SUCCESS)
        {
            // Do not hand a program that failed to build back to the caller.
            clReleaseProgram(program);
            return NULL;
        }

        return program;
   }


    /**
     * Creates the three buffers used by the kernel: two read-only inputs
     * initialised by copying `a` and `b`, and one read-write output buffer.
     * Each buffer holds ARRAY_SIZE floats.
     *
     * @param context     context the buffers are created in.
     * @param memObjects  receives the handles: [0] = copy of a, [1] = copy of b,
     *                    [2] = output. On failure, entries that were created
     *                    remain set and the caller is responsible for them.
     * @param a           host data copied into memObjects[0].
     * @param b           host data copied into memObjects[1].
     * @return true when all three buffers were created, false otherwise.
     */
    bool CreateMemObjects(cl_context context,cl_mem memObjects[3],float *a,float *b)
    {
       const size_t bufferBytes = sizeof(float) * ARRAY_SIZE;

       memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                       bufferBytes, a, NULL);
       memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                       bufferBytes, b, NULL);
       memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                       bufferBytes, NULL, NULL);

       // A NULL handle means the corresponding clCreateBuffer call failed.
       return memObjects[0] != NULL &&
              memObjects[1] != NULL &&
              memObjects[2] != NULL;
    }

     int main(int arg,char** argv)
    {
         cl_context context=0;
         cl_command_queue commandQueue = 0;
         cl_program program = 0;
         cl_device_id device = 0;
         cl_kernel kernel = 0;
         cl_mem memObjects[3] = {0,0,0};
         cl_int errNum;

         context = CreateContext(); 
         commandQueue = CreateCommandQueue(context,&device);
         program = CreateProgram(context,device,"Example.cl");
         kernel = clCreateKernel(program,"example_kernel",NULL);

         float result[ARRAY_SIZE];
         float a[ARRAY_SIZE];
         float b[ARRAY_SIZE];
         for(int i=0;i<ARRAY_SIZE;i++)
         {
            a[i] = i;
            b[i] = i*2;
         }

         if(!CreateMemObjects(context,memObjects,a,b))
             return 1;

         errNum = clSetKernelArg(kernel,0,sizeof(cl_mem),&memObjects[0]);
         errNum |= clSetKernelArg(kernel,1,sizeof(cl_mem),&memObjects[1]);
         errNum |= clSetKernelArg(kernel,2,sizeof(cl_mem),&memObjects[2]);

         size_t globalWorkSize[1] = {ARRAY_SIZE};
         size_t localWorkSize[1] = { 1 };

         errNum = clEnqueueNDRangeKernel(commandQueue,kernel,1,NULL,globalWorkSize,localWorkSize,0,
         NULL,NULL);

         errNum = clEnqueueReadBuffer(commandQueue,memObjects[2], CL_TRUE,
         0,ARRAY_SIZE*sizeof(float),result,
         0,NULL,NULL);

         return 0;

     }

カーネル：

    #pragma OPENCL EXTENSION cl_amd_printf : enable

    /*
     * Element-wise product: each work-item takes its global id in dimension 0,
     * stores a[id] * b[id] into result[id] and prints the id together with
     * the stored value.
     */
    kernel void example_kernel(global const float *a,
                               global const float *b,
                               global float *result)
    {
         const int idx = get_global_id(0);
         const float product = a[idx] * b[idx];

         result[idx] = product;
         printf((__constant char *)"DEBUG: example_kernel id: %d result: %g\n", idx, product);
    }

これは私が結果として得るものです：

    DEBUG: example_kernel id: 0 result: 0
    DEBUG: example_kernel id: 1 result: 2
    DEBUG: example_kernel id: 2 result: 8
    DEBUG: example_kernel id: 3 result: 18
    DEBUG: example_kernel id: 0 result: 0
    DEBUG: example_kernel id: 1 result: 2
    DEBUG: example_kernel id: 2 result: 8
    DEBUG: example_kernel id: 3 result: 18
    DEBUG: example_kernel id: 0 result: 0
    DEBUG: example_kernel id: 1 result: 2
    DEBUG: example_kernel id: 2 result: 8
    DEBUG: example_kernel id: 3 result: 18

profiling - OpenCL CodeXL Profiler がカーネルを数回実行する

回答 1 件

Related

Reference