cuda - CUDA カーネルを数回実行する

Question

CUDA カーネルを複数回実行すると問題が発生します。私のコードの実行環境の扱いに何か問題があるようです。1 回目はコードが正常に動作しますが、2 回目の実行後、3 回目の呼び出しの前に環境をクリーンアップしている最中にランダムにクラッシュします。何らかの理由でメモリが破損しているのだと思います。クラッシュは CUDA ドライバー内で発生することもあれば、単純な printf の呼び出しや kernel32.dll の中で発生することもあります。コードのメモリ管理に問題があるのではないかと考えています。

再びカーネルを実行する前に何をすべきですか?

このコードは、1 回だけ実行する場合は正常に動作します。乱数生成器の初期化には CURAND を使用しています。以下が私のコードです：

    /*
     * Host-side driver: initialises one CURAND state per GPU thread, uploads the
     * distance grid to constant memory, then runs the GPUAnnealing kernel five
     * times. After each run the shortest resulting path is copied back into
     * Path, which is handed to PreaparePaths again on the next iteration.
     *
     * Launch layout: GRID_BLOCK blocks of GRID_THREAD threads; one path of
     * `cities` ints and one result int per thread.
     */
    #define GRID_BLOCK 64
    #define GRID_THREAD 8
    #define CITIES 100
    #define CIPOW2 101
    int lenghtPaths = GRID_BLOCK*GRID_THREAD;   /* number of paths == number of GPU threads */
    int cities = CITIES;
    //prepare CURAND
    curandState *devStates;
    CUDA_CALL(cudaMalloc((void **)&devStates, GRID_BLOCK*GRID_THREAD*sizeof(curandState)));
    /* Setup prng states */
    setup_kernel<<<GRID_BLOCK ,GRID_THREAD>>>(devStates);
    /* Query the launch status first so a bad launch is reported with this
       message, then synchronise to surface errors raised while the kernel ran. */
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
        fprintf(stderr, "CURAND preparation failed: %s\n", cudaGetErrorString(cudaStatus));
    CUDA_CALL(cudaDeviceSynchronize());
    //copy distance grid to constant memory
    /* NOTE(review): assumes dist holds at least CIPOW2*CIPOW2 ints - confirm at its definition. */
    CUDA_CALL(cudaMemcpyToSymbol(cdist, dist, sizeof(int) *CIPOW2*CIPOW2));
    CUDA_CALL(cudaMalloc((void**)&dev_pathsForThreads, lenghtPaths * cities * sizeof(int)));
    CUDA_CALL(cudaMalloc((void**)&d_results, GRID_BLOCK*GRID_THREAD * sizeof(int)));
    for (int k = 0; k < 5; k++){
        /* Use the buffer returned by PreaparePaths directly. The original code
           malloc'ed a buffer first and then overwrote the pointer, leaking that
           allocation on every iteration.
           NOTE(review): presumably PreaparePaths returns a malloc'ed array of
           lenghtPaths*cities ints (it is free()d below) - confirm. */
        int* pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);
        if (pathsForThreads == NULL) {
            fprintf(stderr, "PreaparePaths returned NULL\n");
            break;
        }
        CUDA_CALL(cudaMemcpy(dev_pathsForThreads, pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyHostToDevice));
        GPUAnnealing<<<GRID_BLOCK ,GRID_THREAD >>>(dev_pathsForThreads, devStates, iterationLimit,temperature, coolingRate, absoluteTemperature, cities,d_results);
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess)
            fprintf(stderr, "GPUAnnealing launch failed: %s\n", cudaGetErrorString(cudaStatus));
        CUDA_CALL(cudaDeviceSynchronize());
        h_results = (int*) malloc(GRID_BLOCK*GRID_THREAD * sizeof(int));
        if (h_results == NULL) {
            fprintf(stderr, "Host allocation for results failed\n");
            free(pathsForThreads);
            break;
        }
        //Copy lenght of each path to CPU
        CUDA_CALL(cudaMemcpy(h_results, d_results,  GRID_BLOCK*GRID_THREAD * sizeof(int),cudaMemcpyDeviceToHost));
        //Copy paths to CPU
        CUDA_CALL(cudaMemcpy(pathsForThreads, dev_pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyDeviceToHost));
        //check the shortest path
        shortestPath = FindTheShortestPath(h_results);
        fprintf (stdout, "Shortest path on index = %d value = %d \n", shortestPath, h_results[shortestPath]);
        /* Copy exactly one path (cities entries). The previous bound of
           GRID_BLOCK*GRID_BLOCK (4096) ran far past the selected path and past
           the end of Path, corrupting the heap and causing the random crashes
           seen on later iterations. */
        for (int i = 0; i < cities; i++)
            Path[i] = pathsForThreads[shortestPath*cities + i];
        free(pathsForThreads);
        free(h_results);
    }
    CUDA_CALL(cudaFree(dev_pathsForThreads));
    CUDA_CALL(cudaFree(d_results));
    CUDA_CALL(cudaFree(devStates));
    CUDA_CALL(cudaDeviceReset());

cuda - CUDA カーネルを数回実行する

回答 1 件

Related

Reference