cuda - カーネルの for ループからマルチスレッドへの変更

Question

現在、グリッドの補間に取り組んでおり、マルチスレッドに関していくつかの問題があります。このコードは、2x2 マトリックスで表されるマップを読み取り、それを補間してポイント数を 100 倍に増やすことを想定しています。カーネルで for ループを使用すると、うまく機能します。

補間前：http: //bildr.no/view/OWV1UDRO

補間後: http://bildr.no/view/eTlmNmpo

for ループをスレッドで変更しようとすると、奇妙な結果が生じました。数値の代わりに、結果の行列を -1 で埋めました。#QNAN

カーネル内の for ループを使用した作業コードは次のとおりです

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <fstream>
#include "cuda.h"

using namespace std;

float Z[41][41];

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaThreadSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

texture<float, 2, cudaReadModeElementType> tex;


__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare) 
{
    int k = sqrt(numberOfInterpolationsPerSquare);


    for (float i=0; i<n*k; i++)
    {
        for (float j=0; j<m*k; j++) 
        {
        f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
        }
    }

}

int main (void)
{
    // Start timer
    clock_t tStart = clock();

    // Size of map
    int n=41;
    int m=41;

    int g = 0;

    float numberOfInterpolationsPerSquare = 100;
    float numberOfElements = pow(sqrt(numberOfInterpolationsPerSquare)*n,2);

    size_t pitch, tex_ofs;
    float *f;
    float *r;
    float *map_d = 0;

    // Build read-Streams
    ifstream map;   

    //Create and open a txt file for MATLAB
    ofstream file;

    // Open data
    map.open("Map.txt", ios_base::in); 
    file.open("Bilinear.txt");

    // Store the map in a 2D array
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<m; j++)
        {
            map >> Z[i][j];
        }
    }

    // Allocate memory on host and device
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&map_d,&pitch,n*sizeof(*map_d),m));
    CUDA_SAFE_CALL(cudaMalloc((void**)&f, numberOfElements*sizeof(float)));
    r = (float*)malloc(numberOfElements*sizeof(float));

    // Copy map from host to device
    CUDA_SAFE_CALL(cudaMemcpy2D(map_d, pitch, Z, n*sizeof(Z[0][0]), n*sizeof(Z[0][0]),m,cudaMemcpyHostToDevice));

    // Set texture mode to bilinear interpolation
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;

    // Bind the map to texture
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, map_d, &tex.channelDesc, n, m, pitch));

    // Checking for offset
    if (tex_ofs !=0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
   }

    // Launch Kernel
    kernel <<< 1,1 >>> (m, n, f, numberOfInterpolationsPerSquare);
    CHECK_LAUNCH_ERROR();    
    CUDA_SAFE_CALL (cudaDeviceSynchronize());

    // Copy result from device to host
    cudaMemcpy(r, f, numberOfElements*sizeof(float), cudaMemcpyDeviceToHost);

    // Write results to file
    for(int h=0;h<numberOfElements;h++)
    {
        if(g==sqrt(numberOfElements))
        {
            file << endl;
            g=0;
        }
        file << r[h] << " ";
        g++;
    }

    // Free memory
    CUDA_SAFE_CALL (cudaUnbindTexture (tex));
    CUDA_SAFE_CALL (cudaFree (map_d));
    CUDA_SAFE_CALL (cudaFree (f));
    free( r );

    // Print out execution time
    printf("Time taken: %.3fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);

    return EXIT_SUCCESS;
}

これは、動作しないマルチスレッドを備えたカーネルです

__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare) 
{
    int k = sqrt(numberOfInterpolationsPerSquare);

    int i= blockIdx.x * blockDim.x + threadIdx.x;
    int j= blockIdx.y * blockDim.y + threadIdx.y;


    if(i>=n*k || j>=m*k)
        return;

    f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);

}

マルチスレッドバージョンが機能しない理由を知っている人はいますか?

よろしく

ソンドレ

cuda - カーネルの for ループからマルチスレッドへの変更

1 に答える 1

Related

Reference