c++ - テクスチャメモリの結合 CUDA 6 のユニファイドメモリ

Question

私は、CUDA 6 を使用して Jetson TK1 用の CUDA アプリケーションを作成しています。マーク・ハリスのブログ投稿で印象を受けました。

Jetson TK1: どこでも CUDA を利用できるモバイル組み込みスーパーコンピューター

Tegra K1 のメモリは物理的に統合されています。cudaMallocManagedが通常よりもグローバルメモリの方が大幅に高速であることを示す結果も観察しcudaMemcpyました。これはおそらく、統合メモリがコピーを必要としないためです。

しかし、アプリケーションの一部にテクスチャメモリを使用したい場合はどうすればよいでしょうか? を使用したテクスチャのサポートが見つからなかったcudaMallocManagedので、通常cudaMemcpyToArrayとbindTextureToArray?を使用する必要があると想定しました。

前述の方法を使用すると、多くの場合うまくいくように見えますが、によって管理さcudaMallocManagedれる変数によって、奇妙なセグメンテーションエラーが発生することがあります。これは、テクスチャメモリを統合メモリと共に使用する正しい方法ですか? 次のコードは、その方法を示しています。このコードは正常に動作しますが、私の質問は、これが正しい方法なのか、それともセグメンテーション違反などを引き起こす可能性のある未定義の動作を作成する可能性があるのかということです。

#define width 16
#define height 16
texture<float, cudaTextureType2D, cudaReadModeElementType> input_tex;

__global__ void some_tex_kernel(float* output){
    int i= threadIdx.x;
    float x = i%width+0.5f;
    float y =  i/width+0.5f;
    output[i] = tex2D(input_tex, x, y);
}

int main(){
    float* out;
    if(cudaMallocManaged(&out, width*height*sizeof(float))!= cudaSuccess)
        std::cout << "unified not working\n";

    for(int i=0; i< width*height; ++i){
        out[i] = float(i);
    }

    const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaArray* input_t;
    cudaMallocArray(&input_t, &desc, width, height);
    cudaMemcpyToArrayAsync(input_t, 0, 0, out, width*height*sizeof(float),  cudaMemcpyHostToDevice);

    input_tex.filterMode = cudaFilterModeLinear;
    cudaBindTextureToArray(input_tex, input_t, desc);

    some_tex_kernel<<<1, width*height>>>(out);
    cudaDeviceSynchronize();

    for(int i=0;i<width*height; ++i)
        std::cout << out[i] << " ";

    cudaFree(out);
    cudaFreeArray(input_t); 
    }
}

私が奇妙だと思うもう 1 つの点はcudaDeviceSynchronize()、コード内のを削除すると、常にセグメンテーションエラーが発生することです。同期せずに読んだ場合、結果が完成しない可能性があることは理解していますが、変数にまだアクセスできるべきではありませんか?

誰にも手がかりがありますか？

マティアス

score 3 · Accepted Answer

Robert Crovella はすでにあなたの質問に答えています。ただし、cudaMallocManagedがテクスチャメモリのフレームワークで使用できることを示すために、1D 線形補間コードをダスティングし、を使用して変換しましたcudaMallocManaged。コードが 4 つの異なる方法で 1D 線形補間を実行することがわかります。

CPU;
GPU;
GPU を使用してtex1Dfetch;
tex1Dフィルタリングを使用する GPU 。

コードはすべてのケースで問題なく動作し、特に後者の 2 つのケースでは Kepler K20c カード上で動作します。

// includes, system
#include <cstdlib> 
#include <conio.h>
#include <math.h>
#include <fstream>
#include <iostream> 
#include <iomanip>

// includes, cuda 
#include <cuda.h>
#include <cuda_runtime.h>

using namespace std;

texture<float, 1, cudaReadModeElementType> data_d_texture_filtering;
texture<float, 1> data_d_texture;

#define BLOCK_SIZE 256

/******************/
/* ERROR CHECKING */
/******************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getch(); exit(code); }
    }
}

/************/
/* LINSPACE */
/************/
// --- Generates N equally spaced, increasing points between a and b and stores them in x 
void linspace(float* x, float a, float b, int N) {
    float delta_x=(b-a)/(float)N;
    x[0]=a;
    for(int k=1;k<N;k++) x[k]=x[k-1]+delta_x;
}

/*************/
/* RANDSPACE */
/*************/
// --- Generates N randomly spaced, increasing points between a and b and stores them in x 
void randspace(float* x, float a, float b, int N) {
    float delta_x=(b-a)/(float)N;
    x[0]=a;
    for(int k=1;k<N;k++) x[k]=x[k-1]+delta_x+(((float)rand()/(float)RAND_MAX-0.5)*(1./(float)N));
}

/******************/
/* DATA GENERATOR */
/******************/
// --- Generates N complex random data points, with real and imaginary parts ranging in (0.f,1.f)
void Data_Generator(float* data, int N) {
    for(int k=0;k<N;k++) {
        data[k]=(float)rand()/(float)RAND_MAX;
    }
}

/*************************************/
/* LINEAR INTERPOLATION KERNEL - CPU */
/*************************************/
float linear_kernel_CPU(float in)
{
    float d_y;
    return 1.-abs(in);
}

/***************************************/
/* LINEAR INTERPOLATION FUNCTION - CPU */
/***************************************/
void linear_interpolation_function_CPU(float* result_GPU, float* data, float* x_in, float* x_out, int M, int N){

    float a;
    for(int j=0; j<N; j++){
        int k = floor(x_out[j]+M/2);
        a = x_out[j]+M/2-floor(x_out[j]+M/2);
        result_GPU[j] = a * data[k+1] + (-data[k] * a + data[k]);
    }   
}

/*************************************/
/* LINEAR INTERPOLATION KERNEL - GPU */
/*************************************/
__device__ float linear_kernel_GPU(float in)
{
    float d_y;
    return 1.-abs(in);
}

/**************************************************************/
/* LINEAR INTERPOLATION KERNEL FUNCTION - GPU - GLOBAL MEMORY */
/**************************************************************/
__global__ void linear_interpolation_kernel_function_GPU(float* __restrict__ result_d, const float* __restrict__ data_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if(j<N)
    {
        float reg_x_out = x_out_d[j]+M/2;
        int k = __float2int_rz(reg_x_out); 
        float a = reg_x_out - truncf(reg_x_out);
        float dk = data_d[k];
        float dkp1 = data_d[k+1];
        result_d[j] = a * dkp1 + (-dk * a + dk);
    } 
}

/***************************************************************/
/* LINEAR INTERPOLATION KERNEL FUNCTION - GPU - TEXTURE MEMORY */
/***************************************************************/
__global__ void linear_interpolation_kernel_function_GPU_texture(float* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if(j<N)
    {
        float reg_x_out = x_out_d[j]+M/2;
        int k = __float2int_rz(reg_x_out); 
        float a = reg_x_out - truncf(reg_x_out);
        float dk = tex1Dfetch(data_d_texture,k);
        float dkp1 = tex1Dfetch(data_d_texture,k+1);
        result_d[j] = a * dkp1 + (-dk * a + dk);
    } 
}

/************************************************************************************/
/* LINEAR INTERPOLATION KERNEL FUNCTION - GPU - TEXTURE MEMORY - FILTERING FEATURES */
/************************************************************************************/
__global__ void linear_interpolation_kernel_function_GPU_texture_filtering(float* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x; 
    if(j<N) result_d[j] = tex1D(data_d_texture_filtering,float(x_out_d[j]+M/2+0.5));
}

/***************************************/
/* LINEAR INTERPOLATION FUNCTION - GPU */
/***************************************/
void linear_interpolation_function_GPU(float* result_d, float* data_d, float* x_in_d, float* x_out_d, int M, int N){

    dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1),1);
    linear_interpolation_kernel_function_GPU<<<dimGrid,dimBlock>>>(result_d, data_d, x_out_d, M, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
}

/********************************************************/
/* LINEAR INTERPOLATION FUNCTION - GPU - TEXTURE MEMORY */
/********************************************************/
void linear_interpolation_function_GPU_texture(float* result_d, float* data_d, float* x_in_d, float* x_out_d, int M, int N){

    cudaBindTexture(NULL, data_d_texture, data_d, M*sizeof(float));

    dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1),1);
    linear_interpolation_kernel_function_GPU_texture<<<dimGrid,dimBlock>>>(result_d, x_out_d, M, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
}

/*****************************************************************************/
/* LINEAR INTERPOLATION FUNCTION - GPU - TEXTURE MEMORY - FILTERING FEATURES */
/*****************************************************************************/
void linear_interpolation_function_GPU_texture_filtering(float* result_d, float* data, float* x_in_d, float* x_out_d, int M, int N){

    cudaArray* data_d = NULL; gpuErrchk(cudaMallocArray(&data_d, &data_d_texture_filtering.channelDesc, M, 1)); 
    gpuErrchk(cudaMemcpyToArray(data_d, 0, 0, data, sizeof(float)*M, cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaBindTextureToArray(data_d_texture_filtering, data_d)); 
    data_d_texture_filtering.normalized = false; 
    data_d_texture_filtering.filterMode = cudaFilterModeLinear;

    dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1),1);
    linear_interpolation_kernel_function_GPU_texture_filtering<<<dimGrid,dimBlock>>>(result_d, x_out_d, M, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

}

/********/
/* MAIN */
/********/
int main()
{

    int M=1024;             // --- Number of input points

    int N=1024;             // --- Number of output points

    int Nit = 100;          // --- Number of computations for time measurement

    // --- Input sampling
    float* x_in; gpuErrchk(cudaMallocManaged(&x_in,sizeof(float)*M));

    // --- Input data
    float *data;        gpuErrchk(cudaMallocManaged(&data,(M+1)*sizeof(float))); Data_Generator(data,M); data[M]=0.;

    // --- Output sampling
    float* x_out;       gpuErrchk(cudaMallocManaged((void**)&x_out,sizeof(float)*N)); randspace(x_out,-M/2.,M/2.,N);

    // --- Result allocation
    float *result_CPU;                          result_CPU=(float*)malloc(N*sizeof(float));
    float *result_d;                            gpuErrchk(cudaMallocManaged(&result_d,sizeof(float)*N));
    float *result_d_texture;                    gpuErrchk(cudaMallocManaged(&result_d_texture,sizeof(float)*N));
    float *result_d_texture_filtering;          gpuErrchk(cudaMallocManaged(&result_d_texture_filtering,sizeof(float)*N));

    // --- Reference interpolation result as evaluated on the CPU
    linear_interpolation_function_CPU(result_CPU, data, x_in, x_out, M, N);

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    for (int k=0; k<Nit; k++) linear_interpolation_function_GPU(result_d, data, x_in, x_out, M, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cout << "GPU Global memory [ms]: " << setprecision (10) << time/Nit << endl;

    cudaEventRecord(start, 0);
    for (int k=0; k<Nit; k++) linear_interpolation_function_GPU_texture_filtering(result_d_texture_filtering, data, x_in, x_out, M, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cout << "GPU Texture filtering [ms]: " << setprecision (10) << time/Nit << endl;

    cudaEventRecord(start, 0);
    for (int k=0; k<Nit; k++) linear_interpolation_function_GPU_texture(result_d_texture, data, x_in, x_out, M, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cout << "GPU Texture [ms]: " << setprecision (10) << time/Nit << endl;

    float diff_norm=0.f, norm=0.f;
    for(int j=0; j<N; j++) {
        diff_norm = diff_norm + (result_CPU[j]-result_d[j])*(result_CPU[j]-result_d[j]);
        norm      = norm      + result_CPU[j]*result_CPU[j];
    }
    printf("Error GPU [percentage] = %f\n",100.*sqrt(diff_norm/norm));

    float diff_norm_texture_filtering=0.f;
    for(int j=0; j<N; j++) {
        diff_norm_texture_filtering = diff_norm_texture_filtering + (result_CPU[j]-result_d_texture_filtering[j])*(result_CPU[j]-result_d_texture_filtering[j]);
    }
    printf("Error texture filtering [percentage] = %f\n",100.*sqrt(diff_norm_texture_filtering/norm));

    float diff_norm_texture=0.f;
    for(int j=0; j<N; j++) {
        diff_norm_texture = diff_norm_texture + (result_CPU[j]-result_d_texture[j])*(result_CPU[j]-result_d_texture[j]);
    }
    printf("Error texture [percentage] = %f\n",100.*sqrt(diff_norm_texture/norm));

    cudaDeviceReset();

    return 0;
}

score 3 · Accepted Answer

現時点で唯一可能なマネージメモリは、を使用した静的割り当て__device__ __managed__またはを使用した動的割り当てcudaMallocManaged()です。テクスチャ、サーフェス、定数メモリなどは直接サポートされていません。

テクスチャリングの使用は問題ありません。テクスチャの使用とマネージメモリの唯一の重複は、次の呼び出しにあります。

cudaMemcpyToArrayAsync(input_t, 0, 0, out, width*height*sizeof(float),  cudaMemcpyHostToDevice);

ここで、マネージメモリは転送のソース (ホスト側) です。これは、カーネルが実行されていない期間中に呼び出しが発行されている限り、許容されます (以下を参照)。

「私が奇妙だと思うもう 1 つのことは、コード内の cudaDeviceSynchronize() を削除すると、常にセグメンテーションエラーが発生することです。」

cudaDeviceSynchronize();カーネル呼び出しの後で、マネージメモリをホストから再び見えるようにするために必要です。ドキュメントのこのセクションを注意深く読むことをお勧めします。

「一般に、GPU がアクティブな間、CPU が管理された割り当てまたは変数にアクセスすることは許可されていません。同時 CPU/GPU アクセスは、...セグメンテーション違反を引き起こします...」

ご指摘のとおり、投稿したコードは正常に動作します。マネージメモリの使用中に予測不可能なセグフォールトが発生する他のコードがある場合は、コードフローを注意深く調べて (特にストリーム、つまり同時実行性を使用している場合)、ホストがマネージデータにアクセスするのは acudaDeviceSynchronize();が発行された後であることを確認します。後続のカーネル呼び出しの前。

c++ - テクスチャ メモリの結合 CUDA 6 のユニファイド メモリ

2 に答える 2

Related

Reference

c++ - テクスチャメモリの結合 CUDA 6 のユニファイドメモリ