cuda - cub ライブラリでサポートされる最大サイズ

Question

cub::scan でサポートされている最大サイズを知っている人はいますか? 5 億を超える入力サイズのコアダンプを取得しました。自分が悪いことをしていないことを確認したかったのです...

これが私のコードです：

#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;

bool                    g_verbose = false;  // Whether to display input/output to console
CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
typedef int mytype;

/**
 * Solve inclusive-scan problem
 */

static void solve(mytype *h_in, mytype *h_cpu, int n)
{
    mytype inclusive = 0;
    for (int i = 0; i < n; ++i) {
      inclusive += h_in[i];
      h_cpu[i] = inclusive;
    }
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
    for (int i = 0; i < n; i++) {
      if (h_cpu[i] != h_o[i]) {
        return i + 1;
      }
    }
    return 0;
}

/**
 * Main
 */
int main(int argc, char** argv)
{
    cudaSetDevice(0);
    struct timeval start, end;
    int num_items = 1073741824;
    const int repetitions = 5;
    mytype *h_in, *h_out, *h_cpu;
    const int size = num_items * sizeof(mytype);
    // Allocate host arrays
    h_in = (mytype *)malloc(size);
    h_out = (mytype *)malloc(size);
    h_cpu = (mytype *)malloc(size);


    // Initialize problem and solution
    for (int i = 0; i < num_items; i++) {
        h_in[i] = i;
        h_out[i] = 0;
        h_cpu[i] = 0;
    }

    solve(h_in, h_cpu, num_items);

    // Allocate problem device arrays
    mytype *d_in = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));

    // Initialize device input
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));

    // Allocate device output array
    mytype *d_out = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));

    // Allocate temporary storage
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;


    CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Run
    gettimeofday(&start, NULL);
    for (long i = 0; i < repetitions; i++) 
        DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaThreadSynchronize();
    gettimeofday(&end, NULL);
    double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;

    cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
    int cmp = compare(h_cpu, h_out, num_items);
    printf("%d\t", num_items);
    if (!cmp)
        printf("\t%7.4fs \n", ctime);
    printf("\n");
    if (h_in) delete[] h_in;
    if (h_out) delete[] h_out;
    if (h_cpu) delete[] h_cpu;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    printf("\n\n");

    return 0;
}

score 2 · Accepted Answer

問題はここにあります：

const int size = num_items * sizeof(mytype);

そして、次のように変更することで修正できます。

const size_t size = num_items * sizeof(mytype);

コード内のの値はnum_items10 億を超えています。これを掛けるとsizeof(mytype)4 倍になるので、結果は 40 億を超えます。intこの値は変数に格納できません。とにかくそのように使用しようとすると、後続のホストコードが悪いことをします。この問題 (コアダンプ) は、実際には CUDA とは何の関係もありません。すべての CUB 要素を削除すると、コードはコアダンプします。

上記のコード行を変更し、正しい GPU (たとえば-arch=sm_35、私の場合、または-arch=sm_52Titan X GPU) 用にコンパイルすると、正しい答えが得られます (そして、セグフォールト/コアダンプはありません)。

一般に、セグフォールト/コアダンプタイプのエラーを追跡する際の正しい出発点は、このエラーがホストコードから発生していることを認識することであり、このエラーを生成しているソースコードの正確な行をローカライズする必要があります。printfこれは、printf 出力が表示されないコードの行を特定するまでコードに多くのステートメントを挿入するか、または Linux の gdb などのホストコードデバッガーを使用することによって、簡単に/退屈に実行できます。

また、このコードを記述すると、ホストに 12 GB をわずかに超えるメモリ、GPU に 8 GB をわずかに超えるメモリが必要になるため、このような設定でのみ適切に実行されることに注意してください。

参考までに、修正されたコードは次のとおりです（OPがここに投稿したものに基づいています）：

#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;

bool                    g_verbose = false;  // Whether to display input/output to console
CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
typedef int mytype;

/**
 * Solve inclusive-scan problem
 */

static void solve(mytype *h_in, mytype *h_cpu, int n)
{
    mytype inclusive = 0;
    for (int i = 0; i < n; ++i) {
      inclusive += h_in[i];
      h_cpu[i] = inclusive;
    }
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
    for (int i = 0; i < n; i++) {
      if (h_cpu[i] != h_o[i]) {
        return i + 1;
      }
    }
    return 0;
}

/**
 * Main
 */
int main(int argc, char** argv)
{
    cudaSetDevice(0);
    struct timeval start, end;
    int num_items = 1073741824;
    const int repetitions = 5;
    mytype *h_in, *h_out, *h_cpu;
    const size_t size = num_items * sizeof(mytype);
    // Allocate host arrays
    h_in = (mytype *)malloc(size);
    h_out = (mytype *)malloc(size);
    h_cpu = (mytype *)malloc(size);


    // Initialize problem and solution
    for (int i = 0; i < num_items; i++) {
        h_in[i] = i;
        h_out[i] = 0;
        h_cpu[i] = 0;
    }

    solve(h_in, h_cpu, num_items);

    // Allocate problem device arrays
    mytype *d_in = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));

    // Initialize device input
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));

    // Allocate device output array
    mytype *d_out = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));

    // Allocate temporary storage
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;


    CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Run
    gettimeofday(&start, NULL);
    for (long i = 0; i < repetitions; i++) 
        DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaThreadSynchronize();
    gettimeofday(&end, NULL);
    double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;

    cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
    int cmp = compare(h_cpu, h_out, num_items);
    printf("%d\t", num_items);
    if (!cmp)
        printf("\t%7.4fs \n", ctime);
    printf("\n");
    if (h_in) delete[] h_in;
    if (h_out) delete[] h_out;
    if (h_cpu) delete[] h_cpu;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    printf("\n\n");

    return 0;
}

cuda - cub ライブラリでサポートされる最大サイズ

1 に答える 1

Related