c++ - 複数の Nvidia GPU で次のコードを使用した場合のセグメンテーション違反

Question

次のコードは、複数のカードで実行するように構成されている場合、しばらくすると確実にセグメンテーション違反につながります (私は 4 つの GPU システム (NVIDIA Titan) を使用しています)。これは、私が以前に解決しようとした問題の最小限の例です (ここを参照してください: libcuda.so.1 から呼び出された __pthread_getspecific のセグメンテーション違反)。

コードの機能の説明:

まず、各カード (最初の N 枚のカード、N は 1 ～ 4) に膨大な量のメモリを割り当てます。これは、カードにできるだけ多くの負荷をかけることを目的としています。次に、16 個のスレッドを起動し、各スレッドが 1 枚のランダムカードで 2 つのカーネルを実行します。これを数回繰り返し、毎回ランダムにカードを選んで終了します。マスタースレッドは、16 個のスレッドすべてが終了するのを待ってから、無期限に、またはスレッドの 1 つが何らかの理由で失敗するまで、プロセスを繰り返します。

マスタースレッドを数回 (通常は約 20 回ですが、多かれ少なかれ) 繰り返した後、セグメンテーション違反が発生します。通常、スタックは次のようになります。

#0  0x00007f164a71f43c in ?? () from /usr/lib/libcuda.so
#1  0x00007f164a6bd1b5 in ?? () from /usr/lib/libcuda.so
#2  0x00007f164a5dff1a in ?? () from /usr/lib/libcuda.so
#3  0x00007f164a6c0b34 in ?? () from /usr/lib/libcuda.so
#4  0x00007f164a6c0c92 in ?? () from /usr/lib/libcuda.so
#5  0x00007f164a5e009f in ?? () from /usr/lib/libcuda.so
#6  0x00007f164a5d03c0 in ?? () from /usr/lib/libcuda.so
#7  0x00007f164a5c43bf in ?? () from /usr/lib/libcuda.so
#8  0x00007f164c131c39 in ?? () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#9  0x00007f164c152879 in cudaDeviceSynchronize () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#10 0x0000000000401911 in TestCUDA(int, unsigned int, unsigned int, unsigned int const*, unsigned short*) ()
#11 0x00000000004012b0 in main (argc=0, argv=0x100000200) at main.cpp:208

完全なソースコード:

main.cpp:

#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <pthread.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include <cuda_runtime.h>

class CriticalSection
{
    pthread_mutex_t cs;

public:

    CriticalSection();
    ~CriticalSection();

    void Lock( void );
    void Unlock( void );
};

CriticalSection::CriticalSection()
{
    assert( pthread_mutex_init( &cs, NULL ) == 0 );
}

CriticalSection::~CriticalSection()
{
    assert( pthread_mutex_destroy( &cs ) == 0 );
}

void CriticalSection::Lock( void )
{
    assert( pthread_mutex_lock( &cs ) == 0 );
}

void CriticalSection::Unlock( void )
{
    assert( pthread_mutex_unlock( &cs ) == 0 );
}

class DeviceWrapper
{
protected:

    CriticalSection m_cs;

public:

    int32_t     m_i32DeviceId;

    uint32_t*   m_pdu32Data;
    uint16_t*   m_pdu16Res;
    uint32_t    m_u32Count;

    DeviceWrapper();
    ~DeviceWrapper();

    void Lock( void );
    void Unlock( void );

    bool Init( const int32_t i32DevId, const uint32_t u32Count );
    bool Free();    
};

DeviceWrapper::DeviceWrapper()
{
    m_i32DeviceId = 0;
    m_pdu32Data = NULL;
    m_pdu16Res = NULL;
    m_u32Count = 0;
}

DeviceWrapper::~DeviceWrapper()
{

}

void DeviceWrapper::Lock( void )
{
    m_cs.Lock();
}

void DeviceWrapper::Unlock( void )
{
    m_cs.Unlock();
}

bool DeviceWrapper::Init( const int32_t i32DevId, const uint32_t u32Count )
{
    if ( cudaSetDevice( i32DevId ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to set device %d\n", i32DevId );
        return false;
    }

    if ( cudaMalloc( &m_pdu32Data, sizeof( uint32_t ) * u32Count ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to allocate %u unsigned int's on device %d\n", u32Count, i32DevId );
        return false;
    }

    if ( cudaMalloc( &m_pdu16Res, sizeof( uint16_t ) * u32Count ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to allocate %u unsigned short's on device %d\n", u32Count, i32DevId );
        return false;
    }

    m_u32Count = u32Count;
    m_i32DeviceId = i32DevId;   
    return true;
}

bool DeviceWrapper::Free()
{
    if ( cudaSetDevice( m_i32DeviceId ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to set device %d\n", m_i32DeviceId );
        return false;
    }

    if ( cudaFree( m_pdu32Data ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to free pdu32Mem on device %d\n", m_i32DeviceId );
        return false;
    }

    if ( cudaFree( m_pdu16Res ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to free pdu16Mem on device %d\n", m_i32DeviceId );
        return false;
    }

    m_pdu32Data = NULL;
    m_pdu16Res = NULL;
    m_u32Count = 0;
    m_i32DeviceId = 0;
    return true;
}

bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res );

void* DoWork( void* pArg );

static bool bRun = true;

static DeviceWrapper devices[4];

int main( int argc, char* argv[] )
{
    if ( argc != 2 )
    {
        printf( "Usage: %s <number of cards to use>\n", argv[0] );
        return 1;
    }

    uint32_t u32CardsToUse = strtoul( argv[1], NULL, 0 );

    if ( !u32CardsToUse || u32CardsToUse > 4 )
    {
        printf( "Invalid argument, must be in range 1-4\n" );
        return 2;
    }

    for ( int32_t i = 0; i < u32CardsToUse; i++ )
    {
        if ( !devices[i].Init( i, 0x20000000 ) )
        {
            for ( uint32_t j = 0; j < i; j++ )
            {
                devices[j].Free();
            }

            printf( "Failed to init device %d\n", i );
            return 3;
        }
    }

    uint32_t u32IterationsCompleted = 0;

    while ( bRun )
    {
        pthread_t pWorkers[ 16 ];
        memset( pWorkers, 0, 16 * sizeof( pthread_t ) );

        for ( uint32_t i = 0; i < 16; i++ )
        {       
            int iReturnValue = pthread_create( &pWorkers[i], NULL, &DoWork, (void*)u32CardsToUse );

            if ( iReturnValue != 0 )
            {
                printf( "Error calling pthread_create: %d\n", iReturnValue );
                return 4;
            }
        }

        for ( uint32_t i = 0; i < 16; i++ )
        {
            pthread_join( pWorkers[i], NULL );
        }

        printf( "Iterations completed: %u\n", ++u32IterationsCompleted );
    }

    printf( "Finished\n" );
    fflush( stdout );
    return 0;
}

void* DoWork( void* pArg )
{
    uint32_t u32CardsToUse = uint32_t( pArg );

    uint32_t u32TestCount = (rand() % 4) + 4;

    for ( uint32_t i = 0; i < u32TestCount; i++ )
    {
        int32_t i32DeviceId = int32_t( rand() % u32CardsToUse );

        devices[ i32DeviceId ].Lock();

        if ( !TestCUDA( i32DeviceId, 1, devices[i32DeviceId].m_u32Count, devices[i32DeviceId].m_pdu32Data, devices[i32DeviceId].m_pdu16Res ) )
        {
            printf( "DoWork: Failure in executing TestCUDA for device %d (test number %u)\n", i32DeviceId, i );
            bRun = false;
            devices[ i32DeviceId ].Unlock();
            return NULL;
        }

        devices[ i32DeviceId ].Unlock();
    }

    return NULL;
}

cuda_test.cu:

#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void HammingU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx );
        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ]++;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualByteU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 4;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualBitU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void OrderU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        uint32_t u32File = pu32Data[gidx];                  // 32-bit value to find the log2 of 
        uint32_t u32FileLog = 0;                            // result of log2 will go here
        uint32_t u32Shift = 0;

        u32FileLog = (u32File > 0xFFFF) << 4;
        u32File >>= u32FileLog;
        u32Shift = (u32File > 0xFF) << 3;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32Shift = (u32File > 0xF) << 2;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32Shift = (u32File > 0x3) << 1;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32FileLog |= (u32File >> 1);

        uint32_t u32Other = gidx;           // 32-bit value to find the log2 of 
        uint32_t u32OtherLog = 0;                           // result of log2 will go here
        u32Shift = 0;

        u32OtherLog = (u32Other > 0xFFFF) << 4;
        u32Other >>= u32OtherLog;
        u32Shift = (u32Other > 0xFF) << 3;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32Shift = (u32Other > 0xF) << 2;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32Shift = (u32Other > 0x3) << 1;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32OtherLog |= (u32Other >> 1);

        if ( u32FileLog >= u32OtherLog )
        {
            pu16Results[ gidx ] += uint16_t( u32FileLog - u32OtherLog );
        }
        else
        {
            pu16Results[ gidx ] += uint16_t( u32OtherLog - u32FileLog );
        }

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void LogU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        uint32_t u32Value = 0;

        if ( pu32Data[gidx] >= gidx )
        {
            u32Value = pu32Data[gidx] - gidx;
        }
        else
        {
            u32Value = gidx - pu32Data[gidx];
        }

        uint32_t u32Log = 0;        // result of log2 will go here
        uint32_t u32Shift = 0;

        u32Log = (u32Value > 0xFFFF) << 4;
        u32Value >>= u32Log;
        u32Shift = (u32Value > 0xFF) << 3;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Shift = (u32Value > 0xF) << 2;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Shift = (u32Value > 0x3) << 1;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Log |= (u32Value >> 1);

        pu16Results[ gidx ] += (uint16_t)u32Log;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualRetU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void HammingMulU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx ) << 5;
        gidx += blockDim.x * gridDim.x;
    }
}

bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res )
{   
    for ( uint32_t i = 0; i < u32Iterations; i++ )
    {
        if ( cudaSetDevice( i32DeviceId ) != cudaSuccess )
        {
            return false;
        }

        if ( cudaMemset( pdu16Res, 0, u32Count * sizeof( uint16_t ) ) != cudaSuccess )
        {
            return false;
        }

        for ( uint32_t j = 0; j < 3; j++ )
        {
            HammingU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualByteU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualBitU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            OrderU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            LogU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualRetU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            HammingMulU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
        }

        cudaDeviceSynchronize();
    }

    return true;
}

メイクファイル:

IDIR_CUDA = -I. -I/usr/local/cuda-5.5/include
CC_CUDA = g++
CFLAGS_CUDA = -g $(IDIR_CUDA)
LIBS_CUDA = -lz -lpthread -lrt -ldl -L/usr/local/cuda-5.5/lib64 -lcudart -lcuda

all:
    nvcc -I/usr/include -arch=compute_35 -code=sm_35 --machine 64 --compile cuda_test.cu
    $(CC_CUDA) -fpermissive *.cpp *.o -o test_cuda.out $(CFLAGS_CUDA) $(LIBS_CUDA)
    @echo DONE TEST_CUDA BUILD

clean:
    rm -f *.o test_cuda.out

CUDA 5.5 と最新のドライバー (319.32) を使用し、Ubuntu Linux (64 ビット) を実行しています。

私の質問は次のとおり です。コードにこの動作を引き起こす可能性のあるバグはありますか? カードを 1 枚だけ使用しているのにクラッシュが発生しないのはなぜですか? クラッシュの可能性がはるかに低いというだけですか？(より多くのカードを使用すると、より少ないカードよりも早くクラッシュが発生するよう です) おまけ: 他の誰かがこのコードを使用してクラッシュを見ていますか?

score 0 · Accepted Answer

クラッシュを解決する回避策を見つけることができました：

これは、使用されるGPU ごとに 1 つのスレッドがあり、他のワーカースレッドがこれらのスレッドに作業を送信して GPU で実行するという考え方です。マシン上でテストアプリを 1 日問題なく実行できました (6,000 回以上の反復が完了しました)。

c++ - 複数の Nvidia GPU で次のコードを使用した場合のセグメンテーション違反

1 に答える 1

Related

Reference