cuda - CUDA 加算とシフト命令のパフォーマンス

Question

私が取り組んでいる大規模な CUDA カーネルの命令スループットを理解しようとしています。加算命令とシフト命令のスループットを比較するために、2 つの小さなプログラムを作成しました。CUDA C Programming Guide によると、shift 命令のスループットは add 命令の半分です。しかし、Tesla M2070 で 2 つのプログラムの実行時間を測定すると、時間はまったく同じです。誰かがなぜこれが当てはまるのか説明してもらえますか?

追加プログラム:

#include <limits.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <cstdlib>
#include <stdint.h>

using namespace std;

__global__ void testAdd(int numIterations, uint1* result){
  int total = 1;
  for(int i=0; i< numIterations;i ++){
    total = total+i;
  }
  result[0] = make_uint1(total);
}

int main(){
  uint1* result;
  cudaMalloc((void**)(&(result)), sizeof(uint1));
  float totalElapsedTime = 0;
  int i;

  for(i = 0; i < 10; i++){
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    testAdd<<<1,1>>>(100000, result);
    cudaError_t e50 = cudaGetLastError();
    if(e50 == cudaSuccess){
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float elapsedTime;
      cudaEventElapsedTime(&elapsedTime, start, stop);
      totalElapsedTime += elapsedTime;;
      //cout << "Elapsed Time:" << elapsedTime << endl;                                                                                                                              

    }else{
      cout << "Error launching kernel: " << e50 << endl;
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
  }
  cout << "Elapsed Time: " << totalElapsedTime/i << endl;
  cudaFree(result);
}

シフトプログラム:

#include <limits.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <cstdlib>
#include <stdint.h>

using namespace std;

__global__ void testShift(int numIterations, uint1* result){
  int total = 1;
  for(int i=0; i< numIterations;i ++){
    total = total<<i;
  }
  result[0] = make_uint1(total);
}

int main(){
  uint1* result;
  cudaMalloc((void**)(&(result)), sizeof(uint1));
  float totalElapsedTime = 0;
  int i;

  for(i = 0; i < 10; i++){
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    testShift<<<1,1>>>(100000, result);
    cudaError_t e50 = cudaGetLastError();
    if(e50 == cudaSuccess){
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float elapsedTime;
      cudaEventElapsedTime(&elapsedTime, start, stop);
      totalElapsedTime += elapsedTime;;
      //cout << "Elapsed Time:" << elapsedTime << endl;                                                                                                                              

    }else{
      cout << "Error launching kernel: " << e50 << endl;
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
  }
  cout << "Elapsed Time: " << totalElapsedTime/i << endl;
  cudaFree(result);
}

編集：ptxコードの追加およびシフトプログラムの追加。ご覧のとおり、唯一の違いは 78 行目です。つまり、add 命令と shl 命令です。

PTX コードを追加:

        .entry _Z7testAddiP5uint1 (
                .param .s32 __cudaparm__Z7testAddiP5uint1_numIterations,
                .param .u64 __cudaparm__Z7testAddiP5uint1_result)
        {
        .reg .u32 %r<8>;
        .reg .u64 %rd<3>;
        .reg .pred %p<4>;
        .loc    16      10      0
 //   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testAdd(int numIterations, uint1* result){
$LDWbegin__Z7testAddiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
        mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total+i;
        add.s32         %r5, %r4, %r5;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        .loc    16      13      0
        setp.ne.s32     %p2, %r1, %r4;
//   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testAdd(int numIterations, uint1* result){
$LDWbegin__Z7testAddiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
        mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total+i;
        add.s32         %r5, %r4, %r5;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        .loc    16      13      0
        setp.ne.s32     %p2, %r1, %r4;
        @%p2 bra        $Lt_0_1794;
        bra.uni         $Lt_0_1282;
$Lt_0_2306:
        mov.s32         %r5, 1;
$Lt_0_1282:
        .loc    16      15      0
 //  14    }
 //  15    result[0] = make_uint1(total);
        ld.param.u64    %rd1, [__cudaparm__Z7testAddiP5uint1_result];
        st.global.u32   [%rd1+0], %r5;
        .loc    16      16      0
 //  16  }
        exit;
$LDWend__Z7testAddiP5uint1:
        } // _Z7testAddiP5uint1

シフト PTX コード:

        .entry _Z9testShiftiP5uint1 (
                .param .s32 __cudaparm__Z9testShiftiP5uint1_numIterations,
                .param .u64 __cudaparm__Z9testShiftiP5uint1_result)
        {
        .reg .u32 %r<8>;
        .reg .u64 %rd<3>;
        .reg .pred %p<4>;
        .loc    16      10      0
 //   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testShift(int numIterations, uint1* result){
$LDWbegin__Z9testShiftiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
    mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total<<i;
        shl.b32         %r5, %r5, %r4;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        .reg .u64 %rd<3>;
        .reg .pred %p<4>;
        .loc    16      10      0
 //   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testShift(int numIterations, uint1* result){
$LDWbegin__Z9testShiftiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
    mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total<<i;
        shl.b32         %r5, %r5, %r4;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        .loc    16      13      0
        setp.ne.s32     %p2, %r1, %r4;
        @%p2 bra        $Lt_0_1794;
        bra.uni         $Lt_0_1282;
$Lt_0_2306:
        mov.s32         %r5, 1;
$Lt_0_1282:
        .loc    16      15      0
 //  14    }
 //  15    result[0] = make_uint1(total);
        ld.param.u64    %rd1, [__cudaparm__Z9testShiftiP5uint1_result];
        st.global.u32   [%rd1+0], %r5;
        .loc    16      16      0
 //  16  }
        exit;
$LDWend__Z9testShiftiP5uint1:
        } // _Z9testShiftiP5uint1

score 2 · Accepted Answer

@gmemon: GPU アセンブリコードを調べたい場合、PTX は中間言語であるため、ここではあまり役に立ちません。

実際のアセンブリコードを取得するには、次のようにします。

NVCC -keep オプションを使用してプログラムをコンパイルします
逆アセンブルするには、CUBIN ファイルで cuobjdump --dump-sass を使用します。

CUBIN ファイルは通常、アーキテクチャに応じて foo.sm_20.cubin または foo.sm_30.cubin と呼ばれます。

たとえば、ケプラーの分解は次のようになります。

    /*7458*/     /*0x001b9e85c0000000*/     LDL.CS R46, [R1];
    /*7460*/     /*0x101ade85c0000000*/     LDL.CS R43, [R1+0x4];
    /*7468*/     /*0xf2655c85c8000063*/     STL [R38+0x18fc], R21;
    /*7470*/     /*0x3ee35c036800c000*/     LOP.AND R13, R46, 0xf;
    /*7478*/     /*0x400000076000000c*/     SSY 0x7790;
    /*7488*/     /*0xfcdfdd0348010000*/     IADD RZ.CC, R13, -RZ;
    /*7490*/     /*0xfff1dc63190e0000*/     ISETP.EQ.X.AND P0, pt, RZ, RZ, pt;
    /*7498*/     /*0x800001e74000000b*/     @P0 BRA 0x7780;
    /*74a0*/     /*0xfc001de428000000*/     MOV R0, RZ;
    /*74a8*/     /*0x04039de218000000*/     MOV32I R14, 0x1;
    /*74b0*/     /*0x0403dde218000000*/     MOV32I R15, 0x1;
    /*74b8*/     /*0x626fdca5c8000064*/     STL.64 [R38+0x1918], RZ;

命令のセマンティクスは、cuobjdump ツールのマニュアルに記載されています。

score 0 · Accepted Answer

PTX コードの命令数を確認することをお勧めします。2 つの例の PTX コードを投稿できますか? これにより、パフォーマンスについての手がかりが得られるはずです。

ところで、1 つのスレッドだけを使用して確実にパフォーマンスをテストできるかどうかはわかりません。

cuda - CUDA 加算とシフト命令のパフォーマンス

2 に答える 2

Related

Reference