cuda - CUDAのnvccは、三項演算子/条件付き速記を適切にコンパイルしませんか？

Question

編集
これはあなたがあなた自身のためにこれらの種類のエラーを見るためにあなたがコンパイルする小さなプログラムです...

//for printf
#include <stdio.h>

#include <cuda.h>

__inline __host__ void gpuAssert(cudaError_t code, char *file, int line, 
                 bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code),
          file, line);
      //if (abort) exit(code);
   }
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

__global__ void myKernel1(int *dev_idx, int *dev_tID, const int offset)
{
   int myElement = threadIdx.x + blockDim.x * blockIdx.x;
   //
   int temp;
   temp = myElement+
      offset +
      (offset==0)?0:(offset&0x01==0x0)?(offset-1)*(offset>>1):
      (offset)*(offset>>1);
   dev_idx[myElement+offset] = temp;
   dev_tID[myElement+offset] = myElement;

}

__global__ void myKernel2(int *dev_idx, int *dev_tID, const int offset)
{
   int myElement = threadIdx.x + blockDim.x * blockIdx.x;
   //
   int temp;
   temp = myElement+offset;
   if (offset != 0 && offset&0x01==0x0) temp+= (offset-1)*(offset>>1);
   if (offset != 0 && offset&0x01!=0x0) temp+= offset*( offset>>1);
   dev_idx[myElement+offset] = temp;
   dev_tID[myElement+offset] = myElement;

}

__host__ void PrintMethod1(int *h_idx, int * h_tID, const int offset, 
               const int total, const int h_set)
{
   for (int c=(h_set==0)?0:offset;
    c < (h_set==0)?offset:total;
    c++)
      printf("Element #%d --> idx: %d   tID: %d\n",
         c,h_idx[c],h_tID[c]);
}

__host__ void PrintMethod2(int *h_idx, int * h_tID, const int offset, 
               const int total, const int h_set)
{
   int loopStart = (h_set==0)?0:offset;
   int loopEnd = (h_set==0)?offset:total;
   printf("Loop Start: %d, Loop End: %d\n",
      loopStart, loopEnd);
   for (int c=loopStart; c < loopEnd; c++)
      printf("Element #%d --> idx: %d   tID: %d\n",
         c,h_idx[c],h_tID[c]);
}

//Checks if there is a compatible device
bool IsCompatibleDeviceRunning()
{
   int *dummy;
   return cudaGetDeviceCount(dummy) != cudaSuccess;
}

int main()
{
   //Check for compatible device
   if (!IsCompatibleDeviceRunning())
   {
      printf("ERROR: No compatible CUDA devices found!\n");
      exit(1);
   }
   const int total = 30;
   const int offset = total/2;

   int * h_tID, * dev_tID, * h_idx, * dev_idx, h_set;
   h_tID = (int *) malloc(total*sizeof(int));
   h_idx = (int *) malloc(total*sizeof(int));
   gpuErrchk(cudaMalloc((void **) &dev_tID,total*sizeof(int)));
   gpuErrchk(cudaMalloc((void **) &dev_idx,total*sizeof(int)));
   myKernel1<<<1,offset>>>(dev_idx, dev_tID, 0);
   //myKernel2<<<1,offset>>>(dev_idx, dev_tID, 0);
   gpuErrchk(cudaPeekAtLastError());
   gpuErrchk(cudaDeviceSynchronize());
   myKernel1<<<1,offset>>>(dev_idx, dev_tID, offset);
   //myKernel2<<<1,offset>>>(dev_idx, dev_tID, offset);
   gpuErrchk(cudaPeekAtLastError());
   gpuErrchk(cudaDeviceSynchronize());
   gpuErrchk(cudaMemcpy(h_tID, dev_tID, total*sizeof(int),
            cudaMemcpyDeviceToHost));
   gpuErrchk(cudaMemcpy(h_idx, dev_idx, total*sizeof(int),
            cudaMemcpyDeviceToHost));
   h_set = 0;
   //PrintMethod1(h_idx, h_tID, offset, total, h_set);
   PrintMethod2(h_idx, h_tID, offset, total, h_set);
   h_set = 1;
   //PrintMethod1(h_idx, h_tID, offset, total, h_set);
   PrintMethod2(h_idx, h_tID, offset, total, h_set);
   return 0;
}

がMyKernel2実行されると、正しい出力が配列に書き込まれます。

Loop Start: 0, Loop End: 15
Element #0 --> idx: 0   tID: 0
Element #1 --> idx: 1   tID: 1
Element #2 --> idx: 2   tID: 2
Element #3 --> idx: 3   tID: 3
Element #4 --> idx: 4   tID: 4
Element #5 --> idx: 5   tID: 5
Element #6 --> idx: 6   tID: 6
Element #7 --> idx: 7   tID: 7
Element #8 --> idx: 8   tID: 8
Element #9 --> idx: 9   tID: 9
Element #10 --> idx: 10   tID: 10
Element #11 --> idx: 11   tID: 11
Element #12 --> idx: 12   tID: 12
Element #13 --> idx: 13   tID: 13
Element #14 --> idx: 14   tID: 14
Loop Start: 15, Loop End: 30
Element #15 --> idx: 120   tID: 0
Element #16 --> idx: 121   tID: 1
Element #17 --> idx: 122   tID: 2
Element #18 --> idx: 123   tID: 3
Element #19 --> idx: 124   tID: 4
Element #20 --> idx: 125   tID: 5
Element #21 --> idx: 126   tID: 6
Element #22 --> idx: 127   tID: 7
Element #23 --> idx: 128   tID: 8
Element #24 --> idx: 129   tID: 9
Element #25 --> idx: 130   tID: 10
Element #26 --> idx: 131   tID: 11
Element #27 --> idx: 132   tID: 12
Element #28 --> idx: 133   tID: 13
Element #29 --> idx: 134   tID: 14

がMyKernel1実行されると、同じ3値ベースのidx割り当てで、すべての結果に対してゼロになります。

Loop Start: 0, Loop End: 15
Element #0 --> idx: 0   tID: 0
Element #1 --> idx: 0   tID: 1
Element #2 --> idx: 0   tID: 2
Element #3 --> idx: 0   tID: 3
Element #4 --> idx: 0   tID: 4
Element #5 --> idx: 0   tID: 5
Element #6 --> idx: 0   tID: 6
Element #7 --> idx: 0   tID: 7
Element #8 --> idx: 0   tID: 8
Element #9 --> idx: 0   tID: 9
Element #10 --> idx: 0   tID: 10
Element #11 --> idx: 0   tID: 11
Element #12 --> idx: 0   tID: 12
Element #13 --> idx: 0   tID: 13
Element #14 --> idx: 0   tID: 14
Loop Start: 15, Loop End: 30
Element #15 --> idx: 0   tID: 0
Element #16 --> idx: 0   tID: 1
Element #17 --> idx: 0   tID: 2
Element #18 --> idx: 0   tID: 3
Element #19 --> idx: 0   tID: 4
Element #20 --> idx: 0   tID: 5
Element #21 --> idx: 0   tID: 6
Element #22 --> idx: 0   tID: 7
Element #23 --> idx: 0   tID: 8
Element #24 --> idx: 0   tID: 9
Element #25 --> idx: 0   tID: 10
Element #26 --> idx: 0   tID: 11
Element #27 --> idx: 0   tID: 12
Element #28 --> idx: 0   tID: 13
Element #29 --> idx: 0   tID: 14

（三項境界を使用して）実行するPrintMethod1と、セグフォールトが発生し、本質的に無限ループに陥ります。注意してください、これはホスト側にあります!!

を実行するPrintMethod2と、出力は通常、上記のように出力されます。

これが私のコンパイルコマンドです：

nvcc --compiler-options -fno-strict-aliasing -DUNIX -m64 -O2 \
--compiler-bindir /usr/bin/g++ \
-gencode=arch=compute_20,code=\"sm_21,compute_20\" \
-I/usr/local/CUDA_SDK/C/common/inc -I/usr/local/CUDA_SDK/shared/inc \
-o TEST Test.cu

私が持っている唯一の手がかりは、両方のカーネルが不適切なパラメータを持っていることについて不平を言っているということですが、それは正しく見え、正しい結果を取得しますMyKernel2。

上記の例は、以下の説明に基づいてコメント投稿者が自分で試すことができたものとほぼ同じだと思いますが、コードを書く時間と労力を節約できます。

これを理解するために、他に投稿できるものがあれば教えてください。

元の質問

langで定義されているほとんどのCコンパイラ。標準サポートの三項演算子。

例えば

int myVar;
myVar=(testFlg==true)?-1:1;

ただし、驚くべきことに、CUDAnvccは、カーネル内で使用されるときに、一部の三項演算子を取り除き、それらをゼロに置き換えるように見えます...

cuPrintf私は、コードの問題のあるブロックをチェックするために適用することによってこれを発見しました。たとえば、出力用にグローバル配列を共有する2つのカーネルがあるとします。最初のカーネルは、要素の最初のチャンクを処理します。2番目のカーネルは、最初のカーネルの要素を上書きしないように、配列内でジャンプする距離を示すオフセットを取得します。オフセットは、偶数と奇数で異なります。

だから私は書くことができます：

if (krnl!=0 && offset&0x01==0x0)
   idx+=(offset-1)*(offset>>1);
if (krnl!=0 && offset&0x01!=0x0)
   idx+=offset*(offset>>1);

しかし、（私の意見では）ほぼ同等の速記構文を書く方がコンパクトで読みやすいでしょう。

idx += (krnl==0)?0:(offset&0x01==0)?
   (offset-1)*(offset>>1):
   offset*(offset>>1);

ただし、後者のコードは、CUDAのコンパイラが省略形の条件を切り取るため、常にゼロを生成します。

この機能コードが悪用されてスレッドの分岐が発生することはわかっていますが、単純なケースでは、コンパイラーが適切に処理すれば、標準の条件と何ら変わらないように見えます。

これはコンパイラのバグですか、それとも意図的にサポートされていませんか？

この機能がCUDAに登場するかどうか誰かが知っていますか？

それが私のアドレス指定の失敗とsegfaultsの原因であることがわかって非常に驚きました...

編集
これは標準のC機能です。読み間違えて、誤って非標準だと言いました。

編集2
私はコンパイラのために「チョークアンドダイ」と言っていました。「ダイ」は間違いなく不適切な用語です。むしろ、nvccコンパイルを完了しますが、三項演算子ベースの割り当てを取り除き、ゼロに置き換えたようです。ものが適切なスポットに書き込まれていなかったため、これは後で戻ってきて私を噛みました。そして、それらのスポットは、ダブルインデックススキームのインデックスとして使用されました。インデックスはCPU側のラップアップ中に使用されたため、セグメンテーションフォールトはCPU側で発生しましたが、コンパイラのスニッピングによって駆動されていました。

コンパイラv4.1を使用していて、-O2オンにしています。オプティマイザーは、このバグの原因である可能性がある三項演算内で使用される変数を最適化しているようです。

エラーが発生しやすい3項演算は、上記の例とほぼ同じですが、大規模な加算演算が含まれます。

以下のコメント投稿者のアドバイスに従い、NVIDIAにバグレポートを提出する予定ですが、この投稿は他の人への警告として残しています。

編集3

ここでは、常にゼロを生成する、わずかにサニタイズされた完全なステートメントを示します。

__global__ void MyFunc
( const int offset
  const CustomType * dev_P,
  ...
  const int box)
{
   int tidx = blockIdx.x * blockDim.x + threadIdx.x;
   int idx=0;
   ...
   idx = tidx +
      dev_P->B +
      (box == 0)?0:(offset&0x01!=0x0):
      (offset-1)*(offset>>1):offset*(offset>>1);
   //NOTES:
   //I put the cuPrintf here.... from it I could see that tidx was diff. ints (as you 
   //would expect), but that when added together the sum was always "magically"
   //becoming zero.  The culprit was the nested ternary operator.
   //Once I replaced it with the equivalent conditional, the assignment worked as
   //expected.
   //"offset" is constant on the level of this kernel, but it is not always 0.
   //Outside the kernel "offset" varies greatly over the course of the simulation,
   //meaning that each time the kernel is called, it likely has a different value.
   //"tidx" obviously varies.
   //but somehow the above sum gave 0, likely due to an unreported compiler bug.
   //box is either 0 or 1.  For a certain type of op in my simulation I call this 
   //kernel twice, once for box value 0 and a second time for box value 1
   ...
}

score 1 · Accepted Answer

私は答えを見つけました...これは一般的なCの問題であり、CUDA固有ではありません。

三項演算子の優先順位は、LHSとRHSの両方で非常に低くなっています（ただし、それぞれの優先順位は奇妙に異なります）。

ただし、3項全体を括弧で囲むことにより、優先順位をオーバーライドできます((...)?...:...)。

私はここで言語標準にこのアプローチを採用する常識についての一般的な質問をフォークしました：
予期しない結果、GnuCの三項演算子

cuda - CUDAのnvccは、三項演算子/条件付き速記を適切にコンパイルしませんか？

1 に答える 1

Related

Reference