cuda - CUDAワープ同期の問題

Question

2D配列の値を1スペース右にシフトする（行の境界をラップする）カーネルを一般化すると、ワープ同期の問題が発生します。完全なコードが添付され、以下に含まれています。

このコードは、任意の配列幅、配列の高さ、スレッドブロックの数、およびブロックあたりのスレッドの数に対して機能することを目的としています。33のスレッドサイズ（つまり、フルワープより1つ多いスレッド）を選択すると、同期されない33番目のスレッド__syncthreads()が呼び出されます。これにより、出力データに問題が発生します。この問題は、複数のワープがあり、配列の幅がスレッドの数よりも大きい場合にのみ発生します（たとえば、width = 35および34スレッドの場合）。

以下は、何が起こるかを縮小した例です（実際には、カーネルがエラーを生成するために、配列にはより多くの要素が必要になります）。

初期配列：

0 1 2 3 4 
5 6 7 8 9

期待される結果：

4 0 1 2 3
9 5 6 7 8

カーネルプロデュース：

4 0 1 2 3
8 5 6 7 8

最初の行は正しく実行され（複数ある場合はブロックごとに）、後続のすべての行で最後から2番目の値が繰り返されます。私はこれを2つの異なるカード（8600GTとGTX280）でテストし、同じ結果を得ました。これが私のカーネルの単なるバグなのか、それともコードを調整しても修正できない問題なのか知りたいのですが。

完全なソースファイルは以下に含まれています。

ありがとうございました。

#include <cstdio>
#include <cstdlib>

// A method to ensure all reads use the same logical layout.
inline __device__ __host__ int loc(int x, int y, int width)
{
  return y*width + x;
}

//kernel to shift all items in a 2D array one position to the right (wrapping around rows)
__global__ void shiftRight ( int* globalArray, int width, int height)
{
  int temp1=0;          //temporary swap variables
  int temp2=0;

  int blockRange=0;     //the number of rows that a single block will shift

  if (height%gridDim.x==0)  //logic to account for awkward array sizes
    blockRange = height/gridDim.x;
  else
    blockRange = (1+height/gridDim.x);

  int yStart = blockIdx.x*blockRange;
  int yEnd = yStart+blockRange; //the end condition for the y-loop
  yEnd = min(height,yEnd);              //make sure that the array doesn't go out of bounds

  for (int y = yStart; y < yEnd ; ++y)
  {
    //do the first read so the swap variables are loaded for the x-loop
    temp1 = globalArray[loc(threadIdx.x,y,width)];
    //Each block shifts an entire row by itself, even if there are more columns than threads
    for (int threadXOffset = threadIdx.x  ; threadXOffset < width ; threadXOffset+=blockDim.x)
    {
      //blockDim.x is added so that we store the next round of values
      //this has to be done now, because the next operation will
      //overwrite one of these values
      temp2 = globalArray[loc((threadXOffset + blockDim.x)%width,y,width)];
      __syncthreads();  //sync before the write to ensure all the values have been read
      globalArray[loc((threadXOffset +1)%width,y,width)] = temp1;
      __syncthreads();  //sync after the write so ensure all the values have been written
      temp1 = temp2;        //swap the storage variables.
    }
    if (threadIdx.x == 0 && y == 0)
      globalArray[loc(12,2,width)]=globalArray[67];
  }
}


int main (int argc, char* argv[])
{
  //set the parameters to be used
  int width = 34;
  int height = 3;
  int threadsPerBlock=33;
  int numBlocks = 1;

  int memSizeInBytes = width*height*sizeof(int);

  //create the host data and assign each element of the array to equal its index
  int* hostData = (int*) malloc (memSizeInBytes);
  for (int y = 0 ; y < height ; ++y)
    for (int x = 0 ; x < width ; ++x)
      hostData [loc(x,y,width)] = loc(x,y,width);

  //create an allocate the device pointers
  int* deviceData;
  cudaMalloc ( &deviceData  ,memSizeInBytes);
  cudaMemset (  deviceData,0,memSizeInBytes);
  cudaMemcpy (  deviceData, hostData, memSizeInBytes, cudaMemcpyHostToDevice);
  cudaThreadSynchronize();

  //launch the kernel
  shiftRight<<<numBlocks,threadsPerBlock>>> (deviceData, width, height);
  cudaThreadSynchronize();

  //copy the device data to a host array
  int* hostDeviceOutput = (int*) malloc (memSizeInBytes);
  cudaMemcpy (hostDeviceOutput, deviceData, memSizeInBytes, cudaMemcpyDeviceToHost); 
  cudaFree (deviceData);

  //Print out the expected/desired device output
  printf("---- Expected Device Output ----\n");
  printf("   | ");
  for (int x = 0 ; x < width ; ++x)
    printf("%4d ",x);
  printf("\n---|-");
  for (int x = 0 ; x < width ; ++x)
    printf("-----");
  for (int y = 0 ; y < height ; ++y)
  {
    printf("\n%2d | ",y);
    for (int x = 0 ; x < width ; ++x)
      printf("%4d ",hostData[loc((x-1+width)%width,y,width)]);
  }
  printf("\n\n");

  printf("---- Actual Device Output ----\n");
  printf("   | ");
  for (int x = 0 ; x < width ; ++x)
    printf("%4d ",x);
  printf("\n---|-");
  for (int x = 0 ; x < width ; ++x)
    printf("-----");
  for (int y = 0 ; y < height ; ++y)
  {
    printf("\n%2d | ",y);
    for (int x = 0 ; x < width ; ++x)
      printf("%4d ",hostDeviceOutput[loc(x,y,width)]);
  }
  printf("\n\n");
}

score 2 · Accepted Answer

すべてのスレッドが同じ数のループ反復を実行しているわけではないため、同期が問題になります。すべてのスレッドは常に同じ__syncthreads（）-sにヒットする必要があります。

最も内側のforループを次のようなものに変換することをお勧めします。

for(int blockXOffset=0; blockXOffset < width; blockXOffset+=blockDim.x) {
  int threadXOffset=blockXOffset+threadIdx.x;
  bool isActive=(threadXOffset < width);
  if (isActive) temp2 = globalArray[loc((threadXOffset + blockDim.x)%width,y,width)];
  __syncthreads();
  if (isActive) globalArray[loc((threadXOffset +1)%width,y,width)] = temp1;
  __syncthreads();
  temp1 = temp2;
}

score 1 · Accepted Answer

プログラミングガイドから：

__syncthreads()条件付きコードで許可されますが、条件付きがスレッドブロック全体で同じように評価される場合に限ります。そうでない場合、コードの実行がハングしたり、意図しない副作用が発生したりする可能性があります。

私の例では、すべてのスレッドが同じ数のループ反復を実行しているわけではないため、同期は行われません。

cuda - CUDAワープ同期の問題

2 に答える 2

Related

Reference