
I'm testing the effect of inserting an atomic add operation into an optimized array-reduction kernel in order to measure its performance impact, and I don't understand the results. I tested five different kernels:

0 - fully optimized reduction kernel as provided in samples/6_Advanced/reduction/reduction_kernel.cu  
1 - optimized reduction kernel as described in samples/6_Advanced/docs/reduction.pdf  
2 - kernel 1 with atomic warp-synchronous reduction  
3 - kernel 2 with the entire shared-memory reduction done atomically  
4 - kernel 3 with the first-level accumulation also done atomically (completely atomic reduction)

Average reduction times on my device for a sufficiently large sample of elements (timing sketched below):

0 - 0.00103s  
1 - 0.00103s  
2 - 0.00103s  
3 - 0.00103s  
4 - 0.00117s  
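
For reference, a minimal sketch of how each kernel is timed, assuming CUDA events; the buffer names (d_idata, d_odata), numBlocks, and the 256-thread block size here are illustrative, not the exact benchmark code:

  ///////////////////
 // timing sketch //
///////////////////
/* minimal sketch using CUDA events; the real benchmark
 * averages over many launches */

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
reduce1<256><<<numBlocks, 256, 256 * sizeof(int)>>>(d_idata, d_odata, n);
cudaEventRecord(stop);
cudaEventSynchronize(stop);              // wait for the kernel to finish

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds

cudaEventDestroy(start);
cudaEventDestroy(stop);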

Why do the atomic operations appear to have no impact at all on kernels 2 and 3, and only a slight impact on kernel 4?

Here is the complete code; the relevant kernels are the following:

  /////////////////
 // warp reduce //
/////////////////
/* warp-synchronous reduction using volatile memory
 * to prevent instruction reordering for non-atomic
 * operations */

template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, int tid) {
  if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
  if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
  if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
  if (blockSize >=  8) sdata[tid] += sdata[tid + 4];
  if (blockSize >=  4) sdata[tid] += sdata[tid + 2];
  if (blockSize >=  2) sdata[tid] += sdata[tid + 1];
}

  ////////////////////////
 // atomic warp reduce //
////////////////////////
/* warp-synchronous reduction using atomic operations
 * to serialize computation */

template <unsigned int blockSize>
__device__ void atomicWarpReduce(int *sdata, int tid) {
  if (blockSize >= 64) atomicAdd(&sdata[tid], sdata[tid + 32]);
  if (blockSize >= 32) atomicAdd(&sdata[tid], sdata[tid + 16]);
  if (blockSize >= 16) atomicAdd(&sdata[tid], sdata[tid + 8]);
  if (blockSize >=  8) atomicAdd(&sdata[tid], sdata[tid + 4]);
  if (blockSize >=  4) atomicAdd(&sdata[tid], sdata[tid + 2]);
  if (blockSize >=  2) atomicAdd(&sdata[tid], sdata[tid + 1]);
}

  ////////////////////////
 // reduction kernel 0 //
////////////////////////
/* fastest reduction algorithm provided by
 * cuda/samples/6_Advanced/reduction/reduction_kernel.cu */

template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce0(int *g_idata, int *g_odata, unsigned int n) {
  extern __shared__ int sdata[];
  // first level of reduction (global -> shared)
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;
  int sum = 0;
  // reduce multiple elements per thread
  while (i < n) {
    sum += g_idata[i];
    // check bounds
    if (nIsPow2 || i + blockSize < n)
      sum += g_idata[i + blockSize];
    i += gridSize;
  }
  // local sum -> shared memory
  sdata[tid] = sum;
  __syncthreads();
  // reduce in shared memory
  if (blockSize >= 512) {
    if (tid < 256)
      sdata[tid] = sum = sum + sdata[tid + 256];
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128)
      sdata[tid] = sum = sum + sdata[tid + 128];
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64)
      sdata[tid] = sum = sum + sdata[tid + 64];
    __syncthreads();
  }
  if (tid < 32) {
    // warp-synchronous reduction
    // volatile memory stores won't be reordered by compiler
    volatile int *smem = sdata;
    if (blockSize >= 64)
      smem[tid] = sum = sum + smem[tid + 32];
    if (blockSize >= 32)
      smem[tid] = sum = sum + smem[tid + 16];
    if (blockSize >= 16)
      smem[tid] = sum = sum + smem[tid + 8];
    if (blockSize >= 8)
      smem[tid] = sum = sum + smem[tid + 4];
    if (blockSize >= 4)
      smem[tid] = sum = sum + smem[tid + 2];
    if (blockSize >= 2)
      smem[tid] = sum = sum + smem[tid + 1];
  }
  // write result for block to global memory
  if (tid == 0)
    g_odata[blockIdx.x] = sdata[0];
}

  /////////////////////////
 // reduction kernel 1  //
/////////////////////////
/* fastest reduction algorithm described in
 * cuda/samples/6_Advanced/reduction/doc/reduction.pdf */

template <unsigned int blockSize>
__global__ void reduce1(int *g_idata, int *g_odata, unsigned int n) {
  extern __shared__ int sdata[];
  // first level of reduction (global -> shared)
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;
  sdata[tid] = 0;
  // reduce multiple elements per thread
  while (i < n) {
    sdata[tid] += g_idata[i] + g_idata[i+blockSize];
    i += gridSize;
  }
  __syncthreads();
  // reduce in shared memory
  if (blockSize >= 512) {
    if (tid < 256)
      sdata[tid] += sdata[tid + 256];
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128)
      sdata[tid] += sdata[tid + 128];
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64)
      sdata[tid] += sdata[tid + 64];
    __syncthreads();
  }
  if (tid < 32) warpReduce<blockSize>(sdata, tid);
  // write result for block to global memory
  if (tid == 0)
    g_odata[blockIdx.x] = sdata[0];
}

  /////////////////////////
 // reduction kernel 2  //
/////////////////////////
/* reduction kernel 1 executed
 * with atomic warp-synchronous addition */

template <unsigned int blockSize>
__global__ void reduce2(int *g_idata, int *g_odata, unsigned int n) {
  extern __shared__ int sdata[];
  // first level of reduction (global -> shared)
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;
  sdata[tid] = 0;
  // reduce multiple elements per thread
  while (i < n) {
    sdata[tid] += g_idata[i] + g_idata[i+blockSize];
    i += gridSize;
  }
  __syncthreads();
  // reduce in shared memory
  if (blockSize >= 512) {
    if (tid < 256)
      sdata[tid] += sdata[tid + 256];
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128)
      sdata[tid] += sdata[tid + 128];
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64)
      sdata[tid] += sdata[tid + 64];
    __syncthreads();
  }
  if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
  // write result for block to global memory
  if (tid == 0)
    g_odata[blockIdx.x] = sdata[0];
}

  /////////////////////////
 // reduction kernel 3  //
/////////////////////////

template <unsigned int blockSize>
__global__ void reduce3(int *g_idata, int *g_odata, unsigned int n) {
  extern __shared__ int sdata[];
  // first level of reduction (global -> shared)
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;
  sdata[tid] = 0;
  // reduce multiple elements per thread
  while (i < n) {
    sdata[tid] += g_idata[i] + g_idata[i+blockSize];
    i += gridSize;
  }
  __syncthreads();
  // reduce in shared memory
  if (blockSize >= 512) {
    if (tid < 256)
      atomicAdd(&sdata[tid], sdata[tid + 256]);
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128)
      atomicAdd(&sdata[tid], sdata[tid + 128]);
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64)
      atomicAdd(&sdata[tid], sdata[tid + 64]);
    __syncthreads();
  }
  if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
  // write result for block to global memory
  if (tid == 0)
    g_odata[blockIdx.x] = sdata[0];
}

  /////////////////////////
 // reduction kernel 4  //
/////////////////////////

template <unsigned int blockSize>
__global__ void reduce4(int *g_idata, int *g_odata, unsigned int n) {
  extern __shared__ int sdata[];
  // first level of reduction (global -> shared)
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;
  sdata[tid] = 0;
  // reduce multiple elements per thread
  while (i < n) {
    atomicAdd(&sdata[tid], (g_idata[i] + g_idata[i+blockSize]));
    i += gridSize;
  }
  __syncthreads();
  // reduce in shared memory
  if (blockSize >= 512) {
    if (tid < 256)
      atomicAdd(&sdata[tid], sdata[tid + 256]);
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128)
      atomicAdd(&sdata[tid], sdata[tid + 128]);
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64)
      atomicAdd(&sdata[tid], sdata[tid + 64]);
    __syncthreads();
  }
  if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
  // write result for block to global memory
  if (tid == 0)
    g_odata[blockIdx.x] = sdata[0];
}
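
For completeness, a sketch of how these templated kernels are launched; the 256-thread block size, the grid-size calculation, and the choice of reduce3 here are illustrative values, not necessarily those used for the timings above:

  ///////////////////
 // launch sketch //
///////////////////
/* illustrative launch configuration; each block needs
 * dynamic shared memory for sdata[] */

const unsigned int threads = 256;
// each thread sums two elements per grid stride
unsigned int blocks = (n + threads * 2 - 1) / (threads * 2);
size_t smem = threads * sizeof(int);

reduce3<threads><<<blocks, threads, smem>>>(d_idata, d_odata, n);
cudaDeviceSynchronize();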
