I'm reading through the reduction optimization in the CUDA SDK, and I'm having trouble following what changes between reduce2 and reduce3.
/*
    This version uses sequential addressing -- no divergence or bank conflicts.
*/
template <class T>
__global__ void
reduce2(T *g_idata, T *g_odata, unsigned int n)
{
    T *sdata = SharedMemory<T>();

    // load shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

    sdata[tid] = (i < n) ? g_idata[i] : 0;

    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s=blockDim.x/2; s>0; s>>=1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }

        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
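For reference, this is how I picture reduce2 being launched. This is my own sketch, not the SDK's test harness, and the sizes (n, blockSize) and the float instantiation are just assumptions: one thread per input element, so each block reduces blockSize elements.

// My launch sketch for reduce2 (assumed sizes, not from the SDK harness):
// one thread per input element, so each block reduces blockSize elements.
unsigned int n = 1 << 20;                          // assumed input size
unsigned int blockSize = 256;                      // assumed threads per block
unsigned int numBlocks = (n + blockSize - 1) / blockSize;

float *d_idata, *d_odata;
cudaMalloc(&d_idata, n * sizeof(float));
cudaMalloc(&d_odata, numBlocks * sizeof(float));

size_t smemSize = blockSize * sizeof(float);       // one shared-mem slot per thread
reduce2<float><<<numBlocks, blockSize, smemSize>>>(d_idata, d_odata, n);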
/*
    This version uses n/2 threads --
    it performs the first level of reduction when reading from global memory.
*/
template <class T>
__global__ void
reduce3(T *g_idata, T *g_odata, unsigned int n)
{
    T *sdata = SharedMemory<T>();

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;

    T mySum = (i < n) ? g_idata[i] : 0;

    if (i + blockDim.x < n)
        mySum += g_idata[i+blockDim.x];

    sdata[tid] = mySum;
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s=blockDim.x/2; s>0; s>>=1)
    {
        if (tid < s)
        {
            sdata[tid] = mySum = mySum + sdata[tid + s];
        }

        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
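And this is how I assume reduce3 would be launched for the same input (continuing the sketch above, same assumed n, blockSize, smemSize, and d_idata). Since each thread reads two global elements (i and i + blockDim.x), I think only half as many blocks are needed, but I'm not sure I'm reading that right:

// My launch sketch for reduce3 (same assumed sizes as the reduce2 sketch):
// each thread loads two elements from global memory, so each block
// covers 2*blockSize input elements and half as many threads are launched.
unsigned int numBlocks3 = (n + (blockSize * 2) - 1) / (blockSize * 2);

float *d_odata3;
cudaMalloc(&d_odata3, numBlocks3 * sizeof(float));

reduce3<float><<<numBlocks3, blockSize, smemSize>>>(d_idata, d_odata3, n);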
I'm having trouble visualizing what the first level of reduction in reduce3 is actually doing, and why the thread count is cut in half. Can someone give me some pointers?