編集:
ログ情報とプッシュおよびポップ関連の機能が追加されました。
セットアップ: Geforce GT520、Windows 64 ビット (32 ビット用にコンパイル)、Cuda 4.2。
私のコード スニペットには、スレッドに作業を供給する作業両端キューがあります。各ブロックには独自の両端キューがあり、動的に生成された作業項目を下部にプッシュまたはポップできます (popWork および pushWork 関数)。popWork() は、それ自体の両端キュー内の作業項目の数がしきい値よりも少ない場合、他の両端キューから作業を盗むこともできます。
template <class TreeNode, class BV , int iDequeSize , int iFrontSize>
__global__ void traverseTree(const TreeNode* tree_object1, const GPUVertex* vertex_object1, const uint3* tri_object1,
int2* aBvttDeques, int* aiBvttBottoms, unsigned int* auiBvttAges, int *aiBvttDequeFlags , int *piOverflowFlag ,
int2* outputList, unsigned int* outputListIdx , int2* aFrontDeques , int *auiFrontBottoms)
{
int iTid = threadIdx.x;
int iBid = blockIdx.x;
__shared__ int2 aLocalBvtt[BVTT_DEQUE_SHARED_SIZE];
__shared__ int2 aLocalFront[FRONT_DEQUE_SHARED_SIZE];
__shared__ int iLocalBvttCounter[WORK_STEALING_THREADS];
__shared__ int iLocalFrontCounter[WORK_STEALING_THREADS];
__shared__ unsigned int uiPopDequeIdx;
__shared__ int2 pushOrPopStartIdxAndSize;
__shared__ bool bPopOrPushFlag;
__shared__ unsigned int uiActiveDeques[NDEQUES]; //Contains indices for deques with useful work that can be stolen
__shared__ unsigned int uiActiveDequesIdx;
//Debug
int iRun = 0;
//
while(/*true*/ /*Debug*/iRun++ < 10) //Work loop will continue until cannot pop from bottom or cannot steal work from other deques
{
//Debug
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
printf("(%d,%d) before sync0\n" , iBid , iTid);
}
__syncthreads();
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
printf("(%d,%d) after sync0\n" , iBid , iTid);
}
//
int2 aWork_items[3];
int2 aFront_item;
int iBvttCount = 0;
int iFrontCount = 0;
if(!popWork<int2 , iDequeSize>(aiBvttDequeFlags , aiBvttBottoms , auiBvttAges , aBvttDeques , iTid , iBid ,
uiPopDequeIdx , pushOrPopStartIdxAndSize , /*iLocalBvttCounter , iLocalFrontCounter ,*/ bPopOrPushFlag ,
uiActiveDeques , uiActiveDequesIdx , aWork_items[0]))
{ //No more work
//Debug
if(iTid == 0)
{
printf("(%d,%d)no work\n" , iBid , iTid);
}
//
return;
}
//Debug
if(iTid == 0 && iBid == 0)
{
printf("(%d,%d) run=%d work=(%d,%d)\n" , iBid , iTid , iRun , aWork_items[0].x , aWork_items[0].y);
}
//
if(iTid < pushOrPopStartIdxAndSize.y)
{
TreeNode node1 = tree_object1[aWork_items[0].x];
TreeNode node2 = tree_object1[aWork_items[0].y];
if(aWork_items[0].x == aWork_items[0].y)
{ //intra-collision test (self collision)
intraCollision<TreeNode , BV>(iBvttCount , aWork_items , aWork_items[0] , node1 , tree_object1);
}
else
{ //inter-collision test
//interCollision<TreeNode , BV>(iLocalBvttCounter , aLocalBvtt , outputList , outputListIdx , work_item ,
// node1 , node2 , tree_object1 , tri_object1 , aFrontDeques , auiFrontBottoms[iBid]);
interCollision<TreeNode , BV>(iBvttCount , aWork_items , outputList , outputListIdx , aWork_items[0] ,
node1 , node2 , tree_object1 , tri_object1 , aFront_item , iFrontCount);
}
//__syncthreads();
}
iLocalBvttCounter[iTid] = iBvttCount;
iLocalFrontCounter[iTid] = iFrontCount;
//Debug
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
printf("(%d,%d) before sync1\n" , iBid , iTid);
}
//
__syncthreads();
//Debug
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
printf("(%d,%d) after sync1\n" , iBid , iTid);
}
//
{
int iPrefixSum = prefixSum<WORK_STEALING_THREADS>(iLocalFrontCounter , iTid , iFrontCount);
if(iFrontCount)
{
aLocalFront[iPrefixSum] = aFront_item;
}
if(iTid == WORK_STEALING_THREADS - 1)
{
iLocalFrontCounter[iTid] = iPrefixSum + iFrontCount;
}
iPrefixSum = prefixSum<WORK_STEALING_THREADS>(iLocalBvttCounter , iTid , iBvttCount);
//Debug
if(iTid == 0 && iBid == 0)
{
printf("(%d,%d) nChildren=%d prefixSum=%d\n" , iBid , iTid , iBvttCount , iPrefixSum);
for(int i = 0 ; i < iBvttCount ; ++i)
{
printf("(%d,%d) children %d=(%d,%d)\n" , iBid , iTid , i , aWork_items[i].x , aWork_items[i].y);
}
}
if(iTid == 1 && iBid == 0)
{
printf("(%d,%d) nprefixSumt2=%d\n" , iBid , iTid , iPrefixSum);
}
//
for(int i = 0 ; i < iBvttCount ; ++i)
{
aLocalBvtt[iPrefixSum + i] = aWork_items[i];
}
if(iTid == WORK_STEALING_THREADS - 1)
{
iLocalBvttCounter[iTid] = iPrefixSum + iBvttCount;
//Debug
if(iBid == 0)
{
printf("(%d,%d) totalWork=%d, prefix=%d + count=%d\n" , iBid , iTid , iLocalBvttCounter[iTid] , iPrefixSum , iBvttCount);
}
//
}
}
//Debug
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
printf("(%d,%d) before sync2\n" , iBid , iTid);
}
//
__syncthreads();
//Debug
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
printf("(%d,%d) after sync2\n" , iBid , iTid);
}
//
//TODO: push work back only when memory size is good for better coallescence
//Push back front to global mem
if(!pushWork<int2 , iFrontSize>(auiFrontBottoms , aFrontDeques , iTid , iBid ,
iLocalFrontCounter[WORK_STEALING_THREADS - 1] , aLocalFront , bPopOrPushFlag , pushOrPopStartIdxAndSize))
{ //overflow
if(iTid == 0)
{
//Debug
printf("(%d,%d) front overflow\n" , iBid , iTid);
//
*piOverflowFlag = 1;
atomicExch(&aiBvttDequeFlags[iBid] , 0);
}
return;
}
//Debug
if(iTid == 0 && iBid == 0)
{
printf("(%d,%d) localnWork=%d\n" , iBid , iTid , iLocalBvttCounter[WORK_STEALING_THREADS - 1]);
}
//
//Push back BVTT nodes to global mem
if(!pushWork<int2 , iDequeSize>(aiBvttBottoms , aBvttDeques , iTid , iBid , iLocalBvttCounter[WORK_STEALING_THREADS - 1] ,
aLocalBvtt , bPopOrPushFlag , pushOrPopStartIdxAndSize))
{ //overflow
if(iTid == 0)
{
//Debug
printf("(%d,%d) bvtt overflow\n" , iBid , iTid);
//
*piOverflowFlag = 1;
atomicExch(&aiBvttDequeFlags[iBid] , 0);
}
return;
}
//Debug
if(iTid == 0 && iBid == 0)
{
printf("(%d,%d) bot=%d\n\n" , iBid , iTid , aiBvttBottoms[iBid]);
}
//
//Debug
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
printf("(%d,%d) before sync4\n" , iBid , iTid);
}
__syncthreads();
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
printf("(%d,%d) after sync4\n" , iBid , iTid);
}
//
}
//Debug
if(iTid == 0)
{
printf("(%d,%d) max iter\n" , iBid , iTid);
}
//
}
popWork() 関連のコード:
bool __inline__ __device__ popTop(int *aiDequesBottoms , unsigned int *auiDequesAges , const int &iBid ,
int2 &popStartIdxAndSize)
{
int index;
unsigned int oldAge = auiDequesAges[iBid];
int localBot = aiDequesBottoms[iBid];
index = oldAge >> WORK_STEALING_TAG_NBITS;
if(localBot < index + WORK_STEALING_POP_SIZE + WORK_STEALING_PUSH_SIZE)
{
return false;
}
int localTag = oldAge & WORK_STEALING_TAG_MASK;
int size = min(WORK_STEALING_POP_SIZE , localBot - index);
unsigned int newAge = (index+size << WORK_STEALING_TAG_NBITS)| localTag;
if(oldAge == atomicCAS(&auiDequesAges[iBid] , oldAge , newAge))
{
popStartIdxAndSize.x = index;
popStartIdxAndSize.y = size;
return true;
}
else
{
return false;
}
}
bool __inline__ __device__ popBottom(int *aiDequesBottoms , unsigned int *auiDequesAges , const int &iBid ,
int2 &popStartIdxAndSize)
{
int localBot = aiDequesBottoms[iBid];
if(localBot == 0)
{
return false;
}
int index = localBot;
localBot = localBot - WORK_STEALING_POP_SIZE;
atomicExch(&aiDequesBottoms[iBid] , localBot);
unsigned int oldAge = auiDequesAges[iBid];
int oldAgeTop = int(oldAge >> WORK_STEALING_TAG_NBITS);
if(localBot > oldAgeTop)
{
popStartIdxAndSize.y = WORK_STEALING_POP_SIZE;
popStartIdxAndSize.x = index - WORK_STEALING_POP_SIZE;
return true;
}
atomicExch(&aiDequesBottoms[iBid] , 0);
unsigned int newAge = ((oldAge & WORK_STEALING_TAG_MASK) + 1) % (WORK_STEALING_TAG_MASK + 1);
if(index > oldAgeTop)
{
if(oldAge == atomicCAS(&auiDequesAges[iBid] , oldAge , newAge))
{
popStartIdxAndSize.y = index - oldAgeTop;
popStartIdxAndSize.x = index - popStartIdxAndSize.y;
return true;
}
}
atomicExch(&auiDequesAges[iBid] , newAge);
return false;
}
template <typename Work , int iDequeSize>
bool __inline__ __device__ popWork(int *aiDequeFlags , int *aiDequesBottoms , unsigned int *auiDequesAges ,
const Work *aDeques , const int &iTid , const int &iBid , unsigned int &uiPopDequeIdx , int2 &popStartIdxAndSize ,
/*int &iLocalDequeCounter , int &iLocalFrontCounter ,*/ bool &bPopFlag , unsigned int *uiActiveDeques ,
unsigned int &uiActiveDequesIdx , Work &work)
{
if(iTid == 0)
{ //Try to pop from block deque
//iLocalDequeCounter = 0;
//iLocalFrontCounter = 0;
bPopFlag = popBottom(aiDequesBottoms , auiDequesAges , iBid , popStartIdxAndSize);
if(bPopFlag)
{
uiPopDequeIdx = iBid;
}
else
{
atomicExch(&aiDequeFlags[iBid] , 0);
}
}
__syncthreads();
while(!bPopFlag)
{ //No more work, try to steal some (Help, police! We have a burglar here)!
if(iTid == 0)
{
uiActiveDequesIdx = 0;
//Debug
/*if(iBid == 6 || iBid == 1)
{
printf("bid=%d dequeFlags:[%d,%d,%d,%d,%d,%d,%d,%d]\n" , iBid , aiDequeFlags[0] , aiDequeFlags[1] ,
aiDequeFlags[2] , aiDequeFlags[3] , aiDequeFlags[4] , aiDequeFlags[5] , aiDequeFlags[6] ,
aiDequeFlags[7]);
printf("bId=%d dequesCounts:[%d,%d,%d,%d,%d,%d,%d,%d]\n" , iBid , aiDequesBottoms[0] ,
aiDequesBottoms[1] , aiDequesBottoms[2] , aiDequesBottoms[3] , aiDequesBottoms[4] ,
aiDequesBottoms[5] , aiDequesBottoms[6] , aiDequesBottoms[7]);
}*/
//
}
__syncthreads();
if(iTid < NDEQUES)
{
if(aiDequeFlags[iTid] == 1) //assuming iTid >= NDEQUES
{ //Set this deque for a work stealing atempt.
unsigned int uiIdx = atomicAdd(&uiActiveDequesIdx,1);
uiActiveDeques[uiIdx] = iTid;
}
}
__syncthreads();
if(iTid == 0)
{ //Try to steal until succeeds or there are no more deques left to search
bPopFlag = false;
for(uiPopDequeIdx = 0 ; uiPopDequeIdx < uiActiveDequesIdx; ++uiPopDequeIdx)
{
bPopFlag = popTop(aiDequesBottoms , auiDequesAges , uiPopDequeIdx , popStartIdxAndSize);
if(bPopFlag)
{
atomicExch(&aiDequeFlags[iBid] , 1);
break;
}
}
}
__syncthreads();
if(uiActiveDequesIdx == 0)
{ //No more work to steal. End.
return false;
}
__syncthreads();
}
//Get poped data
if(iTid < popStartIdxAndSize.y) //assuming number of threads >= WORK_SIZE
{
work = aDeques[uiPopDequeIdx*iDequeSize + popStartIdxAndSize.x + iTid];
//Debug
/*if(iTid == 20 && iBid == 0)
{
printf("work=(%d,%d) deque=%d dSize=%d start=%d final=%d\n" , work.x , work.y , uiPopDequeIdx , iDequeSize ,
popStartIdxAndSize.x , uiPopDequeIdx*iDequeSize + popStartIdxAndSize.x + iTid);
}*/
//
}
return true;
}
pushWork() 関連のコード:
template<int iDequeSize>
bool __inline__ __device__ pushBottom(int *aiDequeBottoms , const int &iBid , int2 &pushStartIdxAndSize)
{
int iOldBot = aiDequeBottoms[iBid];
pushStartIdxAndSize.x = iOldBot;
iOldBot += pushStartIdxAndSize.y;
if(iOldBot < iDequeSize)
{
atomicExch(&aiDequeBottoms[iBid] , iOldBot);
return true;
}
return false;
}
template <typename Work , int iDequeSize>
bool __inline__ __device__ pushWork(int *aiDequesBottoms , Work *aDeques , const int &iTid , const int &iBid ,
const unsigned int &uiDequeOutputCounter , Work *aOutputLocalWork , bool &bPushFlag , int2 &pushStartIdxAndSize)
{
if(uiDequeOutputCounter == 0)
{
return true;
}
pushStartIdxAndSize.y = uiDequeOutputCounter;
if(iTid == 0)
{
bPushFlag = pushBottom<iDequeSize>(aiDequesBottoms , iBid , pushStartIdxAndSize);
}
//Debug
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
printf("(%d,%d) before push sync\n" , iBid , iTid);
}
//
__syncthreads();
//Debug
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
printf("(%d,%d) after push sync\n" , iBid , iTid);
}
//
if(!bPushFlag)
{
return false;
}
//Transfer to global mem.
//unsigned int uiWorkLeft = uiDequeOutputCounter;
unsigned int uiThreadOffset = iTid;
//Debug
//int iRun = 0;
//
while(uiThreadOffset < uiDequeOutputCounter)
{
//Debug
/*if(iTid == 0 && iBid == 0)
{
printf("workLeft=%d bot=%d\n" , uiWorkLeft , aiDequesBottoms[iBid]);
}*/
//
//Debug
/*if(iBid == 0)
{
printf("tid=%d run=%d pushStartIdx.x=%d final=%d\n" , iTid , iRun , pushStartIdxAndSize.x ,
iDequeSize*iBid + pushStartIdxAndSize.x + iTid);
}*/
//
//aDeques[iDequeSize*iBid + pushStartIdxAndSize.x + iTid] = aOutputLocalWork[uiThreadOffset];
atomicExch(&(aDeques[iDequeSize*iBid + pushStartIdxAndSize.x + uiThreadOffset].x) , aOutputLocalWork[uiThreadOffset].x);
atomicExch(&(aDeques[iDequeSize*iBid + pushStartIdxAndSize.x + uiThreadOffset].y) , aOutputLocalWork[uiThreadOffset].y);
uiThreadOffset += blockDim.x;
//Debug
//++iRun;
//
//__threadfence();
}
return true;
}
私の問題は、最初の繰り返しの後、スレッドが同期ポイント (__syncthreads()) で待機していないように見えることです。これは、生成されたログに記載されています。また、この問題はリリース ビルドでのみ発生します。この点では、デバッグ ビルドは正常に機能しています。
(0,0) before sync0
(0,95) before sync0
(0,0) after sync0
(0,95) after sync0
(0,95) before sync1
(0,0) run=1 work=(0,0)
(0,0) before sync1
(0,0) after sync1
(0,95) after sync1
(0,0) nChildren=3 prefixSum=0
(0,95) totalWork=3, prefix=3 + count=0
(0,0) children 0=(1,2)
(0,95) before sync2
(0,0) children 1=(1,1)
(0,0) children 2=(2,2)
(0,1) nprefixSumt2=3
(0,0) before sync2
(0,0) after sync2
(0,95) after sync2
(0,0) localnWork=3
(0,95) before push sync
(0,0) before push sync
(0,0) after push sync
(0,95) after push sync
(0,95) before sync4
(0,95) after sync4 //PROBLEM HERE! (0,95) SHOULD STAY WAINTING IN SYNC4.
(0,0) bot=3
(0,0) before sync4
(0,95) before sync0 //NOW THREADS ARE SYNC ON DIFFERENT POINTS.
(0,0) after sync4
(0,95) after sync0
注目してくれてありがとう。