このイテレータのループを CUDA で動作させるには問題があります。誰でもここで助けることができますか?
std::vector<cv::DMatch> matches;
std::vector<cv::KeyPoint> key_pts1, key_pts2;
std::vector<cv::Point2f> points1, points2;
for (std::vector<cv::DMatch>::const_iterator itr = matches.begin(); itr!= matches.end(); ++it)
{
float x = key_pts1[itr->queryIdx].pt.x;
float y = key_pts1[itr->queryIdx].pt.y;
points1.push_back(cv::Point2f(x,y));
x = key_pts2[itr->trainIdx].pt.x;
y = key_pts2[itr->trainIdx].pt.y;
points2.push_back(cv::Point2f(x,y));
}
上記の CUDA への変換 - 並列処理は、私には非常に難しいと思われます。
void dmatchLoopHomography(float *itr, float *match_being, float *match_end, float *keypoint_1, float *keypoint_2, float *pts1, float *pts2)
{
float x, y;
// allocate memory in GPU memory
unsigned char *mtch_begin, *mtch_end, *keypt_1, *keypt_2, points1, *points2;
cudaHostGetDevicePointer(&mtch_begin, match_being, 0);
cudaHostGetDevicePointer(&mtch_end, match_end, 0);
cudaHostGetDevicePointer(&keypt_1, keypoint_1, 0);
cudaHostGetDevicePointer(&keypt_2, keypoint_2, 0);
cudaHostGetDevicePointer(&points1, pts1, 0);
cudaHostGetDevicePointer(&points2, pts2, 0);
//dim3 blocks(16, 16);
dim3 threads(itr, itr);
//kernal
dmatchLoopHomography_ker<<<itr,itr>>>(mtch_begin, mtch_end, keypt_1, keypt_2, points1. points2)
cudaThreadSynchronize();
}
と
__global__ void dmatchLoopHomography_ker(float *itr, float *match_being, float *match_end, float *keypoint_1, float *keypoint_2, float *pts1, float *pts2)
{
//how do I go about it ??
}