OpenCL初心者の質問:)
次のようなopenclカーネルを作成しようとしています
__kernel void NLLSQ
(
__global double* image,
__global double* nllsqResult
)
{
//Do some stuff
}
ループを入れようとするまではうまくいきます。
__kernel void NLLSQ
(
__global double* image,
__global double* nllsqResult
)
{
for (int i = 0; i < 2; i++)
{
//Do some stuff
}
}
これにより、コンピューターがクラッシュしてモニターが黒くなります。問題は、グラフィックス カードにあまりにも多くの作業を送信したことだと思います。
私の完全なコードは次のようになります
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double precision doubleing point not supported by OpenCL implementation."
#endif
int2 clipPixel(int2 coordinate, int width, int height)
{
coordinate.x = max(0, coordinate.x);
coordinate.y = max(0, coordinate.y);
coordinate.x = min(width, coordinate.x); //1911
coordinate.y = min(height, coordinate.y); //1071
return coordinate;
}
int Coord2Index(int X, int Y, int width)
{
return (width * Y) + X;
}
//2D Gaussian 'bubble' Function
double f(int x, int y, double a, double b, double s)
{
return a + b*exp(-(x*x+y*y)/(s*s));
}
// (∂f/∂b)
double dfdb(int x, int y, double s)
{
return exp(-(x*x+y*y)/(s*s));
}
// (∂f/∂σ)
double dfds(int x, int y, double b, double s)
{
double v = -(x*x + y*y);
return b * exp(v/(s*s))*-2*v/(s*s*s);
}
//Non-Linear Least Squares
__kernel void NLLSQ
(
__global double* image,
__global double* nllsqResult
)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
int index = Coord2Index( x, y, 1912 );
int jacIndex = 0;
int dyIndex = 0;
int indexRslt = Coord2Index( x, y, 1904 );
double dY[81];
double J[81][3];
double JTJ[3][3];
double3 B = (double3)(0, 1, 1); //initial guess
double JTdY[3];
//Creates the dY vector
for (int j = -4; j <= 4; j++)
{
for (int i = -4; i <= 4; i++)
{
dY[dyIndex] = image[index] - f( i, j, B.x, B.y, B.z);
dyIndex = dyIndex + 1;
}
}
//Creates the Jacobian
for (int j = -4; j <= 4; j++)
{
for (int i = -4; i <= 4; i++)
{
index = Coord2Index( x + i + 4, y + j + 4, 1912 );
J[jacIndex][0] = 1;
J[jacIndex][1] = dfdb(i, j, B.z);
J[jacIndex][2] = dfds(i, j, B.y, B.z);
jacIndex = jacIndex + 1;
}
}
//Now to solve (JT * J) * ΔB = JT * ΔY for ΔB ....
JTdY[0] = 0;
JTdY[1] = 0;
JTdY[2] = 0;
//Create JTJ
for (int i = 0; i < 81; i++)
{
JTJ[0][0] = J[i][0] * J[i][0];
JTJ[0][1] = J[i][0] * J[i][1];
JTJ[0][2] = J[i][0] * J[i][2];
JTJ[1][0] = J[i][1] * J[i][0];
JTJ[1][1] = J[i][1] * J[i][1];
JTJ[1][2] = J[i][1] * J[i][2];
JTJ[2][0] = J[i][2] * J[i][0];
JTJ[2][1] = J[i][2] * J[i][1];
JTJ[2][2] = J[i][2] * J[i][2];
//JT * ΔY
JTdY[0] = J[i][0] * dY[i];
JTdY[1] = J[i][1] * dY[i];
JTdY[2] = J[i][2] * dY[i];
}
//TO DO: might have to make this next part more general if I decide not to use a 9x9 bubble template size
// Also not sure what to do when det(A) = 0 (is that even possible?)
// (JT * J) * ΔB = JT * ΔY is a system of the form Ax = b
// A = (JT * J), ΔB = x, JT * ΔY = b
//Solve using cramer's rule http://en.wikipedia.org/wiki/Cramer%27s_rule
// xi = det(Ai)/det(A)
//determinant of A
double detA =
JTJ[0][0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
JTJ[0][1] * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) +
JTJ[0][2] * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;
double detA1 =
JTdY[0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
JTJ[0][1] * ( JTdY[1] * JTJ[2][2] - JTJ[1][2] * JTdY[2] ) +
JTJ[0][2] * ( JTdY[1] * JTJ[2][1] - JTJ[1][1] * JTdY[2] ) ;
double detA2 =
JTJ[0][0] * (JTdY[1] * JTJ[2][2] - JTJ[1][2] * JTdY[2] ) -
JTdY[0] * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) +
JTJ[0][2] * (JTJ[1][0] * JTdY[2] - JTdY[1] * JTJ[2][0]) ;
double detA3 =
JTJ[0][0] * (JTJ[1][1] * JTdY[2] - JTdY[1] * JTJ[2][1]) -
JTJ[0][1] * (JTJ[1][0] * JTdY[2] - JTdY[1] * JTJ[2][0]) +
JTdY[0] * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;
// B(k+1) = B(k) + ΔB
B.x = B.x + (detA1/detA);
B.y = B.y + (detA2/detA);
B.z = B.z + (detA3/detA);
nllsqResult[indexRslt] = B.z;
}
forループをそのまま使いたい
//Non-Linear Least Squares
__kernel void NLLSQ
(
__global double* image,
__global double* nllsqResult
)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
int index = Coord2Index( x, y, 1912 );
int jacIndex = 0;
int dyIndex = 0;
int indexRslt = Coord2Index( x, y, 1904 );
double dY[81];
double J[81][3];
double JTJ[3][3];
double3 B = (double3)(0, 1, 1); //initial guess
double JTdY[3];
//Creates the dY vector
for (int j = -4; j <= 4; j++)
{
for (int i = -4; i <= 4; i++)
{
dY[dyIndex] = image[index] - f( i, j, B.x, B.y, B.z);
dyIndex = dyIndex + 1;
}
}
for (int iters = 0; iters < 10; iters++) //FOR LOOP ADDED HERE
{
jacIndex = 0;
//Creates the Jacobian
for (int j = -4; j <= 4; j++)
{
for (int i = -4; i <= 4; i++)
{
index = Coord2Index( x + i + 4, y + j + 4, 1912 );
J[jacIndex][0] = 1;
J[jacIndex][1] = dfdb(i, j, B.z);
J[jacIndex][2] = dfds(i, j, B.y, B.z);
jacIndex = jacIndex + 1;
}
}
//Now to solve (JT * J) * ΔB = JT * ΔY for ΔB ....
JTdY[0] = 0;
JTdY[1] = 0;
JTdY[2] = 0;
//Create JTJ
for (int i = 0; i < 81; i++)
{
JTJ[0][0] = J[i][0] * J[i][0];
JTJ[0][1] = J[i][0] * J[i][1];
JTJ[0][2] = J[i][0] * J[i][2];
JTJ[1][0] = J[i][1] * J[i][0];
JTJ[1][1] = J[i][1] * J[i][1];
JTJ[1][2] = J[i][1] * J[i][2];
JTJ[2][0] = J[i][2] * J[i][0];
JTJ[2][1] = J[i][2] * J[i][1];
JTJ[2][2] = J[i][2] * J[i][2];
//JT * ΔY
JTdY[0] = J[i][0] * dY[i];
JTdY[1] = J[i][1] * dY[i];
JTdY[2] = J[i][2] * dY[i];
}
//TO DO: might have to make this next part more general if I decide not to use a 9x9 bubble template size
// Also not sure what to do when det(A) = 0 (is that even possible?)
// (JT * J) * ΔB = JT * ΔY is a system of the form Ax = b
// A = (JT * J), ΔB = x, JT * ΔY = b
//Solve using cramer's rule http://en.wikipedia.org/wiki/Cramer%27s_rule
// xi = det(Ai)/det(A)
//determinant of A
double detA =
JTJ[0][0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
JTJ[0][1] * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) +
JTJ[0][2] * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;
double detA1 =
JTdY[0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
JTJ[0][1] * ( JTdY[1] * JTJ[2][2] - JTJ[1][2] * JTdY[2] ) +
JTJ[0][2] * ( JTdY[1] * JTJ[2][1] - JTJ[1][1] * JTdY[2] ) ;
double detA2 =
JTJ[0][0] * (JTdY[1] * JTJ[2][2] - JTJ[1][2] * JTdY[2] ) -
JTdY[0] * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) +
JTJ[0][2] * (JTJ[1][0] * JTdY[2] - JTdY[1] * JTJ[2][0]) ;
double detA3 =
JTJ[0][0] * (JTJ[1][1] * JTdY[2] - JTdY[1] * JTJ[2][1]) -
JTJ[0][1] * (JTJ[1][0] * JTdY[2] - JTdY[1] * JTJ[2][0]) +
JTdY[0] * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;
// B(k+1) = B(k) + ΔB
B.x = B.x + (detA1/detA);
B.y = B.y + (detA2/detA);
B.z = B.z + (detA3/detA);
}
nllsqResult[indexRslt] = B.z;
}