このコードを移植しました:
// CPU reference: propagate local gradients backwards, starting at the layer
// just before the last one and finishing at layer 0. For each neuron the
// gradient is the weighted sum of the next layer's gradients, multiplied by
// the neuron's own derivative.
if (_layersCount > 1)
{
    for (int layer = _layersCount - 2; layer >= 0; --layer)
    {
        const int nextLayer = layer + 1;

        // Each iteration of this loop is what a single CUDA thread computes in the port.
        for (int neuron = 0; neuron < _neuronsPerLayerCount[layer]; ++neuron)
        {
            const int current = indexByLayerAndNeuron(layer, neuron);

            // Sum over every neuron of the next layer: (weight of the connection
            // from `neuron` into that neuron) * (that neuron's local gradient).
            double weightedSum = 0;
            for (int next = 0; next < _neuronsPerLayerCount[nextLayer]; ++next)
            {
                weightedSum += _neuronsInputsWeights[indexByLayerNeuronAndInput(nextLayer, next, neuron)]
                             * localGradients[indexByLayerAndNeuron(nextLayer, next)];
            }

            localGradients[current] = weightedSum * derivatives[current];
        }
    }
}
CUDA へ:
// GPU back-propagation of local gradients: one kernel launch per layer,
// from the layer just before the last one down to layer 0. Launches are
// issued in order, so layer i reads the gradients already written for layer i+1.
if (_layersCount > 1)
{
    for (int i = _layersCount - 2; i >= 0; i--)
    {
        // calculateLocalGradientsForAnotherLayers
        // Integer form of floor(n / threads.x) + 1: always at least one block.
        // The kernel's bounds check discards threads beyond the layer size.
        blocksCount = _neuronsPerLayerCount[i] / threads.x + 1;
        blocks = dim3(blocksCount, 1);

        // The kernel reads the weights that connect layer i to layer i+1.
        // Those weights belong to layer i+1 (the CPU reference uses
        // indexByLayerNeuronAndInput(i+1, k, j)), so the weight offset and the
        // per-neuron input count must be taken from layer i+1, not layer i.
        // NOTE(review): assumes _inputsInPreviousLayers[l] is the offset of
        // layer l's first weight and _inputsInCurrentLayer[l] is the number of
        // inputs per neuron in layer l - confirm against indexByLayerNeuronAndInput.
        // NOTE(review): _neuronsInputsWeights must be a device pointer here;
        // the other buffers use a `device` prefix - confirm.
        calculateLocalGradientsForAnotherLayers <<<blocks, threads>>> (
            deviceLocalGradients,
            _neuronsInputsWeights,
            deviceDerivatives,
            _neuronsPerLayerCount[i],
            _neuronsInPreviousLayers[i],
            _neuronsInPreviousLayers[i + 1],
            _neuronsPerLayerCount[i + 1],
            _inputsInPreviousLayers[i + 1],
            _inputsInCurrentLayer[i + 1]);
    }
}
calculateLocalGradientsForAnotherLayers カーネル:
// Computes the local gradient of every neuron of one layer from the local
// gradients of the following layer:
//
//   localGradients[neuronsInPreviousLayers + idx] =
//       derivatives[neuronsInPreviousLayers + idx] *
//       sum over k in [0, neuronsInNextLayer) of
//           neuronsInputsWeights[inputsInPreviousLayers + k * inputsInCurrentLayer + idx] *
//           localGradients[neuronsInPreviousLayersWithCurrent + k]
//
// Launch layout: 1D grid of 1D blocks, one thread per neuron. Threads whose
// global index is >= neuronsCount do nothing, so the grid may be oversized.
//
// Parameters:
//   localGradients                     - gradient buffer; read at the next layer's entries,
//                                        written at this layer's entries
//   neuronsInputsWeights               - weight buffer (read only)
//   derivatives                        - derivative buffer (read only)
//   neuronsCount                       - number of neurons handled by this launch
//   neuronsInPreviousLayers            - offset of this layer's first neuron in
//                                        localGradients / derivatives
//   neuronsInPreviousLayersWithCurrent - offset of the next layer's first neuron in localGradients
//   neuronsInNextLayer                 - number of neurons in the next layer
//   inputsInPreviousLayers             - base offset into neuronsInputsWeights
//   inputsInCurrentLayer               - stride between consecutive k rows in neuronsInputsWeights
//
// NOTE(review): the weights read here are the input weights of the NEXT
// layer's neurons, so despite their names the last two parameters presumably
// have to describe the next layer - verify against the caller.
// NOTE(review): assumes the range written (this layer) and the range read
// (next layer) of localGradients do not overlap; otherwise threads would race.
__global__ void calculateLocalGradientsForAnotherLayers(double * localGradients, double * neuronsInputsWeights, double * derivatives, int neuronsCount, int neuronsInPreviousLayers, int neuronsInPreviousLayersWithCurrent, int neuronsInNextLayer, int inputsInPreviousLayers, int inputsInCurrentLayer)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= neuronsCount)
    {
        return; // surplus thread in the last block
    }

    int neuron = neuronsInPreviousLayers + idx;

    // Accumulate in a register and store once, instead of a global-memory
    // read-modify-write on every iteration.
    double sum = 0;
    for (int k = 0; k < neuronsInNextLayer; k++)
    {
        // Adjacent threads (idx, idx + 1, ...) read adjacent weights for a fixed k.
        sum += neuronsInputsWeights[inputsInPreviousLayers + k * inputsInCurrentLayer + idx]
             * localGradients[neuronsInPreviousLayersWithCurrent + k];
    }

    localGradients[neuron] = sum * derivatives[neuron];
}
しかし、元の CPU 版と比べると、結果が小数第 2 位から食い違ってしまいます。なぜこれほど誤差が大きいのでしょうか?このカーネルを除けば、ほかのカーネルはすべて正常に動作します。
私の GPU は NVIDIA GeForce GT 555M(NV GF555M)で、倍精度演算をサポートしています。