Alea GPU はバージョン 2.2 (現時点での最新版) まで 2 次元配列 (T[,]) を対象とした Malloc をサポートしていないため、カーネル内では行と列のインデックスを自分で 1 次元にフラット化する必要があります。ホスト側では、拡張メソッドをいくつか作成し、CUDA ドライバー API の P/Invoke 関数 (Alea.CUDA.dll で提供されています) を使用することで、ピン留めした .NET 配列をデバイスとの間で転送できます。
そこで、私が書いた簡単な回避策を以下に示します:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using Alea.CUDA;
using Alea.CUDA.IL;
using NUnit.Framework;
namespace ConsoleApplication1
{
/// <summary>
/// Extension methods that transfer rectangular .NET arrays (<c>T[,]</c>) to and
/// from Alea GPU device memory by pinning the managed array and copying it with
/// the CUDA Driver API. The device buffer is a flat run of <c>rows * cols</c>
/// elements, so kernels must flatten (row, col) indices themselves.
/// </summary>
static class Extension
{
    /// <summary>
    /// Computes the number of bytes occupied by <c>rows * cols</c> elements of
    /// type <typeparamref name="T"/>.
    /// </summary>
    /// <exception cref="OverflowException">
    /// The byte count does not fit in 64 bits, or does not fit in a native
    /// integer on the current platform.
    /// </exception>
    private static IntPtr ByteCount<T>(int rows, int cols)
    {
        // Widen to 64-bit before multiplying so large arrays do not silently
        // wrap around in 32-bit arithmetic.
        return new IntPtr(checked((long)Intrinsic.__sizeof<T>() * rows * cols));
    }

    /// <summary>
    /// Allocates device memory for every element of <paramref name="array2D"/>
    /// and copies the array contents into it.
    /// </summary>
    /// <param name="worker">Worker that owns the allocation and performs the copy.</param>
    /// <param name="array2D">Host array to upload.</param>
    /// <returns>Device memory holding <c>rows * cols</c> elements.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="worker"/> or <paramref name="array2D"/> is null.
    /// </exception>
    /// <exception cref="OverflowException">The element or byte count overflows.</exception>
    public static DeviceMemory<T> Malloc<T>(this Worker worker, T[,] array2D)
    {
        if (worker == null)
        {
            throw new ArgumentNullException("worker");
        }
        if (array2D == null)
        {
            throw new ArgumentNullException("array2D");
        }

        var rows = array2D.GetLength(0);
        var cols = array2D.GetLength(1);
        var byteCount = ByteCount<T>(rows, cols);
        var dmem = worker.Malloc<T>(checked(rows * cols));
        try
        {
            var handle = GCHandle.Alloc(array2D, GCHandleType.Pinned);
            try
            {
                var hostPtr = handle.AddrOfPinnedObject();
                var devicePtr = dmem.Handle;
                // The .NET array is pinned now; copy it with the CUDA Driver API.
                // The call goes through worker.EvalAction so that the worker's
                // context is pushed onto the current thread.
                worker.EvalAction(() =>
                {
                    CUDAInterop.cuSafeCall(
                        CUDAInterop.cuMemcpyHtoD(devicePtr, hostPtr, byteCount));
                });
            }
            finally
            {
                handle.Free();
            }
        }
        catch
        {
            // Do not leak the device allocation when pinning or copying fails.
            dmem.Dispose();
            throw;
        }
        return dmem;
    }

    /// <summary>
    /// Allocates device memory for a <paramref name="rows"/> x
    /// <paramref name="cols"/> matrix as one flat buffer. No data is copied.
    /// </summary>
    /// <param name="worker">Worker that owns the allocation.</param>
    /// <param name="rows">Number of rows.</param>
    /// <param name="cols">Number of columns.</param>
    /// <returns>Device memory holding <c>rows * cols</c> elements.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="worker"/> is null.</exception>
    /// <exception cref="OverflowException"><c>rows * cols</c> overflows an int.</exception>
    public static DeviceMemory<T> Malloc<T>(this Worker worker, int rows, int cols)
    {
        if (worker == null)
        {
            throw new ArgumentNullException("worker");
        }

        return worker.Malloc<T>(checked(rows * cols));
    }

    /// <summary>
    /// Copies device memory back into an existing host array, filling all
    /// <c>rows * cols</c> elements of <paramref name="array2D"/>.
    /// </summary>
    /// <param name="dmem">Source device memory.</param>
    /// <param name="array2D">Destination host array.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="dmem"/> or <paramref name="array2D"/> is null.
    /// </exception>
    /// <exception cref="OverflowException">The byte count overflows.</exception>
    public static void Gather<T>(this DeviceMemory<T> dmem, T[,] array2D)
    {
        if (dmem == null)
        {
            throw new ArgumentNullException("dmem");
        }
        if (array2D == null)
        {
            throw new ArgumentNullException("array2D");
        }

        // NOTE(review): the copy size is taken from array2D only; the caller is
        // responsible for passing an array no larger than the device allocation.
        var byteCount = ByteCount<T>(array2D.GetLength(0), array2D.GetLength(1));
        var handle = GCHandle.Alloc(array2D, GCHandleType.Pinned);
        try
        {
            var hostPtr = handle.AddrOfPinnedObject();
            var devicePtr = dmem.Handle;
            // The .NET array is pinned now; copy into it with the CUDA Driver API.
            // The call goes through the owning worker's EvalAction so that the
            // worker's context is pushed onto the current thread.
            dmem.Worker.EvalAction(() =>
            {
                CUDAInterop.cuSafeCall(
                    CUDAInterop.cuMemcpyDtoH(hostPtr, devicePtr, byteCount));
            });
        }
        finally
        {
            handle.Free();
        }
    }
}
/// <summary>
/// Sample showing a round trip of a 2D array through the GPU: upload, copy on
/// the device with manually flattened indices, download, and compare.
/// </summary>
class Program
{
    /// <summary>
    /// Maps a (row, col) pair to its offset in a row-major flat buffer that has
    /// <paramref name="cols"/> columns per row.
    /// </summary>
    static int FlattenIndex(int row, int col, int cols)
    {
        return row * cols + col;
    }

    /// <summary>
    /// Copies every element of <paramref name="inputs"/> to the same flattened
    /// position in <paramref name="outputs"/>.
    /// </summary>
    /// <param name="outputs">Destination buffer of rows * cols elements.</param>
    /// <param name="inputs">Source buffer of rows * cols elements.</param>
    /// <param name="rows">Number of rows.</param>
    /// <param name="cols">Number of columns.</param>
    [AOTCompile]
    static void Kernel(deviceptr<double> outputs, deviceptr<double> inputs, int rows, int cols)
    {
        // For simplicity, the whole matrix is processed by a single thread.
        for (var row = 0; row < rows; row++)
        {
            for (var col = 0; col < cols; col++)
            {
                // Compute the flattened offset once and use it for both buffers.
                var index = FlattenIndex(row, col, cols);
                outputs[index] = inputs[index];
            }
        }
    }

    /// <summary>
    /// Uploads a small random matrix, copies it on the device, downloads the
    /// result and checks that it equals the input.
    /// </summary>
    [Test]
    public static void Test()
    {
        var worker = Worker.Default;
        // Keep the matrix small because the kernel runs in a single GPU thread.
        const int rows = 10;
        const int cols = 5;
        // Fixed seed so that a failing run can be reproduced exactly.
        var rng = new Random(42);
        var inputs = new double[rows, cols];
        for (var row = 0; row < rows; ++row)
        {
            for (var col = 0; col < cols; ++col)
            {
                inputs[row, col] = rng.Next(1, 100);
            }
        }

        // Dispose both device buffers even when the launch or the assertion
        // throws, so the test does not leak device memory.
        using (var dInputs = worker.Malloc(inputs))
        using (var dOutputs = worker.Malloc<double>(rows, cols))
        {
            var lp = new LaunchParam(1, 1);
            worker.Launch(Kernel, lp, dOutputs.Ptr, dInputs.Ptr, rows, cols);

            var outputs = new double[rows, cols];
            dOutputs.Gather(outputs);

            Assert.AreEqual(inputs, outputs);
        }
    }

    /// <summary>Intentionally empty entry point; the sample is run through the test.</summary>
    public static void Main(string[] args)
    {
    }
}
}