デバイスメモリを割り当て、データをコピーし、GPUで計算を実行し、結果をコピーして戻し、最後に割り当てたデバイスメモリを解放しようとしています。制限を超えていないことを確かめたかったのと、共有メモリ空間にいくつかの配列を置くのに十分なメモリがあるかどうかも確認したかったのです。
デバイスメモリを割り当てても、エラーは返されません。ところが、割り当てられたメモリの量を確認するために cudaMemGetInfo を使用すると、一部の cudaMalloc ではメモリが割り当てられていないように見えます。また、メモリを解放しようとすると、1つのポインタだけが解放されているように見えます。
MATLAB の MEX 関数(mexFunction)インターフェイスを使用してGPUメモリをセットアップし、カーネルを起動しています。この時点では、カーネルはまだ呼び出しておらず、結果として単位行列を返しているだけです。
/* Status of the most recent CUDA runtime call. */
cudaError_t cudaErr;
/* Byte counts filled in by cudaMemGetInfo. freeMem holds the reference
   reading that later readings (stored in allocMem) are compared against. */
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
cudaErr = cudaMemGetInfo(&freeMem, &totalMem);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not query device memory\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
/* size_t does not match %lu on every platform (on 64-bit Windows size_t is
   64 bits wide while unsigned long is 32), so widen explicitly and print
   with %llu. */
mexPrintf("Memory avaliable: Free: %llu, Total: %llu\n",
          (unsigned long long) freeMem, (unsigned long long) totalMem);
/* Pointers for the device memory. They start out NULL so that a failed
   allocation leaves a well-defined value behind for the later copy and
   free calls instead of an indeterminate pointer. */
double *devicePulseDelay = NULL, *deviceTarDistance = NULL;
double *deviceScattDistance = NULL, *deviceScatterers = NULL;
double *deviceReceivedReal = NULL, *deviceReceivedImag = NULL;
/* Allocate memory on the device for the arrays. */
mexPrintf("Allocating memory.\n");
{
    /* One row per device buffer: where to store the pointer, how many
       bytes to request, and the name used in the log output. */
    struct AllocRequest { double **slot; size_t bytes; const char *label; };
    const struct AllocRequest requests[] = {
        { &devicePulseDelay,    sizeof(double)*512,     "devicePulseDelay"    },
        { &deviceTarDistance,   sizeof(double)*512,     "deviceTarDistance"   },
        { &deviceScattDistance, sizeof(double)*999*512, "deviceScattDistance" },
        { &deviceScatterers,    sizeof(double)*999,     "deviceScatterers"    },
        { &deviceReceivedReal,  sizeof(double)*999*512, "deviceReceivedReal"  },
        { &deviceReceivedImag,  sizeof(double)*999*512, "deviceReceivedImag"  }
    };
    const size_t requestCount = sizeof(requests) / sizeof(requests[0]);
    size_t r;

    for (r = 0; r < requestCount; ++r)
    {
        cudaErr = cudaMalloc((void **) requests[r].slot, requests[r].bytes);
        if (cudaErr != cudaSuccess)
        {
            mexPrintf("could not allocate memory to %s\n", requests[r].label);
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
            /* Do not rely on what cudaMalloc left in the slot on failure. */
            *requests[r].slot = NULL;
        }

        /* Report free memory after this request, relative to freeMem.
           NOTE(review): the driver appears to hand out device memory in
           coarse chunks, so "Consumed" is not expected to grow by exactly
           the requested size -- confirm against the CUDA documentation. */
        cudaErr = cudaMemGetInfo(&allocMem, &totalMem);
        if (cudaErr != cudaSuccess)
        {
            mexPrintf("could not query device memory\n");
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
        }
        /* size_t is widened to unsigned long long because %lu does not
           match size_t on 64-bit Windows. */
        mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
                  requests[r].label,
                  (unsigned long long) allocMem,
                  (unsigned long long) totalMem,
                  (unsigned long long) (freeMem - allocMem));
    }
}
/* Copy the input arrays across to the device. */
mexPrintf("\nCopying memory.\n");
{
    /* One row per transfer: device destination, host source, byte count,
       and the name used in the log output. */
    struct UploadJob { void *dst; const void *src; size_t bytes; const char *label; };
    const struct UploadJob uploads[] = {
        { devicePulseDelay,    pulseDelay,    sizeof(double)*512,     "devicePulseDelay"    },
        { deviceTarDistance,   tarDistance,   sizeof(double)*512,     "deviceTarDistance"   },
        { deviceScattDistance, scattDistance, sizeof(double)*999*512, "deviceScattDistance" },
        { deviceScatterers,    scatterers,    sizeof(double)*999,     "deviceScatterers"    }
    };
    const size_t uploadCount = sizeof(uploads) / sizeof(uploads[0]);
    size_t u;

    for (u = 0; u < uploadCount; ++u)
    {
        cudaErr = cudaMemcpy(uploads[u].dst, uploads[u].src, uploads[u].bytes,
                             cudaMemcpyHostToDevice);
        if (cudaErr != cudaSuccess)
        {
            mexPrintf("could not copy to %s\n", uploads[u].label);
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
        }

        /* Report free memory after the copy, relative to freeMem. */
        cudaErr = cudaMemGetInfo(&allocMem, &totalMem);
        if (cudaErr != cudaSuccess)
        {
            mexPrintf("could not query device memory\n");
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
        }
        /* size_t is widened to unsigned long long because %lu does not
           match size_t on 64-bit Windows. */
        mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
                  uploads[u].label,
                  (unsigned long long) allocMem,
                  (unsigned long long) totalMem,
                  (unsigned long long) (freeMem - allocMem));
    }
}
/* call the kernel (currently disabled, so the device output buffers are
   copied back without having been written by a kernel) */
// launchKernel<<<1,512>>>(........);
/* Retrieve the output. */
{
    /* One row per transfer: host destination, device source, byte count,
       and the name used in the log output.
       NOTE(review): these copies move 512*512 doubles although the device
       buffers are allocated as 999*512 doubles -- confirm that 512*512 is
       the intended size and that the host buffers are at least that large. */
    struct DownloadJob { void *dst; const void *src; size_t bytes; const char *label; };
    const struct DownloadJob downloads[] = {
        { receivedReal, deviceReceivedReal, sizeof(double)*512*512, "receivedReal" },
        { receivedImag, deviceReceivedImag, sizeof(double)*512*512, "receivedImag" }
    };
    const size_t downloadCount = sizeof(downloads) / sizeof(downloads[0]);
    size_t d;

    for (d = 0; d < downloadCount; ++d)
    {
        cudaErr = cudaMemcpy(downloads[d].dst, downloads[d].src, downloads[d].bytes,
                             cudaMemcpyDeviceToHost);
        if (cudaErr != cudaSuccess)
        {
            mexPrintf("could not copy to %s\n", downloads[d].label);
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
        }

        /* Report free memory after the copy, relative to freeMem. */
        cudaErr = cudaMemGetInfo(&allocMem, &totalMem);
        if (cudaErr != cudaSuccess)
        {
            mexPrintf("could not query device memory\n");
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
        }
        /* size_t is widened to unsigned long long because %lu does not
           match size_t on 64-bit Windows. */
        mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
                  downloads[d].label,
                  (unsigned long long) allocMem,
                  (unsigned long long) totalMem,
                  (unsigned long long) (freeMem - allocMem));
    }
}
/* Free the device memory. */
mexPrintf("\nFree'ing memory.\n");
/* Take a fresh reference reading so "Free'd" is measured from here. */
cudaErr = cudaMemGetInfo(&freeMem, &totalMem);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not query device memory\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
/* size_t is widened to unsigned long long because %lu does not match
   size_t on 64-bit Windows. */
mexPrintf("Before freeing: Free %llu, Total: %llu\n",
          (unsigned long long) freeMem, (unsigned long long) totalMem);
{
    /* One row per device buffer: the pointer variable to release and the
       name used in the log output. */
    struct FreeRequest { double **slot; const char *label; };
    const struct FreeRequest releases[] = {
        { &devicePulseDelay,    "devicePulseDelay"    },
        { &deviceTarDistance,   "deviceTarDistance"   },
        { &deviceScattDistance, "deviceScattDistance" },
        { &deviceScatterers,    "deviceScatterers"    },
        { &deviceReceivedReal,  "deviceReceivedReal"  },
        { &deviceReceivedImag,  "deviceReceivedImag"  }
    };
    const size_t releaseCount = sizeof(releases) / sizeof(releases[0]);
    size_t f;

    for (f = 0; f < releaseCount; ++f)
    {
        cudaErr = cudaFree(*releases[f].slot);
        if (cudaErr != cudaSuccess)
        {
            /* The original message read "could free", dropping the "not". */
            mexPrintf("could not free %s\n", releases[f].label);
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
        }
        /* Never leave a stale device pointer behind. */
        *releases[f].slot = NULL;

        /* Report free memory after this release, relative to the reading
           taken just before the first cudaFree.
           NOTE(review): the driver appears to return device memory in
           coarse chunks, so "Free'd" is not expected to grow by exactly the
           size of each buffer -- confirm against the CUDA documentation. */
        cudaErr = cudaMemGetInfo(&allocMem, &totalMem);
        if (cudaErr != cudaSuccess)
        {
            mexPrintf("could not query device memory\n");
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
        }
        mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
                  releases[f].label,
                  (unsigned long long) allocMem,
                  (unsigned long long) totalMem,
                  (unsigned long long) (allocMem - freeMem));
    }
}
このコードの出力は次のとおりです。
Memory avaliable: Free: 2523959296, Total: 2818572288
Allocating memory.
devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880
deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456
deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608
deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Copying memory.
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Free'ing memory.
Before freeing: Free 2513473536, Total: 2818572288
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
何か明らかなことを見落としているような気がします。何が起こっているのか、どなたか説明していただけませんか?
編集:プラットフォームは、Tesla C2050 GPU カードを搭載した Windows 7 です。