cublasStrsmBatched(113 行目、下記コードの `status = cublasStrsmBatched(...)` の呼び出し)を実行すると、必ず CUBLAS_STATUS_EXECUTION_FAILED (13) が返されてしまいます。問題を単純にするため、行列の要素と alpha はすべて 1.0、行列はすべて正方行列で、lda・ldb・m・n はすべて同じ値にしています。同じ方法で cublasSgemmBatched と cublasStrsm はエラーなく実行できます。cublasStrsmBatched も同じように動作するはずですが、実際にはそうなりません。このコードのどこが間違っているのか、心当たりがあれば教えてください:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
/* cuBLAS context shared by main() (create/destroy) and
   TestCublasStrsmBatched() (the batched solve). */
cublasHandle_t handle;
/* Error-check helpers: each one inspects the status/pointer it is given,
   prints a message to stderr and exits with EXIT_FAILURE on failure. */
void CheckCublasCreate(cublasStatus_t status);
void CheckAllocateHost(void* h_pointer);
void CheckCudaMalloc(cudaError_t d_allocStatus);
void CheckCudaMemcpy( cudaError_t error );
void CheckCublasSetGetMatrix(cublasStatus_t status);
void CheckKernelExecution(cublasStatus_t status);
void CheckCublasDestroy(cublasStatus_t status);
/* Runs one cublasStrsmBatched call on numOfLinSys square systems of
   dimension size. */
void TestCublasStrsmBatched(int size, int numOfLinSys);
/*
 * Program entry point: creates the global cuBLAS handle, runs the batched
 * triangular-solve test on two 2x2 systems, then destroys the handle.
 * Any failing step terminates the process inside the matching Check* helper.
 */
int main()
{
    /* dimension of each square matrix */
    const int matrixDim = 2;
    /* number of linear systems in the batch */
    const int systemCount = 2;

    CheckCublasCreate(cublasCreate(&handle));

    TestCublasStrsmBatched(matrixDim, systemCount);

    CheckCublasDestroy(cublasDestroy(handle));
}
/* Frees one device allocation; prints the CUDA error string and exits with
   EXIT_FAILURE if cudaFree() reports a failure. */
static void ReleaseDeviceBuffer(void *d_pointer)
{
    cudaError_t error = cudaFree(d_pointer);
    if (error != cudaSuccess) {
        fprintf(stderr, "!!!! device memory free error (error code %s)\n",
                cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
}

/*
 * Builds numOfLinSys systems A[j] * X[j] = alpha * B[j], where every A[j] and
 * B[j] is a size-by-size matrix filled with 1.0f and alpha is 1.0f, uploads
 * them to the device and solves them with a single cublasStrsmBatched call
 * (left side, lower triangle, no transpose, non-unit diagonal) on the global
 * cuBLAS handle.
 *
 * Parameters:
 *   size        - dimension of each square matrix (used for m, n, lda, ldb)
 *   numOfLinSys - number of systems in the batch
 *
 * Every failing step terminates the process through the matching Check*
 * helper. On success all host and device buffers are released before
 * returning. The solution left in the device B matrices is not copied back.
 */
void TestCublasStrsmBatched(int size, int numOfLinSys)
{
    cublasStatus_t status;
    cudaError_t error;
    float **h_A;    /* host array of host matrices A[j]                   */
    float **h_B;    /* host array of host matrices B[j]                   */
    float **hd_A;   /* host array holding the device addresses of A[j]    */
    float **hd_B;   /* host array holding the device addresses of B[j]    */
    float **d_A;    /* device copy of hd_A, handed to cublasStrsmBatched  */
    float **d_B;    /* device copy of hd_B, handed to cublasStrsmBatched  */
    /* Scalar passed by host pointer, as the original heap-allocated alpha
       was; a stack variable removes an unchecked malloc and a leak. */
    const float alpha = 1.0f;
    const int n = size;
    const int m = size;
    const int lda = m;
    const int ldb = m;
    const int matA_numOfElem = m * m;
    const int matB_numOfElem = m * n;
    int i, j;

    /* Host side: arrays of matrix pointers, then the matrices themselves. */
    h_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_A);
    h_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_B);

    for (j = 0; j < numOfLinSys; j++) {
        h_A[j] = (float *)malloc(matA_numOfElem * sizeof(float));
        /* Check the matrix that was just allocated, not the outer array. */
        CheckAllocateHost(h_A[j]);
        for (i = 0; i < matA_numOfElem; i++)
            h_A[j][i] = 1.0f;

        h_B[j] = (float *)malloc(matB_numOfElem * sizeof(float));
        CheckAllocateHost(h_B[j]);
        for (i = 0; i < matB_numOfElem; i++)
            h_B[j][i] = 1.0f;
    }

    /* Device side: one buffer per matrix, addresses collected on the host. */
    hd_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_A);
    hd_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_B);

    for (j = 0; j < numOfLinSys; j++) {
        error = cudaMalloc((void **)&hd_A[j],
                           matA_numOfElem * sizeof(float));
        CheckCudaMalloc(error);
        error = cudaMalloc((void **)&hd_B[j],
                           matB_numOfElem * sizeof(float));
        CheckCudaMalloc(error);

        status = cublasSetMatrix(m, m, sizeof(float),
                                 h_A[j], lda, hd_A[j], lda);
        CheckCublasSetGetMatrix(status);
        status = cublasSetMatrix(m, n, sizeof(float),
                                 h_B[j], ldb, hd_B[j], ldb);
        CheckCublasSetGetMatrix(status);
    }

    /* The batched routine receives the pointer arrays from device memory,
       so the collected addresses are copied to the device as well. */
    error = cudaMalloc((void **)&d_A, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);
    error = cudaMalloc((void **)&d_B, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);
    error = cudaMemcpy(d_A, hd_A, numOfLinSys * sizeof(float*),
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);
    error = cudaMemcpy(d_B, hd_B, numOfLinSys * sizeof(float*),
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);

    /* The original author reported that this call returns
       CUBLAS_STATUS_EXECUTION_FAILED (13) on a T10 and a GTX285 with
       CUDA 5.5.
       NOTE(review): the arguments look consistent with each other, so the
       failure may be specific to that device/library combination - confirm
       on another GPU or CUDA release. */
    status = cublasStrsmBatched(handle,
                                CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                                m, n, &alpha, d_A, lda, d_B, ldb, numOfLinSys);
    CheckKernelExecution(status);

    /* Release everything acquired above. cudaFree can also report an error
       left behind by earlier asynchronous work, so its result is checked. */
    ReleaseDeviceBuffer(d_A);
    ReleaseDeviceBuffer(d_B);
    for (j = 0; j < numOfLinSys; j++) {
        ReleaseDeviceBuffer(hd_A[j]);
        ReleaseDeviceBuffer(hd_B[j]);
        free(h_A[j]);
        free(h_B[j]);
    }
    free(hd_A);
    free(hd_B);
    free(h_A);
    free(h_B);
}
/* Aborts the program with EXIT_FAILURE when the status returned by
   cublasCreate() is anything other than CUBLAS_STATUS_SUCCESS. */
void CheckCublasCreate( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;

    fputs("!!!! CUBLAS initialization error \n", stderr);
    exit(EXIT_FAILURE);
}
/* Aborts the program with EXIT_FAILURE when h_pointer is null, i.e. when a
   host allocation failed; returns normally otherwise. */
void CheckAllocateHost( void* h_pointer )
{
    if (h_pointer != 0)
        return;

    fputs("!!!! host memory allocation error \n", stderr);
    exit(EXIT_FAILURE);
}
/* Aborts the program with EXIT_FAILURE when a device allocation did not
   return cudaSuccess, printing the CUDA error string first. */
void CheckCudaMalloc( cudaError_t error )
{
    if (error == cudaSuccess)
        return;

    const char *reason = cudaGetErrorString(error);
    fprintf(stderr,
            "!!!! device memory allocation error (error code %s)\n",
            reason);
    exit(EXIT_FAILURE);
}
/* Aborts the program with EXIT_FAILURE when a CUDA copy did not return
   cudaSuccess, printing the CUDA error string first. */
void CheckCudaMemcpy( cudaError_t error )
{
    if (error == cudaSuccess)
        return;

    const char *reason = cudaGetErrorString(error);
    fprintf(stderr, "!!!! data copy error (error code %s)\n", reason);
    exit(EXIT_FAILURE);
}
/* Aborts the program with EXIT_FAILURE when a cuBLAS matrix transfer did not
   return CUBLAS_STATUS_SUCCESS. */
void CheckCublasSetGetMatrix( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;

    fputs("!!!! device access error \n", stderr);
    exit(EXIT_FAILURE);
}
/* Aborts the program with EXIT_FAILURE when a cuBLAS compute call did not
   return CUBLAS_STATUS_SUCCESS. */
void CheckKernelExecution( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;

    fputs("!!!! kernel execution error.\n", stderr);
    exit(EXIT_FAILURE);
}
/* Aborts the program with EXIT_FAILURE when cublasDestroy() did not return
   CUBLAS_STATUS_SUCCESS. */
void CheckCublasDestroy( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;

    fputs("!!!! shutdown error \n", stderr);
    exit(EXIT_FAILURE);
}
動作環境: Linux + CUDA 5.5 + T10、および Windows + CUDA 5.5 + GTX285 の両方で確認しました。
ありがとう!