cuda - 密な線形システム AX = B を CUDA で解く

Question

新しい cuSOLVER ライブラリ (CUDA 7) を使用して次の形式の線形システムを解くことはできますか?

AX = B

とAはX密B行列NxNですか?

score 21 · Accepted Answer

はい。

アプローチ番号 1

cuSOLVER のフレームワークでは、QR 分解を使用できます。CUDA で線形システムを解くための QR 分解を参照してください。

アプローチ番号 2

または、次の逐次インボリューションによって逆行列を計算できます。

 cublas<t>getrfBatched()

行列の LU 分解を計算し、

cublas<t>getriBatched()

LU 分解から始まる逆行列を計算します。

アプローチ番号 3

最終的な可能性は使用しています

cublas<t>getrfBatched()

の2回の呼び出しが続きます

cublas<t>trsm()

上または下三角線形システムを解きます。

Robert Crovella が指摘したように、答えは関係するマトリックスのサイズとタイプによって異なる場合があります。

アプローチ番号のコード。1

CUDA で線形システムを解くには、QR 分解を参照してください。

アプローチ番号のコード。2および番号。3

以下に、アプローチ nr の実装の実際の例を報告しています。2と3。ハンケル行列は、適切に調整された可逆行列をアプローチに供給するために使用されます。アプローチ番号に注意してください。3の呼び出しに続いて取得されたピボット配列に従って、システム係数ベクトルを並べ替える (再配置する) 必要がありcublas<t>getrfBatched()ます。この置換は、CPU 上で簡単に実行できます。

#include <stdio.h>
#include <fstream>
#include <iomanip>
#include <stdlib.h>     /* srand, rand */
#include <time.h>       /* time */

#include "cuda_runtime.h" 
#include "device_launch_parameters.h"

#include "cublas_v2.h"

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define prec_save 10

#define BLOCKSIZE 256

#define BLOCKSIZEX 16
#define BLOCKSIZEY 16

/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

}

/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {

    T *h_in = (T *)malloc(M * sizeof(T));

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

}

/***************************************************/
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
/***************************************************/
// --- https://en.wikipedia.org/wiki/Hankel_matrix
void setHankelMatrix(double * __restrict h_A, const int N) {

    double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double));

    // --- Initialize random seed
    srand(time(NULL));

    // --- Generate random numbers
    for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand();

    // --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent.
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2];

    free(h_atemp);

}

/***********************************************/
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
/***********************************************/
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref, 
                               double * __restrict h_y, const int N) {

    for (int k = 0; k < N; k++) h_y[k] = 0.f;

    for (int m = 0; m < N; m++)
        for (int n = 0; n < N; n++)
            h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n];

}

/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
void rearrange(double *vec, int *pivotArray, int N){
    for (int i = 0; i < N; i++) {
        double temp = vec[i];
        vec[i] = vec[pivotArray[i] - 1];
        vec[pivotArray[i] - 1] = temp;
    }   
}

/********/
/* MAIN */
/********/
int main() {

    const unsigned int N = 1000;

    const unsigned int Nmatrices = 1;

    // --- CUBLAS initialization
    cublasHandle_t cublas_handle;
    cublasSafeCall(cublasCreate(&cublas_handle));

    TimingGPU timerLU, timerApproach1, timerApproach2;
    double timingLU, timingApproach1, timingApproach2;

    /***********************/
    /* SETTING THE PROBLEM */
    /***********************/
    // --- Matrices to be inverted (only one in this example)
    double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double));

    // --- Setting the Hankel matrix
    setHankelMatrix(h_A, N);

    // --- Defining the solution
    double *h_xref = (double *)malloc(N * sizeof(double));
    for (int k = 0; k < N; k++) h_xref[k] = 1.f;

    // --- Coefficient vectors (only one in this example)
    double *h_y = (double *)malloc(N * sizeof(double));

    computeCoefficientsVector(h_A, h_xref, h_y, N);

    // --- Result (only one in this example)
    double *h_x = (double *)malloc(N * sizeof(double));

    // --- Allocate device space for the input matrices 
    double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double)));
    double *d_y; gpuErrchk(cudaMalloc(&d_y, N *                 sizeof(double)));
    double *d_x; gpuErrchk(cudaMalloc(&d_x, N *                 sizeof(double)));

    // --- Move the relevant matrices from host to device
    gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y, h_y, N *                 sizeof(double), cudaMemcpyHostToDevice));

    /**********************************/
    /* COMPUTING THE LU DECOMPOSITION */
    /**********************************/
    timerLU.StartCounter();

    // --- Creating the array of pointers needed as input/output to the batched getrf
    double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N;

    double **d_inout_pointers;
    gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice));
    free(h_inout_pointers);

    int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int)));
    int *d_InfoArray;  gpuErrchk(cudaMalloc(&d_InfoArray,      Nmatrices * sizeof(int)));

    int *h_InfoArray  = (int *)malloc(Nmatrices * sizeof(int));

    cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
    //cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    timingLU = timerLU.GetCounter();
    printf("Timing LU decomposition %f [ms]\n", timingLU);

    /*********************************/
    /* CHECKING THE LU DECOMPOSITION */
    /*********************************/
    saveCPUrealtxt(h_A,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N);
    saveCPUrealtxt(h_y,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N);
    saveGPUrealtxt(d_A,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N);
    saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N);

    /******************************************************************************/
    /* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
    /******************************************************************************/
    timerApproach1.StartCounter();

    // --- Allocate device space for the inverted matrices 
    double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double)));

    // --- Creating the array of pointers needed as output to the batched getri
    double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double));

    double **d_out_pointers;
    gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice));
    free(h_out_pointers);

    cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
        fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
        }

    double alpha1 = 1.f;
    double beta1 = 0.f;

    cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));

    timingApproach1 = timingLU + timerApproach1.GetCounter();
    printf("Timing approach 1 %f [ms]\n", timingApproach1);

    /**************************/
    /* CHECKING APPROACH NR.1 */
    /**************************/
    saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N);

    /*************************************************************/
    /* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
    /*************************************************************/
    timerApproach2.StartCounter();

    double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double)));

    gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
    int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int));
    gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    rearrange(h_y, h_pivotArray, N);
    gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));

    // --- Now P*A=L*U
    //     Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y

    // --- 1st phase - solve Ly = b 
    const double alpha = 1.f;

    // --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result 

    // --- Lower triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));

    // --- Upper triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));

    timingApproach2 = timingLU + timerApproach2.GetCounter();
    printf("Timing approach 2 %f [ms]\n", timingApproach2);

    /**************************/
    /* CHECKING APPROACH NR.2 */
    /**************************/
    saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N);

    return 0;
}

このような例を実行するために必要なUtilities.cuおよびUtilities.cuhファイルは、このgithub ページで管理されています。TimingGPU.cuおよびTimingGPU.cuhファイルは、この github ページで管理されています。

3 番目のアプローチに関する参考資料:

NAG Fortran ライブラリルーチンドキュメント

科学計算ソフトウェアライブラリ (SCSL) ユーザーガイド

https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c

編集

アプローチ nr のタイミング (ミリ秒)。2 および 3 (GTX960 カードで実行されたテスト、cc. 5.2)。

N        LU decomposition       Approach nr. 2       Approach nr. 3
100      1.08                   2.75                 1.28
500      45.4                   161                  45.7
1000     302                    1053                 303

それが現れたら、nrに近づきます。3 の方が便利で、そのコストは基本的に LU 因数分解を計算するコストです。さらに：

LU 分解による線形システムの解法は、QR 分解を使用するよりも高速です( CUDA で線形システムを解くための QR 分解を参照)。
LU 分解は正方線形システムに限定されますが、QR 分解は非正方線形システムの場合に役立ちます。

以下のMatlabコードは、結果を確認するために使用できます

clear all
close all
clc

warning off

N = 1000;

% --- Setting the problem solution
x = ones(N, 1);

%%%%%%%%%%%%%%%%%%%%%
% NxN HANKEL MATRIX %
%%%%%%%%%%%%%%%%%%%%%
% --- https://en.wikipedia.org/wiki/Hankel_matrix
load A.txt
load y.txt

A = reshape(A, N, N);

yMatlab = A * x;
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPUTATION OF THE LU DECOMPOSITION %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[Lmatlab, Umatlab] = lu(A);

load Adecomposed.txt
Adecomposed = reshape(Adecomposed, N, N);

L = eye(N);
for k = 1 : N
    L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
end
U = zeros(N);
for k = 1 : N
    U(k, k : N) = Adecomposed(k, k : N);
end

load pivotArray.txt
Pj = eye(N);
for j = 1 : N
   tempVector = Pj(j, :);
   Pj(j, :) = Pj(pivotArray(j), :);
   Pj(pivotArray(j), :) = tempVector;
end

fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));

xprime      = inv(Lmatlab) * yMatlab;
xMatlab     = inv(Umatlab) * xprime;

fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));

load xApproach1.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));

load xApproach2.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));

cuda - 密な線形システム AX = B を CUDA で解く

1 に答える 1

Related

Reference