
I need to solve a bordered block system. I have two versions here: a serial version that works correctly, and a parallel version (using MPI) that does not, and I can't figure out why... Maybe someone can help me understand why?

===================================================================================
=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   EXIT CODE: 11
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions.
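For reference, the system has the bordered block ("arrowhead") structure below; the notation is only my reading of the steps implemented in the code:

\[
\begin{pmatrix}
A_1 &        &     & B_1 \\
    & \ddots &     & \vdots \\
    &        & A_N & B_N \\
C_1 & \cdots & C_N & A_s
\end{pmatrix}
\begin{pmatrix} x_1 \\ \vdots \\ x_N \\ x_s \end{pmatrix}
=
\begin{pmatrix} b_1 \\ \vdots \\ b_N \\ b_s \end{pmatrix},
\qquad
\hat{A} = A_s - \sum_{i=1}^{N} C_i A_i^{-1} B_i, \qquad
\hat{b} = b_s - \sum_{i=1}^{N} C_i A_i^{-1} b_i ,
\]

so xs is obtained from \(\hat{A} x_s = \hat{b}\) (A_cap and b_cap in the code) and then each block from \(A_i x_i = b_i - B_i x_s\).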

Serial version:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>

#include "res.h"

// Number of blocks
#define N 128

// Size of each block
#define BS 256

// Leading Dimension
#define LD (BS * BS)

/*
 *
 */
int main (int argc, char ** argv) {

    // Start the timer
    clock_t t_end, t_start = clock();

    double time;
    size_t ii, jj;

    /* 
     * Allocate all the memory needed for the computations and
     * populate the matrices (in order) A, B, C, As, bi, bs
     */
    double *A = calloc (LD * N, sizeof(*A));
    double *B = calloc (LD * N, sizeof(*B));
    double *C = calloc (LD * N, sizeof(*C));
    double *As = calloc (LD, sizeof(*As));

    double *x = calloc (BS * N, sizeof(*x));    
    double *xs = calloc (BS, sizeof(*xs));
    double *B_tmp = calloc (BS * N, sizeof(*B_tmp));

    double *b = calloc (BS * N, sizeof(*b));
    double *bs = calloc (BS, sizeof(*bs));

    double *Y = calloc (LD * N, sizeof(*Y));
    double *y = calloc (BS * N, sizeof(*y));
    double *D = calloc (LD * N, sizeof(*D));
    double *d = calloc (BS * N, sizeof(*d));

    double *A_cap = calloc (LD, sizeof(*A_cap));
    double *A_tmp = calloc (LD, sizeof(*A_tmp));
    double *b_cap = calloc (BS, sizeof(*b_cap));
    double *b_tmp = calloc (BS, sizeof(*b_tmp));

    double *xi = calloc (BS * (N + 1), sizeof(*xi));   // N blocks of xi plus one extra block for xs
    double *xs_tmp = calloc (BS, sizeof(*xs_tmp));

    for (ii = 0; ii < N; ii++){
        genera_Ai(&A[LD*ii], BS, ii+1);
        genera_Bi(&B[LD*ii], BS, ii+1);
        genera_Bi(&C[LD*ii], BS, ii+1);
    }

    genera_Ai(As, BS, N+2);

    for (ii = 0; ii < N; ii++) {
        for(jj = 0; jj < BS; jj++) 
            b[BS*ii + jj] = 1;
    }

    for (ii = 0; ii < BS; ii++) 
        bs[ii] = 1;

    /* Step #1 
     * Compute the LU factorization of all the matrices A0,..., AN.
     * Each matrix Ai has size BS * BS (256 x 256), i.e. LD.
     */
    for (ii = 0; ii < N; ii++) 
        LU (&A[LD*ii], BS); 
    /* Find the matrices Di = Ai^-1 * Bi and di = Ai^-1 * bi using the LU
       factorization with forward elimination and backward substitution */
    for (ii = 0; ii < N; ii++) {
        elim_avanti   (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);
        elim_avanti   (&A[LD*ii], BS, &b[BS*ii],  1, &y[BS*ii]);
    }
    for (ii = 0; ii < N; ii++) {
        sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);
        sost_indietro (&A[LD*ii], BS, &y[BS*ii],  1, &d[BS*ii]);
    }

    /* Step #2
     * Compute A_cap and b_cap, first performing the matrix-matrix product and
     * then subtracting it from As | bs (for A_cap and b_cap respectively).
     *
     *  A_tmp = A_tmp + (Ci * Di) ---> A_cap = As - A_tmp
     *  b_tmp = b_tmp + (Ci * di) ---> b_cap = bs - b_tmp
     */
    // #2.1 - Matrix-matrix product, i.e. A_tmp = A_tmp + (Ci * Di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, A_tmp, BS);
    // #2.2 - Compute A_cap, i.e. A_cap = As - A_tmp
    for (ii = 0; ii < BS; ii++)
        for (jj = 0; jj < BS; jj++)
            A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];

    // #2.3 - Matrix-vector product, i.e. b_tmp = b_tmp + (Ci * di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, b_tmp, 1);
    // #2.4 - Compute b_cap = bs - b_tmp
    for (ii = 0; ii < BS; ii++)
        b_cap[ii] = bs[ii] - b_tmp[ii];


    /* Step #3
     * Solve the system A_cap * xs = b_cap, where:
     *      
     * A_cap = (As - Ci * Ai^-1 * Bi)
     * b_cap = (bs - Ci * Ai^-1 * bi)
     */
    LU (A_cap, BS);
    elim_avanti   (A_cap, BS, b_cap,  1, xs_tmp);
    sost_indietro (A_cap, BS, xs_tmp, 1, xs);

    /*
     * Step #4
     * 
     * Compute the remaining components of the solution vector x by solving
     * the N linear systems Ai * xi = bi - Bi * xs.
     */
    for (ii = 0; ii < N; ii++) {
        // #4.1 Compute Bi*xs = B_tmp
        my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &B_tmp[BS*ii], 1);

        // #4.2 Compute bi - B_tmp = B_tmp
        for (jj = 0; jj < BS; jj++)
            B_tmp[BS*ii + jj] = b[BS*ii + jj] - B_tmp[BS*ii + jj];

        // #4.3 Solve the system Ai*xi = B_tmp
        elim_avanti(&A[LD*ii], BS, &B_tmp[BS*ii], 1, xs_tmp);
        sost_indietro(&A[LD*ii], BS, xs_tmp, 1, &x[BS*ii]);
    }

    // Store all the solutions in the final vector xi
    for (ii = 0; ii <= N; ii++){
        for (jj = 0; jj < BS; jj++) {
            if (ii < N)
                xi[BS*ii + jj] = x[BS*ii + jj];
            else
                xi[BS*ii + jj] = xs[jj];
        }
    }

    // Stop the timer
    t_end = clock();

    time = (t_end - t_start) / (double) CLOCKS_PER_SEC;
    printf("Time is: %f\n", time);

    free(A); free(B); free(C); free(As);
    free(x); free(xs); free(b); free(bs);
    free(Y); free(y);  free(D); free(d);
    free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
    free(xs_tmp); free(xi); free(B_tmp);

    return  0;
}

Parallel version:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <mpi.h>

#include "res.h"

// Blocks
#define N 128

// Block Size
#define BS 256

// Leading Dimension
#define LD (BS * BS)

// The master node
#define MASTER 0

// Globals

/*
 *
 */
int main (int argc, char ** argv) {

    size_t ii, jj;
    int my_rank, num_nodes;
    double t_start, t_end;
    double *A, *A_recv, *B, *B_recv, *C, *C_recv;
    double *As, *x, *xs, *b, *bs;
    double *Y, *y, *D, *d, *D_tmp, *d_tmp;
    double *A_cap, *A_tmp, *b_cap, *b_tmp;
    double *xi, *xs_tmp;    
    double *x_res;

    MPI_Status status;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_nodes);

    /**************************** From Master *******************************/
    if (my_rank == MASTER) {

        t_start = MPI_Wtime();

        A = calloc (LD * N, sizeof(*A));
        B = calloc (LD * N, sizeof(*B));
        C = calloc (LD * N, sizeof(*C));
        As = calloc (LD, sizeof(*As));
        x = calloc (BS * N, sizeof(*x));
        xs = calloc (BS, sizeof(*xs));
        b = calloc (BS * N, sizeof(*b));
        bs = calloc (BS, sizeof(*bs));
        Y = calloc (LD * N, sizeof(*Y));
        y = calloc (BS * N, sizeof(*y));
        D = calloc (LD * N, sizeof(*D));
        d = calloc (BS * N, sizeof(*d));
        A_cap = calloc (LD, sizeof(*A_cap));
        A_tmp = calloc (LD, sizeof(*A_tmp));
        b_cap = calloc (BS, sizeof(*b_cap));
        b_tmp = calloc (BS, sizeof(*b_tmp));
        d_tmp = calloc (BS, sizeof(*d_tmp));
        xi = calloc (BS * N, sizeof(*xi));
        xs_tmp = calloc (BS, sizeof(*xs_tmp));
        x_res = calloc (BS, sizeof(*x_res));

        // Initialize the matrices
        for (ii = 0; ii < N; ii++){
            genera_Ai(&A[LD*ii], BS, ii+1);
            genera_Bi(&B[LD*ii], BS, ii+1);
            genera_Bi(&C[LD*ii], BS, ii+1);
        }

        genera_Ai(As, BS, N+1);

        for (ii = 0; ii < N; ii++) {
            for(jj = 0; jj < BS; jj++) 
                b[BS*ii + jj] = 1;
        }

        for (ii = 0; ii < BS; ii++) 
            bs[ii] = 1;

        // Each node will receive (LD * N / num_nodes) entries
        A_recv = calloc (LD * N / num_nodes, sizeof(*A_recv));
        B_recv = calloc (LD * N / num_nodes, sizeof(*B_recv));
        C_recv = calloc (LD * N / num_nodes, sizeof(*C_recv));

        // Send A matrix
        MPI_Scatter (A, LD * N, MPI_DOUBLE, 
            A_recv, LD * N, MPI_DOUBLE, 
            MASTER, MPI_COMM_WORLD);

        // Send B matrix     
        MPI_Scatter (B, LD * N, MPI_DOUBLE, 
            B_recv, LD * N, MPI_DOUBLE, 
            MASTER, MPI_COMM_WORLD);

        // Send C matrix
        MPI_Scatter (C, LD * N, MPI_DOUBLE, 
            C_recv, LD * N, MPI_DOUBLE, 
            MASTER, MPI_COMM_WORLD);
    }

    /**************************** From Workers *******************************/
    /* Step #1 
     * Each processor performs the LU decomposition of their matrices Ai
     * generating the two matrices Li and Ui.
     */
    for (ii = 0; ii < N / num_nodes; ii++) 
        LU (&A[LD*ii], BS);

    /* Step #2 
     * Each processor calculates the term di = Ci * Ai^-1 * bi for each matrix
     * Ai assigned to it, using the LU decomposition calculated in step 1.
     * Then it calculates the sum.
     */
    // 2.1
    for (ii = 0; ii < N / num_nodes; ii++)
        elim_avanti   (&A[LD*ii], BS, &b[BS*ii],  1, &y[BS*ii]);

    // 2.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &y[BS*ii],  1, &d[BS*ii]);

     // #2.3 - matrix multiplication, d_tmp = d_tmp + (Ci * di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, d_tmp, 1);

    MPI_Barrier (MPI_COMM_WORLD);

    /* Step #3
     * Through a "Reduce" communication with a sum over the terms 'di', the
     * master processor obtains the sum of Ci * Ai^-1 * bi with i from 0 to N-1
     * and with it calculates b_cap.
     */
    // 3.1 - sum d_tmp
    MPI_Reduce (&d_tmp, &b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

    // #3.2 - Calc b_cap = bs - b_tmp
    for (ii = 0; ii < BS; ii++)
        b_cap[ii] = bs[ii] - b_tmp[ii];


    /* Step #4
     * Each processor calculates the term Di = Ci * Ai^-1 * Bi by solving the
     * n linear systems Ai * xj = Bij.
     */
    // 4.1
    for (ii = 0; ii < N / num_nodes; ii++)
        elim_avanti   (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);

    // 4.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);

    // #4.3 - Matrix-matrix product, i.e. D_tmp = D_tmp + (Ci * Di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, D_tmp, BS);

    /* Step #5
     * Through a "Reduce" communication with a sum over the terms 'Di', the
     * master processor obtains the sum of Ci * Ai^-1 * Bi with i from 0 to N-1
     * and with it calculates A_cap.
     */
    // 5.1 - sum all D_tmp
    MPI_Reduce (&D_tmp, &A_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

    // #5.2 - calc A_cap, that is A_cap = As - A_tmp
    for (ii = 0; ii < BS; ii++)
        for (jj = 0; jj < BS; jj++)
            A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];

    /* Step #6 
     * The "master" processor solves the system A_cap * xs = b_cap
     * to determine xs.
     */
    LU (A_cap, BS);
    elim_avanti   (A_cap, BS, b_cap,  1, xs_tmp);
    sost_indietro (A_cap, BS, xs_tmp, 1, xs);

    MPI_Barrier (MPI_COMM_WORLD);

    /* Step #7
     * The solution xs is communicated to all processors via a broadcast.
     */
    if (my_rank == MASTER)
        MPI_Bcast (xs, BS, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

    /* Step #8
     * Each j-th processor solves the linear systems Ai*xi = bi - Bi*xs
     */
    for (ii = 0; ii < N / num_nodes; ii++) {
        // #8.1 Calc Bi*xs = x
        my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &x[BS*ii], 1);

        // #8.2 Calc bi-x = x
        for (jj = 0; jj < BS; jj++)
            x[BS*ii + jj] = b[BS*ii + jj] -x[BS*ii + jj];

        // #8.3 Resolve the system Ai*xi = x
        elim_avanti   (&A[LD*ii], BS, &x[BS*ii], 1, xs_tmp);
        sost_indietro (&A[LD*ii], BS, xs_tmp,    1, &x[BS*ii]);
    }

    /* Step #9
     * The components "xi" are combined into a single vector via a
     * "Gather" communication; joined together they form the solution
     * to the initial problem.
     */
    MPI_Gather (xi, BS, MPI_DOUBLE,
         x_res, BS, MPI_DOUBLE,
             MASTER, MPI_COMM_WORLD);

    for (ii = 0; ii < N; ii++){
        for (jj = 0; jj < BS; jj++) {
            if (ii < N)
                xi[BS*ii + jj] = x[BS*ii + jj];
            else
                xi[BS*ii + jj] = xs[jj];
        }
    }

    // Wait for all nodes before printing the result
    MPI_Barrier (MPI_COMM_WORLD);

    // Stop timer
    if (my_rank == MASTER) {
        t_end = MPI_Wtime();
        printf("Time is %f\n", t_end - t_start);
    }

    free(A); free(B); free(C); free(As);
    free(x); free(xs); free(b); free(bs);
    free(Y); free(y); free(D); free(d);
    free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
    free(xs_tmp); free(xi);

    MPI_Finalize();
    return  0;
}

This is res.h, which was provided by my professor: http://bpaste.net/show/yAQCCwFsZlQCCiw7GKC9/

Sorry for my poor English and for some Italian comments, but the code should be clear enough. My goal is to parallelize the serial part of the code with MPI. The number of processors is always a multiple of 4. Thanks in advance.
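To make the intended data distribution explicit, here is a tiny standalone sketch of the bookkeeping (num_nodes is hard-coded to 4 only for the example; the macros mirror the ones in the code):

#include <stdio.h>

// Sketch of the intended per-rank block distribution.
#define N  128
#define BS 256
#define LD (BS * BS)

int main (void) {
    int num_nodes = 4;                        // always a multiple of 4
    int blocks_per_rank = N / num_nodes;      // 128 / 4 = 32 blocks per rank

    printf("blocks per rank : %d\n", blocks_per_rank);
    printf("A/B/C doubles   : %d\n", LD * blocks_per_rank);  // per-rank scatter count for A, B, C
    printf("b doubles       : %d\n", BS * blocks_per_rank);  // per-rank scatter count for b

    return 0;
}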


2 Answers


You posted this elsewhere:

Fatal error in PMPI_Reduce: Internal MPI error!, error stack:
PMPI_Reduce(1217)..............: MPI_Reduce(sbuf=0x7fffffffda88, rbuf=0x7fffffffda78, count=8388608, MPI_DOUBLE, MPI_SUM, root=0, MPI_COMM_WORLD) failed
MPIR_Reduce_impl(1029).........: 
MPIR_Reduce_intra(825).........: 
MPIR_Reduce_redscat_gather(300): 
MPIR_Localcopy(357)............: memcpy arguments alias each other, dst=0x7fffffffda78 src=0x7fffffffda88 len=67108864
[Inferior 1 (process 4210) exited with code 01]

As the error message says, the arguments to your MPI_Reduce call alias each other. That happens because you are passing the address of the pointer rather than the pointer's value; passing the pointer itself is the correct way to give MPI a data buffer, at least when the buffer is dynamically allocated.

I think changing this

MPI_Reduce (&d_tmp, &b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

to

MPI_Reduce (d_tmp, b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

will fix the problem. If you are fond of the &, you can do

MPI_Reduce (&d_tmp[0], &b_tmp[0], LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

but it is pointless.

You may notice that other MPI calls are also wrong in the same way, using &ptr instead of ptr where a void* argument is expected.
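The general rule, as a minimal standalone sketch (the buffer names here are made up, not taken from your code): pass the value of a heap pointer, the name of an automatic array, and take the address only of a single scalar.

#include <stdlib.h>
#include <mpi.h>

int main (int argc, char **argv) {
    MPI_Init(&argc, &argv);

    double *heap_buf = calloc(100, sizeof(*heap_buf)); /* dynamically allocated buffer */
    double  stack_buf[100] = {0};                      /* automatic array              */
    double  one_value = 0.0;                           /* single scalar                */

    MPI_Bcast(heap_buf,  100, MPI_DOUBLE, 0, MPI_COMM_WORLD); /* pointer value, not &heap_buf */
    MPI_Bcast(stack_buf, 100, MPI_DOUBLE, 0, MPI_COMM_WORLD); /* array decays to a pointer    */
    MPI_Bcast(&one_value,  1, MPI_DOUBLE, 0, MPI_COMM_WORLD); /* & is correct only here       */

    free(heap_buf);
    MPI_Finalize();
    return 0;
}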

Answered 2013-09-11T17:10:31.420

I found the solution:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <mpi.h>

#include "res.h"

// Number of Blocks
#define N 128

// Block Size
#define BS 256

// Leading Dimension
#define LD (BS * BS)

// The master node
#define MASTER 0

/*
 *
 */
int main (int argc, char ** argv) {

    size_t ii, jj;
    int my_rank, num_nodes;
    double t_start, t_end;
    double *A, *A_data, *B, *B_data, *C, *C_data;
    double *As, *x, *xs, *b, *bs;
    double *Y, *y, *D, *d, *D_tmp, *d_tmp;
    double *A_cap, *A_tmp, *b_cap, *b_tmp;
    double *xi, *xs_tmp;    
    double *x_res;

    MPI_Status status;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_nodes);

    A = calloc (LD * N / num_nodes, sizeof(*A));
    B = calloc (LD * N / num_nodes, sizeof(*B));
    C = calloc (LD * N / num_nodes, sizeof(*C));
    Y = calloc (LD * N, sizeof(*Y));
    y = calloc (BS * N, sizeof(*y));
    D = calloc (LD * N, sizeof(*D));
    d = calloc (BS * N, sizeof(*d));
    b = calloc (BS * N, sizeof(*b));
    As = calloc (LD, sizeof(*As));
    x = calloc (BS * N, sizeof(*x));
    xs = calloc (BS, sizeof(*xs));
    bs = calloc (BS, sizeof(*bs));
    A_cap = calloc (LD, sizeof(*A_cap));
    A_tmp = calloc (LD, sizeof(*A_tmp));
    b_cap = calloc (BS, sizeof(*b_cap));
    b_tmp = calloc (BS, sizeof(*b_tmp));
    d_tmp = calloc (BS, sizeof(*d_tmp));
    D_tmp = calloc (LD, sizeof(*D_tmp));
    xi = calloc (BS * N, sizeof(*xi));
    xs_tmp = calloc (BS, sizeof(*xs_tmp));
    x_res = calloc (BS * num_nodes, sizeof(*x_res));    

    if (my_rank == MASTER) {

        t_start = MPI_Wtime();

        A_data = calloc (LD * N, sizeof(*A_data));
        B_data = calloc (LD * N, sizeof(*B_data));
        C_data = calloc (LD * N, sizeof(*C_data));

        // Initialize the matrices
        for (ii = 0; ii < N; ii++){
            genera_Ai(&A_data[LD*ii], BS, ii+1);
            genera_Bi(&B_data[LD*ii], BS, ii+1);
            genera_Bi(&C_data[LD*ii], BS, ii+1);
        }

        genera_Ai(As, BS, N+1);

        for (ii = 0; ii < N; ii++) {
            for(jj = 0; jj < BS; jj++) 
                b[BS*ii + jj] = 1;
        }   

        for (ii = 0; ii < BS; ii++) 
            bs[ii] = 1;
    }

    // Send A matrix to all process
    MPI_Scatter(A_data, LD * N / num_nodes, MPI_DOUBLE, 
                A,      LD * N / num_nodes, MPI_DOUBLE, 
                MASTER, MPI_COMM_WORLD);  

    // Send B matrix to all process
    MPI_Scatter(B_data, LD * N / num_nodes, MPI_DOUBLE, 
                B,      LD * N / num_nodes, MPI_DOUBLE, 
                MASTER, MPI_COMM_WORLD);

    // Send C matrix to all process
    MPI_Scatter(C_data, LD * N / num_nodes, MPI_DOUBLE, 
                C,      LD * N / num_nodes, MPI_DOUBLE, 
                MASTER, MPI_COMM_WORLD);

    /* Step #1 
     * Each processor performs the LU decomposition of their matrices Ai 
     * generating the two matrices Li and Ui.
     */
    for (ii = 0; ii < N / num_nodes; ii++) 
        LU (&A[LD*ii], BS);

    /* Step #2 
     * Each processor calculates the term 'di' = Ci * Ai^-1 * bi for each
     * matrix Ai assigned to it, using the LU decomposition calculated in
     * step 1. Then it calculates the sum.
     */
    // 2.1
    for (ii = 0; ii < N / num_nodes; ii++) {
        elim_avanti   (&A[LD*ii], BS, &b[BS*ii],  1, &y[BS*ii]);
    }

    // 2.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &y[BS*ii],  1, &d[BS*ii]);

    // #2.3 - Matrix-vector product, i.e. d_tmp = d_tmp + (Ci * di)
    for (ii = 0; ii < N / num_nodes; ii++)
        my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, d_tmp, 1);

    /* Step #3
     * Through a "Reduce" communication with a sum over the terms 'di',
     * the master processor gets the sum of Ci * Ai^-1 * bi, with i from 0 to N-1,
     * and with this calculates b_cap.
     */
    // 3.1 - sum of d_tmp
    MPI_Reduce (d_tmp, b_tmp, BS, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

    // #3.2 - Compute b_cap = bs - b_tmp
    for (ii = 0; ii < BS; ii++)
        b_cap[ii] = bs[ii] - b_tmp[ii];

    /* Step #4
     * Each processor calculates the term Di = Ci * Ai^-1 * Bi by solving
     * the n linear systems Ai * xj = Bij.
     */
    // 4.1
    for (ii = 0; ii < N / num_nodes; ii++)
        elim_avanti   (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);

    // 4.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);

    // #4.3 - Matrix-matrix product, i.e. D_tmp = D_tmp + (Ci * Di)
    for (ii = 0; ii < N / num_nodes; ii++)
        my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, D_tmp, BS);

    /* Step #5
     * Through a "Reduce" communication with a sum over the terms 'Di',
     * the master processor gets the sum of Ci * Ai^-1 * Bi with i from 0 to N-1
     * and with this calculates A_cap.
     */
    // 5.1 - sum of D_tmp
    MPI_Reduce (D_tmp, A_tmp, LD, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);  // a full BS x BS block is LD doubles

    // #5.2 - Compute A_cap, i.e. A_cap = As - A_tmp
    for (ii = 0; ii < BS; ii++)
        for (jj = 0; jj < BS; jj++)
            A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];

    /* Step #6 
     * The master processor solves the linear system A_cap * xs = b_cap
     * to determine xs.
     */
    LU (A_cap, BS);
    elim_avanti   (A_cap, BS, b_cap,  1, xs_tmp);
    sost_indietro (A_cap, BS, xs_tmp, 1, xs);

    /* Step #7
     * The solution xs is communicated to all processors via a broadcast.
     */
    MPI_Bcast (xs, BS, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

    /* Step #8
     * Each j-th processor solves the linear systems Ai*xi = bi - Bi*xs
     */
    for (ii = 0; ii < N / num_nodes; ii++) {
        // #8.1 Calcs Bi*xs = x
        my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &x[BS*ii], 1);

        // #8.2 Calcs bi-x = x
        for (jj = 0; jj < BS; jj++)
            x[BS*ii + jj] = b[BS*ii + jj] -x[BS*ii + jj];

        // #8.3 Resolve the system Ai*xi = x
        elim_avanti   (&A[LD*ii], BS, &x[BS*ii], 1, xs_tmp);
        sost_indietro (&A[LD*ii], BS, xs_tmp,    1, &x[BS*ii]);
    }

    /* Step #9
     * The components xi are combined into a single vector with a "Gather";
     * joined together they form the solution to the initial problem.
     */
    MPI_Gather (xi,    BS, MPI_DOUBLE,
                x_res, BS, MPI_DOUBLE,
                MASTER, MPI_COMM_WORLD);

    for (ii = 0; ii < N; ii++){
        for (jj = 0; jj < BS; jj++) {
            if (ii < N)
                xi[BS*ii + jj] = x[BS*ii + jj];
            else
                xi[BS*ii + jj] = xs[jj];
        }
    }

    // Stop timer
    if (my_rank == MASTER) {
        t_end = MPI_Wtime();
        printf("Time is %f\n", t_end - t_start);
        free(A_data); free(B_data); free(C_data);
    }

    free(A); free(A_cap); free(A_tmp); 
    free(D); free(D_tmp); free(d_tmp); free(d);
    free(b); free(b_cap); free(b_tmp);
    free(B); free(C); 
    free(Y); free(y); 
    free(As); free(bs);
    free(x); free(xs); free(xs_tmp); free(x_res);

    MPI_Finalize();
    return  0;
}

Thanks to Maxim for the extra advice :)

Answered 2013-09-11T22:48:07.240