境界ブロック システムを解決する必要があります。ここには 2 つのバージョンがあります。正常に動作するシリアル バージョンと動作しないパラレル バージョン (MPI を使用) で、理由がわかりません... 誰かが私を理解するのを手伝ってくれるかもしれませんなぜ?
================================================== =============================== = アプリケーションプロセスの 1 つの不適切な終了 = 終了コード: 11 = クリーンアップ中残りのプロセス = 以下のクリーンアップ メッセージは無視できます ======================================= ========================================= アプリケーションが終了文字列で終了しました: セグメンテーション違反 (シグナル 11) これは通常、アプリケーションの問題を示しています。デバッグの提案については、FAQ ページを参照してください。
シリアル版:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "res.h"
// Numero di blocchi
#define N 128
// Dimensione di ogni Blocco
#define BS 256
// Leading Dimension
#define LD BS * BS
/*
*
*/
int main (int argc, char ** argv) {
// Partenza del timer
clock_t t_end, t_start = clock();
double time;
size_t ii, jj;
/*
* Alloco tutta la memoria necessaria per poter svolgere i calcoli e
* popolo le matrici (in ordine) A, B, C, As, bi, bs
*/
double *A = calloc (LD * N, sizeof(*A));
double *B = calloc (LD * N, sizeof(*B));
double *C = calloc (LD * N, sizeof(*C));
double *As = calloc (LD, sizeof(*As));
double *x = calloc (BS * N, sizeof(*x));
double *xs = calloc (BS, sizeof(*xs));
double *B_tmp = calloc (BS * N, sizeof(*x));
double *b = calloc (BS * N, sizeof(*b));
double *bs = calloc (BS, sizeof(*bs));
double *Y = calloc (LD * N, sizeof(*Y));
double *y = calloc (BS * N, sizeof(*y));
double *D = calloc (LD * N, sizeof(*D));
double *d = calloc (BS * N, sizeof(*d));
double *A_cap = calloc (LD, sizeof(*A_cap));
double *A_tmp = calloc (LD, sizeof(*A_tmp));
double *b_cap = calloc (BS, sizeof(*b_cap));
double *b_tmp = calloc (BS, sizeof(*b_tmp));
double *xi = calloc (BS * N, sizeof(*xi));
double *xs_tmp = calloc (BS, sizeof(*xs_tmp));
for (ii = 0; ii < N; ii++){
genera_Ai(&A[LD*ii], BS, ii+1);
genera_Bi(&B[LD*ii], BS, ii+1);
genera_Bi(&C[LD*ii], BS, ii+1);
}
genera_Ai(As, BS, N+2);
for (ii = 0; ii < N; ii++) {
for(jj = 0; jj < BS; jj++)
b[BS*ii + jj] = 1;
}
for (ii = 0; ii < BS; ii++)
bs[ii] = 1;
/* Step #1
* Calcolo la fattorizzazione LU di tutte le matrici A0,..., AN.
* Ogni matrice Ai ha dimensione BS * BS (256 x 256) overo LD.
*/
for (ii = 0; ii < N; ii++)
LU (&A[LD*ii], BS);
/* Troviamo le matrici Di = Ai^-1 * Bi, di = Ai^-1 * bi con la
fattorizzazione LU e eliminazione in avanti e sostituzione all'indietro */
for (ii = 0; ii < N; ii++) {
elim_avanti (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);
elim_avanti (&A[LD*ii], BS, &b[BS*ii], 1, &y[BS*ii]);
}
for (ii = 0; ii < N; ii++) {
sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);
sost_indietro (&A[LD*ii], BS, &y[BS*ii], 1, &d[BS*ii]);
}
/* Step #2
* Calcolo A_cap e b_cap, eseguendo prima il prodotto matrice matrice e poi
* la sottrazione dalla matrice As | bs (rispettivamente per A_cap e b_cap).
*
* A_tmp = A_tmp + (Ci * Di) ---> A_cap = As - A_tmp
* b_tmp = b_tmp + (Ci * di) ---> b_cap = bs - b_tmp
*/
// #2.1 - Prodotto matrice matrice, ovvero A_tmp = A_tmp + (Ci * Di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, A_tmp, BS);
// #2.2 - Calcolo di A_cap, ovvero A_cap = As - A_tmp
for (ii = 0; ii < BS; ii++)
for (jj = 0; jj < BS; jj++)
A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];
// #2.3 - Prodotto matrice matrice, ovvero b_tmp = b_tmp + (Ci * di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, b_tmp, 1);
// #2.4 - Calcolo di b_cap = bs - b_tmp
for (ii = 0; ii < BS; ii++)
b_cap[ii] = bs[ii] - b_tmp[ii];
/* Step #3
* Risolvo il sistema A_cap * xs = b_cap, dove:
*
* A_cap = (As - Ci * Ai^-1 * Bi)
* b_cap = (bs - Ci * Ai^-1 * bi)
*/
LU (A_cap, BS);
elim_avanti (A_cap, BS, b_cap, 1, xs_tmp);
sost_indietro (A_cap, BS, xs_tmp, 1, xs);
/*
* Step #4
*
* Calcolo le restanti componenti del vettore soluzione x risolvendo
* N sistemi lineari Ai * xi = bi - Bi * xs.
*/
for (ii = 0; ii < N; ii++) {
// #4.1 Calcolo Bi*xs = B_tmp
my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &B_tmp[BS*ii], 1);
// #4.2 Calcolo bi-B_tmp = B_tmp
for (jj = 0; jj < BS; jj++)
B_tmp[BS*ii + jj] = b[BS*ii + jj] - B_tmp[BS*ii + jj];
// #4.3 Risolvo il sistema Ai*xi = x
elim_avanti(&A[LD*ii], BS, &B_tmp[BS*ii], 1, xs_tmp);
sost_indietro(&A[LD*ii], BS, xs_tmp, 1, &x[BS*ii]);
}
// Salvo tutte le soluzioni nel vettore finale xi
for (ii = 0; ii <= N; ii++){
for (jj = 0; jj < BS; jj++) {
if (ii < N)
xi[BS*ii + jj] = x[BS*ii + jj];
else
xi[BS*ii + jj] = xs[jj];
}
}
// Fermo il timer
t_end = clock();
time = (t_end - t_start) / (double) CLOCKS_PER_SEC;
printf("Time is: %f\n", time);
free(A); free(B); free(C); free(As);
free(x); free(xs); free(b); free(bs);
free(Y); free(y); free(D); free(d);
free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
free(xs_tmp); free(xi);
return 0;
}
パラレルバージョン:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <mpi.h>
#include "res.h"
// Blocks
#define N 128
// Block Size
#define BS 256
// Leading Dimension
#define LD BS * BS
// The master node
#define MASTER 0
// Globals
/*
*
*/
int main (int argc, char ** argv) {
size_t ii, jj;
int my_rank, num_nodes;
double t_start, t_end;
double *A, *A_recv, *B, *B_recv, *C, *C_recv;
double *As, *x, *xs, *b, *bs;
double *Y, *y, *D, *d, *D_tmp, *d_tmp;
double *A_cap, *A_tmp, *b_cap, *b_tmp;
double *xi, *xs_tmp;
double *x_res;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &num_nodes);
/**************************** From Master *******************************/
if (my_rank == MASTER) {
t_start = MPI_Wtime();
A = calloc (LD * N, sizeof(*A));
B = calloc (LD * N, sizeof(*B));
C = calloc (LD * N, sizeof(*C));
As = calloc (LD, sizeof(*As));
x = calloc (BS * N, sizeof(*x));
xs = calloc (BS, sizeof(*xs));
b = calloc (BS * N, sizeof(*b));
bs = calloc (BS, sizeof(*bs));
Y = calloc (LD * N, sizeof(*Y));
y = calloc (BS * N, sizeof(*y));
D = calloc (LD * N, sizeof(*D));
d = calloc (BS * N, sizeof(*d));
A_cap = calloc (LD, sizeof(*A_cap));
A_tmp = calloc (LD, sizeof(*A_tmp));
b_cap = calloc (BS, sizeof(*b_cap));
b_tmp = calloc (BS, sizeof(*b_tmp));
d_tmp = calloc (BS, sizeof(*d_tmp));
xi = calloc (BS * N, sizeof(*xi));
xs_tmp = calloc (BS, sizeof(*xs_tmp));
x_res = calloc (BS, sizeof(*x_res));
// Initialize the matrices
for (ii = 0; ii < N; ii++){
genera_Ai(&A[LD*ii], BS, ii+1);
genera_Bi(&B[LD*ii], BS, ii+1);
genera_Bi(&C[LD*ii], BS, ii+1);
}
genera_Ai(As, BS, N+1);
for (ii = 0; ii < N; ii++) {
for(jj = 0; jj < BS; jj++)
b[BS*ii + jj] = 1;
}
for (ii = 0; ii < BS; ii++)
bs[ii] = 1;
// Each node will receive (LD * N / num_nodes) entries
A_recv = calloc (LD * N / num_nodes, sizeof(*A_recv));
B_recv = calloc (LD * N / num_nodes, sizeof(*B_recv));
C_recv = calloc (LD * N / num_nodes, sizeof(*C_recv));
// Send A matrix
MPI_Scatter (A, LD * N, MPI_DOUBLE,
A_recv, LD * N, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
// Send B matrix
MPI_Scatter (B, LD * N, MPI_DOUBLE,
B_recv, LD * N, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
// Send C matrix
MPI_Scatter (C, LD * N, MPI_DOUBLE,
C_recv, LD * N, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
}
/**************************** From Workers *******************************/
/* Step #1
* Each processor performs the LU decomposition of their matrices Ai
* generating the two matrices Li and Ui.
*/
for (ii = 0; ii < N / num_nodes; ii++)
LU (&A[LD*ii], BS);
/* Step #2
* Each processor calculates the term di = Ci * Ai^-1 * bi for each matrix
* For it meant taking advantage of the LU decomposition calculated in step
* Number 1. Then it calculates the sum.
*/
// 2.1
for (ii = 0; ii < N / num_nodes; ii++)
elim_avanti (&A[LD*ii], BS, &b[BS*ii], 1, &y[BS*ii]);
// 2.2
for (ii = 0; ii < N / num_nodes; ii++)
sost_indietro (&A[LD*ii], BS, &y[BS*ii], 1, &d[BS*ii]);
// #2.3 - matrix multiplication, d_tmp = d_tmp + (Ci * di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, d_tmp, 1);
MPI_Barrier (MPI_COMM_WORLD);
/* Step #3
* Through a communication type "Reduce" with sum over the terms "di" the
* Master processor obtains the sum Ci * Ai ^ -1 * Bi with i from 0 to n-1 and
* This calculates b_cap.
*/
// 3.1 - sum d_tmp
MPI_Reduce (&d_tmp, &b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
// #3.2 - Calc b_cap = bs - b_tmp
for (ii = 0; ii < BS; ii++)
b_cap[ii] = bs[ii] - b_tmp[ii];
/* Step #4
* Each processor calcs the term Di = Ci * Ai^-1 * Bi by the resolution of
* n linear systems Ai * xj = Bij
*/
// 4.1
for (ii = 0; ii < N / num_nodes; ii++)
elim_avanti (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);
// 4.2
for (ii = 0; ii < N / num_nodes; ii++)
sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);
// #4.3 - Prodotto matrice matrice, ovvero D_tmp = D_tmp + (Ci * Di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, D_tmp, BS);
/* Step #5
* Through a communication type "Reduce" with sum over the terms 'di', the
* Master processor obtains the sum There Ci * Ai ^ -1 * Bi with i from 0 to n-1 and
* This calculates A_cap.
*/
// 5.1 - sum all D_tmp
MPI_Reduce (&D_tmp, &A_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
// #5.2 - calc A_cap, that is A_cap = As - A_tmp
for (ii = 0; ii < BS; ii++)
for (jj = 0; jj < BS; jj++)
A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];
/* Step #6
* The "master" processor resolve the system A_cap * xs = b_cap
* to dermine xs.
*/
LU (A_cap, BS);
elim_avanti (A_cap, BS, b_cap, 1, xs_tmp);
sost_indietro (A_cap, BS, xs_tmp, 1, xs);
MPI_Barrier (MPI_COMM_WORLD);
/* Step #7
* The solution xs is communicated to all processors via a broadcast.
*/
if (my_rank == MASTER)
MPI_Bcast (xs, BS, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
/* Step #8
* each processor j-esimo resolve the linear systems Ai*xi = bi - Bi*xs
*/
for (ii = 0; ii < N / num_nodes; ii++) {
// #8.1 Calc Bi*xs = x
my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &x[BS*ii], 1);
// #8.2 Calc bi-x = x
for (jj = 0; jj < BS; jj++)
x[BS*ii + jj] = b[BS*ii + jj] -x[BS*ii + jj];
// #8.3 Resolve the system Ai*xi = x
elim_avanti (&A[LD*ii], BS, &x[BS*ii], 1, xs_tmp);
sost_indietro (&A[LD*ii], BS, xs_tmp, 1, &x[BS*ii]);
}
/* Step #9
* The components "xi" are combined into a single vector via a
* Gather type communication. Joined together to form the solution "xs"
* To the initial problem.
*/
MPI_Gather (xi, BS, MPI_DOUBLE,
x_res, BS, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
for (ii = 0; ii < N; ii++){
for (jj = 0; jj < BS; jj++) {
if (ii < N)
xi[BS*ii + jj] = x[BS*ii + jj];
else
xi[BS*ii + jj] = xs[jj];
}
}
// Wait all nodes before to print the result
MPI_Barrier (MPI_COMM_WORLD);
// Stop timer
if (my_rank == MASTER) {
t_end = MPI_Wtime();
printf("Time is %f\n", t_end - t_start);
}
free(A); free(B); free(C); free(As);
free(x); free(xs); free(b); free(bs);
free(Y); free(y); free(D); free(d);
free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
free(xs_tmp); free(xi);
MPI_Finalize();
return 0;
}
これは res.h http://bpaste.net/show/yAQCCwFsZlQCCiw7GKC9/で、先生から提供されたものです。
下手な英語といくつかのイタリア語のコメントで申し訳ありません。コードは明確である必要があります。私の目標は、コードのシリアル部分を MPI で並列化することです。プロセッサの数は常に 4 の倍数です。よろしくお願いします。