獲得集體中止在我的泊松方程代碼

/***every function is working correct but after only first iteration is giving collective abort anyone can tell what is or coulde be the reason***/ 
#include<stdio.h> 
#include<stdlib.h> 
#include"mpi.h" 

const double tolerance = 0.00001; 
const int maxit = 10000; 

void MPE_decomp1d(int n, int size, int id, int *s, int *e) 
{ 
    /*****calculating start and end row for every process*****/ 
    *s = (n/size)*id + ((n%size)>0)*(id>(n%size)?n%size:id); 
     *e = *s + (n/size)+((n%size)>id); 
} 

void onedinit(double **a, double **b, double **f, const int nx, const int s, const int e) 
{ 
    int i, j; 
    int ls, le; 
    ls = s - (s!=0); 
    le = e + (e!=nx); 
    /***setting all the intial values to zero****/ 
    for (i = ls; i < le; i++) 
    { 
     for (j = 0; j < nx; j++) 
     { 
      a[i][j] = b[i][j] = f[i][j] = 0.0; 
     } 
    } 
    //***************************Defining Boundary Condition***********************************// 
    /***setting left boundary to 1***/ 
    for (i = ls; i < le; i++) a[i][0] = b[i][0] = 1; 
    /***setting value f(0, i) is 2***/ 
    if (s==0) for (i = 0; i < nx; i++) a[0][i] = b[0][i] = 2.0; 
} 

void exchng1(double **a, const int nx, const int s, const int e, MPI_Comm comm1d, int nbrbottom, int nbrtop) 
{ 
    int rank, coord; 
    MPI_Status status; 
    MPI_Comm_rank(comm1d, &rank); 
    MPI_Cart_coords(comm1d, rank, 1, &coord); 
    /*****************if process id is odd then first send and if even then first recive to avoid deadlock**********/ 
    if (coord&1) 
    { 
     if (nbrbottom != -1) MPI_Send(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, comm1d); 
     if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 1, comm1d, &status); 
     if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 0, comm1d); 
     if (nbrbottom != -1) MPI_Recv(a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status); 
    } 
    else 
    { 
     if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status); 
     if (nbrbottom != -1) MPI_Send(a[e-s-(s==0)], nx, MPI_DOUBLE, nbrbottom, 1, comm1d); 
     if (nbrbottom != -1) MPI_Recv(a[e-s+(s!=0)], nx, MPI_DOUBLE, nbrbottom, 0, comm1d, &status); 
     if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 1, comm1d); 
    } 
} 

void sweep1d(double **a, double **f, int nx, const int s, const int e, double **b) 
{ 
    int i, j; 
    int rows; 
    rows = e - s - (s==0) - (e==0); 
    nx -= 1; 
    double h = 1.0/(double)nx; 
    for (i = 1; i <= rows; i++) for (j = 1; j < nx; j++) 
     b[i][j] = 0.25 * (a[i-1][j] + a[i][j+1] + a[i][j-1] + a[i+1][j]) - h*h*f[i][j]; 
    return; 
} 

double diff(double **a, double **b, const int nx, int s, int e) 
{ 
    double sum = 0.0; 
    int i, j; 
    int st, ed; 
    st = (s!=0); 
    ed = e-s+(s!=0); 
    for (i = st; i < ed; i++) for (j = 0; j < nx; j++) 
     sum += (a[i][j] - b[i][j])*(a[i][j] - b[i][j]); 
    return sum; 
} 
int main(int argc, char *argv[]) 
{ 
    int nx, ny; 
    int myid, root, numprocs, period=0; 
    int nbrbottom, nbrtop, s, e, it; 

    double diffnorm, dwork; 
    double t1, t2; 
    double **a, **b, **f; 

    root = 0; 
    MPI_Comm comm1d; 
    MPI_Init(&argc, &argv);; 
    MPI_Comm_rank(MPI_COMM_WORLD, &myid); 
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs); 

    if(!myid) 
    { 
     /******for this piece of code nx and ny are assumed to be same please*******/ 
     printf("Enter the number of cells in X & Y direction\n"); 
     scanf("%d %d", &nx, &ny); 
     nx += 1; 
     ny += 1; 
     ny = nx; //forced to follow our assumption; 
    } 
    MPI_Bcast(&nx, 1, MPI_INT, 0, MPI_COMM_WORLD); 
    MPI_Bcast(&ny, 1, MPI_INT, 0, MPI_COMM_WORLD); 

    MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, &period, 1, &comm1d); 

    MPI_Comm_rank(comm1d, &myid); 
    MPI_Cart_shift(comm1d, 0, 1, &nbrtop, &nbrbottom); 

    MPE_decomp1d(ny, numprocs, myid, &s, &e); 
    int ls, le, rows; 
    int i, j; 
    ls = s - (s!=0); 
    le = e + (e!=nx); 
    rows = le - ls; 

    a = (double**)malloc(rows*sizeof(double*)); 
    b = (double**)malloc(rows*sizeof(double*)); 
    f = (double**)malloc(rows*sizeof(double*)); 
    for (i = ls; i < le; i++) 
    { 
     a[i] = (double*)malloc(nx*sizeof(double)); 
     b[i] = (double*)malloc(nx*sizeof(double)); 
     f[i] = (double*)malloc(nx*sizeof(double)); 
    } 
    onedinit(a, b, f, nx, s, e); 
    diffnorm = 0.0; 
    it = 0; 
    do 
    { 
//  printf("%danshu\n", myid); 

     exchng1(a, nx, s, e, comm1d, nbrbottom, nbrtop); 
     sweep1d(a, f, nx, s, e, b); 
     exchng1(b, nx, s, e, comm1d, nbrbottom, nbrtop); 
     sweep1d(b, f, nx, s, e, a); 
     dwork = diff(a, b, nx, s, e); 

     /************printing matrix a after every iteration******/ 
     for (i = 0; i < rows; i++) 
     { 
      for (j = 0; j < nx; j++) printf("%lf ", a[i][j]); 
      printf("\n"); 
     } 

     MPI_Barrier(comm1d); 

     //printf("%lfhehe\n", dwork); 
     MPI_Allreduce(&dwork, &diffnorm, 1, MPI_DOUBLE, MPI_SUM, comm1d); 
     //printf("%dhere\n", myid); 
    } 
    while (++it < maxit && diffnorm > tolerance); 

    MPI_Finalize(); 
    return 0; 
}

來源

2011-03-01 anshuman Mishra

泊松？反正TLDR。 – Anycorn 2011-03-01 07:00:47

歡迎來到SO。通過這種方式，你可以很容易地找到某人查看這些代碼。請縮小你的問題範圍，解釋你正在做的事情，並更詳細地解釋如何做到這一點。請使用編輯按鈕來完成此操作。 – 2011-03-01 07:59:46

所以才傾銷130行代碼在SO，問爲什麼它不工作，可能是無法得到很好的答案的最佳途徑 - 特別是當你寫的唯一實際森泰斯是「每個功能都在起作用」......如果是這樣的話，你就不會有問題。你需要把事情縮小到更具體的情況，並得到更具體的問題。

在這種特殊情況下，我在教學過程中看到過很多像這樣的代碼，因此可以看到發生的一些事情。

首先，你不能做這樣的東西：

ls = s - (s!=0); 
le = e + (e!=nx); 
rows = le - ls; 

a = (double**)malloc(rows*sizeof(double*)); 
/*...*/ 
for (i = ls; i < le; i++) 
{ 
    a[i] = (double*)malloc(nx*sizeof(double)); 
    /*...*/ 
}

如果你有100行分成4個處理器，而你（說）MPI任務2，那麼你的s爲50和e是75，所以ls將是49和le將是76，所以你試圖訪問a[49..76]，即使你只分配了a大小27！這個特定的錯誤出現在代碼中，需要修復。你想要訪問a[0..rows-1]。

順便說一句，我甚至沒有檢查過，看看MPE_decomp1d實際上是否做對了。我們都經歷了一個階段，我們認爲C中可愛的東西是通過使用乘以三元運算符等的邏輯表達式將事情放在一行中的，但是嚴重的是，當別人必須修復它時，它會使您的代碼變得不必要繁瑣，無論是2個月後的SOers還是你自己。

在exchng1，你正在做不必要的工作。您無需查看nbrbottom或nbrtop是否有效;如果它們不是，則MPI_Cart_shift返回MPI_PROC_NULL，其中發送或接收是無操作的。因此，從這些等級發送/接收是無害的，這是一個偉大的設計決策，因爲它避免了邏輯中的大量情況。

同樣，爲避免死鎖，您可以使用MPI_Sendrecv而不是單獨的Send s和Recv s。這加上上述表示的，而不是這樣的：

if (coord&1) 
{ 
    if (nbrbottom != -1) MPI_Send(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, comm1d); 
    if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 1, comm1d, &status); 
    if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 0, comm1d); 
    if (nbrbottom != -1) MPI_Recv(a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status); 
} 
else 
{ 
    if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status); 
    if (nbrbottom != -1) MPI_Send(a[e-s-(s==0)], nx, MPI_DOUBLE, nbrbottom, 1, comm1d); 
    if (nbrbottom != -1) MPI_Recv(a[e-s+(s!=0)], nx, MPI_DOUBLE, nbrbottom, 0, comm1d, &status); 
    if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 1, comm1d); 
}

，你可以這樣做：

MPI_Sendrecv(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status); 
MPI_Sendrecv(a[1], nx, MPI_DOUBLE, nbrtop, 1, a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);

- 方式簡單，對不對？

雖然交流中仍然存在一些問題，收到a[e-s+1]是不對的，雖然正如我所提到的，我不能打擾解密MPE_decomp1d找出原因。想必你想收到a[rows-1]。

最後，MPI_Barrier()是緩慢和完全不必要的;在guardcell交換機中有足夠的同步（對Allreduce來說沒什麼），你不需要它。

當所有這些更改都完成後，代碼運行時不會出現內存訪問問題;你必須檢查它是否給出了正確的答案。

#include<stdio.h> 
#include<stdlib.h> 
#include"mpi.h" 

const double tolerance = 0.00001; 
const int maxit = 10000; 

void MPE_decomp1d(int n, int size, int id, int *rows) 
{ 
    int s, e; 
    s = (n/size)*id + ((n%size)>0)*(id>(n%size)?n%size:id); 
    e = s + (n/size)+((n%size)>id); 
    *rows = e - s - (s==0) - (e==0); 
} 

void onedinit(double **a, double **b, double **f, const int nx, const int rows, const int id, const int nprocs) 
{ 
    int i, j; 

    for (i = 0; i < rows; i++) 
    { 
     for (j = 0; j < nx; j++) 
     { 
      a[i][j] = b[i][j] = f[i][j] = 0.0; 
     } 
    } 
    for (i = 0; i < rows; i++) a[i][0] = b[i][0] = 1; 

    if (id == 0) 
     for (i = 0; i < nx; i++) a[0][i] = b[0][i] = 2.0; 
} 
void exchng1(double **a, const int nx, const int rows, MPI_Comm comm1d, int nbrbottom, int nbrtop) 
{ 
    int rank, coord; 
    MPI_Status status; 
    MPI_Comm_rank(comm1d, &rank); 
    MPI_Cart_coords(comm1d, rank, 1, &coord); 

    /* send data downwards */ 
    MPI_Sendrecv(a[rows-2], nx, MPI_DOUBLE, nbrbottom, 0, a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status); 
    /* send data upwards */ 
    MPI_Sendrecv(a[1], nx, MPI_DOUBLE, nbrtop, 1, a[rows-1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status); 
} 

void sweep1d(double **a, double **f, const int nx, const int rows, double **b) 
{ 
    int i, j; 
    double h = 1.0/(double)nx; 
    for (i = 1; i < rows-1; i++) for (j = 1; j < nx-1; j++) 
     b[i][j] = 
      0.25 * (a[i-1][j] + a[i][j+1] + a[i][j-1] + a[i+1][j]) - h*h*f[i][j]; 
    return; 
} 

double diff(double **a, double **b, const int nx, const int rows) 
{ 
    double sum = 0.0; 
    int i, j; 
    for (i = 0; i < rows; i++) for (j = 0; j < nx; j++) 
     sum += (a[i][j] - b[i][j])*(a[i][j] - b[i][j]); 
    return sum; 
} 
int main(int argc, char *argv[]) 
{ 
    int nx, ny; 
    int myid, root, numprocs, period=0; 
    int nbrbottom, nbrtop, it; 

    double diffnorm, dwork; 

    double **a, **b, **f; 

    root = 0; 
    MPI_Comm comm1d; 
    MPI_Init(&argc, &argv);; 
    MPI_Comm_rank(MPI_COMM_WORLD, &myid); 
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs); 

    if(!myid) 
    { 
     /******for this piece of code nx and ny are assumed to be same please*******/ 
     printf("Enter the number of cells in X & Y direction\n"); 
     scanf("%d %d", &nx, &ny); 
     nx += 1; 
     ny += 1; 
     ny = nx; //forced to follow our assumption; 
    } 
    MPI_Bcast(&nx, 1, MPI_INT, 0, MPI_COMM_WORLD); 
    MPI_Bcast(&ny, 1, MPI_INT, 0, MPI_COMM_WORLD); 

    MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, &period, 1, &comm1d); 

    MPI_Comm_rank(comm1d, &myid); 
    MPI_Cart_shift(comm1d, 0, 1, &nbrtop, &nbrbottom); 

    int rows; 
    MPE_decomp1d(ny, numprocs, myid, &rows); 
    int i, j; 

    a = (double**)malloc(rows*sizeof(double*)); 
    b = (double**)malloc(rows*sizeof(double*)); 
    f = (double**)malloc(rows*sizeof(double*)); 
    for (i = 0; i < rows; i++) 
    { 
     a[i] = (double*)malloc(nx*sizeof(double)); 
     b[i] = (double*)malloc(nx*sizeof(double)); 
     f[i] = (double*)malloc(nx*sizeof(double)); 
    } 
    onedinit(a, b, f, nx, rows, myid, numprocs); 
    diffnorm = 0.0; 
    it = 0; 
    do 
    { 
     exchng1(a, nx, rows, comm1d, nbrbottom, nbrtop); 
     sweep1d(a, f, nx, rows, b); 
     exchng1(b, nx, rows, comm1d, nbrbottom, nbrtop); 
     sweep1d(b, f, nx, rows, a); 
     dwork = diff(a, b, nx, rows); 

     /************printing matrix a after every iteration******/ 
     for (i = 0; i < rows; i++) 
     { 
      for (j = 0; j < nx; j++) printf("%lf ", a[i][j]); 
      printf("\n"); 
     } 

     //printf("%lfhehe\n", dwork); 
     MPI_Allreduce(&dwork, &diffnorm, 1, MPI_DOUBLE, MPI_SUM, comm1d); 
     //printf("%dhere\n", myid); 
    } 
    while (++it < maxit && diffnorm > tolerance); 

    MPI_Finalize(); 
    return 0; 
}

來源

2011-03-01 14:28:57

+1只是爲了經歷這麼多意大利麪代碼的麻煩。 – RedX 2011-03-01 14:35:59

thnks所有，我已經得到了錯誤。一些內存分配問題.. – 2011-03-10 09:53:13

不，男人。代碼中存在比「某些內存分配問題」更嚴重的問題。 – 2011-03-10 18:49:07

獲得集體中止在我的泊松方程代碼

回答

相關問題