
I'm working on a project implementing parallel bitonic sort with MPI and C. The program I wrote works correctly, but it is not efficient: a plain quicksort (sigh) beats it on execution time. Maybe the problem is the cost of communication, but I don't see how to improve on that. Here is the code:

#include <stdio.h> 
#include <stdlib.h> 
#include <mpi.h> 
#include <math.h> 
#include <time.h> 
#include <sys/time.h> 
#include <string.h> 

#include "bs-util.h" 
#include "quicksort.h" 

#define TAG 1 


/* Run this program knowing that: 
* 1) The number of cores must be a power of 2 
* 2) The length of the array to order must be a power of 2 
* 
* Exec Example: mpirun -n 4 ./bs 1024 1024 
* */ 


void exchange(FILE *log, int i, int partner, int up); 

int countTransfer = 0; 

int *myArray, *partnerArray; 
int currentPartner = -1; 
int rank, size; 
MPI_Status status; 
int verbose = 0; // toggles some useful debugging prints: 1 = on, 0 = off 
int amount=0; 

int main(int argc, char *argv[]) 
{ 
    int *array; 
    int i=0; 
    int carry=0; 
    int up=1; 
    int count=0; 

    struct timeval tim; 

    FILE *log; 

    char logName[15] = "log/"; 

    MPI_Init(&argc, &argv); 
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); 
    MPI_Comm_size(MPI_COMM_WORLD, &size); 
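    /* Editor's sketch of a guard for the power-of-2 requirement stated 
     * in the header comment (size > 0 is a power of 2 iff 
     * (size & (size-1)) == 0): 
     * if (size & (size-1)) MPI_Abort(MPI_COMM_WORLD, 1); 
     */ 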

    /* Time meter */ 
    srand((unsigned) time(NULL)); 
    gettimeofday(&tim, NULL); 
    double t1=tim.tv_sec+(tim.tv_usec/1000000.0); 

    snprintf(logName+4, 10, "%d",rank); 
    log = fopen(logName,"w"); 

    printf("Hello world from process %d of %d.\n", rank, size); 
    MPI_Barrier(MPI_COMM_WORLD); 

    /* INPUT */ 

    if (rank==0) 
    { 
     if (argc==2) /* by file */ 
     { 
      FILE *input = fopen(argv[1],"r"); 
      char line[20]; 
      count = 0; 
      while(fgets(line,20,input) != NULL) 
      { 
       count++; 
      } 
      fclose(input); 
      array = (int *)malloc(count*sizeof(int)); 
      input = fopen(argv[1],"r"); 
      i = 0; 
      while(fgets(line,20,input) != NULL) 
      { 
       array[i] = atoi(line); 
       i++; 
      } 
      fclose(input); 
     } 
     else 
      if (argc==3) /* by command line */ 
      { 
       count = atoi(argv[1]); 
       int max = atoi(argv[2]); 
       array = (int *)malloc(count*sizeof(int)); 
       srand(time(NULL)); 
       for (i=0; i<count; i++) 
       { 
        array[i] = rand()%max; 
       } 
      } 
      else 
      { 
       printf("\n\n ----------- ERRORE NEI PARAMETRI DI INPUT ----------- \n\n"); 
       return 1; 
      } 

     /* END OF THE INPUT */ 

     if (verbose){ 
      printf("Initial array:\n"); 
      for (i=0; i<count; i++) 
      { 
       printf("%d\t", array[i]); 
      } 
      printf("\n"); 
     } 
     /* Everyone waits for everyone else */ 
     MPI_Barrier(MPI_COMM_WORLD); 

     carry = count%size; 
     amount = count/size + carry; 
     printf("\nParametri: amount=%d carry=%d\n\n", amount, carry); 
     up=1; 
     int startIndex = amount; 


     myArray = (int *)malloc(amount*sizeof(int)); 
     /* Buffer (partner) */ 
     partnerArray = (int *)malloc(amount*sizeof(int)); 

     for (i=0; i<amount; i++) 
      myArray[i] = array[i]; 
     printf("Processo %d riceve amount=%d e up=%d\n", rank, amount, up); 
     if (verbose){ 
      printf("Mia porzione ---> "); 
      for (i=0; i<amount; i++) 
      { 
       printf("%d\t", myArray[i]); 
      } 
      printf("\n"); 
     } 

     /* Sending the big array's chunks */ 
     for (i=1; i<size; i++) 
     { 
      up = (i+1) % 2; 
      MPI_Send(&up, 1, MPI_INT, i, TAG, MPI_COMM_WORLD); 
      MPI_Send(&amount, 1, MPI_INT, i, TAG, MPI_COMM_WORLD); 
      MPI_Send(&carry, 1, MPI_INT, i, TAG, MPI_COMM_WORLD); 

      MPI_Send(array+startIndex, amount-carry, MPI_INT, i, TAG, MPI_COMM_WORLD); 

      startIndex += amount-carry; 
     } 

     MPI_Barrier(MPI_COMM_WORLD); 
    } 
    else 
    { 
     MPI_Barrier(MPI_COMM_WORLD); 

     MPI_Recv(&up, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status); 
     MPI_Recv(&amount, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status); 
     MPI_Recv(&carry, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status); 
     myArray = (int *)malloc(amount*sizeof(int)); 
     partnerArray = (int *)malloc(amount*sizeof(int)); /* Buffer (partner) */ 
     MPI_Recv(myArray, amount, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status); 


     /* Experimental padding: every chunk holds the same number of items. */ 
     for (i=amount-carry; i<amount; i++) 
     { 
      myArray[i] = 0; 
     } 

     printf("\n"); 
     printf("Processo %d riceve amount=%d e up=%d\n", rank, amount-carry, up); 
     if (verbose){ 
      printf("Mia porzione ---> "); 
      for (i=0; i<amount; i++) 
      { 
       printf("%d\t", myArray[i]); 
      } 
      printf("\n"); 
     } 
     MPI_Barrier(MPI_COMM_WORLD); 
    } 

    /* CORE */ 

    /* Local Quicksort */ 
    int result = quickSort(&myArray[0], amount); // defined in src/quicksort.c 
    if (verbose){ 
     if (result == 1) 
      printf("Quick Sort: FAIL \n"); 
     else 
     { 
      printf("\nLa mia porzione ordinata (processo %d)\n", rank); 
      for(i=0; i<amount; i++) 
      { 
       printf("%d ",myArray[i]); 
      } 
      printf ("\n"); 
     } 
    } 

    int j; 

    /* Bitonic merge phase: treat the distributed data as one virtual 
     * array of amount*size elements; each index pair (i, i^j) is 
     * compare-exchanged, ascending or descending depending on i&up. */ 
    for (up = 8; up <= amount*size; up = 2*up) 
    { 
     for (j = up>>1; j > 0; j = j>>1) 
     { 
      for (i = 0; i < amount*size; i++) 
      { 
       int partner = i^j; 
       if (partner > i) 
       { 
        exchange(log, i, partner, i&up); 
       } 
      } 
     } 
    } 

    /* END OF THE CORE */ 

    if (rank!=0) 
    { 
     MPI_Send(myArray, amount, MPI_INT, 0, TAG, MPI_COMM_WORLD); 
    } 
    gettimeofday(&tim, NULL); 
    double t2=tim.tv_sec+(tim.tv_usec/1000000.0); 
    if (rank==0) 
    { 
     myArray = (int *)realloc(myArray,sizeof(int)*amount*size); 
     for (i=1; i<size; i++) 
      MPI_Recv(myArray+i*amount, amount, MPI_INT, i, TAG, MPI_COMM_WORLD, &status); 
     printf("\nTempo trascorso %6f\n", t2-t1); 
     fprintf(log,"\n\n----------> Array Iniziale <----------\n"); 
     printArray(log,array,count); 
     fprintf(log,"\n\n----------> Array Finale <----------\n"); 
     printArray(log,myArray+(carry*(size-1)),count); 
     /*printArray(log,myArray,newAmount*size);*/ 

    }  
    fprintf(log,"Numero di chunk scambiati: %d\n",countTransfer); 
    fclose(log); 
    MPI_Finalize(); 
    return 0; 
} 

void exchange(FILE *log, int i, int partner, int up) 
{ 
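    /* Map the global indices i and partner onto the owning ranks and 
     * the local offsets inside each rank's chunk of `amount` elements. */ 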
    int rank_i = i/amount; 
    int rank_partner = partner/amount; 

    int offset_i = i%amount; 
    int offset_partner = partner%amount; 
    /*if (verbose) 
     fprintf(log,"\nnewAmount = %d - Rank_i = %d - Rank_partner = %d - Offset_i = %d - Offset_partner = %d \n",amount,rank_i,rank_partner,offset_i,offset_partner); 
    */ 

    if ((rank_i != rank) && (rank_partner != rank)) /* neither element is ours: nothing to do */ 
     return; 

    if ((rank_i == rank) && (rank_partner == rank)) /* both elements are local: plain compare-exchange */ 
    { 
     if (((up==0) && (myArray[offset_i] > myArray[offset_partner])) || ((up!=0) && (myArray[offset_i] < myArray[offset_partner]))) 
     { 
      int temp = myArray[offset_i]; 
      myArray[offset_i] = myArray[offset_partner]; 
      myArray[offset_partner] = temp; 
     } 
     return; 
    } 

    if (rank_i == rank && rank_partner != rank) 
    { 
     if (currentPartner != rank_partner) 
     { 
      MPI_Send(myArray, amount, MPI_INT, rank_partner, TAG, MPI_COMM_WORLD); 
      MPI_Recv(partnerArray, amount, MPI_INT, rank_partner, TAG, MPI_COMM_WORLD, &status); 
      currentPartner = rank_partner; 
      countTransfer++; 
     } 
     if (((up==0) && (myArray[offset_i] > partnerArray[offset_partner])) || ((up!=0) && (myArray[offset_i] < partnerArray[offset_partner]))) 
      myArray[offset_i] = partnerArray[offset_partner]; 
     return; 
    } 

    if (rank_i != rank && rank_partner == rank) 
    { 
     if (currentPartner != rank_i) 
     { 
      MPI_Recv(partnerArray, amount, MPI_INT, rank_i, TAG, MPI_COMM_WORLD, &status); 
      MPI_Send(myArray, amount, MPI_INT, rank_i, TAG, MPI_COMM_WORLD); 
      currentPartner = rank_i; 
      countTransfer++; 
     } 
     if (((up==0) && (partnerArray[offset_i] > myArray[offset_partner])) || ((up!=0) && (partnerArray[offset_i] < myArray[offset_partner]))) 
      myArray[offset_partner] = partnerArray[offset_i]; 
     return; 
    } 

} 

And here is the makefile:

CC = mpicc 
OPTIMIZE = 
CFLAGS = $(DEFINES) $(OPTIMIZE) 
LFLAGS = -lm     
PROGS = ./bs 
PROGS_SRC = src/bs-util.c src/bs.c src/quicksort.c 


all: 
    $(CC) $(CFLAGS) -o $(PROGS) $(PROGS_SRC) $(LFLAGS) 
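
Worth flagging for any timing comparison: OPTIMIZE is left empty above, so the build is unoptimized. Filling it in, for example

    OPTIMIZE = -O2 

is the usual first step before benchmarking against quicksort.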

Any help would be much appreciated :)

References: http://goo.gl/nXt4p

Answers

Answer (4 votes):

Keep in mind that the time complexity of bitonic sort is on the order of (N/P)(log N)^2, versus N log N for (serial) quicksort. This means that when log N > P (P being roughly the number of processors), even serial quicksort should beat bitonic sort (and that is before multiplying in implementation-dependent constant factors, or the communication). Bitonic sort is meant for truly parallel machines (it does very well on GPUs), not for the handful of PCs you probably have.
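
To make that crossover concrete, a back-of-the-envelope comparison (ignoring constant factors and all communication costs):

    T_quicksort ~ N log N             (serial) 
    T_bitonic   ~ (N/P) (log N)^2     (P processors) 

    (N/P) (log N)^2 < N log N   <=>   P > log N 

So for N = 2^20 (about a million items), log N = 20: with fewer than about 20 processors, serial quicksort is expected to stay ahead even before communication overhead is counted.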

Comment: In fact, I'm working with 4, 8 or 16 processors, and I'm trying to sort millions of items (it's for academic purposes). Thanks for the helpful explanation!

Answer (1 vote):

Um. I don't see you doing any collective communication besides the barriers...

Comment: What about MPI_Send/MPI_Recv?

Comment: The awesome thing about MPI is that broadcast, all-to-all, gather, scatter, reduce, scan, ... are all already optimized for you. There is also MPI file I/O to speed up reading the input. The basic skeleton of a Batcher sort would be: read the data with MPI file I/O, sort locally (e.g. with C++'s std::sort), then start doing the merges.
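
For illustration, a minimal sketch of what that comment could mean for the distribution and collection phases of the code above, replacing the hand-rolled send loop with collectives. This is an editor's sketch, not code from the thread; it assumes count is divisible by size (which the power-of-2 requirements guarantee whenever count >= size), and `chunk`/`local` are illustrative names:

    MPI_Bcast(&count, 1, MPI_INT, 0, MPI_COMM_WORLD); /* every rank needs the total size */ 
    int chunk = count / size;                         /* assumes count % size == 0 */ 
    int *local = malloc(chunk * sizeof(int)); 

    /* split rank 0's array into equal chunks, one per rank */ 
    MPI_Scatter(array, chunk, MPI_INT, local, chunk, MPI_INT, 0, MPI_COMM_WORLD); 

    quickSort(local, chunk);                          /* local sort, as in the question */ 
    /* ... bitonic merge stages ... */ 

    /* collect the sorted chunks back on rank 0 */ 
    MPI_Gather(local, chunk, MPI_INT, array, chunk, MPI_INT, 0, MPI_COMM_WORLD); 

Besides being optimized by the MPI implementation, this removes the carry/padding bookkeeping entirely, since every rank gets exactly the same number of elements.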

Comment: Practical tips are always welcome, thanks!

Answer (2 votes):

Sending and receiving many small chunks of data (as the exchange function does) hurts performance badly. It is more efficient to combine the small chunks into a single buffer and send that in one message.
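
As a sketch of what that could look like in this code (a hypothetical helper, not a drop-in replacement; it reuses the question's TAG and headers and assumes both partners' chunks are already sorted, which the local quicksort establishes): each rank swaps its whole chunk with the partner exactly once per merge step and then keeps the lower or upper half of a local merge, so the per-element traffic in exchange() disappears.

    /* Hypothetical compare_split: one whole-chunk exchange per partner, 
     * then keep the n smallest (keep_low != 0) or the n largest of the 
     * 2n combined elements. Both partners must call it symmetrically. */ 
    void compare_split(int *mine, int *buf, int n, int partner, int keep_low) 
    { 
     int i = 0, j = 0, k; 
     int *merged = malloc(2 * n * sizeof(int)); 

     /* one paired exchange; MPI_Sendrecv cannot deadlock on ordering */ 
     MPI_Sendrecv(mine, n, MPI_INT, partner, TAG, 
         buf, n, MPI_INT, partner, TAG, 
         MPI_COMM_WORLD, MPI_STATUS_IGNORE); 

     /* ordinary two-way merge of the two sorted runs */ 
     for (k = 0; k < 2*n; k++) 
      merged[k] = (j >= n || (i < n && mine[i] <= buf[j])) ? mine[i++] : buf[j++]; 

     memcpy(mine, keep_low ? merged : merged + n, n * sizeof(int)); 
     free(merged); 
    } 

Each merge step then costs one chunk-sized message per rank instead of per-element bookkeeping.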

Comment: Thanks for the advice, I'll try it that way!

Comment: @Flavio: all thanks on SO (StackOverflow) should be converted into upvotes and/or accepting an answer.