OpenMP，並行for循環，處理時間差別很大

我開發了一個程序，它讀取.txt文件中的數字，它將存儲到一個矢量中，以進行一系列組合和計算，以確定結果是否與數字匹配我想要的。這些過程將在多個線程中完成，其中每個線程將負責處理parallel for循環內的各種迭代次數。OpenMP，並行for循環，處理時間差別很大

長話短說，當處理時間可能短到3分鐘或可能超過10分鐘時，處理時間會有很大變化（例如9個數字）。

這裏的基準，我試過到目前爲止：

8 numbers serial : 18.119 seconds 
8 numbers multithread (first-try): 10.238 seconds 
8 numbers multithread (second-try): 18.943 seconds 
9 numbers serial : 458.980 seconds 
9 numbers multithread (first-try): 172.347 seconds 
9 numbers multithread (second-try): 519.532 seconds //Seriously? 
//Another try after suggested modifications 
9 numbers multithread (first-try): 297.017 seconds 
9 numbers multithread (second-try): 297.85 seconds 
9 numbers multithread (third-try): 304.755 seconds 
9 numbers multithread (fourth-try): 396.391 seconds

所以現在的問題是，是否有利於改善項目（多線程），因此只需要最少的任何可能的方式洗牌/計算數量的時間？

下面的代碼的一部分，其中用於循環並行發生（更新略有修改）：

#include <iostream>  
#include <fstream> 
#include <string> 
#include <vector> 
#include <stdlib.h> 
#include <algorithm> 
#include <stdio.h> 
#include <Windows.h> 
#include <omp.h> 

#define OPERATORSIZE 3 

using namespace std; 
int cur_target; 
ofstream outFile; 

string get_operator(int i) { 
     switch (i) { 
     case 0: 
      return "+"; 
     case 1: 
      return "-"; 
     case 2: 
      return "*"; 
     case 3: 
      return "/"; 
     default: 
      return ""; 
     } 
} 

int prev_num_pos(vector<int> &cur_equation, int count) { 
    for (int i = count - 1; i >= 0; i--) { 
     if (cur_equation[i] != -1) return i + 1; 
    } 
    return 0; 
} 

bool nextoperator(int k, vector<int> &operator_array) { 
     for (int i = k - 2; i >= 0; i--) { 
      if (operator_array[i] < OPERATORSIZE) { 
       operator_array[i] += 1; 
       break; 
      } 
      else 
       operator_array[i] = 0; 
      switch (i) { 
      case 0: 
       return false; 
      } 
     } 
     return true; 
} 

void vector_combination(vector<int> int_list) {            // Generate the number combinations from the number list 

    bool div_remainder = false; 
    int count = 0; 

    #pragma omp parallel for schedule(dynamic) firstprivate(div_remainder) reduction(+:count) 
    for (int i = 0; i < int_list.size(); ++i) { 

     vector<int> cur_equation, cur_temp, cur_list, operator_array; 

     auto list = int_list; 
     rotate(list.begin(), list.begin() + i, list.begin() + i + 1); 
     do 
     { 
      cur_list.clear(); 
      operator_array.clear(); 
      for (auto x : list) 
       cur_list.push_back(x); 
      for (int i = 0; i < cur_list.size() - 1; i++) 
       operator_array.push_back(0); 

      do 
      { 
       div_remainder = false; 
       count = 0; 
       cur_equation = operator_array; 
       cur_temp = cur_list; 

       for (int i = 0; i < cur_equation.size(); ++i) {         // Check for equation priorities 
        if (cur_equation[i] == 3) { 
         count = i; 
         if (cur_temp[count] % cur_temp[count + 1] != 0) { 
          div_remainder = true; 
          break; 
         } 
        } 
       } 

       if (div_remainder) 
        continue; 

       for (int i = 0; i < cur_temp.size() - 1; ++i) { 
        count = -1; 
        if (cur_equation[i] == 2 || cur_equation[i] == 3) { 
         count = prev_num_pos(cur_equation, i); 
        } 
        else 
         continue; 
        if (cur_equation[i] == 2) { 
         cur_temp[count] *= cur_temp[i + 1]; 
         cur_equation[i] = -1; 
        } 
        else if (cur_equation[i] == 3) { 
         if (cur_temp[i + 1] != 0) { 
          cur_temp[count] /= cur_temp[i + 1]; 
          cur_equation[i] = -1; 
         } 
         else { 
          div_remainder = true; 
          break; 
         } 
        } 
       } 

       if (div_remainder) 
        continue; 

       for (int i = 0; i < cur_temp.size() - 1; ++i) { 
        switch (cur_equation[i]) { 
        case 0: { 
         cur_temp[0] += cur_temp[i + 1];            // Addition 
         cur_equation[i] = -1; 
         break; 
        } 
        case 1: {                  // Subtraction 
         cur_temp[0] -= cur_temp[i + 1]; 
         cur_equation[i] = -i; 
         break; 
        } 
        } 
       } 


       if (cur_temp[0] == cur_target) { 
        #pragma omp critical 
        { 
         for (int i = 0; i < cur_list.size(); ++i) { 
          outFile << cur_list[i]; 
          if (i < cur_list.size() - 1) { outFile << get_operator(operator_array[i]); } 
         } 
         outFile << "\n"; 
        } 
       } 

      } while (nextoperator(cur_list.size(), operator_array)); 

      // Send to function to undergone a list of operator combinations 
     } while (next_permutation(list.begin() + 1, list.end())); 
    } 
} 

int main(void) { 
    SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS); 
    vector<int> int_list; 
    string line; 
    ifstream myfile("Problem.txt"); 
    if (myfile.is_open()) { 
     while (getline(myfile, line)) { 
      int num = stoi(line); 
      int_list.push_back(num); 
      cur_target = num; 
     } 
    } 
    else 
     cout << "Unable to open file." << endl; 

    myfile.close(); 
    int_list.pop_back(); 
    sort(int_list.begin(), int_list.end()); 
    outFile.open("answer.txt"); 
    vector_combination(int_list); 
    outFile.close(); 

    int answer_count = 0; 

    myfile.open("answer.txt"); 
    if (myfile.is_open()) { 
     while (getline(myfile, line)) { 
      ++answer_count; 
      if (answer_count > 1) 
       break; 
     } 
    } 
    myfile.close(); 

    if (answer_count == 0) { 
     outFile.open("answer.txt"); 
     outFile << "-1" << endl; 
    } 
    outFile.close(); 

    return 0; 
}

至於樣品輸入，創建名爲「Problem.txt」利用隨機數等.txt文件所以（最後一個數字是目標的結果）（與用於基準當前樣本輸入更新）：

該程序上運行的硬件/軟件規範： 處理器：酷睿i5的Sandy Bridge 2500K，拉姆：8GB， OS：視窗10名專業人員， IDE：的Visual Studio 2015企業版，

來源

2016-07-23 John Wan

嘗試衡量程序完成了多少工作，以及它自己的運行時間是多少。在linux中有'/ usr/bin/time。/ your_program'和'perf stat。/ your_program'。檢查是你的循環佔用大部分時間，或者由內核或硬盤驅動器進行長時間的I/O工作。還要檢查你的CPU核心是否空閒，CPU頻率是否好（不要在電池供電的筆記本電腦上進行基準測試）。如果不知道數據的大小（以及每個循環數），完整的代碼+數據示例/分析數據，操作系統和硬件信息，則很難回答。 – osgx

@osgx我通過使用探查器進行了檢查。到目前爲止，消耗大部分資源的唯一部分是向量複製，循環花費不大。無論如何，我會盡我所能以您提到的要求更新問題。 –

是在openmp循環內複製的向量嗎？該問題中包含片段中的矢量複製代碼？什麼時間openmp循環（<5％，20％50％或更多）？你能發佈性能分析結果嗎？矢量的大小，循環次數，內存消耗是多少？ – osgx

運行時差異顯然是由向量造成的。我使用性能分析器對其進行了檢查，並注意到向量之間複製值的時間不一致。我已經將它修改爲指針數組，現在運行時已經得到了巨大改進並且一致。

來源

2016-08-23 06:31:28

移動#pragma omp critical如果條件內。由於cur_temp是線程專用的，並且cur_target是全局只讀的，所以沒有必要使用關鍵部分來保護條件。這種改變極大地減少了線程之間的直接交互，並且在我的系統上一致地加快了並行版本的速度。

我會弱猜測性能變化受不同線程上運行的循環之間的（看似隨機的）相移的影響。

如果性能差異依然存在，請嘗試啓用線程綁定。檢查OpenMP實現的文檔，查找OMP_PROC_BIND，「線程固定」，「綁定」或「關聯」。

來源

2016-07-25 12:44:15 Zulan

您好，我已經嘗試了您的建議方法，但變體仍然存在。我會看看你提到的關鍵字。謝謝 –

OpenMP，並行for循環，處理時間差別很大

回答

相關問題