C＃TPL比C++ PPL更快嗎？

我編寫了一個非常簡單的應用程序所使用的斐波那契fonction比較TPL的Parallel.ForEach VS PPL的parallel_for_each，結果真的很奇怪，在PC上有8個內核，C＃的是11秒更快那麼C++。C＃TPL比C++ PPL更快嗎？

兩個VS2010和VS 2011的預覽相同的結果。

C＃代碼：

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using System.Collections.Concurrent; 
using System.Threading.Tasks; 
using System.Diagnostics; 

namespace ConsoleApplication1 
{ 
    class Program 
    { 

      static void Main(string[] args) 
      { 
       var ll = new ConcurrentQueue<Tuple<int, int>>(); 
       var a = new int[12] { 40, 41, 42, 43, 44, 45, 46, 47, 35, 25, 36, 37 }; 

       long elapsed = time_call(() => 
       { 
        Parallel.ForEach(a, (n) => { ll.Enqueue(new Tuple<int, int>(n, fibonacci(n))); }); 
       }); 

       Console.WriteLine("TPL C# elapsed time: " + elapsed + "\n\r"); 
       foreach (var ss in ll) 
       { 
        Console.WriteLine(String.Format("fib<{0}>: {1}", ss.Item1, +ss.Item2)); 
       } 

       Console.ReadLine(); 
      } 

      static long time_call(Action f) 
      { 
       var p = Stopwatch.StartNew(); 
       p.Start(); 
       f(); 
       p.Stop(); 
       return p.ElapsedMilliseconds; 
      } 

      Computes the nth Fibonacci number. 
      static int fibonacci(int n) 
      { 
       if (n < 2) return n; 
       return fibonacci(n - 1) + fibonacci(n - 2); 
      } 
     } 
    }

C++代碼：

#include <windows.h> 
#include <ppl.h> 
#include <concurrent_vector.h> 
#include <array> 
#include <tuple> 
#include <algorithm> 
#include <iostream> 

using namespace Concurrency; 
using namespace std; 

template <class Function> 
__int64 time_call(Function&& f) { 
    __int64 begin = GetTickCount(); 
    f(); 
    return GetTickCount() - begin; 
} 

// Computes the nth Fibonacci number. 
int fibonacci(int n) { 
    if (n < 2) return n; 
    return fibonacci(n-1) + fibonacci(n-2); 
} 

int wmain() { 
    __int64 elapsed; 
    array<int, 12> a ={ 40, 41, 42, 43, 44, 45, 46, 47, 35, 25, 36, 37 }; 
    concurrent_vector<tuple<int,int>> results2; 

    elapsed = time_call([&]{ 
     parallel_for_each(a.begin(), a.end(), [&](int n) { 
      results2.push_back(make_tuple(n, fibonacci(n))); 
     }); 
    }); 

    wcout << L"PPL time: " << elapsed << L" ms" << endl << endl; 
    for_each (results2.begin(), results2.end(), [](tuple<int,int>& pair) { 
     wcout << L"fib(" << get<0>(pair) << L"): " << get<1>(pair) << endl; 
    }); 

    cin.ignore(); 
}

能否請你點我，在這裏我的C++的一部分代碼，我錯了？

寬度group_task我具有相同的時間如C＃代碼：

task_group tasks; 
    elapsed = time_call([&] 
    { 
     for_each(begin(a), end(a), [&](int n) 
     { 
      tasks.run([&,n]{results2.push_back(make_tuple(n, fibonacci(n)));}); 
     }); 
     tasks.wait();

來源

2012-01-03 user1127781

您能否發佈格式正確的源代碼？這些陳述之間的空白聯繫尤其令人惱火，因爲它們迫使我滾動很多。 – 2012-01-03 12:28:10

對我而言，跳出的是您在示例中使用的不同集合。 C＃版本使用隊列，而C++版本使用向量。 – 2012-01-03 13:00:35

另外，你是否只在每個例子中定時調用'fibonacci'函數？ – 2012-01-03 13:01:43

的GetTickCount（http://msdn.microsoft.com/en-us/library/windows/desktop/ms724408%28v=vs。 85％29.aspx）用於測量在本地傳遞的時間的函數並不準確。該描述是這樣說的：

「GetTickCount函數的分辨率被限制爲系統定時器，這是通常在10毫秒到16毫秒的範圍內的分辨率。」

從我的經驗API使用GetSystemTime產生在Windows Vista更好的效果和最多（在Win XP中有還挺相同的分辨率，如果剔計數我沒記錯的話）。或者更好的是，您可以使用細粒度測量，其他API提供小於mils的分辨率。可能在您的情況下，構建大型數據集會更有幫助，從而獲得一些有意義的數據。

來源

2012-01-03 14:14:23 Ghita

我認爲這個問題既不GetTickCount函數也不是併發容器，如果我將parallel_for_each C++代碼更改爲 task_group我有同樣的時間喜歡c＃代碼。 – user1127781 2012-01-03 16:42:52

根據OP，兩者之間有11秒的差異，所以它可能與滴答計數的準確性無關。 – 2013-06-05 21:42:54

這裏是由拉胡爾v帕蒂爾微軟團隊

你好的解釋，

感謝提出這個問題。事實上，您已經確定了與*的默認並行相關聯的開銷 - 尤其是當迭代次數很少且工作大小可變時。默認並行開始通過將工作分解爲8 塊（在8個內核上）。工作結束後，工作將動態地進行，負載平衡。默認工作在大多數情況下，大（大量迭代），並在每次迭代的基礎工作不順利瞭解（假設你叫成庫） - 但它不來在某些情況下無法接受的開銷。

該解決方案是完全在您的備用 implemtnation已經確定了什麼。爲此，我們將有一個平行的分區稱爲在Visual Studio的下一個版本的「簡單」，這將是類似於你描述的替代實現，將有更好的性能。

PS：對於每個實現的C＃和C++並行使用略有不同算法，他們如何去通過重複 - 因此你會看到根據工作量稍有不同的性能特徵。

問候

來源

2012-01-04 08:12:42 user1127781

有一些問題與您的代碼，讓我們解決這些問題一個接一個：

使用遞歸計算斐波納契使進程使用的內存過多的，因爲它是使用調用堆棧來計算結果。遞歸斐波那契意味着你沒有比較C＃/ C++並行框架，你正在比較調用棧機制。你可以更快地計算斐波那契：

int fibonacci(int n) 
{ 
    int curr = 1, prev = 0, total = 0; 
    for (int i = 0; i < n; i++) 
    { 
     int pc = curr; 
     curr += prev; 
     total += curr; 
     prev = pc; 
    } 
    return total; 
}

有了這個函數，我必須運行至少100萬次才能獲得可測量的值。

使用GetTickCount64代替的GetTickCount：

template <class Function> 
__int64 time_call(Function&& f) 
{ 
    __int64 begin = ::GetTickCount64(); 
    f(); 
    return GetTickCount64() - begin; 
}

運行parallel_for時有這樣的小身體可能實際上會降低性能。最好使用更細粒度的函子。

有了這些特質，這裏是C++代碼：

// ParallelFibo.cpp : Defines the entry point for the console application. 
// 

#include "stdafx.h" 
#include <windows.h> 
#include <ppl.h> 
#include <concurrent_vector.h> 
#include <array> 
#include <tuple> 
#include <algorithm> 
#include <iostream> 
#include <random> 

using namespace Concurrency; 
using namespace std; 

template <class Function> 
__int64 time_call(Function&& f) 
{ 
    __int64 begin = ::GetTickCount64(); 
    f(); 
    return GetTickCount64() - begin; 
} 

// Computes the nth Fibonacci number. 
inline int fibonacci(int n) 
{ 
    int curr = 1, prev = 0, total = 0; 
    for (int i = 0; i < n; i++) 
    { 
     int pc = curr; 
     curr += prev; 
     total += curr; 
     prev = pc; 
    } 
    return total; 
} 

#define NUMBER_OF_REPETITIONS 1000000 
#define MIN_FIBO    37 
#define MAX_FIBO    49 

int main() 
{ 
    __int64 elapsed; 
    vector<int> v; 
    for (int i = MIN_FIBO; i < MAX_FIBO; i++) 
    { 
     v.emplace_back(i); 
    } 

    concurrent_vector<tuple<int, int>> results; 
    elapsed = time_call([&] { 
     parallel_for(MIN_FIBO, MAX_FIBO, [&](int n) { 
      for (int i = 0; i < NUMBER_OF_REPETITIONS; i++) 
      { 
       results.push_back(make_tuple(n, fibonacci(n))); 
      } 
     }); 
    }); 
    wcout << L"PPL time: " << elapsed << L" ms" << endl << endl; 
    cin.ignore(); 
    return 0; 
}

這裏是在C＃代碼：

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using System.Threading.Tasks; 
using System.Collections.Concurrent; 
using System.Diagnostics; 

namespace ParallelFiboCS 
{ 
    class Program 
    { 
     const int NUMBER_OF_REPETITIONS = 1000000; 
     const int MIN_FIBO = 37; 
     const int MAX_FIBO = 49; 
     static void Main(string[] args) 
     { 
      var ll = new ConcurrentQueue<Tuple<int, int>>(); 

      var a = new int[MAX_FIBO - MIN_FIBO]; 
      for (int i = MIN_FIBO; i < MAX_FIBO; i++) 
      { 
       a[i - MIN_FIBO] = i; 
      } 

      long elapsed = time_call(() => 
      { 
       Parallel.ForEach(a, (n) => 
       { 
        for (int i = 0; i < NUMBER_OF_REPETITIONS; i++) 
        { 
         ll.Enqueue(new Tuple<int, int>(n, fibonacci(n))); 
        } 
       }); 
      }); 

      Console.WriteLine("TPL C# elapsed time: " + elapsed + "\n\r"); 

      Console.ReadLine(); 
     } 

     static long time_call(Action f) 
     { 
      var p = Stopwatch.StartNew(); 
      p.Start(); 
      f(); 
      p.Stop(); 
      return p.ElapsedMilliseconds; 
     } 

     static int fibonacci(int n) 
     { 
      int curr = 1, prev = 0, total = 0; 
      for (int i = 0; i < n; i++) 
      { 
       int pc = curr; 
       curr += prev; 
       total += curr; 
       prev = pc; 
      } 
      return total; 
     } 
    } 
}

平均時間運行1200萬個之間的數字斐波那契數計算37和49：

C++：513ms

C＃：2527ms

來源

2015-07-17 22:59:59

C＃TPL比C++ PPL更快嗎？

回答

相關問題