定時測試：單線程VS回調TP VS未來TP

我有3個單元在https://github.com/spakai/threadpool_future 定時測試：單線程VS回調TP VS未來TP

class ThreadPoolTest : public Test { 
    public: 
     ThreadPool pool; 
     std::condition_variable wasExecuted; 
     std::mutex m; 
     std::vector<std::shared_ptr<std::thread>> threads; 

     unsigned int count{0}; 

     void incrementCountAndNotify() { 
      std::unique_lock<std::mutex> lock(m); 
      ++count; 
      std::cout << count << std::endl; 
      wasExecuted.notify_all(); 
     } 

     void waitForNotificationOrFailOnTimeout(unsigned expectedCount, int milliseconds=80000) { 
      std::unique_lock<std::mutex> lock(m); 
      ASSERT_THAT(wasExecuted.wait_for(lock, std::chrono::milliseconds(milliseconds), [&] { return count == expectedCount; }), Eq(true));  

     } 

     bool hasDuplicates(const std::vector<int> & birthdays) { 
      std::set<int> uniqueBirthdays(birthdays.begin(), birthdays.end()); 
      return (uniqueBirthdays.size() != birthdays.size()); 
     } 

     std::vector<int> generateNumbers(const int popSize) { 
      std::vector<int> list; 
      std::random_device rd; 
      std::default_random_engine dre(rd()); 
      std::uniform_int_distribution<int> di(0,365); 
      for(int i{0}; i < popSize ; i++) { 
       list.push_back(di(dre)); 
      } 
      return list; 
     } 

     void TearDown() override { 
      for (auto& t: threads) t->join(); 
     } 
}; 



TEST_F(ThreadPoolTest,TimingTestWithFuture) { 
    pool.start(4); 
    std::vector<std::future<unsigned long long>> results; 
    auto work = [](int n) { 
     unsigned long long factorial = 1; 
     for(int i = 1; i <=n; ++i) { 
     factorial *= i; 
     } 

     return factorial; 

    }; 


    TestTimer timer("4-sized-TP with Future",0); 
    for (int i = 5; i < 60 ; i++) { 
     results.push_back(pool.submit(work,i)); 
    } 


    for(unsigned int i = 0; i< results.size(); i++) { 
     results.at(i).get(); 
    } 
} 

TEST_F(ThreadPoolTest,TimingTestWithCallback) { 
    pool.start(4); 
    std::vector<unsigned long long> results; 
    TestTimer timer("4-sized-TP-Callback",0); 
    for (int n = 5; n < 60 ; n++) { 
     auto work = [&]() { 
      unsigned long long factorial = 1; 
      for(int i = 1; i <=n; ++i) { 
       factorial *= i; 
      } 
      { 
       std::lock_guard<std::mutex> guard(m); 
       results.push_back(factorial); 
      } 
      incrementCountAndNotify(); 
     }; 

     pool.add(work); 
    } 

    waitForNotificationOrFailOnTimeout(55); 
} 

TEST_F(ThreadPoolTest,TimingTestWithoutTP) { 

    std::vector<unsigned long long> results; 
    auto work = [](int n) { 
     unsigned long long factorial = 1; 
     for(int i = 1; i <=n; ++i) { 
     factorial *= i; 
     } 

     return factorial; 

    }; 


    TestTimer timer("In Sequence",0); 
    for (int i = 5; i < 60 ; i++) { 
     results.push_back(work(i)); 
    } 

    for(unsigned int i = 0; i< results.size(); i++) { 
     results.at(i); 
    } 

}

我一個4 CPU的計算機上運行的情況下測試該代碼線程池。我得到的計時結果顯示單線程速度最快，而未來速度最慢。

4尺寸-TP與未來時間採取= 2.364ms

4尺寸-TP-回叫時間採取= 1.103ms

在序列時間採取= 0.026ms

我期待時間安排將按相反的順序進行。我做測試的方式錯了嗎？或者它是我的代碼？

新的測試這將是CPU重

TEST_F(ThreadPoolTest,BirthdayParadoxInSequenceTimingTest) { 

    std::vector<int> results; 

    TestTimer timer("Birthday Paradox :: In Sequence",0); 

    std::vector<int> popList = {10,23,30,40,50,60,70,80,90,100,120,150}; 
    for(auto it=popList.begin(); it!=popList.end(); ++it) { 
     int id = *it; 
     int dup{0}; 
     for(int i{0}; i< 100000; i++) { 
      auto list = generateNumbers(id); 
      if(hasDuplicates(list)) ++dup; 
     } 

     results.push_back(dup); 
    } 

     for(unsigned int i = 0; i< results.size(); i++) { 
      results.at(i); 
     } 
} 

TEST_F(ThreadPoolTest,BirthdayParadoxTPWithFutureTimingTest) { 
    std::vector<int> popList = {10,23,30,40,50,60,70,80,90,100,120,150}; 

    pool.start(4); 
    std::vector<std::future<int>> results; 

    TestTimer timer("4-sized-TP with Future",0); 

    for(auto it=popList.begin(); it!=popList.end(); ++it) { 
     int id = *it; 
     auto work = [&](int pop) { 
      int dup{0}; 
      for(int i{0}; i < 100000 ; i++) { 
       auto list = generateNumbers(pop); 
       if(hasDuplicates(list)) ++dup; 
      } 

      return dup; 

     }; 

     results.push_back(pool.submit(work,id));   
    } 

    for(unsigned int i = 0; i< results.size(); i++) { 
     results.at(i).get(); 
    } 
} 



TEST_F(ThreadPoolTest,BirthdayParadoxTPWithCallBackTimingTest) { 
    std::vector<int> popList = {10,23,30,40,50,60,70,80,90,100,120,150}; 

    pool.start(4); 
    std::vector<int> results; 

    TestTimer timer("4-sized-TP with Callback",0); 

    for(auto it=popList.begin(); it!=popList.end(); ++it) { 
     int id = *it; 
     auto work = [&,id]() { 
      int dup{0}; 
      for(int i{0}; i < 100000 ; i++) { 
       auto list = generateNumbers(id); 
       if(hasDuplicates(list)) ++dup; 

       { 
        std::lock_guard<std::mutex> guard(m); 
        results.push_back(dup); 

       } 
      } 

      incrementCountAndNotify(); 
     }; 

     pool.add(work);  
    } 

    waitForNotificationOrFailOnTimeout(12); 
}

結果雖然還沒有我期待

在序列時間採取= 37555.7ms

4級-TP與未來所用時間= 62544.8ms

帶回調時間的4碼TP = 62563.6ms

完整的代碼和測試是在https://github.com/spakai/threadpool_future

來源

2017-05-09 spakai

計算階乘對於CPU來說不是一項具有挑戰性的任務。所有的開銷都發生在上下文切換和鎖定中。嘗試在它上面投擲稅收問題。 – paddy

好吧，我改變了一些CPU的重量。時間還沒有結束。當我使用top時，序列測試顯示CPU接近100％，未來和回調顯示接近400％。我已經更新了上面的新測試。 – spakai

我試圖複製你的行爲。在回購的文件中有一些錯誤。第一個TThreadPoolTest.cpp缺少一個'#include '，其次是cmakelists.txt中的'sources'變量需要包含TestTimer.cpp。可悲的消息是，我得到了類似的時間行爲。 – OutOfBound

生日悖論問題，你選擇不是一個具有挑戰性的CPU任務。但要了解您看到的問題，我們首先必須對代碼進行一些更改。

我們想測量算法完成所花費的時間。內存分配很昂貴，應該在經常重複的部分程序中進行分配。創建矢量或增加其大小將始終觸發內存分配。創建一個集合也是如此。要刪除momory分配我修改您的代碼如下所示：

#include "gmock/gmock.h" 
#include <chrono> 
#include <condition_variable> 
#include <mutex> 
#include <random> 
#include <set> 
#include <vector> 

#include "ThreadPool.h" 
#include "TestTimer.h" 


const unsigned int runs = 100000; 

using namespace testing; 

class ThreadPoolTest : public Test { 
    public: 
     ThreadPool pool; 
     std::condition_variable wasExecuted; 
     std::mutex m; 
     std::mutex n; 
     std::vector<std::shared_ptr<std::thread>> threads; 
     std::vector<int> popList = {10,11,12,23}; 

     unsigned int count{0}; 

     void incrementCountAndNotify() { 
      { 
       std::unique_lock<std::mutex> lock(m); 
       ++count; 
      } 
      wasExecuted.notify_all(); 
     } 

     void waitForNotificationOrFailOnTimeout(unsigned expectedCount, int milliseconds=80000) { 
      std::unique_lock<std::mutex> lock(m); 
      ASSERT_THAT(wasExecuted.wait_for(lock, std::chrono::milliseconds(milliseconds), [&] { return count == expectedCount; }), Eq(true)); 

     } 

     bool hasDuplicates(const std::vector<int> & birthdays) { 
      //This way to check for duplicates is very expensive, since it allocates new memory and copies all values around 
      //std::set<int> uniqueBirthdays(birthdays.begin(), birthdays.end()); 
      //return (uniqueBirthdays.size() != birthdays.size()); 
      for(unsigned int i = 0; i < birthdays.size(); i++) { 
       for(unsigned int j = i+1; j < birthdays.size(); j++) { 
        if(birthdays[i]==birthdays[j]) return true; 
       } 
      } 
      return false; 
     } 

     //I added the parameter list, to avoid the allocation of new memory 
     //The list will also have the needed size, so that we dont need to it here 
     std::vector<int> generateNumbers(std::vector<int>& list) { 
      //It is not exactly specified how the random_device works, it may read from /dev/random, which can not be done in parallel 
      //To make the measurements compareable over multiple machiens i removed this code 
      //std::random_device rd; 
      std::default_random_engine dre(0); 
      std::uniform_int_distribution<int> di(0,365); 
      int counter = 0; 
      for(int& i : list) { 
       i = di(dre); 
      } 
      return list; 
     } 

     void TearDown() override { 
      for (auto& t: threads) t->join(); 
     } 
}; 


TEST_F(ThreadPoolTest,BirthdayParadoxInSequenceTimingTest) { 

    std::vector<int> results; 

    TestTimer timer("Birthday Paradox :: In Sequence",0); 

    for(auto it=popList.begin(); it!=popList.end(); ++it) { 
     std::cout << "TID " << std::this_thread::get_id() << std::endl; 

     int id = *it; 
     int dup{0}; 
     std::vector<int> list(id); //Allocate memory in the right size only once for all 100000 runs 
     for(int i{0}; i < runs ; i++) { 
       generateNumbers(list); 
      if(hasDuplicates(list)) ++dup; 
     } 

     results.push_back(dup); //This push_back is ok, since it is only called 4 times in total 
    } 

     for(unsigned int i = 0; i< results.size(); i++) { 
      results.at(i); 
     } 
} 

TEST_F(ThreadPoolTest,BirthdayParadoxTPWithFutureTimingTest) { 
    pool.start(4); 
    std::vector<std::future<int>> results; 

    TestTimer timer("4-sized-TP with Future",0); 

    for(auto it=popList.begin(); it!=popList.end(); ++it) { 
     int id = *it; 
     auto work = [&](int pop) { 
      std::cout << "TID " << std::this_thread::get_id() << std::endl; 

      int dup{0}; 
      std::vector<int> list(pop); //Same as above 
      for(int i{0}; i < runs ; i++) { 
       generateNumbers(list); 
       if(hasDuplicates(list)) ++dup; 
      } 

      return dup; 

     }; 

     results.push_back(pool.submit(work,id)); 
    } 

    for(unsigned int i = 0; i< results.size(); i++) { 
     results.at(i).get(); 
    } 
} 


TEST_F(ThreadPoolTest,BirthdayParadoxTPWithCallBackTimingTest) { 
    pool.start(4); 
    std::vector<int> results; 

    TestTimer timer("4-sized-TP with Callback",0); 

    for(auto it=popList.begin(); it!=popList.end(); ++it) { 
     int id = *it; 
     auto work = [&,id]() { 
      std::cout << "TID " << std::this_thread::get_id() << std::endl; 

      int dup{0}; 
      std::vector<int> list(id); //Same here too 
      for(int i{0}; i < runs ; i++) { 
       generateNumbers(list); 
       if(hasDuplicates(list)) ++dup; 

         { 
         std::lock_guard<std::mutex> guard(n); 
         results.push_back(dup); 
        } 
      } 

      incrementCountAndNotify(); 
     }; 

     pool.add(work); 
    } 
    waitForNotificationOrFailOnTimeout(4); 
}

現在，我們得到了我們的內存管理權，我們就可以開始推理的運行時間。我使用2 Cores和超線程來運行代碼，所以如果我們使用多線程，我們預計加速2或更高。讓我們來看看結果：

Birthday Paradox :: In Sequence Time taken = 680.96ms 
4-sized-TP with Future Time taken = 1838.28ms 
4-sized-TP with Callback Time taken = 1861.07ms

如果我限制了線程池線程的數量爲一，那麼所有版本的運行時間幾乎相同。

我們看到這種不直觀的行爲的原因是，問題是內存限制。速度損失的原因在於檢查重複項。

for(unsigned int i = 0; i < birthdays.size(); i++) { 
    for(unsigned int j = i+1; j < birthdays.size(); j++) { 
     if(birthdays[i]==birthdays[j]) return true; 
    } 
}

生日的訪問在內存中很好地對齊。如果多個線程正在運行，算法不會獲得速度，因爲它們都只是在等待值。更糟糕的是，不同的線程正在從不同的位置讀取數據，因此它們可能會損壞緩存線，這可能會被其他線程使用。這就是原因，你會看到性能下降。

來源

2017-05-12 13:38:56 OutOfBound

感謝OutOfBound，我從未想過它會成爲記憶。將嘗試以上，並得到另一個真正的CPU重量。 – spakai

定時測試：單線程VS回調TP VS未來TP

回答

相關問題