我目前正在用C++編寫素數生成器。我先做了一個單線程版本,後來是一個多線程版本。C++線程應用程序比非線程運行速度慢
我發現如果我的程序生成的值小於100'000
,那麼單線程版本比多線程版本要快。顯然我做錯了什麼。
我下面的代碼:
#include <iostream>
#include <fstream>
#include <set>
#include <string>
#include <thread>
#include <mutex>
#include <shared_mutex>
using namespace std;
set<unsigned long long> primeContainer;
shared_mutex m;
void checkPrime(const unsigned long long p)
{
if (p % 3 == 0)
return;
bool isPrime = true;
for (set<unsigned long long>::const_iterator it = primeContainer.cbegin(); it != primeContainer.cend(); ++it)
{
if (p % *it == 0)
{
isPrime = false;
break;
}
if (*it * *it > p) // check only up to square root
break;
}
if (isPrime)
primeContainer.insert(p);
}
void checkPrimeLock(const unsigned long long p)
{
if (p % 3 == 0)
return;
bool isPrime = true;
try
{
shared_lock<shared_mutex> l(m);
for (set<unsigned long long>::const_iterator it = primeContainer.cbegin(); it != primeContainer.cend(); ++it)
{
if (p % *it == 0)
{
isPrime = false;
break;
}
if (*it * *it > p)
break;
}
}
catch (exception& e)
{
cout << e.what() << endl;
system("pause");
}
if (isPrime)
{
try
{
unique_lock<shared_mutex> l(m);
primeContainer.insert(p);
}
catch (exception& e)
{
cout << e.what() << endl;
system("pause");
}
}
}
void runLoopThread(const unsigned long long& l)
{
for (unsigned long long i = 10; i < l; i += 10)
{
thread t1(checkPrimeLock, i + 1);
thread t2(checkPrimeLock, i + 3);
thread t3(checkPrimeLock, i + 7);
thread t4(checkPrimeLock, i + 9);
t1.join();
t2.join();
t3.join();
t4.join();
}
}
void runLoop(const unsigned long long& l)
{
for (unsigned long long i = 10; i < l; i += 10)
{
checkPrime(i + 1);
checkPrime(i + 3);
checkPrime(i + 7);
checkPrime(i + 9);
}
}
void printPrimes(const unsigned long long& l)
{
if (1U <= l)
cout << "1 ";
if (2U <= l)
cout << "2 ";
if (3U <= l)
cout << "3 ";
if (5U <= l)
cout << "5 ";
for (auto it = primeContainer.cbegin(); it != primeContainer.cend(); ++it)
{
if (*it <= l)
cout << *it << " ";
}
cout << endl;
}
void writeToFile(const unsigned long long& l)
{
string name = "primes_" + to_string(l) + ".txt";
ofstream f(name);
if (f.is_open())
{
if (1U <= l)
f << "1 ";
if (2U <= l)
f << "2 ";
if (3U <= l)
f << "3 ";
if (5U <= l)
f << "5 ";
for (auto it = primeContainer.cbegin(); it != primeContainer.cend(); ++it)
{
if (*it <= l)
f << *it << " ";
}
}
else
{
cout << "Error opening file." << endl;
system("pause");
}
}
int main()
{
unsigned int n = thread::hardware_concurrency();
std::cout << n << " concurrent threads are supported." << endl;
unsigned long long limit;
cout << "Please enter the limit of prime generation: ";
cin >> limit;
primeContainer.insert(7);
if (10 < limit)
{
//runLoop(limit); //single-threaded
runLoopThread(limit); //multi-threaded
}
printPrimes(limit);
//writeToFile(limit);
system("pause");
return 0;
}
在main
功能,你會發現評論有關其功能單一和多線程。
這些之間的主要區別是使用鎖,共享容器迭代和獨特的插入。如果重要,我的CPU有4個內核。
爲什麼單線程版本更快?
多線程永遠不會保證速度更快,而且速度通常更慢,特別是在有鎖的情況下,如果您不小心,可以將代碼恢復爲同步行爲。 – johnbakers
我認爲這是由於創建線程的開銷。啓動每個線程並將其關閉的時間比通過並行計算節省的時間要大得多。與其創建一個線程來檢查單個數字,您可以使同一個線程檢查以1或3等結尾的每個數字。但是,在這種情況下,您必須調整素數測試以處理線程以不同的價格行事。 –