1
我有一個很簡單的程序:它用8個線程把向量中的每個元素加1(我的電腦有8個核心)。但與單線程版本相比,它只快了約2.8倍。是我做錯了什麼,還是多線程本來就沒有那麼快?C++線程速度測試,我做對了嗎?
Cost is 599 milliseconds
800000000
Cost is 1697 milliseconds
800000000
#include <chrono>
#include <cstddef>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>
/// Benchmarks a multi-threaded "add one to every element" pass.
///
/// Allocates n * k zero-initialised ints, splits them into n contiguous
/// chunks of k elements and increments each chunk on its own std::thread.
/// The timed region covers thread creation, the work itself and the joins;
/// the allocation and the final checksum are outside it.
///
/// Prints "Cost is <ms> milliseconds" to std::cout and the sum of all
/// elements (n * k when every element was incremented once) to std::cerr.
///
/// @param n number of threads / chunks; expected to be non-negative.
/// @param k number of elements per chunk; expected to be non-negative.
void test(int n, int k)
{
    // Do all size arithmetic in std::size_t: with int operands, n * k and
    // i * k are signed overflow (undefined behaviour) past INT_MAX.
    const auto thread_count = static_cast<std::size_t>(n);
    const auto chunk_size = static_cast<std::size_t>(k);

    std::vector<int> data(thread_count * chunk_size, 0);

    // Reserve up front so no vector reallocation happens inside the timed region.
    std::vector<std::thread> threads;
    threads.reserve(thread_count);

    auto increment_range = [](int* first, int* last) {
        for (int* p = first; p != last; ++p) {
            ++*p;
        }
    };

    const auto begin = std::chrono::steady_clock::now();
    int* const base = data.data();
    for (std::size_t i = 0; i < thread_count; ++i) {
        int* const chunk_begin = base + i * chunk_size;
        // std::thread forwards its arguments to the callable; no std::bind needed.
        threads.emplace_back(increment_range, chunk_begin, chunk_begin + chunk_size);
    }
    for (std::thread& worker : threads) {
        worker.join();
    }
    const auto now = std::chrono::steady_clock::now();

    const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - begin);
    std::cout << "Cost is " << elapsed.count() << " milliseconds" << std::endl;

    // Checksum: keeps the writes observable and lets the caller verify that
    // every element was touched exactly once. long long avoids int overflow.
    long long sum = 0;
    for (const int value : data) {
        sum += value;
    }
    std::cerr << sum << std::endl;
}
/// Single-threaded baseline for the "add one to every element" benchmark.
///
/// Allocates n * k zero-initialised ints and increments them chunk by chunk
/// (n chunks of k elements) on the calling thread. Only the increment loop
/// is timed; the allocation and the final checksum are outside the timed
/// region.
///
/// Prints "Cost is <ms> milliseconds" to std::cout and the sum of all
/// elements (n * k when every element was incremented once) to std::cerr.
///
/// @param n number of chunks; expected to be non-negative.
/// @param k number of elements per chunk; expected to be non-negative.
void stupid_test(int n, int k)
{
    // Do all size arithmetic in std::size_t: with int operands, n * k and
    // i * k are signed overflow (undefined behaviour) past INT_MAX.
    const auto chunk_count = static_cast<std::size_t>(n);
    const auto chunk_size = static_cast<std::size_t>(k);

    std::vector<int> data(chunk_count * chunk_size, 0);

    auto increment_range = [](int* first, int* last) {
        for (int* p = first; p != last; ++p) {
            ++*p;
        }
    };

    const auto begin = std::chrono::steady_clock::now();
    int* const base = data.data();
    for (std::size_t i = 0; i < chunk_count; ++i) {
        int* const chunk_begin = base + i * chunk_size;
        increment_range(chunk_begin, chunk_begin + chunk_size);
    }
    const auto now = std::chrono::steady_clock::now();

    const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - begin);
    std::cout << "Cost is " << elapsed.count() << " milliseconds" << std::endl;

    // Checksum: keeps the writes observable and lets the caller verify that
    // every element was touched exactly once. long long avoids int overflow.
    long long sum = 0;
    for (const int value : data) {
        sum += value;
    }
    std::cerr << sum << std::endl;
}
/// Runs the threaded benchmark followed by the sequential baseline on the
/// same workload so their reported timings can be compared.
int main()
{
    // Workload: 8 chunks of 100,000,000 ints each (800,000,000 elements).
    constexpr int chunk_count = 8;
    constexpr int elements_per_chunk = 100000000;

    test(chunk_count, elements_per_chunk);         // one thread per chunk
    stupid_test(chunk_count, elements_per_chunk);  // all chunks on this thread
    return 0;
}
***但它只快了約2.8倍,是我做錯了什麼,還是多線程本來就沒有那麼快?*** 不要期望線性加速。創建線程和等待線程結束(join)都有開銷,而且所有核心共享同一內存帶寬。 – drescherjm
例行一問:你編譯時開啓了優化,對嗎? – GManNickG