爲什麼我的信號插槽比QThreadPool + new + delete要慢？

我正在閱讀Qt的信號&插槽文檔[1]，並注意到它聲稱信號和插槽的開銷比任何new或delete操作都低得多。所以，我做了一個試驗：

#include <cmath> 

#include <QtCore/QAtomicInt> 
#include <QtCore/QCoreApplication> 
#include <QtCore/QElapsedTimer> 
#include <QtCore/QMetaObject> 
#include <QtCore/QMetaMethod> 
#include <QtCore/QObject> 
#include <QtCore/QRunnable> 
#include <QtCore/QTextStream> 
#include <QtCore/QThread> 
#include <QtCore/QThreadPool> 
#include <QtCore/QTimer> 
#include <QtCore/QVector> 

using std::pow; 

// Benchmark parameters shared by both the QThreadPool and the QThread variants.
constexpr int maxThreadCount = 16;    // pool size and number of dedicated QThreads
constexpr int maxIteration = 100000;  // upper bound checked by both job counters
constexpr int maxPiDigit = 1000;      // number of series terms summed per calcPi() call

void calcPi() 
{ 
    double sum(0); 
    for (int k(0); k < maxPiDigit; ++k) { 
     double a(4.0/(k * 8 + 1)); 
     double b(2.0/(k * 8 + 4)); 
     double c(1.0/(k * 8 + 5)); 
     double d(1.0/(k * 8 + 6)); 
     sum += pow(16, -k) * (a - b - c -d); 
    } 
    QTextStream out(stdout); 
    out << sum << endl; 
} 

/// QObject-based worker: each start() call runs one calcPi() and then
/// emits finished().
class CalcPiWithQObject : public QObject
{
    Q_OBJECT

public:
    CalcPiWithQObject(QObject *parent = nullptr);

public slots:
    /// Runs one pi calculation, then emits finished().
    void start();

signals:
    /// Emitted at the end of every start() call.
    void finished();
}; // CalcPiWithQObject

/// Forwards the optional parent to QObject; the worker has no state of its own.
CalcPiWithQObject::CalcPiWithQObject(QObject *parent)
    : QObject(parent)
{
}

void CalcPiWithQObject::start() 
{ 
    calcPi(); 
    finished(); 
} 

/// QRunnable-based worker: run() performs one calcPi() and may enqueue a
/// freshly allocated successor on the same pool.
class CalcPiWithQRunnable : public QRunnable
{
public:
    CalcPiWithQRunnable(QThreadPool *parent);

    void run() override;

private:
    static QAtomicInt count_;  // job counter shared by every instance

    QThreadPool *parent_;      // pool on which successors are started (not owned)
}; // CalcPiWithQRunnable

// Shared job counter; it starts at maxThreadCount rather than zero.
QAtomicInt CalcPiWithQRunnable::count_{maxThreadCount};

/// Remembers the pool used to schedule follow-up jobs.
CalcPiWithQRunnable::CalcPiWithQRunnable(QThreadPool *parent)
    : parent_(parent)
{
    // run() ends with `delete this`, so the pool must not delete the object too.
    setAutoDelete(false);
}

/// Runs one calcPi(), enqueues a successor while the shared counter is still
/// below maxIteration, then destroys this object.
void CalcPiWithQRunnable::run()
{
    calcPi();

    // fetchAndAddOrdered() hands back the value from before the increment.
    const int previousCount = count_.fetchAndAddOrdered(1);
    if (previousCount < maxIteration)
        parent_->start(new CalcPiWithQRunnable(parent_));

    // Auto-delete is disabled in the constructor; the job frees itself here.
    delete this;
}

/// Benchmark driver: times the QThreadPool variant first, then the
/// QThread + queued signal/slot variant.
class PiTest : public QObject
{
    Q_OBJECT

public:
    PiTest(QObject *parent = nullptr);

public slots:
    /// Runs the QThreadPool phase to completion, then starts the QThreads.
    void start();
    /// Handles a worker's finished() signal during the QThread phase.
    void nextQObjectCall();

private:
    QVector<QThread *> threads_;         // one dedicated thread per worker
    QVector<CalcPiWithQObject *> calc_;  // workers moved onto threads_
    QThreadPool *threadPool_;            // pool used by the QRunnable phase
    QElapsedTimer timer_;                // stopwatch reused for both phases
    int threadCount_;                    // workers that have not stopped yet
    int jobCount_;                       // jobs counted in the QThread phase
}; // PiTest

/// Creates the thread pool plus maxThreadCount thread/worker pairs and wires
/// their signals. Nothing is started here.
PiTest::PiTest(QObject *parent)
    : QObject(parent)
    , threads_(maxThreadCount)
    , calc_(maxThreadCount)
    , threadPool_(new QThreadPool(this))
    , threadCount_(maxThreadCount)
    , jobCount_(maxThreadCount)
{
    threadPool_->setMaxThreadCount(maxThreadCount);

    for (int slot = 0; slot < maxThreadCount; ++slot) {
        QThread *const thread = new QThread();
        CalcPiWithQObject *const worker = new CalcPiWithQObject();
        worker->moveToThread(thread);

        // Every completed job is reported back to this object via its event queue.
        QObject::connect(worker, &CalcPiWithQObject::finished,
                         this, &PiTest::nextQObjectCall,
                         Qt::QueuedConnection);
        // The first job of each worker is triggered by its thread starting up.
        QObject::connect(thread, &QThread::started,
                         worker, &CalcPiWithQObject::start,
                         Qt::QueuedConnection);

        threads_[slot] = thread;
        calc_[slot] = worker;
    }
}

/// Phase 1: seeds the pool with maxThreadCount runnables, blocks until the
/// pool is idle and prints the elapsed milliseconds.
/// Phase 2: restarts the stopwatch and starts every QThread; that phase is
/// finished off in nextQObjectCall().
void PiTest::start()
{
    timer_.start();
    for (int seeded = 0; seeded < maxThreadCount; ++seeded) {
        threadPool_->start(new CalcPiWithQRunnable(threadPool_));
    }
    threadPool_->waitForDone();

    const int timePassed = static_cast<int>(timer_.elapsed());
    QTextStream out(stdout);
    out << "QThreadPool: " << timePassed << endl;

    timer_.restart();
    for (int idx = 0; idx < threads_.size(); ++idx) {
        threads_.at(idx)->start();
    }
}

static QMetaMethod nextCall(PiTest::staticMetaObject.method(PiTest::staticMetaObject.indexOfMethod("start"))); 

void PiTest::nextQObjectCall() 
{ 
    jobCount_++; 
    if (jobCount_ < maxIteration) { 
     nextCall.invoke(sender(), Qt::QueuedConnection); 
     QMetaObject::invokeMethod(sender(), "start", 
            Qt::QueuedConnection); 
     return; 
    } 
    threadCount_--; 
    if (threadCount_ == 0) { 
     for (int i(0); i < maxThreadCount; ++i) { 
      threads_[i]->quit(); 
     } 
     int timePassed(timer_.elapsed()); 
     QTextStream out(stdout); 
     out << "QThread: " << timePassed << endl; 
     qApp->quit(); 
    } 
} 

int main(int argc, char *argv[]) 
{ 
    QCoreApplication app(argc, argv); 
    PiTest *bench(new PiTest(qApp)); 
    QTimer::singleShot(0, bench, SLOT(start())); 
    return qApp->exec(); 
} 

#include "main_moc.cpp"

我在一臺閒置的20核電腦上運行了這個測試：

/usr/lib64/qt5/bin/moc -o main_moc.cpp main.cpp 
clang++ -std=c++11 -fPIE -O2 -march=native -I/usr/include/qt5/ -L/usr/lib64/qt5 -lQt5Core -o bench main.cpp 
./bench > test.out 
grep QThread test.out

以下是結果：

QThreadPool: 4803 
QThread: 9285

我嘗試了不同的參數——較長的pi計算搭配較少的任務，或者反過來——但結果大致相同。QThread + 信號/插槽總是落後。任務數量更多時，QThreadPool + new/delete可以輕鬆地比QThread快上最多10倍。

我覺得我的基準測試代碼在某些方面寫得很彆扭。我在這裏誤解了什麼嗎？如果信號/插槽比new/delete快，那麼我的基準測試有什麼問題？

謝謝。

[1] http://doc.qt.io/qt-5/signalsandslots.html

來源

2015-03-30 nocte107

@JosephMalicke，我想我在這裏測試的就是信號/插槽。我可能在這裏犯了錯誤，這就是爲什麼我問這個問題。目的是在不同線程上比較信號/插槽與new/delete。 – nocte107 2015-03-30 22:35:01

我做了100000次迭代測試100000次迭代。這使得QThreadPool比QThread更糟糕。我不知道爲什麼。 – nocte107 2015-03-30 23:52:56

信號的性能因連接類型而異。在創建線程間連接時，連接是排隊的，並使用事件循環來調度自己；而Qt中的事件循環不僅非常緩慢，而且據我上次查看，它沒有提供任何方式來提高其更新頻率。

這使得跨線程的信號真的很慢。我曾遇到過細粒度併發的情況：改用多線程之後，性能不但沒有提升，反而下降了。

爲了讓你對直接連接與排隊連接之間的差異有個概念：

#define COUNT 5000
// Ping-pong helper: ping() re-emits pong() with an incremented counter until
// the counter reaches COUNT, then prints the elapsed nanoseconds.
class Ping : public QObject {
    Q_OBJECT
    Q_SIGNAL void pong(uint);
public slots:
    void ping(uint c)
    {
        if (c < COUNT) {
            emit pong(++c);
            return;
        }
        // `t` is declared in the elided part of the snippet - presumably a
        // QElapsedTimer started before the first ping().
        qDebug() << t.nsecsElapsed();
    }
};

//... 

// Cross-wire the two objects so that each pong() triggers the other's ping().
// p1 and p2 are declared in the elided code - presumably Ping instances.
// Qt::DirectConnection is the variant shown here; the results below also list
// queued runs.
QObject::connect(&p1, SIGNAL(pong(uint)), &p2, SLOT(ping(uint)), Qt::DirectConnection); 
QObject::connect(&p2, SIGNAL(pong(uint)), &p1, SLOT(ping(uint)), Qt::DirectConnection); 

//... 

// Start the exchange with a counter of 0.
p1.ping(0);

結果：

Direct connection (in same thread) - 570504 nsec 
Queued connection (in same thread) - 29670333 nsec 
Queued connection (different threads) - 53343054 nsec

你可以清楚地看到，線程間的連接比直接連接慢了將近100倍。我懷疑你鏈接到的文檔指的是直接連接。

總而言之，我會說你的測試是一團糟。你應該真正簡化它，簡單化並專注於你提出的問題。

最後，直接連接可能會比new/delete更快，但排隊連接絕對不是；它們要慢得多，而且肯定是造成性能差異的關鍵因素。你鏈接的文檔中提出的說法，與QThread + worker和QRunnable + 線程池兩者的性能對比完全無關。另外，在這兩種情況下，你都同時使用了動態內存分配/釋放和排隊連接。

來源

2015-03-31 02:48:18 dtech

我想我在這裏誤解了。感謝您的澄清。我無法構建一個基準測試來將信號/插槽部分與new/delete部分分離，因爲我認爲它們在這裏是「不同的方法」。 – nocte107 2015-03-31 03:38:15

爲什麼我的信號插槽比QThreadPool + new + delete要慢？

回答

相關問題