我想使用OpenMP來並行化一個函數,該函數在C++的main函數中被調用。問題概述:使用OpenMP後,帶有私有變量的函數反而變慢。
我的代碼運行速度比順序模式慢得多:不使用OpenMP時(只是註釋掉#pragma ...指令),for循環大約需要6.1s(掛鐘時間);使用OpenMP時則需要11.8s。
我的機器有8個CPU和8183 MB的物理內存,並配備了64位Windows 7操作系統。我使用Visual Studio編譯器,在調試(Debug)模式下編譯64位目標程序。
我讀到過,性能下降可能是由於某些變量本應聲明爲私有(private)卻沒有這樣聲明,但我不確定如何正確地做到這一點,也不確定哪些變量需要聲明爲私有。
這是相關的for循環:
/**
 * Traces a nRaysn x nRaysn grid of rays in parallel (OpenMP), one ray per
 * loop iteration.
 *
 * Grid point (i, j) is placed at ((10.0/nRaysn)*i - 5.0, (10.0/nRaysn)*j - 5.0).
 * Rays whose distance from the origin is at most PupilDian/2.0 are initialised
 * and passed through RayIntersect, Surface and Refract; all other grid points
 * have their entries set to NAN.
 *
 * Only the first of four simulated cases is shown; the original author elided
 * the other three (see the placeholder comments below).
 *
 * NOTE(review): every `mat` and `vec` parameter is taken by value, so the
 * writes below go to this function's own copies. The elided tail of the
 * function presumably consumes them -- confirm before changing the signature.
 *
 * NOTE(review): nAir, nCornea, ZernAnt, ZernRadAnt, P2DAnt, UAnt, VAnt,
 * xNullAnt, yNullAnt, NknotsUAnt, NknotsVAnt and KnotIntervallSizeAnt are not
 * declared in this block; they are assumed to be defined elsewhere and to be
 * safe to read concurrently -- confirm. The same applies to the helpers
 * RayIntersect, Surface, Refract and sign.
 */
vec DecenterOffsetParallel(const real_1d_array &x22, vec vDistance, double dOffsetXp, double dOffsetYp, double dOffsetXm, double dOffsetYm, double dOffsetXpY, double dOffsetYpX, double dOffsetXmY, double dOffsetYmX, double* dDeltaXp, double* dDeltaYp, double* dDeltaXm, double* dDeltaYm, double* dDeltaXpY, double* dDeltaYpX, double* dDeltaXmY, double* dDeltaYmX, double* delta0, /*local variables for the parallel code: */ const int nRaysn, double PupilDian, mat mRxNn, mat mRyNn, mat mRzNn, mat mRxN1n, mat mRyN1n, mat mRzN1n, mat mRxN2n, mat mRyN2n, mat mRzN2n, mat mRxN3n, mat mRyN3n, mat mRzN3n, mat mRcxNn, mat mRcyNn, mat mRczNn, mat mRcxN1n, mat mRcyN1n, mat mRczN1n, mat mRcxN2n, mat mRcyN2n, mat mRczN2n, mat mRcxN3n, mat mRcyN3n, mat mRczN3n, mat mPathNn, mat mPath1Nn, mat mPath00Nn, mat mPathN1n, mat mPath1N1n, mat mPath00N1n, mat mPathN2n, mat mPath1N2n, mat mPath00N2n, mat mPathN3n, mat mPath1N3n, mat mPath00N3n)
{
    // Loop-invariant values, computed once instead of once per ray.
    const int nTotalRays = nRaysn * nRaysn;
    const double dGridStep = 10.0 / nRaysn;
    const double dPupilRadius = PupilDian / 2.0;
    const double dDistance0 = vDistance(0);

    #pragma omp parallel for
    for (int xy = 0; xy < nTotalRays; xy++) {
        // Unflatten the linear index into grid coordinates.
        const int i = xy / nRaysn;
        const int j = xy % nRaysn;

        const double dRayX = dGridStep * i - 5.0;
        const double dRayY = dGridStep * j - 5.0;

        // Surface-normal components for this ray. These used to be full
        // nRaysn x nRaysn matrices constructed in every iteration although
        // only element (i, j) was ever touched; that allocated O(nRaysn^4)
        // elements overall and made all threads contend on the heap.
        // Variables declared inside the loop body are private to each thread.
        // If the elided "repeated 3 more times" code referenced mScxn/mScyn/
        // mSczn or temp, use these scalars (or further locals) instead.
        double dScx = 0.0;
        double dScy = 0.0;
        double dScz = 0.0;

        // only rays inside entrance pupil:
        if (sqrt(dRayX * dRayX + dRayY * dRayY) <= dPupilRadius) {
            // Initialize the matrices
            mRxNn(i, j) = dRayX;
            mRyNn(i, j) = dRayY;
            mRzNn(i, j) = 0.0;
            //... everything is repeated 3 more times to simulate all in all 4 cases...: mRxNn1(i,j) = (10.0/nRaysn)*i-5.0; and so on...

            // Direction cosines; the x and y components share one denominator.
            const double dDiffX = dRayX - dOffsetYmX;
            const double dDiffY = dRayY - dOffsetYm;
            const double dNorm = sqrt(dDistance0 * dDistance0 + dDiffX * dDiffX + dDiffY * dDiffY);
            mRcxNn(i, j) = sign(dDistance0) * dDiffX / dNorm;
            mRcyNn(i, j) = sign(dDistance0) * dDiffY / dNorm;
            mRczNn(i, j) = sqrt(1 - mRcxNn(i, j) * mRcxNn(i, j) - mRcyNn(i, j) * mRcyNn(i, j));

            mPathNn(i, j) = 0.0;
            mPath1Nn(i, j) = sign(dDistance0) * nAir * dDistance0 / mRczNn(i, j);
            mPath00Nn(i, j) = mPath1Nn(i, j);
            //... everything is repeated 3 more times to simulate 4 different cases...

            //trace rays through cornea
            // The return values of RayIntersect, Surface and Refract were only
            // ever stored in a throw-away matrix, so they are discarded here.
            RayIntersect(ZernAnt, ZernRadAnt, &mRxNn(i, j), &mRyNn(i, j), P2DAnt, UAnt, VAnt, &mRzNn(i, j), mRcxNn(i, j), mRcyNn(i, j), mRczNn(i, j), &mPathNn(i, j), xNullAnt, yNullAnt, NknotsUAnt, NknotsVAnt); // find the intersection (modifies mRz, mRy, mRx, mPath)
            mPathNn(i, j) = mPath1Nn(i, j) + nAir * mPathNn(i, j);
            Surface(P2DAnt, UAnt, VAnt, ZernAnt, ZernRadAnt, mRxNn(i, j), mRyNn(i, j), mRzNn(i, j), &dScx, &dScy, &dScz, KnotIntervallSizeAnt, xNullAnt, yNullAnt);
            // *Ant are identical for all four cases!
            Refract(nAir, nCornea, &mRcxNn(i, j), &mRcyNn(i, j), &mRczNn(i, j), dScx, dScy, dScz);
            //... everything is repeated 3 more times to simulate all in all 4 cases...
        }
        else {
            // Outside the pupil: mark this ray's entries as invalid.
            mRxNn(i, j) = mRyNn(i, j) = mRzNn(i, j) = mRcxNn(i, j) = mRcyNn(i, j) = mRczNn(i, j) = mPathNn(i, j) = mPath1Nn(i, j) = NAN;
            //... everything is repeated 3 more times to simulate all in all 4 cases...
        }
    }
    // some other stuff, that is not relevant to the questions...
}
誰能給我一個提示,是什麼可能導致了性能下降? 謝謝! PS:矩陣和向量使用的是Armadillo(犰狳)庫。
'public'或'private'不會改變這方面的事情。 –
歡迎來到SO。不幸的是,您沒有提供足夠的信息來正確回答您的問題。請包括[mcve],描述您的系統配置(CPU,內存,編譯器),以及如何測量以及您的具體測量結果。 – Zulan
我添加了系統信息,並用我的程序中的(稍微縮短的)原始代碼替換了簡化的示例。希望這可以幫助! – Sims