我想在C++中使用SIMD指令來比較uint32_t數組中的值,並將符合條件的元素位置存儲到一個新數組中。它大致上可以正常工作,但在SIMD比較指令之後,我仍然需要用4個if子句來逐一判斷每個元素的結果是否要寫回。(標題:在C++中使用SIMD指令來比較值並將它們存儲)
有沒有辦法用SIMD指令來完成這一步? 函數allocateAlignedBuffer的功能正如其名稱所示,並且運作正常。
/**
 * Collects the indices of all elements in `arr` whose value is >= 10.
 *
 * The input is scanned four 32-bit lanes at a time with SSE2. For every lane
 * that passes the comparison, the element's index is appended to the result
 * buffer, so the output is an ascending list of matching positions.
 *
 * @param arr  Input values; must point to at least `num` readable elements.
 *             No particular alignment is required (an unaligned load is used).
 * @param num  Number of elements in `arr`. Need not be a multiple of 4; the
 *             remainder is handled by a scalar loop.
 * @param cnt  Out-parameter; receives the number of indices written. Must not
 *             be null.
 * @return     Buffer obtained from allocateAlignedBuffer<uint32_t>(num, true)
 *             holding `*cnt` valid indices, or nullptr if that call returned
 *             nullptr. NOTE(review): ownership/release of the buffer is defined
 *             by allocateAlignedBuffer, which is not visible here -- confirm.
 */
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t* cnt) {
    constexpr uint32_t kLanes = 4;       // 32-bit lanes per 128-bit register
    constexpr uint32_t kThreshold = 10;  // elements >= this value are reported

    *cnt = 0;
    uint32_t* resArr = allocateAlignedBuffer<uint32_t>(num, true);
    if (resArr == nullptr) {
        return nullptr;
    }
    uint32_t* resPos = resArr;

    // SSE2 only offers a *signed* 32-bit greater-than. Flipping the sign bit
    // of both operands maps unsigned order onto signed order, so
    //   (x ^ 0x80000000) >signed ((kThreshold - 1) ^ 0x80000000)
    // is equivalent to the unsigned test x >= kThreshold.
    const __m128i signFlip = _mm_set1_epi32(INT32_MIN);
    const __m128i biasedLimit =
        _mm_set1_epi32(INT32_MIN + static_cast<int32_t>(kThreshold - 1));

    // Largest multiple of kLanes that fits; keeps the vector loop in bounds.
    const uint32_t vecEnd = num - (num % kLanes);

    for (uint32_t i = 0; i < vecEnd; i += kLanes) {
        const __m128i vec =
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i hits =
            _mm_cmpgt_epi32(_mm_xor_si128(vec, signFlip), biasedLimit);

        // One bit per 32-bit lane (bit k set <=> lane k matched).
        const int mask = _mm_movemask_ps(_mm_castsi128_ps(hits));
        if (mask == 0) {
            continue;  // common fast path: nothing in this group matched
        }
        for (uint32_t lane = 0; lane < kLanes; ++lane) {
            if (mask & (1 << lane)) {
                *resPos++ = i + lane;
            }
        }
    }

    // Scalar tail for the last (num % kLanes) elements.
    for (uint32_t i = vecEnd; i < num; ++i) {
        if (arr[i] >= kThreshold) {
            *resPos++ = i;
        }
    }

    // Derive the count from the write cursor; the earlier `*cnt++` advanced
    // the pointer itself rather than the value it points to.
    *cnt = static_cast<uint32_t>(resPos - resArr);
    return resArr;
}
此外,我相信這段代碼可能還有很多可以優化的地方。
這些打印輸出是必要的嗎?壓縮(把命中的元素緊湊地存回去)可以使用pshufb技巧完成,但如果您必須打印,那就無關緊要了。 – harold
@harold 不,那些輸出只是給我自己看的;評估速度時我會把它們註釋掉,只是複製代碼時忘了把它們刪掉。 – uv239