2017-03-22 64 views
2

我想在C++中使用SIMD指令來比較uint32_t數組中的值,並將符合條件的元素位置存儲到一個新數組中。它大致可以正常工作,但在SIMD指令之後,我仍然要用4個if子句來判斷每個通道的結果是否需要寫回。(標題:在C++中使用SIMD指令來比較值並將它們存儲)

有沒有辦法用SIMD指令來做到這一點? 函數allocateAlignedBuffer完成名稱所描述的工作並正常工作。

/// Collects the indices of all elements of `arr` that are >= 10.
///
/// @param arr  Input values; must hold at least `num` elements. No particular
///             alignment is required (unaligned loads are used).
/// @param num  Number of elements in `arr`; need not be a multiple of 4.
/// @param cnt  Out-parameter that receives the number of indices written.
/// @return     The buffer obtained from allocateAlignedBuffer<uint32_t>(num, true).
///             Its first `*cnt` entries are the matching indices in ascending
///             order; the remaining entries are unspecified. How the buffer
///             must be released depends on allocateAlignedBuffer (not shown
///             here).
///
/// Prints "cnt <n>" to std::cout before returning.
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t* cnt) {
    const uint32_t kLanes = 4;       // 32-bit lanes per 128-bit vector
    const uint32_t kThreshold = 10;  // an element matches when it is >= this

    uint32_t* resArr = allocateAlignedBuffer<uint32_t>(num, true);
    uint32_t* resPos = resArr;

    *cnt = 0;

    // SSE2 only offers a *signed* 32-bit "greater than". Flipping the sign bit
    // of both operands maps unsigned order onto signed order, and "x >= T" is
    // evaluated as "x > T - 1".
    const __m128i signFlip = _mm_set1_epi32(INT32_MIN);
    const __m128i limit =
        _mm_set1_epi32(INT32_MIN + static_cast<int32_t>(kThreshold - 1));
    const __m128i step = _mm_set1_epi32(static_cast<int>(kLanes));
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3);  // indices of the current lanes

    // Only whole vectors are processed here so that no load ever reads past
    // the end of `arr`; the leftover elements are handled by the scalar loop.
    const uint32_t vecEnd = num - (num % kLanes);
    uint32_t i = 0;
    for (; i < vecEnd; i += kLanes) {
        const __m128i vec =
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i hit =
            _mm_cmpgt_epi32(_mm_xor_si128(vec, signFlip), limit);

        // One bit per lane: bit k is set when arr[i + k] matched.
        const int lanes = _mm_movemask_ps(_mm_castsi128_ps(hit));

        if (lanes == 0xF) {
            // All four lanes matched: emit the four indices with one store.
            _mm_storeu_si128(reinterpret_cast<__m128i*>(resPos), positions);
            resPos += kLanes;
        } else if (lanes != 0) {
            for (uint32_t lane = 0; lane < kLanes; ++lane) {
                if (lanes & (1 << lane)) {
                    *resPos++ = i + lane;
                }
            }
        }
        positions = _mm_add_epi32(positions, step);
    }

    // Scalar tail for the last num % 4 elements.
    for (; i < num; ++i) {
        if (arr[i] >= kThreshold) {
            *resPos++ = i;
        }
    }

    // Derive the count from the write cursor instead of incrementing through
    // the pointer (the old `*cnt++` advanced the pointer, not the value).
    *cnt = static_cast<uint32_t>(resPos - resArr);

std::cout << "cnt "<<*cnt<<"\n";
return resArr;
}

此外,我相信還有很多可以優化的地方。

+0

打印是必要的嗎?壓縮可以用pshufb技巧完成,但如果您必須打印,那就無關緊要了。 – harold

+0

@harold 不,打印只是給我自己看的;評估速度時我會把它註釋掉,只是複製代碼時忘了把它刪掉。 – uv239

回答

1

另一個使用洗牌(shuffle)的變體:

// pshufb control vectors, indexed by the 4-bit lane mask produced by
// _mm_movemask_ps (bit k set = 32-bit lane k selected). Entry m moves the
// selected lanes to the front of the vector, keeping their order. The unused
// trailing lanes are filled with copies of byte 0 and carry no meaning.
__m128i g_shuffles[16] = 
{ 
    _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    _mm_setr_epi8(0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    _mm_setr_epi8(4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0), 
    _mm_setr_epi8(8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0), 
    _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0), 
    _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0), 
    _mm_setr_epi8(12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    _mm_setr_epi8(0, 1, 2, 3, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0), 
    _mm_setr_epi8(4, 5, 6, 7, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0), 
    _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 0, 0, 0), 
    _mm_setr_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0), 
    _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0), 
    _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0), 
    _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 
}; 
// Number of selected lanes (population count) for each 4-bit lane mask.
uint32_t g_steps[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; 

// _mm_shuffle_epi8 is an SSSE3 instruction. On GCC/Clang the attribute lets
// this one function use it without building the whole file with -mssse3; the
// function must still only be executed on CPUs that support SSSE3.
#if defined(__GNUC__) || defined(__clang__)
#define TESTFUNC2_SSSE3 __attribute__((target("ssse3")))
#else
#define TESTFUNC2_SSSE3
#endif

/// Collects the indices of all elements of `arr` that are >= 10, compacting
/// each group of four results with a single byte shuffle.
///
/// @param arr  Input values; must hold at least `num` elements.
/// @param num  Number of elements in `arr`; need not be a multiple of 4.
/// @param cnt  Out-parameter that receives the number of indices written.
/// @return     Buffer allocated with _mm_malloc (release it with _mm_free).
///             Its first `*cnt` entries are the matching indices in ascending
///             order; the remaining entries are unspecified. Returns nullptr
///             (with `*cnt == 0`) when the allocation fails.
///
/// Prints "cnt <n>" to std::cout before returning.
TESTFUNC2_SSSE3
uint32_t * testFunc2(uint32_t* arr, uint32_t num, uint32_t * cnt) 
{ 
    const uint32_t kLanes = 4;       // 32-bit lanes per 128-bit vector
    const uint32_t kThreshold = 10;  // an element matches when it is >= this

    *cnt = 0; 

    uint32_t * resArr = static_cast<uint32_t*>(_mm_malloc(num * sizeof(uint32_t), 16)); 
    if (resArr == nullptr && num != 0) 
        return nullptr; 
    uint32_t * resPos = resArr; 

    // _mm_cmpgt_epi32 is a signed compare. Flipping the sign bit of both
    // operands maps unsigned order onto signed order, so values >= 2^31 are
    // handled correctly; "x >= T" is evaluated as "x > T - 1".
    const __m128i signFlip = _mm_set1_epi32(INT32_MIN); 
    const __m128i threshold = 
        _mm_set1_epi32(INT32_MIN + static_cast<int32_t>(kThreshold - 1)); 
    const __m128i step = _mm_set1_epi32(static_cast<int>(kLanes)); 
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3); 

    // Only whole vectors are processed by the SIMD loop so that no load reads
    // past `arr` and no 16-byte store writes past `resArr`.
    const uint32_t vecEnd = num - (num % kLanes); 
    uint32_t i = 0; 
    for (; i < vecEnd; i += kLanes) 
    { 
        const __m128i values = 
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i)); 
        const __m128i comparemask = 
            _mm_cmpgt_epi32(_mm_xor_si128(values, signFlip), threshold); 

        // One bit per lane: bit k is set when arr[i + k] matched.
        const int index = _mm_movemask_ps(_mm_castsi128_ps(comparemask)); 

        // A full vector is always stored, but only the first g_steps[index]
        // lanes are kept; the rest is overwritten by later stores. The store
        // stays in bounds because resPos <= resArr + i and i + 4 <= num.
        const __m128i storePositions = _mm_shuffle_epi8(positions, g_shuffles[index]); 
        _mm_storeu_si128(reinterpret_cast<__m128i*>(resPos), storePositions); 
        resPos += g_steps[index]; 

        positions = _mm_add_epi32(positions, step); 
    } 

    // Scalar tail for the last num % 4 elements.
    for (; i < num; ++i) 
    { 
        if (arr[i] >= kThreshold) 
            *resPos++ = i; 
    } 

    *cnt = static_cast<uint32_t>(resPos - resArr); 

    std::cout << "cnt " << *cnt << "\n"; 
    return resArr; 
} 
+0

即使在一個新的項目中,_mm_set1_epi32()函數對我造成運行時內存錯誤,你是否知道原因可能是什麼? – uv239

+1

@ uv239我已經運行了代碼,並沒有錯誤。不過,您可以使用_mm_set_epi32()而不是_mm_set1_epi32()。 – ErmIg

+0

感謝幫助,我不知道爲什麼其他命令不工作,但現在它工作:) – uv239

0

我做了一些修改,應該能帶來性能提升:

#include <immintrin.h> 
#include <memory.h> 

/// Collects the indices of all elements of `arr` that are >= 10.
///
/// @param arr  Input values; must hold at least `num` elements.
/// @param num  Number of elements in `arr`; need not be a multiple of 4.
/// @param cnt  Out-parameter that receives the number of indices written.
/// @return     Buffer allocated with _mm_malloc (release it with _mm_free).
///             Its first `*cnt` entries are the matching indices in ascending
///             order; the remaining entries are unspecified. Returns nullptr
///             (with `*cnt == 0`) when the allocation fails.
///
/// Prints "cnt <n>" to std::cout before returning. Uses SSE2 only.
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
    const uint32_t kLanes = 4;       // 32-bit lanes per 128-bit vector
    const uint32_t kThreshold = 10;  // an element matches when it is >= this

    *cnt = 0;

    uint32_t * resArr = static_cast<uint32_t*>(_mm_malloc(num * sizeof(uint32_t), 16));
    if (resArr == nullptr && num != 0)
        return nullptr;
    uint32_t * resPos = resArr;

    // SSE2 only offers a *signed* 32-bit "greater than". Flipping the sign bit
    // of both operands maps unsigned order onto signed order, and "x >= T" is
    // evaluated as "x > T - 1" (the previous "T < x" test missed x == T).
    const __m128i signFlip = _mm_set1_epi32(INT32_MIN);
    const __m128i limit =
        _mm_set1_epi32(INT32_MIN + static_cast<int32_t>(kThreshold - 1));
    const __m128i step = _mm_set1_epi32(static_cast<int>(kLanes));
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3);  // indices of the current lanes

    // Only whole vectors are processed here so that no load reads past the
    // end of `arr`; the leftover elements are handled by the scalar loop.
    const uint32_t vecEnd = num - (num % kLanes);
    uint32_t i = 0;
    for (; i < vecEnd; i += kLanes)
    {
        const __m128i vec =
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i hit =
            _mm_cmpgt_epi32(_mm_xor_si128(vec, signFlip), limit);

        // One bit per lane: bit k is set when arr[i + k] matched.
        const int lanes = _mm_movemask_ps(_mm_castsi128_ps(hit));

        if (lanes == 0xF)
        {
            // All four lanes matched: emit the four indices with one store.
            _mm_storeu_si128(reinterpret_cast<__m128i*>(resPos), positions);
            resPos += kLanes;
        }
        else if (lanes != 0)
        {
            for (uint32_t lane = 0; lane < kLanes; ++lane)
            {
                if (lanes & (1 << lane))
                    *resPos++ = i + lane;
            }
        }
        positions = _mm_add_epi32(positions, step);
    }

    // Scalar tail for the last num % 4 elements.
    for (; i < num; ++i)
    {
        if (arr[i] >= kThreshold)
            *resPos++ = i;
    }

    *cnt = static_cast<uint32_t>(resPos - resArr);

    std::cout << "cnt " << *cnt << "\n";
    return resArr;
}

當然,如果能把循環中的所有標量指令都改爲向量指令,那就更好了。

0

下面是一個用pshufb技巧進行壓縮的版本,但它沒有經過測試,而且洗牌掩碼不應該是局部變量。

// pshufb control word meaning "write zero bytes to this output lane".
static constexpr uint32_t kNoLane = 0x80808080u;

// pshufb control vectors, indexed by the 4-bit lane mask produced by
// _mm_movemask_ps (bit k set = 32-bit lane k selected). Row m moves the
// selected lanes to the front of the vector, keeping their order, and zeroes
// the remaining lanes. Stored as constant data so it is not rebuilt per call.
alignas(16) static constexpr uint32_t kCompactionMasks[16][4] = {
    {kNoLane,     kNoLane,     kNoLane,     kNoLane},
    {0x03020100u, kNoLane,     kNoLane,     kNoLane},
    {0x07060504u, kNoLane,     kNoLane,     kNoLane},
    {0x03020100u, 0x07060504u, kNoLane,     kNoLane},
    {0x0B0A0908u, kNoLane,     kNoLane,     kNoLane},
    {0x03020100u, 0x0B0A0908u, kNoLane,     kNoLane},
    {0x07060504u, 0x0B0A0908u, kNoLane,     kNoLane},
    {0x03020100u, 0x07060504u, 0x0B0A0908u, kNoLane},
    {0x0F0E0D0Cu, kNoLane,     kNoLane,     kNoLane},
    {0x03020100u, 0x0F0E0D0Cu, kNoLane,     kNoLane},
    {0x07060504u, 0x0F0E0D0Cu, kNoLane,     kNoLane},
    {0x03020100u, 0x07060504u, 0x0F0E0D0Cu, kNoLane},
    {0x0B0A0908u, 0x0F0E0D0Cu, kNoLane,     kNoLane},
    {0x03020100u, 0x0B0A0908u, 0x0F0E0D0Cu, kNoLane},
    {0x07060504u, 0x0B0A0908u, 0x0F0E0D0Cu, kNoLane},
    {0x03020100u, 0x07060504u, 0x0B0A0908u, 0x0F0E0D0Cu},
};

/// Collects the indices of all elements of `arr` that are >= 10, compacting
/// each group of four results with a single byte shuffle (pshufb).
///
/// @param arr  Input values; must hold at least `num` elements.
/// @param num  Number of elements in `arr`; need not be a multiple of 4.
/// @param cnt  Out-parameter that receives the number of indices written.
/// @return     Buffer allocated with _mm_malloc (release it with _mm_free).
///             Its first `*cnt` entries are the matching indices in ascending
///             order; the remaining entries are unspecified. Returns nullptr
///             (with `*cnt == 0`) when the allocation fails.
///
/// The target attribute lets this one function use SSSE3 without building the
/// whole file with -mssse3; it must still only run on SSSE3-capable CPUs.
__attribute__((target("ssse3")))
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
    const uint32_t kLanes = 4;       // 32-bit lanes per 128-bit vector
    const uint32_t kThreshold = 10;  // an element matches when it is >= this

    *cnt = 0;

    uint32_t * resArr = static_cast<uint32_t*>(_mm_malloc(num * sizeof(uint32_t), 16));
    if (resArr == nullptr && num != 0)
        return nullptr;
    uint32_t * resPos = resArr;

    // _mm_cmpgt_epi32 is a signed compare. Flipping the sign bit of both
    // operands maps unsigned order onto signed order, so values >= 2^31 are
    // handled correctly; "x >= T" is evaluated as "x > T - 1".
    const __m128i signFlip = _mm_set1_epi32(INT32_MIN);
    const __m128i limit =
        _mm_set1_epi32(INT32_MIN + static_cast<int32_t>(kThreshold - 1));
    const __m128i step = _mm_set1_epi32(static_cast<int>(kLanes));
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3);

    // Only whole vectors are processed by the SIMD loop so that no load reads
    // past `arr` and no 16-byte store writes past `resArr`.
    const uint32_t vecEnd = num - (num % kLanes);
    uint32_t i = 0;
    for (; i < vecEnd; i += kLanes)
    {
        const __m128i vec =
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i hit =
            _mm_cmpgt_epi32(_mm_xor_si128(vec, signFlip), limit);

        // One bit per lane: bit k is set when arr[i + k] matched.
        const int comp = _mm_movemask_ps(_mm_castsi128_ps(hit));

        const __m128i shufmask =
            _mm_load_si128(reinterpret_cast<const __m128i*>(kCompactionMasks[comp]));

        // A full vector is always stored, but only the first popcount(comp)
        // lanes are kept; the rest is overwritten by later stores. The store
        // stays in bounds because resPos <= resArr + i and i + 4 <= num.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(resPos),
                         _mm_shuffle_epi8(positions, shufmask));
        resPos += __builtin_popcount(comp);

        positions = _mm_add_epi32(positions, step);
    }

    // Scalar tail for the last num % 4 elements.
    for (; i < num; ++i)
    {
        if (arr[i] >= kThreshold)
            *resPos++ = i;
    }

    *cnt = static_cast<uint32_t>(resPos - resArr);

    return resArr;
}

這裏的想法是:每一種情況都可以通過一次洗牌把結果排到位;16種情況由movmskps給出的索引來區分,並據此加載對應的洗牌掩碼。使用AVX2時,您可以用vpermd做類似的事情。

+0

不知爲何,只要我使用_mm_set1_epi32()函數,就會得到運行時內存錯誤,即使在一個新建的項目中也是如此。您知道可能是什麼原因嗎? – uv239

+0

@uv239 抱歉,我不知道,這確實很奇怪 – harold

相關問題