2017-06-13 107 views
0

問題是這樣的:我該如何使用 SSE 將兩個 unsigned char 數組相加,並把結果存入一個 unsigned short 數組?希望有人能給我一些幫助或提示。以下是我目前寫的代碼,我不知道哪裏出了錯,需要一些關於 SSE 加法和類型轉換的幫助。

#include<iostream>
#include<intrin.h>
#include<windows.h>
#include<emmintrin.h>
#include<cstdlib>
#include<iterator>
#include<vector>

using namespace std; 

// Element-wise sum of two unsigned 8-bit arrays, widened to unsigned 16-bit.
//
// For every i in [0, N): output[i] = input1[i] + input2[i], computed without
// wrap-around (the largest possible sum, 255 + 255 = 510, fits in 16 bits).
//
// input1, input2 : source arrays with at least N readable elements each.
// output         : destination array with at least N writable elements.
// N              : number of elements; any value <= 0 makes this a no-op.
//
// No alignment is required: all vector accesses use the unaligned load/store
// intrinsics. Blocks of 16 elements are handled with SSE2; the remaining
// N % 16 elements are handled by a scalar loop, so nothing is ever read or
// written outside the first N elements of any array.
void sse_add(unsigned char * input1, unsigned char *input2, unsigned short *output, const int N)
{
    const __m128i zero = _mm_setzero_si128();

    int i = 0;
    // "N - i >= 16" (instead of "i + 16 <= N") cannot overflow for large N.
    for (; N - i >= 16; i += 16)
    {
        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input1 + i));
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input2 + i));

        // Interleaving with zero bytes zero-extends each 8-bit lane to 16 bits:
        // unpacklo yields elements 0..7, unpackhi yields elements 8..15.
        const __m128i sum_lo = _mm_add_epi16(_mm_unpacklo_epi8(a, zero),
                                             _mm_unpacklo_epi8(b, zero));
        const __m128i sum_hi = _mm_add_epi16(_mm_unpackhi_epi8(a, zero),
                                             _mm_unpackhi_epi8(b, zero));

        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), sum_lo);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i + 8), sum_hi);
    }

    // Scalar tail for the last N % 16 elements.
    for (; i < N; ++i)
    {
        output[i] = static_cast<unsigned short>(input1[i] + input2[i]);
    }
}

// Scalar reference implementation: writes input1[k] + input2[k] into output[k]
// for each of the first N elements. The sum is formed in a wider integer type
// and then narrowed to unsigned short on assignment. A non-positive N writes
// nothing.
void C_add(unsigned char * input1, unsigned char *input2, unsigned short *output, int N)
{
    const unsigned char *lhs = input1;
    const unsigned char *rhs = input2;
    unsigned short *dst = output;

    for (int remaining = N; remaining > 0; --remaining)
    {
        const unsigned short a = *lhs++;
        const unsigned short b = *rhs++;
        *dst++ = static_cast<unsigned short>(a + b);
    }
}



// Test driver: fills two input byte arrays with pseudo-random values, runs the
// scalar reference (C_add) and the SSE version (sse_add) on them, and prints a
// marker line for every position where the two results differ.
int main()
{
    // Deliberately not a multiple of 16, so the vectorised routine's handling
    // of a partial final block is exercised as well.
    const int n = 1023;

    std::vector<unsigned char> p0(n);
    std::vector<unsigned char> p1(n);
    std::vector<unsigned short> p21(n);   // result of sse_add
    std::vector<unsigned short> p22(n);   // result of C_add

    // Randomise the INPUTS. (Filling the output arrays instead would leave
    // both adders reading uninitialised memory.)
    for (int j = 0; j < n; j++)
    {
        p0[j] = static_cast<unsigned char>(rand() % 256);
        p1[j] = static_cast<unsigned char>(rand() % 256);
    }

    C_add(p0.data(), p1.data(), p22.data(), n);
    std::cout << "C_add finished!" << std::endl;
    sse_add(p0.data(), p1.data(), p21.data(), n);
    std::cout << "sse_add finished!" << std::endl;

    for (int j = 0; j < n; j++)
    {
        if (p21[j] != p22[j])
        {
            std::cout << "[email protected]@@@@@@" << std::endl;
        }
    }
    //system("pause");

    return 0;
}

回答

3

假設所有指針都按 _Alignof(__m128i) 對齊,並且數組的大小是 sizeof(__m128i) 的倍數,類似下面這樣的代碼應該可以工作:

/*
 * Widening add: for each element, res[k] = a[k] + b[k] with the 8-bit inputs
 * zero-extended to 16 bits before the addition.
 *
 * size : number of elements in a and b (and in res).
 * res  : destination array of 16-bit sums.
 * a, b : source arrays of 8-bit values.
 *
 * Notes:
 *  - The array parameters sized by an earlier parameter are C99 syntax; this
 *    signature is not valid C++.
 *  - The pointers are dereferenced directly as __m128i, i.e. with aligned
 *    vector accesses, so res, a and b are all expected to be suitably aligned
 *    for __m128i.
 *  - Only size / sizeof(__m128i) whole vectors are processed; when size is not
 *    a multiple of sizeof(__m128i) the trailing elements are left untouched.
 *  - _mm_cvtepu8_epi16 is an SSE4.1 intrinsic (declared in <smmintrin.h>).
 */
void addw(size_t size, uint16_t res[size], uint8_t a[size], uint8_t b[size]) { 
    __m128i* r = (__m128i*) res; 
    __m128i* ap = (__m128i*) a; 
    __m128i* bp = (__m128i*) b; 

    /* One iteration consumes one input vector from each of a and b and
       produces two output vectors (the results are twice as wide). */
    for (size_t i = 0 ; i < (size/sizeof(__m128i)) ; i++) { 
    /* Low 8 bytes of each input: zero-extend to 16 bits, then add. */
    r[(i * 2)]  = _mm_add_epi16(_mm_cvtepu8_epi16(ap[i]), _mm_cvtepu8_epi16(bp[i])); 
    /* High 8 bytes: shift them down into the low half first, then do the same. */
    r[(i * 2) + 1] = _mm_add_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(ap[i], 8)), _mm_cvtepu8_epi16(_mm_srli_si128(bp[i], 8))); 
    } 
} 

FWIW,NEON會更簡單一點(使用 vaddl_u8 和 vaddl_high_u8)。

如果你正在處理未對齊的數據,你可以使用_mm_loadu_si128/_mm_storeu_si128。如果大小不是16的倍數,那麼只需要在沒有SSE的情況下執行餘數。

請注意,這可能是您的編譯器可以自動執行的操作(我沒有檢查過)。你可能想嘗試這樣的事:

/* Plain scalar widening add over size elements of a and b into res; the
   OpenMP simd pragma asks the compiler to vectorise the loop. res, a, b and
   size are not declared in this fragment. */
#pragma omp simd 
for (size_t i = 0 ; i < size ; i++) { 
    res[i] = ((uint16_t) a[i]) + ((uint16_t) b[i]); 
} 

上面用的是 OpenMP 4,但也可以使用 Cilk++(#pragma simd)、clang(#pragma clang loop vectorize(enable))或 GCC(#pragma GCC ivdep);或者你也可以不加任何編譯提示,只寄望編譯器足夠聰明。

+0

非常感謝,我發佈了我的代碼...當我運行它時出現錯誤(斷點),真的很困惑.. –