
SIMD with AltiVec: why is multiplying two vectors faster than adding two vectors?

I have been implementing basic math operations with AltiVec as a way of learning SIMD for an upcoming project. To see what kind of performance benefit it gives, I have also been tracking how long the operations take to execute, but I ran into something odd.

The first thing I did was add two vectors together and subtract two vectors. That works fine. The next thing I did was multiply two vectors together. However, the multiplication turned out to be faster than the addition, even though the datasheet for my particular CPU says the add instruction takes fewer clock cycles than the instructions used for the multiply.

I have two arrays, each 10 MB large, and I run them through these two routines:

void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size)
{
    int iterations = size / (sizeof(__vector int32_t) / sizeof(int32_t));

    __vector int32_t* tempA = (__vector int32_t *) intArrayA;
    __vector int32_t* tempB = (__vector int32_t *) intArrayB;
    __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;

    for(int i = 0; i < iterations; i++)
    {
        __vector int32_t sum = vec_add(*tempA, *tempB);
        vec_st(sum, 0, tempOut);

        tempA++;
        tempB++;
        tempOut++;
    }
}

void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size)
{
    int iterations = size / (sizeof(__vector int16_t) / sizeof(int16_t));

    __vector int16_t* tempA = (__vector int16_t *) intArrayA;
    __vector int16_t* tempB = (__vector int16_t *) intArrayB;
    __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;

    for(int i = 0; i < iterations; i++)
    {
        __vector int32_t productEven = vec_mule(*tempA, *tempB);
        __vector int32_t productOdd = vec_mulo(*tempA, *tempB);

        __vector int32_t mergedProductHigh = vec_mergeh(productEven, productOdd);
        __vector int32_t mergedProductLow = vec_mergel(productEven, productOdd);

        vec_st(mergedProductHigh, 0, tempOut);
        tempOut++;
        vec_st(mergedProductLow, 0, tempOut);

        tempA++;
        tempB++;
        tempOut++;
    }
}
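For reference, vec_mule and vec_mulo produce the widened products of the even-numbered and odd-numbered 16-bit lanes, and vec_mergeh/vec_mergel interleave those partial results back into source order, so each iteration is equivalent to the following scalar reference for one 8-element chunk (the helper name is purely illustrative):

#include <cstdint>

// Scalar equivalent of one iteration of the vector loop above (illustration only).
void MultiplyChunkScalar(const int16_t* a, const int16_t* b, int32_t* out)
{
    for(int lane = 0; lane < 8; lane++)
    {
        out[lane] = (int32_t) a[lane] * (int32_t) b[lane];   // widen to 32 bits, as vec_mule/vec_mulo do
    }
}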

On my particular platform, av_AddValues takes 81 ms to process and av_MultiplyValues takes 48 ms. (Times recorded with std::chrono::high_resolution_clock.)

Why does the multiplication take less time to process than the addition?

I wouldn't have thought that adding 32-bit values versus multiplying 16-bit values makes a difference, since a __vector always handles 16 bytes of data.

My first thought was that because adding numbers together is such a trivial task, the CPU finishes the operation faster than the data can be fetched from memory, whereas for the multiplication that fetch latency is hidden because the CPU is kept busy longer and doesn't have to wait as much.

Is that a correct assumption?
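One way to sanity-check that is to time a plain copy of the same amount of data as a rough bandwidth baseline; if the copy takes about as long as the vec_add loop, the loop is limited by memory traffic rather than by the arithmetic. A minimal sketch (TimeCopyMs is a hypothetical helper, not part of the program below):

#include <chrono>
#include <cstdint>
#include <cstring>

// Hypothetical helper: time a plain copy of 'size' int32_t elements, in milliseconds.
double TimeCopyMs(const int32_t* src, int32_t* dst, int size)
{
    auto start = std::chrono::high_resolution_clock::now();
    std::memcpy(dst, src, size * sizeof(int32_t));
    auto end = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double, std::milli>(end - start).count();
}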

Full code:

#include <chrono> 
#include <random> 
#include <limits> 

#include <iostream> 
#include <cassert> 
#include <cstring> 
#include <cstdint> 
#include <malloc.h> 

#include <altivec.h> 
#undef vector 

void GenerateRandom16bitValues(int16_t* inputABuffer, int16_t* inputBBuffer, int32_t* outputBuffer, int size); 
void GenerateRandom32bitValues(int32_t* inputABuffer, int32_t* inputBBuffer, int32_t* outputBuffer, int size); 
void TestAdd(); 
void TestMultiply(); 
void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size); 
void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size); 

int main() 
{ 
    TestAdd(); 
    TestMultiply(); 
} 

void GenerateRandom16bitValues(int16_t* inputABuffer, int16_t* inputBBuffer, int32_t* outputBuffer, int size)
{
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(std::numeric_limits<int16_t>::min(), std::numeric_limits<int16_t>::max());

    for(int i = 0; i < size; i++)
    {
        inputABuffer[i] = dis(gen);
        inputBBuffer[i] = dis(gen);
        outputBuffer[i] = 0;
    }
}

void GenerateRandom32bitValues(int32_t* inputABuffer, int32_t* inputBBuffer, int32_t* outputBuffer, int size)
{
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max());

    for(int i = 0; i < size; i++)
    {
        inputABuffer[i] = dis(gen);
        inputBBuffer[i] = dis(gen);
        outputBuffer[i] = 0;
    }
}

void TestAdd()
{
    int size = 10'485'760;
    int bytes = size * sizeof(int32_t);

    int32_t* inputABuffer = (int32_t*) memalign(64, bytes);
    int32_t* inputBBuffer = (int32_t*) memalign(64, bytes);
    int32_t* outputBuffer = (int32_t*) memalign(64, bytes);
    assert(inputABuffer != nullptr);
    assert(inputBBuffer != nullptr);
    assert(outputBuffer != nullptr);

    GenerateRandom32bitValues(inputABuffer, inputBBuffer, outputBuffer, size);

    for(int i = 0; i < 20; i++)
    {
        auto start = std::chrono::high_resolution_clock::now();
        av_AddValues(inputABuffer, inputBBuffer, outputBuffer, size);
        auto end = std::chrono::high_resolution_clock::now();
        auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

        for(int k = 0; k < size; k++)
        {
            assert(outputBuffer[k] == (inputABuffer[k] + inputBBuffer[k]));
        }

        std::cout << "Vector Sum - " << diff.count() << "ms\n";
        memset(outputBuffer, 0, bytes);   // clear the whole output buffer (size is an element count, not a byte count)
    }
}

void TestMultiply()
{
    int size = 10'485'760;
    int16_t* inputABuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
    int16_t* inputBBuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
    int32_t* outputBuffer = (int32_t*) memalign(64, size * sizeof(int32_t));
    assert(inputABuffer != nullptr);
    assert(inputBBuffer != nullptr);
    assert(outputBuffer != nullptr);

    GenerateRandom16bitValues(inputABuffer, inputBBuffer, outputBuffer, size);

    for(int i = 0; i < 20; i++)
    {
        auto start = std::chrono::high_resolution_clock::now();
        av_MultiplyValues(inputABuffer, inputBBuffer, outputBuffer, size);
        auto end = std::chrono::high_resolution_clock::now();
        auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

        for(int k = 0; k < size; k++)
        {
            assert(outputBuffer[k] == (inputABuffer[k] * inputBBuffer[k]));
        }

        std::cout << "Vector product - " << diff.count() << "ms\n";
        memset(outputBuffer, 0, size * sizeof(int32_t));   // clear the whole output buffer (size is an element count, not a byte count)
    }
}

void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size)
{
    // Each __vector int32_t holds 4 lanes, so one iteration processes 4 elements.
    int iterations = size / (sizeof(__vector int32_t) / sizeof(int32_t));

    __vector int32_t* tempA = (__vector int32_t *) intArrayA;
    __vector int32_t* tempB = (__vector int32_t *) intArrayB;
    __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;

    for(int i = 0; i < iterations; i++)
    {
        __vector int32_t sum = vec_add(*tempA, *tempB);   // 4 x int32 additions
        vec_st(sum, 0, tempOut);                          // store 16 bytes to the output

        tempA++;
        tempB++;
        tempOut++;
    }
}

void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size)
{
    // Each __vector int16_t holds 8 lanes, so one iteration processes 8 elements
    // and produces two vectors of 4 x int32 results.
    int iterations = size / (sizeof(__vector int16_t) / sizeof(int16_t));

    __vector int16_t* tempA = (__vector int16_t *) intArrayA;
    __vector int16_t* tempB = (__vector int16_t *) intArrayB;
    __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;

    for(int i = 0; i < iterations; i++)
    {
        __vector int32_t productEven = vec_mule(*tempA, *tempB);  // products of even lanes, widened to int32
        __vector int32_t productOdd = vec_mulo(*tempA, *tempB);   // products of odd lanes, widened to int32

        // Interleave the even/odd products back into source order.
        __vector int32_t mergedProductHigh = vec_mergeh(productEven, productOdd);
        __vector int32_t mergedProductLow = vec_mergel(productEven, productOdd);

        vec_st(mergedProductHigh, 0, tempOut);   // first 4 results
        tempOut++;
        vec_st(mergedProductLow, 0, tempOut);    // last 4 results

        tempA++;
        tempB++;
        tempOut++;
    }
}
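For completeness, a build command along these lines should work for this example (the file name and exact flags are a guess; -maltivec enables the AltiVec intrinsics on GCC for PowerPC, and -std=c++14 is needed for the digit separators in 10'485'760):

g++ -std=c++14 -O3 -maltivec alti.cpp -o alti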

Output of perf stat and perf record:

Adding 
    Performance counter stats for './alti': 

     2151.146080  task-clock (msec)   # 0.999 CPUs utilized   
        9  context-switches   # 0.004 K/sec     
        0  cpu-migrations   # 0.000 K/sec     
       30957  page-faults    # 0.014 M/sec     
      3871497132  cycles     # 1.800 GHz      
    <not supported>  stalled-cycles-frontend 
    <not supported>  stalled-cycles-backend 
      1504538891  instructions    # 0.39 insns per cycle   
      234038234  branches     # 108.797 M/sec     
       687912  branch-misses    # 0.29% of all branches   
      270305159  L1-dcache-loads   # 125.656 M/sec     
      79819113  L1-dcache-load-misses  # 29.53% of all L1-dcache hits 
    <not supported>  LLC-loads     
    <not supported>  LLC-load-misses   

     2.152697186 seconds time elapsed 


    CPU Utilization 
    76.04% alti  alti     [.] av_AddValues  

    Multiply 

    Performance counter stats for './alti': 

     1583.016640  task-clock (msec)   # 0.999 CPUs utilized   
        4  context-switches   # 0.003 K/sec     
        0  cpu-migrations   # 0.000 K/sec     
       20717  page-faults    # 0.013 M/sec     
      2849050875  cycles     # 1.800 GHz      
    <not supported>  stalled-cycles-frontend 
    <not supported>  stalled-cycles-backend 
      1520409634  instructions    # 0.53 insns per cycle   
      179185029  branches     # 113.192 M/sec     
       535437  branch-misses    # 0.30% of all branches   
      205341530  L1-dcache-loads   # 129.715 M/sec     
      27124936  L1-dcache-load-misses  # 13.21% of all L1-dcache hits 
    <not supported>  LLC-loads     
    <not supported>  LLC-load-misses   

     1.584145737 seconds time elapsed 


    CPU Utilization 
    60.35% alti  alti    [.] av_MultiplyValues  

How are you measuring? How many times do you measure? Are you running both tests? Please post a [MCVE] – EOF


Those times look extremely high - did you compile with optimizations enabled (e.g. '-O3')? Also, what CPU are you using, and what is its clock speed? –


@eof I edited my post to include a working example. At first I only ran it once, but I now loop over the two routines I'm measuring and the times are consistent. The addition takes 81ms and the multiplication takes 48ms. As stated in my post, I'm just using std::chrono::high_resolution_clock to measure the time. Is there a better option? – shaboinkin

Answer


It comes down to the size of your input buffers.

In one case (TestAdd):

int size = 10'485'760; 
int bytes = size * sizeof(int32_t); 

int32_t* inputABuffer = (int32_t*) memalign(64, bytes); 
int32_t* inputBBuffer = (int32_t*) memalign(64, bytes); 
int32_t* outputBuffer = (int32_t*) memalign(64, bytes); 

you allocate 3 * size * 4 bytes (sizeof(int32_t) = 4), while in the other (TestMultiply):

int size = 10'485'760; 
int16_t* inputABuffer = (int16_t*) memalign(64, size * sizeof(int16_t)); 
int16_t* inputBBuffer = (int16_t*) memalign(64, size * sizeof(int16_t)); 
int32_t* outputBuffer = (int32_t*) memalign(64, size * sizeof(int32_t)); 

you allocate size * 4 + 2 * size * 2 bytes (sizeof(int16_t) = 2).

Since this code is completely memory bound, you should expect the second one to be (3 * 4) / (4 + 2 * 2) = 1.5 times faster.

And indeed, 2.15 s / 1.5 = 1.43 s, which agrees well with your measured 1.58 s.
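For reference, a minimal sketch of the traffic arithmetic behind that ratio (variable names are purely illustrative):

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main()
{
    const std::size_t size = 10'485'760;    // elements per buffer, as in the question

    // TestAdd streams two int32_t inputs and one int32_t output through memory.
    std::size_t addBytes = 3 * size * sizeof(std::int32_t);     // 3 * 4 bytes per element

    // TestMultiply streams two int16_t inputs and one int32_t output.
    std::size_t mulBytes = 2 * size * sizeof(std::int16_t)
                         + size * sizeof(std::int32_t);          // 2 * 2 + 4 bytes per element

    // Prints a ratio of 1.50: the add test moves 1.5x as many bytes per element.
    std::printf("add: %zu bytes, mul: %zu bytes, ratio: %.2f\n",
                addBytes, mulBytes, (double) addBytes / (double) mulBytes);
    return 0;
}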