學習使用內在函數（intrinsics）- 使用 _mm256_sub_ps 時出現段錯誤

我正在嘗試學習如何使用內在函數（intrinsics）。這是我的 C 代碼：

/*
 * Vor - brute-force discrete Voronoi labelling.
 *
 * For every pixel (x, y) of a width x height grid, finds the site i whose
 * squared Euclidean distance (X[i] - x)^2 + (Y[i] - y)^2 is smallest and
 * stores that site's label V[i] in ouVor[x + y * width]. Ties go to the
 * lowest index i, because only a strictly smaller distance replaces the
 * current best.
 *
 * NbPoints - number of sites; X, Y and V must each hold NbPoints elements.
 * height   - number of pixel rows.
 * width    - number of pixel columns.
 * X, Y     - site coordinates (read only).
 * V        - label of each site (read only).
 * ouVor    - output, width * height ints in row-major order.
 *
 * If NbPoints <= 0 there is no label to assign and ouVor is left untouched.
 */
void Vor(
    const int NbPoints,
    const int height,
    const int width,
    float * X,
    float * Y,
    int * V,
    int * const ouVor)
{
    if (NbPoints <= 0)
    {
        return;
    }

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            /*
             * The running minimum must start over for every pixel. Keeping
             * it across pixels (as the earlier version did) makes every
             * pixel after the first compare against a stale minimum and
             * inherit the wrong label.
             */
            float bestDistance = FLT_MAX;

            /* Defined fallback in case no distance compares below FLT_MAX. */
            int Threshold = V[0];

            for (int i = 0; i < NbPoints; i++)
            {
                const float Xd = X[i] - x;
                const float Yd = Y[i] - y;
                const float Distance = Xd * Xd + Yd * Yd;

                /* If this point is closer, take its label. */
                if (Distance < bestDistance)
                {
                    bestDistance = Distance;
                    Threshold = V[i];
                }
            } /* i */

            /* One store per pixel, once the nearest site is known. */
            ouVor[x + y * width] = Threshold;
        } /* x */
    } /* y */
}

現在，我想使用 OpenMP 和內在函數，於是寫了：

/*
 * Vor (SSE + OpenMP attempt) - tries to vectorise the nearest-site search
 * of the scalar Vor: for each pixel (x, y), compute squared distances to
 * the sites in X/Y and write a label from V into ouVor.
 *
 * NOTE(review): this version does not compile as posted; the individual
 * problems are annotated inline. It also drops the scalar version's
 * "if (Distance < initDistance)" test, so the label no longer depends on
 * which site is closest (see the note in the inner loop).
 */
void Vor(

    const int NbOfPoints, 
    const int height, 
    const int width, 
    float * restrict X, 
    float * restrict Y, 
    int * restrict V, 
    int * restrict ouVor) 
{ 


    __m128 Xd , Yd; 

    __m128i Threshold; 
    int x , y; // pixel coordinates 
    float initDistance = FLT_MAX; 

    /* Scratch buffer of NbOfPoints floats, requested with 64-byte alignment. */
    float * TempDistance = (float*) _mm_malloc(NbOfPoints * sizeof(*TempDistance) ,64); 

    /*
     * Reinterpret the scalar arrays as arrays of 4-wide vectors.
     * NOTE(review): each vector element covers four scalars, yet the inner
     * loop below still runs i up to NbOfPoints - 1, so theX[i] / theY[i] /
     * theV[i] / SIMDTempDistance[i] appear to index past the end of the
     * underlying arrays - confirm the intended trip count (NbOfPoints / 4?).
     * theVor is declared but never used in this function.
     */
    __m128 * SIMDTempDistance = (__m128*) TempDistance; 
    __m128 * theX = (__m128*) X; 
    __m128 * theY = (__m128*) Y; 
    __m128i * theV = (__m128i*) V; 
    __m128i * theVor = (__m128i*) ouVor; 


    /*
     * NOTE(review): with default(none), initDistance, SIMDTempDistance,
     * theX, theY and theV are referenced in the region but not listed in
     * any clause. TempDistance is listed as private although the region
     * only accesses the buffer through SIMDTempDistance, which every thread
     * would share - verify the data-sharing intent.
     */
     #pragma omp parallel for default(none) shared(X , Y , V , ouVor ,height , width ,NbOfPoints) private (x,y,Xd,Yd,TempDistance ,Threshold) collapse(2) 
    for (y = 0; y < height; y++) 
    { 
     for (x = 0; x < width; x++) 
     { 

      /*
       * NOTE(review): a vector load from the address of the single float
       * initDistance; broadcasting the value (a set1-style intrinsic, as the
       * AVX revision of this function does) looks like the intent.
       */
      __m128 Distance = _mm_load_ps(&initDistance); 

      for (int i = 0; i < NbOfPoints; i++) 
      { 
       /*
        * The compiler reports _m128_sub_ps as implicitly declared, so it is
        * assumed to return int, which cannot be assigned to a __m128. The
        * SSE spelling is _mm_sub_ps. x and y are also plain ints here, not
        * vectors.
        */
       Xd = _m128_sub_ps(theX[ i ] , x); 
       Yd = _m128_sub_ps(theY[ i ] , y); 
       SIMDTempDistance[ i ] = _m128_add_ps(Xd * Xd , Yd * Yd); 

       /*
        * NOTE(review): _m128_gmin_ps is likewise undeclared (presumably
        * _mm_min_ps was meant), and both arguments here are pointers rather
        * than vector values.
        */
       __m128 theMin = _m128_gmin_ps(SIMDTempDistance , &Distance); 

       Distance = theMin; 
       /*
        * Assigned on every iteration regardless of the minimum, so after the
        * loop Threshold holds the last theV element visited, not the label
        * of the nearest site.
        */
       Threshold = theV[ i ]; 

       } /* i */ 

       // Write the result. The compiler rejects this store: Threshold is a
       // __m128i while *(ouVor + ...) is a single int.
       *(ouVor + x + y * width) = Threshold; 

      } /* x */ 

     } /* y */ 


    _mm_free(TempDistance); 

}

我收到了類似下面這樣的一些錯誤：

function "_m128_sub_ps" declared implicitly 
Xd = _m128_sub_ps(theX[ i ] , x); 

error: a value of type "int" cannot be assigned to an entity of type "__m128" 
Xd = _m128_sub_ps(theX[ i ] , x); 

a value of type "__m128i" cannot be assigned to an entity of type "int" 
*(ouVor + x + y * width) = Threshold

（Yd、theMin 和 SIMDTempDistance 也有同樣的錯誤）

我怎樣才能克服這些問題？

此外，我刪除了 if 語句，改用 _m128_gmin_ps 來查找最小值。我的實現是否正確？

-------------- UPDATE ---------------

在 Sourav Ghosh 評論之後，我查了一下頭文件。我到處都找不到 128 位的版本，所以我改用 #include <immintrin.h>

把其中幾行改成 256 位版本之後：

__m256 Distance = _mm256_load_ps(&intiDistance); 

__m256 theMin = _mm256_min_ps(SIMDTempDistance[ i ] , &Distance);

並把所有函數調用的前綴改成 _mm256（而不是 _m256）之後，只剩下這些錯誤：

error: argument of type "int" is incompatible with parameter of type "__m256" 
Xd = _mm256_sub_ps(theX[ i ] , x); 
Yd = _mm256_sub_ps(theY[ i ] , y);

x 和 y 是循環中使用的整數。我不知道該如何解決這個問題。

----- UPDATE ----------------------

我想我需要做類型轉換……於是我用了：

__m256i xxIdx = _mm256_set1_epi32(x); 
__m256 xIdx = _mm256_castsi256_ps(xxIdx);

現在，我的代碼是：

/*
 * Vor (AVX + OpenMP attempt) - 256-bit revision of the vectorised
 * nearest-site search: for each pixel (x, y), compute squared distances to
 * the sites in X/Y and write a label from V into ouVor.
 *
 * NOTE(review): the author reports that this builds with icc but crashes
 * with a segmentation fault at the two _m256_sub_ps lines. Likely
 * contributors are annotated inline; none of them has been confirmed by
 * running the code.
 */
void Vor(

     const int NbOfPoints, 
     const int height, 
     const int width, 
     float * restrict X, 
     float * restrict Y, 
     int * restrict V, 
     int * restrict ouVor) 
    { 



     __m256 Xd , Yd; 

     __m256i Threshold; 
     int x , y; // pixel coordinates 


     /* Scratch buffer of NbOfPoints floats, requested with 64-byte alignment. */
     float * TempDistance = (float*) _mm_malloc(NbOfPoints * sizeof(*TempDistance) ,64); 

     /*
      * Reinterpret the scalar arrays as arrays of 8-wide vectors.
      * NOTE(review): each vector element covers eight scalars, yet the inner
      * loop still runs i up to NbOfPoints - 1, so theX[i] / theY[i] /
      * theV[i] appear to index past the end of X / Y / V. Dereferencing a
      * __m256 pointer presumably also requires 32-byte alignment, which
      * nothing here establishes for the caller's arrays - confirm.
      * theVor is declared but never used in this function.
      */
     __m256 * SIMDTempDistance = (__m256*) TempDistance; 
     __m256 * theX = (__m256*) X; 
     __m256 * theY = (__m256*) Y; 
     __m256i * theV = (__m256i*) V; 
     __m256i * theVor = (__m256i*) ouVor; 


    /*
     * NOTE(review): SIMDTempDistance and TempDistance are listed as private,
     * so inside the region each thread's copy is presumably uninitialised
     * and no longer points at the _mm_malloc'd buffer; the store through
     * SIMDTempDistance[i] below would then write through an indeterminate
     * pointer. ouVor is listed twice in shared().
     */
    #pragma omp parallel for default(none) shared(X , Y , V , ouVor ,height , width ,NbOfPoints ,ouVor ,theX,theY,theV) private (x,y,Xd,Yd,TempDistance ,Threshold,SIMDTempDistance) collapse(2) 

    for (y = 0; y < height; y++) 
    { 
     for (x = 0; x < width; x++) 
     { 
       /* Running minimum restarts at FLT_MAX (broadcast to all lanes) for every pixel. */
       float initDistance = FLT_MAX; 
       __m256 Distance = _mm256_set1_ps(initDistance); 

       for (int i = 0; i < NbOfPoints; i++) 
       { 
        /*
         * NOTE(review): _mm256_castsi256_ps only reinterprets the integer
         * bit pattern as floats; it does not convert the pixel coordinate
         * to its floating-point value. _mm256_set1_ps((float) x) or
         * _mm256_cvtepi32_ps is probably what was intended. These four
         * values also do not depend on i and could be computed outside
         * this loop.
         */
        __m256i xxIdx = _mm256_set1_epi32(x); 
        __m256 xIdx = _mm256_castsi256_ps(xxIdx); 

        __m256i yyIdx = _mm256_set1_epi32(y); 
        __m256 yIdx = _mm256_castsi256_ps(yyIdx); 

        /*
         * These are the lines where the segmentation fault is reported.
         * NOTE(review): the calls are still spelled _m256_...; the AVX
         * names are _mm256_sub_ps / _mm256_add_ps, and _m256_gmin_ps is
         * presumably meant to be _mm256_min_ps.
         */
        Xd = _m256_sub_ps(theX[ i ] , xIdx); 
        Yd = _m256_sub_ps(theY[ i ] , yIdx); 
        SIMDTempDistance[ i ] = _m256_add_ps(Xd * Xd , Yd * Yd); 

        /* NOTE(review): the first argument is the pointer SIMDTempDistance, not the vector SIMDTempDistance[i]. */
        __m256 theMin = _m256_gmin_ps(SIMDTempDistance , Distance); 

        Distance = theMin; 
        /*
         * Assigned on every iteration regardless of the minimum, so after
         * the loop Threshold holds the last theV element visited, not the
         * label of the nearest site.
         */
        Threshold = theV[ i ]; 

        } /* i */ 

        // Write the result. NOTE(review): Threshold is a __m256i while the
        // destination is a single int; the 128-bit version of this line was
        // rejected by the compiler for the same mismatch.
        *(ouVor + x + y * width) = Threshold; 

       } /* x */ 

      } /* y */ 


     _mm_free(TempDistance); 

    }

我這樣編譯：

icc -std=c99 -g -openmp -qopt-report=2 -o mycode mycode.c

編譯可以通過。

但運行代碼時出現段錯誤……

出錯的行是：

Xd = _m256_sub_ps(theX[ i ] , xIdx); 
Yd = _m256_sub_ps(theY[ i ] , yIdx);

來源

2015-04-07 George

那麼，我用：

__m256 LX = _mm256_load_ps(&X[ i ]); 
__m256 LY = _mm256_load_ps(&Y[ i ]);

而不是：

Xd = _m256_sub_ps(theX[ i ] , xIdx); 
Yd = _m256_sub_ps(theY[ i ] , yIdx);

它可以！

來源

2015-04-08 14:50:30 George

我想，你缺少了包含 _m128_sub_ps() 函數前向聲明的頭文件。我們可以假設 _m128_sub_ps() 實際的返回類型是 __m128，但由於沒有正確的聲明，編譯器會假定 _m128_sub_ps() 的默認返回類型爲 int。這就是爲什麼編譯器會報出

function "_m128_sub_ps" declared implicitly

然後，這個 int 返回值被賦給 __m128 類型的變量，從而產生了問題。

編輯：

根據修改後的代碼，

int x , y; // pixel coordinates

應該是

__m256 x , y; // pixel coordinates

因爲 _mm256_sub_ps() 的函數簽名要求兩個參數都是 __m256 類型。

來源

2015-04-07 11:03:24

：我正在使用#include 。我不知道還有什麼我必須使用的.. – George

@George'_m128_sub_ps（）'的函數原型是什麼？它在中嗎？ –

：請參閱我的更新，謝謝！ – George

你把內在函數的名字搞混了。

對於 128 位的 SSE，前綴只是 _mm，例如：

_mm_sub_ps

不是：

_mm128_sub_ps

[混淆可能源於這樣一個事實：256 位 AVX 的對應函數名是 _mm256_sub_ps]

來源

2015-04-08 14:39:55

學習使用內在函數 - 使用_mm256_sub_ps的segm錯誤

回答

相關問題