我正在嘗試學習如何使用intrinsics。 所以,我的C代碼:學習使用內在函數 - 使用_mm256_sub_ps的segm錯誤
void Vor(
const int NbPoints,
const int height,
const int width,
float * X,
float * Y,
int * V,
int * const ouVor)
{
float Xd , Yd;
float Distance ,initDistance = FLT_MAX;
int Threshold;
int x , y; // pixel coordinates
int i;
for (y = 0; y < height; y++)
{
for (x = 0; x < width; x++)
{
for (i = 0; i < NbPoints; i++)
{
Xd = X[ i ] - x;
Yd = Y[ i ] - y;
Distance = Xd * Xd + Yd * Yd;
//if this Point is closer , assign proper threshold
if (Distance < initDistance)
{
initDistance = Distance;
Threshold = V[ i ];
}
*(ouVor + (x + y * width)) = Threshold;
} /* i */
} /* x */
} /* y */
}
現在,使用OpenMP和內部函數。我做:
void Vor(
const int NbOfPoints,
const int height,
const int width,
float * restrict X,
float * restrict Y,
int * restrict V,
int * restrict ouVor)
{
__m128 Xd , Yd;
__m128i Threshold;
int x , y; // pixel coordinates
float initDistance = FLT_MAX;
float * TempDistance = (float*) _mm_malloc(NbOfPoints * sizeof(*TempDistance) ,64);
__m128 * SIMDTempDistance = (__m128*) TempDistance;
__m128 * theX = (__m128*) X;
__m128 * theY = (__m128*) Y;
__m128i * theV = (__m128i*) V;
__m128i * theVor = (__m128i*) ouVor;
#pragma omp parallel for default(none) shared(X , Y , V , ouVor ,height , width ,NbOfPoints) private (x,y,Xd,Yd,TempDistance ,Threshold) collapse(2)
for (y = 0; y < height; y++)
{
for (x = 0; x < width; x++)
{
__m128 Distance = _mm_load_ps(&initDistance);
for (int i = 0; i < NbOfPoints; i++)
{
Xd = _m128_sub_ps(theX[ i ] , x);
Yd = _m128_sub_ps(theY[ i ] , y);
SIMDTempDistance[ i ] = _m128_add_ps(Xd * Xd , Yd * Yd);
__m128 theMin = _m128_gmin_ps(SIMDTempDistance , &Distance);
Distance = theMin;
Threshold = theV[ i ];
} /* i */
//write result
*(ouVor + x + y * width) = Threshold;
} /* x */
} /* y */
_mm_free(TempDistance);
}
我收到像一些錯誤:
function "_m128_sub_ps" declared implicitly
Xd = _m128_sub_ps(theX[ i ] , x);
error: a value of type "int" cannot be assigned to an entity of type "__m128"
Xd = _m128_sub_ps(theX[ i ] , x);
a value of type "__m128i" cannot be assigned to an entity of type "int"
*(ouVor + x + y * width) = Threshold
(和YD,theMin同樣的錯誤,SIMDTempDistance)
我怎樣才能克服這些問題?
此外,我刪除了if語句並使用_m128_gmin_ps查找最小值。是否我的實現正確?
-------------- UPDATE ---------------
Sourav戈什評論後,我搜索一下頭。 我找不到任何地方的128位,所以我用用#include <immintrin.h>
256比特校正幾行到後:
__m256 Distance = _mm256_load_ps(&intiDistance);
__m256 theMin = _mm256_min_ps(SIMDTempDistance[ i ] , &Distance);
和所有的函數調用_mm256而不是_m256,我得到只有這些錯誤:
error: argument of type "int" is incompatible with parameter of type "__m256"
Xd = _mm256_sub_ps(theX[ i ] , x);
Yd = _mm256_sub_ps(theY[ i ] , y);
x和y是整數並用於循環中。我不知道如何克服這一點。
----- UPDATE ----------------------
我想我鑄造.. 我用:
現在__m256i xxIdx = _mm256_set1_epi32(x);
__m256 xIdx = _mm256_castsi256_ps(xxIdx);
,我的代碼是:
void Vor(
const int NbOfPoints,
const int height,
const int width,
float * restrict X,
float * restrict Y,
int * restrict V,
int * restrict ouVor)
{
__m256 Xd , Yd;
__m256i Threshold;
int x , y; // pixel coordinates
float * TempDistance = (float*) _mm_malloc(NbOfPoints * sizeof(*TempDistance) ,64);
__m256 * SIMDTempDistance = (__m256*) TempDistance;
__m256 * theX = (__m256*) X;
__m256 * theY = (__m256*) Y;
__m256i * theV = (__m256i*) V;
__m256i * theVor = (__m256i*) ouVor;
#pragma omp parallel for default(none) shared(X , Y , V , ouVor ,height , width ,NbOfPoints ,ouVor ,theX,theY,theV) private (x,y,Xd,Yd,TempDistance ,Threshold,SIMDTempDistance) collapse(2)
for (y = 0; y < height; y++)
{
for (x = 0; x < width; x++)
{
float initDistance = FLT_MAX;
__m256 Distance = _mm256_set1_ps(initDistance);
for (int i = 0; i < NbOfPoints; i++)
{
__m256i xxIdx = _mm256_set1_epi32(x);
__m256 xIdx = _mm256_castsi256_ps(xxIdx);
__m256i yyIdx = _mm256_set1_epi32(y);
__m256 yIdx = _mm256_castsi256_ps(yyIdx);
Xd = _m256_sub_ps(theX[ i ] , xIdx);
Yd = _m256_sub_ps(theY[ i ] , yIdx);
SIMDTempDistance[ i ] = _m256_add_ps(Xd * Xd , Yd * Yd);
__m256 theMin = _m256_gmin_ps(SIMDTempDistance , Distance);
Distance = theMin;
Threshold = theV[ i ];
} /* i */
//write result
*(ouVor + x + y * width) = Threshold;
} /* x */
} /* y */
_mm_free(TempDistance);
}
我喜歡編譯:
icc -std=c99 -g -openmp -qopt-report=2 -o mycode mycode.c
就可以了。
但在運行代碼給出分段錯誤..
在行:
Xd = _m256_sub_ps(theX[ i ] , xIdx);
Yd = _m256_sub_ps(theY[ i ] , yIdx);
:我正在使用#include。我不知道還有什麼我必須使用的.. –
George
@George'_m128_sub_ps()'的函數原型是什麼?它在中嗎? –
:請參閱我的更新,謝謝! – George