Hi
I use Intel compiler 15, and I have code that performs a huge number of maximum operations.
I wrote vectorized code for that max operation, and the performance of my application dropped dramatically.
My previous code is:
while (cycles < MAX_CYCLE)
{
for (int i = len - 1; i >= 0; i--)
{
illr = i*llr_height;
BG8 = inz[illr + 1];
BG17 = inx[illr];
BG18 = inx[illr + 1];
BG3 = inx[illr + 2];
BG17_8 = BG17 + BG8;
BG18_8 = BG18 + BG8;
BG3_8 = BG3 + BG8;
float max4 = 0;//gamma[0];
max4 = max_float(max4,BG17_8 + beta_7);
max4 = max_float(max4, BG18_8 + beta_4);
max4 = max_float(max4, BG3 + beta_3);
tempab[0] = max4;
max4 = beta_4;
max4 = max_float(max4, BG17_8 + beta_3);
max4 = max_float(max4, BG18_8);
max4 = max_float(max4, BG3 + beta_7);
tempab[1] = max4;
max4 = BG8 + beta_1;
max4 = max_float(max4, BG17 + beta_6);
max4 = max_float(max4, BG18 + beta_5);
max4 = max_float(max4, BG3_8 + beta_2);
tempab[2] = max4;
max4 = BG8 + beta_5;
max4 = max_float(max4, BG17 + beta_2);
max4 = max_float(max4, BG18 + beta_1);
max4 = max_float(max4, BG3_8 + beta_6);
tempab[3] = max4;
max4 = BG8 + beta_6;
max4 = max_float(max4, BG17 + beta_1);
max4 = max_float(max4, BG18 + beta_2);
max4 = max_float(max4, BG3_8 + beta_5);
tempab[4] = max4;
max4 = BG8 + beta_2;
max4 = max_float(max4, BG17 + beta_5);
max4 = max_float(max4, BG18 + beta_6);
max4 = max_float(max4, BG3_8 + beta_1);
tempab[5] = max4;
max4 = beta_7;
max4 = max_float(max4, BG17_8);
max4 = max_float(max4, BG18_8 + beta_3);
max4 = max_float(max4, BG3 + beta_4);
tempab[6] = max4;
max4 = beta_3;
max4 = max_float(max4, BG17_8 + beta_4);
max4 = max_float(max4, BG18_8 + beta_7);
max4 = max_float(max4, BG3);
tempab[7] = max4;
}
cycles++;
}
And the new vectorized code is:
/*
 * AVX version of the kernel.
 *
 * For every index i (walked from len-1 down to 0) the four candidate values
 * of each of the eight outputs are written into lane k of V1..V4, the four
 * arrays are loaded as 256-bit vectors, reduced with three _mm256_max_ps
 * operations, and the eight maxima are stored back into V1.
 *
 * V1..V4 are plain float arrays with no alignment specifier, so they are
 * loaded with _mm256_loadu_ps: the aligned _mm256_load_ps form requires a
 * 32-byte-aligned address, which these declarations do not guarantee.
 *
 * NOTE(review): every iteration performs 32 scalar stores into V1..V4 and
 * then immediately reloads the same memory as four wide vectors.  That
 * round trip through memory is a likely cause of the slowdown compared with
 * the scalar loop -- confirm with a profiler.  The beta_* terms are not
 * modified inside this loop, so vectors holding them could presumably be
 * built once before the loop and combined with broadcast inputs instead.
 */
__m256 Vec1,Vec2,Vec3,Vec4,Vec5,Vec6,Vec7,Vec8; /* Vec8 is not used in this loop */
float V1[8];float V2[8];float V3[8];float V4[8];
while (cycles < MAX_CYCLE)
{
    for (int i = len - 1; i >= 0; i--) // calculate beta[][i] based on beta[][i+1]
    {
        /* Base offset of entry i in inx / inz. */
        illr = i*llr_height;
        BG8 = inz[illr + 1];
        BG17 = inx[illr];
        BG18 = inx[illr + 1];
        BG3 = inx[illr + 2];

        /* Sums with BG8 that several lanes share. */
        BG17_8 = BG17 + BG8;
        BG18_8 = BG18 + BG8;
        BG3_8 = BG3 + BG8;

        /* Lane 0 */
        V1[0] = 0;
        V2[0] = BG17_8 + beta_7;
        V3[0] = BG18_8 + beta_4;
        V4[0] = BG3 + beta_3;

        /* Lane 1 */
        V1[1] = beta_4;
        V2[1] = BG17_8 + beta_3;
        V3[1] = BG18_8;
        V4[1] = BG3 + beta_7;

        /* Lane 2 */
        V1[2] = BG8 + beta_1;
        V2[2] = BG17 + beta_6;
        V3[2] = BG18 + beta_5;
        V4[2] = BG3_8 + beta_2;

        /* Lane 3 */
        V1[3] = BG8 + beta_5;
        V2[3] = BG17 + beta_2;
        V3[3] = BG18 + beta_1;
        V4[3] = BG3_8 + beta_6;

        /* Lane 4 */
        V1[4] = BG8 + beta_6;
        V2[4] = BG17 + beta_1;
        V3[4] = BG18 + beta_2;
        V4[4] = BG3_8 + beta_5;

        /* Lane 5 */
        V1[5] = BG8 + beta_2;
        V2[5] = BG17 + beta_5;
        V3[5] = BG18 + beta_6;
        V4[5] = BG3_8 + beta_1;

        /* Lane 6 */
        V1[6] = beta_7;
        V2[6] = BG17_8;
        V3[6] = BG18_8 + beta_3;
        V4[6] = BG3 + beta_4;

        /* Lane 7 */
        V1[7] = beta_3;
        V2[7] = BG17_8 + beta_4;
        V3[7] = BG18_8 + beta_7;
        V4[7] = BG3;

        /* Unaligned loads: V1..V4 carry no 32-byte alignment guarantee. */
        Vec1 = _mm256_loadu_ps(V1);
        Vec2 = _mm256_loadu_ps(V2);
        Vec3 = _mm256_loadu_ps(V3);
        Vec4 = _mm256_loadu_ps(V4);

        /* Lane-wise maximum of the four candidate vectors. */
        Vec5 = _mm256_max_ps(Vec1, Vec2);
        Vec6 = _mm256_max_ps(Vec5, Vec3);
        Vec7 = _mm256_max_ps(Vec6, Vec4);

        /* The eight maxima overwrite V1. */
        _mm256_storeu_ps(V1, Vec7);
    }//for (int i = len - 1; i >= 0; i--)
    cycles++;
}
I would appreciate it if someone could tell me why my performance dropped and what I can do to correct it.