Hello,
I'm evaluating my CPU board's performance with a simple program that executes a fixed number of single-precision multiplications, with and without the AVX intrinsic functions.
But the results show that performance does not improve with AVX. Below is my code, compiled with the following command:
icpc -g -O2 -ipp=common -axCORE-AVX2 -xCORE-AVX2 -mtune=broadwell -MMD -MP -MF"mainParal_intel.d" -MT"mainParal_intel.d" -c -o "mainParal_intel.o" "../mainParal_intel.cpp"
/* Number of float elements in each benchmark buffer (a multiple of 8, so it
 * divides evenly into 8-float AVX vectors).  The expansion is fully
 * parenthesized so it stays a single operand in any expression, e.g.
 * `sizeof NUM_LOOP`. */
#define NUM_LOOP ((int)34608000)
int main(void) { struct timespec t1, t2; float timeApp; unsigned long int ii; __m256 vec1,vec2,vec3,vec4; pSrc = ippsMalloc_32f(NUM_LOOP); pMolt = ippsMalloc_32f(NUM_LOOP); pOut = ippsMalloc_32f(NUM_LOOP); for (ii=0;ii<NUM_LOOP;ii++) { pSrc[ii] = rand_float(); } for (ii=0;ii<NUM_LOOP;ii++) { pMolt[ii] = rand_float(); } clock_gettime(CLOCK_REALTIME, &t1); for (int j=0;j<NUM_LOOP;j+=8) { vec1=_mm256_load_ps(pSrc+j); vec2=_mm256_load_ps(pMolt+j); vec3=_mm256_mul_ps(vec1,vec2); _mm256_store_ps(pOut+j,vec3); } clock_gettime(CLOCK_REALTIME, &t2); timeApp = (float)(t2.tv_sec - t1.tv_sec)*1000.0f + (float)(t2.tv_nsec - t1.tv_nsec)/1000000.0f;//tempo in ms printf("Execution time with intrinsics : %f ms\n",timeApp); clock_gettime(CLOCK_REALTIME, &t1); for (int j=0;j<NUM_LOOP;j++) { pOut[j] = pSrc[j]*pMolt[j]; } clock_gettime(CLOCK_REALTIME, &t2); timeApp = (float)(t2.tv_sec - t1.tv_sec)*1000.0f + (float)(t2.tv_nsec - t1.tv_nsec)/1000000.0f;//tempo in ms printf("Execution time without intrinsics: %f ms\n", timeApp); return 0; }
The output is:
Execution time with intrinsics : 34.625843 ms
Execution time without intrinsics: 29.946192 ms
What is wrong with what I am doing?
Thanks
Valerio.