I'm seeing a strange compiler crash when using GCC-style vector types. I'm using vector types for loads and stores because loading vectors with _mm_loadu_si128 seems to ignore the restrict qualifier, causing constant values to be reloaded from memory; I'll make a separate post about that. My function calculates the mean and standard deviation of an array. The clever part is reusing the loop body to process the remainder, which reduces the cache footprint. However, it seems to be this complex control flow that causes the crash: if I comment out the goto handleRemainder or comment out the innermost do-while loop, it compiles. And of course, using __m128i instead of vector types makes the crash go away. The crash happens in both ICC 14 and the latest ICC 17. I would appreciate a reasonable workaround, an explanation, or a patch.
#include <immintrin.h> #include <stdint.h> #include <unistd.h> #include <math.h> #include <algorithm> using namespace std; #define CAST_VSHORT(x) x #define ROUND_DOWN(a, b) (a & (~(b - 1))) #define MAX_INTENSITY 4096 #define FORCE_INLINE inline __attribute__ ((always_inline)) #if 1 // crashes typedef int32_t __attribute__((vector_size(16))) VINT; typedef int16_t __attribute__((vector_size(16))) VSHORT; typedef int16_t __attribute__((vector_size(16), aligned(1))) UNALIGNED_VSHORT; #else typedef __m128i VINT; typedef __m128i VSHORT; #endif FORCE_INLINE __m128i PartialVectorMask(ssize_t n) { return _mm_set1_epi16(0xffff); // incomplete for brevity } FORCE_INLINE int64_t VectorSum(VINT x) { __m128i lo = _mm_cvtepi32_epi64(x), hi = _mm_cvtepi32_epi64(_mm_srli_si128(x, 8)); __m128i sum = _mm_add_epi64(lo, hi); return _mm_extract_epi64(_mm_add_epi64(sum, _mm_srli_si128(sum, 8)), 0); } __m128i void CalculateMeanAndStdev(float &mean, float &stdev, int16_t *in, ssize_t size) { ssize_t i; double sum = 0, squareSum = 0; VINT zero = _mm_set1_epi32(0), vSquareSum = zero, vSum = zero; VSHORT data; ssize_t blockEnd; const ssize_t VECTOR_WIDTH = 8; // elements you can accumulate before square sum can overflow const ssize_t BLOCK_SIZE = ROUND_DOWN((UINT32_MAX / ((MAX_INTENSITY - 1) * (MAX_INTENSITY - 1))) * 4, VECTOR_WIDTH); ssize_t roundedSize = ROUND_DOWN(size, VECTOR_WIDTH); for (i = 0; i <= size - VECTOR_WIDTH; ) { blockEnd = min(i + BLOCK_SIZE, roundedSize); // process a block whos size is a multiple of 8, except when processing the SIMD remainder do { data = _mm_loadu_si128((__m128i *)&in[i]); //data = *(UNALIGNED_VSHORT *)&in[i]; handleRemainder: VINT unpacked0 = _mm_srai_epi32(_mm_unpacklo_epi16(data, data), 16), unpacked1 = _mm_srai_epi32(_mm_unpackhi_epi16(data, data), 16); vSquareSum = _mm_add_epi32(_mm_madd_epi16(data, data), vSquareSum); vSum = _mm_add_epi32(unpacked0, vSum); vSum = _mm_add_epi32(unpacked1, vSum); i += VECTOR_WIDTH; } while (i < blockEnd); squareSum += 
VectorSum(vSquareSum); sum += VectorSum(vSum); vSum = zero; vSquareSum = zero; } if (i < size) { // handle remainder by setting invalid elements to 0 data = _mm_and_si128(_mm_loadu_si128((__m128i *)&in[i]), PartialVectorMask((size % VECTOR_WIDTH) * sizeof(int16_t))); blockEnd = size; goto handleRemainder; // share code to reduce machine code size } mean = sum / size; stdev = sqrtf((squareSum - sum * sum / size) / (size - 1)); } int main() { const size_t N = 4096; int16_t __attribute__((aligned(16))) image[N]; float mean, stdev; for (int i = 0; i < 1000000; ++i) { CalculateMeanAndStdev(mean, stdev, image, N); } return mean; }