This code
#include <stdint.h>

// Number of elements in p; also the loop bound used by mul().
int const count = 1024;
// Global byte buffer that mul() rewrites in place.
uint8_t p[count];

// Scales every element of p in place by m.
//
// For each byte, the product p[i] * m is converted to uint16_t (keeping only
// its low 16 bits), shifted right by 8, and stored back into p[i].
//
// m: 16-bit multiplier applied to every element.
void mul(uint16_t m)
{
  // Non-standard vectorization hint asking for a vector length of 8
  // (presumably the Intel compiler's pragma -- confirm against the toolchain).
  #pragma simd vectorlength(8)
  for (int i = 0; i < count; ++i)
    // After the uint16_t conversion the value is at most 0xFFFF, so the
    // shift by 8 leaves a result in 0..255 that fits uint8_t exactly.
    p[i] = uint16_t(p[i] * m) >> 8;
}
compiled with -Os generates
mul(unsigned short):
        # Loop setup: rdx = byte offset into p, xmm2 = all zeros,
        # xmm1 = 0x00ff mask in every 16-bit word.
        xor       edx, edx                                      #9.2
        movzx     eax, di                                       #10.26
        pxor      xmm2, xmm2                                    #10.19
        movdqu    xmm1, XMMWORD PTR .L_2il0floatpacket.0[rip]   #10.32
        # Broadcast the 16-bit multiplier m into all eight words of xmm3.
        movd      xmm3, eax                                     #10.26
        punpcklwd xmm3, xmm3                                    #10.26
        punpckldq xmm3, xmm3                                    #10.26
        punpcklqdq xmm3, xmm3                                   #10.26
..B1.2:                         # Preds ..B1.2 ..B1.1
        # Load 8 bytes of p and widen them to 8 words by interleaving with zero.
        lea       rax, QWORD PTR [p+rdx]                        #10.19
        movq      xmm0, QWORD PTR [rax]                         #10.19
        punpcklbw xmm0, xmm2                                    #10.19
        # Low 16 bits of each product, then logical shift right by 8 per word.
        pmullw    xmm0, xmm3                                    #10.32
        psrlw     xmm0, 8                                       #10.32
        # Mask each word with 0x00ff; the high byte is already zero after psrlw.
        pand      xmm0, xmm1                                    #10.32
        # Narrow the 8 words back to 8 bytes and store them over the input.
        packuswb  xmm0, xmm2                                    #10.32
        movq      QWORD PTR [rax], xmm0                         #10.3
        # Advance by 8 elements; loop while the offset is below 1024.
        add       rdx, 8                                        #9.2
        cmp       rdx, 1024                                     #9.2
        jb        ..B1.2        # Prob 99%                      #9.2
        ret                                                     #11.1
p:
.L_2il0floatpacket.0:
        .long     0x00ff00ff,0x00ff00ff,0x00ff00ff,0x00ff00ff
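with a redundant pand instruction: the preceding `psrlw xmm0, 8` already leaves the upper byte of every 16-bit lane zero, so masking each lane with 0x00ff changes nothing before `packuswb`.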