C - Why is _mm256_load_pd compiled to MOVUPD instead of MOVAPD?
Why does the following code result in unaligned AVX instructions (MOVUPD instead of MOVAPD)? It was compiled with Visual Studio 2015. How can I tell the compiler that the data is indeed aligned?
const size_t align_size = 64; const size_t array_size = 1024; double __declspec(align(align_size)) a[array_size]; double __declspec(align(align_size)) b[array_size]; //calculate dotproduct __m256d ymm0 = _mm256_set1_pd(0.0); (int = 0; < array_size; += 8) { __m256d ymm1 = _mm256_load_pd(a + i); __m256d ymm2 = _mm256_load_pd(b + i); __m256d ymm3 = _mm256_mul_pd(ymm1, ymm2); ymm0 = _mm256_add_pd(ymm3, ymm0); __m256d ymm4 = _mm256_load_pd(a + + 4); __m256d ymm5 = _mm256_load_pd(b + + 4); __m256d ymm6 = _mm256_mul_pd(ymm4, ymm5); ymm0 = _mm256_add_pd(ymm6, ymm0); } assembly of loop: 00007ff7ac7a1400 vmovupd ymm1,ymmword ptr [rbp+rax*8+2020h] 00007ff7ac7a1409 vmulpd ymm3,ymm1,ymmword ptr [rbp+rax*8+20h] 00007ff7ac7a140f vmovupd ymm2,ymmword ptr [rbp+rax*8] 00007ff7ac7a1415 vmulpd ymm0,ymm2,ymmword ptr b[rax*8] 00007ff7ac7a141e add r8d,8 00007ff7ac7a1422 movsxd rax,r8d 00007ff7ac7a1425 vaddpd ymm1,ymm0,ymm4 00007ff7ac7a1429 vaddpd ymm4,ymm1,ymm3 00007ff7ac7a142d cmp rax,400h 00007ff7ac7a1433 jb main+70h (07ff7ac7a1400h)
There is a way to solve this problem (it makes the compiler use the VMOVDQA instruction, an analogue of MOVAPD, instead of MOVUPD):
inline __m256d load(const double * p) { #ifdef _msc_ver return _mm256_castsi256_pd(_mm256_load_si256((__m256i*)p)); #else return _mm256_load_pd(p); #endif }
An analogous solution for the float type:
inline __m256 load(const float * p) { #ifdef _msc_ver return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p)); #else return _mm256_load_ps(p); #endif }
However, in order to trick the Visual Studio compiler you have to use dynamically allocated pointers; otherwise the compiler does not use the VMOVDQA instruction.
#include <immintrin.h> int main() { float * ps = (float*)_mm_malloc(40, 32); double * pd = (double*)_mm_malloc(40, 32); __m256 s = load(ps); //00007ff79ff81325 vmovdqa ymm1,ymmword ptr [rdi] __m256d d = load(pd); //00007ff79ff8132f vmovdqa ymm0,ymmword ptr [rax] _mm256_storeu_ps(ps, s); _mm256_storeu_pd(pd, d); _mm_free(ps); _mm_free(pd); }
Comments
Post a Comment