c - Why _mm256_load_pd compiled to MOVUPD instead of MOVAPD? -


Why does the following code result in unaligned AVX instructions (MOVUPD instead of MOVAPD)? It was compiled with Visual Studio 2015. How can I tell the compiler that the data is indeed aligned?

    const size_t align_size = 64;
    const size_t array_size = 1024;

    double __declspec(align(align_size)) a[array_size];
    double __declspec(align(align_size)) b[array_size];

    //calculate dotproduct
    __m256d ymm0 = _mm256_set1_pd(0.0);
    for (int i = 0; i < array_size; i += 8)
    {
        __m256d ymm1 = _mm256_load_pd(a + i);
        __m256d ymm2 = _mm256_load_pd(b + i);
        __m256d ymm3 = _mm256_mul_pd(ymm1, ymm2);
        ymm0 = _mm256_add_pd(ymm3, ymm0);

        __m256d ymm4 = _mm256_load_pd(a + i + 4);
        __m256d ymm5 = _mm256_load_pd(b + i + 4);
        __m256d ymm6 = _mm256_mul_pd(ymm4, ymm5);
        ymm0 = _mm256_add_pd(ymm6, ymm0);
    }

Assembly of the loop:

    00007ff7ac7a1400  vmovupd     ymm1,ymmword ptr [rbp+rax*8+2020h]
    00007ff7ac7a1409  vmulpd      ymm3,ymm1,ymmword ptr [rbp+rax*8+20h]
    00007ff7ac7a140f  vmovupd     ymm2,ymmword ptr [rbp+rax*8]
    00007ff7ac7a1415  vmulpd      ymm0,ymm2,ymmword ptr b[rax*8]
    00007ff7ac7a141e  add         r8d,8
    00007ff7ac7a1422  movsxd      rax,r8d
    00007ff7ac7a1425  vaddpd      ymm1,ymm0,ymm4
    00007ff7ac7a1429  vaddpd      ymm4,ymm1,ymm3
    00007ff7ac7a142d  cmp         rax,400h
    00007ff7ac7a1433  jb          main+70h (07ff7ac7a1400h)

There is a way to solve this problem (it allows the use of the vmovdqa instruction (an analogue of movapd) instead of movupd):

inline __m256d load(const double * p) { #ifdef _msc_ver     return _mm256_castsi256_pd(_mm256_load_si256((__m256i*)p)); #else     return _mm256_load_pd(p); #endif } 

An analogous solution for the float type:

inline __m256 load(const float * p) { #ifdef _msc_ver     return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p)); #else     return _mm256_load_ps(p); #endif } 

But in order to cheat the Visual Studio compiler you have to use dynamically allocated pointers; otherwise the compiler doesn't use the vmovdqa instruction.

#include <immintrin.h>  int main() {     float * ps = (float*)_mm_malloc(40, 32);     double * pd = (double*)_mm_malloc(40, 32);      __m256 s = load(ps); //00007ff79ff81325  vmovdqa     ymm1,ymmword ptr [rdi]       __m256d d = load(pd); //00007ff79ff8132f  vmovdqa     ymm0,ymmword ptr [rax]      _mm256_storeu_ps(ps, s);     _mm256_storeu_pd(pd, d);      _mm_free(ps);     _mm_free(pd); } 

Comments

Popular posts from this blog

Ansible - ERROR! the field 'hosts' is required but was not set -

customize file_field button ruby on rails -

SoapUI on windows 10 - high DPI/4K scaling issue -