c - NEON: Optimize code -


i playing arm neon , have written following functions, 1 in c, 1 neon intrinsics compare speeds. functions compare 2 arrays. parameter cb number of bytes divided 8:

inline uint32_t is_not_zero(uint32x4_t v) {         uint32x2_t tmp = vorr_u32(vget_low_u32(v), vget_high_u32(v));         return vget_lane_u32(vpmax_u32(tmp, tmp), 0); }  uint32_t sum_neon(const uint8_t *s1, const uint8_t *s2, uint32_t cb) {         const uint32_t *s1_cmp = (uint32_t *)s1;         const uint32_t *s2_cmp = (uint32_t *)s2;          cb *= 2;          while (cb--)         {                 uint32x4x2_t cmp1 = vld2q_u32(s1_cmp);                 uint32x4x2_t cmp2 = vld2q_u32(s2_cmp);                  uint32x4_t res1 = vceqq_u32(cmp1.val[0], cmp2.val[0]);                 uint32x4_t res2 = vceqq_u32(cmp1.val[1], cmp2.val[1]);                  if (!is_not_zero(res1)) return 1;                 if (!is_not_zero(res2)) return 1;                  s1_cmp += 8;                 s2_cmp += 8;         }         return 0; }  uint32_t sum_c(const uint8_t *s1, const uint8_t *s2, uint32_t cb) {     const uint64_t *p1 = (uint64_t *)s1;     const uint64_t *p2 = (uint64_t *)s2;     uint32_t n = 0;     while (cb--) {         if ((p1[n  ] != p2[n  ]) ||                 (p1[n+1] != p2[n+1]) ||                 (p1[n+2] != p2[n+2]) ||                 (p1[n+3] != p2[n+3])) return 1;         ++n;     }     return 0;  } 

i dont understand why c implementation way faster neon variant. code compiled on raspberry pi using -o3 -mcpu=cortex-a7 -mfpu=neon-vfpv4 -mfloat-abi=hard cflags.


Comments

Popular posts from this blog

html - How to set bootstrap input responsive width? -

javascript - Highchart x and y axes data from json -

javascript - Get js console.log as python variable in QWebView pyqt -