c - NEON: Optimize code -
i playing arm neon , have written following functions, 1 in c, 1 neon intrinsics compare speeds. functions compare 2 arrays. parameter cb
number of bytes divided 8:
inline uint32_t is_not_zero(uint32x4_t v) { uint32x2_t tmp = vorr_u32(vget_low_u32(v), vget_high_u32(v)); return vget_lane_u32(vpmax_u32(tmp, tmp), 0); } uint32_t sum_neon(const uint8_t *s1, const uint8_t *s2, uint32_t cb) { const uint32_t *s1_cmp = (uint32_t *)s1; const uint32_t *s2_cmp = (uint32_t *)s2; cb *= 2; while (cb--) { uint32x4x2_t cmp1 = vld2q_u32(s1_cmp); uint32x4x2_t cmp2 = vld2q_u32(s2_cmp); uint32x4_t res1 = vceqq_u32(cmp1.val[0], cmp2.val[0]); uint32x4_t res2 = vceqq_u32(cmp1.val[1], cmp2.val[1]); if (!is_not_zero(res1)) return 1; if (!is_not_zero(res2)) return 1; s1_cmp += 8; s2_cmp += 8; } return 0; } uint32_t sum_c(const uint8_t *s1, const uint8_t *s2, uint32_t cb) { const uint64_t *p1 = (uint64_t *)s1; const uint64_t *p2 = (uint64_t *)s2; uint32_t n = 0; while (cb--) { if ((p1[n ] != p2[n ]) || (p1[n+1] != p2[n+1]) || (p1[n+2] != p2[n+2]) || (p1[n+3] != p2[n+3])) return 1; ++n; } return 0; }
i dont understand why c implementation way faster neon variant. code compiled on raspberry pi using -o3 -mcpu=cortex-a7 -mfpu=neon-vfpv4 -mfloat-abi=hard
cflags.
Comments
Post a Comment