@@ -74,17 +74,24 @@ template <>
74
74
// Per-lane unsigned division v1 * scale / v2 on four 32-bit lanes.
// The operands are converted to float32, the quotient is formed by multiplying
// with the reciprocal supplied by internal::vrecpq_f32, passed through vroundq
// (defined outside this view; presumably the 128-bit counterpart of vround --
// confirm) and converted back to uint32 with vcvtq_u32_f32.
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{
    const float32x4_t scaledNumerator = vmulq_n_f32(vcvtq_f32_u32(v1), scale);
    const float32x4_t reciprocal = internal::vrecpq_f32(vcvtq_f32_u32(v2));
    const float32x4_t quotient = vmulq_f32(scaledNumerator, reciprocal);
    return vcvtq_u32_f32(vroundq(quotient));
}
76
76
77
+ inline float32x2_t vround (const float32x2_t & v)
78
+ {
79
+ const int32x2_t signMask = vdup_n_s32 (1 << 31 ), half = vreinterpret_s32_f32 (vdup_n_f32 (0 .5f ));
80
+ float32x2_t v_addition = vreinterpret_f32_s32 (vorr_s32 (half, vand_s32 (signMask, vreinterpret_s32_f32 (v))));
81
+ return vadd_f32 (v, v_addition);
82
+ }
83
+
77
84
template <typename T>
78
85
inline T divSaturate (const T &v1, const T &v2, const float scale)
79
86
{
80
87
return internal::vqmovn (divSaturateQ (internal::vmovl (v1), internal::vmovl (v2), scale));
81
88
}
82
89
template <>
83
90
inline int32x2_t divSaturate<int32x2_t >(const int32x2_t &v1, const int32x2_t &v2, const float scale)
84
- { return vcvt_s32_f32 (vmul_f32 (vmul_n_f32 (vcvt_f32_s32 (v1), scale), internal::vrecp_f32 (vcvt_f32_s32 (v2)))); }
91
+ { return vcvt_s32_f32 (vround ( vmul_f32 (vmul_n_f32 (vcvt_f32_s32 (v1), scale), internal::vrecp_f32 (vcvt_f32_s32 (v2) )))); }
85
92
template <>
86
93
inline uint32x2_t divSaturate<uint32x2_t >(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
87
- { return vcvt_u32_f32 (vmul_f32 (vmul_n_f32 (vcvt_f32_u32 (v1), scale), internal::vrecp_f32 (vcvt_f32_u32 (v2)))); }
94
+ { return vcvt_u32_f32 (vround ( vmul_f32 (vmul_n_f32 (vcvt_f32_u32 (v1), scale), internal::vrecp_f32 (vcvt_f32_u32 (v2) )))); }
88
95
89
96
90
97
template <typename T>
0 commit comments