From 5e9a6eab202022b8d31d6bc342b06da6573d5ca8 Mon Sep 17 00:00:00 2001
From: Hans Petter Jansson
Date: Sun, 21 Apr 2024 23:32:43 +0200
Subject: [PATCH] avx2: Improve squared error computation

A further 10-20% improvement in overall performance with -w 9 (GCC
13/Haswell). Clean up a little.
---
 chafa/internal/chafa-avx2.c | 43 ++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 27 deletions(-)

diff --git a/chafa/internal/chafa-avx2.c b/chafa/internal/chafa-avx2.c
index 92f13c8..6ac6ead 100644
--- a/chafa/internal/chafa-avx2.c
+++ b/chafa/internal/chafa-avx2.c
@@ -29,11 +29,11 @@ calc_error_avx2 (const ChafaPixel *pixels, const ChafaColorPair *color_pair,
                  const guint32 *sym_mask_u32)
 {
     __m256i err_8x_u32 = { 0 };
-    const gint32 *e = (gint32 *) &err_8x_u32;
+    __m128i err_4x_u32;
     __m128i fg_4x_u32, bg_4x_u32;
     __m256i fg_4x_u64, bg_4x_u64;
-    const __m256i *pixels_8x_p = (const __m256i *) pixels;
-    const __m256i *sym_mask_8x_p = (const __m256i *) sym_mask_u32;
+    const __m128i *pixels_4x_p = (const __m128i *) pixels;
+    const __m128i *sym_mask_4x_p = (const __m128i *) sym_mask_u32;
     gint i;
 
     fg_4x_u32 = _mm_set1_epi32 (CHAFA_COLOR8_U32 (color_pair->colors [CHAFA_COLOR_PAIR_FG]));
@@ -42,41 +42,30 @@ calc_error_avx2 (const ChafaPixel *pixels, const ChafaColorPair *color_pair,
     bg_4x_u32 = _mm_set1_epi32 (CHAFA_COLOR8_U32 (color_pair->colors [CHAFA_COLOR_PAIR_BG]));
     bg_4x_u64 = _mm256_cvtepu8_epi16 (bg_4x_u32);
 
-    for (i = 0; i < CHAFA_SYMBOL_N_PIXELS / 8; i++)
+    for (i = 0; i < CHAFA_SYMBOL_N_PIXELS / 4; i++)
     {
-        __m256i pixels_8x, sym_mask_8x;
+        __m128i pixels_4x, sym_mask_4x;
         __m256i p0, m0, fg0, bg0, d0;
-        __m256i p1, m1, fg1, bg1, d1;
 
-        pixels_8x = _mm256_loadu_si256 (pixels_8x_p);
-        pixels_8x_p++;
-
-        sym_mask_8x = _mm256_loadu_si256 (sym_mask_8x_p);
-        sym_mask_8x_p++;
+        pixels_4x = _mm_loadu_si128 (pixels_4x_p++);
+        sym_mask_4x = _mm_loadu_si128 (sym_mask_4x_p++);
 
-        p0 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (pixels_8x, 0));
-        m0 = _mm256_cvtepi8_epi16 (_mm256_extracti128_si256 (sym_mask_8x, 0));
+        p0 = _mm256_cvtepu8_epi16 (pixels_4x);
+        m0 = _mm256_cvtepi8_epi16 (sym_mask_4x);
         fg0 = _mm256_and_si256 (m0, _mm256_sub_epi16 (fg_4x_u64, p0));
         bg0 = _mm256_andnot_si256 (m0, _mm256_sub_epi16 (bg_4x_u64, p0));
         d0 = _mm256_or_si256 (fg0, bg0);
-        d0 = _mm256_mullo_epi16 (d0, d0);
-        d0 = _mm256_add_epi32 (_mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d0, 0)),
-                               _mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d0, 1)));
-
-        p1 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (pixels_8x, 1));
-        m1 = _mm256_cvtepi8_epi16 (_mm256_extracti128_si256 (sym_mask_8x, 1));
-        fg1 = _mm256_and_si256 (m1, _mm256_sub_epi16 (fg_4x_u64, p1));
-        bg1 = _mm256_andnot_si256 (m1, _mm256_sub_epi16 (bg_4x_u64, p1));
-        d1 = _mm256_or_si256 (fg1, bg1);
-        d1 = _mm256_mullo_epi16 (d1, d1);
-        d1 = _mm256_add_epi32 (_mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d1, 0)),
-                               _mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d1, 1)));
+        d0 = _mm256_madd_epi16 (d0, d0);
 
         err_8x_u32 = _mm256_add_epi32 (err_8x_u32, d0);
-        err_8x_u32 = _mm256_add_epi32 (err_8x_u32, d1);
     }
 
-    return e [0] + e [1] + e [2] + e [4] + e [5] + e [6];
+    err_4x_u32 = _mm_add_epi32 (_mm256_extracti128_si256 (err_8x_u32, 0),
+                                _mm256_extracti128_si256 (err_8x_u32, 1));
+    err_4x_u32 = _mm_hadd_epi32 (err_4x_u32, err_4x_u32);
+    err_4x_u32 = _mm_hadd_epi32 (err_4x_u32, err_4x_u32);
+
+    return _mm_extract_epi32 (err_4x_u32, 0);
 }
 
 void
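
Not part of the patch itself, but a minimal standalone sketch of the two techniques the new code relies on: _mm256_madd_epi16 (d, d) squares each signed 16-bit lane and adds adjacent pairs into 32-bit lanes, replacing the earlier mullo + widen + add sequence, and the eight 32-bit partial sums are reduced once after the loop with extract/add/hadd instead of reading the accumulator back through a pointer. The file name, function names and array length below (madd_sketch.c, hsum_epi32_avx2, sum_squares_avx2, 32 elements) are illustrative assumptions, not from the chafa sources; build with something like gcc -mavx2 -O2 madd_sketch.c.

/* Minimal sketch, not from the chafa sources: sum of squared signed 16-bit
 * differences using _mm256_madd_epi16, plus the same horizontal reduction
 * the patch uses. */

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Reduce the eight 32-bit lanes of an AVX2 accumulator to one scalar, the
 * way the patched calc_error_avx2 () does: fold 256 -> 128 bits, then two
 * horizontal adds, then extract lane 0. */
static int
hsum_epi32_avx2 (__m256i v)
{
    __m128i s = _mm_add_epi32 (_mm256_extracti128_si256 (v, 0),
                               _mm256_extracti128_si256 (v, 1));
    s = _mm_hadd_epi32 (s, s);
    s = _mm_hadd_epi32 (s, s);
    return _mm_extract_epi32 (s, 0);
}

/* Sum of squares of n signed 16-bit values; n must be a multiple of 16 in
 * this sketch.  _mm256_madd_epi16 (v, v) squares every 16-bit lane and adds
 * adjacent pairs into 32-bit lanes, doing the multiply and the first level
 * of the reduction in a single instruction. */
static int
sum_squares_avx2 (const int16_t *d, int n)
{
    __m256i acc = _mm256_setzero_si256 ();
    int i;

    for (i = 0; i < n; i += 16)
    {
        __m256i v = _mm256_loadu_si256 ((const __m256i *) (d + i));
        acc = _mm256_add_epi32 (acc, _mm256_madd_epi16 (v, v));
    }

    return hsum_epi32_avx2 (acc);
}

int
main (void)
{
    int16_t diff [32];
    int ref = 0;
    int i;

    for (i = 0; i < 32; i++)
    {
        diff [i] = (int16_t) (i - 16);   /* Small signed differences */
        ref += diff [i] * diff [i];
    }

    printf ("avx2=%d scalar=%d\n", sum_squares_avx2 (diff, 32), ref);
    return 0;
}

As in the patched loop, the partial sums stay in one 8-lane register for the whole loop and the horizontal reduction happens exactly once, at the end.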