From ac72d6f26183aa1c10cb9876e1232ee18c307546 Mon Sep 17 00:00:00 2001
From: Hans Petter Jansson
Date: Sun, 21 Apr 2024 17:46:46 +0200
Subject: [PATCH] avx2: Improve mean color extractor

Smaller loads perform slightly better than load + extract. Tested on
GCC 13/Haswell. Cut down on code verbosity.
---
 chafa/internal/chafa-avx2.c | 55 ++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 35 deletions(-)

diff --git a/chafa/internal/chafa-avx2.c b/chafa/internal/chafa-avx2.c
index 9cb4082..af77a34 100644
--- a/chafa/internal/chafa-avx2.c
+++ b/chafa/internal/chafa-avx2.c
@@ -83,51 +83,36 @@ void
 calc_colors_avx2 (const ChafaPixel *pixels, ChafaColorAccum *accums_out,
                   const guint32 *sym_mask_u32)
 {
-    const __m256i *pixels_8x_p = (const __m256i *) pixels;
-    const __m256i *sym_mask_8x_p = (const __m256i *) sym_mask_u32;
-    __m256i accum_fg [2] = { { 0 }, { 0 } };
-    __m256i accum_bg [2] = { { 0 }, { 0 } };
+    const __m128i *pixels_4x_p = (const __m128i *) pixels;
+    const __m128i *sym_mask_4x_p = (const __m128i *) sym_mask_u32;
+    __m256i accum_fg = { 0 };
+    __m256i accum_bg = { 0 };
     __m128i accum_fg_128;
     __m128i accum_bg_128;
     gint i;
 
-    for (i = 0; i < CHAFA_SYMBOL_N_PIXELS / 8; i++)
+    for (i = 0; i < CHAFA_SYMBOL_N_PIXELS / 4; i++)
     {
-        __m256i pixels_8x, sym_mask_8x;
-        __m256i p0, fg0, bg0;
-        __m256i p1, fg1, bg1;
-
-        pixels_8x = _mm256_loadu_si256 (pixels_8x_p);
-        pixels_8x_p++;
-
-        sym_mask_8x = _mm256_loadu_si256 (sym_mask_8x_p);
-        sym_mask_8x_p++;
+        __m128i pixels_4x, sym_mask_4x;
 
-        p0 = _mm256_andnot_si256 (sym_mask_8x, pixels_8x);
-        p1 = _mm256_and_si256 (sym_mask_8x, pixels_8x);
+        pixels_4x = _mm_loadu_si128 (pixels_4x_p++);
+        sym_mask_4x = _mm_loadu_si128 (sym_mask_4x_p++);
 
-        fg0 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (p0, 0));
-        fg1 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (p0, 1));
-        accum_fg [0] = _mm256_add_epi16 (accum_fg [0], fg0);
-        accum_fg [1] = _mm256_add_epi16 (accum_fg [1], fg1);
-
-        bg0 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (p1, 0));
-        bg1 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (p1, 1));
-        accum_bg [0] = _mm256_add_epi16 (accum_bg [0], bg0);
-        accum_bg [1] = _mm256_add_epi16 (accum_bg [1], bg1);
+        accum_fg = _mm256_add_epi16 (accum_fg,
+            _mm256_cvtepu8_epi16 (_mm_and_si128 (sym_mask_4x, pixels_4x)));
+        accum_bg = _mm256_add_epi16 (accum_bg,
+            _mm256_cvtepu8_epi16 (_mm_andnot_si128 (sym_mask_4x, pixels_4x)));
     }
 
-    accum_fg [0] = _mm256_add_epi16 (accum_fg [0], accum_fg [1]);
-    accum_fg_128 = _mm_add_epi16 (_mm256_extracti128_si256 (accum_fg [0], 0),
-                                  _mm256_extracti128_si256 (accum_fg [0], 1));
+    accum_bg_128 = _mm_add_epi16 (_mm256_extracti128_si256 (accum_bg, 0),
+                                  _mm256_extracti128_si256 (accum_bg, 1));
     ((guint64 *) accums_out) [0] =
-        (guint64) _mm_extract_epi64 (accum_fg_128, 0)
-        + (guint64) _mm_extract_epi64 (accum_fg_128, 1);
-
-    accum_bg [0] = _mm256_add_epi16 (accum_bg [0], accum_bg [1]);
-    accum_bg_128 = _mm_add_epi16 (_mm256_extracti128_si256 (accum_bg [0], 0),
-                                  _mm256_extracti128_si256 (accum_bg [0], 1));
-    ((guint64 *) accums_out) [1] =
         (guint64) _mm_extract_epi64 (accum_bg_128, 0)
         + (guint64) _mm_extract_epi64 (accum_bg_128, 1);
+
+    accum_fg_128 = _mm_add_epi16 (_mm256_extracti128_si256 (accum_fg, 0),
+                                  _mm256_extracti128_si256 (accum_fg, 1));
+    ((guint64 *) accums_out) [1] =
+        (guint64) _mm_extract_epi64 (accum_fg_128, 0)
+        + (guint64) _mm_extract_epi64 (accum_fg_128, 1);
 }