avx2: Improve squared error computation
A further 10-20% improvement in overall performance with -w 9
(GCC 13/Haswell).

Clean up a little.
hpjansson committed Apr 21, 2024
1 parent 301a715 commit 5e9a6ea
Showing 1 changed file with 16 additions and 27 deletions.

chafa/internal/chafa-avx2.c
@@ -29,11 +29,11 @@ calc_error_avx2 (const ChafaPixel *pixels, const ChafaColorPair *color_pair,
                  const guint32 *sym_mask_u32)
 {
     __m256i err_8x_u32 = { 0 };
-    const gint32 *e = (gint32 *) &err_8x_u32;
+    __m128i err_4x_u32;
     __m128i fg_4x_u32, bg_4x_u32;
     __m256i fg_4x_u64, bg_4x_u64;
-    const __m256i *pixels_8x_p = (const __m256i *) pixels;
-    const __m256i *sym_mask_8x_p = (const __m256i *) sym_mask_u32;
+    const __m128i *pixels_4x_p = (const __m128i *) pixels;
+    const __m128i *sym_mask_4x_p = (const __m128i *) sym_mask_u32;
     gint i;
 
     fg_4x_u32 = _mm_set1_epi32 (CHAFA_COLOR8_U32 (color_pair->colors [CHAFA_COLOR_PAIR_FG]));
@@ -42,41 +42,30 @@ calc_error_avx2 (const ChafaPixel *pixels, const ChafaColorPair *color_pair,
     bg_4x_u32 = _mm_set1_epi32 (CHAFA_COLOR8_U32 (color_pair->colors [CHAFA_COLOR_PAIR_BG]));
     bg_4x_u64 = _mm256_cvtepu8_epi16 (bg_4x_u32);
 
-    for (i = 0; i < CHAFA_SYMBOL_N_PIXELS / 8; i++)
+    for (i = 0; i < CHAFA_SYMBOL_N_PIXELS / 4; i++)
     {
-        __m256i pixels_8x, sym_mask_8x;
+        __m128i pixels_4x, sym_mask_4x;
         __m256i p0, m0, fg0, bg0, d0;
-        __m256i p1, m1, fg1, bg1, d1;
 
-        pixels_8x = _mm256_loadu_si256 (pixels_8x_p);
-        pixels_8x_p++;
-
-        sym_mask_8x = _mm256_loadu_si256 (sym_mask_8x_p);
-        sym_mask_8x_p++;
+        pixels_4x = _mm_loadu_si128 (pixels_4x_p++);
+        sym_mask_4x = _mm_loadu_si128 (sym_mask_4x_p++);
 
-        p0 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (pixels_8x, 0));
-        m0 = _mm256_cvtepi8_epi16 (_mm256_extracti128_si256 (sym_mask_8x, 0));
+        p0 = _mm256_cvtepu8_epi16 (pixels_4x);
+        m0 = _mm256_cvtepi8_epi16 (sym_mask_4x);
         fg0 = _mm256_and_si256 (m0, _mm256_sub_epi16 (fg_4x_u64, p0));
         bg0 = _mm256_andnot_si256 (m0, _mm256_sub_epi16 (bg_4x_u64, p0));
         d0 = _mm256_or_si256 (fg0, bg0);
-        d0 = _mm256_mullo_epi16 (d0, d0);
-        d0 = _mm256_add_epi32 (_mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d0, 0)),
-                               _mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d0, 1)));
-
-        p1 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (pixels_8x, 1));
-        m1 = _mm256_cvtepi8_epi16 (_mm256_extracti128_si256 (sym_mask_8x, 1));
-        fg1 = _mm256_and_si256 (m1, _mm256_sub_epi16 (fg_4x_u64, p1));
-        bg1 = _mm256_andnot_si256 (m1, _mm256_sub_epi16 (bg_4x_u64, p1));
-        d1 = _mm256_or_si256 (fg1, bg1);
-        d1 = _mm256_mullo_epi16 (d1, d1);
-        d1 = _mm256_add_epi32 (_mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d1, 0)),
-                               _mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d1, 1)));
+        d0 = _mm256_madd_epi16 (d0, d0);
 
         err_8x_u32 = _mm256_add_epi32 (err_8x_u32, d0);
-        err_8x_u32 = _mm256_add_epi32 (err_8x_u32, d1);
     }
 
-    return e [0] + e [1] + e [2] + e [4] + e [5] + e [6];
+    err_4x_u32 = _mm_add_epi32 (_mm256_extracti128_si256 (err_8x_u32, 0),
+                                _mm256_extracti128_si256 (err_8x_u32, 1));
+    err_4x_u32 = _mm_hadd_epi32 (err_4x_u32, err_4x_u32);
+    err_4x_u32 = _mm_hadd_epi32 (err_4x_u32, err_4x_u32);
+
+    return _mm_extract_epi32 (err_4x_u32, 0);
 }
 
 void
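Notes, with illustrative sketches that are not part of the commit:

The headline change is in the loop body. The old code squared the 16-bit channel differences with _mm256_mullo_epi16 and then needed a widening convert-and-add of both half-registers to accumulate into 32-bit lanes; the new code does all of that with a single _mm256_madd_epi16 (vpmaddwd), which multiplies adjacent signed 16-bit lanes and sums each pair into a 32-bit lane. The loop also shrinks from eight pixels per iteration (a 256-bit load plus half-register extracts) to four pixels per iteration (a 128-bit load widened directly). Overflow is not a concern: channel differences fit in ±255, so each pair sum is at most 2 * 255^2 = 130050, well within 32 bits. A minimal scalar model of the madd semantics:

#include <stdint.h>

/* Scalar model of _mm256_madd_epi16: multiply adjacent signed 16-bit
 * lanes element-wise and add each pair of 32-bit products. A __m256i
 * holds 16 such lanes, so there are 8 results. */
static void
madd_epi16_model (const int16_t a [16], const int16_t b [16], int32_t out [8])
{
    int i;

    for (i = 0; i < 8; i++)
        out [i] = (int32_t) a [2 * i] * b [2 * i]
                  + (int32_t) a [2 * i + 1] * b [2 * i + 1];
}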

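The scalar tail, which read the accumulator through a pointer alias and summed six of its eight lanes, is replaced by an in-register reduction: add the two 128-bit halves, then fold twice with _mm_hadd_epi32. (The old sum skipped lanes 3 and 7, i.e. every fourth channel, presumably alpha; after _mm256_madd_epi16 each 32-bit lane already holds a two-channel pair, so the new code sums all lanes.) A sketch of the reduction pattern, assuming AVX2; hsum_8x_i32 is a hypothetical helper name, not from the commit:

#include <immintrin.h>

/* Sum the eight 32-bit lanes of an AVX2 register into one integer,
 * mirroring the commit's extract/add/hadd/hadd sequence. The cast is
 * equivalent to extracting the low 128-bit half. */
static int
hsum_8x_i32 (__m256i v)
{
    __m128i s;

    s = _mm_add_epi32 (_mm256_castsi256_si128 (v),
                       _mm256_extracti128_si256 (v, 1));
    s = _mm_hadd_epi32 (s, s);  /* lanes: a+b, c+d, a+b, c+d */
    s = _mm_hadd_epi32 (s, s);  /* every lane now holds the total */
    return _mm_extract_epi32 (s, 0);
}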
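For context, the unchanged select logic in the loop body computes, per 16-bit channel, (fg - p) where the symbol mask is set and (bg - p) elsewhere. The mask bytes are evidently 0x00 or 0xFF, since _mm256_cvtepi8_epi16 sign-extends them to 0x0000 or 0xFFFF, which is what makes the AND/ANDNOT/OR combination a clean select. A scalar model, for illustration only:

#include <stdint.h>

/* Scalar model of the AND/ANDNOT/OR select: after sign extension the
 * mask is 0 or -1 (all bits set) per channel, so exactly one of the
 * two differences survives the OR. */
static int16_t
select_diff (int16_t mask, int16_t fg, int16_t bg, int16_t p)
{
    return (int16_t) ((mask & (fg - p)) | (~mask & (bg - p)));
}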