Skip to content

Commit

Permalink
ChafaWorkCell: Add AVX2 implementation for mean color pair extraction
Browse files Browse the repository at this point in the history
It's about twice as fast as the MMX implementation in my tests. This
makes a big difference with -w 9. The improvement is much
smaller in the popcount-accelerated paths.
  • Loading branch information
hpjansson committed Apr 21, 2024
1 parent dc17c01 commit 0e7d5fb
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 1 deletion.
53 changes: 53 additions & 0 deletions chafa/internal/chafa-avx2.c
Expand Up @@ -78,3 +78,56 @@ calc_error_avx2 (const ChafaPixel *pixels, const ChafaColorPair *color_pair,

return e [0] + e [1] + e [2] + e [4] + e [5] + e [6];
}

/* Accumulates per-channel sums of a cell's pixels into two buckets, selected
 * by a per-pixel bit mask.
 *
 * @pixels:       CHAFA_SYMBOL_N_PIXELS pixels, consumed eight at a time as
 *                32 bytes per load. Every byte is widened to an unsigned
 *                16-bit lane before being summed. No alignment is required.
 * @accums_out:   Receives exactly 16 bytes: bytes 0-7 hold four 16-bit sums
 *                of the pixel data whose mask bits are clear, bytes 8-15 hold
 *                the sums of the pixel data whose mask bits are set.
 *                NOTE(review): this assumes sizeof (ChafaColorAccum) == 8, so
 *                that these land in accums_out [0] and accums_out [1] -- the
 *                same assumption the previous guint64-indexed stores made.
 * @sym_mask_u32: One 32-bit mask word per pixel, applied bitwise to the
 *                pixel data (presumably all-ones or all-zeros per pixel --
 *                confirm against the code that builds the masks).
 *
 * The final reduction is done with 16-bit lane adds and a single unaligned
 * vector store. Compared to extracting two 64-bit scalars and storing them
 * through a guint64 pointer, this avoids _mm_extract_epi64 (only available
 * on x86-64 targets) and avoids writing the output object through an
 * incompatible lvalue type. The stored bytes are identical as long as no
 * 16-bit lane overflows; each lane sums CHAFA_SYMBOL_N_PIXELS byte values. */
void
calc_colors_avx2 (const ChafaPixel *pixels, ChafaColorAccum *accums_out,
                  const guint32 *sym_mask_u32)
{
    const __m256i *pixels_8x_p = (const __m256i *) pixels;
    const __m256i *sym_mask_8x_p = (const __m256i *) sym_mask_u32;
    __m256i accum_clear [2];
    __m256i accum_set [2];
    __m256i total_clear_256;
    __m256i total_set_256;
    __m128i total_clear;
    __m128i total_set;
    gint i;

    accum_clear [0] = _mm256_setzero_si256 ();
    accum_clear [1] = _mm256_setzero_si256 ();
    accum_set [0] = _mm256_setzero_si256 ();
    accum_set [1] = _mm256_setzero_si256 ();

    for (i = 0; i < CHAFA_SYMBOL_N_PIXELS / 8; i++)
    {
        __m256i pixels_8x = _mm256_loadu_si256 (pixels_8x_p + i);
        __m256i sym_mask_8x = _mm256_loadu_si256 (sym_mask_8x_p + i);

        /* Split the eight pixels into the part not selected by the mask
         * and the part selected by it; the other part becomes zero and
         * therefore contributes nothing to its sum. */
        __m256i clear_8x = _mm256_andnot_si256 (sym_mask_8x, pixels_8x);
        __m256i set_8x = _mm256_and_si256 (sym_mask_8x, pixels_8x);

        /* Widen each 128-bit half (four pixels) from 8-bit to 16-bit lanes
         * and add it to the matching partial accumulator. */
        accum_clear [0] = _mm256_add_epi16 (accum_clear [0],
            _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (clear_8x, 0)));
        accum_clear [1] = _mm256_add_epi16 (accum_clear [1],
            _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (clear_8x, 1)));

        accum_set [0] = _mm256_add_epi16 (accum_set [0],
            _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (set_8x, 0)));
        accum_set [1] = _mm256_add_epi16 (accum_set [1],
            _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (set_8x, 1)));
    }

    /* Horizontal reduction: 16 lanes -> 8 lanes -> 4 lanes (one pixel's
     * worth of channels), leaving the result in the low 64 bits. */
    total_clear_256 = _mm256_add_epi16 (accum_clear [0], accum_clear [1]);
    total_clear = _mm_add_epi16 (_mm256_extracti128_si256 (total_clear_256, 0),
                                 _mm256_extracti128_si256 (total_clear_256, 1));
    total_clear = _mm_add_epi16 (total_clear, _mm_srli_si128 (total_clear, 8));

    total_set_256 = _mm256_add_epi16 (accum_set [0], accum_set [1]);
    total_set = _mm_add_epi16 (_mm256_extracti128_si256 (total_set_256, 0),
                               _mm256_extracti128_si256 (total_set_256, 1));
    total_set = _mm_add_epi16 (total_set, _mm_srli_si128 (total_set, 8));

    /* Pack both 64-bit results into one vector (mask-clear sums in the low
     * half, mask-set sums in the high half) and write them out together. */
    _mm_storeu_si128 ((__m128i *) accums_out,
                      _mm_unpacklo_epi64 (total_clear, total_set));
}
2 changes: 2 additions & 0 deletions chafa/internal/chafa-private.h
Expand Up @@ -217,6 +217,8 @@ gint calc_error_sse41 (const ChafaPixel *pixels, const ChafaColorPair *color_pai
/* AVX2-accelerated routines. These prototypes are only visible when the
 * build defines HAVE_AVX2_INTRINSICS. */
#ifdef HAVE_AVX2_INTRINSICS
/* Returns an integer error value computed from the pixels, the color pair
 * and the per-pixel mask words. Declared G_GNUC_PURE.
 * NOTE(review): the exact metric is defined in the implementation file --
 * confirm there before relying on its scale. */
gint calc_error_avx2 (const ChafaPixel *pixels, const ChafaColorPair *color_pair,
const guint32 *sym_mask_u32) G_GNUC_PURE;
/* Sums the pixel channels into two accumulators written to accums_out:
 * the first receives the pixel data whose mask bits are clear, the second
 * the pixel data whose mask bits are set. sym_mask_u32 supplies one 32-bit
 * mask word per pixel. */
void calc_colors_avx2 (const ChafaPixel *pixels, ChafaColorAccum *accums_out,
const guint32 *sym_mask_u32);
#endif

#if defined(HAVE_POPCNT64_INTRINSICS) || defined(HAVE_POPCNT32_INTRINSICS)
Expand Down
6 changes: 5 additions & 1 deletion chafa/internal/chafa-work-cell.c
Expand Up @@ -83,7 +83,11 @@ chafa_work_cell_get_mean_colors_for_symbol (const ChafaWorkCell *wcell, const Ch
const guint8 *covp = (guint8 *) &sym->coverage [0];
ChafaColorAccum accums [2] = { 0 };

#ifdef HAVE_MMX_INTRINSICS
#ifdef HAVE_AVX2_INTRINSICS
if (chafa_have_avx2 ())
calc_colors_avx2 (wcell->pixels, accums, sym->mask_u32);
else
#elif HAVE_MMX_INTRINSICS
if (chafa_have_mmx ())
calc_colors_mmx (wcell->pixels, accums, covp);
else
Expand Down

0 comments on commit 0e7d5fb

Please sign in to comment.