Skip to content

Commit

Permalink
libchafa: Add an AVX2-optimized path for symbol error calculation
Browse files Browse the repository at this point in the history
This gives an overall speedup of 10-20% over SSE 4.1 with -w 9.
  • Loading branch information
hpjansson committed Mar 31, 2024
1 parent 044d990 commit d3bc4e7
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 11 deletions.
5 changes: 5 additions & 0 deletions chafa/chafa-canvas.c
Expand Up @@ -241,6 +241,11 @@ eval_symbol_error (const ChafaWorkCell *wcell,
pair = eval->colors;
}

#ifdef HAVE_AVX2_INTRINSICS
if (chafa_have_avx2 ())
error = calc_error_avx2 (wcell->pixels, &pair, sym->mask_u32);
else
#endif
#ifdef HAVE_SSE41_INTRINSICS
if (chafa_have_sse41 ())
error = calc_error_sse41 (wcell->pixels, &pair, covp);
Expand Down
54 changes: 43 additions & 11 deletions chafa/chafa-symbol-map.c
Expand Up @@ -266,6 +266,31 @@ bitmap_to_argb (guint64 bitmap, guint8 *argb, gint rowstride)
}
}

static gpointer
bitmap_to_argb_alloc (guint64 bitmap)
{
gpointer argb;

argb = g_malloc (CHAFA_SYMBOL_N_PIXELS * 4);
bitmap_to_argb (bitmap, argb, CHAFA_SYMBOL_WIDTH_PIXELS * 4);
return argb;
}

static gpointer
bitmap2_to_argb_alloc (guint64 bitmap_0, guint64 bitmap_1)
{
gpointer argb;

argb = g_malloc (CHAFA_SYMBOL_N_PIXELS * 4 * 2);

bitmap_to_argb (bitmap_0, argb,
CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2);
bitmap_to_argb (bitmap_1, ((guint8 *) argb) + CHAFA_SYMBOL_WIDTH_PIXELS * 4,
CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2);

return argb;
}

static guint64
glyph_to_bitmap (gint width, gint height,
gint rowstride,
Expand Down Expand Up @@ -363,7 +388,10 @@ compile_symbols (ChafaSymbolMap *symbol_map, GHashTable *desired_symbols)
gint i;

for (i = 0; i < symbol_map->n_symbols; i++)
{
g_free (symbol_map->symbols [i].coverage);
g_free (symbol_map->symbols [i].mask_u32);
}

g_free (symbol_map->symbols);
g_free (symbol_map->packed_bitmaps);
Expand All @@ -380,6 +408,7 @@ compile_symbols (ChafaSymbolMap *symbol_map, GHashTable *desired_symbols)
symbol_map->symbols [i] = *sym;
symbol_map->symbols [i].coverage = g_memdup (symbol_map->symbols [i].coverage,
CHAFA_SYMBOL_N_PIXELS);
symbol_map->symbols [i].mask_u32 = bitmap_to_argb_alloc (symbol_map->symbols [i].bitmap);
i++;
}

Expand All @@ -404,7 +433,9 @@ compile_symbols_wide (ChafaSymbolMap *symbol_map, GHashTable *desired_symbols)
for (i = 0; i < symbol_map->n_symbols2; i++)
{
g_free (symbol_map->symbols2 [i].sym [0].coverage);
g_free (symbol_map->symbols2 [i].sym [0].mask_u32);
g_free (symbol_map->symbols2 [i].sym [1].coverage);
g_free (symbol_map->symbols2 [i].sym [1].mask_u32);
}

g_free (symbol_map->symbols2);
Expand All @@ -421,8 +452,10 @@ compile_symbols_wide (ChafaSymbolMap *symbol_map, GHashTable *desired_symbols)
symbol_map->symbols2 [i] = *sym;
symbol_map->symbols2 [i].sym [0].coverage = g_memdup (symbol_map->symbols2 [i].sym [0].coverage,
CHAFA_SYMBOL_N_PIXELS);
symbol_map->symbols2 [i].sym [0].mask_u32 = bitmap_to_argb_alloc (symbol_map->symbols2 [i].sym [0].bitmap);
symbol_map->symbols2 [i].sym [1].coverage = g_memdup (symbol_map->symbols2 [i].sym [1].coverage,
CHAFA_SYMBOL_N_PIXELS);
symbol_map->symbols2 [i].sym [1].mask_u32 = bitmap_to_argb_alloc (symbol_map->symbols2 [i].sym [1].bitmap);
i++;
}

Expand Down Expand Up @@ -507,6 +540,7 @@ free_symbol (gpointer sym_p)
ChafaSymbol *sym = sym_p;

g_free (sym->coverage);
g_free (sym->mask_u32);
g_free (sym);
}

Expand All @@ -516,7 +550,9 @@ free_symbol_wide (gpointer sym_p)
ChafaSymbol2 *sym = sym_p;

g_free (sym->sym [0].coverage);
g_free (sym->sym [0].mask_u32);
g_free (sym->sym [1].coverage);
g_free (sym->sym [1].mask_u32);
g_free (sym);
}

Expand Down Expand Up @@ -1028,12 +1064,17 @@ chafa_symbol_map_deinit (ChafaSymbolMap *symbol_map)
g_return_if_fail (symbol_map != NULL);

for (i = 0; i < symbol_map->n_symbols; i++)
{
g_free (symbol_map->symbols [i].coverage);
g_free (symbol_map->symbols [i].mask_u32);
}

for (i = 0; i < symbol_map->n_symbols2; i++)
{
g_free (symbol_map->symbols2 [i].sym [0].coverage);
g_free (symbol_map->symbols2 [i].sym [0].mask_u32);
g_free (symbol_map->symbols2 [i].sym [1].coverage);
g_free (symbol_map->symbols2 [i].sym [1].mask_u32);
}

g_hash_table_destroy (symbol_map->glyphs);
Expand Down Expand Up @@ -1829,13 +1870,7 @@ chafa_symbol_map_get_glyph (ChafaSymbolMap *symbol_map,
g_assert (glyph2->c == code_point);

if (pixels_out)
{
*pixels_out = g_malloc (CHAFA_SYMBOL_N_PIXELS * 4 * 2);
bitmap_to_argb (glyph2->bitmap [0], *pixels_out,
CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2);
bitmap_to_argb (glyph2->bitmap [1], ((guint8 *) *pixels_out) + CHAFA_SYMBOL_WIDTH_PIXELS * 4,
CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2);
}
*pixels_out = bitmap2_to_argb_alloc (glyph2->bitmap [0], glyph2->bitmap [1]);

width = CHAFA_SYMBOL_WIDTH_PIXELS * 2;
height = CHAFA_SYMBOL_HEIGHT_PIXELS;
Expand All @@ -1852,10 +1887,7 @@ chafa_symbol_map_get_glyph (ChafaSymbolMap *symbol_map,
g_assert (glyph->c == code_point);

if (pixels_out)
{
*pixels_out = g_malloc (CHAFA_SYMBOL_N_PIXELS * 4);
bitmap_to_argb (glyph->bitmap, *pixels_out, CHAFA_SYMBOL_WIDTH_PIXELS * 4);
}
*pixels_out = bitmap_to_argb_alloc (glyph->bitmap);

width = CHAFA_SYMBOL_WIDTH_PIXELS;
height = CHAFA_SYMBOL_HEIGHT_PIXELS;
Expand Down
55 changes: 55 additions & 0 deletions chafa/internal/chafa-avx2.c
Expand Up @@ -19,7 +19,62 @@

#include "config.h"

#include <emmintrin.h>
#include <immintrin.h>
#include "chafa.h"
#include "internal/chafa-private.h"

gint
calc_error_avx2 (const ChafaPixel *pixels, const ChafaColorPair *color_pair,
const guint32 *sym_mask_u32)
{
__m256i err_8x_u32 = { 0 };
const gint32 *e = (gint32 *) &err_8x_u32;
__m128i fg_4x_u32, bg_4x_u32;
__m256i fg_4x_u64, bg_4x_u64;
const __m256i *pixels_8x_p = (const __m256i *) pixels;
const __m256i *sym_mask_8x_p = (const __m256i *) sym_mask_u32;
gint i;

fg_4x_u32 = _mm_set1_epi32 (CHAFA_COLOR8_U32 (color_pair->colors [CHAFA_COLOR_PAIR_FG]));
fg_4x_u64 = _mm256_cvtepu8_epi16 (fg_4x_u32);

bg_4x_u32 = _mm_set1_epi32 (CHAFA_COLOR8_U32 (color_pair->colors [CHAFA_COLOR_PAIR_BG]));
bg_4x_u64 = _mm256_cvtepu8_epi16 (bg_4x_u32);

for (i = 0; i < CHAFA_SYMBOL_N_PIXELS / 8; i++)
{
__m256i pixels_8x, sym_mask_8x;
__m256i p0, m0, fg0, bg0, d0;
__m256i p1, m1, fg1, bg1, d1;

pixels_8x = _mm256_loadu_si256 (pixels_8x_p);
pixels_8x_p++;

sym_mask_8x = _mm256_loadu_si256 (sym_mask_8x_p);
sym_mask_8x_p++;

p0 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (pixels_8x, 0));
m0 = _mm256_cvtepi8_epi16 (_mm256_extracti128_si256 (sym_mask_8x, 0));
fg0 = _mm256_and_si256 (m0, _mm256_sub_epi16 (fg_4x_u64, p0));
bg0 = _mm256_andnot_si256 (m0, _mm256_sub_epi16 (bg_4x_u64, p0));
d0 = _mm256_or_si256 (fg0, bg0);
d0 = _mm256_mullo_epi16 (d0, d0);
d0 = _mm256_add_epi32 (_mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d0, 0)),
_mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d0, 1)));

p1 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (pixels_8x, 1));
m1 = _mm256_cvtepi8_epi16 (_mm256_extracti128_si256 (sym_mask_8x, 1));
fg1 = _mm256_and_si256 (m1, _mm256_sub_epi16 (fg_4x_u64, p1));
bg1 = _mm256_andnot_si256 (m1, _mm256_sub_epi16 (bg_4x_u64, p1));
d1 = _mm256_or_si256 (fg1, bg1);
d1 = _mm256_mullo_epi16 (d1, d1);
d1 = _mm256_add_epi32 (_mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d1, 0)),
_mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d1, 1)));

err_8x_u32 = _mm256_add_epi32 (err_8x_u32, d0);
err_8x_u32 = _mm256_add_epi32 (err_8x_u32, d1);
}

return e [0] + e [1] + e [2] + e [4] + e [5] + e [6];
}
6 changes: 6 additions & 0 deletions chafa/internal/chafa-private.h
Expand Up @@ -42,6 +42,7 @@ typedef struct
ChafaSymbolTags sc;
gunichar c;
gchar *coverage;
guint32 *mask_u32;
gint fg_weight, bg_weight;
guint64 bitmap;
gint popcount;
Expand Down Expand Up @@ -213,6 +214,11 @@ void chafa_leave_mmx (void);
gint calc_error_sse41 (const ChafaPixel *pixels, const ChafaColorPair *color_pair, const guint8 *cov) G_GNUC_PURE;
#endif

#ifdef HAVE_AVX2_INTRINSICS
gint calc_error_avx2 (const ChafaPixel *pixels, const ChafaColorPair *color_pair,
const guint32 *sym_mask_u32) G_GNUC_PURE;
#endif

#if defined(HAVE_POPCNT64_INTRINSICS) || defined(HAVE_POPCNT32_INTRINSICS)
#define HAVE_POPCNT_INTRINSICS
#endif
Expand Down

0 comments on commit d3bc4e7

Please sign in to comment.