Skip to content

Commit

Permalink
ChafaColor: Add AVX2 impl for accumulator division by scalar
Browse files Browse the repository at this point in the history
The speedup is 15-20% overall with -w 9 (GCC 13/Haswell).
  • Loading branch information
hpjansson committed Apr 21, 2024
1 parent ac72d6f commit 301a715
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 4 deletions.
38 changes: 38 additions & 0 deletions chafa/internal/chafa-avx2.c
Expand Up @@ -116,3 +116,41 @@ calc_colors_avx2 (const ChafaPixel *pixels, ChafaColorAccum *accums_out,
(guint64) _mm_extract_epi64 (accum_fg_128, 0)
+ (guint64) _mm_extract_epi64 (accum_fg_128, 1);
}

/* Reciprocal lookup table: entry i holds 32768 divided by i, truncated to an
 * integer (e.g. 32768 / 3 is stored as 10922). Entry 0 is 0, so that a divide
 * by zero is defined as zero. Valid indexes are [0..256] inclusive.
 *
 * NOTE(review): entry 1 (32768) is larger than the biggest value a signed
 * 16-bit integer can hold; confirm how consumers that place entries in signed
 * 16-bit vector lanes handle a divisor of 1. */
static const guint16 invdiv16 [257] =
{
0, 32768, 16384, 10922, 8192, 6553, 5461, 4681, 4096, 3640, 3276,
2978, 2730, 2520, 2340, 2184, 2048, 1927, 1820, 1724, 1638, 1560,
1489, 1424, 1365, 1310, 1260, 1213, 1170, 1129, 1092, 1057, 1024,
992, 963, 936, 910, 885, 862, 840, 819, 799, 780, 762, 744, 728,
712, 697, 682, 668, 655, 642, 630, 618, 606, 595, 585, 574, 564,
555, 546, 537, 528, 520, 512, 504, 496, 489, 481, 474, 468, 461,
455, 448, 442, 436, 431, 425, 420, 414, 409, 404, 399, 394, 390,
385, 381, 376, 372, 368, 364, 360, 356, 352, 348, 344, 341, 337,
334, 330, 327, 324, 321, 318, 315, 312, 309, 306, 303, 300, 297,
295, 292, 289, 287, 284, 282, 280, 277, 275, 273, 270, 268, 266,
264, 262, 260, 258, 256, 254, 252, 250, 248, 246, 244, 242, 240,
239, 237, 235, 234, 232, 230, 229, 227, 225, 224, 222, 221, 219,
218, 217, 215, 214, 212, 211, 210, 208, 207, 206, 204, 203, 202,
201, 199, 198, 197, 196, 195, 193, 192, 191, 190, 189, 188, 187,
186, 185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 174,
173, 172, 171, 170, 169, 168, 168, 167, 166, 165, 164, 163, 163,
162, 161, 160, 159, 159, 158, 157, 156, 156, 155, 154, 153, 153,
152, 151, 151, 150, 149, 148, 148, 147, 146, 146, 145, 144, 144,
143, 143, 142, 141, 141, 140, 140, 139, 138, 138, 137, 137, 136,
135, 135, 134, 134, 133, 133, 132, 132, 131, 131, 130, 130, 129,
129, 128, 128
};

/* Divides each 16-bit channel of the accumulator by divisor, in place.
 *
 * The division is carried out as a multiplication by a reciprocal taken
 * from invdiv16 [], using _mm_mulhrs_epi16 (). That instruction rounds its
 * result and the table entries are truncated, so the outcome can differ
 * slightly from a plain integer division.
 *
 * accum:   Accumulator to update; 64 bits are read from and written back to
 *          this address.
 * divisor: Must be in the range [0..256] inclusive, since it is used directly
 *          as an index into invdiv16 []. A divisor of zero produces an
 *          all-zero accumulator.
 */
void
chafa_color_accum_div_scalar_avx2 (ChafaColorAccum *accum, guint16 divisor)
{
    __m128i accum_128, divisor_128;

    /* invdiv16 [1] is 32768, which wraps to -32768 when placed in a signed
     * 16-bit lane and would make the multiplication below negate every
     * channel. Dividing by one is the identity, so leave the accumulator
     * untouched instead. */
    if (divisor == 1)
        return;

    /* _mm_loadl_epi64 () / _mm_storel_epi64 () move the low 64 bits without
     * dereferencing the accumulator through a guint64 pointer. */
    accum_128 = _mm_loadl_epi64 ((const __m128i *) accum);
    divisor_128 = _mm_set1_epi16 (invdiv16 [divisor]);
    accum_128 = _mm_mulhrs_epi16 (accum_128, divisor_128);
    _mm_storel_epi64 ((__m128i *) accum, accum_128);
}
18 changes: 14 additions & 4 deletions chafa/internal/chafa-color.c
Expand Up @@ -22,6 +22,7 @@
#include <stdlib.h> /* abs */
#include <math.h> /* pow, cbrt, log, sqrt, atan2, cos, sin */
#include "chafa.h"
#include "internal/chafa-private.h"
#include "internal/chafa-color.h"

guint32
Expand All @@ -46,10 +47,19 @@ chafa_unpack_color (guint32 packed, ChafaColor *color_out)
/* Divides all four channels of the accumulator by scalar, in place.
 *
 * A scalar of one leaves the accumulator unchanged, and a scalar of zero
 * sets every channel to zero (the AVX2 reciprocal table defines divide by
 * zero as zero, and an integer division by zero would be undefined).
 *
 * When built with AVX2 intrinsics and the CPU supports them, scalars in the
 * range accepted by the AVX2 implementation ([0..256]) are handed to it;
 * every other value uses plain integer division.
 */
void
chafa_color_accum_div_scalar (ChafaColorAccum *accum, gint scalar)
{
    gint i;

    if (scalar == 1)
        return;

    if (scalar == 0)
    {
        for (i = 0; i < 4; i++)
            accum->ch [i] = 0;
        return;
    }

#ifdef HAVE_AVX2_INTRINSICS
    /* The AVX2 implementation indexes a 257-entry table with the divisor,
     * so only forward values it can look up. */
    if (scalar > 0 && scalar <= 256 && chafa_have_avx2 ())
    {
        chafa_color_accum_div_scalar_avx2 (accum, (guint16) scalar);
        return;
    }
#endif

    for (i = 0; i < 4; i++)
        accum->ch [i] /= scalar;
}

typedef struct
Expand Down
1 change: 1 addition & 0 deletions chafa/internal/chafa-private.h
Expand Up @@ -219,6 +219,7 @@ gint calc_error_avx2 (const ChafaPixel *pixels, const ChafaColorPair *color_pair
const guint32 *sym_mask_u32) G_GNUC_PURE;
void calc_colors_avx2 (const ChafaPixel *pixels, ChafaColorAccum *accums_out,
const guint32 *sym_mask_u32);
/* Divides the accumulator in place by divisor, which must be in the range
 * [0..256] inclusive. */
void chafa_color_accum_div_scalar_avx2 (ChafaColorAccum *accum, guint16 divisor);
#endif

#if defined(HAVE_POPCNT64_INTRINSICS) || defined(HAVE_POPCNT32_INTRINSICS)
Expand Down

0 comments on commit 301a715

Please sign in to comment.