Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AVX optimizations for the convertBuffer function #241

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
24 changes: 24 additions & 0 deletions CMakeLists.txt
Expand Up @@ -64,6 +64,30 @@ if (CMAKE_COMPILER_IS_GNUCXX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
endif (CMAKE_COMPILER_IS_GNUCXX)

#default SIMD configuration uses AVX2 build flags
if(NOT DEFAULT_SIMD_FLAGS)
set(DEFAULT_SIMD_FLAGS "AVX2")
endif()

SET(ENABLE_SIMD_FLAGS "${DEFAULT_SIMD_FLAGS}" CACHE STRING "Set compiler SIMD flags")
SET_PROPERTY(CACHE ENABLE_SIMD_FLAGS PROPERTY STRINGS none AVX2)

#set up according to your own system environment
#Windows
if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC"
OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC"))
if(${ENABLE_SIMD_FLAGS} MATCHES "AVX2")
add_definitions(/arch:AVX2 /D__AVX__)
message(STATUS "Enabling AVX2 instructions")
endif()
#Linux
else()
if(${ENABLE_SIMD_FLAGS} MATCHES "AVX")
add_definitions(-mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mpopcnt -mavx -mavx2 -mfma)
message(STATUS "Enabling AVX2 instructions")
endif()
endif()

# Add debug flags
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
add_definitions(-D__RTAUDIO_DEBUG__)
Expand Down
111 changes: 95 additions & 16 deletions RtAudio.cpp
Expand Up @@ -49,6 +49,13 @@
#include <cmath>
#include <algorithm>

#if __SSE2__
#include <emmintrin.h>
#endif
#if __AVX__
#include <immintrin.h>
#endif

// Static variable definitions.
const unsigned int RtApi::MAX_SAMPLE_RATES = 14;
const unsigned int RtApi::SAMPLE_RATES[] = {
Expand Down Expand Up @@ -537,7 +544,7 @@ struct CoreHandle {
bool internalDrain; // Indicates if stop is initiated from callback or not.

CoreHandle()
:deviceBuffer(0), drainCounter(0), internalDrain(false) { nStreams[0] = 1; nStreams[1] = 1; id[0] = 0; id[1] = 0; xrun[0] = false; xrun[1] = false; }
:deviceBuffer(0), drainCounter(0), internalDrain(false) { iStream[0] = 0; iStream[1] = 0; nStreams[0] = 1; nStreams[1] = 1; id[0] = 0; id[1] = 0; xrun[0] = false; xrun[1] = false; }
};

RtApiCore:: RtApiCore()
Expand Down Expand Up @@ -10386,6 +10393,12 @@ void RtApi :: setConvertInfo( StreamMode mode, unsigned int firstChannel )

void RtApi :: convertBuffer( char *outBuffer, char *inBuffer, ConvertInfo &info )
{
#ifdef __AVX__
static const float kBias[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
static const float kScale[8] = {32767.5f, 32767.5f, 32767.5f, 32767.5f, 32767.5f, 32767.5f, 32767.5f, 32767.5f};
static const float kScaleI[8] = {1.0f / 32767.5f, 1.0f / 32767.5f, 1.0f / 32767.5f, 1.0f / 32767.5f, 1.0f / 32767.5f, 1.0f / 32767.5f, 1.0f / 32767.5f, 1.0f / 32767.5f};
#endif

// This function does format conversion, input/output channel compensation, and
// data interleaving/deinterleaving. 24-bit integers are assumed to occupy
// the lower three bytes of a 32-bit integer.
Expand All @@ -10395,6 +10408,12 @@ void RtApi :: convertBuffer( char *outBuffer, char *inBuffer, ConvertInfo &info
( stream_.nDeviceChannels[0] < stream_.nDeviceChannels[1] ) )
memset( outBuffer, 0, stream_.bufferSize * info.outJump * formatBytes( info.outFormat ) );

if (info.outFormat == info.inFormat && info.channels == 1)
{
std::memcpy(outBuffer, inBuffer, stream_.bufferSize * formatBytes(info.outFormat));
return;
}

int j;
if (info.outFormat == RTAUDIO_FLOAT64) {
Float64 scale;
Expand Down Expand Up @@ -10493,15 +10512,44 @@ void RtApi :: convertBuffer( char *outBuffer, char *inBuffer, ConvertInfo &info
}
else if (info.inFormat == RTAUDIO_SINT16) {
Int16 *in = (Int16 *)inBuffer;
scale = (Float32) ( 1.0 / 32767.5 );
for (unsigned int i=0; i<stream_.bufferSize; i++) {
for (j=0; j<info.channels; j++) {
out[info.outOffset[j]] = (Float32) in[info.inOffset[j]];
out[info.outOffset[j]] += 0.5;
out[info.outOffset[j]] *= scale;
}
in += info.inJump;
out += info.outJump;
scale = (Float32) ( 1.0f / 32767.5f );
if (info.channels == 1)
{
#if __AVX__
if (stream_.bufferSize >= 8 && stream_.bufferSize % 8 == 0)
{
__m256 _bias = _mm256_broadcast_ss(kBias);
__m256 _scale = _mm256_broadcast_ss(kScaleI);
for (unsigned int i=0; i<stream_.bufferSize; i+=8) {
__m128i x = _mm_loadu_si128((const __m128i*)(in + i));
__m256i x_unpacked = _mm256_cvtepi16_epi32(x);
__m256 converted = _mm256_cvtepi32_ps(x_unpacked);
converted = _mm256_mul_ps(_mm256_add_ps(converted, _bias), _scale);
_mm256_store_ps(out + i, converted);
}
}
else{
for (unsigned int i=0; i<stream_.bufferSize; ++i) {
out[i] = ((Float32) in[i] + 0.5) * scale;
}
}
#else
for (unsigned int i=0; i<stream_.bufferSize; ++i) {
out[i] = ((Float32) in[i] + 0.5) * scale;
}
#endif
}
else
{
for (unsigned int i=0; i<stream_.bufferSize; i++) {
for (j=0; j<info.channels; j++) {
out[info.outOffset[j]] = (Float32) in[info.inOffset[j]];
out[info.outOffset[j]] += 0.5;
out[info.outOffset[j]] *= scale;
}
in += info.inJump;
out += info.outJump;
}
}
}
else if (info.inFormat == RTAUDIO_SINT24) {
Expand Down Expand Up @@ -10732,12 +10780,43 @@ void RtApi :: convertBuffer( char *outBuffer, char *inBuffer, ConvertInfo &info
}
else if (info.inFormat == RTAUDIO_FLOAT32) {
Float32 *in = (Float32 *)inBuffer;
for (unsigned int i=0; i<stream_.bufferSize; i++) {
for (j=0; j<info.channels; j++) {
out[info.outOffset[j]] = (Int16) (in[info.inOffset[j]] * 32767.5 - 0.5);
}
in += info.inJump;
out += info.outJump;
if (info.channels == 1)
{
#if __AVX__
if (stream_.bufferSize >= 8 && stream_.bufferSize % 8 == 0)
{
__m256 _bias = _mm256_broadcast_ss(kBias);
__m256 _scale = _mm256_broadcast_ss(kScale);
for (unsigned int i=0; i<stream_.bufferSize; i+=8) {
__m256 in_x8 = _mm256_load_ps(in + i);
__m256 scaled_bias = _mm256_fmsub_ps(in_x8, _scale, _bias);
__m256i x = _mm256_cvtps_epi32(scaled_bias);
__m128i xlo = _mm256_extractf128_si256(x, 0);
__m128i xhi = _mm256_extractf128_si256(x, 1);
__m128i converted = _mm_packs_epi32(xlo, xhi);
_mm_store_si128((__m128i *)(out + i), converted);
}
}
else{
for (unsigned int i=0; i<stream_.bufferSize; ++i) {
out[i] = (Int16) (in[i] * 32767.5 - 0.5);
}
}
#else
for (unsigned int i=0; i<stream_.bufferSize; ++i) {
out[i] = (Int16) (in[i] * 32767.5 - 0.5);
}
#endif
}
else
{
for (unsigned int i=0; i<stream_.bufferSize; i++) {
for (j=0; j<info.channels; j++) {
out[info.outOffset[j]] = (Int16) (in[info.inOffset[j]] * 32767.5 - 0.5);
}
in += info.inJump;
out += info.outJump;
}
}
}
else if (info.inFormat == RTAUDIO_FLOAT64) {
Expand Down