Skip to content

Commit

Permalink
Process 4 samples at a time in PhaseShifterT::process
Browse files (browse the repository at this point in the history)
This is implemented for SSE only for now; I'm not sure yet how to do it efficiently for NEON.
  • Loading branch information
kcat committed Mar 28, 2024
1 parent d000c0b commit 5979f18
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 49 deletions.
92 changes: 53 additions & 39 deletions common/phase_shifter.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ struct PhaseShifterT {
}
}

void process(al::span<float> dst, const float *RESTRICT src) const;
void process(const al::span<float> dst, const al::span<const float> src) const;

private:
#if defined(HAVE_NEON)
Expand All @@ -74,49 +74,64 @@ struct PhaseShifterT {
};

template<std::size_t S>
inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT src) const
inline
void PhaseShifterT<S>::process(const al::span<float> dst, const al::span<const float> src) const
{
auto in = src.begin();
#ifdef HAVE_SSE_INTRINSICS
if(std::size_t todo{dst.size()>>1})
if(const std::size_t todo{dst.size()>>2})
{
auto *out = reinterpret_cast<__m64*>(dst.data());
do {
__m128 r04{_mm_setzero_ps()};
__m128 r14{_mm_setzero_ps()};
auto out = al::span{reinterpret_cast<__m128*>(dst.data()), todo};
std::generate(out.begin(), out.end(), [&in,this]
{
__m128 r0{_mm_setzero_ps()};
__m128 r1{_mm_setzero_ps()};
__m128 r2{_mm_setzero_ps()};
__m128 r3{_mm_setzero_ps()};
for(std::size_t j{0};j < mCoeffs.size();j+=4)
{
const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
const __m128 s0{_mm_loadu_ps(&src[j*2])};
const __m128 s1{_mm_loadu_ps(&src[j*2 + 4])};
const __m128 s0{_mm_loadu_ps(&in[j*2])};
const __m128 s1{_mm_loadu_ps(&in[j*2 + 4])};
const __m128 s2{_mm_movehl_ps(_mm_movelh_ps(s1, s1), s0)};
const __m128 s3{_mm_loadh_pi(_mm_movehl_ps(s1, s1),
reinterpret_cast<const __m64*>(&in[j*2 + 8]))};

__m128 s{_mm_shuffle_ps(s0, s1, _MM_SHUFFLE(2, 0, 2, 0))};
r04 = _mm_add_ps(r04, _mm_mul_ps(s, coeffs));
r0 = _mm_add_ps(r0, _mm_mul_ps(s, coeffs));

s = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(3, 1, 3, 1));
r14 = _mm_add_ps(r14, _mm_mul_ps(s, coeffs));
}
src += 2;
r1 = _mm_add_ps(r1, _mm_mul_ps(s, coeffs));

__m128 r4{_mm_add_ps(_mm_unpackhi_ps(r04, r14), _mm_unpacklo_ps(r04, r14))};
r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
s = _mm_shuffle_ps(s2, s3, _MM_SHUFFLE(2, 0, 2, 0));
r2 = _mm_add_ps(r2, _mm_mul_ps(s, coeffs));

_mm_storel_pi(out, r4);
++out;
} while(--todo);
s = _mm_shuffle_ps(s2, s3, _MM_SHUFFLE(3, 1, 3, 1));
r3 = _mm_add_ps(r3, _mm_mul_ps(s, coeffs));
}
in += 4;

_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
return _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
});
}
if((dst.size()&1))
if(const std::size_t todo{dst.size()&3})
{
__m128 r4{_mm_setzero_ps()};
for(std::size_t j{0};j < mCoeffs.size();j+=4)
auto out = dst.last(todo);
std::generate(out.begin(), out.end(), [&in,this]
{
const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
const __m128 s{_mm_setr_ps(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])};
r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs));
}
r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));

dst.back() = _mm_cvtss_f32(r4);
__m128 r4{_mm_setzero_ps()};
for(std::size_t j{0};j < mCoeffs.size();j+=4)
{
const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
const __m128 s{_mm_setr_ps(in[j*2], in[j*2 + 2], in[j*2 + 4], in[j*2 + 6])};
r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs));
}
++in;
r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
return _mm_cvtss_f32(r4);
});
}

#elif defined(HAVE_NEON)
Expand All @@ -130,14 +145,14 @@ inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT
for(std::size_t j{0};j < mCoeffs.size();j+=4)
{
const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
const float32x4_t s0{vld1q_f32(&src[j*2])};
const float32x4_t s1{vld1q_f32(&src[j*2 + 4])};
const float32x4_t s0{vld1q_f32(&in[j*2])};
const float32x4_t s1{vld1q_f32(&in[j*2 + 4])};
const float32x4x2_t values{vuzpq_f32(s0, s1)};

r04 = vmlaq_f32(r04, values.val[0], coeffs);
r14 = vmlaq_f32(r14, values.val[1], coeffs);
}
src += 2;
in += 2;

float32x4_t r4{vaddq_f32(unpackhi(r04, r14), unpacklo(r04, r14))};
float32x2_t r2{vadd_f32(vget_low_f32(r4), vget_high_f32(r4))};
Expand All @@ -152,7 +167,7 @@ inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT
for(std::size_t j{0};j < mCoeffs.size();j+=4)
{
const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
const float32x4_t s{load4(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])};
const float32x4_t s{load4(in[j*2], in[j*2 + 2], in[j*2 + 4], in[j*2 + 6])};
r4 = vmlaq_f32(r4, s, coeffs);
}
r4 = vaddq_f32(r4, vrev64q_f32(r4));
Expand All @@ -161,15 +176,14 @@ inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT

#else

for(float &output : dst)
std::generate(dst.begin(), dst.end(), [&in,this]
{
float ret{0.0f};
for(std::size_t j{0};j < mCoeffs.size();++j)
ret += src[j*2] * mCoeffs[j];

output = ret;
++src;
}
ret += in[j*2] * mCoeffs[j];
++in;
return ret;
});
#endif
}

Expand Down
8 changes: 4 additions & 4 deletions core/uhjfilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ void UhjDecoder<N>::decode(const al::span<float*> samples, const size_t samplesT
[](const float d, const float t) noexcept { return 0.828331f*d + 0.767820f*t; });
if(updateState) LIKELY
std::copy_n(mTemp.cbegin()+samplesToDo, mDTHistory.size(), mDTHistory.begin());
PShift.process({xoutput, samplesToDo}, mTemp.data());
PShift.process({xoutput, samplesToDo}, mTemp);

/* W = 0.981532*S + 0.197484*j(0.828331*D + 0.767820*T) */
for(size_t i{0};i < samplesToDo;++i)
Expand All @@ -441,7 +441,7 @@ void UhjDecoder<N>::decode(const al::span<float*> samples, const size_t samplesT
std::copy_n(mS.cbegin(), samplesToDo+sInputPadding, tmpiter);
if(updateState) LIKELY
std::copy_n(mTemp.cbegin()+samplesToDo, mSHistory.size(), mSHistory.begin());
PShift.process({youtput, samplesToDo}, mTemp.data());
PShift.process({youtput, samplesToDo}, mTemp);

/* Y = 0.795968*D - 0.676392*T + j(0.186633*S) */
for(size_t i{0};i < samplesToDo;++i)
Expand Down Expand Up @@ -591,7 +591,7 @@ void UhjStereoDecoder<N>::decode(const al::span<float*> samples, const size_t sa
std::copy_n(mD.cbegin(), samplesToDo+sInputPadding, tmpiter);
if(updateState) LIKELY
std::copy_n(mTemp.cbegin()+samplesToDo, mDTHistory.size(), mDTHistory.begin());
PShift.process({xoutput, samplesToDo}, mTemp.data());
PShift.process({xoutput, samplesToDo}, mTemp);

/* W = 0.6098637*S - 0.6896511*j*w*D */
for(size_t i{0};i < samplesToDo;++i)
Expand All @@ -605,7 +605,7 @@ void UhjStereoDecoder<N>::decode(const al::span<float*> samples, const size_t sa
std::copy_n(mS.cbegin(), samplesToDo+sInputPadding, tmpiter);
if(updateState) LIKELY
std::copy_n(mTemp.cbegin()+samplesToDo, mSHistory.size(), mSHistory.begin());
PShift.process({youtput, samplesToDo}, mTemp.data());
PShift.process({youtput, samplesToDo}, mTemp);

/* Y = 1.6822415*w*D - 0.2156194*j*S */
for(size_t i{0};i < samplesToDo;++i)
Expand Down
8 changes: 4 additions & 4 deletions utils/uhjdecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ void UhjDecoder::decode(const al::span<const float> InSamples, const std::size_t
std::transform(mD.cbegin(), mD.cbegin()+SamplesToDo+sFilterDelay, mT.cbegin(), tmpiter,
[](const float d, const float t) noexcept { return 0.828331f*d + 0.767820f*t; });
std::copy_n(mTemp.cbegin()+SamplesToDo, mDTHistory.size(), mDTHistory.begin());
PShift.process(xoutput.first(SamplesToDo), mTemp.data());
PShift.process(xoutput.first(SamplesToDo), mTemp);

for(std::size_t i{0};i < SamplesToDo;++i)
{
Expand All @@ -260,7 +260,7 @@ void UhjDecoder::decode(const al::span<const float> InSamples, const std::size_t
tmpiter = std::copy(mSHistory.cbegin(), mSHistory.cend(), mTemp.begin());
std::copy_n(mS.cbegin(), SamplesToDo+sFilterDelay, tmpiter);
std::copy_n(mTemp.cbegin()+SamplesToDo, mSHistory.size(), mSHistory.begin());
PShift.process(youtput.first(SamplesToDo), mTemp.data());
PShift.process(youtput.first(SamplesToDo), mTemp);

for(std::size_t i{0};i < SamplesToDo;++i)
{
Expand Down Expand Up @@ -322,7 +322,7 @@ void UhjDecoder::decode2(const al::span<const float> InSamples,
auto tmpiter = std::copy(mDTHistory.cbegin(), mDTHistory.cend(), mTemp.begin());
std::copy_n(mD.cbegin(), SamplesToDo+sFilterDelay, tmpiter);
std::copy_n(mTemp.cbegin()+SamplesToDo, mDTHistory.size(), mDTHistory.begin());
PShift.process(xoutput.first(SamplesToDo), mTemp.data());
PShift.process(xoutput.first(SamplesToDo), mTemp);

for(std::size_t i{0};i < SamplesToDo;++i)
{
Expand All @@ -336,7 +336,7 @@ void UhjDecoder::decode2(const al::span<const float> InSamples,
tmpiter = std::copy(mSHistory.cbegin(), mSHistory.cend(), mTemp.begin());
std::copy_n(mS.cbegin(), SamplesToDo+sFilterDelay, tmpiter);
std::copy_n(mTemp.cbegin()+SamplesToDo, mSHistory.size(), mSHistory.begin());
PShift.process(youtput.first(SamplesToDo), mTemp.data());
PShift.process(youtput.first(SamplesToDo), mTemp);

for(std::size_t i{0};i < SamplesToDo;++i)
{
Expand Down
4 changes: 2 additions & 2 deletions utils/uhjencoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ void UhjEncoder::encode(const al::span<FloatBufferLine> OutSamples,
[](const float w, const float x) noexcept -> float
{ return -0.3420201f*w + 0.5098604f*x; });
std::copy_n(mTemp.cbegin()+SamplesToDo, mWXHistory1.size(), mWXHistory1.begin());
PShift.process(al::span{mD}.first(SamplesToDo), mTemp.data());
PShift.process(al::span{mD}.first(SamplesToDo), mTemp);

/* D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y */
for(size_t i{0};i < SamplesToDo;++i)
Expand All @@ -149,7 +149,7 @@ void UhjEncoder::encode(const al::span<FloatBufferLine> OutSamples,
[](const float w, const float x) noexcept -> float
{ return -0.1432f*w + 0.6512f*x; });
std::copy_n(mTemp.cbegin()+SamplesToDo, mWXHistory2.size(), mWXHistory2.begin());
PShift.process(al::span{mT}.first(SamplesToDo), mTemp.data());
PShift.process(al::span{mT}.first(SamplesToDo), mTemp);

/* T = j(-0.1432*W + 0.6512*X) - 0.7071068*Y */
auto t = al::span{OutSamples[2]};
Expand Down

0 comments on commit 5979f18

Please sign in to comment.