#include "AudioMatrix.hpp" #include "AudioVoiceEngine.hpp" #include #include namespace boo { typedef union { float v[4]; #if __SSE__ __m128 q; __m64 d[2]; #endif } TVectorUnion; static constexpr TVectorUnion ZeroVec = {}; static constexpr TVectorUnion Min16Vec = {INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN}; static constexpr TVectorUnion Max16Vec = {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX}; static constexpr TVectorUnion Min32Vec = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN}; static constexpr TVectorUnion Max32Vec = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX}; static constexpr TVectorUnion MinFltVec = {-1.f, -1.f, -1.f, -1.f}; static constexpr TVectorUnion MaxFltVec = {1.f, 1.f, 1.f, 1.f}; void AudioMatrixMono::setDefaultMatrixCoefficients(AudioChannelSet acSet) { m_curSlewFrame = 0; m_slewFrames = 0; m_coefs.q[0] = _mm_xor_ps(m_coefs.q[0], m_coefs.q[0]); m_coefs.q[1] = _mm_xor_ps(m_coefs.q[1], m_coefs.q[1]); switch (acSet) { case AudioChannelSet::Stereo: case AudioChannelSet::Quad: m_coefs.v[int(AudioChannel::FrontLeft)] = 1.0; m_coefs.v[int(AudioChannel::FrontRight)] = 1.0; break; case AudioChannelSet::Surround51: case AudioChannelSet::Surround71: m_coefs.v[int(AudioChannel::FrontCenter)] = 1.0; break; default: break; } } int16_t* AudioMatrixMono::mixMonoSampleData(const AudioVoiceEngineMixInfo& info, const int16_t* dataIn, int16_t* dataOut, size_t samples) { const ChannelMap& chmap = info.m_channelMap; for (size_t s=0 ; s(dataOut); __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q)); _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 4; ++s; ++dataIn; break; } case 4: { TVectorUnion coefs, samps; coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[0], _mm_set1_ps(t)), _mm_mul_ps(m_oldCoefs.q[0], _mm_set1_ps(omt))); samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); __m128i* out = reinterpret_cast<__m128i*>(dataOut); __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q)); _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 4; break; } case 6: { TVectorUnion coefs, samps; coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[0], _mm_set1_ps(t)), _mm_mul_ps(m_oldCoefs.q[0], _mm_set1_ps(omt))); samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); __m128i* out = reinterpret_cast<__m128i*>(dataOut); __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q)); _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 4; coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[1], _mm_set1_ps(t)), _mm_mul_ps(m_oldCoefs.q[1], _mm_set1_ps(omt))); samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); out = reinterpret_cast<__m128i*>(dataOut); __m128i loadOut = _mm_loadu_si128(out); pre = _mm_add_ps(_mm_cvtepi32_ps(loadOut), _mm_mul_ps(coefs.q, samps.q)); _mm_storel_epi64(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 2; break; } case 8: { TVectorUnion coefs, samps; coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[0], _mm_set1_ps(t)), _mm_mul_ps(m_oldCoefs.q[0], _mm_set1_ps(omt))); samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); __m128i* out = reinterpret_cast<__m128i*>(dataOut); __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q)); _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 4; coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[1], _mm_set1_ps(t)), _mm_mul_ps(m_oldCoefs.q[1], _mm_set1_ps(omt))); samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); out = reinterpret_cast<__m128i*>(dataOut); pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q)); _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 4; break; } default: { for (unsigned c=0 ; c(dataOut); __m128i huh2 = _mm_loadu_si128(reinterpret_cast(out)); __m128 huh3 = _mm_cvtepi32_ps(huh2); __m128 pre = _mm_add_ps(huh3, _mm_mul_ps(coefs.q, samps.q)); _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 4; ++s; ++dataIn; break; } case 4: { TVectorUnion samps; samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); __m128i* out = reinterpret_cast<__m128i*>(dataOut); __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[0], samps.q)); _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 4; break; } case 6: { TVectorUnion samps; samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); __m128i* out = reinterpret_cast<__m128i*>(dataOut); __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[0], samps.q)); _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 4; samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); out = reinterpret_cast<__m128i*>(dataOut); __m128i loadOut = _mm_loadu_si128(out); pre = _mm_add_ps(_mm_cvtepi32_ps(loadOut), _mm_mul_ps(m_coefs.q[1], samps.q)); _mm_storel_epi64(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 2; break; } case 8: { TVectorUnion samps; samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); __m128i* out = reinterpret_cast<__m128i*>(dataOut); __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[0], samps.q)); _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 4; samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); out = reinterpret_cast<__m128i*>(dataOut); pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[1], samps.q)); _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); dataOut += 4; break; } default: { for (unsigned c=0 ; c