diff --git a/CMakeLists.txt b/CMakeLists.txt index 6cd71c6..511d281 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,7 +183,8 @@ add_library(boo lib/inputdev/DeviceSignature.cpp include/boo/inputdev/DeviceSignature.hpp lib/inputdev/IHIDDevice.hpp lib/audiodev/AudioMatrix.hpp - lib/audiodev/AudioMatrix.cpp + #lib/audiodev/AudioMatrix.cpp + lib/audiodev/AudioMatrixSSE.cpp lib/audiodev/AudioVoiceEngine.hpp lib/audiodev/AudioVoiceEngine.cpp lib/audiodev/AudioVoice.hpp diff --git a/lib/audiodev/AudioMatrix.cpp b/lib/audiodev/AudioMatrix.cpp index e5a1598..5ed770d 100644 --- a/lib/audiodev/AudioMatrix.cpp +++ b/lib/audiodev/AudioMatrix.cpp @@ -9,17 +9,17 @@ void AudioMatrixMono::setDefaultMatrixCoefficients(AudioChannelSet acSet) { m_curSlewFrame = 0; m_slewFrames = 0; - memset(m_coefs, 0, sizeof(m_coefs)); + memset(&m_coefs, 0, sizeof(m_coefs)); switch (acSet) { case AudioChannelSet::Stereo: case AudioChannelSet::Quad: - m_coefs[int(AudioChannel::FrontLeft)] = 1.0; - m_coefs[int(AudioChannel::FrontRight)] = 1.0; + m_coefs.v[int(AudioChannel::FrontLeft)] = 1.0; + m_coefs.v[int(AudioChannel::FrontRight)] = 1.0; break; case AudioChannelSet::Surround51: case AudioChannelSet::Surround71: - m_coefs[int(AudioChannel::FrontCenter)] = 1.0; + m_coefs.v[int(AudioChannel::FrontCenter)] = 1.0; break; default: break; } @@ -41,7 +41,7 @@ int16_t* AudioMatrixMono::mixMonoSampleData(const AudioVoiceEngineMixInfo& info, AudioChannel ch = chmap.m_channels[c]; if (ch != AudioChannel::Unknown) { - *dataOut = Clamp16(*dataOut + *dataIn * (m_coefs[int(ch)] * t + m_oldCoefs[int(ch)] * omt)); + *dataOut = Clamp16(*dataOut + *dataIn * (m_coefs.v[int(ch)] * t + m_oldCoefs.v[int(ch)] * omt)); ++dataOut; } } @@ -55,7 +55,7 @@ int16_t* AudioMatrixMono::mixMonoSampleData(const AudioVoiceEngineMixInfo& info, AudioChannel ch = chmap.m_channels[c]; if (ch != AudioChannel::Unknown) { - *dataOut = Clamp16(*dataOut + *dataIn * m_coefs[int(ch)]); + *dataOut = Clamp16(*dataOut + *dataIn * 
m_coefs.v[int(ch)]); ++dataOut; } } @@ -80,7 +80,7 @@ int32_t* AudioMatrixMono::mixMonoSampleData(const AudioVoiceEngineMixInfo& info, AudioChannel ch = chmap.m_channels[c]; if (ch != AudioChannel::Unknown) { - *dataOut = Clamp32(*dataOut + *dataIn * (m_coefs[int(ch)] * t + m_oldCoefs[int(ch)] * omt)); + *dataOut = Clamp32(*dataOut + *dataIn * (m_coefs.v[int(ch)] * t + m_oldCoefs.v[int(ch)] * omt)); ++dataOut; } } @@ -94,7 +94,7 @@ int32_t* AudioMatrixMono::mixMonoSampleData(const AudioVoiceEngineMixInfo& info, AudioChannel ch = chmap.m_channels[c]; if (ch != AudioChannel::Unknown) { - *dataOut = Clamp32(*dataOut + *dataIn * m_coefs[int(ch)]); + *dataOut = Clamp32(*dataOut + *dataIn * m_coefs.v[int(ch)]); ++dataOut; } } @@ -119,7 +119,7 @@ float* AudioMatrixMono::mixMonoSampleData(const AudioVoiceEngineMixInfo& info, AudioChannel ch = chmap.m_channels[c]; if (ch != AudioChannel::Unknown) { - *dataOut = ClampFlt(*dataOut + *dataIn * (m_coefs[int(ch)] * t + m_oldCoefs[int(ch)] * omt)); + *dataOut = ClampFlt(*dataOut + *dataIn * (m_coefs.v[int(ch)] * t + m_oldCoefs.v[int(ch)] * omt)); ++dataOut; } } @@ -133,7 +133,7 @@ float* AudioMatrixMono::mixMonoSampleData(const AudioVoiceEngineMixInfo& info, AudioChannel ch = chmap.m_channels[c]; if (ch != AudioChannel::Unknown) { - *dataOut = ClampFlt(*dataOut + *dataIn * m_coefs[int(ch)]); + *dataOut = ClampFlt(*dataOut + *dataIn * m_coefs.v[int(ch)]); ++dataOut; } } @@ -146,18 +146,18 @@ void AudioMatrixStereo::setDefaultMatrixCoefficients(AudioChannelSet acSet) { m_curSlewFrame = 0; m_slewFrames = 0; - memset(m_coefs, 0, sizeof(m_coefs)); + memset(&m_coefs, 0, sizeof(m_coefs)); switch (acSet) { case AudioChannelSet::Stereo: case AudioChannelSet::Quad: - m_coefs[int(AudioChannel::FrontLeft)][0] = 1.0; - m_coefs[int(AudioChannel::FrontRight)][1] = 1.0; + m_coefs.v[int(AudioChannel::FrontLeft)][0] = 1.0; + m_coefs.v[int(AudioChannel::FrontRight)][1] = 1.0; break; case AudioChannelSet::Surround51: case 
AudioChannelSet::Surround71: - m_coefs[int(AudioChannel::FrontLeft)][0] = 1.0; - m_coefs[int(AudioChannel::FrontRight)][1] = 1.0; + m_coefs.v[int(AudioChannel::FrontLeft)][0] = 1.0; + m_coefs.v[int(AudioChannel::FrontRight)][1] = 1.0; break; default: break; } @@ -180,8 +180,8 @@ int16_t* AudioMatrixStereo::mixStereoSampleData(const AudioVoiceEngineMixInfo& i if (ch != AudioChannel::Unknown) { *dataOut = Clamp16(*dataOut + - *dataIn * (m_coefs[int(ch)][0] * t + m_oldCoefs[int(ch)][0] * omt) + - *dataIn * (m_coefs[int(ch)][1] * t + m_oldCoefs[int(ch)][1] * omt)); + *dataIn * (m_coefs.v[int(ch)][0] * t + m_oldCoefs.v[int(ch)][0] * omt) + + *dataIn * (m_coefs.v[int(ch)][1] * t + m_oldCoefs.v[int(ch)][1] * omt)); ++dataOut; } } @@ -196,8 +196,8 @@ int16_t* AudioMatrixStereo::mixStereoSampleData(const AudioVoiceEngineMixInfo& i if (ch != AudioChannel::Unknown) { *dataOut = Clamp16(*dataOut + - dataIn[0] * m_coefs[int(ch)][0] + - dataIn[1] * m_coefs[int(ch)][1]); + dataIn[0] * m_coefs.v[int(ch)][0] + + dataIn[1] * m_coefs.v[int(ch)][1]); ++dataOut; } } @@ -223,8 +223,8 @@ int32_t* AudioMatrixStereo::mixStereoSampleData(const AudioVoiceEngineMixInfo& i if (ch != AudioChannel::Unknown) { *dataOut = Clamp32(*dataOut + - *dataIn * (m_coefs[int(ch)][0] * t + m_oldCoefs[int(ch)][0] * omt) + - *dataIn * (m_coefs[int(ch)][1] * t + m_oldCoefs[int(ch)][1] * omt)); + *dataIn * (m_coefs.v[int(ch)][0] * t + m_oldCoefs.v[int(ch)][0] * omt) + + *dataIn * (m_coefs.v[int(ch)][1] * t + m_oldCoefs.v[int(ch)][1] * omt)); ++dataOut; } } @@ -239,8 +239,8 @@ int32_t* AudioMatrixStereo::mixStereoSampleData(const AudioVoiceEngineMixInfo& i if (ch != AudioChannel::Unknown) { *dataOut = Clamp32(*dataOut + - dataIn[0] * m_coefs[int(ch)][0] + - dataIn[1] * m_coefs[int(ch)][1]); + dataIn[0] * m_coefs.v[int(ch)][0] + + dataIn[1] * m_coefs.v[int(ch)][1]); ++dataOut; } } @@ -266,8 +266,8 @@ float* AudioMatrixStereo::mixStereoSampleData(const AudioVoiceEngineMixInfo& inf if (ch != AudioChannel::Unknown) { 
*dataOut = ClampFlt(*dataOut + - *dataIn * (m_coefs[int(ch)][0] * t + m_oldCoefs[int(ch)][0] * omt) + - *dataIn * (m_coefs[int(ch)][1] * t + m_oldCoefs[int(ch)][1] * omt)); + *dataIn * (m_coefs.v[int(ch)][0] * t + m_oldCoefs.v[int(ch)][0] * omt) + + *dataIn * (m_coefs.v[int(ch)][1] * t + m_oldCoefs.v[int(ch)][1] * omt)); ++dataOut; } } @@ -282,8 +282,8 @@ float* AudioMatrixStereo::mixStereoSampleData(const AudioVoiceEngineMixInfo& inf if (ch != AudioChannel::Unknown) { *dataOut = ClampFlt(*dataOut + - dataIn[0] * m_coefs[int(ch)][0] + - dataIn[1] * m_coefs[int(ch)][1]); + dataIn[0] * m_coefs.v[int(ch)][0] + + dataIn[1] * m_coefs.v[int(ch)][1]); ++dataOut; } } diff --git a/lib/audiodev/AudioMatrix.hpp b/lib/audiodev/AudioMatrix.hpp index 68487bf..580b283 100644 --- a/lib/audiodev/AudioMatrix.hpp +++ b/lib/audiodev/AudioMatrix.hpp @@ -6,6 +6,10 @@ #include #include +#if __SSE__ +#include +#endif + namespace boo { struct AudioVoiceEngineMixInfo; @@ -39,8 +43,16 @@ static inline float ClampFlt(float in) class AudioMatrixMono { - float m_coefs[8] = {}; - float m_oldCoefs[8] = {}; + union Coefs + { + float v[8]; +#if __SSE__ + __m128 q[2]; + __m64 d[4]; +#endif + }; + Coefs m_coefs = {}; + Coefs m_oldCoefs = {}; size_t m_slewFrames = 0; size_t m_curSlewFrame = 0; public: @@ -51,11 +63,18 @@ public: { m_slewFrames = slewFrames; m_curSlewFrame = 0; +#if __SSE__ + m_oldCoefs.q[0] = m_coefs.q[0]; + m_oldCoefs.q[1] = m_coefs.q[1]; + m_coefs.q[0] = _mm_loadu_ps(coefs); + m_coefs.q[1] = _mm_loadu_ps(&coefs[4]); +#else for (int i=0 ; i<8 ; ++i) { - m_oldCoefs[i] = m_coefs[i]; - m_coefs[i] = coefs[i]; + m_oldCoefs.v[i] = m_coefs.v[i]; + m_coefs.v[i] = coefs[i]; } +#endif } int16_t* mixMonoSampleData(const AudioVoiceEngineMixInfo& info, @@ -68,8 +87,16 @@ public: class AudioMatrixStereo { - float m_coefs[8][2] = {}; - float m_oldCoefs[8][2] = {}; + union Coefs + { + float v[8][2]; +#if __SSE__ + __m128 q[4]; + __m64 d[8]; +#endif + }; + Coefs m_coefs = {}; + Coefs m_oldCoefs = {}; 
size_t m_slewFrames = 0; size_t m_curSlewFrame = 0; public: @@ -80,13 +107,24 @@ public: { m_slewFrames = slewFrames; m_curSlewFrame = 0; +#if __SSE__ + m_oldCoefs.q[0] = m_coefs.q[0]; + m_oldCoefs.q[1] = m_coefs.q[1]; + m_oldCoefs.q[2] = m_coefs.q[2]; + m_oldCoefs.q[3] = m_coefs.q[3]; + m_coefs.q[0] = _mm_loadu_ps(coefs[0]); + m_coefs.q[1] = _mm_loadu_ps(coefs[2]); + m_coefs.q[2] = _mm_loadu_ps(coefs[4]); + m_coefs.q[3] = _mm_loadu_ps(coefs[6]); +#else /* scalar fallback: coefs is the raw float array argument (not a Coefs union), so it is indexed directly */ for (int i=0 ; i<8 ; ++i) { - m_oldCoefs[i][0] = m_coefs[i][0]; - m_oldCoefs[i][1] = m_coefs[i][1]; - m_coefs[i][0] = coefs[i][0]; - m_coefs[i][1] = coefs[i][1]; + m_oldCoefs.v[i][0] = m_coefs.v[i][0]; + m_oldCoefs.v[i][1] = m_coefs.v[i][1]; + m_coefs.v[i][0] = coefs[i][0]; + m_coefs.v[i][1] = coefs[i][1]; } +#endif } int16_t* mixStereoSampleData(const AudioVoiceEngineMixInfo& info, diff --git a/lib/audiodev/AudioMatrixSSE.cpp b/lib/audiodev/AudioMatrixSSE.cpp new file mode 100644 index 0000000..9a40e2e --- /dev/null +++ b/lib/audiodev/AudioMatrixSSE.cpp @@ -0,0 +1,490 @@ +#include "AudioMatrix.hpp" +#include "AudioVoiceEngine.hpp" +#include + +#include + +namespace boo +{ + +typedef union +{ + float v[4]; +#if __SSE__ + __m128 q; + __m64 d[2]; +#endif +} TVectorUnion; + +static constexpr TVectorUnion ZeroVec = {}; +static constexpr TVectorUnion Min16Vec = {INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN}; +static constexpr TVectorUnion Max16Vec = {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX}; +static constexpr TVectorUnion Min32Vec = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN}; +static constexpr TVectorUnion Max32Vec = {2147483520.f, 2147483520.f, 2147483520.f, 2147483520.f}; /* largest float below 2^31: INT32_MAX is not representable as float and rounds up to 2^31, which _mm_cvttps_epi32 converts to 0x80000000 (INT32_MIN) */ +static constexpr TVectorUnion MinFltVec = {-1.f, -1.f, -1.f, -1.f}; +static constexpr TVectorUnion MaxFltVec = {1.f, 1.f, 1.f, 1.f}; + +void AudioMatrixMono::setDefaultMatrixCoefficients(AudioChannelSet acSet) +{ + m_curSlewFrame = 0; + m_slewFrames = 0; + m_coefs.q[0] = _mm_xor_ps(m_coefs.q[0], m_coefs.q[0]); + m_coefs.q[1] = _mm_xor_ps(m_coefs.q[1], 
m_coefs.q[1]); + switch (acSet) + { + case AudioChannelSet::Stereo: + case AudioChannelSet::Quad: + m_coefs.v[int(AudioChannel::FrontLeft)] = 1.0; + m_coefs.v[int(AudioChannel::FrontRight)] = 1.0; + break; + case AudioChannelSet::Surround51: + case AudioChannelSet::Surround71: + m_coefs.v[int(AudioChannel::FrontCenter)] = 1.0; + break; + default: break; + } +} + +int16_t* AudioMatrixMono::mixMonoSampleData(const AudioVoiceEngineMixInfo& info, + const int16_t* dataIn, int16_t* dataOut, size_t samples) +{ + const ChannelMap& chmap = info.m_channelMap; + for (size_t s=0 ; s(dataIn))); + samps.q = _mm_shuffle_ps(samps.q, samps.q, _MM_SHUFFLE(1, 0, 1, 0)); + + __m128i* out = reinterpret_cast<__m128i*>(dataOut); + __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q)); + _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 4; + ++s; + ++dataIn; + break; + } + case 4: + { + TVectorUnion coefs, samps; + coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[0], _mm_set1_ps(t)), + _mm_mul_ps(m_oldCoefs.q[0], _mm_set1_ps(omt))); + samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); + + __m128i* out = reinterpret_cast<__m128i*>(dataOut); + __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q)); + _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 4; + break; + } + case 6: + { + TVectorUnion coefs, samps; + coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[0], _mm_set1_ps(t)), + _mm_mul_ps(m_oldCoefs.q[0], _mm_set1_ps(omt))); + samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); + + __m128i* out = reinterpret_cast<__m128i*>(dataOut); + __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q)); + _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 4; + + coefs.q = 
_mm_add_ps(_mm_mul_ps(m_coefs.q[1], _mm_set1_ps(t)), + _mm_mul_ps(m_oldCoefs.q[1], _mm_set1_ps(omt))); + samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); + + out = reinterpret_cast<__m128i*>(dataOut); + __m128i loadOut = _mm_loadu_si128(out); + pre = _mm_add_ps(_mm_cvtepi32_ps(loadOut), _mm_mul_ps(coefs.q, samps.q)); + _mm_storel_epi64(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 2; + break; + } + case 8: + { + TVectorUnion coefs, samps; + coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[0], _mm_set1_ps(t)), + _mm_mul_ps(m_oldCoefs.q[0], _mm_set1_ps(omt))); + samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); + + __m128i* out = reinterpret_cast<__m128i*>(dataOut); + __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q)); + _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 4; + + coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[1], _mm_set1_ps(t)), + _mm_mul_ps(m_oldCoefs.q[1], _mm_set1_ps(omt))); + samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); + + out = reinterpret_cast<__m128i*>(dataOut); + pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q)); + _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 4; + break; + } + default: + { + for (unsigned c=0 ; c(dataIn))); + samps.q = _mm_shuffle_ps(samps.q, samps.q, _MM_SHUFFLE(1, 0, 1, 0)); + + __m128i* out = reinterpret_cast<__m128i*>(dataOut); + __m128i huh2 = _mm_loadu_si128(reinterpret_cast(out)); + __m128 huh3 = _mm_cvtepi32_ps(huh2); + __m128 pre = _mm_add_ps(huh3, _mm_mul_ps(coefs.q, samps.q)); + _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 4; + ++s; + ++dataIn; + break; + } + case 4: + { + TVectorUnion samps; + samps.q = 
_mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); + + __m128i* out = reinterpret_cast<__m128i*>(dataOut); + __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[0], samps.q)); + _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 4; + break; + } + case 6: + { + TVectorUnion samps; + samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); + + __m128i* out = reinterpret_cast<__m128i*>(dataOut); + __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[0], samps.q)); + _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 4; + + samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); + + out = reinterpret_cast<__m128i*>(dataOut); + __m128i loadOut = _mm_loadu_si128(out); + pre = _mm_add_ps(_mm_cvtepi32_ps(loadOut), _mm_mul_ps(m_coefs.q[1], samps.q)); + _mm_storel_epi64(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 2; + break; + } + case 8: + { + TVectorUnion samps; + samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); + + __m128i* out = reinterpret_cast<__m128i*>(dataOut); + __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[0], samps.q)); + _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 4; + + samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast(dataIn))); + + out = reinterpret_cast<__m128i*>(dataOut); + pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[1], samps.q)); + _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q))); + + dataOut += 4; + break; + } + default: + { + for (unsigned c=0 ; c