// boo/lib/audiodev/AudioMatrixSSE.cpp
//
// Last touched by commit baff71cdc3 (Lioncash, 2019-08-19):
//   "General: Tidy up includes"
//   Alphabetizes includes and resolves quite a few instances of indirect
//   inclusions, making the requirements of several interfaces explicit. This
//   also trims out includes that aren't actually necessary (likely due to
//   changes in the API over time).

#include "lib/audiodev/AudioMatrix.hpp"
#include "lib/audiodev/AudioVoiceEngine.hpp"
#include <immintrin.h>
namespace boo {
// 16-byte four-lane float vector, viewable either as scalar lanes (v) for the
// fallback paths or as SSE register types (q / d) for the intrinsic paths.
// NOTE(review): `#if __SSE__` evaluates to 0 when the macro is undefined, so
// the union silently loses its SIMD members on non-SSE builds even though the
// code below uses `.q` unconditionally — presumably this TU is only compiled
// with SSE enabled; confirm.
union TVectorUnion {
  float v[4];
#if __SSE__
  __m128 q;
  __m64 d[2];
#endif
};
// Saturation bounds used to clamp the float mix accumulator before the
// truncating convert back to int32 (_mm_cvttps_epi32).
//
// Fix: INT32_MAX (2147483647) is not exactly representable as a float — it
// rounds up to 2^31, and _mm_cvttps_epi32 of 2^31 overflows to the indefinite
// value INT32_MIN, so a loud positive peak wrapped to full-scale negative.
// (The int-to-float conversion in the braced initializer was also a lossy
// narrowing conversion.) Clamp instead to 2147483520.0f, the largest float
// strictly below 2^31, which converts to a valid positive int32.
// INT32_MIN (-2^31) is exactly representable, so the lower bound is unchanged.
static constexpr TVectorUnion Min32Vec = {{-2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f}};
static constexpr TVectorUnion Max32Vec = {{2147483520.0f, 2147483520.0f, 2147483520.0f, 2147483520.0f}};
void AudioMatrixMono::setDefaultMatrixCoefficients(AudioChannelSet acSet) {
  // Resets slew state and installs the default mono routing for the given
  // speaker layout: equal level to front-left/front-right for stereo-style
  // layouts, front-center only for the surround layouts. All other channel
  // coefficients are zeroed.
  m_curSlewFrame = 0;
  m_slewFrames = 0;
  // _mm_setzero_ps zeroes the lanes without first reading the previous
  // (possibly uninitialized) coefficient values, which the old
  // xor-with-self idiom did.
  m_coefs.q[0] = _mm_setzero_ps();
  m_coefs.q[1] = _mm_setzero_ps();
  switch (acSet) {
  case AudioChannelSet::Stereo:
  case AudioChannelSet::Quad:
    m_coefs.v[int(AudioChannel::FrontLeft)] = 1.f;
    m_coefs.v[int(AudioChannel::FrontRight)] = 1.f;
    break;
  case AudioChannelSet::Surround51:
  case AudioChannelSet::Surround71:
    m_coefs.v[int(AudioChannel::FrontCenter)] = 1.f;
    break;
  default:
    break;
  }
}
int16_t* AudioMatrixMono::mixMonoSampleData(const AudioVoiceEngineMixInfo& info, const int16_t* dataIn,
int16_t* dataOut, size_t samples) {
const ChannelMap& chmap = info.m_channelMap;
for (size_t s = 0; s < samples; ++s, ++dataIn) {
if (m_slewFrames && m_curSlewFrame < m_slewFrames) {
double t = m_curSlewFrame / double(m_slewFrames);
double omt = 1.0 - t;
for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
AudioChannel ch = chmap.m_channels[c];
if (ch != AudioChannel::Unknown) {
*dataOut = Clamp16(*dataOut + *dataIn * (m_coefs.v[int(ch)] * t + m_oldCoefs.v[int(ch)] * omt));
++dataOut;
}
}
++m_curSlewFrame;
} else {
for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
AudioChannel ch = chmap.m_channels[c];
if (ch != AudioChannel::Unknown) {
*dataOut = Clamp16(*dataOut + *dataIn * m_coefs.v[int(ch)]);
++dataOut;
}
}
}
}
return dataOut;
}
int32_t* AudioMatrixMono::mixMonoSampleData(const AudioVoiceEngineMixInfo& info, const int32_t* dataIn,
                                            int32_t* dataOut, size_t samples) {
  // Mixes `samples` mono 32-bit samples into the interleaved int32 output,
  // scaling by the per-channel matrix coefficients and saturating to the
  // int32 range via the Min32Vec/Max32Vec clamp before the truncating
  // float->int convert. SSE fast paths handle the 2/4/6/8-channel layouts;
  // any other layout uses the scalar loop. While a slew is active,
  // coefficients blend linearly from m_oldCoefs toward m_coefs.
  // Returns the advanced output pointer.
  //
  // NOTE(review): the SIMD paths below build their sample vector from
  // SUCCESSIVE input samples (dataIn[0..3]) while the scalar default path
  // multiplies the single current sample by every channel coefficient — the
  // two paths disagree; verify which behavior is intended. The 4/6/8-channel
  // loads also read up to dataIn[3] even though only one sample is consumed
  // per iteration, over-reading near the end of the input buffer —
  // presumably the engine over-allocates; confirm.
  const ChannelMap& chmap = info.m_channelMap;
  for (size_t s = 0; s < samples; ++s, ++dataIn) {
    if (m_slewFrames && m_curSlewFrame < m_slewFrames) {
      // NOTE(review): this overload computes the slew blend in float while
      // the int16 overload uses double — confirm the precision difference is
      // intentional.
      float t = m_curSlewFrame / float(m_slewFrames);
      float omt = 1.f - t;
      switch (chmap.m_channelCount) {
      case 2: {
        // Stereo: two mono samples (two output frames) per iteration, with
        // the slew advanced once between them (t for frame 0, t2 for frame 1).
        // NOTE(review): when `samples` is odd the final iteration still reads
        // dataIn[1] and writes four outputs — confirm even counts are
        // guaranteed by the caller.
        ++m_curSlewFrame;
        float t2 = m_curSlewFrame / float(m_slewFrames);
        float omt2 = 1.f - t2;
        TVectorUnion coefs, samps;
        // Coefficient lanes come out as (L, R, L, R).
        coefs.q = _mm_add_ps(
            _mm_mul_ps(_mm_shuffle_ps(m_coefs.q[0], m_coefs.q[0], _MM_SHUFFLE(1, 0, 1, 0)), _mm_set_ps(t, t, t2, t2)),
            _mm_mul_ps(_mm_shuffle_ps(m_oldCoefs.q[0], m_oldCoefs.q[0], _MM_SHUFFLE(1, 0, 1, 0)),
                       _mm_set_ps(omt, omt, omt2, omt2)));
        // NOTE(review): these sample lanes come out as (s0, s1, s0, s1); for
        // frame-interleaved (L0, R0, L1, R1) output one would expect
        // (s0, s0, s1, s1), and the t/t2 lane placement above looks inverted
        // the same way — verify against the scalar fallback's behavior.
        samps.q = _mm_cvtepi32_ps(_mm_set_epi32(dataIn[1], dataIn[0], dataIn[1], dataIn[0]));
        __m128i* out = reinterpret_cast<__m128i*>(dataOut);
        // Accumulate in float, clamp to the int32 range, truncate back.
        __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q));
        _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 4;
        ++s;
        ++dataIn;
        break;
      }
      case 4: {
        // Quad: blended coefficients for channels 0-3 in one register.
        TVectorUnion coefs, samps;
        coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[0], _mm_set1_ps(t)), _mm_mul_ps(m_oldCoefs.q[0], _mm_set1_ps(omt)));
        samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(dataIn)));
        __m128i* out = reinterpret_cast<__m128i*>(dataOut);
        __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q));
        _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 4;
        break;
      }
      case 6: {
        // 5.1: channels 0-3 from coefficient register q[0]...
        TVectorUnion coefs, samps;
        coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[0], _mm_set1_ps(t)), _mm_mul_ps(m_oldCoefs.q[0], _mm_set1_ps(omt)));
        samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(dataIn)));
        __m128i* out = reinterpret_cast<__m128i*>(dataOut);
        __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q));
        _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 4;
        // ...then channels 4-5 from q[1]; only the low two lanes are stored
        // (_mm_storel_epi64 writes 8 bytes = two int32 outputs).
        coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[1], _mm_set1_ps(t)), _mm_mul_ps(m_oldCoefs.q[1], _mm_set1_ps(omt)));
        samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(dataIn)));
        out = reinterpret_cast<__m128i*>(dataOut);
        // Full 16-byte load of an 8-byte destination region; only the low
        // half of the result is stored back.
        __m128i loadOut = _mm_loadu_si128(out);
        pre = _mm_add_ps(_mm_cvtepi32_ps(loadOut), _mm_mul_ps(coefs.q, samps.q));
        _mm_storel_epi64(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 2;
        break;
      }
      case 8: {
        // 7.1: channels 0-3 from q[0], channels 4-7 from q[1].
        TVectorUnion coefs, samps;
        coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[0], _mm_set1_ps(t)), _mm_mul_ps(m_oldCoefs.q[0], _mm_set1_ps(omt)));
        samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(dataIn)));
        __m128i* out = reinterpret_cast<__m128i*>(dataOut);
        __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q));
        _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 4;
        coefs.q = _mm_add_ps(_mm_mul_ps(m_coefs.q[1], _mm_set1_ps(t)), _mm_mul_ps(m_oldCoefs.q[1], _mm_set1_ps(omt)));
        samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(dataIn)));
        out = reinterpret_cast<__m128i*>(dataOut);
        pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(coefs.q, samps.q));
        _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 4;
        break;
      }
      default: {
        // Scalar fallback: one mono sample scaled into each mapped channel.
        for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
          AudioChannel ch = chmap.m_channels[c];
          if (ch != AudioChannel::Unknown) {
            *dataOut = Clamp32(*dataOut + *dataIn * (m_coefs.v[int(ch)] * t + m_oldCoefs.v[int(ch)] * omt));
            ++dataOut;
          }
        }
        break;
      }
      }
      ++m_curSlewFrame;
    } else {
      // No slew active: apply m_coefs directly. Same per-layout structure as
      // above (and the same NOTE(review) concerns apply).
      switch (chmap.m_channelCount) {
      case 2: {
        TVectorUnion coefs, samps;
        coefs.q = _mm_shuffle_ps(m_coefs.q[0], m_coefs.q[0], _MM_SHUFFLE(1, 0, 1, 0));
        samps.q = _mm_cvtepi32_ps(_mm_set_epi32(dataIn[1], dataIn[0], dataIn[1], dataIn[0]));
        __m128i* out = reinterpret_cast<__m128i*>(dataOut);
        // (sic: placeholder temporary names retained from the original.)
        __m128i huh2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(out));
        __m128 huh3 = _mm_cvtepi32_ps(huh2);
        __m128 pre = _mm_add_ps(huh3, _mm_mul_ps(coefs.q, samps.q));
        _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 4;
        ++s;
        ++dataIn;
        break;
      }
      case 4: {
        TVectorUnion samps;
        samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(dataIn)));
        __m128i* out = reinterpret_cast<__m128i*>(dataOut);
        __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[0], samps.q));
        _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 4;
        break;
      }
      case 6: {
        TVectorUnion samps;
        samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(dataIn)));
        __m128i* out = reinterpret_cast<__m128i*>(dataOut);
        __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[0], samps.q));
        _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 4;
        samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(dataIn)));
        out = reinterpret_cast<__m128i*>(dataOut);
        __m128i loadOut = _mm_loadu_si128(out);
        pre = _mm_add_ps(_mm_cvtepi32_ps(loadOut), _mm_mul_ps(m_coefs.q[1], samps.q));
        _mm_storel_epi64(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 2;
        break;
      }
      case 8: {
        TVectorUnion samps;
        samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(dataIn)));
        __m128i* out = reinterpret_cast<__m128i*>(dataOut);
        __m128 pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[0], samps.q));
        _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 4;
        samps.q = _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(dataIn)));
        out = reinterpret_cast<__m128i*>(dataOut);
        pre = _mm_add_ps(_mm_cvtepi32_ps(_mm_loadu_si128(out)), _mm_mul_ps(m_coefs.q[1], samps.q));
        _mm_storeu_si128(out, _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(pre, Min32Vec.q), Max32Vec.q)));
        dataOut += 4;
        break;
      }
      default: {
        for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
          AudioChannel ch = chmap.m_channels[c];
          if (ch != AudioChannel::Unknown) {
            *dataOut = Clamp32(*dataOut + *dataIn * m_coefs.v[int(ch)]);
            ++dataOut;
          }
        }
        break;
      }
      }
    }
  }
  return dataOut;
}
float* AudioMatrixMono::mixMonoSampleData(const AudioVoiceEngineMixInfo& info, const float* dataIn, float* dataOut,
                                          size_t samples) {
  // Mixes `samples` mono float samples into the interleaved float output,
  // scaling each sample by the matrix coefficient of every mapped output
  // channel. While a slew is active, coefficients blend linearly from
  // m_oldCoefs toward m_coefs. A stereo (2-channel) SSE fast path processes
  // two frames per iteration; other layouts use the scalar loop.
  // Returns the advanced output pointer.
  //
  // Fixes vs. the previous revision (derived from the scalar fallback, which
  // is the reference behavior: each output frame is ONE input sample times
  // both channel coefficients):
  //  - The sample shuffle used _MM_SHUFFLE(1, 0, 1, 0), producing lanes
  //    (s0, s1, s0, s1) against (L, R, L, R) coefficients, so frame 0's right
  //    channel got sample 1 and frame 1's left channel got sample 0. It now
  //    uses _MM_SHUFFLE(1, 1, 0, 0) -> (s0, s0, s1, s1).
  //  - The slew blend lanes were inverted (frame 0 received t2); frame 0 now
  //    uses t and frame 1 uses t2.
  //  - An odd trailing sample no longer takes the two-frame path, which read
  //    dataIn[1] and wrote four outputs past the valid region.
  const ChannelMap& chmap = info.m_channelMap;
  for (size_t s = 0; s < samples; ++s, ++dataIn) {
    if (m_slewFrames && m_curSlewFrame < m_slewFrames) {
      float t = m_curSlewFrame / float(m_slewFrames);
      float omt = 1.f - t;
      if (chmap.m_channelCount == 2 && s + 1 < samples) {
        // Two frames per iteration; advance the slew once between them so
        // frame 0 blends with t and frame 1 with t2.
        ++m_curSlewFrame;
        float t2 = m_curSlewFrame / float(m_slewFrames);
        float omt2 = 1.f - t2;
        TVectorUnion coefs, samps;
        // Coefficient lanes: (L, R, L, R); _mm_set_ps lists lanes high->low,
        // so (t2, t2, t, t) puts t in the low (frame 0) lanes.
        coefs.q = _mm_add_ps(
            _mm_mul_ps(_mm_shuffle_ps(m_coefs.q[0], m_coefs.q[0], _MM_SHUFFLE(1, 0, 1, 0)),
                       _mm_set_ps(t2, t2, t, t)),
            _mm_mul_ps(_mm_shuffle_ps(m_oldCoefs.q[0], m_oldCoefs.q[0], _MM_SHUFFLE(1, 0, 1, 0)),
                       _mm_set_ps(omt2, omt2, omt, omt)));
        samps.q = _mm_loadu_ps(dataIn);
        // Duplicate each mono sample across its frame's two channel lanes:
        // (s0, s0, s1, s1) to match the (L, R, L, R) coefficient layout.
        samps.q = _mm_shuffle_ps(samps.q, samps.q, _MM_SHUFFLE(1, 1, 0, 0));
        __m128 pre = _mm_add_ps(_mm_loadu_ps(dataOut), _mm_mul_ps(coefs.q, samps.q));
        _mm_storeu_ps(dataOut, pre);
        dataOut += 4;
        ++s;
        ++dataIn;
      } else {
        for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
          AudioChannel ch = chmap.m_channels[c];
          if (ch != AudioChannel::Unknown) {
            *dataOut = *dataOut + *dataIn * (m_coefs.v[int(ch)] * t + m_oldCoefs.v[int(ch)] * omt);
            ++dataOut;
          }
        }
      }
      ++m_curSlewFrame;
    } else {
      if (chmap.m_channelCount == 2 && s + 1 < samples) {
        TVectorUnion coefs, samps;
        coefs.q = _mm_shuffle_ps(m_coefs.q[0], m_coefs.q[0], _MM_SHUFFLE(1, 0, 1, 0));
        samps.q = _mm_loadu_ps(dataIn);
        samps.q = _mm_shuffle_ps(samps.q, samps.q, _MM_SHUFFLE(1, 1, 0, 0));
        __m128 pre = _mm_add_ps(_mm_loadu_ps(dataOut), _mm_mul_ps(coefs.q, samps.q));
        _mm_storeu_ps(dataOut, pre);
        dataOut += 4;
        ++s;
        ++dataIn;
      } else {
        for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
          AudioChannel ch = chmap.m_channels[c];
          if (ch != AudioChannel::Unknown) {
            *dataOut = *dataOut + *dataIn * m_coefs.v[int(ch)];
            ++dataOut;
          }
        }
      }
    }
  }
  return dataOut;
}
void AudioMatrixStereo::setDefaultMatrixCoefficients(AudioChannelSet acSet) {
  // Resets slew state and installs pass-through routing: the stereo source's
  // left input feeds FrontLeft and its right input feeds FrontRight, for
  // every supported speaker layout. All other coefficients are zeroed.
  m_curSlewFrame = 0;
  m_slewFrames = 0;
  // Zero all coefficient lanes without reading the previous (possibly
  // uninitialized) values, as the old xor-with-self idiom did.
  m_coefs.q[0] = _mm_setzero_ps();
  m_coefs.q[1] = _mm_setzero_ps();
  m_coefs.q[2] = _mm_setzero_ps();
  m_coefs.q[3] = _mm_setzero_ps();
  switch (acSet) {
  // The Stereo/Quad and Surround51/71 groups were byte-identical duplicates;
  // they are merged into one case group here with unchanged behavior.
  case AudioChannelSet::Stereo:
  case AudioChannelSet::Quad:
  case AudioChannelSet::Surround51:
  case AudioChannelSet::Surround71:
    m_coefs.v[int(AudioChannel::FrontLeft)][0] = 1.f;
    m_coefs.v[int(AudioChannel::FrontRight)][1] = 1.f;
    break;
  default:
    break;
  }
}
int16_t* AudioMatrixStereo::mixStereoSampleData(const AudioVoiceEngineMixInfo& info, const int16_t* dataIn,
                                                int16_t* dataOut, size_t frames) {
  // Mixes `frames` interleaved stereo frames (dataIn[0] = left, dataIn[1] =
  // right) into the output: each mapped channel accumulates
  // left * coef[ch][0] + right * coef[ch][1], saturated to int16. While a
  // slew is active, each coefficient blends linearly from m_oldCoefs toward
  // m_coefs. Returns the advanced output pointer.
  const ChannelMap& chmap = info.m_channelMap;
  for (size_t f = 0; f < frames; ++f, dataIn += 2) {
    if (m_slewFrames && m_curSlewFrame < m_slewFrames) {
      double t = m_curSlewFrame / double(m_slewFrames);
      double omt = 1.0 - t;
      for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
        AudioChannel ch = chmap.m_channels[c];
        if (ch != AudioChannel::Unknown) {
          // Fix: the right-coefficient term previously multiplied *dataIn
          // (the LEFT sample) by the right coefficient; it must read
          // dataIn[1], matching the non-slewed path below.
          *dataOut = Clamp16(*dataOut + dataIn[0] * (m_coefs.v[int(ch)][0] * t + m_oldCoefs.v[int(ch)][0] * omt) +
                             dataIn[1] * (m_coefs.v[int(ch)][1] * t + m_oldCoefs.v[int(ch)][1] * omt));
          ++dataOut;
        }
      }
      ++m_curSlewFrame;
    } else {
      for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
        AudioChannel ch = chmap.m_channels[c];
        if (ch != AudioChannel::Unknown) {
          *dataOut = Clamp16(*dataOut + dataIn[0] * m_coefs.v[int(ch)][0] + dataIn[1] * m_coefs.v[int(ch)][1]);
          ++dataOut;
        }
      }
    }
  }
  return dataOut;
}
int32_t* AudioMatrixStereo::mixStereoSampleData(const AudioVoiceEngineMixInfo& info, const int32_t* dataIn,
                                                int32_t* dataOut, size_t frames) {
  // Mixes `frames` interleaved stereo frames (dataIn[0] = left, dataIn[1] =
  // right) into the output: each mapped channel accumulates
  // left * coef[ch][0] + right * coef[ch][1], saturated to int32. While a
  // slew is active, each coefficient blends linearly from m_oldCoefs toward
  // m_coefs. Returns the advanced output pointer.
  const ChannelMap& chmap = info.m_channelMap;
  for (size_t f = 0; f < frames; ++f, dataIn += 2) {
    if (m_slewFrames && m_curSlewFrame < m_slewFrames) {
      double t = m_curSlewFrame / double(m_slewFrames);
      double omt = 1.0 - t;
      for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
        AudioChannel ch = chmap.m_channels[c];
        if (ch != AudioChannel::Unknown) {
          // Fix: the right-coefficient term previously multiplied *dataIn
          // (the LEFT sample) by the right coefficient; it must read
          // dataIn[1], matching the non-slewed path below.
          *dataOut = Clamp32(*dataOut + dataIn[0] * (m_coefs.v[int(ch)][0] * t + m_oldCoefs.v[int(ch)][0] * omt) +
                             dataIn[1] * (m_coefs.v[int(ch)][1] * t + m_oldCoefs.v[int(ch)][1] * omt));
          ++dataOut;
        }
      }
      ++m_curSlewFrame;
    } else {
      for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
        AudioChannel ch = chmap.m_channels[c];
        if (ch != AudioChannel::Unknown) {
          *dataOut = Clamp32(*dataOut + dataIn[0] * m_coefs.v[int(ch)][0] + dataIn[1] * m_coefs.v[int(ch)][1]);
          ++dataOut;
        }
      }
    }
  }
  return dataOut;
}
float* AudioMatrixStereo::mixStereoSampleData(const AudioVoiceEngineMixInfo& info, const float* dataIn, float* dataOut,
                                              size_t frames) {
  // Mixes `frames` interleaved stereo float frames (dataIn[0] = left,
  // dataIn[1] = right) into the output: each mapped channel accumulates
  // left * coef[ch][0] + right * coef[ch][1]. While a slew is active, each
  // coefficient blends linearly from m_oldCoefs toward m_coefs.
  // Returns the advanced output pointer.
  const ChannelMap& chmap = info.m_channelMap;
  for (size_t f = 0; f < frames; ++f, dataIn += 2) {
    if (m_slewFrames && m_curSlewFrame < m_slewFrames) {
      double t = m_curSlewFrame / double(m_slewFrames);
      double omt = 1.0 - t;
      for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
        AudioChannel ch = chmap.m_channels[c];
        if (ch != AudioChannel::Unknown) {
          // Fix: the right-coefficient term previously multiplied *dataIn
          // (the LEFT sample) by the right coefficient; it must read
          // dataIn[1], matching the non-slewed path below.
          *dataOut = *dataOut + dataIn[0] * (m_coefs.v[int(ch)][0] * t + m_oldCoefs.v[int(ch)][0] * omt) +
                     dataIn[1] * (m_coefs.v[int(ch)][1] * t + m_oldCoefs.v[int(ch)][1] * omt);
          ++dataOut;
        }
      }
      ++m_curSlewFrame;
    } else {
      for (unsigned c = 0; c < chmap.m_channelCount; ++c) {
        AudioChannel ch = chmap.m_channels[c];
        if (ch != AudioChannel::Unknown) {
          *dataOut = *dataOut + dataIn[0] * m_coefs.v[int(ch)][0] + dataIn[1] * m_coefs.v[int(ch)][1];
          ++dataOut;
        }
      }
    }
  }
  return dataOut;
}
} // namespace boo