From 3925f8509ed470c1ff0e1329d2d9f9d88519f008 Mon Sep 17 00:00:00 2001
From: Jack Andersen
Date: Mon, 4 Jul 2016 12:28:59 -1000
Subject: [PATCH] alternate matrix transpose code-path for pre-SSE4.1

The SSE transpose routines of CMatrix3f and CMatrix4f now test
cpuFeatures().SSE41 at runtime and, when it is not set, forward to
*SSE3() variants defined in src/SSELegacy.cpp.

src/SSELegacy.cpp is built by a custom command with -msse3 (the other
sources keep -msse4.1 -msse4.2) and with -flto (/GL on MSVC) stripped
from the flags, so it is compiled separately from link-time optimization.

CMatrix4f::transposed() moves out of the header into src/CMatrix4f.cpp
so it can perform the same dispatch; it is now const-qualified.

---
Review notes (this section is ignored when the patch is applied):

 * CMatrix4f::transposedSSE3() was a copy of the CMatrix3f version: it
   returned a CMatrix3f and never read vec[3], dropping row/column 3.
   It now performs the same full 4x4 transpose as CMatrix4f::transposed().
 * ${CMAKE_CXX_FLAGS} / ${CUSTOM_FLAGS} are quoted where they are passed to
   string(REPLACE) and separate_arguments(), so an empty value no longer
   removes the argument.
 * The blob id on the src/SSELegacy.cpp "index" line has not been
   regenerated for the edited file contents.

 CMakeLists.txt             | 31 ++++++++++---
 include/zeus/CMatrix3f.hpp |  2 +
 include/zeus/CMatrix4f.hpp | 37 +--------------
 src/CMatrix3f.cpp          |  7 +++
 src/CMatrix4f.cpp          | 45 +++++++++++++++++-
 src/SSELegacy.cpp          | 86 +++++++++++++++++++++++++++++++++++
 6 files changed, 166 insertions(+), 42 deletions(-)
 create mode 100644 src/SSELegacy.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2ee90fb..15417fd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,11 +6,7 @@ if (NOT DEFINED ATHENA_INCLUDE_DIR)
 endif()
 include_directories(include ${ATHENA_INCLUDE_DIR})
 
-if(NOT MSVC)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1 -msse4.2 -std=c++14")
-endif()
-
-add_library(zeus
+set(SOURCES
     src/CVector3f.cpp
     src/Math.cpp
     src/CQuaternion.cpp
@@ -23,7 +19,30 @@ add_library(zeus
     src/CRectangle.cpp
     src/CVector4f.cpp
     src/CMatrix4f.cpp
-    src/CAABox.cpp
+    src/CAABox.cpp)
+
+# SSELegacy.cpp compiled separately to escape the effects of link-time optimization
+if(NOT MSVC)
+set_source_files_properties(${SOURCES} PROPERTIES COMPILE_FLAGS "-msse4.1 -msse4.2 -std=c++14")
+string(REPLACE "-flto" "" CUSTOM_FLAGS "${CMAKE_CXX_FLAGS}")
+set(CUSTOM_FLAGS "${CUSTOM_FLAGS} -O3 -msse3 -std=c++14")
+else()
+string(REPLACE "/GL" "" CUSTOM_FLAGS "${CMAKE_CXX_FLAGS}")
+endif()
+separate_arguments(CUSTOM_FLAGS UNIX_COMMAND "${CUSTOM_FLAGS}")
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/src/SSELegacy.o
+    COMMAND ${CMAKE_CXX_COMPILER}
+    ARGS ${CUSTOM_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/src/SSELegacy.cpp
+    -o ${CMAKE_CURRENT_BINARY_DIR}/src/SSELegacy.o
+    -I ${CMAKE_CURRENT_SOURCE_DIR}/include
+    -I ${ATHENA_INCLUDE_DIR}
+
+    MAIN_DEPENDENCY src/SSELegacy.cpp)
+
+add_library(zeus
+    ${SOURCES}
+    ${CMAKE_CURRENT_BINARY_DIR}/src/SSELegacy.o
 
     include/zeus/Math.hpp
     include/zeus/CQuaternion.hpp
diff --git a/include/zeus/CMatrix3f.hpp b/include/zeus/CMatrix3f.hpp
index ef8b483..9ee4f40 100644
--- a/include/zeus/CMatrix3f.hpp
+++ b/include/zeus/CMatrix3f.hpp
@@ -137,7 +137,9 @@ public:
     static const CMatrix3f skIdentityMatrix3f;
 
     void transpose();
+    void transposeSSE3();
     CMatrix3f transposed() const;
+    CMatrix3f transposedSSE3() const;
 
     inline void invert() {*this = inverted();}
     CMatrix3f inverted() const;
diff --git a/include/zeus/CMatrix4f.hpp b/include/zeus/CMatrix4f.hpp
index 3878d83..062148d 100644
--- a/include/zeus/CMatrix4f.hpp
+++ b/include/zeus/CMatrix4f.hpp
@@ -96,41 +96,8 @@ public:
         return vec[i];
     }
 
-    inline CMatrix4f transposed()
-    {
-        CMatrix4f ret;
-#if __SSE__
-        __m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
-        __m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, vec[3].mVec128);
-        __m128 T1 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
-        __m128 T3 = _mm_unpackhi_ps(vec[2].mVec128, vec[3].mVec128);
-        ret.vec[0].mVec128 = _mm_movelh_ps(T0, T2);
-        ret.vec[1].mVec128 = _mm_movehl_ps(T2, T0);
-        ret.vec[2].mVec128 = _mm_movelh_ps(T1, T3);
-        ret.vec[3].mVec128 = _mm_movehl_ps(T3, T1);
-#else
-        ret.m[0][0] = m[0][0];
-        ret.m[1][0] = m[0][1];
-        ret.m[2][0] = m[0][2];
-        ret.m[3][0] = m[0][3];
-
-        ret.m[0][1] = m[1][0];
-        ret.m[1][1] = m[1][1];
-        ret.m[2][1] = m[1][2];
-        ret.m[3][1] = m[1][3];
-
-        ret.m[0][2] = m[2][0];
-        ret.m[1][2] = m[2][1];
-        ret.m[2][2] = m[2][2];
-        ret.m[3][2] = m[2][3];
-
-        ret.m[0][3] = m[3][0];
-        ret.m[1][3] = m[3][1];
-        ret.m[2][3] = m[3][2];
-        ret.m[3][3] = m[3][3];
-#endif
-        return ret;
-    }
+    CMatrix4f transposed() const;
+    CMatrix4f transposedSSE3() const;
 
     inline CVector3f multiplyOneOverW(const CVector3f& point) const
     {
diff --git a/src/CMatrix3f.cpp b/src/CMatrix3f.cpp
index 869baf6..b7c3b53 100644
--- a/src/CMatrix3f.cpp
+++ b/src/CMatrix3f.cpp
@@ -33,6 +33,11 @@ CMatrix3f::CMatrix3f(const CQuaternion& quat)
 void CMatrix3f::transpose()
 {
 #if __SSE__
+    if (!cpuFeatures().SSE41)
+    {
+        transposeSSE3();
+        return;
+    }
     __m128 zero = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);
     __m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
     __m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, zero);
@@ -61,6 +66,8 @@ void CMatrix3f::transpose()
 CMatrix3f CMatrix3f::transposed() const
 {
 #if __SSE__
+    if (!cpuFeatures().SSE41)
+        return transposedSSE3();
     __m128 zero = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);
     __m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
     __m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, zero);
diff --git a/src/CMatrix4f.cpp b/src/CMatrix4f.cpp
index 846d520..7f09482 100644
--- a/src/CMatrix4f.cpp
+++ b/src/CMatrix4f.cpp
@@ -1,3 +1,46 @@
 #include "zeus/CMatrix4f.hpp"
 
-const zeus::CMatrix4f zeus::CMatrix4f::skIdentityMatrix4f = CMatrix4f();
+namespace zeus
+{
+
+const CMatrix4f CMatrix4f::skIdentityMatrix4f = CMatrix4f();
+
+CMatrix4f CMatrix4f::transposed() const
+{
+    CMatrix4f ret;
+#if __SSE__
+    if (!cpuFeatures().SSE41)
+        return transposedSSE3();
+    __m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
+    __m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, vec[3].mVec128);
+    __m128 T1 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
+    __m128 T3 = _mm_unpackhi_ps(vec[2].mVec128, vec[3].mVec128);
+    ret.vec[0].mVec128 = _mm_movelh_ps(T0, T2);
+    ret.vec[1].mVec128 = _mm_movehl_ps(T2, T0);
+    ret.vec[2].mVec128 = _mm_movelh_ps(T1, T3);
+    ret.vec[3].mVec128 = _mm_movehl_ps(T3, T1);
+#else
+    ret.m[0][0] = m[0][0];
+    ret.m[1][0] = m[0][1];
+    ret.m[2][0] = m[0][2];
+    ret.m[3][0] = m[0][3];
+
+    ret.m[0][1] = m[1][0];
+    ret.m[1][1] = m[1][1];
+    ret.m[2][1] = m[1][2];
+    ret.m[3][1] = m[1][3];
+
+    ret.m[0][2] = m[2][0];
+    ret.m[1][2] = m[2][1];
+    ret.m[2][2] = m[2][2];
+    ret.m[3][2] = m[2][3];
+
+    ret.m[0][3] = m[3][0];
+    ret.m[1][3] = m[3][1];
+    ret.m[2][3] = m[3][2];
+    ret.m[3][3] = m[3][3];
+#endif
+    return ret;
+}
+
+}
diff --git a/src/SSELegacy.cpp b/src/SSELegacy.cpp
new file mode 100644
index 0000000..5be08e1
--- /dev/null
+++ b/src/SSELegacy.cpp
@@ -0,0 +1,86 @@
+#include "zeus/CMatrix3f.hpp"
+#include "zeus/CMatrix4f.hpp"
+
+namespace zeus
+{
+
+void CMatrix3f::transposeSSE3()
+{
+#if __SSE__
+    __m128 zero = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);
+    __m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
+    __m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, zero);
+    __m128 T1 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
+    __m128 T3 = _mm_unpackhi_ps(vec[2].mVec128, zero);
+    vec[0].mVec128 = _mm_movelh_ps(T0, T2);
+    vec[1].mVec128 = _mm_movehl_ps(T2, T0);
+    vec[2].mVec128 = _mm_movelh_ps(T1, T3);
+#else
+    float tmp;
+
+    tmp = m[0][1];
+    m[0][1] = m[1][0];
+    m[1][0] = tmp;
+
+    tmp = m[0][2];
+    m[0][2] = m[2][0];
+    m[2][0] = tmp;
+
+    tmp = m[1][2];
+    m[1][2] = m[2][1];
+    m[2][1] = tmp;
+#endif
+}
+
+CMatrix3f CMatrix3f::transposedSSE3() const
+{
+#if __SSE__
+    __m128 zero = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);
+    __m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
+    __m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, zero);
+    __m128 T1 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
+    __m128 T3 = _mm_unpackhi_ps(vec[2].mVec128, zero);
+    return CMatrix3f(_mm_movelh_ps(T0, T2), _mm_movehl_ps(T2, T0), _mm_movelh_ps(T1, T3));
+#else
+    CMatrix3f ret(*this);
+    float tmp;
+
+    tmp = ret.m[0][1];
+    ret.m[0][1] = ret.m[1][0];
+    ret.m[1][0] = tmp;
+
+    tmp = m[0][2];
+    ret.m[0][2] = ret.m[2][0];
+    ret.m[2][0] = tmp;
+
+    tmp = m[1][2];
+    ret.m[1][2] = ret.m[2][1];
+    ret.m[2][1] = tmp;
+
+    return ret;
+#endif
+}
+
+// Pre-SSE4.1 counterpart of CMatrix4f::transposed(): transposes all four
+// rows/columns with the same unpack/move shuffles, minus the feature check.
+CMatrix4f CMatrix4f::transposedSSE3() const
+{
+    CMatrix4f ret;
+#if __SSE__
+    __m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
+    __m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, vec[3].mVec128);
+    __m128 T1 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
+    __m128 T3 = _mm_unpackhi_ps(vec[2].mVec128, vec[3].mVec128);
+    ret.vec[0].mVec128 = _mm_movelh_ps(T0, T2);
+    ret.vec[1].mVec128 = _mm_movehl_ps(T2, T0);
+    ret.vec[2].mVec128 = _mm_movelh_ps(T1, T3);
+    ret.vec[3].mVec128 = _mm_movehl_ps(T3, T1);
+#else
+    for (int i = 0; i < 4; ++i)
+        for (int j = 0; j < 4; ++j)
+            ret.m[i][j] = m[j][i];
+#endif
+    return ret;
+}
+
+}