mirror of https://github.com/AxioDL/zeus.git
alternate matrix transpose code-path for pre-SSE4.1
This commit is contained in:
parent
68b5c47e25
commit
e7a9c6a66b
|
@ -6,11 +6,7 @@ if (NOT DEFINED ATHENA_INCLUDE_DIR)
|
||||||
endif()
|
endif()
|
||||||
include_directories(include ${ATHENA_INCLUDE_DIR})
|
include_directories(include ${ATHENA_INCLUDE_DIR})
|
||||||
|
|
||||||
if(NOT MSVC)
|
set(SOURCES
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1 -msse4.2 -std=c++14")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
add_library(zeus
|
|
||||||
src/CVector3f.cpp
|
src/CVector3f.cpp
|
||||||
src/Math.cpp
|
src/Math.cpp
|
||||||
src/CQuaternion.cpp
|
src/CQuaternion.cpp
|
||||||
|
@ -23,7 +19,30 @@ add_library(zeus
|
||||||
src/CRectangle.cpp
|
src/CRectangle.cpp
|
||||||
src/CVector4f.cpp
|
src/CVector4f.cpp
|
||||||
src/CMatrix4f.cpp
|
src/CMatrix4f.cpp
|
||||||
src/CAABox.cpp
|
src/CAABox.cpp)
|
||||||
|
|
||||||
|
# SSELegacy.cpp is compiled by a hand-written custom command instead of as a
# regular target source, so that it escapes the effects of link-time
# optimization. On non-MSVC toolchains it is built with -msse3 while the
# files listed in SOURCES receive the SSE4.1/SSE4.2 flags.
if(NOT MSVC)
  set_source_files_properties(${SOURCES} PROPERTIES COMPILE_FLAGS "-msse4.1 -msse4.2 -std=c++14")
  # Quote the expansion: an empty CMAKE_CXX_FLAGS would otherwise disappear
  # from the argument list and string(REPLACE) would fail for lack of input.
  string(REPLACE "-flto" "" CUSTOM_FLAGS "${CMAKE_CXX_FLAGS}")
  set(CUSTOM_FLAGS "${CUSTOM_FLAGS} -O3 -msse3 -std=c++14")
else()
  string(REPLACE "/GL" "" CUSTOM_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

# Split the flag string into a list so each flag reaches the compiler as its
# own argument. Quoted for the same empty-value reason as above.
separate_arguments(CUSTOM_FLAGS UNIX_COMMAND "${CUSTOM_FLAGS}")

# The custom command writes its object file into <build>/src; make sure that
# directory exists before the compiler is asked to write there.
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/src)

add_custom_command(
  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/src/SSELegacy.o
  COMMAND ${CMAKE_CXX_COMPILER}
  ARGS ${CUSTOM_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/src/SSELegacy.cpp
       -o ${CMAKE_CURRENT_BINARY_DIR}/src/SSELegacy.o
       -I ${CMAKE_CURRENT_SOURCE_DIR}/include
       -I ${ATHENA_INCLUDE_DIR}
  MAIN_DEPENDENCY src/SSELegacy.cpp)
|
||||||
|
|
||||||
|
add_library(zeus
|
||||||
|
${SOURCES}
|
||||||
|
${CMAKE_CURRENT_BINARY_DIR}/src/SSELegacy.o
|
||||||
|
|
||||||
include/zeus/Math.hpp
|
include/zeus/Math.hpp
|
||||||
include/zeus/CQuaternion.hpp
|
include/zeus/CQuaternion.hpp
|
||||||
|
|
|
@ -137,7 +137,9 @@ public:
|
||||||
static const CMatrix3f skIdentityMatrix3f;
|
static const CMatrix3f skIdentityMatrix3f;
|
||||||
|
|
||||||
void transpose();
|
void transpose();
|
||||||
|
void transposeSSE3();
|
||||||
CMatrix3f transposed() const;
|
CMatrix3f transposed() const;
|
||||||
|
CMatrix3f transposedSSE3() const;
|
||||||
|
|
||||||
inline void invert() {*this = inverted();}
|
inline void invert() {*this = inverted();}
|
||||||
CMatrix3f inverted() const;
|
CMatrix3f inverted() const;
|
||||||
|
|
|
@ -96,41 +96,8 @@ public:
|
||||||
return vec[i];
|
return vec[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
inline CMatrix4f transposed()
|
CMatrix4f transposed() const;
|
||||||
{
|
CMatrix4f transposedSSE3() const;
|
||||||
CMatrix4f ret;
|
|
||||||
#if __SSE__
|
|
||||||
__m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
|
|
||||||
__m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, vec[3].mVec128);
|
|
||||||
__m128 T1 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
|
|
||||||
__m128 T3 = _mm_unpackhi_ps(vec[2].mVec128, vec[3].mVec128);
|
|
||||||
ret.vec[0].mVec128 = _mm_movelh_ps(T0, T2);
|
|
||||||
ret.vec[1].mVec128 = _mm_movehl_ps(T2, T0);
|
|
||||||
ret.vec[2].mVec128 = _mm_movelh_ps(T1, T3);
|
|
||||||
ret.vec[3].mVec128 = _mm_movehl_ps(T3, T1);
|
|
||||||
#else
|
|
||||||
ret.m[0][0] = m[0][0];
|
|
||||||
ret.m[1][0] = m[0][1];
|
|
||||||
ret.m[2][0] = m[0][2];
|
|
||||||
ret.m[3][0] = m[0][3];
|
|
||||||
|
|
||||||
ret.m[0][1] = m[1][0];
|
|
||||||
ret.m[1][1] = m[1][1];
|
|
||||||
ret.m[2][1] = m[1][2];
|
|
||||||
ret.m[3][1] = m[1][3];
|
|
||||||
|
|
||||||
ret.m[0][2] = m[2][0];
|
|
||||||
ret.m[1][2] = m[2][1];
|
|
||||||
ret.m[2][2] = m[2][2];
|
|
||||||
ret.m[3][2] = m[2][3];
|
|
||||||
|
|
||||||
ret.m[0][3] = m[3][0];
|
|
||||||
ret.m[1][3] = m[3][1];
|
|
||||||
ret.m[2][3] = m[3][2];
|
|
||||||
ret.m[3][3] = m[3][3];
|
|
||||||
#endif
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline CVector3f multiplyOneOverW(const CVector3f& point) const
|
inline CVector3f multiplyOneOverW(const CVector3f& point) const
|
||||||
{
|
{
|
||||||
|
|
|
@ -33,6 +33,11 @@ CMatrix3f::CMatrix3f(const CQuaternion& quat)
|
||||||
void CMatrix3f::transpose()
|
void CMatrix3f::transpose()
|
||||||
{
|
{
|
||||||
#if __SSE__
|
#if __SSE__
|
||||||
|
if (!cpuFeatures().SSE41)
|
||||||
|
{
|
||||||
|
transposeSSE3();
|
||||||
|
return;
|
||||||
|
}
|
||||||
__m128 zero = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);
|
__m128 zero = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);
|
||||||
__m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
|
__m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
|
||||||
__m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, zero);
|
__m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, zero);
|
||||||
|
@ -61,6 +66,8 @@ void CMatrix3f::transpose()
|
||||||
CMatrix3f CMatrix3f::transposed() const
|
CMatrix3f CMatrix3f::transposed() const
|
||||||
{
|
{
|
||||||
#if __SSE__
|
#if __SSE__
|
||||||
|
if (!cpuFeatures().SSE41)
|
||||||
|
return transposedSSE3();
|
||||||
__m128 zero = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);
|
__m128 zero = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);
|
||||||
__m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
|
__m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
|
||||||
__m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, zero);
|
__m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, zero);
|
||||||
|
|
|
@ -1,3 +1,46 @@
|
||||||
#include "zeus/CMatrix4f.hpp"
|
#include "zeus/CMatrix4f.hpp"
|
||||||
|
|
||||||
const zeus::CMatrix4f zeus::CMatrix4f::skIdentityMatrix4f = CMatrix4f();
|
namespace zeus
|
||||||
|
{
|
||||||
|
|
||||||
|
const CMatrix4f CMatrix4f::skIdentityMatrix4f = CMatrix4f();
|
||||||
|
|
||||||
|
// Returns a transposed copy of this 4x4 matrix.
// In SSE builds, when cpuFeatures().SSE41 is false the work is delegated to
// transposedSSE3(); otherwise the shuffle sequence below is used.
CMatrix4f CMatrix4f::transposed() const
{
    CMatrix4f result;
#if __SSE__
    if (!cpuFeatures().SSE41)
        return transposedSSE3();

    // Interleave the low and high halves of each row pair...
    const __m128 lo01 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
    const __m128 hi01 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
    const __m128 lo23 = _mm_unpacklo_ps(vec[2].mVec128, vec[3].mVec128);
    const __m128 hi23 = _mm_unpackhi_ps(vec[2].mVec128, vec[3].mVec128);

    // ...then stitch the halves together to form the transposed rows.
    result.vec[0].mVec128 = _mm_movelh_ps(lo01, lo23);
    result.vec[1].mVec128 = _mm_movehl_ps(lo23, lo01);
    result.vec[2].mVec128 = _mm_movelh_ps(hi01, hi23);
    result.vec[3].mVec128 = _mm_movehl_ps(hi23, hi01);
#else
    // Scalar path: element (row, col) of the source lands at (col, row).
    for (int row = 0; row < 4; ++row)
    {
        for (int col = 0; col < 4; ++col)
            result.m[col][row] = m[row][col];
    }
#endif
    return result;
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,97 @@
|
||||||
|
#include "zeus/CMatrix3f.hpp"
|
||||||
|
#include "zeus/CMatrix4f.hpp"
|
||||||
|
|
||||||
|
namespace zeus
|
||||||
|
{
|
||||||
|
|
||||||
|
void CMatrix3f::transposeSSE3()
|
||||||
|
{
|
||||||
|
#if __SSE__
|
||||||
|
__m128 zero = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);
|
||||||
|
__m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
|
||||||
|
__m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, zero);
|
||||||
|
__m128 T1 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
|
||||||
|
__m128 T3 = _mm_unpackhi_ps(vec[2].mVec128, zero);
|
||||||
|
vec[0].mVec128 = _mm_movelh_ps(T0, T2);
|
||||||
|
vec[1].mVec128 = _mm_movehl_ps(T2, T0);
|
||||||
|
vec[2].mVec128 = _mm_movelh_ps(T1, T3);
|
||||||
|
#else
|
||||||
|
float tmp;
|
||||||
|
|
||||||
|
tmp = m[0][1];
|
||||||
|
m[0][1] = m[1][0];
|
||||||
|
m[1][0] = tmp;
|
||||||
|
|
||||||
|
tmp = m[0][2];
|
||||||
|
m[0][2] = m[2][0];
|
||||||
|
m[2][0] = tmp;
|
||||||
|
|
||||||
|
tmp = m[1][2];
|
||||||
|
m[1][2] = m[2][1];
|
||||||
|
m[2][1] = tmp;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns a transposed copy of this matrix, leaving *this untouched.
// The SSE branch builds the three result rows with unpack/move shuffles;
// the scalar branch copies the matrix and mirrors its off-diagonal elements.
CMatrix3f CMatrix3f::transposedSSE3() const
{
#if __SSE__
    // An all-zero register takes the place of a fourth row in the shuffles.
    const __m128 zeroRow = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);

    const __m128 lo01 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
    const __m128 hi01 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
    const __m128 lo2z = _mm_unpacklo_ps(vec[2].mVec128, zeroRow);
    const __m128 hi2z = _mm_unpackhi_ps(vec[2].mVec128, zeroRow);

    const __m128 row0 = _mm_movelh_ps(lo01, lo2z);
    const __m128 row1 = _mm_movehl_ps(lo2z, lo01);
    const __m128 row2 = _mm_movelh_ps(hi01, hi2z);
    return CMatrix3f(row0, row1, row2);
#else
    // Start from a full copy so the diagonal (and anything else the copy
    // carries) is preserved, then overwrite each off-diagonal element with
    // its mirror taken from the unmodified source.
    CMatrix3f result(*this);
    for (int row = 0; row < 3; ++row)
    {
        for (int col = 0; col < 3; ++col)
        {
            if (row != col)
                result.m[row][col] = m[col][row];
        }
    }
    return result;
#endif
}
|
||||||
|
|
||||||
|
// Returns a transposed copy of this 4x4 matrix, leaving *this untouched.
// This is the code path CMatrix4f::transposed() falls back to when
// cpuFeatures().SSE41 is false.
//
// The result is accumulated in a CMatrix4f. A CMatrix3f temporary must not
// be used here: all four rows (vec[0]..vec[3]) are written, and the scalar
// branch has to mirror the full 4x4 element grid, not only the upper-left
// 3x3 part.
CMatrix4f CMatrix4f::transposedSSE3() const
{
    CMatrix4f ret;
#if __SSE__
    // Interleave the low and high halves of each row pair...
    __m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
    __m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, vec[3].mVec128);
    __m128 T1 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
    __m128 T3 = _mm_unpackhi_ps(vec[2].mVec128, vec[3].mVec128);

    // ...then stitch the halves together to form the transposed rows.
    ret.vec[0].mVec128 = _mm_movelh_ps(T0, T2);
    ret.vec[1].mVec128 = _mm_movehl_ps(T2, T0);
    ret.vec[2].mVec128 = _mm_movelh_ps(T1, T3);
    ret.vec[3].mVec128 = _mm_movehl_ps(T3, T1);
#else
    // Scalar path: element (row, col) of the source lands at (col, row).
    for (int row = 0; row < 4; ++row)
    {
        for (int col = 0; col < 4; ++col)
            ret.m[col][row] = m[row][col];
    }
#endif
    return ret;
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue